// ParserController.scala (5.2 KB)

  1. package com.weEat.controllers
  2. import com.weEat.shared.models._
  3. import javax.inject.{Inject,Singleton}
  4. import play.api.libs.json._
  5. import play.api.mvc._
  6. import scala.concurrent.Future
  7. import com.weEat.models.Authorization
  8. import scalaoauth2.provider.{AuthInfoRequest,OAuth2ProviderActionBuilders}
  9. import com.weEat.services.OAuth2Service
  10. import net.ruippeixotog.scalascraper.browser.JsoupBrowser
  11. import net.ruippeixotog.scalascraper.dsl.DSL._
  12. import net.ruippeixotog.scalascraper.dsl.DSL.Extract._
  13. import net.ruippeixotog.scalascraper.model.Element
  14. import net.ruippeixotog.scalascraper.scraper.HtmlExtractor
  15. import scala.util._
  16. @Singleton
  17. class ParserController @Inject()(
  18. val controllerComponents: ControllerComponents,
  19. oauth: OAuth2Service,
  20. usdaController: USDAController,
  21. foodController: FoodController
  22. ) extends BaseController
  23. with OAuth2ProviderActionBuilders {
  24. implicit val ec = scala.concurrent.ExecutionContext.global
  25. private val _browser = JsoupBrowser()
  26. def parseURL() = AuthorizedAction[Authorization](oauth).async(parse.text)({ implicit request: AuthInfoRequest[String, Authorization] =>
  27. val url = request.body
  28. _findParser(url).fold(Future.successful(NotFound(s"No parser available for $url."))) { (parser) =>
  29. val doc = _browser.get(url)
  30. val title = doc >> parser.titleExtractor
  31. val servings = doc >> parser.servingExtractor
  32. val prepTime = parser.prepTimeExtractor.map(doc >> _)
  33. val cookTime = parser.cookTimeExtractor.map(doc >> _)
  34. val ingredients = doc >> parser.ingredientExtractor
  35. val instructions = doc >> parser.instructionExtractor
  36. Future.sequence(ingredients.map(_parseIngredient _))
  37. .map((ingredients) => Ok(Json.toJson(RecipeNodeNoId(
  38. title,
  39. servings.getOrElse(1.0f),
  40. 1.0f,
  41. UnitType.NUMBER,
  42. ingredients.toSeq,
  43. /* tflucke@[2023-10-26]: Do not pss along the instructions since this
  44. * could be a violation of the Recipe Author's copyright. */
  45. Nil, //instructions.toSeq,
  46. None,
  47. None,
  48. Some(url)
  49. ))))
  50. }
  51. })
  52. private def _findParser(url: String): Option[Parser] = {
  53. val host = new java.net.URL(url).getAuthority()
  54. val hostNoWWW = if (host.startsWith("www.")) host.substring("www.".length) else host
  55. Map(
  56. ("epicurious.com" -> Parser.epicurious),
  57. ("mccormick.com" -> Parser.mccormick),
  58. ("recipetineats.com" -> Parser.recipeTinEats)
  59. ).get(hostNoWWW)
  60. }
  61. private def _parseIngredient(ingredientLine: String): Future[Ingredient] = {
  62. val numberPattern = raw"(\d+)[\d-_]*\s(\w+)\s+(.+)".r
  63. val fractionPattern = raw"(\d+)/(\d+)[\d-_]*\s(\w+)\s+(.+)".r
  64. //println(ingredientLine)
  65. ingredientLine match {
  66. case numberPattern(amount, unit, rest) =>
  67. _guessFoodFromStr(rest).map(Ingredient(_, amount.toFloat, MeasureUnit.guessUnit(unit).getOrElse(Count)))
  68. case fractionPattern(numerator, denominator, unit, rest) =>
  69. _guessFoodFromStr(rest).map(Ingredient(_, numerator.toFloat/denominator.toFloat, MeasureUnit.guessUnit(unit).getOrElse(Count)))
  70. case noUnitLine => _guessFoodFromStr(noUnitLine).map(Ingredient(_, 1, Count))
  71. }
  72. }
  73. private def _guessFoodFromStr(foodLine: String): Future[Ingredient.IngredientId] = {
  74. import gov.usda.nal.fdc.models.DataType._
  75. usdaController.fdc.getFoodsSearch(foodLine, Seq(
  76. Foundation, Survey, SRLegacy
  77. ), pageSize = Some(10))().flatMap({ (fdcResult) =>
  78. Future.sequence(fdcResult.foods.map((food) => foodController.getByFdcId(food.fdcId)))
  79. .map(_.flatten
  80. .headOption
  81. .fold[Ingredient.IngredientId](Ingredient.USDAId(fdcResult.foods.head.fdcId))((foodNode) => Ingredient.FoodNodeId(foodNode._id))
  82. ).transform({
  83. case Success(x) => Success(x)
  84. case Failure(x) => println(foodLine);Failure(x)
  85. })
  86. })
  87. }
  88. }
  89. case class Parser(
  90. titleExtractor: HtmlExtractor[Element, String],
  91. servingExtractor: HtmlExtractor[Element, Option[Float]],
  92. prepTimeExtractor: Option[HtmlExtractor[Element, String]],
  93. cookTimeExtractor: Option[HtmlExtractor[Element, String]],
  94. ingredientExtractor: HtmlExtractor[Element, Iterable[String]],
  95. instructionExtractor: HtmlExtractor[Element, Iterable[String]],
  96. )
  97. object Parser {
  98. val mccormick = Parser(
  99. text("h1"),
  100. text(".main-title .count").map(_.toFloatOption),
  101. Some(text(".prep_time .first_content")),
  102. cookTimeExtractor = Some(text(".ingredients .first_content")),
  103. ingredientExtractor = texts(".recipe-about-list li"),
  104. texts(".instructions-main span.para")
  105. )
  106. val epicurious = Parser(
  107. text("h1"),
  108. text("""div[data-testid="IngredientList"] > p""").map("Yield: \\D*(\\d+).*".r.findFirstMatchIn(_).map(_.group(1).toFloat)),
  109. None,
  110. None,
  111. texts("""div[data-testid="IngredientList"] > div > div"""),
  112. texts("""div[data-testid="InstructionsWrapper"] > ol > li > p""")
  113. )
  114. val recipeTinEats = Parser(
  115. text("h2.wprm-recipe-name"),
  116. text("span.wprm-recipe-servings").map(_.toFloatOption),
  117. Some(text("span.wprm-recipe-prep_time-minutes")),
  118. Some(text("span.wprm-recipe-cook_time-minutes")),
  119. texts("li.wprm-recipe-ingredient").map(_.map(_.filter(_ <= 0x7f).filterNot(_ == '/').trim)),
  120. texts("div.wprm-recipe-instruction-text")
  121. )
  122. }