ParserController.scala 6.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171
  1. package com.weEat.controllers
  2. import com.weEat.shared.models._
  3. import javax.inject.{Inject,Singleton}
  4. import play.api.libs.json._
  5. import play.api.mvc._
  6. import scala.concurrent.Future
  7. import com.weEat.models.Authorization
  8. import scalaoauth2.provider.{AuthInfoRequest,OAuth2ProviderActionBuilders}
  9. import com.weEat.services.OAuth2Service
  10. import net.ruippeixotog.scalascraper.browser.JsoupBrowser
  11. import net.ruippeixotog.scalascraper.dsl.DSL._
  12. import net.ruippeixotog.scalascraper.dsl.DSL.Extract._
  13. //import net.ruippeixotog.scalascraper.dsl.DSL.Parse._
  14. import net.ruippeixotog.scalascraper.model.Element
  15. import net.ruippeixotog.scalascraper.scraper.HtmlExtractor
  16. import scala.util._
  /** Scrapes recipe pages from a fixed set of supported hosts and returns them
    * as [[RecipeNodeNoId]] JSON.
    *
    * Ingredient lines are split into amount / unit / food text with regular
    * expressions; the food text is then resolved through the USDA search client
    * and the local food store.
    */
  @Singleton
  class ParserController @Inject()(
    val controllerComponents: ControllerComponents,
    oauth: OAuth2Service,
    usdaController: USDAController,
    foodController: FoodController
  ) extends BaseController
    with OAuth2ProviderActionBuilders {

    // Explicitly typed: implicit definitions should not rely on inference.
    implicit val ec: scala.concurrent.ExecutionContextExecutor =
      scala.concurrent.ExecutionContext.global

    private val _logger = play.api.Logger(getClass)
    private val _browser = JsoupBrowser()

    /** Supported hosts (without any leading "www.") and the parser used for each. */
    private val _parsersByHost: Map[String, Parser] = Map(
      "epicurious.com" -> Parser.epicurious,
      "mccormick.com" -> Parser.mccormick,
      "recipetineats.com" -> Parser.recipeTinEats,
      "mamalovestocook.com" -> Parser.recipeTinEats
    )

    // Compiled once instead of on every ingredient line.
    /** "<int> <unit> <food>", e.g. "2 cups flour". */
    private val NumberPattern = raw"(\d+)[\d-_]*\s(\w+)\s+(.+)".r
    /** "<n>/<d> <unit> <food>", e.g. "1/2 cup sugar". */
    private val FractionPattern = raw"(\d+)/(\d+)[\d-_]*\s(\w+)\s+(.+)".r
    /** "<int> <n>/<d> <unit> <food>", e.g. "1 1/2 cups milk". */
    private val MixedNumberPattern = raw"(\d+)\s+(\d+)/(\d+)[\d-_]*\s(\w+)\s+(.+)".r

    /** Parses the recipe found at the URL given as the plain-text request body.
      *
      * Responds with 404 when no parser is registered for the URL (this includes
      * malformed URLs and non-HTTP(S) schemes). Otherwise the page is fetched,
      * scraped and returned as JSON. Failures while fetching, scraping or
      * resolving ingredients surface as a failed `Future`.
      */
    def parseURL(): Action[String] =
      AuthorizedAction[Authorization](oauth).async(parse.text)({ implicit request: AuthInfoRequest[String, Authorization] =>
        val url = request.body
        _findParser(url).fold(
          Future.successful(NotFound(s"No parser available for $url."))
        ) { parser =>
          // The page download is blocking network I/O: run it inside a Future
          // and mark it as blocking so the pool can compensate.
          val scraped = Future {
            scala.concurrent.blocking {
              val doc = _browser.get(url)
              (
                doc >> parser.titleExtractor,
                doc >> parser.servingExtractor,
                doc >> parser.ingredientExtractor
              )
            }
          }
          scraped.flatMap { case (title, servings, ingredientLines) =>
            Future.sequence(ingredientLines.map(_parseIngredient _)).map { ingredients =>
              Ok(Json.toJson(RecipeNodeNoId(
                title,
                servings.getOrElse(1.0f),
                1.0f,
                UnitType.NUMBER,
                ingredients.toSeq,
                /* tflucke@[2023-10-26]: Do not pass along the instructions since
                 * this could be a violation of the Recipe Author's copyright. */
                Nil,
                None,
                None,
                Some(url)
              )))
            }
          }
        }
      })

    /** Looks up the parser registered for the host of `url`.
      *
      * @return `None` when the URL is malformed, is not http/https, has no
      *         authority component, or names an unsupported host.
      */
    private def _findParser(url: String): Option[Parser] =
      for {
        // The URL comes straight from the request body: never let it throw.
        parsed <- Try(new java.net.URL(url)).toOption
        if Set("http", "https").contains(parsed.getProtocol.toLowerCase(java.util.Locale.ROOT))
        // getAuthority returns null for URLs without an authority part.
        authority <- Option(parsed.getAuthority)
        // Host names are case-insensitive.
        host = authority.toLowerCase(java.util.Locale.ROOT).stripPrefix("www.")
        parser <- _parsersByHost.get(host)
      } yield parser

    /** Turns one ingredient line into an [[Ingredient]].
      *
      * Recognises "<int> <unit> <food>", "<n>/<d> <unit> <food>" and
      * "<int> <n>/<d> <unit> <food>". Any other line (including one with a zero
      * denominator) is treated as a single counted item whose food text is the
      * whole line.
      */
    private def _parseIngredient(ingredientLine: String): Future[Ingredient] =
      ingredientLine match {
        case NumberPattern(amount, unit, rest) =>
          _measuredIngredient(rest, amount.toFloat, unit)
        case FractionPattern(numerator, denominator, unit, rest)
            if denominator.toFloat != 0f =>
          _measuredIngredient(rest, numerator.toFloat / denominator.toFloat, unit)
        case MixedNumberPattern(whole, numerator, denominator, unit, rest)
            if denominator.toFloat != 0f =>
          _measuredIngredient(
            rest,
            whole.toFloat + numerator.toFloat / denominator.toFloat,
            unit
          )
        case noUnitLine =>
          _guessFoodFromStr(noUnitLine).map(Ingredient(_, 1, Count))
      }

    /** Resolves `foodLine` to a food id and pairs it with `amount` and the unit
      * guessed from `unit`, defaulting to `Count` when the unit is not recognised.
      */
    private def _measuredIngredient(
      foodLine: String,
      amount: Float,
      unit: String
    ): Future[Ingredient] =
      _guessFoodFromStr(foodLine).map { foodId =>
        Ingredient(foodId, amount, MeasureUnit.guessUnit(unit).getOrElse(Count))
      }

    /** Searches the USDA client for `foodLine` (with '/' characters removed) and
      * picks an id for it.
      *
      * The first search hit that `foodController.getByFdcId` resolves is returned
      * as a `FoodNodeId`; if none resolves, the first search hit is returned as a
      * `USDAId`. The future fails with `NoSuchElementException` when the search
      * returns no foods at all.
      */
    private def _guessFoodFromStr(foodLine: String): Future[Ingredient.IngredientId] = {
      import gov.usda.nal.fdc.models.DataType._
      usdaController.fdc.getFoodsSearch(
        foodLine.filterNot(_ == '/'),
        Seq(Foundation, Survey, SRLegacy),
        pageSize = Some(10)
      )().flatMap { fdcResult =>
        Future.sequence(
          fdcResult.foods.map(food => foodController.getByFdcId(food.fdcId))
        ).map { localMatches =>
          localMatches.flatten.headOption.fold[Ingredient.IngredientId](
            // An empty search result used to fail with "head of empty list";
            // fail with a message that names the offending ingredient instead.
            fdcResult.foods.headOption
              .map(food => Ingredient.USDAId(food.fdcId))
              .getOrElse(throw new NoSuchElementException(
                s"No USDA food found for ingredient: $foodLine"
              ))
          )(foodNode => Ingredient.FoodNodeId(foodNode._id))
        }
      }.andThen {
        case Failure(error) =>
          _logger.error(s"Could not resolve a food for ingredient: $foodLine", error)
      }
    }
  }
  /** The set of HTML extractors needed to scrape one recipe site.
    *
    * @param titleExtractor       extracts the recipe title
    * @param servingExtractor     extracts the number of servings, if one could be read
    * @param prepTimeExtractor    optional extractor for the preparation time text
    * @param cookTimeExtractor    optional extractor for the cooking time text
    * @param ingredientExtractor  extracts one string per ingredient line
    * @param instructionExtractor extracts one string per instruction step
    */
  case class Parser(
    titleExtractor:       HtmlExtractor[Element, String],
    servingExtractor:     HtmlExtractor[Element, Option[Float]],
    prepTimeExtractor:    Option[HtmlExtractor[Element, String]],
    cookTimeExtractor:    Option[HtmlExtractor[Element, String]],
    ingredientExtractor:  HtmlExtractor[Element, Iterable[String]],
    instructionExtractor: HtmlExtractor[Element, Iterable[String]]
  )
  /** Ready-made [[Parser]] instances, one per supported page layout. */
  object Parser {

    /** Captures the first run of digits following "Yield: " in a line of text. */
    private val _yieldPattern = "Yield: \\D*(\\d+).*".r

    /** Unicode vulgar-fraction characters and their ASCII spellings, applied in
      * this order.
      */
    private val _vulgarFractions: Seq[(String, String)] = Seq(
      "\u00BD" -> "1/2",
      "\u00BC" -> "1/4",
      "\u00BE" -> "3/4",
      "\u2150" -> "1/7",
      "\u2151" -> "1/9",
      "\u2152" -> "1/10",
      "\u2153" -> "1/3",
      "\u2154" -> "2/3",
      "\u2155" -> "1/5",
      "\u2156" -> "2/5",
      "\u2157" -> "3/5",
      "\u2158" -> "4/5",
      "\u2159" -> "1/6",
      "\u215A" -> "5/6",
      "\u215B" -> "1/8",
      "\u215C" -> "3/8",
      "\u215D" -> "5/8",
      "\u215E" -> "7/8",
      "\u215F" -> "1/"
    )

    /** Spells out vulgar fractions in ASCII, drops every remaining character
      * above 0x7f and trims the result.
      *
      * NOTE(review): a fraction character directly after a digit is replaced in
      * place, so "1\u00BD" becomes "11/2" -- confirm whether that is intended.
      */
    private def _toAsciiLine(line: String): String = {
      val spelledOut = _vulgarFractions.foldLeft(line) { (text, fraction) =>
        text.replaceAll(fraction._1, fraction._2)
      }
      spelledOut.filter(_ <= 0x7f).trim
    }

    /** Extractors for the mccormick.com page layout. */
    val mccormick: Parser = Parser(
      titleExtractor = text("h1"),
      // TODO use extractors
      servingExtractor = text(".main-title .count").map(count => count.toFloatOption),
      prepTimeExtractor = Some(text(".prep_time .first_content")),
      // NOTE(review): the cook time is read from ".ingredients .first_content";
      // verify this selector against the live page.
      cookTimeExtractor = Some(text(".ingredients .first_content")),
      ingredientExtractor = texts(".recipe-about-list li"),
      instructionExtractor = texts(".instructions-main span.para")
    )

    /** Extractors for the epicurious.com page layout (no prep/cook time). */
    val epicurious: Parser = Parser(
      titleExtractor = text("h1"),
      servingExtractor = text("""div[data-testid="IngredientList"] > p""")
        .map { yieldLine =>
          _yieldPattern.findFirstMatchIn(yieldLine).map(found => found.group(1).toFloat)
        },
      prepTimeExtractor = None,
      cookTimeExtractor = None,
      ingredientExtractor = texts("""div[data-testid="IngredientList"] > div > div"""),
      instructionExtractor = texts("""div[data-testid="InstructionsWrapper"] > ol > li > p""")
    )

    /** Extractors for pages using the "wprm-recipe" markup. */
    val recipeTinEats: Parser = Parser(
      titleExtractor = text("h2.wprm-recipe-name"),
      servingExtractor = text("span.wprm-recipe-servings").map(count => count.toFloatOption),
      prepTimeExtractor = Some(text("span.wprm-recipe-prep_time-minutes")),
      cookTimeExtractor = Some(text("span.wprm-recipe-cook_time-minutes")),
      ingredientExtractor = texts("li.wprm-recipe-ingredient")
        .map(lines => lines.map(line => _toAsciiLine(line))),
      instructionExtractor = texts("div.wprm-recipe-instruction-text")
    )
  }