// ParserController.scala (12 KB)
  1. package com.weEat.controllers
  2. import com.weEat.shared.models._
  3. import javax.inject.{Inject,Singleton}
  4. import play.api.libs.json._
  5. import play.api.mvc._
  6. import scala.concurrent.Future
  7. import com.weEat.models.Authorization
  8. import scalaoauth2.provider.{AuthInfoRequest,OAuth2ProviderActionBuilders}
  9. import com.weEat.services.OAuth2Service
  10. import net.ruippeixotog.scalascraper.browser.JsoupBrowser
  11. import net.ruippeixotog.scalascraper.dsl.DSL._
  12. import net.ruippeixotog.scalascraper.dsl.DSL.Extract._
  13. //import net.ruippeixotog.scalascraper.dsl.DSL.Parse._
  14. import net.ruippeixotog.scalascraper.model.Element
  15. import net.ruippeixotog.scalascraper.scraper.HtmlExtractor
  16. import scala.util._
  /**
   * Controller that scrapes a recipe page from a supported site and returns it
   * as a JSON-serialised `RecipeNodeNoId`.
   *
   * The request body is the page URL as plain text. A [[Parser]] is chosen from
   * the URL's host, the page is fetched with Jsoup, and every scraped
   * ingredient line is resolved to an `Ingredient.IngredientId` through the
   * USDA search client and the local food store.
   */
  @Singleton
  class ParserController @Inject()(
    val controllerComponents: ControllerComponents,
    oauth: OAuth2Service,
    usdaController: USDAController,
    foodController: FoodController
  ) extends BaseController
      with OAuth2ProviderActionBuilders {

    // Explicit type: implicit definitions should not rely on inference.
    implicit val ec: scala.concurrent.ExecutionContextExecutor =
      scala.concurrent.ExecutionContext.global

    private val _browser = JsoupBrowser()

    /** Supported hosts (lower-case, without a leading "www.") and their parsers. */
    private val _parsersByHost: Map[String, Parser] = Map(
      "epicurious.com" -> Parser.epicurious,
      "mccormick.com" -> Parser.mccormick,
      "recipetineats.com" -> Parser.recipeTinEats,
      "mamalovestocook.com" -> Parser.recipeTinEats,
      "soulfullymade.com" -> Parser.recipeTinEats,
      "sallysbakingaddiction.com" -> Parser.tastyRecipes,
      "darngoodveggies.com" -> Parser.tastyRecipes,
      "seriouseats.com" -> Parser.seriousEats,
      "greatist.com" -> Parser.greatist,
      "dimitrasdishes.com" -> Parser.dimitrasDishes
    )

    /**
     * Scrapes the recipe at the URL given in the plain-text request body.
     *
     * Responds with 404 when no parser is registered for the URL's host (this
     * includes bodies that are not valid URLs). Otherwise responds with 200 and
     * the recipe as JSON. Instructions are intentionally left empty, see below.
     *
     * @return an action over a text body, wrapped in `AuthorizedAction`
     */
    def parseURL(): Action[String] =
      AuthorizedAction[Authorization](oauth).async(parse.text)({
        implicit request: AuthInfoRequest[String, Authorization] =>
          // Bodies sent by hand often end with a newline.
          val url = request.body.trim
          _findParser(url).fold(
            Future.successful(NotFound(s"No parser available for $url."))
          ) { parser =>
            // The fetch is blocking network I/O: run it off the calling thread
            // and mark it so the pool can compensate.
            Future(scala.concurrent.blocking {
              val doc = _browser.get(url)
              (
                doc >> parser.titleExtractor,
                doc >> parser.servingExtractor,
                doc >> parser.ingredientExtractor
              )
            }).flatMap { case (title, servings, ingredients) =>
              Future.sequence(ingredients.map {
                case (amt, u, line) =>
                  _guessFoodFromStr(line).map(Ingredient(_, amt, u))
              }).map { resolved =>
                Ok(Json.toJson(RecipeNodeNoId(
                  title,
                  servings.getOrElse(1.0f),
                  1.0f,
                  UnitType.NUMBER,
                  resolved.toSeq,
                  /* tflucke@[2023-10-26]: Do not pass along the instructions
                   * since this could be a violation of the Recipe Author's
                   * copyright. */
                  Nil,
                  None,
                  None,
                  Some(url),
                  None
                )))
              }
            }
          }
      })

    /**
     * Looks up the parser registered for the host of `url`.
     *
     * The host is compared case-insensitively, ignoring a leading "www." and
     * any port or user-info part.
     *
     * @param url the address supplied by the client
     * @return the matching parser, or `None` when the URL cannot be parsed, has
     *         no host, or the host is not supported
     */
    private def _findParser(url: String): Option[Parser] =
      Try(new java.net.URL(url))
        .toOption
        .flatMap(parsed => Option(parsed.getHost))
        .map(_.toLowerCase(java.util.Locale.ROOT).stripPrefix("www."))
        .flatMap(_parsersByHost.get)

    /**
     * Resolves a free-text ingredient line to a food identifier.
     *
     * The line is reduced to ASCII without ':' and '/' and sent to the USDA
     * search (Foundation and SR Legacy data types, ten results). The first
     * result that `foodController.getByFdcId` knows wins; otherwise the first
     * USDA hit is used as-is.
     *
     * @param foodLine ingredient text as scraped from the page
     * @return the resolved id; the future fails with `NoSuchElementException`
     *         when the search returns no foods at all. Any failure is logged
     *         to stdout and passed on unchanged.
     */
    private def _guessFoodFromStr(
      foodLine: String
    ): Future[Ingredient.IngredientId] = {
      import gov.usda.nal.fdc.models.DataType._
      val foodLineFiltered =
        foodLine.filter(c => c <= 0x7f && c != ':' && c != '/')
      usdaController.fdc
        .getFoodsSearch(
          foodLineFiltered,
          Seq(
            // Branded,
            Foundation, SRLegacy
          ),
          pageSize = Some(10)
        )()
        .flatMap { fdcResult =>
          Future
            .sequence(
              fdcResult.foods.map(food => foodController.getByFdcId(food.fdcId))
            )
            .map(_.flatten.headOption.fold[Ingredient.IngredientId](
              // No local node for any hit: fall back to the first USDA hit.
              fdcResult.foods.headOption
                .map(food => Ingredient.USDAId(food.fdcId))
                .getOrElse(throw new NoSuchElementException(
                  s"No USDA food found for '$foodLineFiltered'"
                ))
            )(foodNode => Ingredient.FoodNodeId(foodNode._id)))
        }
        .andThen {
          case Failure(x) => println(s"Food lookup failed: $x")
        }
    }
  }
  /**
   * Set of scala-scraper extractors describing how to read one recipe page
   * layout.
   *
   * @param titleExtractor       yields the recipe title
   * @param servingExtractor     yields the serving count, if one can be read
   * @param prepTimeExtractor    optional extractor for the preparation time text
   * @param cookTimeExtractor    optional extractor for the cooking time text
   * @param ingredientExtractor  yields one (amount, unit, text) triple per
   *                             ingredient
   * @param instructionExtractor yields the instruction steps as text
   */
  case class Parser(
    titleExtractor: HtmlExtractor[Element, String],
    servingExtractor: HtmlExtractor[Element, Option[Float]],
    prepTimeExtractor: Option[HtmlExtractor[Element, String]],
    cookTimeExtractor: Option[HtmlExtractor[Element, String]],
    ingredientExtractor: HtmlExtractor[Element, Iterable[(Float, MeasureUnit, String)]],
    instructionExtractor: HtmlExtractor[Element, Iterable[String]]
  )
  /**
   * Site-specific [[Parser]] definitions and the text helpers they share.
   *
   * All CSS selectors are tied to the markup of the respective sites.
   */
  object Parser {

    // Patterns are compiled once here rather than on every parsed line.

    /** First run of digits anywhere in the text. */
    private val LeadingNumber = raw"\D*(\d+).*".r
    private val YieldCount = raw"Yield: \D*(\d+).*".r
    private val ActiveMinutes = raw"Active: \D*(\d+).*".r

    /** "3/4", optionally followed by a range tail such as "-1". */
    private val BareFraction = raw"(\d+)/(\d+)[\d-_]*".r
    /** "1 1/2": whole number, whitespace, fraction. */
    private val BareMixedFraction = raw"(\d+)\s+(\d+)/(\d+)[\d-_]*".r

    /** "2 cups flour" (also "2-3 cups flour", of which only the 2 is used). */
    private val NumberLine = raw"(\d+)[\d-_]*\s+(\w+)\s+(.+)".r
    /** "3/4 cup sugar". */
    private val FractionLine = raw"(\d+)/(\d+)[\d-_]*\s+(\w+)\s+(.+)".r
    /** "1 1/2 cups milk". */
    private val MixedFractionLine = raw"(\d+)\s+(\d+)/(\d+)\s+(\w+)\s+(.+)".r

    /** Unicode vulgar-fraction glyphs and their ASCII spelling. */
    private val VulgarFractions: Seq[(String, String)] = Seq(
      "\u00BD" -> "1/2",
      "\u00BC" -> "1/4",
      "\u00BE" -> "3/4",
      "\u2150" -> "1/7",
      "\u2151" -> "1/9",
      "\u2152" -> "1/10",
      "\u2153" -> "1/3",
      "\u2154" -> "2/3",
      "\u2155" -> "1/5",
      "\u2156" -> "2/5",
      "\u2157" -> "3/5",
      "\u2158" -> "4/5",
      "\u2159" -> "1/6",
      "\u215A" -> "5/6",
      "\u215B" -> "1/8",
      "\u215C" -> "3/8",
      "\u215D" -> "5/8",
      "\u215E" -> "7/8",
      "\u215F" -> "1/"
    )

    /**
     * Rewrites vulgar-fraction glyphs as ASCII fractions, maps n-with-tilde to
     * a plain "n" and trims the result.
     *
     * Each fraction is inserted with a leading space so that a whole number
     * written directly before the glyph stays a separate token ("1 1/2")
     * instead of fusing with the numerator ("11/2").
     */
    private def _normalizeIngredientLine(line: String): String =
      VulgarFractions
        .foldLeft(line) { case (acc, (glyph, ascii)) =>
          acc.replace(glyph, " " + ascii)
        }
        .replace("\u00F1", "n")
        .trim

    val mccormick: Parser = Parser(
      titleExtractor = text("h1"),
      // TODO use extractors
      servingExtractor = text(".main-title .count").map(_.toFloatOption),
      prepTimeExtractor = Some(text(".prep_time .first_content")),
      // NOTE(review): this selects inside ".ingredients", not a cook-time
      // element - confirm against the site's markup.
      cookTimeExtractor = Some(text(".ingredients .first_content")),
      ingredientExtractor =
        texts(".recipe-about-list li").map(_.map(_parseIngredient _)),
      instructionExtractor = texts(".instructions-main span.para")
    )

    val epicurious: Parser = Parser(
      titleExtractor = text("h1"),
      servingExtractor = text("""div[data-testid="IngredientList"] > p""")
        .map(line => YieldCount.findFirstMatchIn(line).map(_.group(1).toFloat)),
      prepTimeExtractor = None,
      cookTimeExtractor = None,
      ingredientExtractor =
        texts("""div[data-testid="IngredientList"] > div > div""")
          .map(_.map(_parseIngredient _)),
      instructionExtractor =
        texts("""div[data-testid="InstructionsWrapper"] > ol > li > p""")
    )

    /** Selectors use the "wprm-recipe-*" class names; shared by several hosts. */
    val recipeTinEats: Parser = Parser(
      titleExtractor = text("h2.wprm-recipe-name"),
      servingExtractor = text("span.wprm-recipe-servings").map(_.toFloatOption),
      prepTimeExtractor = Some(text("span.wprm-recipe-prep_time-minutes")),
      cookTimeExtractor = Some(text("span.wprm-recipe-cook_time-minutes")),
      ingredientExtractor = texts("li.wprm-recipe-ingredient")
        .map(_.map(_normalizeIngredientLine _))
        .map(_.map(_parseIngredient _)),
      instructionExtractor = texts("div.wprm-recipe-instruction-text")
    )

    /** Selectors use the "tasty-recipes-*" class names; shared by several hosts. */
    val tastyRecipes: Parser = Parser(
      titleExtractor = text("h2.tasty-recipes-title"),
      servingExtractor = text("span.tasty-recipes-yield")
        .map(line => LeadingNumber.findFirstMatchIn(line).map(_.group(1).toFloat)),
      prepTimeExtractor = Some(text("span.tasty-recipes-prep-time")),
      cookTimeExtractor = Some(text("span.tasty-recipes-cook-time")),
      ingredientExtractor =
        elementList("div.tasty-recipes-ingredients-body > ul > li").map(
          _.map { listItem =>
            // Amount and unit are read from attributes of the item's last
            // <span>. lastOption keeps an item without spans from throwing.
            val lastSpan: Option[Element] =
              (listItem >?> elementList("span")).flatMap(_.lastOption)
            (
              lastSpan
                .flatMap(span => span >?> attr("data-amount"))
                .flatMap(_.toFloatOption)
                .getOrElse(0.0f),
              lastSpan.fold[MeasureUnit](Gram)(span =>
                (span >?> attr("data-unit"))
                  .flatMap(MeasureUnit.guessUnit _)
                  .getOrElse(Count)
              ),
              (listItem >?> text("strong")).getOrElse(listItem.ownText)
            )
          }
        ),
      instructionExtractor =
        texts("div.tasty-recipes-instructions-body > ol > li")
    )

    val seriousEats: Parser = Parser(
      titleExtractor = text("h2.recipe-decision-block__title"),
      servingExtractor = text("div.recipe-serving > span > span.meta-text__data")
        .map(line => LeadingNumber.findFirstMatchIn(line).map(_.group(1).toFloat)),
      //text("div.recipe-yield > span > span.meta-text__data")
      prepTimeExtractor = Some(text("div.prep-time > span > span.meta-text__data")),
      cookTimeExtractor = None, //Some(text("span.tasty-recipes-cook-time")),
      ingredientExtractor =
        elementList("ul.structured-ingredients__list > li > p").map(
          _.map { p =>
            val spans = (p >?> elementList("span")).getOrElse(Nil)
            // Spans of this paragraph that carry the given attribute.
            def spansWith(attribute: String) =
              spans.filter(s => (s >?> attr(attribute)).isDefined)
            (
              spansWith("data-ingredient-quantity").lastOption
                .map(_ >> text)
                .flatMap(_parseFraction _)
                .getOrElse(0.0f),
              spansWith("data-ingredient-unit").lastOption
                .map(_ >> text)
                .flatMap(MeasureUnit.guessUnit _)
                .getOrElse(Count),
              spansWith("data-ingredient-name").headOption.getOrElse(p).ownText
            )
          }
        ),
      instructionExtractor =
        texts("div.structured-project__steps_1-0 > ol > li > p")
    )

    val greatist: Parser = Parser(
      titleExtractor = text("h1"),
      // A page without a "Yield" item gives None instead of throwing.
      servingExtractor = elementList("article.article-body > ul > li").map(
        _.find(li => (li >?> text("strong")).contains("Yield"))
          .map(_ >> text)
          .flatMap(line =>
            YieldCount.findFirstMatchIn(line).map(_.group(1).toFloat)
          )
      ),
      // tflucke@[2023-11-28]: TODO They don't give passive, only Active + Total
      prepTimeExtractor = None,
      cookTimeExtractor = Some(
        elementList("article.article-body > ul > li").map { items =>
          // The result type is a plain String, so a missing "Active" item is
          // reported by throwing, as before.
          val activeLine = items
            .find(li => (li >?> text("strong")).contains("Active"))
            .map(_ >> text)
            .getOrElse(throw new NoSuchElementException(
              "No \"Active\" list item found"
            ))
          ActiveMinutes.findFirstMatchIn(activeLine).fold("0")(_.group(1))
        }
      ),
      ingredientExtractor = elementList("article.article-body > ul > li").map(
        // Items with a <strong> label are metadata (Yield, Active, ...).
        _.filter(li => (li >?> text("strong")).isEmpty)
          .map(_ >> text)
          .map(_.replace("\u00F1", "n"))
          .map(_parseIngredient _)
      ),
      instructionExtractor = texts("article.article-body > ol > li")
    )

    val dimitrasDishes: Parser = Parser(
      titleExtractor = text("h2.mv-create-title-primary"),
      servingExtractor =
        text("div.mv-create-time-yield > span").map(_.toFloatOption),
      prepTimeExtractor = None,
      cookTimeExtractor = None,
      ingredientExtractor = texts("div.mv-create-ingredients > ul > li").map(
        _.map(line =>
          // Drop the standalone word "and" ("1 and 1/2 cups") without touching
          // words that merely contain it ("brandy").
          _normalizeIngredientLine(line.replaceAll(raw"\band\b", ""))
        ).map(_parseIngredient _)
      ),
      instructionExtractor = texts("div.mv-create-instructions > ol > li")
    )

    /**
     * Parses a quantity written as "3/4", "1 1/2" or a plain number.
     *
     * @param fractionLine the quantity text; surrounding whitespace is ignored
     * @return the value, or `None` when the text is not a number or the
     *         fraction has a zero denominator
     */
    private def _parseFraction(fractionLine: String): Option[Float] =
      fractionLine.trim match {
        case BareMixedFraction(whole, numerator, denominator)
            if denominator.toFloat != 0.0f =>
          Some(whole.toFloat + numerator.toFloat / denominator.toFloat)
        case BareFraction(numerator, denominator)
            if denominator.toFloat != 0.0f =>
          Some(numerator.toFloat / denominator.toFloat)
        case other => other.toFloatOption
      }

    /**
     * Splits an ingredient line of the form "amount unit rest" into its parts.
     *
     * The amount may be a whole number, a fraction or a mixed fraction. A unit
     * word that `MeasureUnit.guessUnit` does not recognise becomes `Count`.
     * Lines that fit none of the shapes (including a zero denominator) are
     * returned whole as `(1, Count, line)`.
     *
     * @param ingredientLine one ingredient as plain text
     * @return (amount, unit, remaining text)
     */
    private def _parseIngredient(
      ingredientLine: String
    ): (Float, MeasureUnit, String) = {
      def unitOf(word: String): MeasureUnit =
        MeasureUnit.guessUnit(word).getOrElse(Count)

      ingredientLine match {
        case MixedFractionLine(whole, numerator, denominator, unit, rest)
            if denominator.toFloat != 0.0f =>
          (
            whole.toFloat + numerator.toFloat / denominator.toFloat,
            unitOf(unit),
            rest
          )
        case FractionLine(numerator, denominator, unit, rest)
            if denominator.toFloat != 0.0f =>
          (numerator.toFloat / denominator.toFloat, unitOf(unit), rest)
        case NumberLine(amount, unit, rest) =>
          (amount.toFloat, unitOf(unit), rest)
        case noUnitLine =>
          (1.0f, Count, noUnitLine)
      }
    }
  }