ParserController.scala 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486
  1. package com.weEat.controllers
  2. import com.weEat.shared.models._
  3. import javax.inject.{Inject,Singleton}
  4. import play.api.libs.json._
  5. import play.api.mvc._
  6. import scala.concurrent.Future
  7. import com.weEat.models.Authorization
  8. import scalaoauth2.provider.{AuthInfoRequest,OAuth2ProviderActionBuilders}
  9. import com.weEat.services.OAuth2Service
  10. import net.ruippeixotog.scalascraper.browser.JsoupBrowser
  11. import net.ruippeixotog.scalascraper.dsl.DSL._
  12. import net.ruippeixotog.scalascraper.dsl.DSL.Extract._
  13. //import net.ruippeixotog.scalascraper.dsl.DSL.Parse._
  14. import net.ruippeixotog.scalascraper.model.{Document,Element}
  15. import net.ruippeixotog.scalascraper.scraper.HtmlExtractor
  16. import scala.util._
  17. import scala.concurrent.ExecutionContext
  18. @Singleton
  19. class ParserController @Inject()(
  20. val controllerComponents: ControllerComponents,
  21. oauth: OAuth2Service,
  22. usdaController: USDAController,
  23. foodController: FoodController
  24. ) extends BaseController
  25. with OAuth2ProviderActionBuilders {
  26. implicit val ec = scala.concurrent.ExecutionContext.global
  27. private val _browser = JsoupBrowser()
  28. def parseURL() = AuthorizedAction[Authorization](oauth).async(parse.text)({ implicit request: AuthInfoRequest[String, Authorization] =>
  29. val url = request.body
  30. val host = new java.net.URL(url).getAuthority()
  31. val hostNoWWW =
  32. if (host.startsWith("www.")) host.substring("www.".length) else host
  33. val doc = _browser.get(url)
  34. Parser(usdaController, foodController)(hostNoWWW, doc).transformWith {
  35. case Success(Some(parser)) => parser(usdaController, foodController)(doc, url)
  36. .map((food) => Ok(Json.toJson(food)))
  37. case Success(None) => Future.successful(NotFound(f"No parser found for host $hostNoWWW"))
  38. case Failure(e) => throw e
  39. }
  40. })
  41. }
  42. case class Parser(
  43. titleExtractor: HtmlExtractor[Element, String],
  44. servingExtractor: HtmlExtractor[Element, Option[Float]],
  45. prepTimeExtractor: Option[HtmlExtractor[Element, String]],
  46. cookTimeExtractor: Option[HtmlExtractor[Element, String]],
  47. ingredientExtractor: HtmlExtractor[Element, Iterable[(Float, MeasureUnit, String)]],
  48. instructionExtractor: HtmlExtractor[Element, Iterable[String]],
  49. )(usdaC: USDAController, foodC: FoodController) {
  50. implicit val ec = scala.concurrent.ExecutionContext.global
  51. def apply(doc: Document, url: String): Future[RecipeNodeNoId] = {
  52. Future({
  53. val title = doc >> titleExtractor
  54. val servings = (doc >?> servingExtractor).flatten
  55. val prepTime = prepTimeExtractor.flatMap(doc >?> _)
  56. val cookTime = cookTimeExtractor.flatMap(doc >?> _)
  57. val ingredients = doc >> ingredientExtractor
  58. val instructions = doc >> instructionExtractor
  59. Future.sequence(ingredients.map({
  60. case (amt, u, line) => _guessFoodFromStr(line).map(Ingredient(_, amt, u))
  61. }))
  62. .map((ingredients) => RecipeNodeNoId(
  63. title,
  64. servings.getOrElse(1.0f),
  65. 1.0f,
  66. UnitType.NUMBER,
  67. ingredients.toSeq,
  68. /* tflucke@[2023-10-26]: Do not pass along the instructions since this
  69. * could be a violation of the Recipe Author's copyright. */
  70. Nil, //instructions.toSeq,
  71. None,
  72. None,
  73. Some(url),
  74. None
  75. ))
  76. }).flatten
  77. }
  78. private def _guessFoodFromStr(
  79. foodLine: String
  80. ): Future[Ingredient.IngredientId] = {
  81. val foodLineFiltered = foodLine
  82. .filter(_ <= 0x7f)
  83. .filterNot(Set.from("!:/-").contains)
  84. searchFdcIndex(foodLineFiltered).transformWith {
  85. case Success(Some(ingredientId)) => Future.successful(ingredientId)
  86. case Success(None) => searchSelfIndex(foodLineFiltered)
  87. case Failure(e) => Future.failed(e)
  88. }
  89. }
  90. def searchFdcIndex(foodLine: String): Future[Option[Ingredient.IngredientId]] = {
  91. import gov.usda.nal.fdc.models.DataType._
  92. import gov.usda.nal.fdc.models.SearchResult
  93. usdaC.fdc.getFoodsSearch(foodLine, Seq(
  94. // Branded,
  95. Foundation, SRLegacy
  96. ), pageSize = Some(10))().flatMap({
  97. case SearchResult(_, _, _, _, Nil) => Future.successful(None)
  98. case SearchResult(_, _, _, _, foods) =>
  99. Future.sequence(
  100. foods.map((food) => foodC.getByFdcId(food.fdcId))
  101. ).map(_.flatten
  102. .headOption
  103. .fold[Ingredient.IngredientId](
  104. Ingredient.USDAId(foods.head.fdcId)
  105. )((foodNode) => Ingredient.FoodNodeId(foodNode._id))
  106. ).map(Some(_))
  107. }).recover {
  108. case e: com.tflucke.webroutes.HTTPException if e.statusCode == 500 =>
  109. println(s"USDA database failed to parse line: '$foodLine'")
  110. throw e
  111. }
  112. }
  113. def searchSelfIndex(foodLine: String): Future[Ingredient.IngredientId] = {
  114. foodC.findByName(foodLine)
  115. .transform {
  116. case Success(Nil) =>
  117. Failure(new NoSuchElementException(foodLine))
  118. case Success(foodNode::rest) =>
  119. Success(Ingredient.FoodNodeId(foodNode._id))
  120. case Success(_) => ???
  121. case Failure(e) => Failure(e)
  122. }
  123. }
  124. }
  125. object Parser {
  126. type ParserFn = (USDAController, FoodController) => Parser
  127. private val knownParsers = Map(
  128. ("epicurious.com" -> Parser.epicurious),
  129. ("mccormick.com" -> Parser.mccormick),
  130. ("recipetineats.com" -> Parser.recipeTinEats),
  131. ("mamalovestocook.com" -> Parser.recipeTinEats),
  132. ("soulfullymade.com" -> Parser.recipeTinEats),
  133. ("familycookierecipes.com" -> Parser.recipeTinEats),
  134. ("familyfreshmeals.com" -> Parser.recipeTinEats),
  135. ("handmadefarmhouse.com" -> Parser.recipeTinEats),
  136. ("tastesoflizzyt.com" -> Parser.recipeTinEats),
  137. ("omnivorescookbook.com" -> Parser.recipeTinEats),
  138. ("growforagecookferment.com" -> Parser.recipeTinEats),
  139. ("joyfoodsunshine.com" -> Parser.recipeTinEats),
  140. ("sallysbakingaddiction.com" -> Parser.tastyRecipes),
  141. ("darngoodveggies.com" -> Parser.tastyRecipes),
  142. ("pickledplum.com" -> Parser.tastyRecipes),
  143. ("iheartvegetables.com" -> Parser.tastyRecipes),
  144. ("seriouseats.com" -> Parser.seriousEats),
  145. ("greatist.com" -> Parser.greatist),
  146. ("dimitrasdishes.com" -> Parser.dimitrasDishes),
  147. ("jif.com" -> Parser.jif),
  148. ("kingarthurbaking.com" -> Parser.kingArthurBaking)
  149. )
  150. private val frequentParsers = Seq.from(
  151. knownParsers.values.groupMapReduce(p => p)(_ => 1)((a, b) => a + b)
  152. ).sortBy({ case (_, c) => -c })
  153. .map(p => p._1)
  154. def apply(usdaC: USDAController, foodC: FoodController)(
  155. url: String,
  156. doc: Document
  157. )(implicit ec: ExecutionContext): Future[Option[ParserFn]] = {
  158. knownParsers.get(url).fold[Future[Option[ParserFn]]](Future.find(frequentParsers.map { (pfn) =>
  159. // convert the ParserFn to a Future[ParserFn] based on if it's a match
  160. pfn(usdaC, foodC)(doc, url)
  161. .filter(_.ingredients.size > 0)
  162. .map((_) => pfn)
  163. })(_ => true)) { (parser) =>
  164. Future.successful(Some(parser))
  165. }
  166. }
  167. def mccormick: ParserFn = Parser(
  168. text("h1"),
  169. // TODO use extractors
  170. text(".main-title .count").map(_.toFloatOption),
  171. Some(text(".prep_time .first_content")),
  172. cookTimeExtractor = Some(text(".ingredients .first_content")),
  173. ingredientExtractor = texts(".recipe-about-list li").map(
  174. _.map(_parseIngredient _)
  175. ),
  176. texts(".instructions-main span.para")
  177. ) _
  178. def epicurious: ParserFn = Parser(
  179. text("h1"),
  180. text("""div[data-testid="IngredientList"] > p""")
  181. .map("Yield: \\D*(\\d+).*".r.findFirstMatchIn(_).map(_.group(1).toFloat)),
  182. None,
  183. None,
  184. texts("""div[data-testid="IngredientList"] > div > div""").map(
  185. _.map(_parseIngredient _)
  186. ),
  187. texts("""div[data-testid="InstructionsWrapper"] > ol > li > p""")
  188. ) _
  189. def recipeTinEats: ParserFn = Parser(
  190. text("h2.wprm-recipe-name"),
  191. text("span.wprm-recipe-servings").map(_.toFloatOption),
  192. Some(text("span.wprm-recipe-prep_time-minutes")),
  193. Some(text("span.wprm-recipe-cook_time-minutes")),
  194. elementList("li.wprm-recipe-ingredient").map(_.map({ (li) => (
  195. (li >?> text("span.wprm-recipe-ingredient-amount")
  196. .map(_
  197. .replaceAll("\u00BD", "1/2")
  198. .replaceAll("\u00BC", "1/4")
  199. .replaceAll("\u00BE", "3/4")
  200. .replaceAll("\u2150", "1/7")
  201. .replaceAll("\u2151", "1/9")
  202. .replaceAll("\u2152", "1/10")
  203. .replaceAll("\u2153", "1/3")
  204. .replaceAll("\u2154", "2/3")
  205. .replaceAll("\u2155", "1/5")
  206. .replaceAll("\u2156", "2/5")
  207. .replaceAll("\u2157", "3/5")
  208. .replaceAll("\u2158", "4/5")
  209. .replaceAll("\u2159", "1/6")
  210. .replaceAll("\u215A", "5/6")
  211. .replaceAll("\u215B", "1/8")
  212. .replaceAll("\u215C", "3/8")
  213. .replaceAll("\u215D", "5/8")
  214. .replaceAll("\u215E", "7/8")
  215. .replaceAll("\u215F", "1/")
  216. ))
  217. .flatMap(_parseFraction _)
  218. .getOrElse(0.0f),
  219. (li >?> text("span.wprm-recipe-ingredient-unit"))
  220. .flatMap(MeasureUnit.guessUnit _)
  221. .getOrElse(Count),
  222. li >> text("span.wprm-recipe-ingredient-name")
  223. .map(_.replaceAll("\u00F1", "n"))
  224. ) })),
  225. texts("div.wprm-recipe-instruction-text")
  226. ) _
  227. def tastyRecipes: ParserFn = Parser(
  228. text("h2.tasty-recipes-title"),
  229. text("span.tasty-recipes-yield")
  230. .map("\\D*(\\d+).*".r.findFirstMatchIn(_).map(_.group(1).toFloat)),
  231. Some(text("span.tasty-recipes-prep-time")),
  232. Some(text("span.tasty-recipes-cook-time")),
  233. elementList("div.tasty-recipes-ingredients-body > ul > li").map(
  234. _.map({(listItem) => (
  235. ((listItem >?> elementList("span"))
  236. .flatMap(_.lastOption)
  237. .fold(0.0f)((elm: Element) =>
  238. (elm >?> attr("data-amount"))
  239. .fold(0.0f)(_.toFloat)
  240. )
  241. ),
  242. (listItem >?> elementList("span"))
  243. .flatMap(_.lastOption)
  244. .fold[MeasureUnit](Gram)((elm: Element) =>
  245. (elm >?> attr("data-unit"))
  246. .flatMap(MeasureUnit.guessUnit _)
  247. .getOrElse(Count)
  248. ),
  249. (listItem >?> text("strong"))
  250. .filterNot(_.contains("optional"))
  251. .getOrElse(listItem.ownText)
  252. )})
  253. ),
  254. texts("div.tasty-recipes-instructions-body > ol > li")
  255. ) _
  256. def seriousEats: ParserFn = Parser(
  257. text("h2.recipe-decision-block__title"),
  258. text("div.recipe-serving > span > span.meta-text__data")
  259. .map("\\D*(\\d+).*".r.findFirstMatchIn(_).map(_.group(1).toFloat)),
  260. //text("div.recipe-yield > span > span.meta-text__data")
  261. Some(text("div.prep-time > span > span.meta-text__data")),
  262. None, //Some(text("span.tasty-recipes-cook-time")),
  263. elementList("ul.structured-ingredients__list > li > p").map(
  264. _.map({(p) => (
  265. ((p >?> elementList("span"))
  266. .flatMap(_
  267. .filter((s) => (s >?> attr("data-ingredient-quantity")).isDefined)
  268. .lastOption
  269. .map(_ >> text)
  270. ).flatMap(_parseFraction _)
  271. .getOrElse(0.0f)
  272. ),
  273. ((p >?> elementList("span"))
  274. .flatMap(_
  275. .filter((s) => (s >?> attr("data-ingredient-unit")).isDefined)
  276. .lastOption
  277. .map(_ >> text)
  278. ).flatMap(MeasureUnit.guessUnit _)
  279. .getOrElse(Count)
  280. ),
  281. ((p >?> elementList("span"))
  282. .flatMap(_
  283. .filter((s) => (s >?> attr("data-ingredient-name")).isDefined)
  284. .headOption
  285. ).getOrElse(p).ownText
  286. )
  287. )})
  288. ),
  289. texts("div.structured-project__steps_1-0 > ol > li > p")
  290. ) _
  291. def greatist: ParserFn = Parser(
  292. text("h1"),
  293. elementList("article.article-body > ul > li").map(
  294. _.filter((listItem) => (listItem >?> text("strong")) == Some("Yield"))
  295. .map(_ >> text)
  296. .head
  297. ).map("Yield: \\D*(\\d+).*".r.findFirstMatchIn(_).map(_.group(1).toFloat)),
  298. // tflucke@[2023-11-28]: TODO They don't give passive, only Active + Total
  299. None,
  300. Some(
  301. elementList("article.article-body > ul > li").map(
  302. _.filter((listItem) => (listItem >?> text("strong")) == Some("Active"))
  303. .map(_ >> text)
  304. .head
  305. ).map("Active: \\D*(\\d+).*".r.findFirstMatchIn(_).fold("0")(_.group(1)))
  306. ),
  307. elementList("article.article-body > ul > li").map(_
  308. .filter((listItem) => (listItem >?> text("strong")) == None)
  309. .map(_ >> text)
  310. .map(_.replaceAll("\u00F1", "n"))
  311. .map(_parseIngredient _)
  312. ),
  313. texts("article.article-body > ol > li")
  314. ) _
  315. def dimitrasDishes: ParserFn = Parser(
  316. text("h2.mv-create-title-primary"),
  317. text("div.mv-create-time-yield > span").map(_.toFloatOption),
  318. None,
  319. None,
  320. texts("div.mv-create-ingredients > ul > li").map(
  321. _.map(_
  322. .replace("and", "")
  323. .replaceAll("\u00BD", "1/2")
  324. .replaceAll("\u00BC", "1/4")
  325. .replaceAll("\u00BE", "3/4")
  326. .replaceAll("\u2150", "1/7")
  327. .replaceAll("\u2151", "1/9")
  328. .replaceAll("\u2152", "1/10")
  329. .replaceAll("\u2153", "1/3")
  330. .replaceAll("\u2154", "2/3")
  331. .replaceAll("\u2155", "1/5")
  332. .replaceAll("\u2156", "2/5")
  333. .replaceAll("\u2157", "3/5")
  334. .replaceAll("\u2158", "4/5")
  335. .replaceAll("\u2159", "1/6")
  336. .replaceAll("\u215A", "5/6")
  337. .replaceAll("\u215B", "1/8")
  338. .replaceAll("\u215C", "3/8")
  339. .replaceAll("\u215D", "5/8")
  340. .replaceAll("\u215E", "7/8")
  341. .replaceAll("\u215F", "1/")
  342. .replaceAll("\u00F1", "n")
  343. .trim
  344. ).map(_parseIngredient _)
  345. ),
  346. texts("div.mv-create-instructions > ol > li")
  347. ) _
  348. def jif: ParserFn = Parser(
  349. text("h1.recipe-name"),
  350. elementList("div.recipe-breakdown-step").map(
  351. _.filter((listItem) => (listItem >?> text("i.servings")).isDefined)
  352. .map(_ >> text("span.recipe-breakdown-detail"))
  353. .head
  354. ).map(_.toFloatOption),
  355. Some(elementList("div.recipe-breakdown-step").map(
  356. _.filter((listItem) => (listItem >?> text("i.prep")).isDefined)
  357. .map(_ >> text("span.recipe-breakdown-detail"))
  358. .head
  359. )),
  360. Some(elementList("div.recipe-breakdown-step").map(
  361. _.filter((listItem) => (listItem >?> text("i.cook")).isDefined)
  362. .map(_ >> text("span.recipe-breakdown-detail"))
  363. .head
  364. )),
  365. texts("div.recipe-ingredients > ul > li")
  366. .map(_.map(_
  367. .replaceAll("\u00BD", "1/2")
  368. .replaceAll("\u00BC", "1/4")
  369. .replaceAll("\u00BE", "3/4")
  370. .replaceAll("\u2150", "1/7")
  371. .replaceAll("\u2151", "1/9")
  372. .replaceAll("\u2152", "1/10")
  373. .replaceAll("\u2153", "1/3")
  374. .replaceAll("\u2154", "2/3")
  375. .replaceAll("\u2155", "1/5")
  376. .replaceAll("\u2156", "2/5")
  377. .replaceAll("\u2157", "3/5")
  378. .replaceAll("\u2158", "4/5")
  379. .replaceAll("\u2159", "1/6")
  380. .replaceAll("\u215A", "5/6")
  381. .replaceAll("\u215B", "1/8")
  382. .replaceAll("\u215C", "3/8")
  383. .replaceAll("\u215D", "5/8")
  384. .replaceAll("\u215E", "7/8")
  385. .replaceAll("\u215F", "1/")
  386. .replaceAll("\u00F1", "n")
  387. .trim
  388. ))
  389. .map(_.map(_parseIngredient _)),
  390. texts("div.recipe-directions > ul > li > p")
  391. ) _
  392. def kingArthurBaking: ParserFn = Parser(
  393. text("h1 > span"),
  394. text("div.stat__item--yield > span").map(_.toFloatOption),
  395. Some(text("div.stat__item--prep > span")),
  396. Some(text("div.stat__item--bake > span")),
  397. texts("div.ingredient-section > ul > li")
  398. .map(_.map(_
  399. .replaceAll("\u00BD", "1/2")
  400. .replaceAll("\u00BC", "1/4")
  401. .replaceAll("\u00BE", "3/4")
  402. .replaceAll("\u2150", "1/7")
  403. .replaceAll("\u2151", "1/9")
  404. .replaceAll("\u2152", "1/10")
  405. .replaceAll("\u2153", "1/3")
  406. .replaceAll("\u2154", "2/3")
  407. .replaceAll("\u2155", "1/5")
  408. .replaceAll("\u2156", "2/5")
  409. .replaceAll("\u2157", "3/5")
  410. .replaceAll("\u2158", "4/5")
  411. .replaceAll("\u2159", "1/6")
  412. .replaceAll("\u215A", "5/6")
  413. .replaceAll("\u215B", "1/8")
  414. .replaceAll("\u215C", "3/8")
  415. .replaceAll("\u215D", "5/8")
  416. .replaceAll("\u215E", "7/8")
  417. .replaceAll("\u215F", "1/")
  418. .replaceAll("\u00F1", "n")
  419. .trim
  420. ))
  421. .map(_.map(_parseIngredient _)),
  422. texts("div.field field--recipe-steps > ol > li > p")
  423. ) _
  424. private def _parseFraction(fractionLine: String) = {
  425. val fractionPattern = raw"(\d+)/(\d+)[\d-_]*".r
  426. val mixedFractionPattern = raw"(\d+)\w+(\d+)/(\d+)[\d-_]*".r
  427. fractionLine match {
  428. case fractionPattern(numerator, denominator) =>
  429. Some(numerator.toFloat/denominator.toFloat)
  430. case mixedFractionPattern(whole, numerator, denominator) =>
  431. Some(whole.toFloat + numerator.toFloat/denominator.toFloat)
  432. case _ => fractionLine.toFloatOption
  433. }
  434. }
  435. private def _parseIngredient(
  436. ingredientLine: String
  437. ): (Float, MeasureUnit, String) = {
  438. val numberPattern = raw"(\d+)[\d-_]*\s(\w+)\s+(.+)".r
  439. val fractionPattern = raw"(\d+)/(\d+)[\d-_]*\s(\w+)\s+(.+)".r
  440. val mixedFractionPattern = raw"(\d+)\w+(\d+)/(\d+)\s(\w+)\s+(.+)".r
  441. ingredientLine match {
  442. case mixedFractionPattern(whole, numerator, denominator, unit, rest) =>
  443. (
  444. whole.toFloat + numerator.toFloat/denominator.toFloat,
  445. MeasureUnit.guessUnit(unit).getOrElse(Count),
  446. rest
  447. )
  448. case fractionPattern(numerator, denominator, unit, rest) =>
  449. (
  450. numerator.toFloat/denominator.toFloat,
  451. MeasureUnit.guessUnit(unit).getOrElse(Count),
  452. rest
  453. )
  454. case numberPattern(amount, unit, rest) =>
  455. (amount.toFloat, MeasureUnit.guessUnit(unit).getOrElse(Count), rest)
  456. case noUnitLine =>
  457. (1, Count, noUnitLine)
  458. }
  459. }
  460. }