// ParserController.scala
  1. package com.weEat.controllers
  2. import com.weEat.shared.models._
  3. import javax.inject.{Inject,Singleton}
  4. import play.api.libs.json._
  5. import play.api.mvc._
  6. import scala.concurrent.Future
  7. import com.weEat.models.Authorization
  8. import scalaoauth2.provider.{AuthInfoRequest,OAuth2ProviderActionBuilders}
  9. import com.weEat.services.OAuth2Service
  10. import net.ruippeixotog.scalascraper.browser.JsoupBrowser
  11. import net.ruippeixotog.scalascraper.dsl.DSL._
  12. import net.ruippeixotog.scalascraper.dsl.DSL.Extract._
  13. //import net.ruippeixotog.scalascraper.dsl.DSL.Parse._
  14. import net.ruippeixotog.scalascraper.model.Element
  15. import net.ruippeixotog.scalascraper.scraper.HtmlExtractor
  16. import scala.util._
  17. @Singleton
  18. class ParserController @Inject()(
  19. val controllerComponents: ControllerComponents,
  20. oauth: OAuth2Service,
  21. usdaController: USDAController,
  22. foodController: FoodController
  23. ) extends BaseController
  24. with OAuth2ProviderActionBuilders {
  25. implicit val ec = scala.concurrent.ExecutionContext.global
  26. private val _browser = JsoupBrowser()
  27. def parseURL() = AuthorizedAction[Authorization](oauth).async(parse.text)({ implicit request: AuthInfoRequest[String, Authorization] =>
  28. val url = request.body
  29. _findParser(url).fold(
  30. Future.successful(NotFound(s"No parser available for $url."))
  31. ) { (parser) =>
  32. val doc = _browser.get(url)
  33. val title = doc >> parser.titleExtractor
  34. val servings = (doc >?> parser.servingExtractor).flatten
  35. val prepTime = parser.prepTimeExtractor.flatMap(doc >?> _)
  36. val cookTime = parser.cookTimeExtractor.flatMap(doc >?> _)
  37. val ingredients = doc >> parser.ingredientExtractor
  38. val instructions = doc >> parser.instructionExtractor
  39. Future.sequence(ingredients.map({
  40. case (amt, u, line) => _guessFoodFromStr(line).map(Ingredient(_, amt, u))
  41. }))
  42. .map((ingredients) => Ok(Json.toJson(RecipeNodeNoId(
  43. title,
  44. servings.getOrElse(1.0f),
  45. 1.0f,
  46. UnitType.NUMBER,
  47. ingredients.toSeq,
  48. /* tflucke@[2023-10-26]: Do not pass along the instructions since this
  49. * could be a violation of the Recipe Author's copyright. */
  50. Nil, //instructions.toSeq,
  51. None,
  52. None,
  53. Some(url),
  54. None
  55. ))))
  56. }
  57. })
  58. private def _findParser(url: String): Option[Parser] = {
  59. val host = new java.net.URL(url).getAuthority()
  60. val hostNoWWW =
  61. if (host.startsWith("www.")) host.substring("www.".length) else host
  62. Map(
  63. ("epicurious.com" -> Parser.epicurious),
  64. ("mccormick.com" -> Parser.mccormick),
  65. ("recipetineats.com" -> Parser.recipeTinEats),
  66. ("mamalovestocook.com" -> Parser.recipeTinEats),
  67. ("soulfullymade.com" -> Parser.recipeTinEats),
  68. ("familycookierecipes.com" -> Parser.recipeTinEats),
  69. ("familyfreshmeals.com" -> Parser.recipeTinEats),
  70. ("handmadefarmhouse.com" -> Parser.recipeTinEats),
  71. ("tastesoflizzyt.com" -> Parser.recipeTinEats),
  72. ("omnivorescookbook.com" -> Parser.recipeTinEats),
  73. ("growforagecookferment.com" -> Parser.recipeTinEats),
  74. ("joyfoodsunshine.com" -> Parser.recipeTinEats),
  75. ("sallysbakingaddiction.com" -> Parser.tastyRecipes),
  76. ("darngoodveggies.com" -> Parser.tastyRecipes),
  77. ("pickledplum.com" -> Parser.tastyRecipes),
  78. ("iheartvegetables.com" -> Parser.tastyRecipes),
  79. ("seriouseats.com" -> Parser.seriousEats),
  80. ("greatist.com" -> Parser.greatist),
  81. ("dimitrasdishes.com" -> Parser.dimitrasDishes),
  82. ("jif.com" -> Parser.jif),
  83. ("kingarthurbaking.com" -> Parser.kingArthurBaking)
  84. ).get(hostNoWWW)
  85. }
  86. private def _guessFoodFromStr(
  87. foodLine: String
  88. ): Future[Ingredient.IngredientId] = {
  89. val foodLineFiltered = foodLine
  90. .filter(_ <= 0x7f)
  91. .filterNot(_ == '!')
  92. .filterNot(_ == ':')
  93. .filterNot(_ == '/')
  94. searchFdcIndex(foodLineFiltered).transformWith {
  95. case Success(Some(ingredientId)) => Future.successful(ingredientId)
  96. case Success(None) => searchSelfIndex(foodLineFiltered)
  97. case Failure(e) => Future.failed(e)
  98. }
  99. }
  100. def searchFdcIndex(foodLine: String): Future[Option[Ingredient.IngredientId]] = {
  101. import gov.usda.nal.fdc.models.DataType._
  102. import gov.usda.nal.fdc.models.SearchResult
  103. usdaController.fdc.getFoodsSearch(foodLine, Seq(
  104. // Branded,
  105. Foundation, SRLegacy
  106. ), pageSize = Some(10))().flatMap({
  107. case SearchResult(_, _, _, _, Nil) => Future.successful(None)
  108. case SearchResult(_, _, _, _, foods) =>
  109. Future.sequence(
  110. foods.map((food) => foodController.getByFdcId(food.fdcId))
  111. ).map(_.flatten
  112. .headOption
  113. .fold[Ingredient.IngredientId](
  114. Ingredient.USDAId(foods.head.fdcId)
  115. )((foodNode) => Ingredient.FoodNodeId(foodNode._id))
  116. ).map(Some(_))
  117. })
  118. }
  119. def searchSelfIndex(foodLine: String): Future[Ingredient.IngredientId] = {
  120. foodController.findByName(foodLine)
  121. .transform {
  122. case Success(Nil) =>
  123. Failure(new NoSuchElementException(foodLine))
  124. case Success(foodNode::rest) =>
  125. Success(Ingredient.FoodNodeId(foodNode._id))
  126. case Failure(e) => Failure(e)
  127. }
  128. }
  129. }
  130. case class Parser(
  131. titleExtractor: HtmlExtractor[Element, String],
  132. servingExtractor: HtmlExtractor[Element, Option[Float]],
  133. prepTimeExtractor: Option[HtmlExtractor[Element, String]],
  134. cookTimeExtractor: Option[HtmlExtractor[Element, String]],
  135. ingredientExtractor: HtmlExtractor[Element, Iterable[(Float, MeasureUnit, String)]],
  136. instructionExtractor: HtmlExtractor[Element, Iterable[String]],
  137. )
  138. object Parser {
  139. val mccormick = Parser(
  140. text("h1"),
  141. // TODO use extractors
  142. text(".main-title .count").map(_.toFloatOption),
  143. Some(text(".prep_time .first_content")),
  144. cookTimeExtractor = Some(text(".ingredients .first_content")),
  145. ingredientExtractor = texts(".recipe-about-list li").map(
  146. _.map(_parseIngredient _)
  147. ),
  148. texts(".instructions-main span.para")
  149. )
  150. val epicurious = Parser(
  151. text("h1"),
  152. text("""div[data-testid="IngredientList"] > p""")
  153. .map("Yield: \\D*(\\d+).*".r.findFirstMatchIn(_).map(_.group(1).toFloat)),
  154. None,
  155. None,
  156. texts("""div[data-testid="IngredientList"] > div > div""").map(
  157. _.map(_parseIngredient _)
  158. ),
  159. texts("""div[data-testid="InstructionsWrapper"] > ol > li > p""")
  160. )
  161. val recipeTinEats = Parser(
  162. text("h2.wprm-recipe-name"),
  163. text("span.wprm-recipe-servings").map(_.toFloatOption),
  164. Some(text("span.wprm-recipe-prep_time-minutes")),
  165. Some(text("span.wprm-recipe-cook_time-minutes")),
  166. elementList("li.wprm-recipe-ingredient").map(_.map({ (li) => (
  167. (li >?> text("span.wprm-recipe-ingredient-amount")
  168. .map(_
  169. .replaceAll("\u00BD", "1/2")
  170. .replaceAll("\u00BC", "1/4")
  171. .replaceAll("\u00BE", "3/4")
  172. .replaceAll("\u2150", "1/7")
  173. .replaceAll("\u2151", "1/9")
  174. .replaceAll("\u2152", "1/10")
  175. .replaceAll("\u2153", "1/3")
  176. .replaceAll("\u2154", "2/3")
  177. .replaceAll("\u2155", "1/5")
  178. .replaceAll("\u2156", "2/5")
  179. .replaceAll("\u2157", "3/5")
  180. .replaceAll("\u2158", "4/5")
  181. .replaceAll("\u2159", "1/6")
  182. .replaceAll("\u215A", "5/6")
  183. .replaceAll("\u215B", "1/8")
  184. .replaceAll("\u215C", "3/8")
  185. .replaceAll("\u215D", "5/8")
  186. .replaceAll("\u215E", "7/8")
  187. .replaceAll("\u215F", "1/")
  188. ))
  189. .flatMap(_parseFraction _)
  190. .getOrElse(0.0f),
  191. (li >?> text("span.wprm-recipe-ingredient-unit"))
  192. .flatMap(MeasureUnit.guessUnit _)
  193. .getOrElse(Count),
  194. li >> text("span.wprm-recipe-ingredient-name")
  195. .map(_.replaceAll("\u00F1", "n"))
  196. ) })),
  197. texts("div.wprm-recipe-instruction-text")
  198. )
  199. val tastyRecipes = Parser(
  200. text("h2.tasty-recipes-title"),
  201. text("span.tasty-recipes-yield")
  202. .map("\\D*(\\d+).*".r.findFirstMatchIn(_).map(_.group(1).toFloat)),
  203. Some(text("span.tasty-recipes-prep-time")),
  204. Some(text("span.tasty-recipes-cook-time")),
  205. elementList("div.tasty-recipes-ingredients-body > ul > li").map(
  206. _.map({(listItem) => (
  207. ((listItem >?> elementList("span"))
  208. .flatMap(_.lastOption)
  209. .fold(0.0f)((elm: Element) =>
  210. (elm >?> attr("data-amount"))
  211. .fold(0.0f)(_.toFloat)
  212. )
  213. ),
  214. (listItem >?> elementList("span"))
  215. .flatMap(_.lastOption)
  216. .fold[MeasureUnit](Gram)((elm: Element) =>
  217. (elm >?> attr("data-unit"))
  218. .flatMap(MeasureUnit.guessUnit _)
  219. .getOrElse(Count)
  220. ),
  221. (listItem >?> text("strong"))
  222. .filterNot(_.contains("optional"))
  223. .getOrElse(listItem.ownText)
  224. )})
  225. ),
  226. texts("div.tasty-recipes-instructions-body > ol > li")
  227. )
  228. val seriousEats = Parser(
  229. text("h2.recipe-decision-block__title"),
  230. text("div.recipe-serving > span > span.meta-text__data")
  231. .map("\\D*(\\d+).*".r.findFirstMatchIn(_).map(_.group(1).toFloat)),
  232. //text("div.recipe-yield > span > span.meta-text__data")
  233. Some(text("div.prep-time > span > span.meta-text__data")),
  234. None, //Some(text("span.tasty-recipes-cook-time")),
  235. elementList("ul.structured-ingredients__list > li > p").map(
  236. _.map({(p) => (
  237. ((p >?> elementList("span"))
  238. .flatMap(_
  239. .filter((s) => (s >?> attr("data-ingredient-quantity")).isDefined)
  240. .lastOption
  241. .map(_ >> text)
  242. ).flatMap(_parseFraction _)
  243. .getOrElse(0.0f)
  244. ),
  245. ((p >?> elementList("span"))
  246. .flatMap(_
  247. .filter((s) => (s >?> attr("data-ingredient-unit")).isDefined)
  248. .lastOption
  249. .map(_ >> text)
  250. ).flatMap(MeasureUnit.guessUnit _)
  251. .getOrElse(Count)
  252. ),
  253. ((p >?> elementList("span"))
  254. .flatMap(_
  255. .filter((s) => (s >?> attr("data-ingredient-name")).isDefined)
  256. .headOption
  257. ).getOrElse(p).ownText
  258. )
  259. )})
  260. ),
  261. texts("div.structured-project__steps_1-0 > ol > li > p")
  262. )
  263. val greatist = Parser(
  264. text("h1"),
  265. elementList("article.article-body > ul > li").map(
  266. _.filter((listItem) => (listItem >?> text("strong")) == Some("Yield"))
  267. .map(_ >> text)
  268. .head
  269. ).map("Yield: \\D*(\\d+).*".r.findFirstMatchIn(_).map(_.group(1).toFloat)),
  270. // tflucke@[2023-11-28]: TODO They don't give passive, only Active + Total
  271. None,
  272. Some(
  273. elementList("article.article-body > ul > li").map(
  274. _.filter((listItem) => (listItem >?> text("strong")) == Some("Active"))
  275. .map(_ >> text)
  276. .head
  277. ).map("Active: \\D*(\\d+).*".r.findFirstMatchIn(_).fold("0")(_.group(1)))
  278. ),
  279. elementList("article.article-body > ul > li").map(_
  280. .filter((listItem) => (listItem >?> text("strong")) == None)
  281. .map(_ >> text)
  282. .map(_.replaceAll("\u00F1", "n"))
  283. .map(_parseIngredient _)
  284. ),
  285. texts("article.article-body > ol > li")
  286. )
  287. val dimitrasDishes = Parser(
  288. text("h2.mv-create-title-primary"),
  289. text("div.mv-create-time-yield > span").map(_.toFloatOption),
  290. None,
  291. None,
  292. texts("div.mv-create-ingredients > ul > li").map(
  293. _.map(_
  294. .replace("and", "")
  295. .replaceAll("\u00BD", "1/2")
  296. .replaceAll("\u00BC", "1/4")
  297. .replaceAll("\u00BE", "3/4")
  298. .replaceAll("\u2150", "1/7")
  299. .replaceAll("\u2151", "1/9")
  300. .replaceAll("\u2152", "1/10")
  301. .replaceAll("\u2153", "1/3")
  302. .replaceAll("\u2154", "2/3")
  303. .replaceAll("\u2155", "1/5")
  304. .replaceAll("\u2156", "2/5")
  305. .replaceAll("\u2157", "3/5")
  306. .replaceAll("\u2158", "4/5")
  307. .replaceAll("\u2159", "1/6")
  308. .replaceAll("\u215A", "5/6")
  309. .replaceAll("\u215B", "1/8")
  310. .replaceAll("\u215C", "3/8")
  311. .replaceAll("\u215D", "5/8")
  312. .replaceAll("\u215E", "7/8")
  313. .replaceAll("\u215F", "1/")
  314. .replaceAll("\u00F1", "n")
  315. .trim
  316. ).map(_parseIngredient _)
  317. ),
  318. texts("div.mv-create-instructions > ol > li")
  319. )
  320. val jif = Parser(
  321. text("h1.recipe-name"),
  322. elementList("div.recipe-breakdown-step").map(
  323. _.filter((listItem) => (listItem >?> text("i.servings")).isDefined)
  324. .map(_ >> text("span.recipe-breakdown-detail"))
  325. .head
  326. ).map(_.toFloatOption),
  327. Some(elementList("div.recipe-breakdown-step").map(
  328. _.filter((listItem) => (listItem >?> text("i.prep")).isDefined)
  329. .map(_ >> text("span.recipe-breakdown-detail"))
  330. .head
  331. )),
  332. Some(elementList("div.recipe-breakdown-step").map(
  333. _.filter((listItem) => (listItem >?> text("i.cook")).isDefined)
  334. .map(_ >> text("span.recipe-breakdown-detail"))
  335. .head
  336. )),
  337. texts("div.recipe-ingredients > ul > li")
  338. .map(_.map(_
  339. .replaceAll("\u00BD", "1/2")
  340. .replaceAll("\u00BC", "1/4")
  341. .replaceAll("\u00BE", "3/4")
  342. .replaceAll("\u2150", "1/7")
  343. .replaceAll("\u2151", "1/9")
  344. .replaceAll("\u2152", "1/10")
  345. .replaceAll("\u2153", "1/3")
  346. .replaceAll("\u2154", "2/3")
  347. .replaceAll("\u2155", "1/5")
  348. .replaceAll("\u2156", "2/5")
  349. .replaceAll("\u2157", "3/5")
  350. .replaceAll("\u2158", "4/5")
  351. .replaceAll("\u2159", "1/6")
  352. .replaceAll("\u215A", "5/6")
  353. .replaceAll("\u215B", "1/8")
  354. .replaceAll("\u215C", "3/8")
  355. .replaceAll("\u215D", "5/8")
  356. .replaceAll("\u215E", "7/8")
  357. .replaceAll("\u215F", "1/")
  358. .replaceAll("\u00F1", "n")
  359. .trim
  360. ))
  361. .map(_.map(_parseIngredient _)),
  362. texts("div.recipe-directions > ul > li > p")
  363. )
  364. val kingArthurBaking = Parser(
  365. text("h1 > span"),
  366. text("div.stat__item--yield > span").map(_.toFloatOption),
  367. Some(text("div.stat__item--prep > span")),
  368. Some(text("div.stat__item--bake > span")),
  369. texts("div.ingredient-section > ul > li")
  370. .map(_.map(_
  371. .replaceAll("\u00BD", "1/2")
  372. .replaceAll("\u00BC", "1/4")
  373. .replaceAll("\u00BE", "3/4")
  374. .replaceAll("\u2150", "1/7")
  375. .replaceAll("\u2151", "1/9")
  376. .replaceAll("\u2152", "1/10")
  377. .replaceAll("\u2153", "1/3")
  378. .replaceAll("\u2154", "2/3")
  379. .replaceAll("\u2155", "1/5")
  380. .replaceAll("\u2156", "2/5")
  381. .replaceAll("\u2157", "3/5")
  382. .replaceAll("\u2158", "4/5")
  383. .replaceAll("\u2159", "1/6")
  384. .replaceAll("\u215A", "5/6")
  385. .replaceAll("\u215B", "1/8")
  386. .replaceAll("\u215C", "3/8")
  387. .replaceAll("\u215D", "5/8")
  388. .replaceAll("\u215E", "7/8")
  389. .replaceAll("\u215F", "1/")
  390. .replaceAll("\u00F1", "n")
  391. .trim
  392. ))
  393. .map(_.map(_parseIngredient _)),
  394. texts("div.field field--recipe-steps > ol > li > p")
  395. )
  396. private def _parseFraction(fractionLine: String) = {
  397. val fractionPattern = raw"(\d+)/(\d+)[\d-_]*".r
  398. val mixedFractionPattern = raw"(\d+)\w+(\d+)/(\d+)[\d-_]*".r
  399. fractionLine match {
  400. case fractionPattern(numerator, denominator) =>
  401. Some(numerator.toFloat/denominator.toFloat)
  402. case mixedFractionPattern(whole, numerator, denominator) =>
  403. Some(whole.toFloat + numerator.toFloat/denominator.toFloat)
  404. case _ => fractionLine.toFloatOption
  405. }
  406. }
  407. private def _parseIngredient(
  408. ingredientLine: String
  409. ): (Float, MeasureUnit, String) = {
  410. val numberPattern = raw"(\d+)[\d-_]*\s(\w+)\s+(.+)".r
  411. val fractionPattern = raw"(\d+)/(\d+)[\d-_]*\s(\w+)\s+(.+)".r
  412. val mixedFractionPattern = raw"(\d+)\w+(\d+)/(\d+)\s(\w+)\s+(.+)".r
  413. ingredientLine match {
  414. case mixedFractionPattern(whole, numerator, denominator, unit, rest) =>
  415. (
  416. whole.toFloat + numerator.toFloat/denominator.toFloat,
  417. MeasureUnit.guessUnit(unit).getOrElse(Count),
  418. rest
  419. )
  420. case fractionPattern(numerator, denominator, unit, rest) =>
  421. (
  422. numerator.toFloat/denominator.toFloat,
  423. MeasureUnit.guessUnit(unit).getOrElse(Count),
  424. rest
  425. )
  426. case numberPattern(amount, unit, rest) =>
  427. (amount.toFloat, MeasureUnit.guessUnit(unit).getOrElse(Count), rest)
  428. case noUnitLine =>
  429. (1, Count, noUnitLine)
  430. }
  431. }
  432. }