| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533 |
- package com.weEat.controllers
- import com.weEat.shared.models._
- import javax.inject.{Inject,Singleton}
- import play.api.libs.json._
- import play.api.mvc._
- import scala.concurrent.Future
- import com.weEat.models.Authorization
- import scalaoauth2.provider.{AuthInfoRequest,OAuth2ProviderActionBuilders}
- import com.weEat.services.OAuth2Service
- import net.ruippeixotog.scalascraper.browser.JsoupBrowser
- import net.ruippeixotog.scalascraper.dsl.DSL._
- import net.ruippeixotog.scalascraper.dsl.DSL.Extract._
- //import net.ruippeixotog.scalascraper.dsl.DSL.Parse._
- import net.ruippeixotog.scalascraper.model.{Document,Element}
- import net.ruippeixotog.scalascraper.scraper.HtmlExtractor
- import scala.util._
- import scala.concurrent.ExecutionContext
/** Play controller exposing the recipe-scraping endpoint.
  *
  * The single action, [[parseURL]], reads a URL from the plain-text request
  * body, downloads the page, selects a [[Parser]] for the page's host and
  * answers with the parsed recipe serialised as JSON.
  */
@Singleton
class ParserController @Inject()(
  val controllerComponents: ControllerComponents,
  oauth: OAuth2Service,
  usdaController: USDAController,
  foodController: FoodController
) extends BaseController
    with OAuth2ProviderActionBuilders {

  implicit val ec: scala.concurrent.ExecutionContextExecutor =
    scala.concurrent.ExecutionContext.global

  private val _browser = JsoupBrowser()

  /** Scrapes the recipe found at the URL given in the request body.
    *
    * Responses:
    *  - `200` with the recipe as JSON when a parser handled the page;
    *  - `400` when the body is not a URL with a host part;
    *  - `404` when no parser is known for the host and none of the generic
    *    parsers produced any ingredients.
    *
    * Any other failure (download error, lookup error, ...) is left in the
    * returned `Future` for Play's error handler, as before.
    *
    * NOTE(security): the URL is supplied by the (authorised) caller and is
    * fetched from this server. Nothing here restricts the target host, so
    * internal addresses are reachable through this endpoint -- review whether
    * an allow-list is required.
    */
  def parseURL() = AuthorizedAction[Authorization](oauth).async(parse.text)({
    implicit request: AuthInfoRequest[String, Authorization] =>
      val url = request.body.trim
      _hostOf(url) match {
        case None =>
          Future.successful(BadRequest(s"Not a valid recipe URL: $url"))
        case Some(host) =>
          val hostNoWWW = host.stripPrefix("www.")
          // The download is blocking network I/O: run it inside a Future and
          // mark it as blocking so the pool can compensate.
          Future(scala.concurrent.blocking(_browser.get(url))).flatMap { doc =>
            Parser(usdaController, foodController)(hostNoWWW, doc).flatMap {
              case Some(parser) =>
                parser(usdaController, foodController)(doc, url)
                  .map(food => Ok(Json.toJson(food)))
              case None =>
                Future.successful(
                  NotFound(f"No parser found for host $hostNoWWW")
                )
            }
          }
      }
  })

  /** Lower-cased authority part of `url`, or `None` when `url` cannot be
    * parsed or has no (non-empty) authority.
    */
  private def _hostOf(url: String): Option[String] =
    Try(new java.net.URL(url).getAuthority).toOption
      .flatMap(Option(_)) // getAuthority returns null for e.g. "file:/x"
      .map(_.toLowerCase(java.util.Locale.ROOT))
      .filter(_.nonEmpty)
}
/** A recipe parser for one family of page layouts.
  *
  * The first parameter list holds the scala-scraper extractors that pull the
  * individual recipe fields out of a page; the second one holds the
  * controllers used to resolve ingredient names to food identifiers.
  *
  * @param titleExtractor       extracts the recipe title
  * @param servingExtractor     extracts the number of servings, if stated
  * @param prepTimeExtractor    optional extractor for the preparation time
  *                             (not consumed by [[apply]] at the moment)
  * @param cookTimeExtractor    optional extractor for the cooking time
  *                             (not consumed by [[apply]] at the moment)
  * @param ingredientExtractor  extracts `(amount, unit, name)` per ingredient
  * @param instructionExtractor extracts the instruction steps (not consumed
  *                             by [[apply]], see the note there)
  * @param usdaC                controller giving access to the USDA FDC search
  * @param foodC                controller giving access to the local foods
  */
case class Parser(
  titleExtractor: HtmlExtractor[Element, String],
  servingExtractor: HtmlExtractor[Element, Option[Float]],
  prepTimeExtractor: Option[HtmlExtractor[Element, String]],
  cookTimeExtractor: Option[HtmlExtractor[Element, String]],
  ingredientExtractor: HtmlExtractor[Element, Iterable[(Float, MeasureUnit, String)]],
  instructionExtractor: HtmlExtractor[Element, Iterable[String]],
)(usdaC: USDAController, foodC: FoodController) {

  implicit val ec: scala.concurrent.ExecutionContextExecutor =
    scala.concurrent.ExecutionContext.global

  /** Characters removed from an ingredient line before it is searched. */
  private val _searchNoise = Set.from("!:/-")

  /** Parses `doc` into a recipe.
    *
    * Title, servings and ingredients are extracted from the page; every
    * ingredient name is then resolved through [[_guessFoodFromStr]]. The
    * returned future fails when a mandatory extractor throws or when any
    * ingredient cannot be resolved.
    *
    * @param doc the downloaded page
    * @param url stored on the resulting recipe as `Some(url)`
    */
  def apply(doc: Document, url: String): Future[RecipeNodeNoId] =
    Future {
      // Extraction may throw (e.g. a selector that matches nothing); doing it
      // inside the Future turns that into a failed Future.
      val title = doc >> titleExtractor
      val servings = (doc >?> servingExtractor).flatten
      val ingredients = doc >> ingredientExtractor
      (title, servings, ingredients)
    }.flatMap { case (title, servings, ingredients) =>
      Future.sequence(ingredients.map({
        case (amt, u, line) => _guessFoodFromStr(line).map(Ingredient(_, amt, u))
      })).map(resolved => RecipeNodeNoId(
        title,
        servings.getOrElse(1.0f),
        1.0f,
        UnitType.NUMBER,
        resolved.toSeq,
        /* tflucke@[2023-10-26]: Do not pass along the instructions since this
         * could be a violation of the Recipe Author's copyright. */
        Nil,
        None,
        None,
        Some(url),
        None
      ))
    }

  /** Resolves a free-text ingredient line to a food identifier.
    *
    * Non-ASCII characters and the characters `! : / -` are stripped first.
    * The USDA index is consulted first; when it yields nothing the local
    * index is searched instead.
    */
  private def _guessFoodFromStr(
    foodLine: String
  ): Future[Ingredient.IngredientId] = {
    val foodLineFiltered = foodLine
      .filter(_ <= 0x7f)
      .filterNot(_searchNoise.contains)
    searchFdcIndex(foodLineFiltered).flatMap {
      case Some(ingredientId) => Future.successful(ingredientId)
      case None => searchSelfIndex(foodLineFiltered)
    }
  }

  /** Searches the USDA FDC index (Foundation and SR Legacy data, at most ten
    * hits) for `foodLine`.
    *
    * @return `None` when the search has no hits; otherwise the id of the
    *         first hit that already exists locally
    *         (`foodC.getByFdcId`), or the USDA id of the first hit when none
    *         of them does. An HTTP 500 from the search is logged to stdout
    *         and the failure is propagated unchanged.
    */
  def searchFdcIndex(foodLine: String): Future[Option[Ingredient.IngredientId]] = {
    import gov.usda.nal.fdc.models.DataType._
    import gov.usda.nal.fdc.models.SearchResult
    usdaC.fdc.getFoodsSearch(foodLine, Seq(
      // Branded,
      Foundation, SRLegacy
    ), pageSize = Some(10))().flatMap({
      case SearchResult(_, _, _, _, Nil) => Future.successful(None)
      case SearchResult(_, _, _, _, foods) =>
        Future.sequence(
          foods.map((food) => foodC.getByFdcId(food.fdcId))
        ).map(_.flatten
          .headOption
          .fold[Ingredient.IngredientId](
            // `foods` is non-empty in this branch, so `head` is safe.
            Ingredient.USDAId(foods.head.fdcId)
          )((foodNode) => Ingredient.FoodNodeId(foodNode._id))
        ).map(Some(_))
    }).recover {
      case e: com.tflucke.webroutes.HTTPException if e.statusCode == 500 =>
        println(s"USDA database failed to parse line: '$foodLine'")
        throw e
    }
  }

  /** Searches the local food index by name and returns the first match.
    *
    * The returned future fails with `NoSuchElementException(foodLine)` when
    * there is no match.
    */
  def searchSelfIndex(foodLine: String): Future[Ingredient.IngredientId] = {
    foodC.findByName(foodLine)
      .transform {
        // `headOption` works for any sequence type; matching on `Nil`/`::`
        // only covered `List` and fell through to `???` otherwise.
        case Success(matches) => matches.headOption match {
          case Some(foodNode) => Success(Ingredient.FoodNodeId(foodNode._id))
          case None => Failure(new NoSuchElementException(foodLine))
        }
        case Failure(e) => Failure(e)
      }
  }
}
/** Site-specific [[Parser]] definitions, the host lookup that selects one of
  * them, and the text helpers they share for reading amounts and ingredient
  * lines.
  */
object Parser {
  /** A parser that still needs the controllers used to resolve ingredients. */
  type ParserFn = (USDAController, FoodController) => Parser

  // ---------------------------------------------------------------------
  // Patterns, compiled once. `[\d\-_]*` swallows the tail of a range such as
  // "2-3", so that only the first number is used as the amount.
  // ---------------------------------------------------------------------
  private val _fractionRe = raw"(\d+)/(\d+)[\d\-_]*".r
  private val _mixedFractionRe = raw"(\d+)\s+(\d+)/(\d+)[\d\-_]*".r
  private val _mixedAmountRe = raw"(\d+)\s+(\d+)/(\d+)[\d\-_]*\s+(.+)".r
  private val _fractionAmountRe = raw"(\d+)/(\d+)[\d\-_]*\s+(.+)".r
  private val _numberAmountRe = raw"(\d+(?:\.\d+)?)[\d\-_]*\s+(.+)".r
  private val _unitAndNameRe = raw"(\w+)\s+(.+)".r

  /** Unicode vulgar-fraction characters and their ASCII spelling. */
  private val _vulgarFractions: Map[Char, String] = Map(
    '\u00BD' -> "1/2",
    '\u00BC' -> "1/4",
    '\u00BE' -> "3/4",
    '\u2150' -> "1/7",
    '\u2151' -> "1/9",
    '\u2152' -> "1/10",
    '\u2153' -> "1/3",
    '\u2154' -> "2/3",
    '\u2155' -> "1/5",
    '\u2156' -> "2/5",
    '\u2157' -> "3/5",
    '\u2158' -> "4/5",
    '\u2159' -> "1/6",
    '\u215A' -> "5/6",
    '\u215B' -> "1/8",
    '\u215C' -> "3/8",
    '\u215D' -> "5/8",
    '\u215E' -> "7/8",
    '\u215F' -> "1/"
  )

  /** Every parser together with the hosts it is registered for.
    *
    * Each parser `def` is called exactly once here. Calling it once per host
    * (as a plain `host -> Parser.x` map does) creates a new function value
    * every time, which made the frequency grouping below see every entry as
    * a different parser.
    */
  private val _parsersByHosts: Seq[(ParserFn, Seq[String])] = Seq(
    (recipeTinEats, Seq(
      "recipetineats.com",
      "mamalovestocook.com",
      "soulfullymade.com",
      "familycookierecipes.com",
      "familyfreshmeals.com",
      "handmadefarmhouse.com",
      "tastesoflizzyt.com",
      "omnivorescookbook.com",
      "growforagecookferment.com",
      "joyfoodsunshine.com"
    )),
    (tastyRecipes, Seq(
      "sallysbakingaddiction.com",
      "darngoodveggies.com",
      "pickledplum.com",
      "iheartvegetables.com"
    )),
    (mvCreate, Seq("dimitrasdishes.com", "tasteasianfood.com")),
    (epicurious, Seq("epicurious.com")),
    (mccormick, Seq("mccormick.com")),
    (seriousEats, Seq("seriouseats.com")),
    (greatist, Seq("greatist.com")),
    (jif, Seq("jif.com")),
    (kingArthurBaking, Seq("kingarthurbaking.com")),
    (loveFood, Seq("lovefood.com"))
  )

  /** Host name (without a leading "www.") to parser. */
  private val knownParsers: Map[String, ParserFn] =
    _parsersByHosts.flatMap { case (parser, hosts) =>
      hosts.map(host => host -> parser)
    }.toMap

  /** Distinct parsers, the ones registered for the most hosts first. */
  private val frequentParsers: Seq[ParserFn] =
    _parsersByHosts
      .sortBy { case (_, hosts) => -hosts.size }
      .map { case (parser, _) => parser }

  /** Selects the parser for a page.
    *
    * A parser registered for the host wins. Otherwise every distinct parser
    * is run against `doc`, and the first one (in [[frequentParsers]] order)
    * that succeeds with at least one ingredient is returned; parsers whose
    * run fails are skipped.
    *
    * @param url the key looked up in the host table. Despite its name the
    *            caller in `ParserController` passes the host name here; the
    *            value is also handed to the probing runs as their `url`.
    * @param doc the downloaded page, used only for probing
    * @return the selected parser, or `None` when nothing matched
    */
  def apply(usdaC: USDAController, foodC: FoodController)(
    url: String,
    doc: Document
  )(implicit ec: ExecutionContext): Future[Option[ParserFn]] =
    knownParsers.get(url) match {
      case known @ Some(_) => Future.successful(known)
      case None =>
        Future.find(frequentParsers.map { (pfn) =>
          // Turn the ParserFn into a Future that only succeeds on a match.
          pfn(usdaC, foodC)(doc, url)
            .filter(_.ingredients.nonEmpty)
            .map((_) => pfn)
        })(_ => true)
    }

  /** Parser for mccormick.com. */
  def mccormick: ParserFn = Parser(
    text("h1"),
    // TODO use extractors
    text(".main-title .count").map(_.toFloatOption),
    Some(text(".prep_time .first_content")),
    // NOTE(review): the cook time is read from below ".ingredients" --
    // confirm against the site's markup.
    Some(text(".ingredients .first_content")),
    texts(".recipe-about-list li").map(_.map(_parseIngredient _)),
    texts(".instructions-main span.para")
  ) _

  /** Parser for epicurious.com. */
  def epicurious: ParserFn = Parser(
    text("h1"),
    text("""div[data-testid="IngredientList"] > p""")
      .map(_numberAfter("Yield: "))
      .map(_.map(_.toFloat)),
    None,
    None,
    texts("""div[data-testid="IngredientList"] > div > div""").map(
      _.map(_parseIngredient _)
    ),
    texts("""div[data-testid="InstructionsWrapper"] > ol > li > p""")
  ) _

  /** Parser for pages using the `wprm-recipe-*` class names
    * (recipetineats.com and the other hosts registered with it).
    */
  def recipeTinEats: ParserFn = Parser(
    text("h2.wprm-recipe-name"),
    text("span.wprm-recipe-servings").map(_.toFloatOption),
    Some(text("span.wprm-recipe-prep_time-minutes")),
    Some(text("span.wprm-recipe-cook_time-minutes")),
    elementList("li.wprm-recipe-ingredient").map(_.map({ (li) => (
      // A missing or unreadable amount counts as 0.
      (li >?> text("span.wprm-recipe-ingredient-amount"))
        .flatMap(_parseFraction _)
        .getOrElse(0.0f),
      (li >?> text("span.wprm-recipe-ingredient-unit"))
        .flatMap(MeasureUnit.guessUnit _)
        .getOrElse(Count),
      (li >> text("span.wprm-recipe-ingredient-name")).replace('\u00F1', 'n')
    ) })),
    texts("div.wprm-recipe-instruction-text")
  ) _

  /** Parser for pages using the `mv-create-*` class names. */
  def mvCreate: ParserFn = Parser(
    text("*.mv-create-title-primary"),
    text("span.mv-create-nutrition-serving-size").map(_.toFloatOption),
    Some(text("div.mv-create-time-prep > span.mv-time-minutes")),
    Some(text("div.mv-create-time-active > span.mv-time-minutes")),
    texts("div.mv-create-ingredients > ul > li").map(
      // Drop the word "and" ("1 and 1/2 cups"). Only the whole word is
      // removed, so that "candied" or "brandy" stay intact.
      _.map(line => _parseIngredient(line.replaceAll("\\band\\b", "")))
    ),
    texts("div.mv-create-instructions > ol > li")
  ) _

  /** Parser for pages using the `tasty-recipes-*` class names. */
  def tastyRecipes: ParserFn = Parser(
    text("h2.tasty-recipes-title"),
    text("span.tasty-recipes-yield")
      .map(_numberAfter(""))
      .map(_.map(_.toFloat)),
    Some(text("span.tasty-recipes-prep-time")),
    Some(text("span.tasty-recipes-cook-time")),
    elementList("div.tasty-recipes-ingredients-body > ul > li").map(
      _.map({ (listItem) =>
        // Amount and unit are read from the attributes of the last <span>.
        val lastSpan = (listItem >?> elementList("span")).flatMap(_.lastOption)
        (
          // An unreadable amount counts as 0 instead of aborting the parse.
          lastSpan
            .flatMap(span => span >?> attr("data-amount"))
            .flatMap(_parseFraction _)
            .getOrElse(0.0f),
          lastSpan.fold[MeasureUnit](Gram)(span =>
            (span >?> attr("data-unit"))
              .flatMap(MeasureUnit.guessUnit _)
              .getOrElse(Count)
          ),
          (listItem >?> text("strong"))
            .filterNot(_.contains("optional"))
            .getOrElse(listItem.ownText)
        )
      })
    ),
    texts("div.tasty-recipes-instructions-body > ol > li")
  ) _

  /** Parser for seriouseats.com. */
  def seriousEats: ParserFn = Parser(
    text("h2.recipe-decision-block__title"),
    text("div.recipe-serving > span > span.meta-text__data")
      .map(_numberAfter(""))
      .map(_.map(_.toFloat)),
    //text("div.recipe-yield > span > span.meta-text__data")
    Some(text("div.prep-time > span > span.meta-text__data")),
    None, //Some(text("span.tasty-recipes-cook-time")),
    elementList("ul.structured-ingredients__list > li > p").map(
      _.map({ (p) =>
        // The <span> children of `p` that carry the given attribute.
        def spansWith(attribute: String) =
          (p >?> elementList("span"))
            .map(_.filter((s) => (s >?> attr(attribute)).isDefined))
        (
          spansWith("data-ingredient-quantity")
            .flatMap(_.lastOption)
            .map(_ >> text)
            .flatMap(_parseFraction _)
            .getOrElse(0.0f),
          spansWith("data-ingredient-unit")
            .flatMap(_.lastOption)
            .map(_ >> text)
            .flatMap(MeasureUnit.guessUnit _)
            .getOrElse(Count),
          spansWith("data-ingredient-name")
            .flatMap(_.headOption)
            .getOrElse(p)
            .ownText
        )
      })
    ),
    texts("div.structured-project__steps_1-0 > ol > li > p")
  ) _

  /** Parser for greatist.com. List items with a `<strong>` label hold the
    * recipe facts; the unlabelled ones are treated as ingredients.
    */
  def greatist: ParserFn = {
    val listItems = "article.article-body > ul > li"
    Parser(
      text("h1"),
      _labelledText(listItems, "Yield")
        .map(_numberAfter("Yield: "))
        .map(_.map(_.toFloat)),
      // tflucke@[2023-11-28]: TODO They don't give passive, only Active + Total
      None,
      Some(
        _labelledText(listItems, "Active")
          .map(_numberAfter("Active: "))
          .map(_.getOrElse("0"))
      ),
      elementList(listItems).map(_
        .filter((listItem) => (listItem >?> text("strong")).isEmpty)
        .map(_ >> text)
        .map(_parseIngredient _)
      ),
      texts("article.article-body > ol > li")
    ) _
  }

  /** Parser for jif.com. */
  def jif: ParserFn = Parser(
    text("h1.recipe-name"),
    _jifBreakdown("i.servings").map(_.toFloatOption),
    Some(_jifBreakdown("i.prep")),
    Some(_jifBreakdown("i.cook")),
    texts("div.recipe-ingredients > ul > li").map(_.map(_parseIngredient _)),
    texts("div.recipe-directions > ul > li > p")
  ) _

  /** Parser for kingarthurbaking.com. */
  def kingArthurBaking: ParserFn = Parser(
    text("h1 > span"),
    text("div.stat__item--yield > span").map(_.toFloatOption),
    Some(text("div.stat__item--prep > span")),
    Some(text("div.stat__item--bake > span")),
    texts("div.ingredient-section > ul > li").map(_.map(_parseIngredient _)),
    // Both names are classes of the same <div>; with a space between them
    // the second one was read as a descendant tag name.
    texts("div.field.field--recipe-steps > ol > li > p")
  ) _

  /** Parser for lovefood.com. */
  def loveFood: ParserFn = {
    // NOTE(review): the class name contains "/", which a CSS selector parser
    // may reject unless escaped -- confirm this selector actually matches.
    val facts = "div.layout__item.u-1/2-lap > ul > li"
    Parser(
      text("h1.post__title"),
      _labelledText(facts, "Serves:")
        .map(_numberAfter("Serves: "))
        .map(_.map(_.toFloat)),
      Some(
        _labelledText(facts, "Preparation Time:")
          .map(_numberAfter("Preparation Time: "))
          .map(_.getOrElse(""))
      ),
      Some(
        _labelledText(facts, "Cooking Time:")
          .map(_numberAfter("Cooking Time: "))
          .map(_.getOrElse(""))
      ),
      texts("ul[name='ingredients-metric'] > li").map(_.map(_parseIngredient _)),
      texts("div.content__step-by-step > ol > li")
    ) _
  }

  /** Text of the first element matching `selector` whose `<strong>` child
    * reads exactly `label`.
    *
    * The extractor throws (`head` of an empty list) when there is no such
    * element; `Parser.apply` runs the servings extractor through `>?>`,
    * which is presumably what turns that into `None` -- confirm against the
    * scala-scraper documentation.
    */
  private def _labelledText(
    selector: String,
    label: String
  ): HtmlExtractor[Element, String] =
    elementList(selector).map(
      _.filter((item) => (item >?> text("strong")).contains(label))
        .map(_ >> text)
        .head
    )

  /** Detail text of the first jif.com "recipe-breakdown-step" that contains
    * an element matching `iconSelector`. Throws when there is none, like
    * [[_labelledText]].
    */
  private def _jifBreakdown(iconSelector: String): HtmlExtractor[Element, String] =
    elementList("div.recipe-breakdown-step").map(
      _.filter((step) => (step >?> text(iconSelector)).isDefined)
        .map(_ >> text("span.recipe-breakdown-detail"))
        .head
    )

  /** Builds a function returning the first run of digits that follows
    * `label` (possibly after some non-digit characters), e.g.
    * `_numberAfter("Yield: ")("Yield: about 12 cookies") == Some("12")`.
    */
  private def _numberAfter(label: String): String => Option[String] = {
    val labelled = (java.util.regex.Pattern.quote(label) + raw"\D*(\d+)").r
    (content: String) => labelled.findFirstMatchIn(content).map(_.group(1))
  }

  /** Replaces Unicode vulgar fractions by their ASCII spelling.
    *
    * A space is inserted when the fraction directly follows a digit, so that
    * "1½" becomes "1 1/2" and not "11/2".
    */
  private def _asciiFractions(input: String): String =
    input.foldLeft(new StringBuilder(input.length)) { (out, ch) =>
      _vulgarFractions.get(ch) match {
        case Some(ascii) =>
          if (out.lastOption.exists(_.isDigit)) out.append(' ')
          out.append(ascii)
        case None => out.append(ch)
      }
    }.toString

  /** Normalises an ingredient line: ASCII fractions, "ñ" to "n", trimmed. */
  private def _cleanIngredientLine(line: String): String =
    _asciiFractions(line).replace('\u00F1', 'n').trim

  /** Parses an amount such as "3", "0.5", "1/2", "1 1/2", "1½" or "2-3"
    * written as a fraction range ("1/2-1").
    *
    * @return the amount, or `None` when the text is not a number
    */
  private def _parseFraction(fractionLine: String): Option[Float] =
    _asciiFractions(fractionLine).trim match {
      case _mixedFractionRe(whole, numerator, denominator) =>
        Some(whole.toFloat + numerator.toFloat / denominator.toFloat)
      case _fractionRe(numerator, denominator) =>
        Some(numerator.toFloat / denominator.toFloat)
      case other => other.toFloatOption
    }

  /** Splits an ingredient line into `(amount, unit, name)`.
    *
    * The line is normalised first (see [[_cleanIngredientLine]]). A leading
    * mixed number, fraction or (decimal) number is the amount; the word
    * following it is offered to `MeasureUnit.guessUnit` and the remainder is
    * the name. Lines without a leading amount yield `(1, Count, line)`.
    */
  private def _parseIngredient(
    ingredientLine: String
  ): (Float, MeasureUnit, String) =
    _cleanIngredientLine(ingredientLine) match {
      case _mixedAmountRe(whole, numerator, denominator, rest) =>
        _withUnit(whole.toFloat + numerator.toFloat / denominator.toFloat, rest)
      case _fractionAmountRe(numerator, denominator, rest) =>
        _withUnit(numerator.toFloat / denominator.toFloat, rest)
      case _numberAmountRe(amount, rest) =>
        _withUnit(amount.toFloat, rest)
      case noAmountLine =>
        (1.0f, Count, noAmountLine)
    }

  /** Completes an ingredient whose amount is already known.
    *
    * When `remainder` consists of a word followed by more text, the word is
    * taken as the unit (falling back to `Count` when `guessUnit` does not
    * know it) and the rest as the name; otherwise the whole remainder is the
    * name and the unit is `Count` ("2 eggs").
    */
  private def _withUnit(
    amount: Float,
    remainder: String
  ): (Float, MeasureUnit, String) =
    remainder match {
      case _unitAndNameRe(unit, name) =>
        (amount, MeasureUnit.guessUnit(unit).getOrElse(Count), name)
      case nameOnly =>
        (amount, Count, nameOnly)
    }
}
|