package com.weEat.controllers

import com.weEat.shared.models._
import javax.inject.{Inject,Singleton}
import play.api.libs.json._
import play.api.mvc._
import scala.concurrent.Future
import com.weEat.models.Authorization
import scalaoauth2.provider.{AuthInfoRequest,OAuth2ProviderActionBuilders}
import com.weEat.services.OAuth2Service
import net.ruippeixotog.scalascraper.browser.JsoupBrowser
import net.ruippeixotog.scalascraper.dsl.DSL._
import net.ruippeixotog.scalascraper.dsl.DSL.Extract._
//import net.ruippeixotog.scalascraper.dsl.DSL.Parse._
import net.ruippeixotog.scalascraper.model.Element
import net.ruippeixotog.scalascraper.scraper.HtmlExtractor
import scala.util._

/** Controller that scrapes a recipe page from a known site and turns it into
  * a `RecipeNodeNoId`, resolving every ingredient line to an ingredient id.
  */
@Singleton
class ParserController @Inject()(
  val controllerComponents: ControllerComponents,
  oauth: OAuth2Service,
  usdaController: USDAController,
  foodController: FoodController
) extends BaseController with OAuth2ProviderActionBuilders {

  implicit val ec: scala.concurrent.ExecutionContextExecutor =
    scala.concurrent.ExecutionContext.global

  private val _browser = JsoupBrowser()

  /** Host name (without a leading "www.") to the parser that understands that
    * site's markup. Built once instead of on every request.
    */
  private val _parsersByHost: Map[String, Parser] = Map(
    ("epicurious.com" -> Parser.epicurious),
    ("mccormick.com" -> Parser.mccormick),
    ("recipetineats.com" -> Parser.recipeTinEats),
    ("mamalovestocook.com" -> Parser.recipeTinEats),
    ("soulfullymade.com" -> Parser.recipeTinEats),
    ("familycookierecipes.com" -> Parser.recipeTinEats),
    ("familyfreshmeals.com" -> Parser.recipeTinEats),
    ("handmadefarmhouse.com" -> Parser.recipeTinEats),
    ("tastesoflizzyt.com" -> Parser.recipeTinEats),
    ("omnivorescookbook.com" -> Parser.recipeTinEats),
    ("growforagecookferment.com" -> Parser.recipeTinEats),
    ("joyfoodsunshine.com" -> Parser.recipeTinEats),
    ("sallysbakingaddiction.com" -> Parser.tastyRecipes),
    ("darngoodveggies.com" -> Parser.tastyRecipes),
    ("pickledplum.com" -> Parser.tastyRecipes),
    ("iheartvegetables.com" -> Parser.tastyRecipes),
    ("seriouseats.com" -> Parser.seriousEats),
    ("greatist.com" -> Parser.greatist),
    ("dimitrasdishes.com" -> Parser.dimitrasDishes),
    ("jif.com" -> Parser.jif),
    ("kingarthurbaking.com" -> Parser.kingArthurBaking)
  )

  /** Authorized action whose text body is the URL of a recipe page.
    *
    * Responds with 404 when no parser is registered for the URL's host
    * (this includes bodies that are not a valid http/https URL). Otherwise
    * the page is fetched, scraped, and returned as a JSON `RecipeNodeNoId`.
    * Failures while fetching, scraping or resolving ingredients surface as a
    * failed `Future`.
    */
  def parseURL() = AuthorizedAction[Authorization](oauth).async(parse.text)({
    implicit request: AuthInfoRequest[String, Authorization] =>
      val url = request.body
      _findParser(url).fold(
        Future.successful(NotFound(s"No parser available for $url."))
      ) { (parser) =>
        // The page fetch is blocking network I/O, so it runs inside a Future
        // marked as `blocking` rather than inline in the action body.
        Future {
          scala.concurrent.blocking {
            val doc = _browser.get(url)
            val title = doc >> parser.titleExtractor
            val servings = (doc >?> parser.servingExtractor).flatten
            val parsedLines = doc >> parser.ingredientExtractor
            (title, servings, parsedLines)
          }
        }.flatMap { case (title, servings, parsedLines) =>
          Future.sequence(parsedLines.map({ case (amt, u, line) =>
            _guessFoodFromStr(line).map(Ingredient(_, amt, u))
          }))
          .map((resolved) => Ok(Json.toJson(RecipeNodeNoId(
            title,
            servings.getOrElse(1.0f),
            1.0f,
            UnitType.NUMBER,
            resolved.toSeq,
            /* tflucke@[2023-10-26]: Do not pass along the instructions since this
             * could be a violation of the Recipe Author's copyright.
             */
            Nil, //instructions.toSeq,
            None,
            None,
            Some(url),
            None
          ))))
        }
      }
  })

  /** Looks up the parser registered for the host of `url`.
    *
    * Returns `None` when `url` cannot be parsed, is not http/https, has no
    * host, or its host (ignoring case and a leading "www.") is not registered.
    */
  private def _findParser(url: String): Option[Parser] =
    for {
      parsed <- Try(new java.net.URL(url)).toOption
      if Option(parsed.getProtocol).exists((p) =>
        p.equalsIgnoreCase("http") || p.equalsIgnoreCase("https"))
      // getHost (unlike getAuthority) excludes user-info and port; it may be
      // null or empty for URLs without a host.
      host <- Option(parsed.getHost).filter(_.nonEmpty)
      parser <- _parsersByHost.get(
        host.toLowerCase(java.util.Locale.ROOT).stripPrefix("www.")
      )
    } yield parser

  /** Resolves an ingredient description to an ingredient id.
    *
    * Non-ASCII characters and the characters `!`, `:` and `/` are removed
    * before searching. The FDC search is tried first; when it finds nothing
    * the local index is searched, which fails the future if it has no match.
    */
  private def _guessFoodFromStr(
    foodLine: String
  ): Future[Ingredient.IngredientId] = {
    val cleaned = foodLine.filter((c) => c <= 0x7f && !"!:/".contains(c))
    searchFdcIndex(cleaned).flatMap {
      case Some(ingredientId) => Future.successful(ingredientId)
      case None               => searchSelfIndex(cleaned)
    }
  }

  /** Searches FDC (Foundation and SRLegacy data types, at most 10 results).
    *
    * Yields `None` when the search has no foods. Otherwise yields the id of
    * the first result that `foodController.getByFdcId` knows about, or the
    * USDA id of the first search result when none of them is known.
    */
  def searchFdcIndex(foodLine: String): Future[Option[Ingredient.IngredientId]] = {
    import gov.usda.nal.fdc.models.DataType._
    import gov.usda.nal.fdc.models.SearchResult
    usdaController.fdc.getFoodsSearch(foodLine, Seq(
      // Branded,
      Foundation, SRLegacy
    ), pageSize = Some(10))().flatMap({
      case SearchResult(_, _, _, _, Nil) => Future.successful(None)
      case SearchResult(_, _, _, _, foods) => Future.sequence(
        foods.map((food) => foodController.getByFdcId(food.fdcId))
      ).map(_.flatten
        .headOption
        .fold[Ingredient.IngredientId](
          // `foods` is non-empty in this branch, so `head` is safe.
          Ingredient.USDAId(foods.head.fdcId)
        )((foodNode) => Ingredient.FoodNodeId(foodNode._id))
      ).map(Some(_))
    })
  }

  /** Searches the local food index by name and yields the first match.
    * The future fails with `NoSuchElementException(foodLine)` when the
    * lookup returns no foods.
    */
  def searchSelfIndex(foodLine: String): Future[Ingredient.IngredientId] = {
    foodController.findByName(foodLine)
      .transform {
        case Success(Nil) => Failure(new NoSuchElementException(foodLine))
        case Success(foodNode :: _) =>
          Success(Ingredient.FoodNodeId(foodNode._id))
        case Failure(e) => Failure(e)
      }
  }
}

/** The set of extractors needed to scrape one family of recipe sites.
  *
  * @param titleExtractor       recipe title
  * @param servingExtractor     number of servings, `None` when not parseable
  * @param prepTimeExtractor    preparation time text, if the site exposes it
  * @param cookTimeExtractor    cooking time text, if the site exposes it
  * @param ingredientExtractor  (amount, unit, description) per ingredient
  * @param instructionExtractor instruction steps as text
  */
case class Parser(
  titleExtractor: HtmlExtractor[Element, String],
  servingExtractor: HtmlExtractor[Element, Option[Float]],
  prepTimeExtractor: Option[HtmlExtractor[Element, String]],
  cookTimeExtractor: Option[HtmlExtractor[Element, String]],
  ingredientExtractor: HtmlExtractor[Element, Iterable[(Float, MeasureUnit, String)]],
  instructionExtractor: HtmlExtractor[Element, Iterable[String]],
)

object Parser {

  // Amount patterns, compiled once. The trailing `[\d-_]*` swallows the rest
  // of a range such as "2-3" so only the first number is used.
  private val FractionAmount = raw"(\d+)/(\d+)[\d-_]*".r
  // Whole number and fraction are separated by whitespace ("1 1/2").
  private val MixedFractionAmount = raw"(\d+)\s+(\d+)/(\d+)[\d-_]*".r

  // Ingredient-line patterns: amount, one-word unit, then the description.
  private val NumberLine = raw"(\d+)[\d-_]*\s(\w+)\s+(.+)".r
  private val FractionLine = raw"(\d+)/(\d+)[\d-_]*\s(\w+)\s+(.+)".r
  private val MixedFractionLine = raw"(\d+)\s+(\d+)/(\d+)\s(\w+)\s+(.+)".r

  /** Unicode vulgar-fraction glyphs and their ASCII spelling. */
  private val _vulgarFractions: Seq[(String, String)] = Seq(
    "\u00BD" -> "1/2",
    "\u00BC" -> "1/4",
    "\u00BE" -> "3/4",
    "\u2150" -> "1/7",
    "\u2151" -> "1/9",
    "\u2152" -> "1/10",
    "\u2153" -> "1/3",
    "\u2154" -> "2/3",
    "\u2155" -> "1/5",
    "\u2156" -> "2/5",
    "\u2157" -> "3/5",
    "\u2158" -> "4/5",
    "\u2159" -> "1/6",
    "\u215A" -> "5/6",
    "\u215B" -> "1/8",
    "\u215C" -> "3/8",
    "\u215D" -> "5/8",
    "\u215E" -> "7/8",
    "\u215F" -> "1/"
  )

  /** Replaces vulgar-fraction glyphs with their ASCII spelling.
    *
    * A glyph that directly follows a digit gets a separating space so that
    * "1\u00BD" becomes "1 1/2" (one and a half) rather than "11/2".
    */
  private def _normalizeFractions(line: String): String =
    _vulgarFractions.foldLeft(line) { case (acc, (glyph, ascii)) =>
      acc.replaceAll("(?<=\\d)" + glyph, " " + ascii).replace(glyph, ascii)
    }

  /** Full clean-up for a free-text ingredient line: ASCII fractions,
    * "\u00F1" folded to "n", surrounding whitespace removed.
    */
  private def _normalizeLine(line: String): String =
    _normalizeFractions(line).replaceAll("\u00F1", "n").trim

  val mccormick = Parser(
    text("h1"),
    // TODO use extractors
    text(".main-title .count").map(_.toFloatOption),
    Some(text(".prep_time .first_content")),
    // NOTE(review): this selector targets ".ingredients", which looks odd for
    // a cook time; left unchanged, confirm against the site markup.
    cookTimeExtractor = Some(text(".ingredients .first_content")),
    ingredientExtractor = texts(".recipe-about-list li").map(
      _.map(_parseIngredient _)
    ),
    texts(".instructions-main span.para")
  )

  val epicurious = Parser(
    text("h1"),
    text("""div[data-testid="IngredientList"] > p""")
      .map("Yield: \\D*(\\d+).*".r.findFirstMatchIn(_).map(_.group(1).toFloat)),
    None,
    None,
    texts("""div[data-testid="IngredientList"] > div > div""").map(
      _.map(_parseIngredient _)
    ),
    texts("""div[data-testid="InstructionsWrapper"] > ol > li > p""")
  )

  val recipeTinEats = Parser(
    text("h2.wprm-recipe-name"),
    text("span.wprm-recipe-servings").map(_.toFloatOption),
    Some(text("span.wprm-recipe-prep_time-minutes")),
    Some(text("span.wprm-recipe-cook_time-minutes")),
    elementList("li.wprm-recipe-ingredient").map(_.map({ (li) => (
      // Amount defaults to 0 when the span is missing or not a number.
      (li >?> text("span.wprm-recipe-ingredient-amount")
        .map(_normalizeFractions _))
        .flatMap(_parseFraction _)
        .getOrElse(0.0f),
      (li >?> text("span.wprm-recipe-ingredient-unit"))
        .flatMap(MeasureUnit.guessUnit _)
        .getOrElse(Count),
      li >> text("span.wprm-recipe-ingredient-name")
        .map(_.replaceAll("\u00F1", "n"))
    ) })),
    texts("div.wprm-recipe-instruction-text")
  )

  val tastyRecipes = Parser(
    text("h2.tasty-recipes-title"),
    text("span.tasty-recipes-yield")
      .map("\\D*(\\d+).*".r.findFirstMatchIn(_).map(_.group(1).toFloat)),
    Some(text("span.tasty-recipes-prep-time")),
    Some(text("span.tasty-recipes-cook-time")),
    elementList("div.tasty-recipes-ingredients-body > ul > li").map(
      _.map({ (listItem) => (
        // Amount and unit are read from the last <span> of the list item.
        ((listItem >?> elementList("span"))
          .flatMap(_.lastOption)
          .fold(0.0f)((elm: Element) =>
            // A non-numeric data-amount counts as 0 instead of throwing.
            (elm >?> attr("data-amount"))
              .flatMap(_.toFloatOption)
              .getOrElse(0.0f)
          )
        ),
        (listItem >?> elementList("span"))
          .flatMap(_.lastOption)
          .fold[MeasureUnit](Gram)((elm: Element) =>
            (elm >?> attr("data-unit"))
              .flatMap(MeasureUnit.guessUnit _)
              .getOrElse(Count)
          ),
        (listItem >?> text("strong"))
          .filterNot(_.contains("optional"))
          .getOrElse(listItem.ownText)
      ) })
    ),
    texts("div.tasty-recipes-instructions-body > ol > li")
  )

  val seriousEats = Parser(
    text("h2.recipe-decision-block__title"),
    text("div.recipe-serving > span > span.meta-text__data")
      .map("\\D*(\\d+).*".r.findFirstMatchIn(_).map(_.group(1).toFloat)),
    //text("div.recipe-yield > span > span.meta-text__data")
    Some(text("div.prep-time > span > span.meta-text__data")),
    None, //Some(text("span.tasty-recipes-cook-time")),
    elementList("ul.structured-ingredients__list > li > p").map(
      _.map({ (p) => (
        ((p >?> elementList("span"))
          .flatMap(_
            .filter((s) => (s >?> attr("data-ingredient-quantity")).isDefined)
            .lastOption
            .map(_ >> text)
          ).flatMap(_parseFraction _)
          .getOrElse(0.0f)
        ),
        ((p >?> elementList("span"))
          .flatMap(_
            .filter((s) => (s >?> attr("data-ingredient-unit")).isDefined)
            .lastOption
            .map(_ >> text)
          ).flatMap(MeasureUnit.guessUnit _)
          .getOrElse(Count)
        ),
        // Falls back to the paragraph's own text when no name span exists.
        ((p >?> elementList("span"))
          .flatMap(_
            .filter((s) => (s >?> attr("data-ingredient-name")).isDefined)
            .headOption
          ).getOrElse(p).ownText
        )
      ) })
    ),
    texts("div.structured-project__steps_1-0 > ol > li > p")
  )

  // In the greatist and jif parsers, `.head` throws when no matching item
  // exists. These extractors are meant to be read through `>?>`, which is
  // assumed to turn that failure into None - TODO confirm.
  val greatist = Parser(
    text("h1"),
    elementList("article.article-body > ul > li").map(
      _.filter((listItem) => (listItem >?> text("strong")) == Some("Yield"))
        .map(_ >> text)
        .head
    ).map("Yield: \\D*(\\d+).*".r.findFirstMatchIn(_).map(_.group(1).toFloat)),
    // tflucke@[2023-11-28]: TODO They don't give passive, only Active + Total
    None,
    Some(
      elementList("article.article-body > ul > li").map(
        _.filter((listItem) => (listItem >?> text("strong")) == Some("Active"))
          .map(_ >> text)
          .head
      ).map("Active: \\D*(\\d+).*".r.findFirstMatchIn(_).fold("0")(_.group(1)))
    ),
    // List items without a <strong> label are treated as ingredient lines.
    elementList("article.article-body > ul > li").map(
      _.filter((listItem) => (listItem >?> text("strong")) == None)
        .map(_ >> text)
        .map(_.replaceAll("\u00F1", "n"))
        .map(_parseIngredient _)
    ),
    texts("article.article-body > ol > li")
  )

  val dimitrasDishes = Parser(
    text("h2.mv-create-title-primary"),
    text("div.mv-create-time-yield > span").map(_.toFloatOption),
    None,
    None,
    texts("div.mv-create-ingredients > ul > li").map(
      _.map((line) =>
        // Drop the word "and" (as in "1 and 1/2 cups"); whole words only, so
        // words that merely contain "and" are left intact.
        _parseIngredient(_normalizeLine(line.replaceAll("\\band\\b", "")))
      )
    ),
    texts("div.mv-create-instructions > ol > li")
  )

  val jif = Parser(
    text("h1.recipe-name"),
    elementList("div.recipe-breakdown-step").map(
      _.filter((listItem) => (listItem >?> text("i.servings")).isDefined)
        .map(_ >> text("span.recipe-breakdown-detail"))
        .head
    ).map(_.toFloatOption),
    Some(elementList("div.recipe-breakdown-step").map(
      _.filter((listItem) => (listItem >?> text("i.prep")).isDefined)
        .map(_ >> text("span.recipe-breakdown-detail"))
        .head
    )),
    Some(elementList("div.recipe-breakdown-step").map(
      _.filter((listItem) => (listItem >?> text("i.cook")).isDefined)
        .map(_ >> text("span.recipe-breakdown-detail"))
        .head
    )),
    texts("div.recipe-ingredients > ul > li")
      .map(_.map((line) => _parseIngredient(_normalizeLine(line)))),
    texts("div.recipe-directions > ul > li > p")
  )

  val kingArthurBaking = Parser(
    text("h1 > span"),
    text("div.stat__item--yield > span").map(_.toFloatOption),
    Some(text("div.stat__item--prep > span")),
    Some(text("div.stat__item--bake > span")),
    texts("div.ingredient-section > ul > li")
      .map(_.map((line) => _parseIngredient(_normalizeLine(line)))),
    // "field--recipe-steps" is selected as a second class on the div; written
    // with a space it would be read as a descendant tag name.
    // NOTE(review): confirm against the site markup.
    texts("div.field.field--recipe-steps > ol > li > p")
  )

  /** Parses an amount written as "n/d", "w n/d" or a plain number.
    *
    * @return the amount, or `None` when the text is none of those forms
    */
  private def _parseFraction(fractionLine: String): Option[Float] =
    fractionLine.trim match {
      case FractionAmount(numerator, denominator) =>
        Some(numerator.toFloat / denominator.toFloat)
      case MixedFractionAmount(whole, numerator, denominator) =>
        Some(whole.toFloat + numerator.toFloat / denominator.toFloat)
      case other => other.toFloatOption
    }

  /** Splits a free-text ingredient line into (amount, unit, description).
    *
    * Recognised shapes are "w n/d unit rest", "n/d unit rest" and
    * "n unit rest". A unit word that `MeasureUnit.guessUnit` does not
    * recognise becomes `Count`. Any other line is returned whole as one
    * `Count` of itself.
    */
  private def _parseIngredient(
    ingredientLine: String
  ): (Float, MeasureUnit, String) = {
    def unitOf(word: String): MeasureUnit =
      MeasureUnit.guessUnit(word).getOrElse(Count)

    ingredientLine match {
      case MixedFractionLine(whole, numerator, denominator, unit, rest) =>
        (
          whole.toFloat + numerator.toFloat / denominator.toFloat,
          unitOf(unit),
          rest
        )
      case FractionLine(numerator, denominator, unit, rest) =>
        (numerator.toFloat / denominator.toFloat, unitOf(unit), rest)
      case NumberLine(amount, unit, rest) =>
        (amount.toFloat, unitOf(unit), rest)
      case noUnitLine => (1.0f, Count, noUnitLine)
    }
  }
}