|
|
@@ -12,9 +12,10 @@ import net.ruippeixotog.scalascraper.browser.JsoupBrowser
|
|
|
import net.ruippeixotog.scalascraper.dsl.DSL._
|
|
|
import net.ruippeixotog.scalascraper.dsl.DSL.Extract._
|
|
|
//import net.ruippeixotog.scalascraper.dsl.DSL.Parse._
|
|
|
-import net.ruippeixotog.scalascraper.model.Element
|
|
|
+import net.ruippeixotog.scalascraper.model.{Document,Element}
|
|
|
import net.ruippeixotog.scalascraper.scraper.HtmlExtractor
|
|
|
import scala.util._
|
|
|
+import scala.concurrent.ExecutionContext
|
|
|
|
|
|
@Singleton
|
|
|
class ParserController @Inject()(
|
|
|
@@ -30,21 +31,42 @@ class ParserController @Inject()(
|
|
|
|
|
|
/**
 * Scrapes the recipe page whose URL is sent as the plain-text request body.
 *
 * The host of the URL (minus a leading "www.") is handed to `Parser` together with the
 * fetched document to pick a parser; the selected parser is then run and its result is
 * returned as JSON.
 *
 * Responses:
 *  - 200 with the parsed food as JSON when a parser is found and succeeds,
 *  - 400 when the body is not an http(s) URL with a host,
 *  - 404 when no parser is available for the host,
 *  - a failed future (propagated unchanged) when parser selection fails.
 */
def parseURL() = AuthorizedAction[Authorization](oauth).async(parse.text)({ implicit request: AuthInfoRequest[String, Authorization] =>
  val url = request.body
  // The URL comes straight from the client: reject anything that is not a well-formed
  // http(s) URL instead of letting MalformedURLException / a null authority escape.
  // getHost is used rather than getAuthority so that a port or user-info component does
  // not end up in the lookup key.
  val hostNoWWW: Option[String] = Try(new java.net.URL(url)).toOption
    .filter(parsed => parsed.getProtocol == "http" || parsed.getProtocol == "https")
    .flatMap(parsed => Option(parsed.getHost))
    .filter(_.nonEmpty)
    .map(host => if (host.startsWith("www.")) host.substring("www.".length) else host)

  hostNoWWW.fold(
    Future.successful(BadRequest(s"Not a valid http(s) URL: $url"))
  ) { (host) =>
    val doc = _browser.get(url)
    Parser(usdaController, foodController)(host, doc).transformWith {
      case Success(Some(parser)) =>
        parser(usdaController, foodController)(doc, url)
          .map((food) => Ok(Json.toJson(food)))
      case Success(None) =>
        Future.successful(NotFound(s"No parser found for host $host"))
      // Propagate the failure as a failed future rather than throwing inside the callback.
      case Failure(e) => Future.failed(e)
    }
  }
})
|
|
|
+}
|
|
|
+
|
|
|
+case class Parser(
|
|
|
+ titleExtractor: HtmlExtractor[Element, String],
|
|
|
+ servingExtractor: HtmlExtractor[Element, Option[Float]],
|
|
|
+ prepTimeExtractor: Option[HtmlExtractor[Element, String]],
|
|
|
+ cookTimeExtractor: Option[HtmlExtractor[Element, String]],
|
|
|
+ ingredientExtractor: HtmlExtractor[Element, Iterable[(Float, MeasureUnit, String)]],
|
|
|
+ instructionExtractor: HtmlExtractor[Element, Iterable[String]],
|
|
|
+)(usdaC: USDAController, foodC: FoodController) {
|
|
|
+ implicit val ec = scala.concurrent.ExecutionContext.global
|
|
|
+
|
|
|
+ def apply(doc: Document, url: String): Future[RecipeNodeNoId] = {
|
|
|
+ Future({
|
|
|
+ val title = doc >> titleExtractor
|
|
|
+ val servings = (doc >?> servingExtractor).flatten
|
|
|
+ val prepTime = prepTimeExtractor.flatMap(doc >?> _)
|
|
|
+ val cookTime = cookTimeExtractor.flatMap(doc >?> _)
|
|
|
+ val ingredients = doc >> ingredientExtractor
|
|
|
+ val instructions = doc >> instructionExtractor
|
|
|
|
|
|
Future.sequence(ingredients.map({
|
|
|
case (amt, u, line) => _guessFoodFromStr(line).map(Ingredient(_, amt, u))
|
|
|
}))
|
|
|
- .map((ingredients) => Ok(Json.toJson(RecipeNodeNoId(
|
|
|
+ .map((ingredients) => RecipeNodeNoId(
|
|
|
title,
|
|
|
servings.getOrElse(1.0f),
|
|
|
1.0f,
|
|
|
@@ -57,37 +79,8 @@ class ParserController @Inject()(
|
|
|
None,
|
|
|
Some(url),
|
|
|
None
|
|
|
- ))))
|
|
|
- }
|
|
|
- })
|
|
|
-
|
|
|
- private def _findParser(url: String): Option[Parser] = {
|
|
|
- val host = new java.net.URL(url).getAuthority()
|
|
|
- val hostNoWWW =
|
|
|
- if (host.startsWith("www.")) host.substring("www.".length) else host
|
|
|
- Map(
|
|
|
- ("epicurious.com" -> Parser.epicurious),
|
|
|
- ("mccormick.com" -> Parser.mccormick),
|
|
|
- ("recipetineats.com" -> Parser.recipeTinEats),
|
|
|
- ("mamalovestocook.com" -> Parser.recipeTinEats),
|
|
|
- ("soulfullymade.com" -> Parser.recipeTinEats),
|
|
|
- ("familycookierecipes.com" -> Parser.recipeTinEats),
|
|
|
- ("familyfreshmeals.com" -> Parser.recipeTinEats),
|
|
|
- ("handmadefarmhouse.com" -> Parser.recipeTinEats),
|
|
|
- ("tastesoflizzyt.com" -> Parser.recipeTinEats),
|
|
|
- ("omnivorescookbook.com" -> Parser.recipeTinEats),
|
|
|
- ("growforagecookferment.com" -> Parser.recipeTinEats),
|
|
|
- ("joyfoodsunshine.com" -> Parser.recipeTinEats),
|
|
|
- ("sallysbakingaddiction.com" -> Parser.tastyRecipes),
|
|
|
- ("darngoodveggies.com" -> Parser.tastyRecipes),
|
|
|
- ("pickledplum.com" -> Parser.tastyRecipes),
|
|
|
- ("iheartvegetables.com" -> Parser.tastyRecipes),
|
|
|
- ("seriouseats.com" -> Parser.seriousEats),
|
|
|
- ("greatist.com" -> Parser.greatist),
|
|
|
- ("dimitrasdishes.com" -> Parser.dimitrasDishes),
|
|
|
- ("jif.com" -> Parser.jif),
|
|
|
- ("kingarthurbaking.com" -> Parser.kingArthurBaking)
|
|
|
- ).get(hostNoWWW)
|
|
|
+ ))
|
|
|
+ }).flatten
|
|
|
}
|
|
|
|
|
|
private def _guessFoodFromStr(
|
|
|
@@ -95,9 +88,7 @@ class ParserController @Inject()(
|
|
|
): Future[Ingredient.IngredientId] = {
|
|
|
val foodLineFiltered = foodLine
|
|
|
.filter(_ <= 0x7f)
|
|
|
- .filterNot(_ == '!')
|
|
|
- .filterNot(_ == ':')
|
|
|
- .filterNot(_ == '/')
|
|
|
+ .filterNot(Set.from("!:/-").contains)
|
|
|
searchFdcIndex(foodLineFiltered).transformWith {
|
|
|
case Success(Some(ingredientId)) => Future.successful(ingredientId)
|
|
|
case Success(None) => searchSelfIndex(foodLineFiltered)
|
|
|
@@ -108,47 +99,87 @@ class ParserController @Inject()(
|
|
|
/**
 * Looks `foodLine` up in the USDA FDC search (Foundation and SRLegacy data types,
 * page size 10).
 *
 * @return `None` when the search returns no foods. Otherwise `Some` of a
 *         `FoodNodeId` for the first search hit that `foodC.getByFdcId` resolves,
 *         or a `USDAId` of the first search hit when none resolve.
 *
 * An `HTTPException` with status code 500 is logged to stdout and rethrown.
 */
def searchFdcIndex(foodLine: String): Future[Option[Ingredient.IngredientId]] = {
  import gov.usda.nal.fdc.models.DataType._
  import gov.usda.nal.fdc.models.SearchResult

  val searchResult = usdaC.fdc.getFoodsSearch(foodLine, Seq(
    // Branded,
    Foundation, SRLegacy
  ), pageSize = Some(10))()

  val ingredientId = searchResult.flatMap {
    case SearchResult(_, _, _, _, Nil) => Future.successful(None)
    case SearchResult(_, _, _, _, foods) =>
      // Resolve every hit against the local food index, keeping search order.
      val resolvedNodes = Future.sequence(
        foods.map((food) => foodC.getByFdcId(food.fdcId))
      )
      resolvedNodes.map { (nodes) =>
        val id: Ingredient.IngredientId = nodes.flatten.headOption match {
          case Some(foodNode) => Ingredient.FoodNodeId(foodNode._id)
          // `foods` is non-empty here: the empty case is handled above.
          case None => Ingredient.USDAId(foods.head.fdcId)
        }
        Some(id)
      }
  }

  ingredientId.recover {
    case e: com.tflucke.webroutes.HTTPException if e.statusCode == 500 =>
      println(s"USDA database failed to parse line: '$foodLine'")
      throw e
  }
}
|
|
|
|
|
|
/**
 * Looks `foodLine` up through `foodC.findByName` and returns the id of the first match.
 *
 * @return a future holding the `FoodNodeId` of the first node found; the future fails
 *         with `NoSuchElementException(foodLine)` when nothing matches, and with the
 *         original exception when the lookup itself fails.
 */
def searchSelfIndex(foodLine: String): Future[Ingredient.IngredientId] = {
  foodC.findByName(foodLine)
    .transform {
      // headOption works for every sequence type; matching on Nil / :: only covers
      // List and left any other non-empty result hitting `???`.
      case Success(matches) =>
        matches.headOption.fold[Try[Ingredient.IngredientId]](
          Failure(new NoSuchElementException(foodLine))
        )((foodNode) => Success(Ingredient.FoodNodeId(foodNode._id)))
      case Failure(e) => Failure(e)
    }
}
|
|
|
}
|
|
|
|
|
|
+object Parser {
|
|
|
+ type ParserFn = (USDAController, FoodController) => Parser
|
|
|
|
|
|
-case class Parser(
|
|
|
- titleExtractor: HtmlExtractor[Element, String],
|
|
|
- servingExtractor: HtmlExtractor[Element, Option[Float]],
|
|
|
- prepTimeExtractor: Option[HtmlExtractor[Element, String]],
|
|
|
- cookTimeExtractor: Option[HtmlExtractor[Element, String]],
|
|
|
- ingredientExtractor: HtmlExtractor[Element, Iterable[(Float, MeasureUnit, String)]],
|
|
|
- instructionExtractor: HtmlExtractor[Element, Iterable[String]],
|
|
|
-)
|
|
|
/**
 * Parser factory for each supported site, keyed by host name without a leading "www.".
 *
 * The factories are `def`s, so every reference such as `Parser.recipeTinEats` yields a
 * new function object, and functions compare by reference. Factories shared by several
 * hosts are therefore bound once here so that all of those hosts map to the same
 * instance; otherwise grouping the map's values by parser sees every entry as distinct.
 */
private val knownParsers: Map[String, ParserFn] = {
  val recipeTinEatsFn: ParserFn = Parser.recipeTinEats
  val tastyRecipesFn: ParserFn = Parser.tastyRecipes
  Map(
    ("epicurious.com" -> Parser.epicurious),
    ("mccormick.com" -> Parser.mccormick),
    ("recipetineats.com" -> recipeTinEatsFn),
    ("mamalovestocook.com" -> recipeTinEatsFn),
    ("soulfullymade.com" -> recipeTinEatsFn),
    ("familycookierecipes.com" -> recipeTinEatsFn),
    ("familyfreshmeals.com" -> recipeTinEatsFn),
    ("handmadefarmhouse.com" -> recipeTinEatsFn),
    ("tastesoflizzyt.com" -> recipeTinEatsFn),
    ("omnivorescookbook.com" -> recipeTinEatsFn),
    ("growforagecookferment.com" -> recipeTinEatsFn),
    ("joyfoodsunshine.com" -> recipeTinEatsFn),
    ("sallysbakingaddiction.com" -> tastyRecipesFn),
    ("darngoodveggies.com" -> tastyRecipesFn),
    ("pickledplum.com" -> tastyRecipesFn),
    ("iheartvegetables.com" -> tastyRecipesFn),
    ("seriouseats.com" -> Parser.seriousEats),
    ("greatist.com" -> Parser.greatist),
    ("dimitrasdishes.com" -> Parser.dimitrasDishes),
    ("jif.com" -> Parser.jif),
    ("kingarthurbaking.com" -> Parser.kingArthurBaking)
  )
}
|
|
|
|
|
|
-object Parser {
|
|
|
- val mccormick = Parser(
|
|
|
/**
 * The parser factories found in `knownParsers`, ordered by how many host entries map
 * to an equal factory (most entries first). Factories are compared with function
 * equality, i.e. by reference.
 */
private val frequentParsers = {
  val hostCountByParser = knownParsers.values.groupMapReduce(identity)(_ => 1)(_ + _)
  Seq.from(hostCountByParser)
    .sortBy { case (_, hostCount) => -hostCount }
    .map { case (parserFn, _) => parserFn }
}
|
|
|
+
|
|
|
/**
 * Selects the parser factory to use for a document.
 *
 * @param usdaC controller handed to each candidate parser while probing
 * @param foodC controller handed to each candidate parser while probing
 * @param url   key looked up in `knownParsers` (which is keyed by host name); it is also
 *              forwarded to candidate parsers as their `url` argument
 * @param doc   the fetched document to probe candidate parsers against
 * @return the factory registered for `url` when there is one; otherwise the result of
 *         `Future.find` over `frequentParsers`, where a candidate qualifies if running
 *         it on `doc` succeeds and yields at least one ingredient. `None` when no
 *         candidate qualifies.
 */
def apply(usdaC: USDAController, foodC: FoodController)(
  url: String,
  doc: Document
)(implicit ec: ExecutionContext): Future[Option[ParserFn]] = {
  knownParsers.get(url) match {
    case Some(parser) =>
      Future.successful(Some(parser))
    case None =>
      // Turn each ParserFn into a Future[ParserFn] that only succeeds if it is a match.
      val candidates = frequentParsers.map { (pfn) =>
        pfn(usdaC, foodC)(doc, url)
          .filter(_.ingredients.size > 0)
          .map((_) => pfn)
      }
      Future.find(candidates)(_ => true)
  }
}
|
|
|
+
|
|
|
+ def mccormick: ParserFn = Parser(
|
|
|
text("h1"),
|
|
|
// TODO use extractors
|
|
|
text(".main-title .count").map(_.toFloatOption),
|
|
|
@@ -158,9 +189,9 @@ object Parser {
|
|
|
_.map(_parseIngredient _)
|
|
|
),
|
|
|
texts(".instructions-main span.para")
|
|
|
- )
|
|
|
+ ) _
|
|
|
|
|
|
- val epicurious = Parser(
|
|
|
+ def epicurious: ParserFn = Parser(
|
|
|
text("h1"),
|
|
|
text("""div[data-testid="IngredientList"] > p""")
|
|
|
.map("Yield: \\D*(\\d+).*".r.findFirstMatchIn(_).map(_.group(1).toFloat)),
|
|
|
@@ -170,9 +201,9 @@ object Parser {
|
|
|
_.map(_parseIngredient _)
|
|
|
),
|
|
|
texts("""div[data-testid="InstructionsWrapper"] > ol > li > p""")
|
|
|
- )
|
|
|
+ ) _
|
|
|
|
|
|
- val recipeTinEats = Parser(
|
|
|
+ def recipeTinEats: ParserFn = Parser(
|
|
|
text("h2.wprm-recipe-name"),
|
|
|
text("span.wprm-recipe-servings").map(_.toFloatOption),
|
|
|
Some(text("span.wprm-recipe-prep_time-minutes")),
|
|
|
@@ -209,9 +240,9 @@ object Parser {
|
|
|
.map(_.replaceAll("\u00F1", "n"))
|
|
|
) })),
|
|
|
texts("div.wprm-recipe-instruction-text")
|
|
|
- )
|
|
|
+ ) _
|
|
|
|
|
|
- val tastyRecipes = Parser(
|
|
|
+ def tastyRecipes: ParserFn = Parser(
|
|
|
text("h2.tasty-recipes-title"),
|
|
|
text("span.tasty-recipes-yield")
|
|
|
.map("\\D*(\\d+).*".r.findFirstMatchIn(_).map(_.group(1).toFloat)),
|
|
|
@@ -239,9 +270,9 @@ object Parser {
|
|
|
)})
|
|
|
),
|
|
|
texts("div.tasty-recipes-instructions-body > ol > li")
|
|
|
- )
|
|
|
+ ) _
|
|
|
|
|
|
- val seriousEats = Parser(
|
|
|
+ def seriousEats: ParserFn = Parser(
|
|
|
text("h2.recipe-decision-block__title"),
|
|
|
text("div.recipe-serving > span > span.meta-text__data")
|
|
|
.map("\\D*(\\d+).*".r.findFirstMatchIn(_).map(_.group(1).toFloat)),
|
|
|
@@ -275,9 +306,9 @@ object Parser {
|
|
|
)})
|
|
|
),
|
|
|
texts("div.structured-project__steps_1-0 > ol > li > p")
|
|
|
- )
|
|
|
+ ) _
|
|
|
|
|
|
- val greatist = Parser(
|
|
|
+ def greatist: ParserFn = Parser(
|
|
|
text("h1"),
|
|
|
elementList("article.article-body > ul > li").map(
|
|
|
_.filter((listItem) => (listItem >?> text("strong")) == Some("Yield"))
|
|
|
@@ -300,9 +331,9 @@ object Parser {
|
|
|
.map(_parseIngredient _)
|
|
|
),
|
|
|
texts("article.article-body > ol > li")
|
|
|
- )
|
|
|
+ ) _
|
|
|
|
|
|
- val dimitrasDishes = Parser(
|
|
|
+ def dimitrasDishes: ParserFn = Parser(
|
|
|
text("h2.mv-create-title-primary"),
|
|
|
text("div.mv-create-time-yield > span").map(_.toFloatOption),
|
|
|
None,
|
|
|
@@ -334,9 +365,9 @@ object Parser {
|
|
|
).map(_parseIngredient _)
|
|
|
),
|
|
|
texts("div.mv-create-instructions > ol > li")
|
|
|
- )
|
|
|
+ ) _
|
|
|
|
|
|
- val jif = Parser(
|
|
|
+ def jif: ParserFn = Parser(
|
|
|
text("h1.recipe-name"),
|
|
|
elementList("div.recipe-breakdown-step").map(
|
|
|
_.filter((listItem) => (listItem >?> text("i.servings")).isDefined)
|
|
|
@@ -379,9 +410,9 @@ object Parser {
|
|
|
))
|
|
|
.map(_.map(_parseIngredient _)),
|
|
|
texts("div.recipe-directions > ul > li > p")
|
|
|
- )
|
|
|
+ ) _
|
|
|
|
|
|
- val kingArthurBaking = Parser(
|
|
|
+ def kingArthurBaking: ParserFn = Parser(
|
|
|
text("h1 > span"),
|
|
|
text("div.stat__item--yield > span").map(_.toFloatOption),
|
|
|
Some(text("div.stat__item--prep > span")),
|
|
|
@@ -412,7 +443,7 @@ object Parser {
|
|
|
))
|
|
|
.map(_.map(_parseIngredient _)),
|
|
|
texts("div.field field--recipe-steps > ol > li > p")
|
|
|
- )
|
|
|
+ ) _
|
|
|
|
|
|
private def _parseFraction(fractionLine: String) = {
|
|
|
val fractionPattern = raw"(\d+)/(\d+)[\d-_]*".r
|