Parcourir la source

Importer now makes a best-effort attempt to guess the input format.

If no parser is registered for the URL's host, it will go through the list of
all known parsers and take the first one that successfully parses an ingredient.

Could be better, but it seems to work well enough.
Thomas Flucke il y a 1 an
Parent
commit
febad0d38f
1 fichiers modifiés avec 108 ajouts et 77 suppressions
  1. 108 77
      server/app/com/weEat/controllers/ParserController.scala

+ 108 - 77
server/app/com/weEat/controllers/ParserController.scala

@@ -12,9 +12,10 @@ import net.ruippeixotog.scalascraper.browser.JsoupBrowser
 import net.ruippeixotog.scalascraper.dsl.DSL._
 import net.ruippeixotog.scalascraper.dsl.DSL.Extract._
 //import net.ruippeixotog.scalascraper.dsl.DSL.Parse._
-import net.ruippeixotog.scalascraper.model.Element
+import net.ruippeixotog.scalascraper.model.{Document,Element}
 import net.ruippeixotog.scalascraper.scraper.HtmlExtractor
 import scala.util._
+import scala.concurrent.ExecutionContext
 
 @Singleton
 class ParserController @Inject()(
@@ -30,21 +31,42 @@ class ParserController @Inject()(
 
   def parseURL() = AuthorizedAction[Authorization](oauth).async(parse.text)({ implicit request: AuthInfoRequest[String, Authorization] =>
     val url = request.body
-    _findParser(url).fold(
-      Future.successful(NotFound(s"No parser available for $url."))
-    ) { (parser) =>
-      val doc = _browser.get(url)
-      val title = doc >> parser.titleExtractor
-      val servings = (doc >?> parser.servingExtractor).flatten
-      val prepTime = parser.prepTimeExtractor.flatMap(doc >?> _)
-      val cookTime = parser.cookTimeExtractor.flatMap(doc >?> _)
-      val ingredients = doc >> parser.ingredientExtractor
-      val instructions = doc >> parser.instructionExtractor
+    val host = new java.net.URL(url).getAuthority()
+    val hostNoWWW =
+      if (host.startsWith("www.")) host.substring("www.".length) else host
+    val doc = _browser.get(url)
+    Parser(usdaController, foodController)(hostNoWWW, doc).transformWith {
+      case Success(Some(parser)) => parser(usdaController, foodController)(doc, url)
+          .map((food) => Ok(Json.toJson(food)))
+      case Success(None) => Future.successful(NotFound(f"No parser found for host $hostNoWWW"))
+      case Failure(e) => throw e
+    }
+  })
+}
+
+case class Parser(
+  titleExtractor: HtmlExtractor[Element, String],
+  servingExtractor: HtmlExtractor[Element, Option[Float]],
+  prepTimeExtractor: Option[HtmlExtractor[Element, String]],
+  cookTimeExtractor: Option[HtmlExtractor[Element, String]],
+  ingredientExtractor: HtmlExtractor[Element, Iterable[(Float, MeasureUnit, String)]],
+  instructionExtractor: HtmlExtractor[Element, Iterable[String]],
+)(usdaC: USDAController, foodC: FoodController) {
+  implicit val ec = scala.concurrent.ExecutionContext.global
+
+  def apply(doc: Document, url: String): Future[RecipeNodeNoId] = {
+    Future({
+      val title = doc >> titleExtractor
+      val servings = (doc >?> servingExtractor).flatten
+      val prepTime = prepTimeExtractor.flatMap(doc >?> _)
+      val cookTime = cookTimeExtractor.flatMap(doc >?> _)
+      val ingredients = doc >> ingredientExtractor
+      val instructions = doc >> instructionExtractor
 
       Future.sequence(ingredients.map({
         case (amt, u, line) => _guessFoodFromStr(line).map(Ingredient(_, amt, u))
       }))
-        .map((ingredients) => Ok(Json.toJson(RecipeNodeNoId(
+        .map((ingredients) => RecipeNodeNoId(
           title,
           servings.getOrElse(1.0f),
           1.0f,
@@ -57,37 +79,8 @@ class ParserController @Inject()(
           None,
           Some(url),
           None
-        ))))
-    }
-  })
-
-  private def _findParser(url: String): Option[Parser] = {
-    val host = new java.net.URL(url).getAuthority()
-    val hostNoWWW =
-      if (host.startsWith("www.")) host.substring("www.".length) else host
-    Map(
-      ("epicurious.com" -> Parser.epicurious),
-      ("mccormick.com" -> Parser.mccormick),
-      ("recipetineats.com" -> Parser.recipeTinEats),
-      ("mamalovestocook.com" -> Parser.recipeTinEats),
-      ("soulfullymade.com" -> Parser.recipeTinEats),
-      ("familycookierecipes.com" -> Parser.recipeTinEats),
-      ("familyfreshmeals.com" -> Parser.recipeTinEats),
-      ("handmadefarmhouse.com" -> Parser.recipeTinEats),
-      ("tastesoflizzyt.com" -> Parser.recipeTinEats),
-      ("omnivorescookbook.com" -> Parser.recipeTinEats),
-      ("growforagecookferment.com" -> Parser.recipeTinEats),
-      ("joyfoodsunshine.com" -> Parser.recipeTinEats),
-      ("sallysbakingaddiction.com" -> Parser.tastyRecipes),
-      ("darngoodveggies.com" -> Parser.tastyRecipes),
-      ("pickledplum.com" -> Parser.tastyRecipes),
-      ("iheartvegetables.com" -> Parser.tastyRecipes),
-      ("seriouseats.com" -> Parser.seriousEats),
-      ("greatist.com" -> Parser.greatist),
-      ("dimitrasdishes.com" -> Parser.dimitrasDishes),
-      ("jif.com" -> Parser.jif),
-      ("kingarthurbaking.com" -> Parser.kingArthurBaking)
-    ).get(hostNoWWW)
+        ))
+    }).flatten
   }
 
   private def _guessFoodFromStr(
@@ -95,9 +88,7 @@ class ParserController @Inject()(
   ): Future[Ingredient.IngredientId] = {
     val foodLineFiltered = foodLine
       .filter(_ <= 0x7f)
-      .filterNot(_ == '!')
-      .filterNot(_ == ':')
-      .filterNot(_ == '/')
+      .filterNot(Set.from("!:/-").contains)
     searchFdcIndex(foodLineFiltered).transformWith {
       case Success(Some(ingredientId)) => Future.successful(ingredientId)
       case Success(None) => searchSelfIndex(foodLineFiltered)
@@ -108,47 +99,87 @@ class ParserController @Inject()(
   def searchFdcIndex(foodLine: String): Future[Option[Ingredient.IngredientId]] = {
     import gov.usda.nal.fdc.models.DataType._
     import gov.usda.nal.fdc.models.SearchResult
-    usdaController.fdc.getFoodsSearch(foodLine, Seq(
+    usdaC.fdc.getFoodsSearch(foodLine, Seq(
       // Branded,
       Foundation, SRLegacy
     ), pageSize = Some(10))().flatMap({
       case SearchResult(_, _, _, _, Nil) => Future.successful(None)
       case SearchResult(_, _, _, _, foods) =>
         Future.sequence(
-          foods.map((food) => foodController.getByFdcId(food.fdcId))
+          foods.map((food) => foodC.getByFdcId(food.fdcId))
         ).map(_.flatten
           .headOption
           .fold[Ingredient.IngredientId](
             Ingredient.USDAId(foods.head.fdcId)
           )((foodNode) => Ingredient.FoodNodeId(foodNode._id))
         ).map(Some(_))
-    })
+    }).recover {
+      case e: com.tflucke.webroutes.HTTPException if e.statusCode == 500 =>
+        println(s"USDA database failed to parse line: '$foodLine'")
+        throw e
+    }
   }
 
   def searchSelfIndex(foodLine: String): Future[Ingredient.IngredientId] = {
-    foodController.findByName(foodLine)
+    foodC.findByName(foodLine)
       .transform {
         case Success(Nil) => 
           Failure(new NoSuchElementException(foodLine))
         case Success(foodNode::rest) =>
           Success(Ingredient.FoodNodeId(foodNode._id))
+        case Success(_) => ???
         case Failure(e) => Failure(e)
       }
   }
 }
 
+object Parser {
+  type ParserFn = (USDAController, FoodController) => Parser
 
-case class Parser(
-  titleExtractor: HtmlExtractor[Element, String],
-  servingExtractor: HtmlExtractor[Element, Option[Float]],
-  prepTimeExtractor: Option[HtmlExtractor[Element, String]],
-  cookTimeExtractor: Option[HtmlExtractor[Element, String]],
-  ingredientExtractor: HtmlExtractor[Element, Iterable[(Float, MeasureUnit, String)]],
-  instructionExtractor: HtmlExtractor[Element, Iterable[String]],
-)
+  // Lookup table from site host name to the parser factory for that site.
+  // The visible caller strips a leading "www." from the host before lookup.
+  //
+  // The parser accessors in this object are `def`s, so every call builds a
+  // fresh function value, and function values compare by reference. The
+  // factories shared by several hosts are therefore bound exactly once here,
+  // so that all hosts using the same recipe format map to the *same*
+  // instance. Without that, `frequentParsers` would see every entry as a
+  // different parser: its per-parser counts would all be 1 and the fallback
+  // guess would run the same parser once per host that uses it.
+  private val knownParsers: Map[String, ParserFn] = {
+    val recipeTinEatsFn: ParserFn = Parser.recipeTinEats
+    val tastyRecipesFn: ParserFn = Parser.tastyRecipes
+    Map(
+      "epicurious.com" -> Parser.epicurious,
+      "mccormick.com" -> Parser.mccormick,
+      "recipetineats.com" -> recipeTinEatsFn,
+      "mamalovestocook.com" -> recipeTinEatsFn,
+      "soulfullymade.com" -> recipeTinEatsFn,
+      "familycookierecipes.com" -> recipeTinEatsFn,
+      "familyfreshmeals.com" -> recipeTinEatsFn,
+      "handmadefarmhouse.com" -> recipeTinEatsFn,
+      "tastesoflizzyt.com" -> recipeTinEatsFn,
+      "omnivorescookbook.com" -> recipeTinEatsFn,
+      "growforagecookferment.com" -> recipeTinEatsFn,
+      "joyfoodsunshine.com" -> recipeTinEatsFn,
+      "sallysbakingaddiction.com" -> tastyRecipesFn,
+      "darngoodveggies.com" -> tastyRecipesFn,
+      "pickledplum.com" -> tastyRecipesFn,
+      "iheartvegetables.com" -> tastyRecipesFn,
+      "seriouseats.com" -> Parser.seriousEats,
+      "greatist.com" -> Parser.greatist,
+      "dimitrasdishes.com" -> Parser.dimitrasDishes,
+      "jif.com" -> Parser.jif,
+      "kingarthurbaking.com" -> Parser.kingArthurBaking
+    )
+  }
 
-object Parser {
-  val mccormick = Parser(
+  // Parser factories taken from `knownParsers`, ordered by how many hosts map
+  // to each one (most used first). Grouping relies on function-value
+  // equality, so only entries that are the very same instance are counted
+  // together.
+  private val frequentParsers = knownParsers.values
+    .groupMapReduce(identity)(_ => 1)(_ + _)
+    .toSeq
+    .sortBy { case (_, hostCount) => -hostCount }
+    .map { case (parserFn, _) => parserFn }
+  
+  /** Chooses the parser factory to use for a document.
+    *
+    * The key is first looked up in `knownParsers`. When nothing is
+    * registered for it, every entry of `frequentParsers` is run against the
+    * document and the first one that extracts at least one ingredient is
+    * returned.
+    *
+    * @param usdaC controller handed to each candidate parser
+    * @param foodC controller handed to each candidate parser
+    * @param url   lookup key into `knownParsers`; despite the name, the
+    *              visible caller passes the host without a leading "www."
+    * @param doc   the already-fetched document to probe
+    * @return the selected factory, or `None` when no candidate succeeded
+    */
+  def apply(usdaC: USDAController, foodC: FoodController)(
+    url: String,
+    doc: Document
+  )(implicit ec: ExecutionContext): Future[Option[ParserFn]] = {
+    knownParsers.get(url) match {
+      case Some(registered) =>
+        Future.successful(Some(registered))
+      case None =>
+        // Each attempt completes with its own factory only if that factory
+        // parsed at least one ingredient; otherwise the attempt is a failed
+        // Future, which `Future.find` skips over.
+        val attempts = frequentParsers.map { candidate =>
+          candidate(usdaC, foodC)(doc, url)
+            .filter(_.ingredients.size > 0)
+            .map(_ => candidate)
+        }
+        Future.find(attempts)(_ => true)
+    }
+  }
+
+  def mccormick: ParserFn = Parser(
     text("h1"),
     // TODO use extractors
     text(".main-title .count").map(_.toFloatOption),
@@ -158,9 +189,9 @@ object Parser {
       _.map(_parseIngredient _)
     ),
     texts(".instructions-main span.para")
-  )
+  ) _
 
-  val epicurious = Parser(
+  def epicurious: ParserFn = Parser(
     text("h1"),
     text("""div[data-testid="IngredientList"] > p""")
       .map("Yield: \\D*(\\d+).*".r.findFirstMatchIn(_).map(_.group(1).toFloat)),
@@ -170,9 +201,9 @@ object Parser {
       _.map(_parseIngredient _)
     ),
     texts("""div[data-testid="InstructionsWrapper"] > ol > li > p""")
-  )
+  ) _
 
-  val recipeTinEats = Parser(
+  def recipeTinEats: ParserFn = Parser(
     text("h2.wprm-recipe-name"),
     text("span.wprm-recipe-servings").map(_.toFloatOption),
     Some(text("span.wprm-recipe-prep_time-minutes")),
@@ -209,9 +240,9 @@ object Parser {
         .map(_.replaceAll("\u00F1", "n"))
     ) })),
     texts("div.wprm-recipe-instruction-text")
-  )
+  ) _
 
-  val tastyRecipes = Parser(
+  def tastyRecipes: ParserFn = Parser(
     text("h2.tasty-recipes-title"),
     text("span.tasty-recipes-yield")
       .map("\\D*(\\d+).*".r.findFirstMatchIn(_).map(_.group(1).toFloat)),
@@ -239,9 +270,9 @@ object Parser {
       )})
     ),
     texts("div.tasty-recipes-instructions-body > ol > li")
-  )
+  ) _
 
-  val seriousEats = Parser(
+  def seriousEats: ParserFn = Parser(
     text("h2.recipe-decision-block__title"),
     text("div.recipe-serving > span > span.meta-text__data")
       .map("\\D*(\\d+).*".r.findFirstMatchIn(_).map(_.group(1).toFloat)),
@@ -275,9 +306,9 @@ object Parser {
       )})
     ),
     texts("div.structured-project__steps_1-0 > ol > li > p")
-  )
+  ) _
 
-  val greatist = Parser(
+  def greatist: ParserFn = Parser(
     text("h1"),
     elementList("article.article-body > ul > li").map(
         _.filter((listItem) => (listItem >?> text("strong")) == Some("Yield"))
@@ -300,9 +331,9 @@ object Parser {
       .map(_parseIngredient _)
     ),
     texts("article.article-body > ol > li")
-  )
+  ) _
 
-  val dimitrasDishes = Parser(
+  def dimitrasDishes: ParserFn = Parser(
     text("h2.mv-create-title-primary"),
     text("div.mv-create-time-yield > span").map(_.toFloatOption),
     None,
@@ -334,9 +365,9 @@ object Parser {
       ).map(_parseIngredient _)
     ),
     texts("div.mv-create-instructions > ol > li")
-  )
+  ) _
 
-  val jif = Parser(
+  def jif: ParserFn = Parser(
     text("h1.recipe-name"),
     elementList("div.recipe-breakdown-step").map(
       _.filter((listItem) => (listItem >?> text("i.servings")).isDefined)
@@ -379,9 +410,9 @@ object Parser {
       ))
       .map(_.map(_parseIngredient _)),
     texts("div.recipe-directions > ul > li > p")
-  )
+  ) _
 
-  val kingArthurBaking = Parser(
+  def kingArthurBaking: ParserFn = Parser(
     text("h1 > span"),
     text("div.stat__item--yield > span").map(_.toFloatOption),
     Some(text("div.stat__item--prep > span")),
@@ -412,7 +443,7 @@ object Parser {
       ))
       .map(_.map(_parseIngredient _)),
     texts("div.field field--recipe-steps > ol > li > p")
-  )
+  ) _
 
   private def _parseFraction(fractionLine: String) = {
     val fractionPattern = raw"(\d+)/(\d+)[\d-_]*".r