Sfoglia il codice sorgente

Fixed issues with parsing RecipeTinEats recipes.

Thomas Flucke 2 anni fa
parent
commit
f632e51d31
1 file modificato con 54 aggiunte e 17 eliminazioni
  1. 54 17
      server/app/com/weEat/controllers/ParserController.scala

+ 54 - 17
server/app/com/weEat/controllers/ParserController.scala

@@ -11,6 +11,7 @@ import com.weEat.services.OAuth2Service
 import net.ruippeixotog.scalascraper.browser.JsoupBrowser
 import net.ruippeixotog.scalascraper.dsl.DSL._
 import net.ruippeixotog.scalascraper.dsl.DSL.Extract._
+//import net.ruippeixotog.scalascraper.dsl.DSL.Parse._
 import net.ruippeixotog.scalascraper.model.Element
 import net.ruippeixotog.scalascraper.scraper.HtmlExtractor
 import scala.util._
@@ -33,8 +34,8 @@ class ParserController @Inject()(
       val doc = _browser.get(url)
       val title = doc >> parser.titleExtractor
       val servings = doc >> parser.servingExtractor
-      val prepTime = parser.prepTimeExtractor.map(doc >> _)
-      val cookTime = parser.cookTimeExtractor.map(doc >> _)
+      val prepTime = parser.prepTimeExtractor.flatMap(doc >?> _)
+      val cookTime = parser.cookTimeExtractor.flatMap(doc >?> _)
       val ingredients = doc >> parser.ingredientExtractor
       val instructions = doc >> parser.instructionExtractor
 
@@ -61,7 +62,8 @@ class ParserController @Inject()(
     Map(
       ("epicurious.com" -> Parser.epicurious),
       ("mccormick.com" -> Parser.mccormick),
-      ("recipetineats.com" -> Parser.recipeTinEats)
+      ("recipetineats.com" -> Parser.recipeTinEats),
+      ("mamalovestocook.com" -> Parser.recipeTinEats)
     ).get(hostNoWWW)
   }
 
@@ -69,12 +71,19 @@ class ParserController @Inject()(
     val numberPattern = raw"(\d+)[\d-_]*\s(\w+)\s+(.+)".r
     val fractionPattern = raw"(\d+)/(\d+)[\d-_]*\s(\w+)\s+(.+)".r
 
-    //println(ingredientLine)
     ingredientLine match {
       case numberPattern(amount, unit, rest) =>
-        _guessFoodFromStr(rest).map(Ingredient(_, amount.toFloat, MeasureUnit.guessUnit(unit).getOrElse(Count)))
+        _guessFoodFromStr(rest).map(Ingredient(
+          _,
+          amount.toFloat,
+          MeasureUnit.guessUnit(unit).getOrElse(Count)
+        ))
       case fractionPattern(numerator, denominator, unit, rest) =>
-        _guessFoodFromStr(rest).map(Ingredient(_, numerator.toFloat/denominator.toFloat, MeasureUnit.guessUnit(unit).getOrElse(Count)))
+        _guessFoodFromStr(rest).map(Ingredient(
+          _,
+          numerator.toFloat/denominator.toFloat,
+          MeasureUnit.guessUnit(unit).getOrElse(Count)
+        ))
       case noUnitLine => _guessFoodFromStr(noUnitLine).map(Ingredient(_, 1, Count))
     }
 
@@ -82,17 +91,20 @@ class ParserController @Inject()(
 
   private def _guessFoodFromStr(foodLine: String): Future[Ingredient.IngredientId] = {
     import gov.usda.nal.fdc.models.DataType._
-    usdaController.fdc.getFoodsSearch(foodLine, Seq(
+    usdaController.fdc.getFoodsSearch(foodLine.filterNot(_ == '/'), Seq(
       Foundation, Survey, SRLegacy
     ), pageSize = Some(10))().flatMap({ (fdcResult) =>
-      Future.sequence(fdcResult.foods.map((food) => foodController.getByFdcId(food.fdcId)))
-        .map(_.flatten
-          .headOption
-          .fold[Ingredient.IngredientId](Ingredient.USDAId(fdcResult.foods.head.fdcId))((foodNode) => Ingredient.FoodNodeId(foodNode._id))
-        ).transform({
-          case Success(x) => Success(x)
-          case Failure(x) => println(foodLine);Failure(x)
-        })
+      Future.sequence(
+        fdcResult.foods.map((food) => foodController.getByFdcId(food.fdcId))
+      ).map(_.flatten
+        .headOption
+        .fold[Ingredient.IngredientId](
+          Ingredient.USDAId(fdcResult.foods.head.fdcId)
+        )((foodNode) => Ingredient.FoodNodeId(foodNode._id))
+      ).transform({
+        case Success(x) => Success(x)
+        case Failure(x) => println(foodLine);Failure(x)
+      })
     })
   }
 }
@@ -109,6 +121,7 @@ case class Parser(
 object Parser {
   val mccormick = Parser(
     text("h1"),
+    // TODO use extractors
     text(".main-title .count").map(_.toFloatOption),
     Some(text(".prep_time .first_content")),
     cookTimeExtractor = Some(text(".ingredients .first_content")),
@@ -117,7 +130,8 @@ object Parser {
   )
   val epicurious = Parser(
     text("h1"),
-    text("""div[data-testid="IngredientList"] > p""").map("Yield: \\D*(\\d+).*".r.findFirstMatchIn(_).map(_.group(1).toFloat)),
+    text("""div[data-testid="IngredientList"] > p""")
+      .map("Yield: \\D*(\\d+).*".r.findFirstMatchIn(_).map(_.group(1).toFloat)),
     None,
     None,
     texts("""div[data-testid="IngredientList"] > div > div"""),
@@ -128,7 +142,30 @@ object Parser {
     text("span.wprm-recipe-servings").map(_.toFloatOption),
     Some(text("span.wprm-recipe-prep_time-minutes")),
     Some(text("span.wprm-recipe-cook_time-minutes")),
-    texts("li.wprm-recipe-ingredient").map(_.map(_.filter(_ <= 0x7f).filterNot(_ == '/').trim)),
+    texts("li.wprm-recipe-ingredient")
+      .map(_.map(_
+        .replaceAll("\u00BD", "1/2")
+        .replaceAll("\u00BC", "1/4")
+        .replaceAll("\u00BE", "3/4")
+        .replaceAll("\u2150", "1/7")
+        .replaceAll("\u2151", "1/9")
+        .replaceAll("\u2152", "1/10")
+        .replaceAll("\u2153", "1/3")
+        .replaceAll("\u2154", "2/3")
+        .replaceAll("\u2155", "1/5")
+        .replaceAll("\u2156", "2/5")
+        .replaceAll("\u2157", "3/5")
+        .replaceAll("\u2158", "4/5")
+        .replaceAll("\u2159", "1/6")
+        .replaceAll("\u215A", "5/6")
+        .replaceAll("\u215B", "1/8")
+        .replaceAll("\u215C", "3/8")
+        .replaceAll("\u215D", "5/8")
+        .replaceAll("\u215E", "7/8")
+        .replaceAll("\u215F", "1/")
+        .filter(_ <= 0x7f)
+        .trim
+      )),
     texts("div.wprm-recipe-instruction-text")
   )
 }