Skip to content

Commit

Permalink
feat: Improve Recipe Imports with Cleaner (#4517)
Browse files Browse the repository at this point in the history
  • Loading branch information
michael-genson authored Nov 13, 2024
1 parent 085c489 commit bcd0fcc
Show file tree
Hide file tree
Showing 6 changed files with 51 additions and 11 deletions.
5 changes: 2 additions & 3 deletions mealie/services/migrations/_migration_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -268,6 +268,5 @@ def clean_recipe_dictionary(self, recipe_dict: dict) -> Recipe:
with contextlib.suppress(KeyError):
del recipe_dict["id"]

recipe_dict = cleaner.clean(recipe_dict, self.translator, url=recipe_dict.get("org_url", None))

return Recipe(**recipe_dict)
recipe = cleaner.clean(recipe_dict, self.translator, url=recipe_dict.get("org_url", None))
return recipe
2 changes: 2 additions & 0 deletions mealie/services/recipe/recipe_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
from mealie.services._base_service import BaseService
from mealie.services.openai import OpenAIDataInjection, OpenAILocalImage, OpenAIService
from mealie.services.recipe.recipe_data_service import RecipeDataService
from mealie.services.scraper import cleaner

from .template_service import TemplateService

Expand Down Expand Up @@ -297,6 +298,7 @@ async def create_from_images(self, images: list[UploadFile], translate_language:
recipe_data = await openai_recipe_service.build_recipe_from_images(
local_images, translate_language=translate_language
)
recipe_data = cleaner.clean(recipe_data, self.translator)

recipe = self.create_one(recipe_data)
data_service = RecipeDataService(recipe.id)
Expand Down
14 changes: 11 additions & 3 deletions mealie/services/scraper/cleaner.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

from mealie.core.root_logger import get_logger
from mealie.lang.providers import Translator
from mealie.schema.recipe.recipe import Recipe

logger = get_logger("recipe-scraper")

Expand All @@ -33,16 +34,23 @@
""" Matches multiple new lines and removes erroneous white space """


def clean(recipe_data: dict, translator: Translator, url=None) -> dict:
def clean(recipe_data: Recipe | dict, translator: Translator, url=None) -> Recipe:
"""Main entrypoint to clean a recipe extracted from the web
and format the data into an acceptable format for the database
Args:
recipe_data (dict): raw recipe dicitonary
recipe_data (Recipe | dict): raw recipe object or recipe dictionary
Returns:
Recipe: cleaned recipe
"""
if not isinstance(recipe_data, dict):
# format the recipe like a scraped dictionary
recipe_data_dict = recipe_data.model_dump(by_alias=True)
recipe_data_dict["recipeIngredient"] = [ing.display for ing in recipe_data.recipe_ingredient]

recipe_data = recipe_data_dict

recipe_data["description"] = clean_string(recipe_data.get("description", ""))

# Times
Expand All @@ -59,7 +67,7 @@ def clean(recipe_data: dict, translator: Translator, url=None) -> dict:
recipe_data["notes"] = clean_notes(recipe_data.get("notes"))
recipe_data["rating"] = clean_int(recipe_data.get("rating"))

return recipe_data
return Recipe(**recipe_data)


def clean_string(text: str | list | int) -> str:
Expand Down
23 changes: 20 additions & 3 deletions mealie/services/scraper/recipe_scraper.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
from mealie.core.root_logger import get_logger
from mealie.lang.providers import Translator
from mealie.schema.recipe.recipe import Recipe
from mealie.services.scraper import cleaner
from mealie.services.scraper.scraped_extras import ScrapedExtras

from .scraper_strategies import (
Expand Down Expand Up @@ -31,6 +33,7 @@ def __init__(self, translator: Translator, scrapers: list[type[ABCScraperStrateg

self.scrapers = scrapers
self.translator = translator
self.logger = get_logger()

async def scrape(self, url: str, html: str | None = None) -> tuple[Recipe, ScrapedExtras] | tuple[None, None]:
"""
Expand All @@ -41,9 +44,23 @@ async def scrape(self, url: str, html: str | None = None) -> tuple[Recipe, Scrap
raw_html = html or await safe_scrape_html(url)
for scraper_type in self.scrapers:
scraper = scraper_type(url, self.translator, raw_html=raw_html)
result = await scraper.parse()

if result is not None:
return result
try:
result = await scraper.parse()
except Exception:
self.logger.exception(f"Failed to scrape HTML with {scraper.__class__.__name__}")
result = None

if result is None or result[0] is None:
continue

recipe_result, extras = result
try:
recipe = cleaner.clean(recipe_result, self.translator)
except Exception:
self.logger.exception(f"Failed to clean recipe data from {scraper.__class__.__name__}")
continue

return recipe, extras

return None, None
16 changes: 15 additions & 1 deletion mealie/services/scraper/scraper_strategies.py
Original file line number Diff line number Diff line change
Expand Up @@ -253,6 +253,18 @@ class RecipeScraperOpenAI(RecipeScraperPackage):
rather than trying to scrape it directly.
"""

def extract_json_ld_data_from_html(self, soup: bs4.BeautifulSoup) -> str:
    """Gather the raw contents of every JSON-LD script tag in the page.

    Tags whose contents are empty or cannot be read are skipped; the
    remaining blocks are joined with blank lines into a single string.
    """
    json_ld_tags = soup.find_all("script", type="application/ld+json")

    collected: list[str] = []
    for tag in json_ld_tags:
        try:
            contents = tag.string
        except AttributeError:
            # node doesn't expose `.string`; ignore it
            continue

        if contents:
            collected.append(str(contents))

    return "\n\n".join(collected)

def find_image(self, soup: bs4.BeautifulSoup) -> str | None:
# find the open graph image tag
og_image = soup.find("meta", property="og:image")
Expand Down Expand Up @@ -285,8 +297,10 @@ def format_html_to_text(self, html: str) -> str:
soup = bs4.BeautifulSoup(html, "lxml")

text = soup.get_text(separator="\n", strip=True)
text += self.extract_json_ld_data_from_html(soup)
if not text:
raise Exception("No text found in HTML")
raise Exception("No text or ld+json data found in HTML")

try:
image = self.find_image(soup)
except Exception:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@
def test_cleaner_clean(json_file: Path, num_steps):
    """Clean a raw recipe JSON fixture and verify the instruction-step count.

    NOTE(review): `json_file` and `num_steps` appear to be supplied by pytest
    parametrization defined above this view — confirm against the fixture list.
    """
    raw_data = json.loads(json_file.read_text())
    cleaned_recipe = cleaner.clean(raw_data, local_provider())

    instructions = cleaned_recipe.recipe_instructions or []
    assert len(instructions) == num_steps


def test_html_with_recipe_data():
Expand Down

0 comments on commit bcd0fcc

Please sign in to comment.