Skip to content

Commit

Permalink
feat: Improve Recipe Imports with Cleaner (#4517)
Browse files Browse the repository at this point in the history
  • Loading branch information
michael-genson authored Nov 13, 2024
1 parent 085c489 commit bcd0fcc
Show file tree
Hide file tree
Showing 6 changed files with 51 additions and 11 deletions.
5 changes: 2 additions & 3 deletions mealie/services/migrations/_migration_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -268,6 +268,5 @@ def clean_recipe_dictionary(self, recipe_dict: dict) -> Recipe:
with contextlib.suppress(KeyError):
del recipe_dict["id"]

recipe_dict = cleaner.clean(recipe_dict, self.translator, url=recipe_dict.get("org_url", None))

return Recipe(**recipe_dict)
recipe = cleaner.clean(recipe_dict, self.translator, url=recipe_dict.get("org_url", None))
return recipe
2 changes: 2 additions & 0 deletions mealie/services/recipe/recipe_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
from mealie.services._base_service import BaseService
from mealie.services.openai import OpenAIDataInjection, OpenAILocalImage, OpenAIService
from mealie.services.recipe.recipe_data_service import RecipeDataService
from mealie.services.scraper import cleaner

from .template_service import TemplateService

Expand Down Expand Up @@ -297,6 +298,7 @@ async def create_from_images(self, images: list[UploadFile], translate_language:
recipe_data = await openai_recipe_service.build_recipe_from_images(
local_images, translate_language=translate_language
)
recipe_data = cleaner.clean(recipe_data, self.translator)

recipe = self.create_one(recipe_data)
data_service = RecipeDataService(recipe.id)
Expand Down
14 changes: 11 additions & 3 deletions mealie/services/scraper/cleaner.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

from mealie.core.root_logger import get_logger
from mealie.lang.providers import Translator
from mealie.schema.recipe.recipe import Recipe

logger = get_logger("recipe-scraper")

Expand All @@ -33,16 +34,23 @@
""" Matches multiple new lines and removes erroneous white space """


def clean(recipe_data: dict, translator: Translator, url=None) -> dict:
def clean(recipe_data: Recipe | dict, translator: Translator, url=None) -> Recipe:
"""Main entrypoint to clean a recipe extracted from the web
and format the data into an acceptable format for the database
Args:
recipe_data (dict): raw recipe dicitonary
recipe_data (Recipe | dict): raw recipe object or recipe dictionary
Returns:
Recipe: cleaned recipe
"""
if not isinstance(recipe_data, dict):
# format the recipe like a scraped dictionary
recipe_data_dict = recipe_data.model_dump(by_alias=True)
recipe_data_dict["recipeIngredient"] = [ing.display for ing in recipe_data.recipe_ingredient]

recipe_data = recipe_data_dict

recipe_data["description"] = clean_string(recipe_data.get("description", ""))

# Times
Expand All @@ -59,7 +67,7 @@ def clean(recipe_data: dict, translator: Translator, url=None) -> dict:
recipe_data["notes"] = clean_notes(recipe_data.get("notes"))
recipe_data["rating"] = clean_int(recipe_data.get("rating"))

return recipe_data
return Recipe(**recipe_data)


def clean_string(text: str | list | int) -> str:
Expand Down
23 changes: 20 additions & 3 deletions mealie/services/scraper/recipe_scraper.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
from mealie.core.root_logger import get_logger
from mealie.lang.providers import Translator
from mealie.schema.recipe.recipe import Recipe
from mealie.services.scraper import cleaner
from mealie.services.scraper.scraped_extras import ScrapedExtras

from .scraper_strategies import (
Expand Down Expand Up @@ -31,6 +33,7 @@ def __init__(self, translator: Translator, scrapers: list[type[ABCScraperStrateg

self.scrapers = scrapers
self.translator = translator
self.logger = get_logger()

async def scrape(self, url: str, html: str | None = None) -> tuple[Recipe, ScrapedExtras] | tuple[None, None]:
"""
Expand All @@ -41,9 +44,23 @@ async def scrape(self, url: str, html: str | None = None) -> tuple[Recipe, Scrap
raw_html = html or await safe_scrape_html(url)
for scraper_type in self.scrapers:
scraper = scraper_type(url, self.translator, raw_html=raw_html)
result = await scraper.parse()

if result is not None:
return result
try:
result = await scraper.parse()
except Exception:
self.logger.exception(f"Failed to scrape HTML with {scraper.__class__.__name__}")
result = None

if result is None or result[0] is None:
continue

recipe_result, extras = result
try:
recipe = cleaner.clean(recipe_result, self.translator)
except Exception:
self.logger.exception(f"Failed to clean recipe data from {scraper.__class__.__name__}")
continue

return recipe, extras

return None, None
16 changes: 15 additions & 1 deletion mealie/services/scraper/scraper_strategies.py
Original file line number Diff line number Diff line change
Expand Up @@ -253,6 +253,18 @@ class RecipeScraperOpenAI(RecipeScraperPackage):
rather than trying to scrape it directly.
"""

def extract_json_ld_data_from_html(self, soup: bs4.BeautifulSoup) -> str:
    """Gather the raw contents of every JSON-LD script tag in the page.

    Tags whose contents are empty or cannot be read are skipped; the
    remaining blocks are joined with blank lines into a single string.
    """
    json_ld_tags = soup.find_all("script", type="application/ld+json")

    collected: list[str] = []
    for tag in json_ld_tags:
        try:
            contents = tag.string
        except AttributeError:
            # node doesn't expose `.string`; ignore it
            continue

        if contents:
            collected.append(str(contents))

    return "\n\n".join(collected)

def find_image(self, soup: bs4.BeautifulSoup) -> str | None:
# find the open graph image tag
og_image = soup.find("meta", property="og:image")
Expand Down Expand Up @@ -285,8 +297,10 @@ def format_html_to_text(self, html: str) -> str:
soup = bs4.BeautifulSoup(html, "lxml")

text = soup.get_text(separator="\n", strip=True)
text += self.extract_json_ld_data_from_html(soup)
if not text:
raise Exception("No text found in HTML")
raise Exception("No text or ld+json data found in HTML")

try:
image = self.find_image(soup)
except Exception:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@
def test_cleaner_clean(json_file: Path, num_steps):
    """Clean a raw recipe JSON fixture and verify the instruction-step count.

    NOTE(review): `json_file` and `num_steps` appear to be supplied by pytest
    parametrization defined above this view — confirm against the fixture list.
    """
    raw_data = json.loads(json_file.read_text())
    cleaned_recipe = cleaner.clean(raw_data, local_provider())

    instructions = cleaned_recipe.recipe_instructions or []
    assert len(instructions) == num_steps


def test_html_with_recipe_data():
Expand Down

0 comments on commit bcd0fcc

Please sign in to comment.