feat: Automatically keep user-agent up to date with latest user-agent #3529

Closed
12 changes: 3 additions & 9 deletions mealie/services/recipe/recipe_data_service.py
@@ -2,6 +2,7 @@
 import shutil
 from pathlib import Path
 
+import simple_useragent as sua
 from httpx import AsyncClient, Response
 from pydantic import UUID4
 
@@ -10,13 +11,6 @@
 from mealie.schema.recipe.recipe import Recipe
 from mealie.services._base_service import BaseService
 
-try:
-    from recipe_scrapers._abstract import HEADERS
-
-    _FIREFOX_UA = HEADERS["User-Agent"]
-except (ImportError, KeyError):
-    _FIREFOX_UA = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:123.0) Gecko/20100101 Firefox/123.0"
-
 
 async def gather_with_concurrency(n, *coros, ignore_exceptions=False):
     semaphore = asyncio.Semaphore(n)
@@ -38,7 +32,7 @@ async def largest_content_len(urls: list[str]) -> tuple[str, int]:
     max_concurrency = 10
 
     async def do(client: AsyncClient, url: str) -> Response:
-        return await client.head(url, headers={"User-Agent": _FIREFOX_UA})
+        return await client.head(url, headers={"User-Agent": sua.get_list(num=1)[0]})
 
     async with AsyncClient(transport=safehttp.AsyncSafeTransport()) as client:
         tasks = [do(client, url) for url in urls]
@@ -141,7 +135,7 @@ async def scrape_image(self, image_url: str | dict[str, str] | list[str]) -> None:
 
         async with AsyncClient(transport=AsyncSafeTransport()) as client:
             try:
-                r = await client.get(image_url_str, headers={"User-Agent": _FIREFOX_UA})
+                r = await client.get(image_url_str, headers={"User-Agent": sua.get_list(num=1)[0]})
             except Exception:
                 self.logger.exception("Fatal Image Request Exception")
                 return None
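For context on the new call: the diff replaces the pinned Firefox 123 user-agent constant with a string fetched at request time from the simple_useragent package, so the header no longer goes stale between releases. Below is a minimal standalone sketch of the new request path; the head_with_fresh_ua helper and the example URL are illustrative, and the only package call assumed is the sua.get_list(num=1)[0] already used in this diff:

    import asyncio

    import simple_useragent as sua
    from httpx import AsyncClient, Response


    async def head_with_fresh_ua(url: str) -> Response:
        # Fetch one current user-agent string instead of a hardcoded constant.
        user_agent = sua.get_list(num=1)[0]
        async with AsyncClient() as client:
            return await client.head(url, headers={"User-Agent": user_agent})


    # Usage (illustrative): asyncio.run(head_with_fresh_ua("https://example.com/recipe"))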
13 changes: 4 additions & 9 deletions mealie/services/scraper/scraper_strategies.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from typing import Any

import extruct
import simple_useragent as sua
from fastapi import HTTPException, status
from httpx import AsyncClient
from recipe_scrapers import NoSchemaFoundInWildMode, SchemaScraperFactory, scrape_html
Expand All @@ -18,14 +19,6 @@

from . import cleaner

try:
from recipe_scrapers._abstract import HEADERS

_FIREFOX_UA = HEADERS["User-Agent"]
except (ImportError, KeyError):
_FIREFOX_UA = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:123.0) Gecko/20100101 Firefox/123.0"


SCRAPER_TIMEOUT = 15


Expand All @@ -41,7 +34,9 @@
"""
async with AsyncClient(transport=safehttp.AsyncSafeTransport()) as client:
html_bytes = b""
async with client.stream("GET", url, timeout=SCRAPER_TIMEOUT, headers={"User-Agent": _FIREFOX_UA}) as resp:
async with client.stream(
"GET", url, timeout=SCRAPER_TIMEOUT, headers={"User-Agent": sua.get_list(num=1)[0]}
) as resp:
Dismissed Show dismissed Hide dismissed
start_time = time.time()

async for chunk in resp.aiter_bytes(chunk_size=1024):
Expand Down
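The streaming scraper path gets the same substitution. A standalone sketch under stated assumptions: fetch_html is a hypothetical wrapper name and the 1 MB size guard is illustrative (the real code tracks elapsed time via the start_time seen in the diff), while the timeout, the chunked aiter_bytes loop, and the sua.get_list(num=1)[0] header mirror the diff above:

    import simple_useragent as sua
    from httpx import AsyncClient

    SCRAPER_TIMEOUT = 15  # seconds, matching the constant in the diff
    MAX_BYTES = 1024 * 1024  # illustrative cap; Mealie's actual guard may differ


    async def fetch_html(url: str) -> bytes:
        html_bytes = b""
        async with AsyncClient() as client:
            async with client.stream(
                "GET", url, timeout=SCRAPER_TIMEOUT, headers={"User-Agent": sua.get_list(num=1)[0]}
            ) as resp:
                async for chunk in resp.aiter_bytes(chunk_size=1024):
                    html_bytes += chunk
                    if len(html_bytes) > MAX_BYTES:
                        break  # stop on oversized responses
        return html_bytes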