Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: fix the absence of matching synonyms during category insight import #1497

Merged
merged 2 commits into from
Dec 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions data/taxonomies/categories.full.json.gz
Git LFS file not shown
17 changes: 8 additions & 9 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ lark = "~1.1.4"
h5py = "~3.8.0"
opencv-python-headless = "~4.10.0.84"
toml = "~0.10.2"
openfoodfacts = "2.4.0"
openfoodfacts = "2.5.0"
imagehash = "~4.3.1"
peewee-migrate = "~1.12.2"
diskcache = "~5.6.3"
Expand Down
27 changes: 21 additions & 6 deletions robotoff/insights/importer.py
Original file line number Diff line number Diff line change
Expand Up @@ -835,16 +835,31 @@
predictions: list[Prediction],
product_id: ProductIdentifier,
) -> Iterator[ProductInsight]:
candidates = [
prediction
for prediction in predictions
if cls.is_prediction_valid(product, prediction.value_tag) # type: ignore
]
taxonomy = get_taxonomy(InsightType.category.name)
selected_candidates = []
for prediction in predictions:
if prediction.value_tag is None:
logger.warning(

Check warning on line 842 in robotoff/insights/importer.py

View check run for this annotation

Codecov / codecov/patch

robotoff/insights/importer.py#L842

Added line #L842 was not covered by tests
"Unexpected None `value_tag` (prediction: %s)", prediction
)
continue

Check warning on line 845 in robotoff/insights/importer.py

View check run for this annotation

Codecov / codecov/patch

robotoff/insights/importer.py#L845

Added line #L845 was not covered by tests
else:
prediction.value_tag = match_taxonomized_value(
prediction.value_tag, TaxonomyType.category.name
)
if prediction.value_tag is None:
logger.warning(f"Could not match {prediction.value_tag} (category)")
continue
elif not cls.is_prediction_valid(product, prediction.value_tag):
continue
else:
selected_candidates.append(prediction)

yield from (
ProductInsight(**candidate.to_dict())
for candidate in select_deepest_taxonomized_candidates(candidates, taxonomy)
for candidate in select_deepest_taxonomized_candidates(
selected_candidates, taxonomy
)
)

@staticmethod
Expand Down
36 changes: 17 additions & 19 deletions robotoff/taxonomy.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,17 @@
import collections
from typing import Optional

from cachetools.func import ttl_cache
from openfoodfacts.taxonomy import Taxonomy
from openfoodfacts.taxonomy import (
Taxonomy,
create_brand_taxonomy_mapping,
create_taxonomy_mapping,
)
from openfoodfacts.taxonomy import get_taxonomy as _get_taxonomy
from openfoodfacts.types import TaxonomyType

from robotoff import settings
from robotoff.utils import get_logger
from robotoff.utils.cache import function_cache_register
from robotoff.utils.text import get_tag

logger = get_logger(__name__)

Expand Down Expand Up @@ -65,6 +67,7 @@ def get_taxonomy(taxonomy_type: TaxonomyType | str, offline: bool = False) -> Ta
return _get_taxonomy(
taxonomy_type_enum,
force_download=False,
download_newer=True,
cache_dir=settings.DATA_DIR / "taxonomies",
)

Expand All @@ -84,29 +87,24 @@ def get_taxonomy_mapping(taxonomy_type: str) -> dict[str, str]:
"""
logger.debug("Loading taxonomy mapping %s...", taxonomy_type)
taxonomy = get_taxonomy(taxonomy_type)
ids: dict[str, str] = {}

for key in taxonomy.keys():
if taxonomy_type == TaxonomyType.brand.name:
unprefixed_key = key
if is_prefixed_value(key):
unprefixed_key = key[3:]
ids[unprefixed_key] = taxonomy[key].names["en"]
else:
for lang, name in taxonomy[key].names.items():
tag = get_tag(name)
ids[f"{lang}:{tag}"] = key
if taxonomy_type == TaxonomyType.brand.name:
return create_brand_taxonomy_mapping(taxonomy)
else:
return create_taxonomy_mapping(taxonomy)

return ids


def match_taxonomized_value(value_tag: str, taxonomy_type: str) -> Optional[str]:
def match_taxonomized_value(value_tag: str, taxonomy_type: str) -> str | None:
"""Return the canonical taxonomized value of a `value_tag` (if any) or
return None if no match was found or if the type is unsupported.

Currently it only works for brand and label.
Currently it only works for brand, label and category taxonomies.
"""
if taxonomy_type not in (TaxonomyType.brand.name, TaxonomyType.label.name):
if taxonomy_type not in (
TaxonomyType.brand.name,
TaxonomyType.label.name,
TaxonomyType.category.name,
):
return None

taxonomy = get_taxonomy(taxonomy_type)
Expand Down
7 changes: 7 additions & 0 deletions tests/unit/insights/test_importer.py
Original file line number Diff line number Diff line change
Expand Up @@ -926,6 +926,13 @@ def test_is_parent_category(self, category, to_check_categories, expected, mocke
Product({"code": DEFAULT_BARCODE, "categories_tags": ["en:meats"]}),
[],
),
(
[
Prediction(PredictionType.category, value_tag="en:shelled-almonds"),
],
Product({"code": DEFAULT_BARCODE, "categories_tags": []}),
["en:almonds-shelled"],
),
(
[
Prediction(
Expand Down
97 changes: 4 additions & 93 deletions tests/unit/test_taxonomy.py
Original file line number Diff line number Diff line change
@@ -1,98 +1,6 @@
import pytest

from robotoff import settings
from robotoff.taxonomy import Taxonomy, TaxonomyType, match_taxonomized_value

label_taxonomy = Taxonomy.from_path(settings.TAXONOMY_PATHS["label"])
category_taxonomy = Taxonomy.from_path(settings.TAXONOMY_PATHS["category"])


class TestTaxonomy:
@pytest.mark.parametrize(
"taxonomy,item,candidates,output",
[
(label_taxonomy, "en:organic", {"en:fr-bio-01"}, True),
(label_taxonomy, "en:fr-bio-01", {"en:organic"}, False),
(label_taxonomy, "en:fr-bio-01", [], False),
(label_taxonomy, "en:organic", {"en:gluten-free"}, False),
(
label_taxonomy,
"en:organic",
{"en:gluten-free", "en:no-additives", "en:vegan"},
False,
),
(
label_taxonomy,
"en:organic",
{"en:gluten-free", "en:no-additives", "en:fr-bio-16"},
True,
),
],
)
def test_is_child_of_any(
self, taxonomy: Taxonomy, item: str, candidates: list, output: bool
):
assert taxonomy.is_parent_of_any(item, candidates) is output

def test_is_child_of_any_unknwon_item(self):
with pytest.raises(ValueError):
label_taxonomy.is_parent_of_any("unknown-id", set())

@pytest.mark.parametrize(
"taxonomy,item,output",
[
(category_taxonomy, "en:plant-based-foods-and-beverages", set()),
(
category_taxonomy,
"en:plant-based-foods",
{"en:plant-based-foods-and-beverages"},
),
(
category_taxonomy,
"en:brown-rices",
{
"en:rices",
"en:cereal-grains",
"en:cereals-and-their-products",
"en:cereals-and-potatoes",
"en:plant-based-foods",
"en:plant-based-foods-and-beverages",
"en:seeds",
},
),
],
)
def test_get_parents_hierarchy(
self, taxonomy: Taxonomy, item: str, output: set[str]
):
node = taxonomy[item]
parents = node.get_parents_hierarchy()
assert set((x.id for x in parents)) == output

@pytest.mark.parametrize(
"taxonomy,items,output",
[
(category_taxonomy, [], []),
(category_taxonomy, ["en:brown-rices"], ["en:brown-rices"]),
(category_taxonomy, ["en:brown-rices", "en:rices"], ["en:brown-rices"]),
(
category_taxonomy,
["en:brown-rices", "en:rices", "en:cereal-grains"],
["en:brown-rices"],
),
(
category_taxonomy,
["en:brown-rices", "en:teas", "en:cereal-grains"],
["en:brown-rices", "en:teas"],
),
],
)
def test_find_deepest_nodes(
self, taxonomy: Taxonomy, items: list[str], output: list[str]
):
item_nodes = [taxonomy[item] for item in items]
output_nodes = [taxonomy[o] for o in output]
assert taxonomy.find_deepest_nodes(item_nodes) == output_nodes
from robotoff.taxonomy import TaxonomyType, match_taxonomized_value


@pytest.mark.parametrize(
Expand All @@ -114,6 +22,9 @@ def test_find_deepest_nodes(
(TaxonomyType.label.name, "unknown-label", None),
(TaxonomyType.label.name, "fr:viande-bovine-francaise", "en:french-beef"),
(TaxonomyType.ingredient.name, "text", None), # unsupported taxonomy
# en:almonds-shelled is the canonical ID; here we check that the synonym
# en:shelled-almonds is matched to it
(TaxonomyType.category.name, "en:shelled-almonds", "en:almonds-shelled"),
],
)
def test_match_taxonomized_value(taxonomy_type, value, expected):
Expand Down
Loading