Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: adding metadata grouper component #8512

Merged
merged 36 commits into from
Nov 12, 2024
Merged
Show file tree
Hide file tree
Changes from 25 commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
0bb60f6
initial import
davidsbatista Oct 31, 2024
9d0e090
making tests more readable; adding docstring
davidsbatista Oct 31, 2024
834fe32
adding release notes
davidsbatista Oct 31, 2024
8f224fe
adding LICENSE header
davidsbatista Nov 1, 2024
ccf6080
Merge branch 'main' into add-metadata-grouper
davidsbatista Nov 1, 2024
9376b06
Merge branch 'main' into add-metadata-grouper
davidsbatista Nov 5, 2024
b6fa393
Update test/components/rankers/test_metadata_grouper.py
davidsbatista Nov 11, 2024
75a009b
refactoring
davidsbatista Nov 11, 2024
c6b76ef
fixing docstring
davidsbatista Nov 11, 2024
70eaec1
fixing types
davidsbatista Nov 11, 2024
20e3249
test docstrings
davidsbatista Nov 11, 2024
39b0e80
renaming test
davidsbatista Nov 11, 2024
cdc05bd
Merge branch 'main' into add-metadata-grouper
davidsbatista Nov 11, 2024
e47b4ed
handling too-many-arguments
davidsbatista Nov 11, 2024
a95a4f4
liting
davidsbatista Nov 11, 2024
8a53cb1
Update haystack/components/rankers/metadata_grouper.py
davidsbatista Nov 11, 2024
a3dcbd8
changing name
davidsbatista Nov 11, 2024
a546939
Update haystack/components/rankers/metadata_grouper.py
davidsbatista Nov 11, 2024
40bece7
Update haystack/components/rankers/metadata_grouper.py
davidsbatista Nov 11, 2024
7feeb7a
assiging value inside function for re-use
davidsbatista Nov 11, 2024
7366dcd
improving docstring
davidsbatista Nov 11, 2024
44b1289
updating name to MetaFieldGroupingRanker
davidsbatista Nov 11, 2024
3c603b5
adding to pydocs
davidsbatista Nov 11, 2024
6572262
fixing imports
davidsbatista Nov 11, 2024
ecac860
adding output docstring
davidsbatista Nov 11, 2024
7e205ef
Update haystack/components/rankers/meta_field_grouper_ranker.py
davidsbatista Nov 11, 2024
c3c624c
Update haystack/components/rankers/__init__.py
davidsbatista Nov 11, 2024
a60ac03
Update releasenotes/notes/add-metadata-grouper-21ec05fd4a307425.yaml
davidsbatista Nov 11, 2024
11e5b68
Update test/components/rankers/test_metadata_grouper.py
davidsbatista Nov 11, 2024
4630570
update docstring tests
davidsbatista Nov 11, 2024
1c883e4
fixing imports
davidsbatista Nov 11, 2024
18ed44c
rename modules for consistency
anakin87 Nov 11, 2024
23a8216
fix pydocs
anakin87 Nov 11, 2024
d7fab9b
Merge branch 'main' into add-metadata-grouper
davidsbatista Nov 12, 2024
bb7a710
simplification + more tests
anakin87 Nov 12, 2024
561e9f7
Merge branch 'main' into add-metadata-grouper
anakin87 Nov 12, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/pydoc/config/rankers_api.yml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
loaders:
- type: haystack_pydoc_tools.loaders.CustomPythonLoader
search_path: [../../../haystack/components/rankers]
modules: ["lost_in_the_middle", "meta_field", "transformers_similarity", "sentence_transformers_diversity"]
modules: ["lost_in_the_middle", "meta_field", "meta_field_grouper_ranker", "transformers_similarity", "sentence_transformers_diversity"]
ignore_when_discovered: ["__init__"]
processors:
- type: filter
Expand Down
2 changes: 2 additions & 0 deletions haystack/components/rankers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,14 @@

from haystack.components.rankers.lost_in_the_middle import LostInTheMiddleRanker
from haystack.components.rankers.meta_field import MetaFieldRanker
from haystack.components.rankers.meta_field_grouper_ranker import MetaFieldGroupingRanker
davidsbatista marked this conversation as resolved.
Show resolved Hide resolved
from haystack.components.rankers.sentence_transformers_diversity import SentenceTransformersDiversityRanker
from haystack.components.rankers.transformers_similarity import TransformersSimilarityRanker

# Names exported by `from haystack.components.rankers import *`; every ranker class imported above is listed here.
__all__ = [
    "LostInTheMiddleRanker",
    "MetaFieldRanker",
    "MetaFieldGroupingRanker",
    "SentenceTransformersDiversityRanker",
    "TransformersSimilarityRanker",
]
206 changes: 206 additions & 0 deletions haystack/components/rankers/meta_field_grouper_ranker.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,206 @@
# SPDX-FileCopyrightText: 2022-present deepset GmbH <[email protected]>
#
# SPDX-License-Identifier: Apache-2.0

from collections import defaultdict
from typing import Any, Dict, List, Optional, Tuple

from haystack import Document, component, logging

logger = logging.getLogger(__name__)


@component
class MetaFieldGroupingRanker:
    """
    Reorders the documents by grouping them based on metadata keys.

    The MetaFieldGroupingRanker can group documents by a primary metadata key `group_by`, and subgroup them with an
    optional secondary key, `subgroup_by`.
    Within each group or subgroup, it can also sort documents by a metadata key `sort_docs_by`.

    The output is a flat list of documents ordered by `group_by` and `subgroup_by` values.
    Groups and subgroups appear in the order in which their values are first encountered in the input.
    Any documents without a group are placed at the end of the list, in their original order.

    Group and subgroup values are compared through their string representation, so unhashable metadata values
    (for example lists) can be used as grouping keys.

    The proper organization of documents helps improve the efficiency and performance of subsequent processing by an LLM.

    ### Usage example

    ```python
    from haystack.components.rankers import MetaFieldGroupingRanker
    from haystack.dataclasses import Document


    docs = [
        Document(content="Javascript is a popular programming language", meta={"group": "42", "split_id": 7, "subgroup": "subB"}),
        Document(content="Python is a popular programming language",meta={"group": "42", "split_id": 4, "subgroup": "subB"}),
        Document(content="A chromosome is a package of DNA", meta={"group": "314", "split_id": 2, "subgroup": "subC"}),
        Document(content="An octopus has three hearts", meta={"group": "11", "split_id": 2, "subgroup": "subD"}),
        Document(content="Java is a popular programming language", meta={"group": "42", "split_id": 3, "subgroup": "subB"})
    ]

    sample_meta_aggregator = MetaFieldGroupingRanker(group_by="group",subgroup_by="subgroup", sort_docs_by="split_id")
    result = sample_meta_aggregator.run(documents=docs)
    print(result["documents"])
    [
        Document(id=d665bbc83e52c08c3d8275bccf4f22bf2bfee21c6e77d78794627637355b8ebc,
                content: 'Java is a popular programming language', meta: {'group': '42', 'split_id': 3, 'subgroup': 'subB'}),
        Document(id=a20b326f07382b3cbf2ce156092f7c93e8788df5d48f2986957dce2adb5fe3c2,
                content: 'Python is a popular programming language', meta: {'group': '42', 'split_id': 4, 'subgroup': 'subB'}),
        Document(id=ce12919795d22f6ca214d0f161cf870993889dcb146f3bb1b3e1ffdc95be960f,
                content: 'Javascript is a popular programming language', meta: {'group': '42', 'split_id': 7, 'subgroup': 'subB'}),
        Document(id=d9fc857046c904e5cf790b3969b971b1bbdb1b3037d50a20728fdbf82991aa94,
                content: 'A chromosome is a package of DNA', meta: {'group': '314', 'split_id': 2, 'subgroup': 'subC'}),
        Document(id=6d3b7bdc13d09aa01216471eb5fb0bfdc53c5f2f3e98ad125ff6b85d3106c9a3,
                content: 'An octopus has three hearts', meta: {'group': '11', 'split_id': 2, 'subgroup': 'subD'})
    ]
    ```
    """  # noqa: E501

    def __init__(self, group_by: str, subgroup_by: Optional[str] = None, sort_docs_by: Optional[str] = None):
        """
        Creates an instance of MetaFieldGroupingRanker.

        :param group_by: The metadata key to aggregate the documents by.
        :param subgroup_by: The metadata key to aggregate the documents within a group that was created by the
                            `group_by` key.
        :param sort_docs_by: Determines which metadata key is used to sort the documents. If not provided, the
                             documents within the groups or subgroups are not sorted and are kept in the same order as
                             they were inserted in the subgroups.
        """
        self.group_by = group_by
        self.sort_docs_by = sort_docs_by
        self.subgroup_by = subgroup_by

    def _group_documents(
        self, documents: List[Document]
    ) -> Tuple[Dict[str, List[Document]], List[Document], List[str]]:
        """
        Go through all documents and bucket them based on the 'group_by' value.

        If no 'group_by' value is present in the document, the document is added to a list of documents without a group.

        :param documents: The documents to bucket.
        :returns:
            A tuple with the following elements:
            - document_groups: A dictionary with the stringified 'group_by' values as keys and the documents as values.
            - no_group: A list of documents without a 'group_by' value, in input order.
            - ordered_keys: A list of 'group_by' values in the order they were encountered.
        """
        document_groups: Dict[str, List[Document]] = defaultdict(list)
        no_group: List[Document] = []

        for document in documents:
            if self.group_by in document.meta:
                # Stringify the value so that unhashable metadata (e.g. lists) can still be used as a key.
                document_groups[str(document.meta[self.group_by])].append(document)
            else:
                no_group.append(document)

        # Dictionaries keep insertion order, so the keys already are in first-encounter order.
        return document_groups, no_group, list(document_groups)

    def _create_subgroups(
        self, document_groups: Dict[str, List[Document]]
    ) -> Tuple[Dict[str, Dict[str, List[Document]]], List[str]]:
        """
        Buckets the documents within the groups based on the 'subgroup_by' value.

        If no 'subgroup_by' value is present in the document, the document is added to a subgroup with the key
        'no_subgroup'. When `subgroup_by` is not configured, both returned containers are empty.

        :param document_groups: A dictionary with the 'group_by' values as keys and the documents as values.
        :returns:
            A tuple with the following elements:
            - document_subgroups: A dictionary mapping each 'group_by' value to a dictionary with the stringified
              'subgroup_by' values as keys and the documents as values.
            - subgroup_ordered_keys: A list of 'subgroup_by' values in the order they were encountered across all
              groups.
        """
        document_subgroups: Dict[str, Dict[str, List[Document]]] = defaultdict(lambda: defaultdict(list))
        # A dict is used as an insertion-ordered set: O(1) membership while keeping first-encounter order.
        seen_subgroups: Dict[str, None] = {}

        if self.subgroup_by:
            for group_key, group_docs in document_groups.items():
                for doc in group_docs:
                    if self.subgroup_by in doc.meta:
                        subgroup_key = str(doc.meta[self.subgroup_by])
                    else:
                        subgroup_key = "no_subgroup"
                    document_subgroups[group_key][subgroup_key].append(doc)
                    seen_subgroups.setdefault(subgroup_key, None)

        return document_subgroups, list(seen_subgroups)

    def _sort_key(self, document: Document) -> Tuple[bool, Any]:
        """
        Builds the key used to sort a document by the 'sort_docs_by' metadata value.

        The first tuple element pushes documents whose value is missing (or `None`) behind all others without
        ever comparing the missing value with a real one, so sorting also works for non-numeric values.

        :param document: The document to compute the key for.
        :returns:
            A `(is_missing, value)` tuple.
        """
        value = document.meta.get(self.sort_docs_by) if self.sort_docs_by else None
        return value is None, value

    def _merge_and_sort(
        self,
        document_groups: Dict[str, List[Document]],
        document_subgroups: Dict[str, Dict[str, List[Document]]],
        no_group: List[Document],
        ordered_keys: List[str],
        subgroup_ordered_keys: List[str],
    ) -> List[Document]:  # pylint: disable=too-many-positional-arguments
        """
        Merges the groups, subgroups, and ungrouped documents into the final flat list.

        Groups are emitted in the order given by `ordered_keys`. If `subgroup_by` is set, the documents of each group
        are emitted subgroup by subgroup, following `subgroup_ordered_keys`, regardless of whether sorting is enabled.
        If `sort_docs_by` is set, the documents of each group or subgroup are sorted by that metadata value and
        documents without the value are placed last; otherwise the insertion order is kept.
        Documents without a group are appended at the end, unchanged and without de-duplication.

        :param document_groups: A dictionary with the 'group_by' values as keys and the documents as values.
        :param document_subgroups: A dictionary mapping 'group_by' values to dictionaries with the 'subgroup_by'
            values as keys and the documents as values.
        :param no_group: A list of documents without a 'group_by' value.
        :param ordered_keys: A list of 'group_by' values in the order they were encountered.
        :param subgroup_ordered_keys: A list of 'subgroup_by' values in the order they were encountered.
        :returns:
            The merged list of documents.
        """
        result_docs: List[Document] = []

        for group in ordered_keys:
            if self.subgroup_by:
                # `.get` avoids creating empty entries in the (default)dicts for subgroups a group does not have.
                subgroups = document_subgroups.get(group, {})
                buckets = [subgroups.get(subgroup, []) for subgroup in subgroup_ordered_keys]
            else:
                buckets = [document_groups.get(group, [])]

            for bucket in buckets:
                result_docs.extend(sorted(bucket, key=self._sort_key) if self.sort_docs_by else bucket)

        # Ungrouped documents can never be part of a group, so no membership check is needed here.
        result_docs.extend(no_group)
        return result_docs

    @component.output_types(documents=List[Document])
    def run(self, documents: List[Document]) -> Dict[str, Any]:
        """
        Groups the provided list of documents based on the `group_by` parameter and optionally the `subgroup_by`.

        The output is a list of documents reordered based on how they were grouped.

        :param documents: The list of documents to group.
        :returns:
            A dictionary with the following keys:
            - documents: The list of documents ordered by the `group_by` and `subgroup_by` metadata values.
        """
        if not documents:
            return {"documents": []}

        # bucket the docs based on the 'group_by' value
        document_groups, no_group, ordered_keys = self._group_documents(documents)

        # further grouping of the documents inside each group based on the 'subgroup_by' value
        document_subgroups, subgroup_ordered_keys = self._create_subgroups(document_groups)

        # merge everything back into a flat list, sorting within the groups or subgroups if necessary
        result_docs = self._merge_and_sort(
            document_groups, document_subgroups, no_group, ordered_keys, subgroup_ordered_keys
        )

        return {"documents": result_docs}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you see problems in a simple implementation like this?

    @component.output_types(documents=List[Document])
    def run(self, documents: List[Document]) -> Dict[str, Any]:
    ...
        # ("..." above presumably stands for the unchanged docstring, elided in this review snippet.)
        # Nothing to reorder for an empty input.
        if not documents:
            return {"documents": []}

        # group value -> subgroup value -> documents; dict insertion order keeps first-encounter order.
        document_groups = defaultdict(lambda: defaultdict(list))
        no_group_docs = []

        for doc in documents:
            # A missing key (or a value whose string form is empty) means "no group".
            # NOTE(review): a value of None is stringified to "None" and is therefore treated as a regular group.
            group_value = str(doc.meta.get(self.group_by, ""))
            if group_value:
                # Without `subgroup_by`, or without that key in the metadata, the "no_subgroup" bucket is used.
                subgroup_value = str(doc.meta.get(self.subgroup_by, "no_subgroup")) if self.subgroup_by else "no_subgroup"
                document_groups[group_value][subgroup_value].append(doc)
            else:
                no_group_docs.append(doc)

        # Flatten group by group and subgroup by subgroup, sorting each bucket only if requested.
        ordered_docs = []
        for group in document_groups:
            for subgroup in document_groups[group]:
                docs = document_groups[group][subgroup]
                if self.sort_docs_by:
                    # Documents without the sort key get +inf and end up last (assumes numeric sort values).
                    docs = sorted(docs, key=lambda d: d.meta.get(self.sort_docs_by, float("inf")))
                ordered_docs.extend(docs)

        # Ungrouped documents always go to the end, in input order.
        ordered_docs.extend(no_group_docs)

        return {"documents": ordered_docs}

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@ju-gu can you test whether this simpler implementation is suitable for your needs? The tests are satisfied.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nice, this seems to work as well, 👍

Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
---
features:
- |
We have added a new MetaFieldGroupingRanker component that reorders documents by grouping them based on metadata keys. This can be useful as a pre-processing step before feeding them to an LLM.
davidsbatista marked this conversation as resolved.
Show resolved Hide resolved
123 changes: 123 additions & 0 deletions test/components/rankers/test_metadata_grouper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
# SPDX-FileCopyrightText: 2022-present deepset GmbH <[email protected]>
#
# SPDX-License-Identifier: Apache-2.0

from typing import Any, Dict

from haystack import Pipeline
from haystack.dataclasses import Document

from haystack.components.rankers.meta_field_grouper_ranker import MetaFieldGroupingRanker

# Shared fixture for the tests below. Documents of the same group are deliberately interleaved so that grouping
# has to reorder them.
DOC_LIST = [
    # regular: documents carrying "split_id" and "subgroup"; all but "bla bla bla bla" also have a "group"
    Document(content="Javascript is a popular language", meta={"group": "42", "split_id": 7, "subgroup": "subB"}),
    Document(content="A chromosome is a package of DNA", meta={"group": "314", "split_id": 2, "subgroup": "subC"}),
    Document(content="DNA carries genetic information", meta={"group": "314", "split_id": 1, "subgroup": "subE"}),
    Document(content="Blue whales have a big heart", meta={"group": "11", "split_id": 8, "subgroup": "subF"}),
    Document(content="Python is a popular language", meta={"group": "42", "split_id": 4, "subgroup": "subB"}),
    Document(content="bla bla bla bla", meta={"split_id": 8, "subgroup": "subG"}),
    Document(content="Java is a popular programming language", meta={"group": "42", "split_id": 3, "subgroup": "subB"}),
    Document(content="An octopus has three hearts", meta={"group": "11", "split_id": 2, "subgroup": "subD"}),
    # without split id (two of them also lack a "subgroup")
    Document(content="without split id", meta={"group": "11"}),
    Document(content="without split id2", meta={"group": "22", "subgroup": "subI"}),
    Document(content="without split id3", meta={"group": "11"}),
    # with list values in the metadata; these use "value_list"/"sub_value_list" instead of "group"/"subgroup"
    Document(content="list values", meta={"value_list": ["11"], "split_id": 8, "sub_value_list": ["subF"]}),
    Document(content="list values2", meta={"value_list": ["12"], "split_id": 3, "sub_value_list": ["subX"]}),
    Document(content="list values3", meta={"value_list": ["12"], "split_id": 8, "sub_value_list": ["subX"]}),
]


class TestMetaFieldSorter:
    """Tests for the MetaFieldGroupingRanker component."""

    def test_init_default(self) -> None:
        """
        Test the default initialization of the MetaFieldGroupingRanker component and that running it on an empty
        list of documents returns an empty result.
        """
        sample_meta_aggregator = MetaFieldGroupingRanker(group_by="group", sort_docs_by=None)
        assert sample_meta_aggregator.group_by == "group"
        assert sample_meta_aggregator.subgroup_by is None
        assert sample_meta_aggregator.sort_docs_by is None

        result = sample_meta_aggregator.run(documents=[])
        assert "documents" in result
        assert result["documents"] == []

    def test_run_group_by_only(self) -> None:
        """
        Test the MetaFieldGroupingRanker component with only the 'group_by' parameter. No subgroup or sorting is done.
        """
        sample_meta_aggregator = MetaFieldGroupingRanker(group_by="group")
        result = sample_meta_aggregator.run(documents=DOC_LIST)
        assert "documents" in result
        assert len(DOC_LIST) == len(result["documents"])
        assert result["documents"][0].meta["split_id"] == 7 and result["documents"][0].meta["group"] == "42"
        assert result["documents"][1].meta["split_id"] == 4 and result["documents"][1].meta["group"] == "42"
        assert result["documents"][2].meta["split_id"] == 3 and result["documents"][2].meta["group"] == "42"
        assert result["documents"][3].meta["split_id"] == 2 and result["documents"][3].meta["group"] == "314"
        assert result["documents"][4].meta["split_id"] == 1 and result["documents"][4].meta["group"] == "314"
        assert result["documents"][5].meta["split_id"] == 8 and result["documents"][5].meta["group"] == "11"
        assert result["documents"][6].meta["split_id"] == 2 and result["documents"][6].meta["group"] == "11"
        assert result["documents"][7].content == "without split id" and result["documents"][7].meta["group"] == "11"
        assert result["documents"][8].content == "without split id3" and result["documents"][8].meta["group"] == "11"
        assert result["documents"][9].content == "without split id2" and result["documents"][9].meta["group"] == "22"
        # documents without a group come last
        assert result["documents"][10].content == "bla bla bla bla"

    def test_with_group_subgroup_and_sorting(self) -> None:
        """
        Test the MetaFieldGroupingRanker component with all parameters set, i.e.: grouping by 'group', subgrouping by
        'subgroup', and sorting by 'split_id'.
        """
        meta_aggregator = MetaFieldGroupingRanker(group_by="group", subgroup_by="subgroup", sort_docs_by="split_id")
        result = meta_aggregator.run(documents=DOC_LIST)

        assert "documents" in result
        assert len(DOC_LIST) == len(result["documents"])
        assert (
            result["documents"][0].meta["subgroup"] == "subB"
            and result["documents"][0].meta["group"] == "42"
            and result["documents"][0].meta["split_id"] == 3
        )
        assert (
            result["documents"][1].meta["subgroup"] == "subB"
            and result["documents"][1].meta["group"] == "42"
            and result["documents"][1].meta["split_id"] == 4
        )
        assert (
            result["documents"][2].meta["subgroup"] == "subB"
            and result["documents"][2].meta["group"] == "42"
            and result["documents"][2].meta["split_id"] == 7
        )
        assert result["documents"][3].meta["subgroup"] == "subC" and result["documents"][3].meta["group"] == "314"
        assert result["documents"][4].meta["subgroup"] == "subE" and result["documents"][4].meta["group"] == "314"
        # each assertion checks subgroup and group of the *same* document
        assert result["documents"][5].meta["subgroup"] == "subF" and result["documents"][5].meta["group"] == "11"
        assert result["documents"][6].meta["subgroup"] == "subD" and result["documents"][6].meta["group"] == "11"
        assert result["documents"][7].content == "without split id" and result["documents"][7].meta["group"] == "11"
        assert result["documents"][8].content == "without split id3" and result["documents"][8].meta["group"] == "11"
        assert result["documents"][9].content == "without split id2" and result["documents"][9].meta["group"] == "22"
        # documents without a group come last
        assert result["documents"][10].content == "bla bla bla bla"

    def test_run_with_lists(self) -> None:
        """
        Test if the MetaFieldGroupingRanker component can handle list values in the metadata.
        """
        # "sub_value_list" must match the key used in DOC_LIST, otherwise subgrouping is never exercised.
        meta_aggregator = MetaFieldGroupingRanker(
            group_by="value_list", subgroup_by="sub_value_list", sort_docs_by="split_id"
        )
        result = meta_aggregator.run(documents=DOC_LIST)
        assert "documents" in result
        assert len(DOC_LIST) == len(result["documents"])
        assert result["documents"][0].content == "list values" and result["documents"][0].meta["value_list"] == ["11"]
        assert result["documents"][1].content == "list values2" and result["documents"][1].meta["value_list"] == ["12"]
        assert result["documents"][2].content == "list values3" and result["documents"][2].meta["value_list"] == ["12"]
        # all remaining documents have no 'value_list' and keep their original relative order
        assert [doc.content for doc in result["documents"][3:]] == [doc.content for doc in DOC_LIST[:11]]

    def test_run_in_pipeline_dumps_and_loads(self) -> None:
        """
        Test if the MetaFieldGroupingRanker component can be dumped to a YAML string and reloaded from it.
        """
        meta_aggregator = MetaFieldGroupingRanker(group_by="group", sort_docs_by="split_id")
        result_single = meta_aggregator.run(documents=DOC_LIST)

        pipeline = Pipeline()
        pipeline.add_component("meta_aggregator", meta_aggregator)
        pipeline_yaml_str = pipeline.dumps()

        # `loads` is a classmethod, no throwaway Pipeline instance is needed
        pipeline_reloaded = Pipeline.loads(pipeline_yaml_str)
        pipeline_output: Dict[str, Any] = pipeline_reloaded.run(data={"documents": DOC_LIST})
        result = pipeline_output["meta_aggregator"]
        assert result_single == result