deepset-ai · anakin87 · Nov 12, 2024 · Oct 31, 2024 · Oct 31, 2024 · Oct 31, 2024
@@ -1,7 +1,7 @@
 loaders:
   - type: haystack_pydoc_tools.loaders.CustomPythonLoader
     search_path: [../../../haystack/components/rankers]
-    modules: ["lost_in_the_middle", "meta_field", "transformers_similarity", "sentence_transformers_diversity"]
+    modules: ["lost_in_the_middle", "meta_field", "meta_field_grouper_ranker", "transformers_similarity", "sentence_transformers_diversity"]
     ignore_when_discovered: ["__init__"]
 processors:
   - type: filter

@@ -4,12 +4,14 @@
 
 from haystack.components.rankers.lost_in_the_middle import LostInTheMiddleRanker
 from haystack.components.rankers.meta_field import MetaFieldRanker
+from haystack.components.rankers.meta_field_grouper_ranker import MetaFieldGroupingRanker
 from haystack.components.rankers.sentence_transformers_diversity import SentenceTransformersDiversityRanker
 from haystack.components.rankers.transformers_similarity import TransformersSimilarityRanker
 
 __all__ = [
     "LostInTheMiddleRanker",
     "MetaFieldRanker",
+    "MetaFieldGroupingRanker",
     "SentenceTransformersDiversityRanker",
     "TransformersSimilarityRanker",
 ]
diff --git a/haystack/components/rankers/meta_field_grouper_ranker.py b/haystack/components/rankers/meta_field_grouper_ranker.py
@@ -0,0 +1,206 @@
+# SPDX-FileCopyrightText: 2022-present deepset GmbH <[email protected]>
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from collections import defaultdict
+from typing import Any, Dict, List, Optional, Tuple
+
+from haystack import Document, component, logging
+
+logger = logging.getLogger(__name__)
+
+
+@component
+class MetaFieldGroupingRanker:
+    """
+    Reorders the documents by grouping them based on metadata keys.
+
+    The MetaDataGrouper can group documents by a primary metadata key `group_by`, and subgroup them with an optional
+    secondary key, `subgroup_by`.
+    Within each group or subgroup, it can also sort documents by a metadata key `sort_docs_by`.
+
+    The output is a flat list of documents ordered by `group_by` and `subgroup_by` values.
+    Any documents without a group are placed at the end of the list.
+
+    The proper organization of documents helps improve the efficiency and performance of subsequent processing by an LLM.
+
+    ### Usage example
+
+    ```python
+    from haystack.components.rankers import MetaFieldGroupingRanker
+    from haystack.dataclasses import Document
+
+
+    docs = [
+        Document(content="Javascript is a popular programming language", meta={"group": "42", "split_id": 7, "subgroup": "subB"}),
+        Document(content="Python is a popular programming language",meta={"group": "42", "split_id": 4, "subgroup": "subB"}),
+        Document(content="A chromosome is a package of DNA", meta={"group": "314", "split_id": 2, "subgroup": "subC"}),
+        Document(content="An octopus has three hearts", meta={"group": "11", "split_id": 2, "subgroup": "subD"}),
+        Document(content="Java is a popular programming language", meta={"group": "42", "split_id": 3, "subgroup": "subB"})
+    ]
+
+    sample_meta_aggregator = MetaFieldGroupingRanker(group_by="group",subgroup_by="subgroup", sort_docs_by="split_id")
+    result = sample_meta_aggregator.run(documents=docs)
+    print(result["documents"])
+    [
+        Document(id=d665bbc83e52c08c3d8275bccf4f22bf2bfee21c6e77d78794627637355b8ebc,
+                content: 'Java is a popular programming language', meta: {'group': '42', 'split_id': 3, 'subgroup': 'subB'}),
+        Document(id=a20b326f07382b3cbf2ce156092f7c93e8788df5d48f2986957dce2adb5fe3c2,
+                content: 'Python is a popular programming language', meta: {'group': '42', 'split_id': 4, 'subgroup': 'subB'}),
+        Document(id=ce12919795d22f6ca214d0f161cf870993889dcb146f3bb1b3e1ffdc95be960f,
+                content: 'Javascript is a popular programming language', meta: {'group': '42', 'split_id': 7, 'subgroup': 'subB'}),
+        Document(id=d9fc857046c904e5cf790b3969b971b1bbdb1b3037d50a20728fdbf82991aa94,
+                content: 'A chromosome is a package of DNA', meta: {'group': '314', 'split_id': 2, 'subgroup': 'subC'}),
+        Document(id=6d3b7bdc13d09aa01216471eb5fb0bfdc53c5f2f3e98ad125ff6b85d3106c9a3,
+                content: 'An octopus has three hearts', meta: {'group': '11', 'split_id': 2, 'subgroup': 'subD'})
+    ]
+    ```
+    """  # noqa: E501
+
+    def __init__(self, group_by: str, subgroup_by: Optional[str] = None, sort_docs_by: Optional[str] = None):
+        """
+        Creates an instance of DeepsetMetadataGrouper.
+
+        :param group_by: The metadata key to aggregate the documents by.
+        :param subgroup_by: The metadata key to aggregate the documents within a group that was created by the
+                            `group_by` key.
+        :param sort_docs_by: Determines which metadata key is used to sort the documents. If not provided, the
+                             documents within the groups or subgroups are not sorted and are kept in the same order as
+                             they were inserted in the subgroups.
+
+        """
+        self.group_by = group_by
+        self.sort_docs_by = sort_docs_by
+        self.subgroup_by = subgroup_by
+
+    def _group_documents(self, documents: List[Document]) -> Tuple[Dict[str, List], List, List]:
+        """
+        Go through all documents and bucket them based on the 'group_by' value.
+
+        If no 'group_by' value is present in the document, the document is added to a list of documents without a group.
+
+        :param documents:
+        :returns:
+            A tuple with the following elements:
+            - document_groups: A dictionary with the 'group_by' values as keys and the documents as values.
+            - no_group: A list of documents without a 'group_by' value.
+            - ordered_keys: A list of 'group_by' values in the order they were encountered.
+        """
+
+        document_groups: Dict[str, List] = defaultdict(list)
+        ordered_keys: List = []
+        no_group: List = []
+
+        for document in documents:
+            if self.group_by in document.meta:
+                group_by_value = str(document.meta[self.group_by])
+                document_groups[group_by_value].append(document)
+                if group_by_value not in ordered_keys:
+                    ordered_keys.append(group_by_value)
+            else:
+                no_group.append(document)
+
+        return document_groups, no_group, ordered_keys
+
+    def _create_subgroups(self, document_groups: Dict[str, List]) -> Tuple[Dict[str, Dict], List]:
+        """
+        Buckets the documents within the groups based on the 'subgroup_by' value.
+
+        If no 'subgroup_by' value is present in the document, the document is added to a subgroup with the key
+        'no_subgroup'.
+
+        :param document_groups: A dictionary with the 'group_by' values as keys and the documents as values.
+        :returns:
+            A tuple with the following elements:
+            - document_subgroups: A dictionary with the 'subgroup_by' values as keys and the documents as values.
+            - subgroup_ordered_keys: A list of 'subgroup_by' values in the order they were encountered
+        """
+        subgroup_ordered_keys = []
+        document_subgroups: Dict[str, Dict] = defaultdict(lambda: defaultdict(list))
+        if self.subgroup_by:
+            for key, value in document_groups.items():
+                for doc in value:
+                    if self.subgroup_by in doc.meta:
+                        subgroup_by_value = str(doc.meta[self.subgroup_by])
+                        document_subgroups[key][subgroup_by_value].append(doc)
+                        if subgroup_by_value not in subgroup_ordered_keys:
+                            subgroup_ordered_keys.append(subgroup_by_value)
+                    else:
+                        document_subgroups[key]["no_subgroup"].append(doc)
+                        if "no_subgroup" not in subgroup_ordered_keys:
+                            subgroup_ordered_keys.append("no_subgroup")
+
+        return document_subgroups, subgroup_ordered_keys
+
+    def _merge_and_sort(
+        self,
+        document_groups: Dict[str, List],
+        document_subgroups: Dict[str, Dict],
+        no_group: List,
+        ordered_keys: List,
+        subgroup_ordered_keys: List,
+    ):  # pylint: disable=too-many-positional-arguments
+        """
+        Sorts the documents within the groups or subgroups based on the 'sort_docs_by' value.
+
+        If 'sort_docs_by' is not provided, the documents are kept in the same order as they were inserted in the groups
+        and subgroups. The final list of documents is created by merging the groups, subgroups, and documents without a
+        group.
+
+        :param document_groups: A dictionary with the 'group_by' values as keys and the documents as values.
+        :param document_subgroups: A dictionary with the 'subgroup_by' values as keys and the documents as values.
+        :param no_group: A list of documents without a 'group_by' value.
+        :param ordered_keys: A list of 'group_by' values in the order they were encountered.
+        :param subgroup_ordered_keys: A list of 'subgroup_by' values in the order they were encountered.
+        :returns:
+            A list of documents ordered by the 'sort_docs_by' metadata values.
+        """
+        result_docs = []
+        if self.sort_docs_by and not self.subgroup_by:
+            for key in document_groups:
+                result_docs += sorted(document_groups[key], key=lambda d: d.meta.get(self.sort_docs_by, float("inf")))
+
+        elif self.sort_docs_by and self.subgroup_by:
+            for group in ordered_keys:
+                for subgroup in subgroup_ordered_keys:
+                    result_docs += sorted(
+                        document_subgroups[group][subgroup], key=lambda d: d.meta.get(self.sort_docs_by, float("inf"))
+                    )
+        else:
+            for key in document_groups:
+                result_docs += document_groups[key]
+
+        for doc in no_group:
+            if doc not in result_docs:
+                result_docs += [doc]
+
+        return result_docs
+
+    @component.output_types(documents=List[Document])
+    def run(self, documents: List[Document]) -> Dict[str, Any]:
+        """
+        Groups the provided list of documents based on the `group_by` parameter and optionally the `subgroup_by`.
+
+        The output is a list of documents reordered based on how they were grouped.
+
+        :param documents: The list of documents to group.
+        :returns:
+            A dictionary with the following keys:
+            - documents: The list of documents ordered by the `group_by` and `subgroup_by` metadata values.
+        """
+
+        if len(documents) == 0:
+            return {"documents": []}
+
+        # docs based on the 'group_by' value
+        document_groups, no_group, ordered_keys = self._group_documents(documents)
+
+        # further grouping of the document inside each group based on the 'subgroup_by' value
+        document_subgroups, subgroup_ordered_keys = self._create_subgroups(document_groups)
+
+        # sort the docs within the groups or subgroups if necessary
+        result_docs = self._merge_and_sort(
+            document_groups, document_subgroups, no_group, ordered_keys, subgroup_ordered_keys
+        )
+
+        return {"documents": result_docs}
@@ -0,0 +1,4 @@
+---
+features:
+  - |
+    Added a new `MetaFieldGroupingRanker` component that reorders documents by grouping them based on metadata keys. This can be a useful pre-processing step before feeding the documents to an LLM.
diff --git a/test/components/rankers/test_metadata_grouper.py b/test/components/rankers/test_metadata_grouper.py
@@ -0,0 +1,123 @@
+# SPDX-FileCopyrightText: 2022-present deepset GmbH <[email protected]>
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Dict
+
+from haystack import Pipeline
+from haystack.dataclasses import Document
+
+from haystack.components.rankers.meta_field_grouper_ranker import MetaFieldGroupingRanker
+
+# Shared fixture for all tests below. The position of each document matters: the tests assert on the
+# first-encounter order of the groups and subgroups.
+DOC_LIST = [
+    # documents with "split_id" and "subgroup" (the "bla bla bla bla" document has no "group" key)
+    Document(content="Javascript is a popular language", meta={"group": "42", "split_id": 7, "subgroup": "subB"}),
+    Document(content="A chromosome is a package of DNA", meta={"group": "314", "split_id": 2, "subgroup": "subC"}),
+    Document(content="DNA carries genetic information", meta={"group": "314", "split_id": 1, "subgroup": "subE"}),
+    Document(content="Blue whales have a big heart", meta={"group": "11", "split_id": 8, "subgroup": "subF"}),
+    Document(content="Python is a popular  language", meta={"group": "42", "split_id": 4, "subgroup": "subB"}),
+    Document(content="bla bla bla bla", meta={"split_id": 8, "subgroup": "subG"}),
+    Document(content="Java is a popular programming language", meta={"group": "42", "split_id": 3, "subgroup": "subB"}),
+    Document(content="An octopus has three hearts", meta={"group": "11", "split_id": 2, "subgroup": "subD"}),
+    # without split id (two of them also have no "subgroup" key)
+    Document(content="without split id", meta={"group": "11"}),
+    Document(content="without split id2", meta={"group": "22", "subgroup": "subI"}),
+    Document(content="without split id3", meta={"group": "11"}),
+    # with list values in the metadata; these have no "group" or "subgroup" key
+    Document(content="list values", meta={"value_list": ["11"], "split_id": 8, "sub_value_list": ["subF"]}),
+    Document(content="list values2", meta={"value_list": ["12"], "split_id": 3, "sub_value_list": ["subX"]}),
+    Document(content="list values3", meta={"value_list": ["12"], "split_id": 8, "sub_value_list": ["subX"]}),
+]
+
+
+class TestMetaFieldSorter:
+    def test_init_default(self) -> None:
+        """
+        Test the default initialization of the MetaDataGrouper component.
+        """
+        sample_meta_aggregator = MetaFieldGroupingRanker(group_by="group", sort_docs_by=None)
+        result = sample_meta_aggregator.run(documents=[])
+        assert "documents" in result
+        assert result["documents"] == []
+
+    def test_run_group_by_only(self) -> None:
+        """
+        Test the MetaDataGrouper component with only the 'group_by' parameter. No subgroup or sorting is done.
+        """
+        sample_meta_aggregator = MetaFieldGroupingRanker(group_by="group")
+        result = sample_meta_aggregator.run(documents=DOC_LIST)
+        assert "documents" in result
+        assert len(DOC_LIST) == len(result["documents"])
+        assert result["documents"][0].meta["split_id"] == 7 and result["documents"][0].meta["group"] == "42"
+        assert result["documents"][1].meta["split_id"] == 4 and result["documents"][1].meta["group"] == "42"
+        assert result["documents"][2].meta["split_id"] == 3 and result["documents"][2].meta["group"] == "42"
+        assert result["documents"][3].meta["split_id"] == 2 and result["documents"][3].meta["group"] == "314"
+        assert result["documents"][4].meta["split_id"] == 1 and result["documents"][4].meta["group"] == "314"
+        assert result["documents"][5].meta["split_id"] == 8 and result["documents"][5].meta["group"] == "11"
+        assert result["documents"][6].meta["split_id"] == 2 and result["documents"][6].meta["group"] == "11"
+        assert result["documents"][7].content == "without split id" and result["documents"][7].meta["group"] == "11"
+        assert result["documents"][8].content == "without split id3" and result["documents"][8].meta["group"] == "11"
+        assert result["documents"][9].content == "without split id2" and result["documents"][9].meta["group"] == "22"
+        assert result["documents"][10].content == "bla bla bla bla"
+
+    def test_with_group_subgroup_and_sorting(self) -> None:
+        """
+        Test the MetaDataGrouper component with all parameters set, i.e.: grouping by 'group', subgrouping by 'subgroup',
+        and sorting by 'split_id'.
+        """
+        meta_aggregator = MetaFieldGroupingRanker(group_by="group", subgroup_by="subgroup", sort_docs_by="split_id")
+        result = meta_aggregator.run(documents=DOC_LIST)
+
+        assert "documents" in result
+        assert len(DOC_LIST) == len(result["documents"])
+        assert (
+            result["documents"][0].meta["subgroup"] == "subB"
+            and result["documents"][0].meta["group"] == "42"
+            and result["documents"][0].meta["split_id"] == 3
+        )
+        assert (
+            result["documents"][1].meta["subgroup"] == "subB"
+            and result["documents"][1].meta["group"] == "42"
+            and result["documents"][1].meta["split_id"] == 4
+        )
+        assert (
+            result["documents"][2].meta["subgroup"] == "subB"
+            and result["documents"][2].meta["group"] == "42"
+            and result["documents"][2].meta["split_id"] == 7
+        )
+        assert result["documents"][3].meta["subgroup"] == "subC" and result["documents"][3].meta["group"] == "314"
+        assert result["documents"][4].meta["subgroup"] == "subE" and result["documents"][4].meta["group"] == "314"
+        assert result["documents"][5].meta["subgroup"] == "subF" and result["documents"][6].meta["group"] == "11"
+        assert result["documents"][6].meta["subgroup"] == "subD" and result["documents"][5].meta["group"] == "11"
+        assert result["documents"][7].content == "without split id" and result["documents"][7].meta["group"] == "11"
+        assert result["documents"][8].content == "without split id3" and result["documents"][8].meta["group"] == "11"
+        assert result["documents"][9].content == "without split id2" and result["documents"][9].meta["group"] == "22"
+        assert result["documents"][10].content == "bla bla bla bla"
+
+    def test_run_with_lists(self) -> None:
+        """
+        Test if the MetaDataGrouper component can handle list values in the metadata.
+        """
+        meta_aggregator = MetaFieldGroupingRanker(
+            group_by="value_list", subgroup_by="subvaluelist", sort_docs_by="split_id"
+        )
+        result = meta_aggregator.run(documents=DOC_LIST)
+        assert "documents" in result
+        assert len(DOC_LIST) == len(result["documents"])
+        assert result["documents"][0].content == "list values" and result["documents"][0].meta["value_list"] == ["11"]
+        assert result["documents"][1].content == "list values2" and result["documents"][1].meta["value_list"] == ["12"]
+        assert result["documents"][2].content == "list values3" and result["documents"][2].meta["value_list"] == ["12"]
+
+    def test_run_in_pipeline_dumps_and_loads(self) -> None:
+        """
+        Test if the MetaDataGrouper component can be dumped to a YAML string and reloaded from it.
+        """
+        meta_aggregator = MetaFieldGroupingRanker(group_by="group", sort_docs_by="split_id")
+        result_single = meta_aggregator.run(documents=DOC_LIST)
+        pipeline = Pipeline()
+        pipeline.add_component("meta_aggregator", meta_aggregator)
+        pipeline_yaml_str = pipeline.dumps()
+        pipeline_reloaded = Pipeline().loads(pipeline_yaml_str)
+        result: Dict[str, Any] = pipeline_reloaded.run(data={"documents": DOC_LIST})
+        result = result["meta_aggregator"]
+        assert result_single == result