fix: Load additional fields from SQUAD-format file to meta field for labels #5978 (#6301)

* Load additional fields from SQUAD-format file to meta field for labels
* added a test function
* rewritten test using pytest
* added release notes
* improve release note
* clean up test

---------

Co-authored-by: Stefano Fiorucci <[email protected]>
deepset-ai · Nov 16, 2023 · c4cfe6c · c4cfe6c
1 parent c691412
commit c4cfe6c
Show file tree

Hide file tree

Showing 3 changed files with 75 additions and 0 deletions.
diff --git a/haystack/document_stores/utils.py b/haystack/document_stores/utils.py
@@ -172,6 +172,9 @@ def _extract_docs_and_labels_from_dict(
 
         ## Assign Labels to corresponding documents
         for qa in paragraph["qas"]:
+            meta_qa = {
+                k: v for k, v in qa.items() if k not in ("is_impossible", "answers", "question", "id", "missing")
+            }
             if not qa.get("is_impossible", False):
                 for answer in qa["answers"]:
                     ans = answer["text"]
@@ -191,6 +194,7 @@ def _extract_docs_and_labels_from_dict(
                             is_correct_answer=True,
                             is_correct_document=True,
                             origin="gold-label",
+                            meta=meta_qa,
                         )
                         labels.append(label)
                     else:
@@ -234,6 +238,7 @@ def _extract_docs_and_labels_from_dict(
                             is_correct_answer=True,
                             is_correct_document=True,
                             origin="gold-label",
+                            meta=meta_qa,
                         )
                         labels.append(label)
             else:
@@ -252,6 +257,7 @@ def _extract_docs_and_labels_from_dict(
                         is_correct_answer=True,
                         is_correct_document=True,
                         origin="gold-label",
+                        meta=meta_qa,
                     )
 
                     labels.append(label)

diff --git a/releasenotes/notes/metadata_enhancement_for_SQuAD_files-2247c72f07760465.yaml b/releasenotes/notes/metadata_enhancement_for_SQuAD_files-2247c72f07760465.yaml
@@ -0,0 +1,4 @@
+---
+enhancements:
+  - |
+    Make it possible to load additional fields from a SQuAD-format file into the meta field of the Labels.
diff --git a/test/others/test_eval_data_from_json.py b/test/others/test_eval_data_from_json.py
@@ -0,0 +1,65 @@
+import json
+import os
+import pytest
+from haystack.document_stores import eval_data_from_json
+
+
+@pytest.fixture
+def temp_squad_file(tmp_path):  # writes a small SQuAD-style JSON file under tmp_path and returns its path
+    temp_filename = tmp_path / "temp_squad_file.json"
+    with open(temp_filename, "w", encoding="utf-8") as temp_file:
+        json.dump(
+            {
+                "metadata": {  # top-level block; the test in this file makes no assertion about it
+                    "dataset_version": "1.0",
+                    "description": "This dataset contains questions and answers related to...",
+                    "other_metadata_field": "value",
+                },
+                "data": [
+                    {
+                        "title": "Article Title",
+                        "paragraphs": [
+                            {
+                                "context": "This is the context of the article.",
+                                "qas": [
+                                    {
+                                        "question": "What is the SQuAD dataset?",
+                                        "id": 0,
+                                        "answers": [{"text": "This is the context", "answer_start": 0}],
+                                        "annotator": "annotator0",  # extra per-question key; the test expects it in the label's meta
+                                        "date": "2023-11-07",  # extra per-question key; the test expects it in the label's meta
+                                    },
+                                    {
+                                        "question": "Another question?",
+                                        "id": 1,
+                                        "answers": [{"text": "This is the context of the article", "answer_start": 0}],
+                                        "annotator": "annotator1",
+                                        "date": "2023-12-09",
+                                    },
+                                ],
+                            }
+                        ],
+                        "author": "Your Name",  # article-level extra key; not asserted on by the test
+                        "creation_date": "2023-11-14",
+                    }
+                ],
+            },
+            temp_file,
+            indent=2,
+        )
+    return temp_filename
+
+
+def test_eval_data_from_json(temp_squad_file):  # checks that extra per-question keys surface in each label's meta
+    # Parse the temporary SQuAD-format file into documents and labels
+    docs, labels = eval_data_from_json(temp_squad_file)
+
+    assert len(docs) == 1  # the fixture holds a single paragraph/context
+    assert len(labels) == 2  # the fixture has two questions with one answer each
+
+    assert docs[0].content == "This is the context of the article."
+    assert labels[0].query == "What is the SQuAD dataset?"
+    assert labels[0].meta == {"annotator": "annotator0", "date": "2023-11-07"}  # question/id/answers are not expected in meta
+
+    assert labels[1].query == "Another question?"
+    assert labels[1].meta == {"annotator": "annotator1", "date": "2023-12-09"}