From c4cfe6cb90a08a5a466c7e47786e662c3152848d Mon Sep 17 00:00:00 2001 From: x110 Date: Thu, 16 Nov 2023 13:44:51 +0400 Subject: [PATCH] fix: Load additional fields from SQUAD-format file to meta field for labels #5978 (#6301) * Load additional fields from SQUAD-format file to meta field for labels * added a test function * rewritten test using pytest * added release notes * improve release note * clean up test --------- Co-authored-by: Stefano Fiorucci <44616784+anakin87@users.noreply.github.com> --- haystack/document_stores/utils.py | 6 ++ ...ment_for_SQuAD_files-2247c72f07760465.yaml | 4 ++ test/others/test_eval_data_from_json.py | 65 +++++++++++++++++++ 3 files changed, 75 insertions(+) create mode 100644 releasenotes/notes/metadata_enhancement_for_SQuAD_files-2247c72f07760465.yaml create mode 100644 test/others/test_eval_data_from_json.py diff --git a/haystack/document_stores/utils.py b/haystack/document_stores/utils.py index ab84882510..70c2b29cdc 100644 --- a/haystack/document_stores/utils.py +++ b/haystack/document_stores/utils.py @@ -172,6 +172,9 @@ def _extract_docs_and_labels_from_dict( ## Assign Labels to corresponding documents for qa in paragraph["qas"]: + meta_qa = { + k: v for k, v in qa.items() if k not in ("is_impossible", "answers", "question", "id", "missing") + } if not qa.get("is_impossible", False): for answer in qa["answers"]: ans = answer["text"] @@ -191,6 +194,7 @@ def _extract_docs_and_labels_from_dict( is_correct_answer=True, is_correct_document=True, origin="gold-label", + meta=meta_qa, ) labels.append(label) else: @@ -234,6 +238,7 @@ def _extract_docs_and_labels_from_dict( is_correct_answer=True, is_correct_document=True, origin="gold-label", + meta=meta_qa, ) labels.append(label) else: @@ -252,6 +257,7 @@ def _extract_docs_and_labels_from_dict( is_correct_answer=True, is_correct_document=True, origin="gold-label", + meta=meta_qa, ) labels.append(label) diff --git 
a/releasenotes/notes/metadata_enhancement_for_SQuAD_files-2247c72f07760465.yaml b/releasenotes/notes/metadata_enhancement_for_SQuAD_files-2247c72f07760465.yaml new file mode 100644 index 0000000000..ed90d46a25 --- /dev/null +++ b/releasenotes/notes/metadata_enhancement_for_SQuAD_files-2247c72f07760465.yaml @@ -0,0 +1,4 @@ +--- +enhancements: + - | + Make it possible to load additional fields from a SQuAD-format file into the meta field of the Labels. diff --git a/test/others/test_eval_data_from_json.py b/test/others/test_eval_data_from_json.py new file mode 100644 index 0000000000..4e2959f43c --- /dev/null +++ b/test/others/test_eval_data_from_json.py @@ -0,0 +1,65 @@ +import json +import os +import pytest +from haystack.document_stores import eval_data_from_json + + +@pytest.fixture +def temp_squad_file(tmp_path): + temp_filename = tmp_path / "temp_squad_file.json" + with open(temp_filename, "w", encoding="utf-8") as temp_file: + json.dump( + { + "metadata": { + "dataset_version": "1.0", + "description": "This dataset contains questions and answers related to...", + "other_metadata_field": "value", + }, + "data": [ + { + "title": "Article Title", + "paragraphs": [ + { + "context": "This is the context of the article.", + "qas": [ + { + "question": "What is the SQuAD dataset?", + "id": 0, + "answers": [{"text": "This is the context", "answer_start": 0}], + "annotator": "annotator0", + "date": "2023-11-07", + }, + { + "question": "Another question?", + "id": 1, + "answers": [{"text": "This is the context of the article", "answer_start": 0}], + "annotator": "annotator1", + "date": "2023-12-09", + }, + ], + } + ], + "author": "Your Name", + "creation_date": "2023-11-14", + } + ], + }, + temp_file, + indent=2, + ) + return temp_filename + + +def test_eval_data_from_json(temp_squad_file): + # Call the function with the temporary file + docs, labels = eval_data_from_json(temp_squad_file) + + assert len(docs) == 1 + assert len(labels) == 2 + + assert docs[0].content == 
"This is the context of the article." + assert labels[0].query == "What is the SQuAD dataset?" + assert labels[0].meta == {"annotator": "annotator0", "date": "2023-11-07"} + + assert labels[1].query == "Another question?" + assert labels[1].meta == {"annotator": "annotator1", "date": "2023-12-09"}