Skip to content

Commit

Permalink
fix: Load additional fields from SQUAD-format file to meta field for …
Browse files Browse the repository at this point in the history
…labels #5978 (#6301)

* Load additional fields from SQUAD-format file to meta field for labels

* added a test function

* rewritten test using pytest

* added release notes

* improve release note

* clean up test

---------

Co-authored-by: Stefano Fiorucci <[email protected]>
  • Loading branch information
x110 and anakin87 authored Nov 16, 2023
1 parent c691412 commit c4cfe6c
Show file tree
Hide file tree
Showing 3 changed files with 75 additions and 0 deletions.
6 changes: 6 additions & 0 deletions haystack/document_stores/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,9 @@ def _extract_docs_and_labels_from_dict(

## Assign Labels to corresponding documents
for qa in paragraph["qas"]:
meta_qa = {
k: v for k, v in qa.items() if k not in ("is_impossible", "answers", "question", "id", "missing")
}
if not qa.get("is_impossible", False):
for answer in qa["answers"]:
ans = answer["text"]
Expand All @@ -191,6 +194,7 @@ def _extract_docs_and_labels_from_dict(
is_correct_answer=True,
is_correct_document=True,
origin="gold-label",
meta=meta_qa,
)
labels.append(label)
else:
Expand Down Expand Up @@ -234,6 +238,7 @@ def _extract_docs_and_labels_from_dict(
is_correct_answer=True,
is_correct_document=True,
origin="gold-label",
meta=meta_qa,
)
labels.append(label)
else:
Expand All @@ -252,6 +257,7 @@ def _extract_docs_and_labels_from_dict(
is_correct_answer=True,
is_correct_document=True,
origin="gold-label",
meta=meta_qa,
)

labels.append(label)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
---
enhancements:
- |
Make it possible to load additional fields from a SQuAD-format file into the meta field of the Labels.
65 changes: 65 additions & 0 deletions test/others/test_eval_data_from_json.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
import json
import os
import pytest
from haystack.document_stores import eval_data_from_json


@pytest.fixture
def temp_squad_file(tmp_path):
    """Write a small SQuAD-style JSON file under ``tmp_path`` and return its path.

    The file contains a single article with one paragraph and two questions.
    Besides ``question``, ``id`` and ``answers``, every question carries two
    extra keys (``annotator`` and ``date``).
    """
    first_qa = {
        "question": "What is the SQuAD dataset?",
        "id": 0,
        "answers": [{"text": "This is the context", "answer_start": 0}],
        "annotator": "annotator0",
        "date": "2023-11-07",
    }
    second_qa = {
        "question": "Another question?",
        "id": 1,
        "answers": [{"text": "This is the context of the article", "answer_start": 0}],
        "annotator": "annotator1",
        "date": "2023-12-09",
    }
    paragraph = {
        "context": "This is the context of the article.",
        "qas": [first_qa, second_qa],
    }
    article = {
        "title": "Article Title",
        "paragraphs": [paragraph],
        "author": "Your Name",
        "creation_date": "2023-11-14",
    }
    payload = {
        "metadata": {
            "dataset_version": "1.0",
            "description": "This dataset contains questions and answers related to...",
            "other_metadata_field": "value",
        },
        "data": [article],
    }

    squad_path = tmp_path / "temp_squad_file.json"
    # Serialise first, then write in one go; the key order above is kept as-is.
    squad_path.write_text(json.dumps(payload, indent=2), encoding="utf-8")
    return squad_path


def test_eval_data_from_json(temp_squad_file):
    """Check that extra per-question keys of the SQuAD file show up in ``Label.meta``.

    Loads the fixture file with ``eval_data_from_json`` and verifies the number of
    documents and labels, the document content, and each label's query and meta.
    """
    documents, gold_labels = eval_data_from_json(temp_squad_file)

    assert len(documents) == 1
    assert len(gold_labels) == 2

    assert documents[0].content == "This is the context of the article."

    # Expected (query, meta) per label, in the order the questions appear in the file.
    expected = (
        ("What is the SQuAD dataset?", {"annotator": "annotator0", "date": "2023-11-07"}),
        ("Another question?", {"annotator": "annotator1", "date": "2023-12-09"}),
    )
    for position, (query, meta) in enumerate(expected):
        assert gold_labels[position].query == query
        assert gold_labels[position].meta == meta

0 comments on commit c4cfe6c

Please sign in to comment.