From 2bee5905ae6368d61922d236338fffdfb1eecbe5 Mon Sep 17 00:00:00 2001
From: deepblue129
Date: Tue, 30 Jun 2020 21:39:58 -0700
Subject: [PATCH] + Update docs + Remove data filtering

---
 tests/datasets/test_squad.py | 14 ++++++++------
 torchnlp/datasets/squad.py   | 33 ++++++++++++---------------------
 2 files changed, 20 insertions(+), 27 deletions(-)

diff --git a/tests/datasets/test_squad.py b/tests/datasets/test_squad.py
index 85b19c3..ed4cb25 100644
--- a/tests/datasets/test_squad.py
+++ b/tests/datasets/test_squad.py
@@ -15,10 +15,12 @@ def test_squad_dataset_row(mock_urlretrieve):
     assert len(train) > 0
     assert len(dev) > 0
 
-    assert len(train) == 1450
-    assert len(dev) == 626
+    assert len(train) == 2
+    assert len(dev) == 2
 
-    assert train[5] == {
-        'question': 'In what R&B group was she the lead singer?',
-        'answer': ["Destiny's Child"]
-    }
+    assert train[0]['paragraphs'][0]['qas'][0]['question'] == (
+        'When did Beyonce start becoming popular?')
+    assert train[0]['paragraphs'][0]['qas'][0]['answers'] == [{
+        'text': 'in the late 1990s',
+        'answer_start': 269
+    }]
diff --git a/torchnlp/datasets/squad.py b/torchnlp/datasets/squad.py
index 4bc0f17..52a5aaa 100644
--- a/torchnlp/datasets/squad.py
+++ b/torchnlp/datasets/squad.py
@@ -15,15 +15,13 @@ def squad_dataset(directory='data/',
     """
     Load the Stanford Question Answering Dataset (SQuAD) dataset.
 
-    Stanford Question Answering Dataset (SQuAD) is a reading comprehension dataset,
-    consisting of questions posed by crowdworkers on a set of Wikipedia articles,
-    where the answer to every question is a segment of text, or span,
-    from the corresponding reading passage, or the question might be unanswerable.
-    SQuAD2.0 combines the 100,000 questions in SQuAD1.1 with over 50,000 unanswerable
-    questions written adversarially by crowdworkers to look similar to answerable ones.
-    To do well on SQuAD2.0, systems must not only answer questions when possible,
-    but also determine when no answer is supported by the paragraph
-    and abstain from answering.
+    Stanford Question Answering Dataset (SQuAD) is a reading comprehension dataset, consisting of
+    questions posed by crowdworkers on a set of Wikipedia articles, where the answer to every
+    question is a segment of text, or span, from the corresponding reading passage, or the question
+    might be unanswerable. SQuAD2.0 combines the 100,000 questions in SQuAD1.1 with over 50,000
+    unanswerable questions written adversarially by crowdworkers to look similar to answerable
+    ones. To do well on SQuAD2.0, systems must not only answer questions when possible, but also
+    determine when no answer is supported by the paragraph and abstain from answering.
 
     **Reference:** https://rajpurkar.github.io/SQuAD-explorer/
     **Citation:**
@@ -50,8 +48,10 @@ def squad_dataset(directory='data/',
     Example:
         >>> from torchnlp.datasets import squad_dataset  # doctest: +SKIP
         >>> train = squad_dataset(train=True)  # doctest: +SKIP
-        >>> train[0]  # doctest: +SKIP
-        {'question': 'When did Beyonce start becoming popular?', 'answer': ['in the late 1990s']}
+        >>> train[0]['paragraphs'][0]['qas'][0]['question']  # doctest: +SKIP
+        'When did Beyonce start becoming popular?'
+        >>> train[0]['paragraphs'][0]['qas'][0]['answers'][0]  # doctest: +SKIP
+        {'text': 'in the late 1990s', 'answer_start': 269}
     """
     download_file_maybe_extract(url=url_dev, directory=directory, check_files=check_files_dev)
     download_file_maybe_extract(url=url_train, directory=directory, check_files=check_files_train)
@@ -61,17 +61,8 @@ def squad_dataset(directory='data/',
     splits = [f for (requested, f) in splits if requested]
     for filename in splits:
         full_path = os.path.join(directory, filename)
-        examples = []
         with open(full_path, 'r') as temp:
-            dataset = json.load(temp)
-
-            for article in dataset['data']:
-                for paragraph in article['paragraphs']:
-                    for qa in paragraph['qas']:
-                        question = qa['question']
-                        answer = [a['text'] for a in qa['answers']]
-                        examples.append({'question': question, 'answer': answer})
-            ret.append(examples)
+            ret.append(json.load(temp)['data'])
 
     if len(ret) == 1:
         return ret[0]
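
Reviewer note: with this patch, squad_dataset returns the raw SQuAD `data` list
(articles containing paragraphs containing `qas`) instead of pre-flattened
question/answer rows. Callers that depended on the old flat format can rebuild
it themselves. Below is a minimal sketch, assuming only the return format shown
in the updated doctest; the `flatten_squad` helper is hypothetical and not part
of this patch:

    def flatten_squad(articles):
        """Rebuild the old flat {'question', 'answer'} rows from raw SQuAD `data`."""
        examples = []
        for article in articles:
            for paragraph in article['paragraphs']:
                for qa in paragraph['qas']:
                    examples.append({
                        'question': qa['question'],
                        # 'answers' can be empty for unanswerable SQuAD2.0 questions,
                        # which the old filtering code also surfaced as an empty list.
                        'answer': [a['text'] for a in qa['answers']],
                    })
        return examples

    # Hypothetical usage:
    #     train = squad_dataset(train=True)
    #     rows = flatten_squad(train)
    #     rows[0]['question']  # 'When did Beyonce start becoming popular?'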