Skip to content
This repository has been archived by the owner on Jul 4, 2023. It is now read-only.

Commit

Permalink
+ Update docs
Browse files: browse the repository at this point in the history
+ Remove data filtering
  • Loading branch information
PetrochukM committed Jul 1, 2020
1 parent e1fa942 commit 2bee590
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 27 deletions.
14 changes: 8 additions & 6 deletions tests/datasets/test_squad.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,12 @@ def test_squad_dataset_row(mock_urlretrieve):
assert len(train) > 0
assert len(dev) > 0

assert len(train) == 1450
assert len(dev) == 626
assert len(train) == 2
assert len(dev) == 2

assert train[5] == {
'question': 'In what R&B group was she the lead singer?',
'answer': ["Destiny's Child"]
}
assert train[0]['paragraphs'][0]['qas'][0]['question'] == (
'When did Beyonce start becoming popular?')
assert train[0]['paragraphs'][0]['qas'][0]['answers'] == [{
'text': 'in the late 1990s',
'answer_start': 269
}]
33 changes: 12 additions & 21 deletions torchnlp/datasets/squad.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,15 +15,13 @@ def squad_dataset(directory='data/',
"""
Load the Stanford Question Answering Dataset (SQuAD).
Stanford Question Answering Dataset (SQuAD) is a reading comprehension dataset,
consisting of questions posed by crowdworkers on a set of Wikipedia articles,
where the answer to every question is a segment of text, or span,
from the corresponding reading passage, or the question might be unanswerable.
SQuAD2.0 combines the 100,000 questions in SQuAD1.1 with over 50,000 unanswerable
questions written adversarially by crowdworkers to look similar to answerable ones.
To do well on SQuAD2.0, systems must not only answer questions when possible,
but also determine when no answer is supported by the paragraph
and abstain from answering.
Stanford Question Answering Dataset (SQuAD) is a reading comprehension dataset, consisting of
questions posed by crowdworkers on a set of Wikipedia articles, where the answer to every
question is a segment of text, or span, from the corresponding reading passage, or the question
might be unanswerable. SQuAD2.0 combines the 100,000 questions in SQuAD1.1 with over 50,000
unanswerable questions written adversarially by crowdworkers to look similar to answerable
ones. To do well on SQuAD2.0, systems must not only answer questions when possible, but also
determine when no answer is supported by the paragraph and abstain from answering.
**Reference:** https://rajpurkar.github.io/SQuAD-explorer/
**Citation:**
Expand All @@ -50,8 +48,10 @@ def squad_dataset(directory='data/',
Example:
>>> from torchnlp.datasets import squad_dataset # doctest: +SKIP
>>> train = squad_dataset(train=True) # doctest: +SKIP
>>> train[0] # doctest: +SKIP
{'question': 'When did Beyonce start becoming popular?', 'answer': ['in the late 1990s']}
>>> train[0]['paragraphs'][0]['qas'][0]['question'] # doctest: +SKIP
'When did Beyonce start becoming popular?'
>>> train[0]['paragraphs'][0]['qas'][0]['answers'][0] # doctest: +SKIP
{'text': 'in the late 1990s', 'answer_start': 269}
"""
download_file_maybe_extract(url=url_dev, directory=directory, check_files=check_files_dev)
download_file_maybe_extract(url=url_train, directory=directory, check_files=check_files_train)
Expand All @@ -61,17 +61,8 @@ def squad_dataset(directory='data/',
splits = [f for (requested, f) in splits if requested]
for filename in splits:
full_path = os.path.join(directory, filename)
examples = []
with open(full_path, 'r') as temp:
dataset = json.load(temp)

for article in dataset['data']:
for paragraph in article['paragraphs']:
for qa in paragraph['qas']:
question = qa['question']
answer = [a['text'] for a in qa['answers']]
examples.append({'question': question, 'answer': answer})
ret.append(examples)
ret.append(json.load(temp)['data'])

if len(ret) == 1:
return ret[0]
Expand Down

0 comments on commit 2bee590

Please sign in to comment.