Skip to content

Commit

Permalink
Add a script to download & convert the CoNLL03 NER dataset from HF to…
Browse files Browse the repository at this point in the history
… the Stanza .json format
  • Loading branch information
AngledLuffa committed Nov 3, 2023
1 parent 5ecece3 commit 6f2f5d0
Show file tree
Hide file tree
Showing 3 changed files with 60 additions and 0 deletions.
3 changes: 3 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,9 @@
'transformers': [
'transformers>=3.0.0',
],
'datasets': [
'datasets',
]
},

# If there are data files included in your packages that need to be
Expand Down
39 changes: 39 additions & 0 deletions stanza/utils/datasets/ner/convert_en_conll03.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
"""
Downloads (if necessary) conll03 from Huggingface, then converts it to Stanza .json
Some online sources for CoNLL 2003 require multiple pieces, but it is currently hosted on HF:
https://huggingface.co/datasets/conll2003
"""

import os

from stanza.utils.default_paths import get_default_paths
from stanza.utils.datasets.ner.utils import write_dataset

from datasets import load_dataset

# The nine CoNLL-2003 labels in the exact id order used by the HF conll2003 dataset.
_TAG_ORDER = ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']
TAG_TO_ID = {tag: idx for idx, tag in enumerate(_TAG_ORDER)}
ID_TO_TAG = dict(enumerate(_TAG_ORDER))

def convert_dataset_section(section):
    """Convert one HF dataset split into a list of sentences.

    Each input record is a dict with 'tokens' (list of words) and
    'ner_tags' (list of integer tag ids).  Each output sentence is a
    list of (word, tag_string) pairs.
    """
    return [list(zip(record['tokens'],
                     [ID_TO_TAG[tag_id] for tag_id in record['ner_tags']]))
            for record in section]

def process_dataset(short_name, conll_path, ner_output_path):
    """Download (or reuse the HF cache in conll_path) conll2003, convert all
    three splits to the Stanza tuple format, and write the .json files.

    short_name: dataset name used to build the output filenames
    conll_path: directory used as the HF datasets cache
    ner_output_path: directory where the .json files are written
    """
    raw = load_dataset('conll2003', cache_dir=conll_path)
    sections = []
    # HF names the dev split "validation"; write_dataset expects train/dev/test order
    for split in ('train', 'validation', 'test'):
        sections.append(convert_dataset_section(raw[split]))
    write_dataset(sections, ner_output_path, short_name)

def main():
    """Entry point: look up the default Stanza paths and convert en_conll03."""
    paths = get_default_paths()
    # raw/cached download goes under NERBASE, converted .json under NER_DATA_DIR
    conll_path = os.path.join(paths['NERBASE'], "en_conll03")
    process_dataset("en_conll03", conll_path, paths['NER_DATA_DIR'])

if __name__ == '__main__':
    main()
18 changes: 18 additions & 0 deletions stanza/utils/datasets/ner/prepare_ner_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -371,6 +371,16 @@
OntoNotes 5 contains a Chinese NER dataset
- https://catalog.ldc.upenn.edu/LDC2013T19
en_conll03 is the classic 2003 4 class CoNLL dataset
- The version we use is posted on HuggingFace
- https://huggingface.co/datasets/conll2003
- The prepare script will download from HF
using the datasets package, then convert to json
- Introduction to the CoNLL-2003 Shared Task:
Language-Independent Named Entity Recognition
Tjong Kim Sang, Erik F. and De Meulder, Fien
    - https://aclanthology.org/W03-0419/
en_conllpp is a test set from 2020 newswire
- https://arxiv.org/abs/2212.09747
- https://github.com/ShuhengL/acl2023_conllpp
Expand Down Expand Up @@ -415,6 +425,7 @@
import stanza.utils.datasets.ner.convert_bn_daffodil as convert_bn_daffodil
import stanza.utils.datasets.ner.convert_bsf_to_beios as convert_bsf_to_beios
import stanza.utils.datasets.ner.convert_bsnlp as convert_bsnlp
import stanza.utils.datasets.ner.convert_en_conll03 as convert_en_conll03
import stanza.utils.datasets.ner.convert_fire_2013 as convert_fire_2013
import stanza.utils.datasets.ner.convert_ijc as convert_ijc
import stanza.utils.datasets.ner.convert_kk_kazNERD as convert_kk_kazNERD
Expand Down Expand Up @@ -1070,6 +1081,12 @@ def process_armtdp(paths, short_name):
def process_toy_dataset(paths, short_name):
convert_bio_to_json(os.path.join(paths["NERBASE"], "English-SAMPLE"), paths["NER_DATA_DIR"], short_name)

def process_en_conll03(paths, short_name):
    """Download and convert the English CoNLL 2003 dataset from HuggingFace.

    Delegates to convert_en_conll03.process_dataset, which caches the HF
    download under NERBASE/en_conll03 and writes Stanza .json files to
    NER_DATA_DIR.
    """
    # NOTE: short_name is ignored; the dataset name is hard-coded to en_conll03
    ner_input_path = paths['NERBASE']
    conll_path = os.path.join(ner_input_path, "en_conll03")
    ner_output_path = paths['NER_DATA_DIR']
    convert_en_conll03.process_dataset("en_conll03", conll_path, ner_output_path)

def process_en_conllpp(paths, short_name):
"""
This is ONLY a test set
Expand All @@ -1095,6 +1112,7 @@ def process_ar_aqmar(paths, short_name):
"bn_daffodil": process_bn_daffodil,
"da_ddt": process_da_ddt,
"de_germeval2014": process_de_germeval2014,
"en_conll03": process_en_conll03,
"en_conllpp": process_en_conllpp,
"en_worldwide-4class": process_en_worldwide_4class,
"en_worldwide-8class": process_en_worldwide_8class,
Expand Down

0 comments on commit 6f2f5d0

Please sign in to comment.