Add a script to download & convert the CoNLL03 NER dataset from HF to the Stanza .json format
stanfordnlp · Nov 3, 2023 · 6f2f5d0 · 6f2f5d0
1 parent 5ecece3
commit 6f2f5d0
Show file tree

Hide file tree

Showing 3 changed files with 60 additions and 0 deletions.
diff --git a/setup.py b/setup.py
@@ -106,6 +106,9 @@
         'transformers': [
             'transformers>=3.0.0',
         ],
+        'datasets': [
+            'datasets',
+        ]
     },
 
     # If there are data files included in your packages that need to be

diff --git a/stanza/utils/datasets/ner/convert_en_conll03.py b/stanza/utils/datasets/ner/convert_en_conll03.py
@@ -0,0 +1,39 @@
+"""
+Downloads (if necessary) conll03 from Huggingface, then converts it to Stanza .json
+
+Some online sources for CoNLL 2003 require multiple pieces, but it is currently hosted on HF:
+https://huggingface.co/datasets/conll2003
+"""
+
+import os
+
+from stanza.utils.default_paths import get_default_paths
+from stanza.utils.datasets.ner.utils import write_dataset
+
+from datasets import load_dataset
+
+TAG_TO_ID = {'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6, 'B-MISC': 7, 'I-MISC': 8}
+ID_TO_TAG = {y: x for x, y in TAG_TO_ID.items()}  # inverse map (id -> tag string); used to decode item['ner_tags']
+
+def convert_dataset_section(section):  # one dataset split -> list of sentences
+    sentences = []  # each entry is a list of (word, tag) pairs for one item
+    for item in section:
+        words = item['tokens']
+        tags = [ID_TO_TAG[x] for x in item['ner_tags']]  # decode integer ids to tag strings
+        sentences.append(list(zip(words, tags)))  # pair each word with its tag
+    return sentences
+
+def process_dataset(short_name, conll_path, ner_output_path):  # load conll2003, convert each split, write the result
+    dataset = load_dataset('conll2003', cache_dir=conll_path)  # conll_path is passed as the HF cache directory
+    datasets = [convert_dataset_section(x) for x in [dataset['train'], dataset['validation'], dataset['test']]]  # order: train, validation, test
+    write_dataset(datasets, ner_output_path, short_name)
+
+def main():  # script entry point: convert using the paths from get_default_paths()
+    paths = get_default_paths()
+    ner_input_path = paths['NERBASE']
+    conll_path = os.path.join(ner_input_path, "en_conll03")  # handed to process_dataset, which uses it as the download cache dir
+    ner_output_path = paths['NER_DATA_DIR']
+    process_dataset("en_conll03", conll_path, ner_output_path)
+
+if __name__ == '__main__':
+    main()
diff --git a/stanza/utils/datasets/ner/prepare_ner_dataset.py b/stanza/utils/datasets/ner/prepare_ner_dataset.py
@@ -371,6 +371,16 @@
 OntoNotes 5 contains a Chinese NER dataset
   - https://catalog.ldc.upenn.edu/LDC2013T19
 
+en_conll03 is the classic 2003 4 class CoNLL dataset
+  - The version we use is posted on HuggingFace
+  - https://huggingface.co/datasets/conll2003
+  - The prepare script will download from HF
+    using the datasets package, then convert to json
+  - Introduction to the CoNLL-2003 Shared Task:
+    Language-Independent Named Entity Recognition
+    Tjong Kim Sang, Erik F. and De Meulder, Fien
+  - https://aclanthology.org/W03-0419/
+
 en_conllpp is a test set from 2020 newswire
   - https://arxiv.org/abs/2212.09747
   - https://github.com/ShuhengL/acl2023_conllpp
@@ -415,6 +425,7 @@
 import stanza.utils.datasets.ner.convert_bn_daffodil as convert_bn_daffodil
 import stanza.utils.datasets.ner.convert_bsf_to_beios as convert_bsf_to_beios
 import stanza.utils.datasets.ner.convert_bsnlp as convert_bsnlp
+import stanza.utils.datasets.ner.convert_en_conll03 as convert_en_conll03
 import stanza.utils.datasets.ner.convert_fire_2013 as convert_fire_2013
 import stanza.utils.datasets.ner.convert_ijc as convert_ijc
 import stanza.utils.datasets.ner.convert_kk_kazNERD as convert_kk_kazNERD
@@ -1070,6 +1081,12 @@ def process_armtdp(paths, short_name):
 def process_toy_dataset(paths, short_name):  # runs convert_bio_to_json on NERBASE/English-SAMPLE, output to NER_DATA_DIR
     convert_bio_to_json(os.path.join(paths["NERBASE"], "English-SAMPLE"), paths["NER_DATA_DIR"], short_name)
 
+def process_en_conll03(paths, short_name):  # dispatch-table entry for "en_conll03"
+    ner_input_path = paths['NERBASE']
+    conll_path = os.path.join(ner_input_path, "en_conll03")
+    ner_output_path = paths['NER_DATA_DIR']
+    convert_en_conll03.process_dataset("en_conll03", conll_path, ner_output_path)  # NOTE(review): short_name is unused; the name is hard-coded here
+
 def process_en_conllpp(paths, short_name):
     """
     This is ONLY a test set
@@ -1095,6 +1112,7 @@ def process_ar_aqmar(paths, short_name):
     "bn_daffodil":       process_bn_daffodil,
     "da_ddt":            process_da_ddt,
     "de_germeval2014":   process_de_germeval2014,
+    "en_conll03":        process_en_conll03,
     "en_conllpp":        process_en_conllpp,
     "en_worldwide-4class": process_en_worldwide_4class,
     "en_worldwide-8class": process_en_worldwide_8class,