Skip to content

Commit

Permalink
Add the zh-hans dataset as well
Browse files Browse the repository at this point in the history
  • Loading branch information
AngledLuffa committed Nov 4, 2023
1 parent bffc517 commit 1f34626
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 7 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,15 @@ def process_dataset(short_name, conll_path, ner_output_path):
except ImportError as e:
raise ImportError("Please install the datasets package to process CoNLL03 with Stanza")

dataset = load_dataset("conll2012_ontonotesv5", "english_v12", cache_dir=conll_path)
if short_name == 'en_ontonotes':
config_name = 'english_v12'
elif short_name in ('zh_ontonotes', 'zh-hans_ontonotes'):
config_name = 'chinese_v4'
elif short_name == 'ar_ontonotes':
config_name = 'arabic_v4'
else:
raise ValueError("Unknown short name for downloading ontonotes: %s" % short_name)
dataset = load_dataset("conll2012_ontonotesv5", config_name, cache_dir=conll_path)
datasets = [convert_dataset_section(x) for x in [dataset['train'], dataset['validation'], dataset['test']]]
write_dataset(datasets, ner_output_path, short_name)

Expand Down
23 changes: 17 additions & 6 deletions stanza/utils/datasets/ner/prepare_ner_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -368,9 +368,6 @@
- Then run
python3 -m stanza.utils.datasets.ner.prepare_ner_dataset hy_armtdp
OntoNotes 5 contains a Chinese NER dataset
- https://catalog.ldc.upenn.edu/LDC2013T19
en_conll03 is the classic 2003 4 class CoNLL dataset
- The version we use is posted on HuggingFace
- https://huggingface.co/datasets/conll2003
Expand All @@ -393,6 +390,13 @@
- then run
python3 stanza/utils/datasets/ner/prepare_ner_dataset.py en_conllpp
en_ontonotes is the OntoNotes 5 on HuggingFace
- this downloads the "v12" version of the data
zh-hans_ontonotes is the ZH split of the OntoNotes dataset
- https://catalog.ldc.upenn.edu/LDC2013T19
AQMAR is a small dataset of Arabic Wikipedia articles
- http://www.cs.cmu.edu/~ark/ArabicNER/
- Recall-Oriented Learning of Named Entities in Arabic Wikipedia
Expand Down Expand Up @@ -429,14 +433,14 @@
import stanza.utils.datasets.ner.convert_bsf_to_beios as convert_bsf_to_beios
import stanza.utils.datasets.ner.convert_bsnlp as convert_bsnlp
import stanza.utils.datasets.ner.convert_en_conll03 as convert_en_conll03
import stanza.utils.datasets.ner.convert_en_ontonotes as convert_en_ontonotes
import stanza.utils.datasets.ner.convert_fire_2013 as convert_fire_2013
import stanza.utils.datasets.ner.convert_ijc as convert_ijc
import stanza.utils.datasets.ner.convert_kk_kazNERD as convert_kk_kazNERD
import stanza.utils.datasets.ner.convert_lst20 as convert_lst20
import stanza.utils.datasets.ner.convert_nner22 as convert_nner22
import stanza.utils.datasets.ner.convert_mr_l3cube as convert_mr_l3cube
import stanza.utils.datasets.ner.convert_my_ucsy as convert_my_ucsy
import stanza.utils.datasets.ner.convert_ontonotes as convert_ontonotes
import stanza.utils.datasets.ner.convert_rgai as convert_rgai
import stanza.utils.datasets.ner.convert_nytk as convert_nytk
import stanza.utils.datasets.ner.convert_starlang_ner as convert_starlang_ner
Expand Down Expand Up @@ -1089,7 +1093,13 @@ def process_en_ontonotes(paths, short_name):
ner_input_path = paths['NERBASE']
ontonotes_path = os.path.join(ner_input_path, "english", "en_ontonotes")
ner_output_path = paths['NER_DATA_DIR']
convert_en_ontonotes.process_dataset("en_ontonotes", ontonotes_path, ner_output_path)
convert_ontonotes.process_dataset("en_ontonotes", ontonotes_path, ner_output_path)

def process_zh_ontonotes(paths, short_name):
    """Prepare the Chinese OntoNotes NER dataset.

    Looks for the raw data under paths['NERBASE']/chinese/zh_ontonotes and
    hands it to the shared OntoNotes converter, which writes the converted
    dataset into paths['NER_DATA_DIR'] under the given short_name.
    """
    base_dir = paths['NERBASE']
    output_dir = paths['NER_DATA_DIR']
    # layout mirrors process_en_ontonotes: <NERBASE>/<language>/<dataset>
    input_dir = os.path.join(base_dir, "chinese", "zh_ontonotes")
    convert_ontonotes.process_dataset(short_name, input_dir, output_dir)

def process_en_conll03(paths, short_name):
ner_input_path = paths['NERBASE']
Expand Down Expand Up @@ -1166,7 +1176,8 @@ def process_ar_aqmar(paths, short_name):
"sv_suc3shuffle": process_sv_suc3shuffle,
"tr_starlang": process_starlang,
"th_lst20": process_lst20,
"th_nner22": process_nner22
"th_nner22": process_nner22,
"zh-hans_ontonotes": process_zh_ontonotes,
}

def main(dataset_name):
Expand Down

0 comments on commit 1f34626

Please sign in to comment.