From 1f3462614df7c6da5b605a5f760f1764af46e8cd Mon Sep 17 00:00:00 2001
From: John Bauer
Date: Sat, 4 Nov 2023 14:23:49 -0700
Subject: [PATCH] Add the zh-hans dataset as well

---
 ...t_en_ontonotes.py => convert_ontonotes.py} | 10 +++++++-
 .../utils/datasets/ner/prepare_ner_dataset.py | 23 ++++++++++++++-----
 2 files changed, 26 insertions(+), 7 deletions(-)
 rename stanza/utils/datasets/ner/{convert_en_ontonotes.py => convert_ontonotes.py} (80%)

diff --git a/stanza/utils/datasets/ner/convert_en_ontonotes.py b/stanza/utils/datasets/ner/convert_ontonotes.py
similarity index 80%
rename from stanza/utils/datasets/ner/convert_en_ontonotes.py
rename to stanza/utils/datasets/ner/convert_ontonotes.py
index 574038dfa6..cb9975ad3a 100644
--- a/stanza/utils/datasets/ner/convert_en_ontonotes.py
+++ b/stanza/utils/datasets/ner/convert_ontonotes.py
@@ -27,7 +27,15 @@ def process_dataset(short_name, conll_path, ner_output_path):
     except ImportError as e:
         raise ImportError("Please install the datasets package to process CoNLL03 with Stanza")
 
-    dataset = load_dataset("conll2012_ontonotesv5", "english_v12", cache_dir=conll_path)
+    if short_name == 'en_ontonotes':
+        config_name = 'english_v12'
+    elif short_name in ('zh_ontonotes', 'zh-hans_ontonotes'):
+        config_name = 'chinese_v4'
+    elif short_name == 'ar_ontonotes':
+        config_name = 'arabic_v4'
+    else:
+        raise ValueError("Unknown short name for downloading ontonotes: %s" % short_name)
+    dataset = load_dataset("conll2012_ontonotesv5", config_name, cache_dir=conll_path)
     datasets = [convert_dataset_section(x) for x in [dataset['train'], dataset['validation'], dataset['test']]]
     write_dataset(datasets, ner_output_path, short_name)
 
diff --git a/stanza/utils/datasets/ner/prepare_ner_dataset.py b/stanza/utils/datasets/ner/prepare_ner_dataset.py
index 570bddcbe2..e8a7a332da 100644
--- a/stanza/utils/datasets/ner/prepare_ner_dataset.py
+++ b/stanza/utils/datasets/ner/prepare_ner_dataset.py
@@ -368,9 +368,6 @@
   - Then run
     python3 -m stanza.utils.datasets.ner.prepare_ner_dataset hy_armtdp
 
-OntoNotes 5 contains a Chinese NER dataset
-  - https://catalog.ldc.upenn.edu/LDC2013T19
-
 en_conll03 is the classic 2003 4 class CoNLL dataset
   - The version we use is posted on HuggingFace
   - https://huggingface.co/datasets/conll2003
@@ -393,6 +390,13 @@
   - then run
     python3 stanza/utils/datasets/ner/prepare_ner_dataset.py en_conllpp
 
+en_ontonotes is the OntoNotes 5 on HuggingFace
+  - this downloads the "v12" version of the data
+
+zh-hans_ontonotes is the ZH split of the OntoNotes dataset
+  - https://catalog.ldc.upenn.edu/LDC2013T19
+
+
 AQMAR is a small dataset of Arabic Wikipedia articles
   - http://www.cs.cmu.edu/~ark/ArabicNER/
   - Recall-Oriented Learning of Named Entities in Arabic Wikipedia
@@ -429,7 +433,6 @@
 import stanza.utils.datasets.ner.convert_bsf_to_beios as convert_bsf_to_beios
 import stanza.utils.datasets.ner.convert_bsnlp as convert_bsnlp
 import stanza.utils.datasets.ner.convert_en_conll03 as convert_en_conll03
-import stanza.utils.datasets.ner.convert_en_ontonotes as convert_en_ontonotes
 import stanza.utils.datasets.ner.convert_fire_2013 as convert_fire_2013
 import stanza.utils.datasets.ner.convert_ijc as convert_ijc
 import stanza.utils.datasets.ner.convert_kk_kazNERD as convert_kk_kazNERD
@@ -437,6 +440,7 @@
 import stanza.utils.datasets.ner.convert_nner22 as convert_nner22
 import stanza.utils.datasets.ner.convert_mr_l3cube as convert_mr_l3cube
 import stanza.utils.datasets.ner.convert_my_ucsy as convert_my_ucsy
+import stanza.utils.datasets.ner.convert_ontonotes as convert_ontonotes
 import stanza.utils.datasets.ner.convert_rgai as convert_rgai
 import stanza.utils.datasets.ner.convert_nytk as convert_nytk
 import stanza.utils.datasets.ner.convert_starlang_ner as convert_starlang_ner
@@ -1089,7 +1093,13 @@ def process_en_ontonotes(paths, short_name):
     ner_input_path = paths['NERBASE']
     ontonotes_path = os.path.join(ner_input_path, "english", "en_ontonotes")
     ner_output_path = paths['NER_DATA_DIR']
-    convert_en_ontonotes.process_dataset("en_ontonotes", ontonotes_path, ner_output_path)
+    convert_ontonotes.process_dataset("en_ontonotes", ontonotes_path, ner_output_path)
+
+def process_zh_ontonotes(paths, short_name):
+    ner_input_path = paths['NERBASE']
+    ontonotes_path = os.path.join(ner_input_path, "chinese", "zh_ontonotes")
+    ner_output_path = paths['NER_DATA_DIR']
+    convert_ontonotes.process_dataset(short_name, ontonotes_path, ner_output_path)
 
 def process_en_conll03(paths, short_name):
     ner_input_path = paths['NERBASE']
@@ -1166,7 +1176,8 @@ def process_ar_aqmar(paths, short_name):
     "sv_suc3shuffle": process_sv_suc3shuffle,
     "tr_starlang": process_starlang,
     "th_lst20": process_lst20,
-    "th_nner22": process_nner22
+    "th_nner22": process_nner22,
+    "zh-hans_ontonotes": process_zh_ontonotes,
 }
 
 def main(dataset_name):
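
Usage sketch: with this patch applied, the Chinese data would normally be built through the existing entry point, python3 -m stanza.utils.datasets.ner.prepare_ner_dataset zh-hans_ontonotes, since the new "zh-hans_ontonotes" entry in the dataset mapping dispatches to process_zh_ontonotes. The snippet below instead calls the renamed convert_ontonotes module directly; it assumes the HuggingFace datasets package is installed, and the two directory arguments are placeholders standing in for whatever NERBASE and NER_DATA_DIR resolve to in a given setup.

    # Hypothetical smoke test for the new Chinese code path; both paths are placeholders.
    import stanza.utils.datasets.ner.convert_ontonotes as convert_ontonotes

    # "zh-hans_ontonotes" selects the "chinese_v4" config of conll2012_ontonotesv5;
    # load_dataset caches the download under the second argument before the
    # train/validation/test sections are converted and written out.
    convert_ontonotes.process_dataset(
        "zh-hans_ontonotes",
        "extern_data/ner/chinese/zh_ontonotes",  # cache directory passed to load_dataset
        "data/ner",                              # output directory for the converted splits
    )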