From 1f3462614df7c6da5b605a5f760f1764af46e8cd Mon Sep 17 00:00:00 2001
From: John Bauer
Date: Sat, 4 Nov 2023 14:23:49 -0700
Subject: [PATCH] Add the zh-hans dataset as well

---
 ...t_en_ontonotes.py => convert_ontonotes.py} | 10 +++++++-
 .../utils/datasets/ner/prepare_ner_dataset.py | 23 ++++++++++++++-----
 2 files changed, 26 insertions(+), 7 deletions(-)
 rename stanza/utils/datasets/ner/{convert_en_ontonotes.py => convert_ontonotes.py} (80%)

diff --git a/stanza/utils/datasets/ner/convert_en_ontonotes.py b/stanza/utils/datasets/ner/convert_ontonotes.py
similarity index 80%
rename from stanza/utils/datasets/ner/convert_en_ontonotes.py
rename to stanza/utils/datasets/ner/convert_ontonotes.py
index 574038dfa6..cb9975ad3a 100644
--- a/stanza/utils/datasets/ner/convert_en_ontonotes.py
+++ b/stanza/utils/datasets/ner/convert_ontonotes.py
@@ -27,7 +27,15 @@ def process_dataset(short_name, conll_path, ner_output_path):
     except ImportError as e:
         raise ImportError("Please install the datasets package to process CoNLL03 with Stanza")
 
-    dataset = load_dataset("conll2012_ontonotesv5", "english_v12", cache_dir=conll_path)
+    if short_name == 'en_ontonotes':
+        config_name = 'english_v12'
+    elif short_name in ('zh_ontonotes', 'zh-hans_ontonotes'):
+        config_name = 'chinese_v4'
+    elif short_name == 'ar_ontonotes':
+        config_name = 'arabic_v4'
+    else:
+        raise ValueError("Unknown short name for downloading ontonotes: %s" % short_name)
+    dataset = load_dataset("conll2012_ontonotesv5", config_name, cache_dir=conll_path)
     datasets = [convert_dataset_section(x) for x in [dataset['train'], dataset['validation'], dataset['test']]]
     write_dataset(datasets, ner_output_path, short_name)
 
diff --git a/stanza/utils/datasets/ner/prepare_ner_dataset.py b/stanza/utils/datasets/ner/prepare_ner_dataset.py
index 570bddcbe2..e8a7a332da 100644
--- a/stanza/utils/datasets/ner/prepare_ner_dataset.py
+++ b/stanza/utils/datasets/ner/prepare_ner_dataset.py
@@ -368,9 +368,6 @@
   - Then run
     python3 -m stanza.utils.datasets.ner.prepare_ner_dataset hy_armtdp
 
-OntoNotes 5 contains a Chinese NER dataset
-  - https://catalog.ldc.upenn.edu/LDC2013T19
-
 en_conll03 is the classic 2003 4 class CoNLL dataset
   - The version we use is posted on HuggingFace
   - https://huggingface.co/datasets/conll2003
@@ -393,6 +390,13 @@
   - then run
     python3 stanza/utils/datasets/ner/prepare_ner_dataset.py en_conllpp
 
+en_ontonotes is the OntoNotes 5 on HuggingFace
+  - this downloads the "v12" version of the data
+
+zh-hans_ontonotes is the ZH split of the OntoNotes dataset
+  - https://catalog.ldc.upenn.edu/LDC2013T19
+
+
 AQMAR is a small dataset of Arabic Wikipedia articles
   - http://www.cs.cmu.edu/~ark/ArabicNER/
   - Recall-Oriented Learning of Named Entities in Arabic Wikipedia
@@ -429,7 +433,6 @@
 import stanza.utils.datasets.ner.convert_bsf_to_beios as convert_bsf_to_beios
 import stanza.utils.datasets.ner.convert_bsnlp as convert_bsnlp
 import stanza.utils.datasets.ner.convert_en_conll03 as convert_en_conll03
-import stanza.utils.datasets.ner.convert_en_ontonotes as convert_en_ontonotes
 import stanza.utils.datasets.ner.convert_fire_2013 as convert_fire_2013
 import stanza.utils.datasets.ner.convert_ijc as convert_ijc
 import stanza.utils.datasets.ner.convert_kk_kazNERD as convert_kk_kazNERD
@@ -437,6 +440,7 @@
 import stanza.utils.datasets.ner.convert_nner22 as convert_nner22
 import stanza.utils.datasets.ner.convert_mr_l3cube as convert_mr_l3cube
 import stanza.utils.datasets.ner.convert_my_ucsy as convert_my_ucsy
+import stanza.utils.datasets.ner.convert_ontonotes as convert_ontonotes
 import stanza.utils.datasets.ner.convert_rgai as convert_rgai
 import stanza.utils.datasets.ner.convert_nytk as convert_nytk
 import stanza.utils.datasets.ner.convert_starlang_ner as convert_starlang_ner
@@ -1089,7 +1093,13 @@ def process_en_ontonotes(paths, short_name):
     ner_input_path = paths['NERBASE']
     ontonotes_path = os.path.join(ner_input_path, "english", "en_ontonotes")
     ner_output_path = paths['NER_DATA_DIR']
-    convert_en_ontonotes.process_dataset("en_ontonotes", ontonotes_path, ner_output_path)
+    convert_ontonotes.process_dataset("en_ontonotes", ontonotes_path, ner_output_path)
+
+def process_zh_ontonotes(paths, short_name):
+    ner_input_path = paths['NERBASE']
+    ontonotes_path = os.path.join(ner_input_path, "chinese", "zh_ontonotes")
+    ner_output_path = paths['NER_DATA_DIR']
+    convert_ontonotes.process_dataset(short_name, ontonotes_path, ner_output_path)
 
 def process_en_conll03(paths, short_name):
     ner_input_path = paths['NERBASE']
@@ -1166,7 +1176,8 @@ def process_ar_aqmar(paths, short_name):
     "sv_suc3shuffle": process_sv_suc3shuffle,
     "tr_starlang": process_starlang,
     "th_lst20": process_lst20,
-    "th_nner22": process_nner22
+    "th_nner22": process_nner22,
+    "zh-hans_ontonotes": process_zh_ontonotes,
 }
 
 def main(dataset_name):
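
Usage sketch: with this patch applied, the Chinese data would normally be built through the existing entry point, python3 -m stanza.utils.datasets.ner.prepare_ner_dataset zh-hans_ontonotes, since the new "zh-hans_ontonotes" entry in the dataset mapping dispatches to process_zh_ontonotes. The snippet below instead calls the renamed convert_ontonotes module directly; it assumes the HuggingFace datasets package is installed, and the two directory arguments are placeholders standing in for whatever NERBASE and NER_DATA_DIR resolve to in a given setup.

    # Hypothetical smoke test for the new Chinese code path; both paths are placeholders.
    import stanza.utils.datasets.ner.convert_ontonotes as convert_ontonotes

    # "zh-hans_ontonotes" selects the "chinese_v4" config of conll2012_ontonotesv5;
    # load_dataset caches the download under the second argument before the
    # train/validation/test sections are converted and written out.
    convert_ontonotes.process_dataset(
        "zh-hans_ontonotes",
        "extern_data/ner/chinese/zh_ontonotes",  # cache directory passed to load_dataset
        "data/ner",                              # output directory for the converted splits
    )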