Skip to content

Commit

Permalink
Add the zh-hans dataset as well
Browse files Browse the repository at this point in the history
  • Loading branch information
AngledLuffa committed Nov 4, 2023
1 parent bffc517 commit 1f34626
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 7 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,15 @@ def process_dataset(short_name, conll_path, ner_output_path):
except ImportError as e:
raise ImportError("Please install the datasets package to process CoNLL03 with Stanza")

dataset = load_dataset("conll2012_ontonotesv5", "english_v12", cache_dir=conll_path)
if short_name == 'en_ontonotes':
config_name = 'english_v12'
elif short_name in ('zh_ontonotes', 'zh-hans_ontonotes'):
config_name = 'chinese_v4'
elif short_name == 'ar_ontonotes':
config_name = 'arabic_v4'
else:
raise ValueError("Unknown short name for downloading ontonotes: %s" % short_name)
dataset = load_dataset("conll2012_ontonotesv5", config_name, cache_dir=conll_path)
datasets = [convert_dataset_section(x) for x in [dataset['train'], dataset['validation'], dataset['test']]]
write_dataset(datasets, ner_output_path, short_name)

Expand Down
23 changes: 17 additions & 6 deletions stanza/utils/datasets/ner/prepare_ner_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -368,9 +368,6 @@
- Then run
python3 -m stanza.utils.datasets.ner.prepare_ner_dataset hy_armtdp
OntoNotes 5 contains a Chinese NER dataset
- https://catalog.ldc.upenn.edu/LDC2013T19
en_conll03 is the classic 2003 4 class CoNLL dataset
- The version we use is posted on HuggingFace
- https://huggingface.co/datasets/conll2003
Expand All @@ -393,6 +390,13 @@
- then run
python3 stanza/utils/datasets/ner/prepare_ner_dataset.py en_conllpp
en_ontonotes is the OntoNotes 5 on HuggingFace
- this downloads the "v12" version of the data
zh-hans_ontonotes is the ZH split of the OntoNotes dataset
- https://catalog.ldc.upenn.edu/LDC2013T19
AQMAR is a small dataset of Arabic Wikipedia articles
- http://www.cs.cmu.edu/~ark/ArabicNER/
- Recall-Oriented Learning of Named Entities in Arabic Wikipedia
Expand Down Expand Up @@ -429,14 +433,14 @@
import stanza.utils.datasets.ner.convert_bsf_to_beios as convert_bsf_to_beios
import stanza.utils.datasets.ner.convert_bsnlp as convert_bsnlp
import stanza.utils.datasets.ner.convert_en_conll03 as convert_en_conll03
import stanza.utils.datasets.ner.convert_en_ontonotes as convert_en_ontonotes
import stanza.utils.datasets.ner.convert_fire_2013 as convert_fire_2013
import stanza.utils.datasets.ner.convert_ijc as convert_ijc
import stanza.utils.datasets.ner.convert_kk_kazNERD as convert_kk_kazNERD
import stanza.utils.datasets.ner.convert_lst20 as convert_lst20
import stanza.utils.datasets.ner.convert_nner22 as convert_nner22
import stanza.utils.datasets.ner.convert_mr_l3cube as convert_mr_l3cube
import stanza.utils.datasets.ner.convert_my_ucsy as convert_my_ucsy
import stanza.utils.datasets.ner.convert_ontonotes as convert_ontonotes
import stanza.utils.datasets.ner.convert_rgai as convert_rgai
import stanza.utils.datasets.ner.convert_nytk as convert_nytk
import stanza.utils.datasets.ner.convert_starlang_ner as convert_starlang_ner
Expand Down Expand Up @@ -1089,7 +1093,13 @@ def process_en_ontonotes(paths, short_name):
ner_input_path = paths['NERBASE']
ontonotes_path = os.path.join(ner_input_path, "english", "en_ontonotes")
ner_output_path = paths['NER_DATA_DIR']
convert_en_ontonotes.process_dataset("en_ontonotes", ontonotes_path, ner_output_path)
convert_ontonotes.process_dataset("en_ontonotes", ontonotes_path, ner_output_path)

def process_zh_ontonotes(paths, short_name):
    """Prepare the Chinese OntoNotes NER dataset.

    Looks for the raw data under paths['NERBASE']/chinese/zh_ontonotes and
    hands it to the shared OntoNotes converter, which writes the converted
    dataset into paths['NER_DATA_DIR'] under the given short_name.
    """
    base_dir = paths['NERBASE']
    output_dir = paths['NER_DATA_DIR']
    # layout mirrors process_en_ontonotes: <NERBASE>/<language>/<dataset>
    input_dir = os.path.join(base_dir, "chinese", "zh_ontonotes")
    convert_ontonotes.process_dataset(short_name, input_dir, output_dir)

def process_en_conll03(paths, short_name):
ner_input_path = paths['NERBASE']
Expand Down Expand Up @@ -1166,7 +1176,8 @@ def process_ar_aqmar(paths, short_name):
"sv_suc3shuffle": process_sv_suc3shuffle,
"tr_starlang": process_starlang,
"th_lst20": process_lst20,
"th_nner22": process_nner22
"th_nner22": process_nner22,
"zh-hans_ontonotes": process_zh_ontonotes,
}

def main(dataset_name):
Expand Down

0 comments on commit 1f34626

Please sign in to comment.