diff --git a/calamancy/__init__.py b/calamancy/__init__.py index 6294c5f..02181f6 100644 --- a/calamancy/__init__.py +++ b/calamancy/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.1.4" +__version__ = "0.2.0" from .inference import EntityRecognizer, Parser, Tagger from .loaders import get_latest_version, load, models diff --git a/calamancy/loaders.py b/calamancy/loaders.py index ba5316b..f100761 100644 --- a/calamancy/loaders.py +++ b/calamancy/loaders.py @@ -18,6 +18,9 @@ def _get_models_url() -> Dict[str, str]: tracked and the download functions below work as expected. """ return { + "tl_calamancy_md-0.2.0": f"https://huggingface.co/ljvmiranda921/tl_calamancy_md/resolve/{GIT_REF}/tl_calamancy_md-any-py3-none-any.whl", + "tl_calamancy_lg-0.2.0": f"https://huggingface.co/ljvmiranda921/tl_calamancy_lg/resolve/{GIT_REF}/tl_calamancy_lg-any-py3-none-any.whl", + "tl_calamancy_trf-0.2.0": f"https://huggingface.co/ljvmiranda921/tl_calamancy_trf/resolve/{GIT_REF}/tl_calamancy_trf-any-py3-none-any.whl", "tl_calamancy_md-0.1.0": f"https://huggingface.co/ljvmiranda921/tl_calamancy_md-0.1.0/resolve/{GIT_REF}/tl_calamancy_md-0.1.0-py3-none-any.whl", "tl_calamancy_lg-0.1.0": f"https://huggingface.co/ljvmiranda921/tl_calamancy_lg-0.1.0/resolve/{GIT_REF}/tl_calamancy_lg-0.1.0-py3-none-any.whl", "tl_calamancy_trf-0.1.0": f"https://huggingface.co/ljvmiranda921/tl_calamancy_trf-0.1.0/resolve/{GIT_REF}/tl_calamancy_trf-0.1.0-py3-none-any.whl", diff --git a/models/v0.2.0/.gitignore b/models/v0.2.0/.gitignore new file mode 100644 index 0000000..5d96699 --- /dev/null +++ b/models/v0.2.0/.gitignore @@ -0,0 +1,2 @@ +packages +models \ No newline at end of file diff --git a/models/v0.2.0/README.md b/models/v0.2.0/README.md new file mode 100644 index 0000000..807fb44 --- /dev/null +++ b/models/v0.2.0/README.md @@ -0,0 +1,124 @@ + + +# 🪐 Weasel Project: Release v0.2.0 + +This is a spaCy project that trains the v0.2.0 models for calamanCy. 
+Here are some of the major changes in this release: + +- **Included trainable lemmatizer in the pipeline**: instead of a rules-based +lemmatizer, we are now using the [neural edit-tree +lemmatizer](https://explosion.ai/blog/edit-tree-lemmatizer). +- **Trained on UD-NewsCrawl**: this is a major update, as we are now training +our parser, tagger, and morphologizer components on the larger +[UD-NewsCrawl](https://huggingface.co/datasets/UD-Filipino/UD_Tagalog-NewsCrawl) +treebank. Our training dataset has now increased from 150+ to 15,000! From +this point forward, we will be using the UD-TRG and UD-Ugnayan treebanks as +test sets (as intended). +- **Better evaluations**: Aside from evaluating our dependency parser and POS tagger on UD-TRG and UD-Ugnayan, we have also included Universal NER ([Mayhew et al., 2023](https://arxiv.org/abs/2311.09122)) as our test set for evaluating the NER component. +- **Improved base model for tl_calamancy_trf**: Based on internal evaluations, we are now using [mDeBERTa-v3 (base)](https://huggingface.co/microsoft/mdeberta-v3-base) as our source of context-sensitive vectors for tl_calamancy_trf. +- **Simpler pipelines, no more pretraining**: We found that pretraining doesn't really offer huge performance gains (0-1%) given the huge effort and time needed to do it. Hence, for ease of training the whole pipeline, we removed it from the calamanCy recipe. + +The namespaces for the latest models remain the same. +The legacy models will have an explicit version number in their HuggingFace repositories. +Please see [this HuggingFace collection](https://huggingface.co/collections/ljvmiranda921/calamancy-models-for-tagalog-nlp-65629cc46ef2a1d0f9605c87) for more information. + +## Set-up + +You can use this project to replicate the pipelines shipped by the project. 
+First, you need to install the required dependencies: + +```sh +pip install -r requirements.txt +``` + +Then run the set-up commands: + +```sh +python -m spacy project assets +python -m spacy project run setup +``` + +This step downloads all assets and prepares all the datasets and binaries for +training use. For example, if you want to train `tl_calamancy_md`, run the following command: + +```sh +MODEL=tl_calamancy_md scripts/train.sh +``` + + +## Model information + +The table below shows an overview of the calamanCy models in this project. For more information, +I suggest checking the [language pipeline metadata](https://spacy.io/api/language#meta). + + +| Model | Pipelines | Description | +|-----------------------------|---------------------------------------------|--------------------------------------------------------------------------------------------------------------| +| tl_calamancy_md (214 MB) | tok2vec, tagger, trainable_lemmatizer, morphologizer, parser, ner | CPU-optimized Tagalog NLP model. Pretrained using the TLUnified dataset. Using floret vectors (50k keys) | +| tl_calamancy_lg (482 MB) | tok2vec, tagger, trainable_lemmatizer, morphologizer, parser, ner | CPU-optimized large Tagalog NLP model. Pretrained using the TLUnified dataset. Using fastText vectors (714k) | +| tl_calamancy_trf (1.7 GB) | transformer, tagger, trainable_lemmatizer, morphologizer, parser, ner | GPU-optimized transformer Tagalog NLP model. Uses mdeberta-v3-base as context vectors. | + + +## 📋 project.yml + +The [`project.yml`](project.yml) defines the data assets required by the +project, as well as the available commands and workflows. For details, see the +[Weasel documentation](https://github.com/explosion/weasel). + +### ⏯ Commands + +The following commands are defined by the project. They +can be executed using [`weasel run [name]`](https://github.com/explosion/weasel/tree/main/docs/cli.md#rocket-run). +Commands are only re-run if their inputs have changed. 
+ +| Command | Description | +| --- | --- | +| `setup-finetuning-data` | Prepare the Tagalog corpora used for training various spaCy components | +| `setup-fasttext-vectors` | Make fastText vectors spaCy compatible | +| `build-floret` | Build floret binary for training fastText / floret vectors | +| `train-vectors-md` | Train medium-sized word vectors (200 dims, 200k keys) using the floret binary. | +| `train-parser` | Train a trainable_lemmatizer, parser, tagger, and morphologizer using the Universal Dependencies treebanks | +| `train-parser-trf` | Train a trainable_lemmatizer, parser, tagger, and morphologizer using the Universal Dependencies treebanks | +| `train-ner` | Train ner component | +| `train-ner-trf` | Train ner component | +| `assemble` | Assemble pipelines to create a single spaCy pipeline | +| `assemble-trf` | Assemble pipelines to create a single spaCy pipeline | +| `setup-eval-data` | Convert remaining test datasets | +| `evaluate-model` | Evaluate a model | + +### ⏭ Workflows + +The following workflows are defined by the project. They +can be executed using [`weasel run [name]`](https://github.com/explosion/weasel/tree/main/docs/cli.md#rocket-run) +and will run the specified commands in order. Commands are only re-run if their +inputs have changed. + +| Workflow | Steps | +| --- | --- | +| `setup` | `setup-finetuning-data` → `setup-fasttext-vectors` → `build-floret` → `train-vectors-md` | +| `tl-calamancy` | `train-parser` → `train-ner` → `assemble` | +| `tl-calamancy-trf` | `train-parser-trf` → `train-ner-trf` → `assemble-trf` | +| `evaluate` | `setup-eval-data` → `evaluate-model` | + +### 🗂 Assets + +The following assets are defined by the project. They can +be fetched by running [`weasel assets`](https://github.com/explosion/weasel/tree/main/docs/cli.md#open_file_folder-assets) +in the project directory. 
+ +| File | Source | Description | +| --- | --- | --- | +| `assets/tlunified_raw_text.txt` | URL | Pre-converted raw text from TLUnified in JSONL format (1.1 GB). | +| `assets/corpus.tar.gz` | URL | Annotated TLUnified corpora in spaCy format with train, dev, and test splits. | +| `assets/tl_newscrawl-ud-train.conllu` | URL | Train dataset for NewsCrawl | +| `assets/tl_newscrawl-ud-dev.conllu` | URL | Dev dataset for NewsCrawl | +| `assets/tl_newscrawl-ud-test.conllu` | URL | Test dataset for NewsCrawl | +| `assets/tl_trg-ud-test.conllu` | URL | Test dataset for TRG | +| `assets/tl_ugnayan-ud-test.conllu` | URL | Test dataset for Ugnayan | +| `assets/uner_trg.iob2` | URL | Test dataset for Universal NER TRG | +| `assets/uner_ugnayan.iob2` | URL | Test dataset for Universal NER Ugnayan | +| `assets/tfnerd.txt` | URL | Test dataset for TF-NERD | +| `assets/fasttext.tl.gz` | URL | Tagalog fastText vectors provided from the fastText website (trained from CommonCrawl and Wikipedia). | +| `assets/floret` | Git | Floret repository for training floret and fastText models. 
| + + \ No newline at end of file diff --git a/models/v0.2.0/configs/assemble.cfg b/models/v0.2.0/configs/assemble.cfg new file mode 100644 index 0000000..7e57086 --- /dev/null +++ b/models/v0.2.0/configs/assemble.cfg @@ -0,0 +1,38 @@ +[paths] +parser_model = null +ner_model = null + +[nlp] +lang = "tl" +pipeline = ["tok2vec", "trainable_lemmatizer", "tagger", "morphologizer", "parser", "ner"] +tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"} + +[initialize] +vectors = ${paths.parser_model} + +[components] + +[components.tok2vec] +source = ${paths.parser_model} +component = "tok2vec" + +[components.trainable_lemmatizer] +source = ${paths.parser_model} +component = "trainable_lemmatizer" + +[components.tagger] +source = ${paths.parser_model} +component = "tagger" + +[components.morphologizer] +source = ${paths.parser_model} +component = "morphologizer" + +[components.parser] +source = ${paths.parser_model} +component = "parser" + +[components.ner] +source = ${paths.ner_model} +component = "ner" +replace_listeners = ["model.tok2vec"] \ No newline at end of file diff --git a/models/v0.2.0/configs/assemble_trf.cfg b/models/v0.2.0/configs/assemble_trf.cfg new file mode 100644 index 0000000..ba23f3b --- /dev/null +++ b/models/v0.2.0/configs/assemble_trf.cfg @@ -0,0 +1,35 @@ +[paths] +parser_model = null +ner_model = null + +[nlp] +lang = "tl" +pipeline = ["transformer", "trainable_lemmatizer", "tagger", "morphologizer", "parser", "ner"] +tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"} + +[components] + +[components.transformer] +source = ${paths.parser_model} +component = "transformer" + +[components.trainable_lemmatizer] +source = ${paths.parser_model} +component = "trainable_lemmatizer" + +[components.tagger] +source = ${paths.parser_model} +component = "tagger" + +[components.morphologizer] +source = ${paths.parser_model} +component = "morphologizer" + +[components.parser] +source = ${paths.parser_model} +component = "parser" + +[components.ner] +source = 
${paths.ner_model} +component = "ner" +replace_listeners = ["model.tok2vec"] \ No newline at end of file diff --git a/models/v0.2.0/configs/ner.cfg b/models/v0.2.0/configs/ner.cfg new file mode 100644 index 0000000..1d10070 --- /dev/null +++ b/models/v0.2.0/configs/ner.cfg @@ -0,0 +1,145 @@ +[paths] +train = null +dev = null +vectors = null +init_tok2vec = null + +[system] +gpu_allocator = null +seed = 0 + +[nlp] +lang = "tl" +pipeline = ["tok2vec","ner"] +batch_size = 1000 +disabled = [] +before_creation = null +after_creation = null +after_pipeline_creation = null +tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"} +vectors = {"@vectors":"spacy.Vectors.v1"} + +[components] + +[components.ner] +factory = "ner" +incorrect_spans_key = null +moves = null +scorer = {"@scorers":"spacy.ner_scorer.v1"} +update_with_oracle_cut_size = 100 + +[components.ner.model] +@architectures = "spacy.TransitionBasedParser.v2" +state_type = "ner" +extra_state_tokens = false +hidden_width = 64 +maxout_pieces = 2 +use_upper = true +nO = null + +[components.ner.model.tok2vec] +@architectures = "spacy.Tok2VecListener.v1" +width = ${components.tok2vec.model.encode.width} +upstream = "*" + +[components.tok2vec] +factory = "tok2vec" + +[components.tok2vec.model] +@architectures = "spacy.Tok2Vec.v2" + +[components.tok2vec.model.embed] +@architectures = "spacy.MultiHashEmbed.v2" +width = ${components.tok2vec.model.encode.width} +attrs = ["NORM","PREFIX","SUFFIX","SHAPE"] +rows = [5000,1000,2500,2500] +include_static_vectors = true + +[components.tok2vec.model.encode] +@architectures = "spacy.MaxoutWindowEncoder.v2" +width = 256 +depth = 8 +window_size = 1 +maxout_pieces = 3 + +[corpora] + +[corpora.dev] +@readers = "spacy.Corpus.v1" +path = ${paths.dev} +max_length = 0 +gold_preproc = false +limit = 0 +augmenter = null + +[corpora.train] +@readers = "spacy.Corpus.v1" +path = ${paths.train} +max_length = 0 +gold_preproc = false +limit = 0 +augmenter = null + +[training] +dev_corpus = "corpora.dev" 
+train_corpus = "corpora.train" +seed = ${system.seed} +gpu_allocator = ${system.gpu_allocator} +dropout = 0.1 +accumulate_gradient = 1 +patience = 1600 +max_epochs = 0 +max_steps = 20000 +eval_frequency = 200 +frozen_components = [] +annotating_components = [] +before_to_disk = null +before_update = null + +[training.batcher] +@batchers = "spacy.batch_by_words.v1" +discard_oversize = false +tolerance = 0.2 +get_length = null + +[training.batcher.size] +@schedules = "compounding.v1" +start = 100 +stop = 1000 +compound = 1.001 +t = 0.0 + +[training.logger] +@loggers = "spacy.ConsoleLogger.v1" +progress_bar = false + +[training.optimizer] +@optimizers = "Adam.v1" +beta1 = 0.9 +beta2 = 0.999 +L2_is_weight_decay = true +L2 = 0.01 +grad_clip = 1.0 +use_averages = false +eps = 0.00000001 +learn_rate = 0.001 + +[training.score_weights] +ents_f = 1.0 +ents_p = 0.0 +ents_r = 0.0 +ents_per_type = null + +[pretraining] + +[initialize] +vectors = ${paths.vectors} +init_tok2vec = ${paths.init_tok2vec} +vocab_data = null +lookups = null +before_init = null +after_init = null + +[initialize.components] + +[initialize.tokenizer] \ No newline at end of file diff --git a/models/v0.2.0/configs/ner_trf.cfg b/models/v0.2.0/configs/ner_trf.cfg new file mode 100644 index 0000000..5ded721 --- /dev/null +++ b/models/v0.2.0/configs/ner_trf.cfg @@ -0,0 +1,147 @@ +[paths] +train = null +dev = null +vectors = null +init_tok2vec = null + +[system] +gpu_allocator = "pytorch" +seed = 0 + +[nlp] +lang = "tl" +pipeline = ["transformer","ner"] +batch_size = 128 +disabled = [] +before_creation = null +after_creation = null +after_pipeline_creation = null +tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"} +vectors = {"@vectors":"spacy.Vectors.v1"} + +[components] + +[components.ner] +factory = "ner" +incorrect_spans_key = null +moves = null +scorer = {"@scorers":"spacy.ner_scorer.v1"} +update_with_oracle_cut_size = 100 + +[components.ner.model] +@architectures = "spacy.TransitionBasedParser.v2" 
+state_type = "ner" +extra_state_tokens = false +hidden_width = 64 +maxout_pieces = 2 +use_upper = false +nO = null + +[components.ner.model.tok2vec] +@architectures = "spacy-transformers.TransformerListener.v1" +grad_factor = 1.0 +pooling = {"@layers":"reduce_mean.v1"} +upstream = "*" + +[components.transformer] +factory = "transformer" +max_batch_items = 4096 +set_extra_annotations = {"@annotation_setters":"spacy-transformers.null_annotation_setter.v1"} + +[components.transformer.model] +@architectures = "spacy-transformers.TransformerModel.v3" +name = "bert-base-multilingual-uncased" +mixed_precision = false + +[components.transformer.model.get_spans] +@span_getters = "spacy-transformers.strided_spans.v1" +window = 128 +stride = 96 + +[components.transformer.model.grad_scaler_config] + +[components.transformer.model.tokenizer_config] +use_fast = true + +[components.transformer.model.transformer_config] + +[corpora] + +[corpora.dev] +@readers = "spacy.Corpus.v1" +path = ${paths.dev} +max_length = 0 +gold_preproc = false +limit = 0 +augmenter = null + +[corpora.train] +@readers = "spacy.Corpus.v1" +path = ${paths.train} +max_length = 0 +gold_preproc = false +limit = 0 +augmenter = null + +[training] +accumulate_gradient = 3 +dev_corpus = "corpora.dev" +train_corpus = "corpora.train" +seed = ${system.seed} +gpu_allocator = ${system.gpu_allocator} +dropout = 0.1 +patience = 1600 +max_epochs = 0 +max_steps = 20000 +eval_frequency = 200 +frozen_components = [] +annotating_components = [] +before_to_disk = null +before_update = null + +[training.batcher] +@batchers = "spacy.batch_by_padded.v1" +discard_oversize = true +size = 2000 +buffer = 256 +get_length = null + +[training.logger] +@loggers = "spacy.ConsoleLogger.v1" +progress_bar = false + +[training.optimizer] +@optimizers = "Adam.v1" +beta1 = 0.9 +beta2 = 0.999 +L2_is_weight_decay = true +L2 = 0.01 +grad_clip = 1.0 +use_averages = false +eps = 0.00000001 + +[training.optimizer.learn_rate] +@schedules = 
"warmup_linear.v1" +warmup_steps = 250 +total_steps = 20000 +initial_rate = 0.00005 + +[training.score_weights] +ents_f = 1.0 +ents_p = 0.0 +ents_r = 0.0 +ents_per_type = null + +[pretraining] + +[initialize] +vectors = ${paths.vectors} +init_tok2vec = ${paths.init_tok2vec} +vocab_data = null +lookups = null +before_init = null +after_init = null + +[initialize.components] + +[initialize.tokenizer] \ No newline at end of file diff --git a/models/v0.2.0/configs/parser.cfg b/models/v0.2.0/configs/parser.cfg new file mode 100644 index 0000000..57c1091 --- /dev/null +++ b/models/v0.2.0/configs/parser.cfg @@ -0,0 +1,205 @@ +[paths] +train = null +dev = null +vectors = null +init_tok2vec = null + +[system] +gpu_allocator = null +seed = 0 + +[nlp] +lang = "tl" +pipeline = ["tok2vec","trainable_lemmatizer","morphologizer","tagger","parser"] +batch_size = 1000 +disabled = [] +before_creation = null +after_creation = null +after_pipeline_creation = null +tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"} +vectors = {"@vectors":"spacy.Vectors.v1"} + +[components] + +[components.morphologizer] +factory = "morphologizer" +extend = false +label_smoothing = 0.05 +overwrite = true +scorer = {"@scorers":"spacy.morphologizer_scorer.v1"} + +[components.morphologizer.model] +@architectures = "spacy.Tagger.v2" +nO = null +normalize = false + +[components.morphologizer.model.tok2vec] +@architectures = "spacy.Tok2VecListener.v1" +width = ${components.tok2vec.model.encode.width} +upstream = "*" + +[components.parser] +factory = "parser" +learn_tokens = false +min_action_freq = 30 +moves = null +scorer = {"@scorers":"spacy.parser_scorer.v1"} +update_with_oracle_cut_size = 100 + +[components.parser.model] +@architectures = "spacy.TransitionBasedParser.v2" +state_type = "parser" +extra_state_tokens = false +hidden_width = 128 +maxout_pieces = 3 +use_upper = true +nO = null + +[components.parser.model.tok2vec] +@architectures = "spacy.Tok2VecListener.v1" +width = 
${components.tok2vec.model.encode.width} +upstream = "*" + +[components.tagger] +factory = "tagger" +label_smoothing = 0.05 +neg_prefix = "!" +overwrite = false +scorer = {"@scorers":"spacy.tagger_scorer.v1"} + +[components.tagger.model] +@architectures = "spacy.Tagger.v2" +nO = null +normalize = false + +[components.tagger.model.tok2vec] +@architectures = "spacy.Tok2VecListener.v1" +width = ${components.tok2vec.model.encode.width} +upstream = "*" + +[components.tok2vec] +factory = "tok2vec" + +[components.tok2vec.model] +@architectures = "spacy.Tok2Vec.v2" + +[components.tok2vec.model.embed] +@architectures = "spacy.MultiHashEmbed.v2" +width = ${components.tok2vec.model.encode.width} +attrs = ["NORM","PREFIX","SUFFIX","SHAPE"] +rows = [5000,1000,2500,2500] +include_static_vectors = true + +[components.tok2vec.model.encode] +@architectures = "spacy.MaxoutWindowEncoder.v2" +width = 256 +depth = 8 +window_size = 1 +maxout_pieces = 3 + +[components.trainable_lemmatizer] +factory = "trainable_lemmatizer" +backoff = "orth" +min_tree_freq = 3 +overwrite = false +scorer = {"@scorers":"spacy.lemmatizer_scorer.v1"} +top_k = 1 + +[components.trainable_lemmatizer.model] +@architectures = "spacy.Tagger.v2" +nO = null +normalize = false + +[components.trainable_lemmatizer.model.tok2vec] +@architectures = "spacy.Tok2VecListener.v1" +width = ${components.tok2vec.model.encode.width} +upstream = "*" + +[corpora] + +[corpora.dev] +@readers = "spacy.Corpus.v1" +path = ${paths.dev} +max_length = 0 +gold_preproc = false +limit = 0 +augmenter = null + +[corpora.train] +@readers = "spacy.Corpus.v1" +path = ${paths.train} +max_length = 0 +gold_preproc = false +limit = 0 +augmenter = null + +[training] +dev_corpus = "corpora.dev" +train_corpus = "corpora.train" +seed = ${system.seed} +gpu_allocator = ${system.gpu_allocator} +dropout = 0.1 +accumulate_gradient = 1 +patience = 1600 +max_epochs = 0 +max_steps = 20000 +eval_frequency = 200 +frozen_components = [] +annotating_components = [] 
+before_to_disk = null +before_update = null + +[training.batcher] +@batchers = "spacy.batch_by_words.v1" +discard_oversize = false +tolerance = 0.2 +get_length = null + +[training.batcher.size] +@schedules = "compounding.v1" +start = 100 +stop = 1000 +compound = 1.001 +t = 0.0 + +[training.logger] +@loggers = "spacy.ConsoleLogger.v1" +progress_bar = false + +[training.optimizer] +@optimizers = "Adam.v1" +beta1 = 0.9 +beta2 = 0.999 +L2_is_weight_decay = true +L2 = 0.01 +grad_clip = 1.0 +use_averages = false +eps = 0.00000001 +learn_rate = 0.001 + +[training.score_weights] +lemma_acc = 0.25 +pos_acc = 0.12 +morph_acc = 0.12 +morph_per_feat = null +tag_acc = 0.25 +dep_uas = 0.12 +dep_las = 0.12 +dep_las_per_type = null +sents_p = null +sents_r = null +sents_f = 0.0 + +[pretraining] + +[initialize] +vectors = ${paths.vectors} +init_tok2vec = ${paths.init_tok2vec} +vocab_data = null +lookups = null +before_init = null +after_init = null + +[initialize.components] + +[initialize.tokenizer] \ No newline at end of file diff --git a/models/v0.2.0/configs/parser_trf.cfg b/models/v0.2.0/configs/parser_trf.cfg new file mode 100644 index 0000000..4a224f8 --- /dev/null +++ b/models/v0.2.0/configs/parser_trf.cfg @@ -0,0 +1,210 @@ +[paths] +train = null +dev = null +vectors = null +init_tok2vec = null + +[system] +gpu_allocator = "pytorch" +seed = 0 + +[nlp] +lang = "tl" +pipeline = ["transformer","trainable_lemmatizer","morphologizer","tagger","parser"] +batch_size = 128 +disabled = [] +before_creation = null +after_creation = null +after_pipeline_creation = null +tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"} +vectors = {"@vectors":"spacy.Vectors.v1"} + +[components] + +[components.morphologizer] +factory = "morphologizer" +extend = false +label_smoothing = 0.0 +overwrite = true +scorer = {"@scorers":"spacy.morphologizer_scorer.v1"} + +[components.morphologizer.model] +@architectures = "spacy.Tagger.v2" +nO = null +normalize = false + +[components.morphologizer.model.tok2vec] 
+@architectures = "spacy-transformers.TransformerListener.v1" +grad_factor = 1.0 +pooling = {"@layers":"reduce_mean.v1"} +upstream = "*" + +[components.parser] +factory = "parser" +learn_tokens = false +min_action_freq = 30 +moves = null +scorer = {"@scorers":"spacy.parser_scorer.v1"} +update_with_oracle_cut_size = 100 + +[components.parser.model] +@architectures = "spacy.TransitionBasedParser.v2" +state_type = "parser" +extra_state_tokens = false +hidden_width = 128 +maxout_pieces = 3 +use_upper = false +nO = null + +[components.parser.model.tok2vec] +@architectures = "spacy-transformers.TransformerListener.v1" +grad_factor = 1.0 +pooling = {"@layers":"reduce_mean.v1"} +upstream = "*" + +[components.tagger] +factory = "tagger" +label_smoothing = 0.0 +neg_prefix = "!" +overwrite = false +scorer = {"@scorers":"spacy.tagger_scorer.v1"} + +[components.tagger.model] +@architectures = "spacy.Tagger.v2" +nO = null +normalize = false + +[components.tagger.model.tok2vec] +@architectures = "spacy-transformers.TransformerListener.v1" +grad_factor = 1.0 +pooling = {"@layers":"reduce_mean.v1"} +upstream = "*" + +[components.trainable_lemmatizer] +factory = "trainable_lemmatizer" +backoff = "orth" +min_tree_freq = 3 +overwrite = false +scorer = {"@scorers":"spacy.lemmatizer_scorer.v1"} +top_k = 1 + +[components.trainable_lemmatizer.model] +@architectures = "spacy.Tagger.v2" +nO = null +normalize = false + +[components.trainable_lemmatizer.model.tok2vec] +@architectures = "spacy-transformers.TransformerListener.v1" +grad_factor = 1.0 +pooling = {"@layers":"reduce_mean.v1"} +upstream = "*" + +[components.transformer] +factory = "transformer" +max_batch_items = 4096 +set_extra_annotations = {"@annotation_setters":"spacy-transformers.null_annotation_setter.v1"} + +[components.transformer.model] +@architectures = "spacy-transformers.TransformerModel.v3" +name = "bert-base-multilingual-uncased" +mixed_precision = false + +[components.transformer.model.get_spans] +@span_getters = 
"spacy-transformers.strided_spans.v1" +window = 128 +stride = 96 + +[components.transformer.model.grad_scaler_config] + +[components.transformer.model.tokenizer_config] +use_fast = true + +[components.transformer.model.transformer_config] + +[corpora] + +[corpora.dev] +@readers = "spacy.Corpus.v1" +path = ${paths.dev} +max_length = 0 +gold_preproc = false +limit = 0 +augmenter = null + +[corpora.train] +@readers = "spacy.Corpus.v1" +path = ${paths.train} +max_length = 0 +gold_preproc = false +limit = 0 +augmenter = null + +[training] +accumulate_gradient = 3 +dev_corpus = "corpora.dev" +train_corpus = "corpora.train" +seed = ${system.seed} +gpu_allocator = ${system.gpu_allocator} +dropout = 0.1 +patience = 1600 +max_epochs = 0 +max_steps = 20000 +eval_frequency = 200 +frozen_components = [] +annotating_components = [] +before_to_disk = null +before_update = null + +[training.batcher] +@batchers = "spacy.batch_by_padded.v1" +discard_oversize = true +size = 2000 +buffer = 256 +get_length = null + +[training.logger] +@loggers = "spacy.ConsoleLogger.v1" +progress_bar = false + +[training.optimizer] +@optimizers = "Adam.v1" +beta1 = 0.9 +beta2 = 0.999 +L2_is_weight_decay = true +L2 = 0.01 +grad_clip = 1.0 +use_averages = false +eps = 0.00000001 + +[training.optimizer.learn_rate] +@schedules = "warmup_linear.v1" +warmup_steps = 250 +total_steps = 20000 +initial_rate = 0.00005 + +[training.score_weights] +lemma_acc = 0.25 +pos_acc = 0.12 +morph_acc = 0.12 +morph_per_feat = null +tag_acc = 0.25 +dep_uas = 0.12 +dep_las = 0.12 +dep_las_per_type = null +sents_p = null +sents_r = null +sents_f = 0.0 + +[pretraining] + +[initialize] +vectors = ${paths.vectors} +init_tok2vec = ${paths.init_tok2vec} +vocab_data = null +lookups = null +before_init = null +after_init = null + +[initialize.components] + +[initialize.tokenizer] \ No newline at end of file diff --git a/models/v0.2.0/evals/tl_calamancy_lg/dep_newscrawl.json b/models/v0.2.0/evals/tl_calamancy_lg/dep_newscrawl.json new 
file mode 100644 index 0000000..4f1a17b --- /dev/null +++ b/models/v0.2.0/evals/tl_calamancy_lg/dep_newscrawl.json @@ -0,0 +1,287 @@ +{ + "tokenizer":{ + "token_acc":0.9977388853, + "token_p":0.9312562401, + "token_r":0.9697572014, + "token_f":0.9501168434 + }, + "tok2vec":{ + + }, + "trainable_lemmatizer":{ + "lemma_acc":0.8978697212 + }, + "tagger":{ + "tag_acc":0.9062184874 + }, + "morphologizer":{ + "pos_acc":0.9499112661, + "morph_acc":0.9503589524, + "morph_micro_p":0.9564037249, + "morph_micro_r":0.9267216962, + "morph_micro_f":0.9413287849, + "morph_per_feat":{ + "Aspect":{ + "p":0.9066862601, + "r":0.8773551369, + "f":0.8917795845 + }, + "Mood":{ + "p":0.9299410029, + "r":0.8965517241, + "f":0.9129411765 + }, + "Voice":{ + "p":0.816947909, + "r":0.7925266904, + "f":0.8045520231 + }, + "Case":{ + "p":0.9935773924, + "r":0.9861040286, + "f":0.9898266044 + }, + "Number":{ + "p":0.9934378418, + "r":0.9919912632, + "f":0.9927140255 + }, + "Person":{ + "p":0.9976726144, + "r":0.986953185, + "f":0.9922839506 + }, + "PronType":{ + "p":0.9955201593, + "r":0.9652509653, + "f":0.9801519235 + }, + "NumType":{ + "p":0.9887323944, + "r":0.858190709, + "f":0.9188481675 + }, + "Deixis":{ + "p":0.9856373429, + "r":0.9134775374, + "f":0.9481865285 + }, + "Abbr":{ + "p":1.0, + "r":0.0220588235, + "f":0.0431654676 + }, + "Polarity":{ + "p":0.9949874687, + "r":0.9612590799, + "f":0.9778325123 + }, + "Typo":{ + "p":0.8461538462, + "r":0.2340425532, + "f":0.3666666667 + }, + "Degree":{ + "p":0.9545454545, + "r":0.6, + "f":0.7368421053 + }, + "Clusivity":{ + "p":1.0, + "r":0.9947089947, + "f":0.9973474801 + }, + "PartType":{ + "p":1.0, + "r":0.9166666667, + "f":0.9565217391 + }, + "Polite":{ + "p":1.0, + "r":0.972972973, + "f":0.9863013699 + } + } + }, + "parser":{ + "sents_p":0.9929846939, + "sents_r":0.9961612284, + "sents_f":0.9945704248, + "dep_uas":0.8290118896, + "dep_las":0.7649567646, + "dep_las_per_type":{ + "root":{ + "p":0.8471867008, + "r":0.8477287268, + 
"f":0.8474576271 + }, + "advmod":{ + "p":0.8158415842, + "r":0.8155186065, + "f":0.8156800634 + }, + "case":{ + "p":0.8995983936, + "r":0.9016100179, + "f":0.9006030824 + }, + "compound":{ + "p":0.5083333333, + "r":0.1986970684, + "f":0.2857142857 + }, + "obj":{ + "p":0.7595356551, + "r":0.7633333333, + "f":0.7614297589 + }, + "det":{ + "p":0.946506986, + "r":0.9243664717, + "f":0.9353057199 + }, + "nsubj":{ + "p":0.7877620881, + "r":0.8071021482, + "f":0.7973148549 + }, + "nmod:poss":{ + "p":0.8170212766, + "r":0.739172281, + "f":0.7761495705 + }, + "obl":{ + "p":0.6779953917, + "r":0.6783861671, + "f":0.6781907231 + }, + "cc":{ + "p":0.8476658477, + "r":0.8414634146, + "f":0.8445532436 + }, + "conj":{ + "p":0.6095661846, + "r":0.6192090395, + "f":0.6143497758 + }, + "fixed":{ + "p":0.7301587302, + "r":0.7155555556, + "f":0.7227833895 + }, + "amod":{ + "p":0.7474358974, + "r":0.7581274382, + "f":0.7527437056 + }, + "mark":{ + "p":0.8023097826, + "r":0.7739187418, + "f":0.7878585724 + }, + "acl:relcl":{ + "p":0.6583333333, + "r":0.6379542396, + "f":0.6479835954 + }, + "nmod":{ + "p":0.5499557913, + "r":0.6000964785, + "f":0.5739331027 + }, + "ccomp":{ + "p":0.4904458599, + "r":0.5892857143, + "f":0.5353418308 + }, + "advcl":{ + "p":0.48, + "r":0.5428571429, + "f":0.5094972067 + }, + "flat":{ + "p":0.7911212294, + "r":0.8403869407, + "f":0.8150102609 + }, + "discourse":{ + "p":0.747300216, + "r":0.7284210526, + "f":0.7377398721 + }, + "parataxis":{ + "p":0.298245614, + "r":0.3109756098, + "f":0.3044776119 + }, + "nummod":{ + "p":0.8362720403, + "r":0.8279301746, + "f":0.8320802005 + }, + "obj:agent":{ + "p":0.7804568528, + "r":0.8401639344, + "f":0.8092105263 + }, + "dep":{ + "p":0.1724137931, + "r":0.3086419753, + "f":0.2212389381 + }, + "compound:redup":{ + "p":0.3636363636, + "r":0.3555555556, + "f":0.3595505618 + }, + "xcomp":{ + "p":0.6303317536, + "r":0.5518672199, + "f":0.5884955752 + }, + "dislocated":{ + "p":0.253968254, + "r":0.2025316456, + 
"f":0.2253521127 + }, + "appos":{ + "p":0.5118483412, + "r":0.577540107, + "f":0.5427135678 + }, + "acl":{ + "p":0.25, + "r":0.06, + "f":0.0967741935 + }, + "list":{ + "p":0.15625, + "r":0.3125, + "f":0.2083333333 + }, + "vocative":{ + "p":0.5454545455, + "r":0.4, + "f":0.4615384615 + }, + "goeswith":{ + "p":0.0, + "r":0.0, + "f":0.0 + }, + "orphan":{ + "p":0.0, + "r":0.0, + "f":0.0 + } + } + }, + "ner":{ + "ents_p":null, + "ents_r":null, + "ents_f":null, + "ents_per_type":null + }, + "speed":6844.0816094907 +} \ No newline at end of file diff --git a/models/v0.2.0/evals/tl_calamancy_lg/dep_trg.json b/models/v0.2.0/evals/tl_calamancy_lg/dep_trg.json new file mode 100644 index 0000000..8912d07 --- /dev/null +++ b/models/v0.2.0/evals/tl_calamancy_lg/dep_trg.json @@ -0,0 +1,262 @@ +{ + "tokenizer":{ + "token_acc":1.0, + "token_p":1.0, + "token_r":1.0, + "token_f":1.0 + }, + "tok2vec":{ + + }, + "trainable_lemmatizer":{ + "lemma_acc":0.7888283379 + }, + "tagger":{ + "tag_acc":0.5667574932 + }, + "morphologizer":{ + "pos_acc":0.7792915531, + "morph_acc":0.7152588556, + "morph_micro_p":0.8235294118, + "morph_micro_r":0.7139303483, + "morph_micro_f":0.764823451, + "morph_per_feat":{ + "Aspect":{ + "p":0.6630434783, + "r":0.6354166667, + "f":0.6489361702 + }, + "Mood":{ + "p":0.7282608696, + "r":0.6979166667, + "f":0.7127659574 + }, + "Voice":{ + "p":0.8369565217, + "r":0.7777777778, + "f":0.8062827225 + }, + "Case":{ + "p":0.8616071429, + "r":0.8772727273, + "f":0.8693693694 + }, + "Number":{ + "p":0.7638888889, + "r":0.9649122807, + "f":0.8527131783 + }, + "Person":{ + "p":1.0, + "r":1.0, + "f":1.0 + }, + "PronType":{ + "p":0.9818181818, + "r":0.675, + "f":0.8 + }, + "Foreign":{ + "p":0.0, + "r":0.0, + "f":0.0 + }, + "Degree":{ + "p":0.0, + "r":0.0, + "f":0.0 + }, + "Deixis":{ + "p":1.0, + "r":1.0, + "f":1.0 + }, + "Gender":{ + "p":0.0, + "r":0.0, + "f":0.0 + }, + "Link":{ + "p":0.0, + "r":0.0, + "f":0.0 + }, + "Polarity":{ + "p":0.8181818182, + "r":0.5, + 
"f":0.6206896552 + }, + "Clusivity":{ + "p":1.0, + "r":1.0, + "f":1.0 + }, + "Reflex":{ + "p":0.0, + "r":0.0, + "f":0.0 + }, + "PartType":{ + "p":1.0, + "r":0.3846153846, + "f":0.5555555556 + } + } + }, + "parser":{ + "sents_p":1.0, + "sents_r":1.0, + "sents_f":1.0, + "dep_uas":0.9428334714, + "dep_las":0.676056338, + "dep_las_per_type":{ + "root":{ + "p":0.9765625, + "r":0.9765625, + "f":0.9765625 + }, + "case":{ + "p":0.9864864865, + "r":0.4124293785, + "f":0.5816733068 + }, + "nsubj":{ + "p":0.6982758621, + "r":0.8804347826, + "f":0.7788461538 + }, + "det":{ + "p":0.0761904762, + "r":0.7272727273, + "f":0.1379310345 + }, + "obj:agent":{ + "p":0.625, + "r":0.8928571429, + "f":0.7352941176 + }, + "nsubj:pass":{ + "p":0.0, + "r":0.0, + "f":0.0 + }, + "obj":{ + "p":0.84, + "r":0.6363636364, + "f":0.724137931 + }, + "advmod":{ + "p":0.7179487179, + "r":0.7368421053, + "f":0.7272727273 + }, + "csubj":{ + "p":0.0, + "r":0.0, + "f":0.0 + }, + "nmod:poss":{ + "p":0.25, + "r":1.0, + "f":0.4 + }, + "mark":{ + "p":0.875, + "r":1.0, + "f":0.9333333333 + }, + "compound:redup":{ + "p":1.0, + "r":0.5, + "f":0.6666666667 + }, + "fixed":{ + "p":0.0, + "r":0.0, + "f":0.0 + }, + "advcl":{ + "p":0.7142857143, + "r":0.8333333333, + "f":0.7692307692 + }, + "obl":{ + "p":0.8888888889, + "r":0.8888888889, + "f":0.8888888889 + }, + "iobj":{ + "p":0.0, + "r":0.0, + "f":0.0 + }, + "nsubj:lfoc":{ + "p":0.0, + "r":0.0, + "f":0.0 + }, + "nmod":{ + "p":0.1428571429, + "r":0.1428571429, + "f":0.1428571429 + }, + "flat":{ + "p":1.0, + "r":1.0, + "f":1.0 + }, + "acl:relcl":{ + "p":1.0, + "r":1.0, + "f":1.0 + }, + "dislocated":{ + "p":0.0, + "r":0.0, + "f":0.0 + }, + "nsubj:ifoc":{ + "p":0.0, + "r":0.0, + "f":0.0 + }, + "ccomp":{ + "p":0.2, + "r":0.5, + "f":0.2857142857 + }, + "aux":{ + "p":0.0, + "r":0.0, + "f":0.0 + }, + "iobj:patient":{ + "p":0.0, + "r":0.0, + "f":0.0 + }, + "nsubj:bfoc":{ + "p":0.0, + "r":0.0, + "f":0.0 + }, + "compound":{ + "p":0.0, + "r":0.0, + "f":0.0 + }, + "xcomp":{ + 
"p":1.0, + "r":0.3333333333, + "f":0.5 + } + } + }, + "ner":{ + "ents_p":null, + "ents_r":null, + "ents_f":null, + "ents_per_type":null + }, + "speed":209.7349308484 +} \ No newline at end of file diff --git a/models/v0.2.0/evals/tl_calamancy_lg/dep_ugnayan.json b/models/v0.2.0/evals/tl_calamancy_lg/dep_ugnayan.json new file mode 100644 index 0000000..78e2f97 --- /dev/null +++ b/models/v0.2.0/evals/tl_calamancy_lg/dep_ugnayan.json @@ -0,0 +1,186 @@ +{ + "tokenizer":{ + "token_acc":0.9950884086, + "token_p":0.9774066798, + "token_r":0.9841740851, + "token_f":0.9807787087 + }, + "tok2vec":{ + + }, + "trainable_lemmatizer":{ + "lemma_acc":0.8229475767 + }, + "tagger":{ + "tag_acc":0.4858420268 + }, + "morphologizer":{ + "pos_acc":0.8166915052, + "morph_acc":0.5894632207, + "morph_micro_p":null, + "morph_micro_r":null, + "morph_micro_f":null, + "morph_per_feat":null + }, + "parser":{ + "sents_p":1.0, + "sents_r":1.0, + "sents_f":1.0, + "dep_uas":0.8091690544, + "dep_las":0.5846681922, + "dep_las_per_type":{ + "vocative":{ + "p":0.0, + "r":0.0, + "f":0.0 + }, + "advmod":{ + "p":0.7222222222, + "r":0.8125, + "f":0.7647058824 + }, + "mark":{ + "p":0.7391304348, + "r":0.3269230769, + "f":0.4533333333 + }, + "nsubj":{ + "p":0.8139534884, + "r":0.7692307692, + "f":0.790960452 + }, + "root":{ + "p":0.8829787234, + "r":0.8829787234, + "f":0.8829787234 + }, + "case":{ + "p":0.7984496124, + "r":0.591954023, + "f":0.6798679868 + }, + "obl":{ + "p":0.5471698113, + "r":0.6170212766, + "f":0.58 + }, + "nmod":{ + "p":0.5882352941, + "r":0.4395604396, + "f":0.5031446541 + }, + "parataxis":{ + "p":0.4545454545, + "r":0.4545454545, + "f":0.4545454545 + }, + "obj":{ + "p":0.5135135135, + "r":0.3454545455, + "f":0.4130434783 + }, + "dislocated":{ + "p":0.0, + "r":0.0, + "f":0.0 + }, + "nmod:poss":{ + "p":0.0, + "r":0.0, + "f":0.0 + }, + "discourse":{ + "p":0.1538461538, + "r":0.5, + "f":0.2352941176 + }, + "advcl":{ + "p":0.3, + "r":0.5454545455, + "f":0.3870967742 + }, + "ccomp":{ + 
"p":0.1111111111, + "r":0.5, + "f":0.1818181818 + }, + "det":{ + "p":0.1891891892, + "r":0.56, + "f":0.2828282828 + }, + "obj:agent":{ + "p":0.0, + "r":0.0, + "f":0.0 + }, + "cc":{ + "p":0.85, + "r":0.7727272727, + "f":0.8095238095 + }, + "conj":{ + "p":0.7894736842, + "r":0.7142857143, + "f":0.75 + }, + "nummod":{ + "p":0.75, + "r":1.0, + "f":0.8571428571 + }, + "amod":{ + "p":0.6666666667, + "r":0.5581395349, + "f":0.6075949367 + }, + "fixed":{ + "p":1.0, + "r":0.6428571429, + "f":0.7826086957 + }, + "acl":{ + "p":0.0, + "r":0.0, + "f":0.0 + }, + "acl:relcl":{ + "p":0.0, + "r":0.0, + "f":0.0 + }, + "compound":{ + "p":0.0, + "r":0.0, + "f":0.0 + }, + "dep":{ + "p":0.0, + "r":0.0, + "f":0.0 + }, + "compound:redup":{ + "p":0.0, + "r":0.0, + "f":0.0 + }, + "xcomp":{ + "p":0.0, + "r":0.0, + "f":0.0 + }, + "flat":{ + "p":0.2857142857, + "r":1.0, + "f":0.4444444444 + } + } + }, + "ner":{ + "ents_p":null, + "ents_r":null, + "ents_f":null, + "ents_per_type":null + }, + "speed":216.6869950178 +} \ No newline at end of file diff --git a/models/v0.2.0/evals/tl_calamancy_lg/ner_tfnerd.json b/models/v0.2.0/evals/tl_calamancy_lg/ner_tfnerd.json new file mode 100644 index 0000000..178485a --- /dev/null +++ b/models/v0.2.0/evals/tl_calamancy_lg/ner_tfnerd.json @@ -0,0 +1,56 @@ +{ + "tokenizer":{ + "token_acc":null, + "token_p":null, + "token_r":null, + "token_f":null + }, + "tok2vec":{ + + }, + "trainable_lemmatizer":{ + "lemma_acc":null + }, + "tagger":{ + "tag_acc":null + }, + "morphologizer":{ + "pos_acc":null, + "morph_acc":null, + "morph_micro_p":null, + "morph_micro_r":null, + "morph_micro_f":null, + "morph_per_feat":null + }, + "parser":{ + "sents_p":null, + "sents_r":null, + "sents_f":null, + "dep_uas":null, + "dep_las":null, + "dep_las_per_type":null + }, + "ner":{ + "ents_p":0.6725826193, + "ents_r":0.7058445729, + "ents_f":0.6888122846, + "ents_per_type":{ + "LOC":{ + "p":0.6135957066, + "r":0.6751968504, + "f":0.6429240862 + }, + "PER":{ + "p":0.7710997442, + 
"r":0.8688760807, + "f":0.8170731707 + }, + "ORG":{ + "p":0.5221843003, + "r":0.4309859155, + "f":0.4722222222 + } + } + }, + "speed":5675.9653792598 +} \ No newline at end of file diff --git a/models/v0.2.0/evals/tl_calamancy_lg/ner_tlunified.json b/models/v0.2.0/evals/tl_calamancy_lg/ner_tlunified.json new file mode 100644 index 0000000..fed31f4 --- /dev/null +++ b/models/v0.2.0/evals/tl_calamancy_lg/ner_tlunified.json @@ -0,0 +1,56 @@ +{ + "tokenizer":{ + "token_acc":1.0, + "token_p":1.0, + "token_r":1.0, + "token_f":1.0 + }, + "tok2vec":{ + + }, + "trainable_lemmatizer":{ + "lemma_acc":null + }, + "tagger":{ + "tag_acc":null + }, + "morphologizer":{ + "pos_acc":null, + "morph_acc":null, + "morph_micro_p":null, + "morph_micro_r":null, + "morph_micro_f":null, + "morph_per_feat":null + }, + "parser":{ + "sents_p":null, + "sents_r":null, + "sents_f":null, + "dep_uas":null, + "dep_las":null, + "dep_las_per_type":null + }, + "ner":{ + "ents_p":0.8705730844, + "ents_r":0.8562381254, + "ents_f":0.8633461047, + "ents_per_type":{ + "LOC":{ + "p":0.8582887701, + "r":0.8381201044, + "f":0.8480845443 + }, + "ORG":{ + "p":0.7520215633, + "r":0.7685950413, + "f":0.7602179837 + }, + "PER":{ + "p":0.9306930693, + "r":0.9027611044, + "f":0.9165143205 + } + } + }, + "speed":5518.7028291652 +} \ No newline at end of file diff --git a/models/v0.2.0/evals/tl_calamancy_lg/ner_uner-trg.json b/models/v0.2.0/evals/tl_calamancy_lg/ner_uner-trg.json new file mode 100644 index 0000000..ae6a4c7 --- /dev/null +++ b/models/v0.2.0/evals/tl_calamancy_lg/ner_uner-trg.json @@ -0,0 +1,51 @@ +{ + "tokenizer":{ + "token_acc":null, + "token_p":null, + "token_r":null, + "token_f":null + }, + "tok2vec":{ + + }, + "trainable_lemmatizer":{ + "lemma_acc":null + }, + "tagger":{ + "tag_acc":null + }, + "morphologizer":{ + "pos_acc":null, + "morph_acc":null, + "morph_micro_p":null, + "morph_micro_r":null, + "morph_micro_f":null, + "morph_per_feat":null + }, + "parser":{ + "sents_p":null, + "sents_r":null, + 
"sents_f":null, + "dep_uas":null, + "dep_las":null, + "dep_las_per_type":null + }, + "ner":{ + "ents_p":1.0, + "ents_r":0.9565217391, + "ents_f":0.9777777778, + "ents_per_type":{ + "PER":{ + "p":1.0, + "r":0.9473684211, + "f":0.972972973 + }, + "LOC":{ + "p":1.0, + "r":1.0, + "f":1.0 + } + } + }, + "speed":190.833354765 +} \ No newline at end of file diff --git a/models/v0.2.0/evals/tl_calamancy_lg/ner_uner-ugnayan.json b/models/v0.2.0/evals/tl_calamancy_lg/ner_uner-ugnayan.json new file mode 100644 index 0000000..33bebed --- /dev/null +++ b/models/v0.2.0/evals/tl_calamancy_lg/ner_uner-ugnayan.json @@ -0,0 +1,56 @@ +{ + "tokenizer":{ + "token_acc":null, + "token_p":null, + "token_r":null, + "token_f":null + }, + "tok2vec":{ + + }, + "trainable_lemmatizer":{ + "lemma_acc":null + }, + "tagger":{ + "tag_acc":null + }, + "morphologizer":{ + "pos_acc":null, + "morph_acc":null, + "morph_micro_p":null, + "morph_micro_r":null, + "morph_micro_f":null, + "morph_per_feat":null + }, + "parser":{ + "sents_p":null, + "sents_r":null, + "sents_f":null, + "dep_uas":null, + "dep_las":null, + "dep_las_per_type":null + }, + "ner":{ + "ents_p":0.6046511628, + "ents_r":0.7878787879, + "ents_f":0.6842105263, + "ents_per_type":{ + "PER":{ + "p":0.0, + "r":0.0, + "f":0.0 + }, + "LOC":{ + "p":0.8666666667, + "r":0.8387096774, + "f":0.8524590164 + }, + "ORG":{ + "p":0.0, + "r":0.0, + "f":0.0 + } + } + }, + "speed":289.2947023435 +} \ No newline at end of file diff --git a/models/v0.2.0/evals/tl_calamancy_md/dep_newscrawl.json b/models/v0.2.0/evals/tl_calamancy_md/dep_newscrawl.json new file mode 100644 index 0000000..f8b627d --- /dev/null +++ b/models/v0.2.0/evals/tl_calamancy_md/dep_newscrawl.json @@ -0,0 +1,287 @@ +{ + "tokenizer":{ + "token_acc":0.9977388853, + "token_p":0.9312562401, + "token_r":0.9697572014, + "token_f":0.9501168434 + }, + "tok2vec":{ + + }, + "trainable_lemmatizer":{ + "lemma_acc":0.9009458039 + }, + "tagger":{ + "tag_acc":0.9085019253 + }, + "morphologizer":{ + 
"pos_acc":0.9499709311, + "morph_acc":0.9534146752, + "morph_micro_p":0.9632699428, + "morph_micro_r":0.926397993, + "morph_micro_f":0.9444742379, + "morph_per_feat":{ + "Aspect":{ + "p":0.9350073855, + "r":0.9001066477, + "f":0.9172251404 + }, + "Mood":{ + "p":0.9492216457, + "r":0.9104159261, + "f":0.9294138995 + }, + "Voice":{ + "p":0.8260066494, + "r":0.7957295374, + "f":0.8105854631 + }, + "Case":{ + "p":0.9936708861, + "r":0.9807496175, + "f":0.9871679713 + }, + "Number":{ + "p":0.9937956204, + "r":0.9912631962, + "f":0.992527793 + }, + "Person":{ + "p":0.99844479, + "r":0.9854182655, + "f":0.9918887601 + }, + "PronType":{ + "p":0.99695586, + "r":0.9483590734, + "f":0.9720504576 + }, + "NumType":{ + "p":0.985915493, + "r":0.8557457213, + "f":0.9162303665 + }, + "Deixis":{ + "p":0.9922928709, + "r":0.8569051581, + "f":0.9196428571 + }, + "Abbr":{ + "p":1.0, + "r":0.0220588235, + "f":0.0431654676 + }, + "Polarity":{ + "p":1.0, + "r":0.98062954, + "f":0.9902200489 + }, + "Typo":{ + "p":1.0, + "r":0.085106383, + "f":0.1568627451 + }, + "Degree":{ + "p":0.9545454545, + "r":0.6, + "f":0.7368421053 + }, + "Clusivity":{ + "p":1.0, + "r":0.9894179894, + "f":0.9946808511 + }, + "PartType":{ + "p":1.0, + "r":0.8055555556, + "f":0.8923076923 + }, + "Polite":{ + "p":1.0, + "r":1.0, + "f":1.0 + } + } + }, + "parser":{ + "sents_p":0.9987212276, + "sents_r":0.9993602047, + "sents_f":0.999040614, + "dep_uas":0.8345386513, + "dep_las":0.7712571247, + "dep_las_per_type":{ + "root":{ + "p":0.851732991, + "r":0.8490083173, + "f":0.8503684716 + }, + "advmod":{ + "p":0.8110850898, + "r":0.8226444972, + "f":0.8168238994 + }, + "case":{ + "p":0.9082693177, + "r":0.8989266547, + "f":0.9035738368 + }, + "compound":{ + "p":0.4714285714, + "r":0.1074918567, + "f":0.175066313 + }, + "obj":{ + "p":0.7445141066, + "r":0.7916666667, + "f":0.7673667205 + }, + "det":{ + "p":0.9482343499, + "r":0.9212475634, + "f":0.9345461736 + }, + "nsubj":{ + "p":0.8073630137, + "r":0.8268303376, + 
"f":0.8169807234 + }, + "nmod:poss":{ + "p":0.7756010686, + "r":0.8383060635, + "f":0.8057354302 + }, + "obl":{ + "p":0.7063989962, + "r":0.6489913545, + "f":0.6764794233 + }, + "cc":{ + "p":0.855, + "r":0.8341463415, + "f":0.8444444444 + }, + "conj":{ + "p":0.6396713615, + "r":0.615819209, + "f":0.6275187104 + }, + "fixed":{ + "p":0.6900826446, + "r":0.7422222222, + "f":0.7152034261 + }, + "amod":{ + "p":0.7519181586, + "r":0.7646293888, + "f":0.7582205029 + }, + "mark":{ + "p":0.799461642, + "r":0.7785058978, + "f":0.7888446215 + }, + "acl:relcl":{ + "p":0.6479791395, + "r":0.668909825, + "f":0.6582781457 + }, + "nmod":{ + "p":0.5643872114, + "r":0.6131210806, + "f":0.5877456647 + }, + "ccomp":{ + "p":0.5508684864, + "r":0.5663265306, + "f":0.558490566 + }, + "advcl":{ + "p":0.471, + "r":0.5607142857, + "f":0.5119565217 + }, + "flat":{ + "p":0.8091647332, + "r":0.8434099154, + "f":0.8259325044 + }, + "discourse":{ + "p":0.7787418655, + "r":0.7557894737, + "f":0.7670940171 + }, + "parataxis":{ + "p":0.2716763006, + "r":0.2865853659, + "f":0.2789317507 + }, + "nummod":{ + "p":0.8320802005, + "r":0.8279301746, + "f":0.83 + }, + "obj:agent":{ + "p":0.8516483516, + "r":0.8469945355, + "f":0.8493150685 + }, + "appos":{ + "p":0.5021834061, + "r":0.614973262, + "f":0.5528846154 + }, + "dep":{ + "p":0.075, + "r":0.1851851852, + "f":0.1067615658 + }, + "xcomp":{ + "p":0.6181102362, + "r":0.6514522822, + "f":0.6343434343 + }, + "compound:redup":{ + "p":0.3684210526, + "r":0.4666666667, + "f":0.4117647059 + }, + "acl":{ + "p":0.1666666667, + "r":0.02, + "f":0.0357142857 + }, + "list":{ + "p":0.1142857143, + "r":0.25, + "f":0.1568627451 + }, + "dislocated":{ + "p":0.4090909091, + "r":0.2278481013, + "f":0.2926829268 + }, + "vocative":{ + "p":0.5, + "r":0.5333333333, + "f":0.5161290323 + }, + "goeswith":{ + "p":0.0, + "r":0.0, + "f":0.0 + }, + "orphan":{ + "p":0.0, + "r":0.0, + "f":0.0 + } + } + }, + "ner":{ + "ents_p":null, + "ents_r":null, + "ents_f":null, + 
"ents_per_type":null + }, + "speed":2256.6370035952 +} \ No newline at end of file diff --git a/models/v0.2.0/evals/tl_calamancy_md/dep_trg.json b/models/v0.2.0/evals/tl_calamancy_md/dep_trg.json new file mode 100644 index 0000000..24bf5fa --- /dev/null +++ b/models/v0.2.0/evals/tl_calamancy_md/dep_trg.json @@ -0,0 +1,262 @@ +{ + "tokenizer":{ + "token_acc":1.0, + "token_p":1.0, + "token_r":1.0, + "token_f":1.0 + }, + "tok2vec":{ + + }, + "trainable_lemmatizer":{ + "lemma_acc":0.7983651226 + }, + "tagger":{ + "tag_acc":0.5817438692 + }, + "morphologizer":{ + "pos_acc":0.7820163488, + "morph_acc":0.7316076294, + "morph_micro_p":0.8465991317, + "morph_micro_r":0.7276119403, + "morph_micro_f":0.7826086957, + "morph_per_feat":{ + "Aspect":{ + "p":0.7555555556, + "r":0.7083333333, + "f":0.7311827957 + }, + "Mood":{ + "p":0.7555555556, + "r":0.7083333333, + "f":0.7311827957 + }, + "Voice":{ + "p":0.8888888889, + "r":0.8080808081, + "f":0.8465608466 + }, + "Case":{ + "p":0.8616071429, + "r":0.8772727273, + "f":0.8693693694 + }, + "Number":{ + "p":0.7638888889, + "r":0.9649122807, + "f":0.8527131783 + }, + "Person":{ + "p":1.0, + "r":1.0, + "f":1.0 + }, + "PronType":{ + "p":0.9818181818, + "r":0.675, + "f":0.8 + }, + "Foreign":{ + "p":0.0, + "r":0.0, + "f":0.0 + }, + "Degree":{ + "p":0.0, + "r":0.0, + "f":0.0 + }, + "Deixis":{ + "p":1.0, + "r":1.0, + "f":1.0 + }, + "Gender":{ + "p":0.0, + "r":0.0, + "f":0.0 + }, + "Link":{ + "p":0.0, + "r":0.0, + "f":0.0 + }, + "Polarity":{ + "p":0.8461538462, + "r":0.6111111111, + "f":0.7096774194 + }, + "Clusivity":{ + "p":1.0, + "r":1.0, + "f":1.0 + }, + "Reflex":{ + "p":0.0, + "r":0.0, + "f":0.0 + }, + "PartType":{ + "p":1.0, + "r":0.2307692308, + "f":0.375 + } + } + }, + "parser":{ + "sents_p":1.0, + "sents_r":1.0, + "sents_f":1.0, + "dep_uas":0.9328914664, + "dep_las":0.6694283347, + "dep_las_per_type":{ + "root":{ + "p":0.96875, + "r":0.96875, + "f":0.96875 + }, + "case":{ + "p":0.9864864865, + "r":0.4124293785, + "f":0.5816733068 + 
}, + "nsubj":{ + "p":0.6666666667, + "r":0.847826087, + "f":0.7464114833 + }, + "det":{ + "p":0.0761904762, + "r":0.7272727273, + "f":0.1379310345 + }, + "obj:agent":{ + "p":0.6857142857, + "r":0.8571428571, + "f":0.7619047619 + }, + "nsubj:pass":{ + "p":0.0, + "r":0.0, + "f":0.0 + }, + "obj":{ + "p":0.7857142857, + "r":0.6666666667, + "f":0.7213114754 + }, + "advmod":{ + "p":0.6666666667, + "r":0.6842105263, + "f":0.6753246753 + }, + "csubj":{ + "p":0.0, + "r":0.0, + "f":0.0 + }, + "nmod:poss":{ + "p":0.25, + "r":1.0, + "f":0.4 + }, + "mark":{ + "p":0.875, + "r":1.0, + "f":0.9333333333 + }, + "compound:redup":{ + "p":0.0, + "r":0.0, + "f":0.0 + }, + "amod":{ + "p":0.0, + "r":0.0, + "f":0.0 + }, + "fixed":{ + "p":0.0, + "r":0.0, + "f":0.0 + }, + "advcl":{ + "p":1.0, + "r":1.0, + "f":1.0 + }, + "obl":{ + "p":0.8518518519, + "r":0.8518518519, + "f":0.8518518519 + }, + "iobj":{ + "p":0.0, + "r":0.0, + "f":0.0 + }, + "nsubj:lfoc":{ + "p":0.0, + "r":0.0, + "f":0.0 + }, + "nmod":{ + "p":0.1111111111, + "r":0.1428571429, + "f":0.125 + }, + "flat":{ + "p":1.0, + "r":1.0, + "f":1.0 + }, + "acl:relcl":{ + "p":1.0, + "r":1.0, + "f":1.0 + }, + "parataxis":{ + "p":0.0, + "r":0.0, + "f":0.0 + }, + "nsubj:ifoc":{ + "p":0.0, + "r":0.0, + "f":0.0 + }, + "ccomp":{ + "p":0.6666666667, + "r":1.0, + "f":0.8 + }, + "aux":{ + "p":0.0, + "r":0.0, + "f":0.0 + }, + "iobj:patient":{ + "p":0.0, + "r":0.0, + "f":0.0 + }, + "nsubj:bfoc":{ + "p":0.0, + "r":0.0, + "f":0.0 + }, + "xcomp":{ + "p":1.0, + "r":0.6666666667, + "f":0.8 + } + } + }, + "ner":{ + "ents_p":null, + "ents_r":null, + "ents_f":null, + "ents_per_type":null + }, + "speed":67.8066666502 +} \ No newline at end of file diff --git a/models/v0.2.0/evals/tl_calamancy_md/dep_ugnayan.json b/models/v0.2.0/evals/tl_calamancy_md/dep_ugnayan.json new file mode 100644 index 0000000..8f1d126 --- /dev/null +++ b/models/v0.2.0/evals/tl_calamancy_md/dep_ugnayan.json @@ -0,0 +1,191 @@ +{ + "tokenizer":{ + "token_acc":0.9950884086, + 
"token_p":0.9774066798, + "token_r":0.9841740851, + "token_f":0.9807787087 + }, + "tok2vec":{ + + }, + "trainable_lemmatizer":{ + "lemma_acc":0.8229475767 + }, + "tagger":{ + "tag_acc":0.4915590864 + }, + "morphologizer":{ + "pos_acc":0.8282025819, + "morph_acc":0.5941381023, + "morph_micro_p":null, + "morph_micro_r":null, + "morph_micro_f":null, + "morph_per_feat":null + }, + "parser":{ + "sents_p":1.0, + "sents_r":1.0, + "sents_f":1.0, + "dep_uas":0.7970183486, + "dep_las":0.5732265446, + "dep_las_per_type":{ + "vocative":{ + "p":0.0, + "r":0.0, + "f":0.0 + }, + "advmod":{ + "p":0.6727272727, + "r":0.7708333333, + "f":0.7184466019 + }, + "mark":{ + "p":0.7391304348, + "r":0.3269230769, + "f":0.4533333333 + }, + "nsubj":{ + "p":0.75, + "r":0.7912087912, + "f":0.7700534759 + }, + "root":{ + "p":0.829787234, + "r":0.829787234, + "f":0.829787234 + }, + "case":{ + "p":0.8046875, + "r":0.591954023, + "f":0.6821192053 + }, + "obl":{ + "p":0.5531914894, + "r":0.5531914894, + "f":0.5531914894 + }, + "nmod":{ + "p":0.6315789474, + "r":0.3956043956, + "f":0.4864864865 + }, + "parataxis":{ + "p":0.2857142857, + "r":0.1818181818, + "f":0.2222222222 + }, + "obj":{ + "p":0.488372093, + "r":0.3818181818, + "f":0.4285714286 + }, + "dislocated":{ + "p":0.0, + "r":0.0, + "f":0.0 + }, + "nmod:poss":{ + "p":0.0, + "r":0.0, + "f":0.0 + }, + "discourse":{ + "p":0.1538461538, + "r":0.5, + "f":0.2352941176 + }, + "advcl":{ + "p":0.2105263158, + "r":0.3636363636, + "f":0.2666666667 + }, + "acl:relcl":{ + "p":0.0, + "r":0.0, + "f":0.0 + }, + "ccomp":{ + "p":0.2222222222, + "r":1.0, + "f":0.3636363636 + }, + "det":{ + "p":0.1891891892, + "r":0.56, + "f":0.2828282828 + }, + "obj:agent":{ + "p":0.0, + "r":0.0, + "f":0.0 + }, + "cc":{ + "p":1.0, + "r":0.9090909091, + "f":0.9523809524 + }, + "conj":{ + "p":1.0, + "r":0.7619047619, + "f":0.8648648649 + }, + "nummod":{ + "p":0.7058823529, + "r":1.0, + "f":0.8275862069 + }, + "amod":{ + "p":0.6764705882, + "r":0.5348837209, + "f":0.5974025974 + }, 
+ "fixed":{ + "p":0.9, + "r":0.6428571429, + "f":0.75 + }, + "acl":{ + "p":0.0, + "r":0.0, + "f":0.0 + }, + "compound":{ + "p":0.0, + "r":0.0, + "f":0.0 + }, + "compound:redup":{ + "p":0.0, + "r":0.0, + "f":0.0 + }, + "xcomp":{ + "p":0.2, + "r":0.3333333333, + "f":0.25 + }, + "dep":{ + "p":0.0, + "r":0.0, + "f":0.0 + }, + "flat":{ + "p":0.3, + "r":1.0, + "f":0.4615384615 + }, + "appos":{ + "p":0.0, + "r":0.0, + "f":0.0 + } + } + }, + "ner":{ + "ents_p":null, + "ents_r":null, + "ents_f":null, + "ents_per_type":null + }, + "speed":60.401745539 +} \ No newline at end of file diff --git a/models/v0.2.0/evals/tl_calamancy_md/ner_tfnerd.json b/models/v0.2.0/evals/tl_calamancy_md/ner_tfnerd.json new file mode 100644 index 0000000..a748912 --- /dev/null +++ b/models/v0.2.0/evals/tl_calamancy_md/ner_tfnerd.json @@ -0,0 +1,56 @@ +{ + "tokenizer":{ + "token_acc":null, + "token_p":null, + "token_r":null, + "token_f":null + }, + "tok2vec":{ + + }, + "trainable_lemmatizer":{ + "lemma_acc":null + }, + "tagger":{ + "tag_acc":null + }, + "morphologizer":{ + "pos_acc":null, + "morph_acc":null, + "morph_micro_p":null, + "morph_micro_r":null, + "morph_micro_f":null, + "morph_per_feat":null + }, + "parser":{ + "sents_p":null, + "sents_r":null, + "sents_f":null, + "dep_uas":null, + "dep_las":null, + "dep_las_per_type":null + }, + "ner":{ + "ents_p":0.6836228288, + "ents_r":0.7077713552, + "ents_f":0.6954875355, + "ents_per_type":{ + "LOC":{ + "p":0.5752212389, + "r":0.7677165354, + "f":0.6576728499 + }, + "PER":{ + "p":0.8238636364, + "r":0.8357348703, + "f":0.8297567954 + }, + "ORG":{ + "p":0.5739130435, + "r":0.3718309859, + "f":0.4512820513 + } + } + }, + "speed":5218.401577242 +} \ No newline at end of file diff --git a/models/v0.2.0/evals/tl_calamancy_md/ner_tlunified.json b/models/v0.2.0/evals/tl_calamancy_md/ner_tlunified.json new file mode 100644 index 0000000..9ea6116 --- /dev/null +++ b/models/v0.2.0/evals/tl_calamancy_md/ner_tlunified.json @@ -0,0 +1,56 @@ +{ + "tokenizer":{ 
+ "token_acc":1.0, + "token_p":1.0, + "token_r":1.0, + "token_f":1.0 + }, + "tok2vec":{ + + }, + "trainable_lemmatizer":{ + "lemma_acc":null + }, + "tagger":{ + "tag_acc":null + }, + "morphologizer":{ + "pos_acc":null, + "morph_acc":null, + "morph_micro_p":null, + "morph_micro_r":null, + "morph_micro_f":null, + "morph_per_feat":null + }, + "parser":{ + "sents_p":null, + "sents_r":null, + "sents_f":null, + "dep_uas":null, + "dep_las":null, + "dep_las_per_type":null + }, + "ner":{ + "ents_p":0.8757206919, + "ents_r":0.8657378087, + "ents_f":0.8707006369, + "ents_per_type":{ + "LOC":{ + "p":0.7995169082, + "r":0.864229765, + "f":0.8306148055 + }, + "ORG":{ + "p":0.802259887, + "r":0.782369146, + "f":0.7921896792 + }, + "PER":{ + "p":0.948297604, + "r":0.9027611044, + "f":0.9249692497 + } + } + }, + "speed":5277.6878703929 +} \ No newline at end of file diff --git a/models/v0.2.0/evals/tl_calamancy_md/ner_uner-trg.json b/models/v0.2.0/evals/tl_calamancy_md/ner_uner-trg.json new file mode 100644 index 0000000..c5cd0ef --- /dev/null +++ b/models/v0.2.0/evals/tl_calamancy_md/ner_uner-trg.json @@ -0,0 +1,51 @@ +{ + "tokenizer":{ + "token_acc":null, + "token_p":null, + "token_r":null, + "token_f":null + }, + "tok2vec":{ + + }, + "trainable_lemmatizer":{ + "lemma_acc":null + }, + "tagger":{ + "tag_acc":null + }, + "morphologizer":{ + "pos_acc":null, + "morph_acc":null, + "morph_micro_p":null, + "morph_micro_r":null, + "morph_micro_f":null, + "morph_per_feat":null + }, + "parser":{ + "sents_p":null, + "sents_r":null, + "sents_f":null, + "dep_uas":null, + "dep_las":null, + "dep_las_per_type":null + }, + "ner":{ + "ents_p":1.0, + "ents_r":0.9565217391, + "ents_f":0.9777777778, + "ents_per_type":{ + "PER":{ + "p":1.0, + "r":0.9473684211, + "f":0.972972973 + }, + "LOC":{ + "p":1.0, + "r":1.0, + "f":1.0 + } + } + }, + "speed":199.5292416132 +} \ No newline at end of file diff --git a/models/v0.2.0/evals/tl_calamancy_md/ner_uner-ugnayan.json 
b/models/v0.2.0/evals/tl_calamancy_md/ner_uner-ugnayan.json new file mode 100644 index 0000000..6c96344 --- /dev/null +++ b/models/v0.2.0/evals/tl_calamancy_md/ner_uner-ugnayan.json @@ -0,0 +1,56 @@ +{ + "tokenizer":{ + "token_acc":null, + "token_p":null, + "token_r":null, + "token_f":null + }, + "tok2vec":{ + + }, + "trainable_lemmatizer":{ + "lemma_acc":null + }, + "tagger":{ + "tag_acc":null + }, + "morphologizer":{ + "pos_acc":null, + "morph_acc":null, + "morph_micro_p":null, + "morph_micro_r":null, + "morph_micro_f":null, + "morph_per_feat":null + }, + "parser":{ + "sents_p":null, + "sents_r":null, + "sents_f":null, + "dep_uas":null, + "dep_las":null, + "dep_las_per_type":null + }, + "ner":{ + "ents_p":0.5897435897, + "ents_r":0.696969697, + "ents_f":0.6388888889, + "ents_per_type":{ + "PER":{ + "p":0.0, + "r":0.0, + "f":0.0 + }, + "LOC":{ + "p":0.7931034483, + "r":0.7419354839, + "f":0.7666666667 + }, + "ORG":{ + "p":0.0, + "r":0.0, + "f":0.0 + } + } + }, + "speed":293.2061470992 +} \ No newline at end of file diff --git a/models/v0.2.0/evals/tl_calamancy_trf/dep_newscrawl.json b/models/v0.2.0/evals/tl_calamancy_trf/dep_newscrawl.json new file mode 100644 index 0000000..64df91b --- /dev/null +++ b/models/v0.2.0/evals/tl_calamancy_trf/dep_newscrawl.json @@ -0,0 +1,287 @@ +{ + "tokenizer":{ + "token_acc":0.9977388853, + "token_p":0.9312562401, + "token_r":0.9697572014, + "token_f":0.9501168434 + }, + "transformer":{ + + }, + "trainable_lemmatizer":{ + "lemma_acc":0.9046060533 + }, + "tagger":{ + "tag_acc":0.9133675109 + }, + "morphologizer":{ + "pos_acc":0.9542517684, + "morph_acc":0.9532372448, + "morph_micro_p":0.9628580946, + "morph_micro_r":0.9356639961, + "morph_micro_f":0.9490662836, + "morph_per_feat":{ + "Aspect":{ + "p":0.9315617037, + "r":0.9097049413, + "f":0.9205035971 + }, + "Mood":{ + "p":0.9489609916, + "r":0.925346605, + "f":0.9370050396 + }, + "Voice":{ + "p":0.8320582878, + "r":0.8128113879, + "f":0.8223222322 + }, + "Case":{ + 
"p":0.9933075933, + "r":0.983936767, + "f":0.9885999744 + }, + "Number":{ + "p":0.9938001459, + "r":0.9919912632, + "f":0.9928948807 + }, + "Person":{ + "p":0.9969040248, + "r":0.9884881044, + "f":0.9926782274 + }, + "PronType":{ + "p":0.996, + "r":0.9613899614, + "f":0.978388998 + }, + "NumType":{ + "p":0.9886845827, + "r":0.8545232274, + "f":0.9167213115 + }, + "Deixis":{ + "p":0.9872262774, + "r":0.9001663894, + "f":0.9416884247 + }, + "Abbr":{ + "p":1.0, + "r":0.1323529412, + "f":0.2337662338 + }, + "Polarity":{ + "p":1.0, + "r":0.9612590799, + "f":0.9802469136 + }, + "Typo":{ + "p":0.8421052632, + "r":0.3404255319, + "f":0.4848484848 + }, + "Degree":{ + "p":0.8965517241, + "r":0.7428571429, + "f":0.8125 + }, + "Clusivity":{ + "p":0.9946808511, + "r":0.9894179894, + "f":0.9920424403 + }, + "PartType":{ + "p":1.0, + "r":1.0, + "f":1.0 + }, + "Polite":{ + "p":1.0, + "r":0.9459459459, + "f":0.9722222222 + } + } + }, + "parser":{ + "sents_p":0.9910714286, + "sents_r":0.9942418426, + "sents_f":0.9926541041, + "dep_uas":0.8509143472, + "dep_las":0.7882964453, + "dep_las_per_type":{ + "root":{ + "p":0.8752399232, + "r":0.8752399232, + "f":0.8752399232 + }, + "advmod":{ + "p":0.8269699431, + "r":0.8060174188, + "f":0.8163592622 + }, + "case":{ + "p":0.9138405133, + "r":0.8917710197, + "f":0.9026708918 + }, + "compound":{ + "p":0.4831460674, + "r":0.2801302932, + "f":0.3546391753 + }, + "obj":{ + "p":0.7543581616, + "r":0.7933333333, + "f":0.7733549959 + }, + "det":{ + "p":0.9561014901, + "r":0.9255360624, + "f":0.940570523 + }, + "nsubj":{ + "p":0.8303264605, + "r":0.8474353354, + "f":0.8387936646 + }, + "nmod:poss":{ + "p":0.7921830315, + "r":0.7998075072, + "f":0.7959770115 + }, + "obl":{ + "p":0.7150442478, + "r":0.6985590778, + "f":0.7067055394 + }, + "cc":{ + "p":0.87375, + "r":0.8524390244, + "f":0.862962963 + }, + "conj":{ + "p":0.7037444934, + "r":0.7220338983, + "f":0.7127718907 + }, + "fixed":{ + "p":0.7533936652, + "r":0.74, + "f":0.7466367713 + }, + 
"amod":{ + "p":0.7456359102, + "r":0.77763329, + "f":0.761298536 + }, + "mark":{ + "p":0.833781965, + "r":0.8119266055, + "f":0.8227091633 + }, + "acl:relcl":{ + "p":0.658974359, + "r":0.6917900404, + "f":0.674983585 + }, + "nmod":{ + "p":0.5852428964, + "r":0.6160154366, + "f":0.6002350176 + }, + "ccomp":{ + "p":0.5534188034, + "r":0.6607142857, + "f":0.6023255814 + }, + "advcl":{ + "p":0.534127844, + "r":0.5869047619, + "f":0.5592739648 + }, + "flat":{ + "p":0.8543087415, + "r":0.8331318017, + "f":0.843587389 + }, + "discourse":{ + "p":0.8162393162, + "r":0.8042105263, + "f":0.8101802757 + }, + "parataxis":{ + "p":0.4043715847, + "r":0.4512195122, + "f":0.4265129683 + }, + "nummod":{ + "p":0.8539042821, + "r":0.8453865337, + "f":0.8496240602 + }, + "obj:agent":{ + "p":0.8138020833, + "r":0.8538251366, + "f":0.8333333333 + }, + "appos":{ + "p":0.5825242718, + "r":0.6417112299, + "f":0.6106870229 + }, + "dep":{ + "p":0.2, + "r":0.3580246914, + "f":0.2566371681 + }, + "list":{ + "p":0.06, + "r":0.1875, + "f":0.0909090909 + }, + "orphan":{ + "p":0.0, + "r":0.0, + "f":0.0 + }, + "compound:redup":{ + "p":0.4736842105, + "r":0.6, + "f":0.5294117647 + }, + "xcomp":{ + "p":0.6497695853, + "r":0.5850622407, + "f":0.615720524 + }, + "dislocated":{ + "p":0.2837837838, + "r":0.2658227848, + "f":0.2745098039 + }, + "acl":{ + "p":0.2142857143, + "r":0.06, + "f":0.09375 + }, + "goeswith":{ + "p":0.0, + "r":0.0, + "f":0.0 + }, + "vocative":{ + "p":0.6666666667, + "r":0.6666666667, + "f":0.6666666667 + } + } + }, + "ner":{ + "ents_p":null, + "ents_r":null, + "ents_f":null, + "ents_per_type":null + }, + "speed":4057.0598802323 +} \ No newline at end of file diff --git a/models/v0.2.0/evals/tl_calamancy_trf/dep_trg.json b/models/v0.2.0/evals/tl_calamancy_trf/dep_trg.json new file mode 100644 index 0000000..d661c15 --- /dev/null +++ b/models/v0.2.0/evals/tl_calamancy_trf/dep_trg.json @@ -0,0 +1,257 @@ +{ + "tokenizer":{ + "token_acc":1.0, + "token_p":1.0, + "token_r":1.0, + 
"token_f":1.0 + }, + "transformer":{ + + }, + "trainable_lemmatizer":{ + "lemma_acc":0.8079019074 + }, + "tagger":{ + "tag_acc":0.583106267 + }, + "morphologizer":{ + "pos_acc":0.7847411444, + "morph_acc":0.7288828338, + "morph_micro_p":0.8404558405, + "morph_micro_r":0.7338308458, + "morph_micro_f":0.7835325365, + "morph_per_feat":{ + "Aspect":{ + "p":0.7340425532, + "r":0.71875, + "f":0.7263157895 + }, + "Mood":{ + "p":0.7234042553, + "r":0.7083333333, + "f":0.7157894737 + }, + "Voice":{ + "p":0.8936170213, + "r":0.8484848485, + "f":0.8704663212 + }, + "Case":{ + "p":0.8616071429, + "r":0.8772727273, + "f":0.8693693694 + }, + "Number":{ + "p":0.7638888889, + "r":0.9649122807, + "f":0.8527131783 + }, + "Person":{ + "p":1.0, + "r":1.0, + "f":1.0 + }, + "PronType":{ + "p":0.9824561404, + "r":0.7, + "f":0.8175182482 + }, + "Foreign":{ + "p":0.0, + "r":0.0, + "f":0.0 + }, + "Degree":{ + "p":0.0, + "r":0.0, + "f":0.0 + }, + "Deixis":{ + "p":1.0, + "r":1.0, + "f":1.0 + }, + "Gender":{ + "p":0.0, + "r":0.0, + "f":0.0 + }, + "Link":{ + "p":0.0, + "r":0.0, + "f":0.0 + }, + "Polarity":{ + "p":0.9, + "r":0.5, + "f":0.6428571429 + }, + "Clusivity":{ + "p":1.0, + "r":1.0, + "f":1.0 + }, + "Reflex":{ + "p":0.0, + "r":0.0, + "f":0.0 + }, + "PartType":{ + "p":1.0, + "r":0.2307692308, + "f":0.375 + } + } + }, + "parser":{ + "sents_p":1.0, + "sents_r":1.0, + "sents_f":1.0, + "dep_uas":0.9494614747, + "dep_las":0.6777133389, + "dep_las_per_type":{ + "root":{ + "p":0.9765625, + "r":0.9765625, + "f":0.9765625 + }, + "case":{ + "p":0.9864864865, + "r":0.4124293785, + "f":0.5816733068 + }, + "nsubj":{ + "p":0.6982758621, + "r":0.8804347826, + "f":0.7788461538 + }, + "det":{ + "p":0.0761904762, + "r":0.7272727273, + "f":0.1379310345 + }, + "obj:agent":{ + "p":0.7222222222, + "r":0.9285714286, + "f":0.8125 + }, + "nsubj:pass":{ + "p":0.0, + "r":0.0, + "f":0.0 + }, + "advmod":{ + "p":0.6428571429, + "r":0.7105263158, + "f":0.675 + }, + "csubj":{ + "p":0.0, + "r":0.0, + "f":0.0 + }, + 
"obj":{ + "p":0.75, + "r":0.7272727273, + "f":0.7384615385 + }, + "nmod:poss":{ + "p":0.2857142857, + "r":1.0, + "f":0.4444444444 + }, + "compound:redup":{ + "p":0.0, + "r":0.0, + "f":0.0 + }, + "ccomp":{ + "p":0.125, + "r":0.5, + "f":0.2 + }, + "fixed":{ + "p":0.0, + "r":0.0, + "f":0.0 + }, + "mark":{ + "p":1.0, + "r":0.8571428571, + "f":0.9230769231 + }, + "advcl":{ + "p":0.8571428571, + "r":1.0, + "f":0.9230769231 + }, + "obl":{ + "p":0.8928571429, + "r":0.9259259259, + "f":0.9090909091 + }, + "iobj":{ + "p":0.0, + "r":0.0, + "f":0.0 + }, + "nsubj:lfoc":{ + "p":0.0, + "r":0.0, + "f":0.0 + }, + "flat":{ + "p":1.0, + "r":1.0, + "f":1.0 + }, + "nmod":{ + "p":0.25, + "r":0.1428571429, + "f":0.1818181818 + }, + "acl:relcl":{ + "p":1.0, + "r":1.0, + "f":1.0 + }, + "nsubj:ifoc":{ + "p":0.0, + "r":0.0, + "f":0.0 + }, + "aux":{ + "p":0.0, + "r":0.0, + "f":0.0 + }, + "iobj:patient":{ + "p":0.0, + "r":0.0, + "f":0.0 + }, + "nsubj:bfoc":{ + "p":0.0, + "r":0.0, + "f":0.0 + }, + "dep":{ + "p":0.0, + "r":0.0, + "f":0.0 + }, + "xcomp":{ + "p":0.0, + "r":0.0, + "f":0.0 + } + } + }, + "ner":{ + "ents_p":null, + "ents_r":null, + "ents_f":null, + "ents_per_type":null + }, + "speed":116.3529955143 +} \ No newline at end of file diff --git a/models/v0.2.0/evals/tl_calamancy_trf/dep_ugnayan.json b/models/v0.2.0/evals/tl_calamancy_trf/dep_ugnayan.json new file mode 100644 index 0000000..4f736f2 --- /dev/null +++ b/models/v0.2.0/evals/tl_calamancy_trf/dep_ugnayan.json @@ -0,0 +1,176 @@ +{ + "tokenizer":{ + "token_acc":0.9950884086, + "token_p":0.9774066798, + "token_r":0.9841740851, + "token_f":0.9807787087 + }, + "transformer":{ + + }, + "trainable_lemmatizer":{ + "lemma_acc":0.8255066733 + }, + "tagger":{ + "tag_acc":0.4848484848 + }, + "morphologizer":{ + "pos_acc":0.8220675944, + "morph_acc":0.5874751491, + "morph_micro_p":null, + "morph_micro_r":null, + "morph_micro_f":null, + "morph_per_feat":null + }, + "parser":{ + "sents_p":0.9684210526, + "sents_r":0.9787234043, + 
"sents_f":0.9735449735, + "dep_uas":0.8078026391, + "dep_las":0.5861476817, + "dep_las_per_type":{ + "vocative":{ + "p":1.0, + "r":1.0, + "f":1.0 + }, + "advmod":{ + "p":0.6101694915, + "r":0.75, + "f":0.6728971963 + }, + "mark":{ + "p":0.8181818182, + "r":0.3461538462, + "f":0.4864864865 + }, + "nsubj":{ + "p":0.7604166667, + "r":0.8021978022, + "f":0.7807486631 + }, + "root":{ + "p":0.8541666667, + "r":0.8723404255, + "f":0.8631578947 + }, + "case":{ + "p":0.7846153846, + "r":0.5862068966, + "f":0.6710526316 + }, + "obl":{ + "p":0.5882352941, + "r":0.6382978723, + "f":0.612244898 + }, + "nmod":{ + "p":0.6, + "r":0.3956043956, + "f":0.4768211921 + }, + "parataxis":{ + "p":0.625, + "r":0.4545454545, + "f":0.5263157895 + }, + "obj":{ + "p":0.5135135135, + "r":0.3454545455, + "f":0.4130434783 + }, + "nmod:poss":{ + "p":0.0, + "r":0.0, + "f":0.0 + }, + "obj:agent":{ + "p":0.0, + "r":0.0, + "f":0.0 + }, + "discourse":{ + "p":0.1428571429, + "r":0.5, + "f":0.2222222222 + }, + "advcl":{ + "p":0.4285714286, + "r":0.5454545455, + "f":0.48 + }, + "ccomp":{ + "p":0.125, + "r":1.0, + "f":0.2222222222 + }, + "det":{ + "p":0.1764705882, + "r":0.48, + "f":0.2580645161 + }, + "cc":{ + "p":1.0, + "r":0.9090909091, + "f":0.9523809524 + }, + "conj":{ + "p":0.7777777778, + "r":0.6666666667, + "f":0.7179487179 + }, + "nummod":{ + "p":0.7058823529, + "r":1.0, + "f":0.8275862069 + }, + "amod":{ + "p":0.6842105263, + "r":0.6046511628, + "f":0.6419753086 + }, + "fixed":{ + "p":1.0, + "r":0.5714285714, + "f":0.7272727273 + }, + "acl":{ + "p":0.0, + "r":0.0, + "f":0.0 + }, + "acl:relcl":{ + "p":0.0, + "r":0.0, + "f":0.0 + }, + "compound":{ + "p":0.0, + "r":0.0, + "f":0.0 + }, + "compound:redup":{ + "p":0.0, + "r":0.0, + "f":0.0 + }, + "xcomp":{ + "p":0.2, + "r":0.3333333333, + "f":0.25 + }, + "flat":{ + "p":0.375, + "r":1.0, + "f":0.5454545455 + } + } + }, + "ner":{ + "ents_p":null, + "ents_r":null, + "ents_f":null, + "ents_per_type":null + }, + "speed":150.6587251329 +} \ No newline at end 
of file diff --git a/models/v0.2.0/evals/tl_calamancy_trf/ner_tfnerd.json b/models/v0.2.0/evals/tl_calamancy_trf/ner_tfnerd.json new file mode 100644 index 0000000..3c638b5 --- /dev/null +++ b/models/v0.2.0/evals/tl_calamancy_trf/ner_tfnerd.json @@ -0,0 +1,56 @@ +{ + "tokenizer":{ + "token_acc":null, + "token_p":null, + "token_r":null, + "token_f":null + }, + "transformer":{ + + }, + "trainable_lemmatizer":{ + "lemma_acc":null + }, + "tagger":{ + "tag_acc":null + }, + "morphologizer":{ + "pos_acc":null, + "morph_acc":null, + "morph_micro_p":null, + "morph_micro_r":null, + "morph_micro_f":null, + "morph_per_feat":null + }, + "parser":{ + "sents_p":null, + "sents_r":null, + "sents_f":null, + "dep_uas":null, + "dep_las":null, + "dep_las_per_type":null + }, + "ner":{ + "ents_p":0.7227665706, + "ents_r":0.8053949904, + "ents_f":0.7618469016, + "ents_per_type":{ + "LOC":{ + "p":0.6533996683, + "r":0.7755905512, + "f":0.7092709271 + }, + "PER":{ + "p":0.8344283837, + "r":0.9149855908, + "f":0.8728522337 + }, + "ORG":{ + "p":0.6064690027, + "r":0.6338028169, + "f":0.6198347107 + } + } + }, + "speed":2934.5668231186 +} \ No newline at end of file diff --git a/models/v0.2.0/evals/tl_calamancy_trf/ner_tlunified.json b/models/v0.2.0/evals/tl_calamancy_trf/ner_tlunified.json new file mode 100644 index 0000000..740093f --- /dev/null +++ b/models/v0.2.0/evals/tl_calamancy_trf/ner_tlunified.json @@ -0,0 +1,56 @@ +{ + "tokenizer":{ + "token_acc":1.0, + "token_p":1.0, + "token_r":1.0, + "token_f":1.0 + }, + "transformer":{ + + }, + "trainable_lemmatizer":{ + "lemma_acc":null + }, + "tagger":{ + "tag_acc":null + }, + "morphologizer":{ + "pos_acc":null, + "morph_acc":null, + "morph_micro_p":null, + "morph_micro_r":null, + "morph_micro_f":null, + "morph_per_feat":null + }, + "parser":{ + "sents_p":null, + "sents_r":null, + "sents_f":null, + "dep_uas":null, + "dep_las":null, + "dep_las_per_type":null + }, + "ner":{ + "ents_p":0.8697478992, + "ents_r":0.917669411, + 
"ents_f":0.8930662558, + "ents_per_type":{ + "LOC":{ + "p":0.8624078624, + "r":0.9164490862, + "f":0.8886075949 + }, + "ORG":{ + "p":0.7706855792, + "r":0.8980716253, + "f":0.8295165394 + }, + "PER":{ + "p":0.9234449761, + "r":0.9267707083, + "f":0.9251048532 + } + } + }, + "speed":3185.8248123582 +} \ No newline at end of file diff --git a/models/v0.2.0/evals/tl_calamancy_trf/ner_uner-trg.json b/models/v0.2.0/evals/tl_calamancy_trf/ner_uner-trg.json new file mode 100644 index 0000000..6745d50 --- /dev/null +++ b/models/v0.2.0/evals/tl_calamancy_trf/ner_uner-trg.json @@ -0,0 +1,51 @@ +{ + "tokenizer":{ + "token_acc":null, + "token_p":null, + "token_r":null, + "token_f":null + }, + "transformer":{ + + }, + "trainable_lemmatizer":{ + "lemma_acc":null + }, + "tagger":{ + "tag_acc":null + }, + "morphologizer":{ + "pos_acc":null, + "morph_acc":null, + "morph_micro_p":null, + "morph_micro_r":null, + "morph_micro_f":null, + "morph_per_feat":null + }, + "parser":{ + "sents_p":null, + "sents_r":null, + "sents_f":null, + "dep_uas":null, + "dep_las":null, + "dep_las_per_type":null + }, + "ner":{ + "ents_p":0.575, + "ents_r":1.0, + "ents_f":0.7301587302, + "ents_per_type":{ + "PER":{ + "p":0.5277777778, + "r":1.0, + "f":0.6909090909 + }, + "LOC":{ + "p":1.0, + "r":1.0, + "f":1.0 + } + } + }, + "speed":122.4830551364 +} \ No newline at end of file diff --git a/models/v0.2.0/evals/tl_calamancy_trf/ner_uner-ugnayan.json b/models/v0.2.0/evals/tl_calamancy_trf/ner_uner-ugnayan.json new file mode 100644 index 0000000..7bfdf8f --- /dev/null +++ b/models/v0.2.0/evals/tl_calamancy_trf/ner_uner-ugnayan.json @@ -0,0 +1,56 @@ +{ + "tokenizer":{ + "token_acc":null, + "token_p":null, + "token_r":null, + "token_f":null + }, + "transformer":{ + + }, + "trainable_lemmatizer":{ + "lemma_acc":null + }, + "tagger":{ + "tag_acc":null + }, + "morphologizer":{ + "pos_acc":null, + "morph_acc":null, + "morph_micro_p":null, + "morph_micro_r":null, + "morph_micro_f":null, + "morph_per_feat":null + }, + 
"parser":{ + "sents_p":null, + "sents_r":null, + "sents_f":null, + "dep_uas":null, + "dep_las":null, + "dep_las_per_type":null + }, + "ner":{ + "ents_p":0.6363636364, + "ents_r":0.8484848485, + "ents_f":0.7272727273, + "ents_per_type":{ + "PER":{ + "p":0.0, + "r":0.0, + "f":0.0 + }, + "LOC":{ + "p":0.9032258065, + "r":0.9032258065, + "f":0.9032258065 + }, + "ORG":{ + "p":0.0, + "r":0.0, + "f":0.0 + } + } + }, + "speed":205.6178333829 +} \ No newline at end of file diff --git a/models/v0.2.0/meta.json b/models/v0.2.0/meta.json new file mode 100644 index 0000000..965390f --- /dev/null +++ b/models/v0.2.0/meta.json @@ -0,0 +1,47 @@ +{ + "name": "tl_calamancy", + "lang": "tl", + "version": "0.1.0", + "spacy_version": ">=3.8.3", + "parent_package": "spacy", + "requirements": [ + "spacy-transformers==1.3.5" + ], + "description": "calamanCy: Tagalog NLP pipelines in spaCy", + "author": "Lester James V. Miranda", + "email": "ljvmiranda@gmail.com", + "url": "https://github.com/ljvmiranda921/calamanCy", + "license": "MIT", + "sources": [ + { + "name": "TLUnified NER Dataset", + "license": "GNU GPL 3.0", + "author": "Lester James V. Miranda", + "url": "https://aclanthology.org/2023.sealp-1.2/" + }, + { + "name": "UD NewsCrawl", + "license": "CC BY-SA 3.0", + "author": "Angelina Aquino and Lester James V. Miranda and Elsie Or", + "url": "https://huggingface.co/datasets/UD-Filipino/UD_Tagalog-NewsCrawl" + }, + { + "name": "TLUnified dataset", + "license": "GNU GPL 3.0", + "author": "Jan Christian Blaise Cruz and Charibeth Cheng", + "url": "https://aclanthology.org/2022.lrec-1.703/" + }, + { + "name": "UD_Tagalog-TRG", + "license": "CC BY-SA 3.0", + "author": "Stephanie Samson, Daniel Zeman, and Mary Ann C. 
Tan", + "url": "https://universaldependencies.org/treebanks/tl_trg/index.html" + }, + { + "name": "UD_Tagalog-Ugnayan", + "license": "CC BY-NC-SA 4.0", + "author": "Angelina Aquino", + "url": "https://universaldependencies.org/treebanks/tl_ugnayan/index.html" + } + ] +} \ No newline at end of file diff --git a/models/v0.2.0/project.yml b/models/v0.2.0/project.yml new file mode 100644 index 0000000..23a3834 --- /dev/null +++ b/models/v0.2.0/project.yml @@ -0,0 +1,425 @@ +title: "Release v0.2.0" +description: | + This is a spaCy project that trains the v0.2.0 models for calamanCy. + Here are some of the major changes in this release: + + - **Included trainable lemmatizer in the pipeline**: instead of a rules-based + lemmatizer, we are now using the [neural edit-tree + lemmatizer](https://explosion.ai/blog/edit-tree-lemmatizer). + - **Trained on UD-NewsCrawl**: this is a major update, as we are now training + our parser, tagger, and morphologizer components on the larger + [UD-NewsCrawl](https://huggingface.co/datasets/UD-Filipino/UD_Tagalog-NewsCrawl) + treebank. Our training dataset has now increased from 150+ to 15,000! From + this point forward, we will be using the UD-TRG and UD-Ugnayan treebanks as + test sets (as intended). + - **Better evaluations**: Aside from evaluating our dependency parser and POS tagger on UD-TRG and UD-Ugnayan, we have also included Universal NER ([Mayhew et al., 2023](https://arxiv.org/abs/2311.09122)) as our test set for evaluating the NER component. + - **Improved base model for tl_calamancy_trf**: Based on internal evaluations, we are now using [mDeBERTa-v3 (base)](https://huggingface.co/microsoft/mdeberta-v3-base) as our source of context-sensitive vectors for tl_calamancy_trf. + - **Simpler pipelines, no more pretraining**: We found that pretraining doesn't really offer huge performance gains (0-1%) given the huge effort and time needed to do it. 
Hence, for ease of training the whole pipeline, we removed it from the calamanCy recipe. + + The namespaces for the latest models remain the same. + The legacy models will have an explicit version number in their HuggingFace repositories. + Please see [this HuggingFace collection](https://huggingface.co/collections/ljvmiranda921/calamancy-models-for-tagalog-nlp-65629cc46ef2a1d0f9605c87) for more information. + + ## Set-up + + You can use this project to replicate the pipelines shipped by the project. + First, you need to install the required dependencies: + + ```sh + pip install -r requirements.txt + ``` + + Then run the set-up commands: + + ```sh + python -m spacy project assets + python -m spacy project run setup + ``` + + This step downloads all assets and prepares all the datasets and binaries for + training use. For example, if you want to train `tl_calamancy_md`, run the following command: + + ```sh + MODEL=tl_calamancy_md scripts/train.sh + ``` + + + ## Model information + + The table below shows an overview of the calamanCy models in this project. For more information, + I suggest checking the [language pipeline metadata](https://spacy.io/api/language#meta). + + + | Model | Pipelines | Description | + |-----------------------------|---------------------------------------------|--------------------------------------------------------------------------------------------------------------| + | tl_calamancy_md (214 MB) | tok2vec, tagger, trainable_lemmatizer, morphologizer, parser, ner | CPU-optimized Tagalog NLP model. Pretrained using the TLUnified dataset. Using floret vectors (50k keys) | + | tl_calamancy_lg (482 MB) | tok2vec, tagger, trainable_lemmatizer, morphologizer, parser, ner | CPU-optimized large Tagalog NLP model. Pretrained using the TLUnified dataset. Using fastText vectors (714k) | + | tl_calamancy_trf (1.7 GB) | transformer, tagger, trainable_lemmatizer, morphologizer, parser, ner | GPU-optimized transformer Tagalog NLP model.
Uses mdeberta-v3-base as context vectors. | + +vars: + # Versioning + version: 0.2.0 + # Training + lang: "tl" + gpu_id: 0 + vectors: "" + size: "" + # Evaluation + eval_model: "" + +directories: + - "assets" + - "configs" + - "corpus" + - "models" + - "packages" + - "scripts" + - "training" + - "evals" + - "vectors" + +assets: + - dest: "assets/tlunified_raw_text.txt" + description: "Pre-converted raw text from TLUnified in JSONL format (1.1 GB)." + url: "https://storage.googleapis.com/ljvmiranda/calamanCy/tlunified_raw_text.txt" + - dest: assets/corpus.tar.gz + description: "Annotated TLUnified corpora in spaCy format with train, dev, and test splits." + url: "https://storage.googleapis.com/ljvmiranda/calamanCy/tl_tlunified_gold/v1.0/corpus.tar.gz" + - dest: assets/tl_newscrawl-ud-train.conllu + description: "Train dataset for NewsCrawl" + url: https://raw.githubusercontent.com/UniversalDependencies/UD_Tagalog-NewsCrawl/refs/heads/dev/tl_newscrawl-ud-train.conllu + - dest: assets/tl_newscrawl-ud-dev.conllu + description: "Dev dataset for NewsCrawl" + url: https://raw.githubusercontent.com/UniversalDependencies/UD_Tagalog-NewsCrawl/refs/heads/dev/tl_newscrawl-ud-dev.conllu + - dest: assets/tl_newscrawl-ud-test.conllu + description: "Test dataset for NewsCrawl" + url: https://raw.githubusercontent.com/UniversalDependencies/UD_Tagalog-NewsCrawl/refs/heads/dev/tl_newscrawl-ud-test.conllu + - dest: assets/tl_trg-ud-test.conllu + description: "Test dataset for TRG" + url: https://raw.githubusercontent.com/UniversalDependencies/UD_Tagalog-TRG/refs/heads/master/tl_trg-ud-test.conllu + - dest: assets/tl_ugnayan-ud-test.conllu + description: "Test dataset for Ugnayan" + url: https://raw.githubusercontent.com/UniversalDependencies/UD_Tagalog-Ugnayan/refs/heads/master/tl_ugnayan-ud-test.conllu + - dest: assets/uner_trg.iob2 + description: "Test dataset for Universal NER TRG" + url: 
https://raw.githubusercontent.com/UniversalNER/UNER_Tagalog-TRG/refs/heads/master/tl_trg-ud-test.iob2 + - dest: assets/uner_ugnayan.iob2 + description: "Test dataset for Universal NER Ugnayan" + url: https://raw.githubusercontent.com/UniversalNER/UNER_Tagalog-Ugnayan/refs/heads/master/tl_ugnayan-ud-test.iob2 + - dest: assets/tfnerd.txt + description: "Test dataset for TF-NERD" + url: https://huggingface.co/datasets/rkramos/tfnerd/raw/main/txt/test.txt + - dest: "assets/fasttext.tl.gz" + description: "Tagalog fastText vectors provided from the fastText website (trained from CommonCrawl and Wikipedia)." + url: "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.tl.300.vec.gz" + - dest: "assets/floret" + description: "Floret repository for training floret and fastText models." + git: + repo: "https://github.com/explosion/floret" + branch: "main" + path: "" + +workflows: + setup: + - "setup-finetuning-data" + - "setup-fasttext-vectors" + - "build-floret" + - "train-vectors-md" + tl-calamancy: + - "train-parser" + - "train-ner" + - "assemble" + tl-calamancy-trf: + - "train-parser-trf" + - "train-ner-trf" + - "assemble-trf" + evaluate: + - "setup-eval-data" + - "evaluate-model" + +commands: + - name: "setup-finetuning-data" + help: "Prepare the Tagalog corpora used for training various spaCy components" + script: + # ner: Extract Tagalog corpora + - mkdir -p corpus/ner + - "tar -xzvf assets/corpus.tar.gz -C corpus/ner" + # parser, tagger, morph: Convert treebank into spaCy format + - mkdir -p corpus/treebank + - >- + python -m spacy convert + assets/tl_newscrawl-ud-train.conllu corpus/treebank + --converter conllu + --morphology + --merge-subtokens + - >- + python -m spacy convert + assets/tl_newscrawl-ud-dev.conllu corpus/treebank + --converter conllu + --morphology + --merge-subtokens + - >- + python -m spacy convert + assets/tl_newscrawl-ud-test.conllu corpus/treebank + --converter conllu + --n-sents 1 + --morphology + --merge-subtokens + - >- + python -m spacy 
convert + assets/tl_ugnayan-ud-test.conllu corpus/treebank + --converter conllu + --n-sents 1 + --morphology + --merge-subtokens + - >- + python -m spacy convert + assets/tl_trg-ud-test.conllu corpus/treebank + --converter conllu + --n-sents 1 + --morphology + --merge-subtokens + deps: + - assets/corpus.tar.gz + - assets/tl_newscrawl-ud-train.conllu + - assets/tl_newscrawl-ud-dev.conllu + - assets/tl_newscrawl-ud-test.conllu + - assets/tl_ugnayan-ud-test.conllu + - assets/tl_trg-ud-test.conllu + outputs: + - corpus/ner/train.spacy + - corpus/ner/dev.spacy + - corpus/ner/test.spacy + - corpus/treebank/tl_newscrawl-ud-train.spacy + - corpus/treebank/tl_newscrawl-ud-dev.spacy + - corpus/treebank/tl_newscrawl-ud-test.spacy + - corpus/treebank/tl_ugnayan-ud-test.spacy + - corpus/treebank/tl_trg-ud-test.spacy + + - name: "setup-fasttext-vectors" + help: "Make fastText vectors spaCy compatible" + script: + - gzip -d -f assets/fasttext.tl.gz + - mkdir -p vectors/fasttext-tl + - >- + python -m spacy init vectors + tl assets/fasttext.tl vectors/fasttext-tl + deps: + - assets/fasttext.tl.gz + outputs: + - vectors/fasttext-tl + + - name: "build-floret" + help: "Build floret binary for training fastText / floret vectors" + script: + - make -C assets/floret + - chmod +x assets/floret/floret + deps: + - assets/floret + outputs: + - assets/floret/floret + + - name: "train-vectors-md" + help: "Train medium-sized word vectors (200 dims, 200k keys) using the floret binary." 
+ script: + - mkdir -p assets/vectors/floret-tl-md/ + - >- + assets/floret/floret skipgram + -input assets/tlunified_raw_text.txt + -output assets/vectors/floret-tl-md/vectors + -dim 200 + -minn 3 + -maxn 5 + -mode floret + -hashCount 2 + -bucket 200000 + - mkdir -p vectors/floret-tl-md + - >- + python -m spacy init vectors + tl assets/vectors/floret-tl-md/vectors.floret vectors/floret-tl-md + --mode floret + deps: + - assets/floret/floret + outputs: + - vectors/floret-tl-md + + - name: "train-parser" + help: "Train a trainable_lemmatizer, parser, tagger, and morphologizer using the Universal Dependencies treebanks" + script: + - >- + python -m spacy train + configs/parser.cfg + --output training/parser_${vars.size}/ + --nlp.lang ${vars.lang} + --paths.train corpus/treebank/tl_newscrawl-ud-train.spacy + --paths.dev corpus/treebank/tl_newscrawl-ud-dev.spacy + --paths.vectors ${vars.vectors} + --gpu-id ${vars.gpu_id} + deps: + - corpus/treebank/tl_newscrawl-ud-train.spacy + - corpus/treebank/tl_newscrawl-ud-dev.spacy + - ${vars.vectors} + outputs: + - training/parser_${vars.size}/model-best + + - name: "train-parser-trf" + help: "Train a trainable_lemmatizer, parser, tagger, and morphologizer using the Universal Dependencies treebanks" + script: + - >- + python -m spacy train + configs/parser_trf.cfg + --output training/parser_trf/ + --nlp.lang ${vars.lang} + --components.transformer.model.name microsoft/mdeberta-v3-base + --paths.train corpus/treebank/tl_newscrawl-ud-train.spacy + --paths.dev corpus/treebank/tl_newscrawl-ud-dev.spacy + --gpu-id ${vars.gpu_id} + deps: + - corpus/treebank/tl_newscrawl-ud-train.spacy + - corpus/treebank/tl_newscrawl-ud-dev.spacy + outputs: + - training/parser_trf/model-best + + - name: "train-ner" + help: "Train ner component" + script: + - >- + python -m spacy train + configs/ner.cfg + --nlp.lang tl + --output training/ner_${vars.size}/ + --paths.train corpus/ner/train.spacy + --paths.dev corpus/ner/dev.spacy + --paths.vectors 
${vars.vectors} + --gpu-id ${vars.gpu_id} + deps: + - corpus/ner/train.spacy + - corpus/ner/dev.spacy + - ${vars.vectors} + outputs: + - training/ner_${vars.size}/model-best + + - name: "train-ner-trf" + help: "Train ner component" + script: + - >- + python -m spacy train + configs/ner_trf.cfg + --nlp.lang tl + --output training/ner_trf/ + --components.transformer.model.name microsoft/mdeberta-v3-base + --paths.train corpus/ner/train.spacy + --paths.dev corpus/ner/dev.spacy + --gpu-id ${vars.gpu_id} + deps: + - corpus/ner/train.spacy + - corpus/ner/dev.spacy + outputs: + - training/ner_trf/model-best + + - name: "assemble" + help: "Assemble pipelines to create a single spaCy piepline" + script: + - >- + python -m spacy assemble configs/assemble.cfg models/tl_calamancy_${vars.size} + --paths.parser_model training/parser_${vars.size}/model-best + --paths.ner_model training/ner_${vars.size}/model-best + - >- + python -m spacy package models/tl_calamancy_${vars.size} packages/ + --meta ./meta.json + --name calamancy_${vars.size} + --version ${vars.version} + --build sdist,wheel + --force + deps: + - training/parser_${vars.size}/model-best + - training/ner_${vars.size}/model-best + + - name: "assemble-trf" + help: "Assemble pipelines to create a single spaCy piepline" + script: + - >- + python -m spacy assemble configs/assemble_trf.cfg models/tl_calamancy_trf + --paths.parser_model training/parser_trf/model-best + --paths.ner_model training/ner_trf/model-best + - >- + python -m spacy package models/tl_calamancy_trf packages/ + --meta ./meta.json + --name calamancy_trf + --version ${vars.version} + --build sdist,wheel + --force + deps: + - training/parser_trf/model-best + - training/ner_trf/model-best + + - name: "setup-eval-data" + help: "Convert remaining test datasets" + script: + - python -m scripts.convert assets/uner_trg.iob2 corpus/ner/uner-trg-test.spacy --source uner + - python -m scripts.convert assets/uner_ugnayan.iob2 corpus/ner/uner-ugnayan-test.spacy 
--source uner + - python -m scripts.convert assets/tfnerd.txt corpus/ner/tfnerd-test.spacy --source tfnerd + outputs: + - corpus/ner/uner-trg-test.spacy + - corpus/ner/uner-ugnayan-test.spacy + - corpus/ner/tfnerd-test.spacy + + - name: "evaluate-model" + help: "Evaluate a model" + script: + - mkdir -p evals/${vars.eval_model} + - >- + python -m pip install + https://huggingface.co/ljvmiranda921/${vars.eval_model}/resolve/main/${vars.eval_model}-any-py3-none-any.whl + # Dependency parsing (TRG, Ugnayan, NewsCrawl-Test) + - >- + python -m spacy evaluate + ${vars.eval_model} corpus/treebank/tl_ugnayan-ud-test.spacy + --output evals/${vars.eval_model}/dep_ugnayan.json + --gpu-id ${vars.gpu_id} + --per-component + - >- + python -m spacy evaluate + ${vars.eval_model} corpus/treebank/tl_trg-ud-test.spacy + --output evals/${vars.eval_model}/dep_trg.json + --gpu-id ${vars.gpu_id} + --per-component + - >- + python -m spacy evaluate + ${vars.eval_model} corpus/treebank/tl_newscrawl-ud-test.spacy + --output evals/${vars.eval_model}/dep_newscrawl.json + --gpu-id ${vars.gpu_id} + --per-component + # Named Entity Recognition (TLUnified-NER test, Universal NER filipino, TF-Nerd) + - >- + python -m spacy evaluate + ${vars.eval_model} corpus/ner/test.spacy + --output evals/${vars.eval_model}/ner_tlunified.json + --gpu-id ${vars.gpu_id} + --per-component + - >- + python -m spacy evaluate + ${vars.eval_model} corpus/ner/uner-ugnayan-test.spacy + --output evals/${vars.eval_model}/ner_uner-ugnayan.json + --gpu-id ${vars.gpu_id} + --per-component + - >- + python -m spacy evaluate + ${vars.eval_model} corpus/ner/uner-trg-test.spacy + --output evals/${vars.eval_model}/ner_uner-trg.json + --gpu-id ${vars.gpu_id} + --per-component + - >- + python -m spacy evaluate + ${vars.eval_model} corpus/ner/tfnerd-test.spacy + --output evals/${vars.eval_model}/ner-tfnerd.json + --gpu-id ${vars.gpu_id} + --per-component + deps: + - corpus/treebank/tl_newscrawl-ud-test.spacy + - 
from pathlib import Path
from typing import Iterable, Optional

import spacy
import typer
from spacy.tokens import Doc, DocBin, Span
from wasabi import msg


def _read_uner(lines: Iterable[str]) -> tuple[list[list[str]], list[list[str]]]:
    """Parse Universal NER IOB2 lines into per-sentence tokens and tags.

    A new sentence starts at every ``# text =`` comment. Token rows are
    tab-separated and are read as ``<index> <word> <label> ...``; rows with
    fewer than three columns are skipped because they carry no label.
    """
    texts: list[list[str]] = []
    labels: list[list[str]] = []
    tokens: list[str] = []
    tags: list[str] = []

    for raw in lines:
        line = raw.strip()
        if line.startswith("# text ="):
            # Sentence boundary: flush whatever was collected so far.
            if tokens:
                texts.append(tokens)
                labels.append(tags)
                tokens, tags = [], []
        elif line and not line.startswith("#"):
            parts = line.split("\t")
            # Columns 1 and 2 are read below, so at least three are required.
            if len(parts) >= 3:
                tokens.append(parts[1])
                tags.append(parts[2])

    if tokens:
        texts.append(tokens)
        labels.append(tags)
    return texts, labels


def _read_tfnerd(lines: Iterable[str]) -> tuple[list[list[str]], list[list[str]]]:
    """Parse TF-NERD lines (``<word> <label>``, blank line between sentences)."""
    texts: list[list[str]] = []
    labels: list[list[str]] = []
    tokens: list[str] = []
    tags: list[str] = []

    for raw in lines:
        line = raw.strip()
        if line:
            parts = line.split(" ")
            if len(parts) >= 2:
                tokens.append(parts[0])
                tags.append(parts[1])
        elif tokens:
            # Blank line closes the sentence; repeated blank lines are ignored
            # so that no empty documents are produced.
            texts.append(tokens)
            labels.append(tags)
            tokens, tags = [], []

    # The file may not end with a blank line: keep the trailing sentence.
    if tokens:
        texts.append(tokens)
        labels.append(tags)
    return texts, labels


def convert(
    # fmt: off
    infile: Path = typer.Argument(..., help="Path to input file to convert."),
    outfile: Path = typer.Argument(..., help="Path to save the converted DocBin in .spacy format."),
    source: Optional[str] = typer.Option(None, "--source", help="Source of the dataset in order to determine how it will be converted.")
    # fmt: on
):
    """Convert an NER test set into a spaCy DocBin.

    ``source`` selects the input format: ``uner`` (tab-separated IOB2) or
    ``tfnerd`` (space-separated ``word label`` pairs). Any other value is
    reported through ``msg.fail`` with ``exits=1``.
    """
    if source == "uner":
        with infile.open("r", encoding="utf-8") as file:
            texts, labels = _read_uner(file)
    elif source == "tfnerd":
        with infile.open("r", encoding="utf-8") as file:
            texts, labels = _read_tfnerd(file)
    else:
        msg.fail(f"Unknown source: {source}", exits=1)

    # Perform conversion to DocBin
    msg.info(f"Converting texts from {infile} to spaCy Doc objects (len={len(texts)})")
    docs = [make_doc(tokens, label) for tokens, label in zip(texts, labels)]

    # Save docbin to outfile
    doc_bin = DocBin(docs=docs)
    doc_bin.to_disk(outfile)
    msg.good(f"Saved {len(docs)} documents to {outfile}!")


def _normalize_entity(entity: str) -> str:
    """Upper-case and truncate a label to three characters; map GPE to LOC."""
    entity = entity.upper()[:3]
    return "LOC" if entity == "GPE" else entity


def make_doc(
    tokens: list[str],
    labels: list[str],
    allow_labels=("PER", "ORG", "LOC"),
) -> Doc:
    """Build a Doc from parallel token / IOB-tag lists.

    Entity labels are normalized the same way regardless of how the entity
    ends (non-entity tag, a new ``B-`` tag, or end of sentence). Only entities
    whose normalized label is in ``allow_labels`` are kept on ``doc.ents``.
    """
    nlp = spacy.blank("tl")
    doc = Doc(nlp.vocab, words=tokens)
    spans = []
    start = None
    entity = None

    for i, (_, label) in enumerate(zip(tokens, labels)):
        if label.startswith("B-"):
            # A new entity begins; close the one in progress, if any.
            if start is not None:
                spans.append((start, i, _normalize_entity(entity)))
            start = i
            entity = label[2:]
        elif label.startswith("I-") and start is not None and entity == label[2:]:
            continue
        else:
            # Anything else (O tag, stray or mismatched I- tag) ends the entity.
            if start is not None:
                spans.append((start, i, _normalize_entity(entity)))
            start = None
            entity = None

    if start is not None:
        spans.append((start, len(tokens), _normalize_entity(entity)))

    doc.ents = [
        Span(doc, begin, end, label=name)
        for begin, end, name in spans
        if name in allow_labels
    ]
    return doc


if __name__ == "__main__":
    typer.run(convert)
from pathlib import Path
from typing import Any, Optional

import pandas as pd
import typer
from srsly import read_json
from wasabi import msg


def _split_stem(stem: str) -> Optional[tuple[str, str]]:
    """Split ``{task}_{dataset}`` at the first ``_`` or ``-``.

    The hyphen is accepted as well so that a file named ``ner-tfnerd.json``
    is still picked up. Returns None when either part would be empty.
    """
    for i, ch in enumerate(stem):
        if ch in "_-":
            task, dataset = stem[:i], stem[i + 1 :]
            return (task, dataset) if task and dataset else None
    return None


def _metric(data: dict[str, Any], section: str, key: str) -> Any:
    """Return ``data[section][key]``, or None if the section is missing/null."""
    return (data.get(section) or {}).get(key)


def _format_table(df: pd.DataFrame) -> pd.DataFrame:
    """Scale numeric columns to percentages rounded to two decimals."""
    numeric = df.select_dtypes(include="number").columns
    df[numeric] = (df[numeric] * 100).round(2)
    return df


def _print_table(rows: list[dict[str, Any]], label: str) -> None:
    """Print rows as a markdown table sorted by dataset, then model."""
    if not rows:
        # An empty frame has no "dataset" column to sort by.
        msg.warn(f"No {label} results found, skipping table.")
        return
    df = pd.DataFrame(rows).sort_values(by=["dataset", "model"])
    print(_format_table(df.reset_index(drop=True)).to_markdown(index=False))


def report(
    indir: Path = typer.Argument(..., help="Path to the evaluations directory.")
):
    """Return a table of evaluation results

    The input to `indir` must be a directory where the first-level directories are the model names,
    with JSON files from `spacy evaluate` in this file format: {task}_{dataset}.json
    """
    results = []
    # Sorted traversal keeps the output independent of directory listing order.
    for model_dir in sorted(indir.iterdir()):
        if not model_dir.is_dir():
            continue
        for json_file in sorted(model_dir.glob("*.json")):
            parsed = _split_stem(json_file.stem)
            if parsed is None:
                msg.warn(f"Skipping {json_file}: expected {{task}}_{{dataset}}.json")
                continue
            task, dataset = parsed
            results.append((model_dir.name, task, dataset, read_json(json_file)))

    msg.info(f"Found {len(results)} results in {indir}")

    msg.text("Parsing syntactic annotation results...")
    syn_rows = [
        {
            "model": model_name,
            "dataset": dataset,
            # NOTE: this column is filled from the tokenizer's token_f score.
            "token_acc": _metric(data, "tokenizer", "token_f"),
            "lemma_acc": _metric(data, "trainable_lemmatizer", "lemma_acc"),
            "tag_acc": _metric(data, "tagger", "tag_acc"),
            "pos_acc": _metric(data, "morphologizer", "pos_acc"),
            "morph_acc": _metric(data, "morphologizer", "morph_acc"),
            "dep_uas": _metric(data, "parser", "dep_uas"),
            "dep_las": _metric(data, "parser", "dep_las"),
        }
        for model_name, task, dataset, data in results
        if task == "dep"
    ]
    _print_table(syn_rows, "syntactic annotation")

    msg.text("Parsing NER results...")
    ner_rows = [
        {
            "model": model_name,
            "dataset": dataset,
            "ents_p": _metric(data, "ner", "ents_p"),
            "ents_r": _metric(data, "ner", "ents_r"),
            "ents_f": _metric(data, "ner", "ents_f"),
        }
        for model_name, task, dataset, data in results
        if task == "ner"
    ]
    _print_table(ner_rows, "NER")


if __name__ == "__main__":
    typer.run(report)
Miranda", email = "ljvmiranda@gmail.com"} ] diff --git a/website/content/news/release-v010.md b/website/content/news/release-v010.md index 37b9946..9d5968b 100644 --- a/website/content/news/release-v010.md +++ b/website/content/news/release-v010.md @@ -27,9 +27,9 @@ The models are also [hosted on Huggingface](https://huggingface.co/ljvmiranda921 | Model | Pipelines | Description | | ------------------------------------------------------------------------------------ | ------------------------------------------- | ----------------------------------------------------------------------------------------------------------------- | -| [tl_calamancy_md](https://huggingface.co/ljvmiranda921/tl_calamancy_md) (73.7 MB) | tok2vec, tagger, morphologizer, parser, ner | CPU-optimized Tagalog NLP model. Pretrained using the TLUnified dataset. Using floret vectors (50k keys) | -| [tl_calamancy_lg](https://huggingface.co/ljvmiranda921/tl_calamancy_md) (431.9 MB) | tok2vec, tagger, morphologizer, parser, ner | CPU-optimized large Tagalog NLP model. Pretrained using the TLUnified dataset. Using fastText vectors (714k keys) | -| [tl_calamancy_trf](https://huggingface.co/ljvmiranda921/tl_calamancy_trf) (775.6 MB) | transformer, tagger, parser, ner | GPU-optimized transformer Tagalog NLP model. Uses roberta-tagalog-base as context vectors. | +| [tl_calamancy_md](https://huggingface.co/ljvmiranda921/tl_calamancy_md-0.1.0) (73.7 MB) | tok2vec, tagger, morphologizer, parser, ner | CPU-optimized Tagalog NLP model. Pretrained using the TLUnified dataset. Using floret vectors (50k keys) | +| [tl_calamancy_lg](https://huggingface.co/ljvmiranda921/tl_calamancy_lg-0.1.0) (431.9 MB) | tok2vec, tagger, morphologizer, parser, ner | CPU-optimized large Tagalog NLP model. Pretrained using the TLUnified dataset. 
Using fastText vectors (714k keys) | +| [tl_calamancy_trf](https://huggingface.co/ljvmiranda921/tl_calamancy_trf-0.1.0) (775.6 MB) | transformer, tagger, parser, ner | GPU-optimized transformer Tagalog NLP model. Uses roberta-tagalog-base as context vectors. | ## Performance and baselines @@ -51,9 +51,9 @@ The results show that our calamanCy pipelines are competitive (you can reproduce | Language Pipeline | Binary textcat (Hatespeech) | Multilabel textcat (Dengue) | NER (TLUnified-NER) | Dependency parsing, UAS (Merged UD) | Dependency parsing, LAS (Merged UD) | | ------------------------------------------------------------------------- | --------------------------- | --------------------------- | ------------------- | ----------------------------------- | ----------------------------------- | -| [tl_calamancy_md](https://huggingface.co/ljvmiranda921/tl_calamancy_md) | 74.40 (0.05) | 65.32 (0.04) | 87.67 (0.03) | 76.47 | 54.40 | -| [tl_calamancy_lg](https://huggingface.co/ljvmiranda921/tl_calamancy_lg) | 75.62 (0.02) | 68.42 (0.01) | 88.90 (0.01) | 82.13 | 70.32 | -| [tl_calamancy_trf](https://huggingface.co/ljvmiranda921/tl_calamancy_trf) | 78.25 (0.06) | 72.45 (0.02) | 90.34 (0.02) | 92.48 | 80.90 | +| [tl_calamancy_md](https://huggingface.co/ljvmiranda921/tl_calamancy_md-0.1.0) | 74.40 (0.05) | 65.32 (0.04) | 87.67 (0.03) | 76.47 | 54.40 | +| [tl_calamancy_lg](https://huggingface.co/ljvmiranda921/tl_calamancy_lg-0.1.0) | 75.62 (0.02) | 68.42 (0.01) | 88.90 (0.01) | 82.13 | 70.32 | +| [tl_calamancy_trf](https://huggingface.co/ljvmiranda921/tl_calamancy_trf-0.1.0) | 78.25 (0.06) | 72.45 (0.02) | 90.34 (0.02) | 92.48 | 80.90 | We also evaluated cross-lingual and multilingual approaches in our benchmarks: diff --git a/website/content/news/release-v020.md b/website/content/news/release-v020.md index 4cf04e2..4a6e597 100644 --- a/website/content/news/release-v020.md +++ b/website/content/news/release-v020.md @@ -1,4 +1,126 @@ --- -title: Release v0.2.0 - Better 
dependency parsing and quality evaluations -date: "2025-02-15" +title: Release v0.2.0 - Better syntactic parsing and high-quality evaluations +date: "2025-01-19" --- + +Hi everyone, I am excited to release the v0.2.0 models for calamanCy. +This has been a long time coming as I've been preparing for this release since the end of 2023. +I am excited to highlight three features for this version: + +1. **Improved syntactic parsing from a larger treebank.** Previously, we trained our dependency parser and morphological annotation models using a smaller treebank (~150 examples combined). Now, we have access to [UD-NewsCrawl](https://huggingface.co/datasets/UD-Filipino/UD_Tagalog-NewsCrawl), an expert-annotated treebank with 100x more examples! This allows us to train better syntactic parsing models for dependency parsing, POS tagging, and morphological annotation! + +2. **Updated spaCy components.** Due to the larger treebank, we now have the means to train a lemmatizer using spaCy's [neural edit-tree lemmatization](https://explosion.ai/blog/edit-tree-lemmatizer) approach. + This lemmatizer removes the need to handcraft rules and relies solely on statistical methods. + In addition, the [`tl_calamancy_trf`](https://huggingface.co/ljvmiranda921/tl_calamancy_trf) pipeline now uses the modern [mDeBERTa-v3](https://huggingface.co/microsoft/mdeberta-v3-base) pretrained model as its base. + +3. **New NER evaluations.** New datasets have been built since the [last release of calamanCy](/calamanCy/news/release-v010/) and I've incorporated them here. These include [Universal NER](https://www.universalner.org/) (Mayhew et al., 2024) and [TF-NERD](https://dl.acm.org/doi/abs/10.1145/3639233.3639341) (Ramos et al., 2024). I've also removed the TRG and Ugnayan treebanks from the training set and treated them as test sets (as they should be).
+ +You can find all the models in this [HuggingFace collection](https://huggingface.co/collections/ljvmiranda921/calamancy-models-for-tagalog-nlp-65629cc46ef2a1d0f9605c87): + +| Model | Pipelines | Description | +| ---------------------------------------------------------------------------------- | --------------------------------------------------------------------- | -------------------------------------------------------------------------------------- | +| [tl_calamancy_md](https://huggingface.co/ljvmiranda921/tl_calamancy_md) (214 MB) | tok2vec, tagger, trainable_lemmatizer, morphologizer, parser, ner | CPU-optimized Tagalog NLP model. Using floret vectors (50k keys) | +| [tl_calamancy_lg](https://huggingface.co/ljvmiranda921/tl_calamancy_lg) (482 MB) | tok2vec, tagger, trainable_lemmatizer, morphologizer, parser, ner | CPU-optimized large Tagalog NLP model. Using fastText vectors (714k) | +| [tl_calamancy_trf](https://huggingface.co/ljvmiranda921/tl_calamancy_trf) (1.7 GB) | transformer, tagger, trainable_lemmatizer, morphologizer, parser, ner | GPU-optimized transformer Tagalog NLP model. Uses mdeberta-v3-base as context vectors. | + +## Improved syntactic parsing from a larger treebank + +One of the biggest updates in v0.2.0 is that we're now using the [UD-NewsCrawl treebank](https://huggingface.co/datasets/UD-Filipino/UD_Tagalog-NewsCrawl) for our syntactic parsing models. +This treebank contains 15,000 sentences with expert annotations on dependency relations, morphology, and tokenization— a huge jump from the ~150 examples we had before. + + + +This is all thanks to the annotation efforts made by [Elsie Or](https://linguistics.upd.edu.ph/building-a-tagalog-universal-dependencies-treebank/), [Angelina Aquino](https://angelaquino.github.io/), and their [team](https://linguistics.upd.edu.ph/building-a-tagalog-universal-dependencies-treebank/) from the University of the Philippines! 
+I was also partly involved in the project, focusing on post-processing and on training the baseline dependency parsers, so expect a paper from us soon! + +All the v0.2.0 models now use the UD-NewsCrawl treebank as their training set. I've also retired the TRG and Ugnayan treebanks and designated them as test sets for evaluation. Below, you'll find the syntactic parsing results for (1) the test split of UD-NewsCrawl and (2) the full datasets of TRG and Ugnayan. + +#### UD-NewsCrawl (test split) results + +This treebank consists of annotated text extracted from the Leipzig Tagalog Corpus. +Data included in the Leipzig Tagalog Corpus were crawled from Tagalog-language online news sites by the Leipzig University Institute for Computer Science. + +| Model | Token Acc. | Lemma Acc. | Tag Acc. | POS | Morph Acc. | Dep UAS | Dep LAS | +|:-----------------|------------:|------------:|----------:|----------:|------------:|----------:|----------:| +| [tl_calamancy_md](https://huggingface.co/ljvmiranda921/tl_calamancy_md) | 95.01 | 90.09 | 90.85 | 95 | 95.34 | 83.45 | 77.13 | +| [tl_calamancy_lg](https://huggingface.co/ljvmiranda921/tl_calamancy_lg) | 95.01 | 89.79 | 90.62 | 94.99 | 95.04 | 82.9 | 76.5 | +| [tl_calamancy_trf](https://huggingface.co/ljvmiranda921/tl_calamancy_trf) | 95.01 | 90.46 | 91.34 | 95.43 | 95.32 | 85.09 | 78.83 | + + + +#### UD-TRG results + +This treebank was manually annotated using sentences from a grammar book. +The Tagalog treebank, so far, consists of 55 sentences with sources from the grammar books Tagalog Reference Grammar (Schachter and Otanes 1972) and Essential Tagalog Grammar: A Reference for Learners of Tagalog (De Vos 2010). The annotations are done manually. + +| Model | Token Acc. | Lemma Acc. | Tag Acc. | POS | Morph Acc. 
| Dep UAS | Dep LAS | +|:-----------------|------------:|------------:|----------:|----------:|------------:|----------:|----------:| +| [tl_calamancy_md](https://huggingface.co/ljvmiranda921/tl_calamancy_md) | 100 | 79.84 | 58.17 | 78.2 | 73.16 | 93.29 | 66.94 | +| [tl_calamancy_lg](https://huggingface.co/ljvmiranda921/tl_calamancy_lg) | 100 | 78.88 | 56.68 | 77.93 | 71.53 | 94.28 | 67.61 | +| [tl_calamancy_trf](https://huggingface.co/ljvmiranda921/tl_calamancy_trf) | 100 | 80.79 | 58.31 | 78.47 | 72.89 | 94.95 | 67.77 | + +#### UD-Ugnayan results + +Ugnayan is a manually annotated Tagalog treebank currently composed of educational fiction and nonfiction text. +The treebank is under development at the University of the Philippines. + +| Model | Token Acc. | Lemma Acc. | Tag Acc. | POS | Morph Acc. | Dep UAS | Dep LAS | +|:-----------------|------------:|------------:|----------:|----------:|------------:|----------:|----------:| +| [tl_calamancy_md](https://huggingface.co/ljvmiranda921/tl_calamancy_md) | 98.08 | 82.29 | 49.16 | 82.82 | 59.41 | 79.7 | 57.32 | +| [tl_calamancy_lg](https://huggingface.co/ljvmiranda921/tl_calamancy_lg) | 98.08 | 82.29 | 48.58 | 81.67 | 58.95 | 80.92 | 58.47 | +| [tl_calamancy_trf](https://huggingface.co/ljvmiranda921/tl_calamancy_trf) | 98.08 | 82.55 | 48.48 | 82.21 | 58.75 | 80.78 | 58.61 | + + +## Updated spaCy components + +This release also updates the [spaCy components](https://spacy.io/usage/processing-pipelines) included in the pipelines. +Think of a component as a specific step of a pipeline that performs a particular task, such as [POS tagging](https://spacy.io/api/tagger) or [named-entity recognition](https://spacy.io/api/entityrecognizer). +For v0.2.0, we added a new [trainable lemmatizer](https://spacy.io/api/edittreelemmatizer) to take advantage of the treebank we acquired. + +In addition, we also updated the transformer model and moved on from RoBERTa Tagalog (which served us quite well in the first release) to mDeBERTa. 
+From internal experiments, we saw that the updated multilingual transformer served as a more performant base model than a Tagalog-focused one. + +You can definitely see performance improvements across our previous benchmarks when comparing against the previous version of the transformer-based pipeline on the TLUnified-NER (NER), Hatespeech (binary text categorization) and Dengue (multilabel text categorization) datasets: + +| Model | NER (TLUnified-NER) | Binary textcat (Hatespeech) | Multilabel textcat (Dengue) | +|:-----------------|---------:|-----:| ----| +| tl_calamancy_trf v0.1.0 | 90.34 | 78.25 | 72.45 | +| tl_calamancy_trf v0.2.0 | **93.31** | **84.54** | **79.00** | + +## New NER evaluations + +Finally, I also added new NER evaluations based on new datasets published within the past year. +One of these, [Universal NER](https://arxiv.org/abs/2311.09122), is a project I contributed to. +It is a fun project— the goal is to follow in the footsteps of Universal Dependencies and create a single annotation schema for NER. + +For Tagalog, we took the existing treebanks back then (TRG and Ugnayan) and annotated them following a [common annotation guideline](https://www.universalner.org/guidelines/). +Since UD-NewsCrawl is a new treebank, there are no NER annotations for it yet. +If you're interested in helping out and annotating NewsCrawl for NER, then [let us know](https://www.universalner.org/)!
+ +| Model | P (TRG) | R (TRG) | F (TRG) | P (Ugnayan) | R (Ugnayan) | F (Ugnayan) | +|:-----------------|---------:|---------:|---------:| ---:| -----:| ----:| +| [tl_calamancy_md](https://huggingface.co/ljvmiranda921/tl_calamancy_md) | 57.5 | 100 | 73.02 | 58.97 | 69.70 | 63.89 | +| [tl_calamancy_lg](https://huggingface.co/ljvmiranda921/tl_calamancy_lg) | 100 | 95.65 | 97.78 | 60.47 | 78.79 | 68.42 | +| [tl_calamancy_trf](https://huggingface.co/ljvmiranda921/tl_calamancy_trf)| 100 | 95.65 | 97.78 | 63.64 | 84.84 | 72.73 | + +I also found a very interesting NER dataset called TF-NERD, which includes more named entity labels than PER (person), ORG (organization), and LOC (location). +For the evals below, I only evaluated on the three labels I had and converted all GPE (geopolitical entities) into LOC for fairness. + +| Model | P (TF-NERD) | R (TF-NERD) | F (TF-NERD) | +|:-----------------|---------:|---------:|---------:| +| [tl_calamancy_md](https://huggingface.co/ljvmiranda921/tl_calamancy_md) | 68.36 | 70.78 | 69.55 | +| [tl_calamancy_lg](https://huggingface.co/ljvmiranda921/tl_calamancy_lg) | 67.26 | 70.58 | 68.88 | +| [tl_calamancy_trf](https://huggingface.co/ljvmiranda921/tl_calamancy_trf) | 72.28 | 80.54 | 76.18 | + + +## Final thoughts + +And that's it! +I hope you find these new models useful for Tagalog NLP. +Be sure to install [calamanCy](https://github.com/ljvmiranda921/calamanCy), and feel free to [submit any Issues on GitHub](https://github.com/ljvmiranda921/calamanCy/issues) if you have questions, bug reports, or feature requests! +Finally, I'd love to hear from you: if you're using calamanCy for a research paper or an application, then don't hesitate to [let me know](https://github.com/ljvmiranda921/calamanCy/issues/new?template=i-m-a-calamancy-user-.md)! \ No newline at end of file