Merge pull request #171 from megagonlabs/feature/spacy_v3
Feature/spacy v3
hiroshi-matsuda-rit authored Aug 21, 2021
2 parents cbd8294 + fdde135 commit 6e0e120
Showing 6 changed files with 35 additions and 7 deletions.
3 changes: 2 additions & 1 deletion config/ja_ginza_electra.cfg
@@ -25,6 +25,7 @@ split_mode = "C"

[components.bunsetu_recognizer]
factory = "bunsetu_recognizer"
+remain_bunsetu_suffix = true

[components.compound_splitter]
factory = "compound_splitter"
@@ -96,7 +97,7 @@ set_extra_annotations = {"@annotation_setters":"spacy-transformers.null_annotati

[components.transformer.model]
@architectures = "ginza-transformers.TransformerModel.v1"
name = "electra-base-ud-japanese-c4-wordpiece-conj-switch-a-discriminator.2"
name = "electra-base-ud-japanese-discriminator"

[components.transformer.model.get_spans]
@span_getters = "spacy-transformers.strided_spans.v1"
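The added remain_bunsetu_suffix = true entry is passed by spaCy's config system to the bunsetu_recognizer factory (see ginza/__init__.py below). As a minimal sketch, assuming a GiNZA 5.x model built from this config is installed, the setting can be read back from the loaded pipeline:

import spacy

nlp = spacy.load("ja_ginza_electra")

# The factory forwards the config value into the component instance.
recognizer = nlp.get_pipe("bunsetu_recognizer")
print(recognizer.remain_bunsetu_suffix)  # True when the config sets it, as above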
2 changes: 1 addition & 1 deletion config/ja_ginza_electra.meta.json
@@ -2,7 +2,7 @@
"lang":"ja",
"name":"ginza_electra",
"version":"5.0.0a1",
"description":"Japanese multi-task CNN trained on UD-Japanese BCCWJ r2.8 + GSK2014-A(2019) + transformers-ud-japanese-electra--base. Components: transformer, parser, atteribute_ruler, ner, morphologizer.",
"description":"Japanese multi-task CNN trained on UD-Japanese BCCWJ r2.8 + GSK2014-A(2019) + transformers-ud-japanese-electra--base. Components: transformer, parser, atteribute_ruler, ner, morphologizer, compound_splitter, bunsetu_recognizer.",
"author":"Megagon Labs Tokyo.",
"email":"[email protected]",
"url":"https://github.com/megagonlabs/ginza",
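The description now names all seven pipeline components (the spaCy factory itself is spelled attribute_ruler). As a quick sanity check, assuming the packaged model is installed, the pipeline can be compared against this metadata:

import spacy

nlp = spacy.load("ja_ginza_electra")
print(nlp.pipe_names)        # expected to include the components named in the description
print(nlp.meta["version"])   # "5.0.0a1" per the meta.json above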
4 changes: 3 additions & 1 deletion ginza/__init__.py
@@ -52,7 +52,7 @@
def make_compound_splitter(
    nlp: Language,
    name: str,
-    split_mode=None,
+    split_mode: str = None,
):
    return CompoundSplitter(
        nlp.vocab,
@@ -70,9 +70,11 @@ def make_compound_splitter(
def make_bunsetu_recognizer(
    nlp: Language,
    name: str,
+    remain_bunsetu_suffix: bool = False,
):
    return BunsetuRecognizer(
        nlp.vocab,
+        remain_bunsetu_suffix,
    )


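With the new factory argument, the suffix behavior can also be chosen when assembling a pipeline by hand. A minimal sketch, assuming the ginza package from this branch is installed (importing it registers the factories) and noting the recognizer only has a visible effect once a parser has assigned dependency labels:

import spacy
import ginza  # noqa: F401 -- importing registers the GiNZA component factories

nlp = spacy.blank("ja")
nlp.add_pipe("compound_splitter", config={"split_mode": "C"})
nlp.add_pipe("bunsetu_recognizer", config={"remain_bunsetu_suffix": True})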
14 changes: 12 additions & 2 deletions ginza/bunsetu_recognizer.py
@@ -140,8 +140,17 @@ def bunsetu_position_types(span: Span) -> List[str]:


class BunsetuRecognizer:
-    def __init__(self, nlp: Language) -> None:
+    def __init__(self, nlp: Language, remain_bunsetu_suffix: bool = False) -> None:
        self.nlp = nlp
+        self._remain_bunsetu_suffix = remain_bunsetu_suffix
+
+    @property
+    def remain_bunsetu_suffix(self) -> str:
+        return self._remain_bunsetu_suffix
+
+    @remain_bunsetu_suffix.setter
+    def remain_bunsetu_suffix(self, remain: bool):
+        self._remain_bunsetu_suffix = remain

    def __call__(self, doc: Doc) -> Doc:
        debug = False
@@ -151,7 +160,8 @@ def __call__(self, doc: Doc) -> Doc:
                heads[t.i] = True
            elif t.dep_.endswith(BUNSETU_HEAD_SUFFIX):
                heads[t.i] = True
-                t.dep_ = t.dep_[:-len(BUNSETU_HEAD_SUFFIX)]
+                if not self._remain_bunsetu_suffix:
+                    t.dep_ = t.dep_[:-len(BUNSETU_HEAD_SUFFIX)]
        for t in doc:  # recovering uncovered subtrees
            if heads[t.i]:
                while t.head.i < t.i and not heads[t.head.i]:
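The net effect: bunsetu head detection works as before, but with remain_bunsetu_suffix=True the BUNSETU_HEAD_SUFFIX marker emitted by the parser is left on Token.dep_ instead of being stripped. A hedged sketch of the observable difference, assuming ja_ginza_electra is installed (the exact labels depend on the model and text):

import spacy

nlp = spacy.load("ja_ginza_electra")
recognizer = nlp.get_pipe("bunsetu_recognizer")

recognizer.remain_bunsetu_suffix = True   # keep the parser's suffixed labels
print([t.dep_ for t in nlp("銀座でランチをご一緒しましょう。")])

recognizer.remain_bunsetu_suffix = False  # default: suffix stripped while heads are recorded
print([t.dep_ for t in nlp("銀座でランチをご一緒しましょう。")])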
17 changes: 16 additions & 1 deletion ginza/command_line.py
@@ -6,6 +6,7 @@

import plac
import spacy
+from sudachipy.morpheme import Morpheme
from spacy.tokens import Span

from spacy.lang.ja import JapaneseDefaults
@@ -26,11 +27,15 @@ def run(
output_format="0",
require_gpu=False,
disable_sentencizer=False,
use_normalized_form=False,
parallel=1,
files=None,
):
if require_gpu:
print("GPU enabled", file=sys.stderr)
if use_normalized_form:
print("overriding Token.lemma_ by normalized_form of SudachiPy", file=sys.stderr)
Morpheme.dictionary_form = Morpheme.normalized_form
assert model_path is None or ensure_model is None

analyzer = Analyzer(
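Note the override is a class-level monkeypatch: rebinding Morpheme.dictionary_form to Morpheme.normalized_form changes every subsequent lookup, and since spaCy's Japanese tokenizer fills Token.lemma_ from SudachiPy's dictionary_form(), lemmas switch to the normalized form pipeline-wide. A standalone sketch, assuming SudachiPy 0.5.x with its core dictionary installed (the forms shown are illustrative):

from sudachipy import dictionary
from sudachipy.morpheme import Morpheme

tokenizer = dictionary.Dictionary().create()
m = tokenizer.tokenize("行なった")[0]
print(m.dictionary_form())  # 行なう -- the verbatim dictionary form
print(m.normalized_form())  # 行う   -- the orthographically normalized form

Morpheme.dictionary_form = Morpheme.normalized_form
print(m.dictionary_form())  # now 行う as well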
@@ -187,7 +192,11 @@ def set_nlp(self):
        try:
            nlp = spacy.load("ja_ginza_electra")
        except IOError as e:
-            nlp = spacy.load("ja_ginza")
+            try:
+                nlp = spacy.load("ja_ginza")
+            except IOError as e:
+                print('Could not find the model. You need to install "ja_ginza_electra" or "ja_ginza" by executing pip like `pip install ja_ginza_electra`.', file=sys.stderr)
+                raise e

        if self.disable_sentencizer:
            def disable_sentencizer(doc):
@@ -404,6 +413,7 @@ def mecab_token_line(token):
split_mode=("split mode", "option", "s", str, ["A", "B", "C", None]),
hash_comment=("hash comment", "option", "c", str, ["print", "skip", "analyze"]),
output_path=("output path", "option", "o", Path),
use_normalized_form=("overriding Token.lemma_ by normalized_form of SudachiPy", "flag", "n"),
parallel=("parallel level (default=-1, all_cpus=0)", "option", "p", int),
files=("input files", "positional"),
)
@@ -412,6 +422,7 @@ def run_ginzame(
    split_mode=None,
    hash_comment="print",
    output_path=None,
+    use_normalized_form=False,
    parallel=-1,
    *files,
):
@@ -423,6 +434,7 @@
        output_path=output_path,
        output_format="mecab",
        require_gpu=False,
+        use_normalized_form=use_normalized_form,
        parallel=parallel,
        disable_sentencizer=False,
        files=files,
@@ -441,6 +453,7 @@ def main_ginzame():
output_path=("output path", "option", "o", Path),
output_format=("output format", "option", "f", str, ["0", "conllu", "1", "cabocha", "2", "mecab", "3", "json"]),
require_gpu=("enable require_gpu", "flag", "g"),
use_normalized_form=("overriding Token.lemma_ by normalized_form of SudachiPy", "flag", "n"),
disable_sentencizer=("disable spaCy's sentence separator", "flag", "d"),
parallel=("parallel level (default=1, all_cpus=0)", "option", "p", int),
files=("input files", "positional"),
@@ -453,6 +466,7 @@ def run_ginza(
    output_path=None,
    output_format="conllu",
    require_gpu=False,
+    use_normalized_form=False,
    disable_sentencizer=False,
    parallel=1,
    *files,
@@ -465,6 +479,7 @@
        output_path=output_path,
        output_format=output_format,
        require_gpu=require_gpu,
+        use_normalized_form=use_normalized_form,
        disable_sentencizer=disable_sentencizer,
        parallel=parallel,
        files=files,
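Both entry points expose the override as the -n flag (e.g. `ginza -n -f conllu input.txt`). The same behavior can be driven from Python through run(); a hedged sketch, assuming run's remaining parameters keep their defaults as in this branch and that input.txt is a placeholder path:

from ginza.command_line import run

# Roughly equivalent to `ginza -n -f conllu input.txt`.
run(
    output_format="conllu",
    require_gpu=False,
    use_normalized_form=True,
    disable_sentencizer=False,
    parallel=1,
    files=["input.txt"],
)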
2 changes: 1 addition & 1 deletion setup.py
@@ -25,5 +25,5 @@
name="ginza",
packages=find_packages(include=["ginza"]),
url="https://github.com/megagonlabs/ginza",
version='5.0.0a1',
version='5.0.0b1',
)
