Skip to content

Commit

Permalink
Refactor lemmatizer and data table integration (explosion#4353)
Browse files Browse the repository at this point in the history
* Move test

* Allow default in Lookups.get_table

* Start with blank tables in Lookups.from_bytes

* Refactor lemmatizer to hold instance of Lookups

* Get lookups table within the lemmatization methods to make sure it references the correct table (even if the table was replaced or modified, e.g. when loading a model from disk)
* Deprecate other arguments on Lemmatizer.__init__ and expect Lookups for consistency
* Remove old and unsupported Lemmatizer.load classmethod
* Refactor language-specific lemmatizers to inherit as much as possible from base class and override only what they need

* Update tests and docs

* Fix more tests

* Fix lemmatizer

* Upgrade pytest to try and fix weird CI errors

* Try pytest 4.6.5
  • Loading branch information
ines authored and honnibal committed Oct 1, 2019
1 parent 3297a19 commit cf65a80
Show file tree
Hide file tree
Showing 27 changed files with 333 additions and 332 deletions.
1 change: 0 additions & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ install:
- "pip install -e ."
script:
- "cat /proc/cpuinfo | grep flags | head -n 1"
- "pip install pytest pytest-timeout"
- "python -m pytest --tb=native spacy"
branches:
except:
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ pathlib==1.0.1; python_version < "3.4"
jsonschema>=2.6.0,<3.1.0
# Development dependencies
cython>=0.25
pytest>=4.0.0,<4.1.0
pytest>=4.6.5
pytest-timeout>=1.3.0,<2.0.0
mock>=2.0.0,<3.0.0
flake8>=3.5.0,<3.6.0
4 changes: 4 additions & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -96,3 +96,7 @@ exclude =
__pycache__,
_tokenizer_exceptions_list.py,
spacy/__init__.py

[tool:pytest]
markers =
slow
6 changes: 6 additions & 0 deletions spacy/errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -487,6 +487,12 @@ class Errors(object):
E170 = ("Cannot apply transition {name}: invalid for the current state.")
E171 = ("Matcher.add received invalid on_match callback argument: expected "
"callable or None, but got: {arg_type}")
E172 = ("The Lemmatizer.load classmethod is deprecated. To create a "
"Lemmatizer, initialize the class directly. See the docs for "
"details: https://spacy.io/api/lemmatizer")
E173 = ("As of v2.2, the Lemmatizer is initialized with an instance of "
"Lookups containing the lemmatization tables. See the docs for "
"details: https://spacy.io/api/lemmatizer#init")


@add_codes
Expand Down
8 changes: 5 additions & 3 deletions spacy/lang/el/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,9 @@
from .norm_exceptions import NORM_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...lookups import Lookups
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups, get_lemma_tables
from ...util import update_exc, add_lookups


class GreekDefaults(Language.Defaults):
Expand All @@ -34,8 +35,9 @@ class GreekDefaults(Language.Defaults):

@classmethod
def create_lemmatizer(cls, nlp=None, lookups=None):
    """Create a GreekLemmatizer backed by a Lookups instance.

    nlp: unused here; kept for the Language.Defaults factory signature.
    lookups: Lookups containing the lemma tables; a fresh, empty Lookups
        is used when None so the lemmatizer can always be constructed.
    """
    if lookups is None:
        lookups = Lookups()
    return GreekLemmatizer(lookups)


class Greek(Language):
Expand Down
88 changes: 25 additions & 63 deletions spacy/lang/el/lemmatizer.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
# coding: utf8
from __future__ import unicode_literals

from ...symbols import NOUN, VERB, ADJ, PUNCT
from ...lemmatizer import Lemmatizer


class GreekLemmatizer(object):
class GreekLemmatizer(Lemmatizer):
"""
Greek language lemmatizer applies the default rule based lemmatization
procedure with some modifications for better Greek language support.
Expand All @@ -15,64 +15,26 @@ class GreekLemmatizer(object):
not applicable for Greek language.
"""

@classmethod
def load(cls, path, index=None, exc=None, rules=None, lookup=None):
    # Legacy alternate constructor: *path* is accepted but ignored; the
    # tables are passed straight through to __init__.
    return cls(index, exc, rules, lookup)

def __init__(self, index=None, exceptions=None, rules=None, lookup=None):
    """Store the lemmatization tables on the instance."""
    self.index = index  # base forms, keyed by POS (see __call__)
    self.exc = exceptions  # irregular form -> lemmas, keyed by POS
    self.rules = rules  # (suffix, replacement) rules, keyed by POS
    # Plain form -> lemma table; default to an empty dict when not given.
    self.lookup_table = lookup if lookup is not None else {}

def __call__(self, string, univ_pos, morphology=None):
    """Lemmatize *string* given its universal POS tag.

    Falls back to the plain lookup table when no rules are loaded, and to
    the lowercased form for POS tags the rule tables do not cover.
    """
    if not self.rules:
        return [self.lookup_table.get(string, string)]
    # Normalize the POS tag (symbol, upper- or lower-case string) to the
    # lower-case key used by the tables.
    for symbol, name in ((NOUN, "noun"), (VERB, "verb"), (ADJ, "adj"), (PUNCT, "punct")):
        if univ_pos in (symbol, name.upper(), name):
            univ_pos = name
            break
    else:
        # POS not handled by the rule tables.
        return list(set([string.lower()]))
    return lemmatize(
        string,
        self.index.get(univ_pos, {}),
        self.exc.get(univ_pos, {}),
        self.rules.get(univ_pos, []),
    )

def lookup(self, string, orth=None):
    """Return the table lemma for *string*, keyed by *orth* when given.

    Falls back to *string* unchanged when the key is absent.
    """
    key = string if orth is None else orth
    return self.lookup_table.get(key, string)


def lemmatize(string, index, exceptions, rules):
    """Generate the candidate lemmas for *string*.

    index: known base forms; a form found here is its own lemma.
    exceptions: irregular form -> list of lemmas.
    rules: (suffix, replacement) pairs tried when nothing else matches.
    """
    string = string.lower()
    if string in index:
        return [string]
    forms = list(exceptions.get(string, []))
    oov_candidates = []
    if not forms:
        for suffix, replacement in rules:
            if not string.endswith(suffix):
                continue
            candidate = string[: len(string) - len(suffix)] + replacement
            if not candidate:
                continue
            if candidate in index or not candidate.isalpha():
                forms.append(candidate)
            else:
                # Rule output not in the index: keep as a fallback only.
                oov_candidates.append(candidate)
    if not forms:
        forms.extend(oov_candidates)
    if not forms:
        forms.append(string)
    return list(set(forms))
def lemmatize(self, string, index, exceptions, rules):
    """Return the candidate lemmas for *string*.

    index: known base forms for the POS; a form found here is its own lemma.
    exceptions: irregular form -> list of lemmas.
    rules: (suffix, replacement) pairs applied when no exception matches.
    """
    string = string.lower()
    # A form already listed in the index is returned unchanged.
    if string in index:
        return [string]
    forms = []
    forms.extend(exceptions.get(string, []))
    oov_forms = []
    if not forms:
        for old_suffix, new_suffix in rules:
            if string.endswith(old_suffix):
                form = string[: len(string) - len(old_suffix)] + new_suffix
                if not form:
                    continue
                if form in index or not form.isalpha():
                    forms.append(form)
                else:
                    # Out-of-vocabulary rule output: fallback only.
                    oov_forms.append(form)
    if not forms:
        forms.extend(oov_forms)
    if not forms:
        forms.append(string)
    return list(set(forms))
8 changes: 5 additions & 3 deletions spacy/lang/fr/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,9 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...lookups import Lookups
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups, get_lemma_tables
from ...util import update_exc, add_lookups


class FrenchDefaults(Language.Defaults):
Expand All @@ -33,8 +34,9 @@ class FrenchDefaults(Language.Defaults):

@classmethod
def create_lemmatizer(cls, nlp=None, lookups=None):
    """Create a FrenchLemmatizer backed by a Lookups instance.

    nlp: unused here; kept for the Language.Defaults factory signature.
    lookups: Lookups containing the lemma tables; a fresh, empty Lookups
        is used when None so the lemmatizer can always be constructed.
    """
    if lookups is None:
        lookups = Lookups()
    return FrenchLemmatizer(lookups)


class French(Language):
Expand Down
87 changes: 41 additions & 46 deletions spacy/lang/fr/lemmatizer.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
# coding: utf8
from __future__ import unicode_literals

from ...lemmatizer import Lemmatizer
from ...symbols import POS, NOUN, VERB, ADJ, ADV, PRON, DET, AUX, PUNCT, ADP
from ...symbols import SCONJ, CCONJ
from ...symbols import VerbForm_inf, VerbForm_none, Number_sing, Degree_pos


class FrenchLemmatizer(object):
class FrenchLemmatizer(Lemmatizer):
"""
French language lemmatizer applies the default rule based lemmatization
procedure with some modifications for better French language support.
Expand All @@ -16,19 +17,10 @@ class FrenchLemmatizer(object):
the lookup table.
"""

@classmethod
def load(cls, path, index=None, exc=None, rules=None, lookup=None):
    # Legacy alternate constructor: *path* is accepted but ignored; the
    # tables are passed straight through to __init__.
    return cls(index, exc, rules, lookup)

def __init__(self, index=None, exceptions=None, rules=None, lookup=None):
    """Store the lemmatization tables on the instance."""
    self.index = index  # base forms, keyed by POS
    self.exc = exceptions  # irregular form -> lemmas, keyed by POS
    self.rules = rules  # (suffix, replacement) rules, keyed by POS
    # Plain form -> lemma table; default to an empty dict when not given.
    self.lookup_table = lookup if lookup is not None else {}

def __call__(self, string, univ_pos, morphology=None):
if not self.rules:
return [self.lookup_table.get(string, string)]
lookup_table = self.lookups.get_table("lemma_lookup", {})
if "lemma_rules" not in self.lookups:
return [lookup_table.get(string, string)]
if univ_pos in (NOUN, "NOUN", "noun"):
univ_pos = "noun"
elif univ_pos in (VERB, "VERB", "verb"):
Expand Down Expand Up @@ -56,12 +48,14 @@ def __call__(self, string, univ_pos, morphology=None):
# See Issue #435 for example of where this logic is required.
if self.is_base_form(univ_pos, morphology):
return list(set([string.lower()]))
lemmas = lemmatize(
index_table = self.lookups.get_table("lemma_index", {})
exc_table = self.lookups.get_table("lemma_exc", {})
rules_table = self.lookups.get_table("lemma_rules", {})
lemmas = self.lemmatize(
string,
self.index.get(univ_pos, {}),
self.exc.get(univ_pos, {}),
self.rules.get(univ_pos, []),
self.lookup_table,
index_table.get(univ_pos, {}),
exc_table.get(univ_pos, {}),
rules_table.get(univ_pos, []),
)
return lemmas

Expand Down Expand Up @@ -115,33 +109,34 @@ def punct(self, string, morphology=None):
return self(string, "punct", morphology)

def lookup(self, string, orth=None):
    """Look up a lemma for *orth* in the "lemma_lookup" table.

    Returns the first lemma listed for *orth*, or *string* unchanged when
    *orth* is None or absent from the table.
    """
    # Fetch the table at call time so a replaced/reloaded table is honored.
    lookup_table = self.lookups.get_table("lemma_lookup", {})
    if orth is not None and orth in lookup_table:
        # Table values are lists of lemmas; the first entry is the lemma.
        return lookup_table[orth][0]
    return string


def lemmatize(string, index, exceptions, rules, lookup):
    """Generate the candidate lemmas for *string*.

    index: known base forms; a form found here is its own lemma.
    exceptions: irregular form -> list of lemmas.
    rules: (suffix, replacement) pairs tried when no exception matches.
    lookup: form -> list of lemmas, used as a last resort before
        returning *string* itself.
    """
    string = string.lower()
    if string in index:
        return [string]
    forms = list(exceptions.get(string, []))
    oov_candidates = []
    if not forms:
        for suffix, replacement in rules:
            if not string.endswith(suffix):
                continue
            candidate = string[: len(string) - len(suffix)] + replacement
            if not candidate:
                continue
            if candidate in index or not candidate.isalpha():
                forms.append(candidate)
            else:
                # Rule output not in the index: keep as a fallback only.
                oov_candidates.append(candidate)
    if not forms:
        forms.extend(oov_candidates)
    if not forms and string in lookup:
        # Lookup values are lists of lemmas; take the first one.
        forms.append(lookup[string][0])
    if not forms:
        forms.append(string)
    return list(set(forms))
def lemmatize(self, string, index, exceptions, rules):
    """Return the candidate lemmas for *string*.

    Tries, in order: the index, the exceptions table, the suffix rules,
    and finally the "lemma_lookup" table before falling back to *string*.
    """
    # Fetch the table at call time so a replaced/reloaded table is honored.
    lookup_table = self.lookups.get_table("lemma_lookup", {})
    string = string.lower()
    if string in index:
        # Already a known base form: it is its own lemma.
        return [string]
    forms = []
    forms.extend(exceptions.get(string, []))
    oov_forms = []
    if not forms:
        for old_suffix, new_suffix in rules:
            if string.endswith(old_suffix):
                form = string[: len(string) - len(old_suffix)] + new_suffix
                if not form:
                    continue
                if form in index or not form.isalpha():
                    forms.append(form)
                else:
                    # Out-of-vocabulary rule output: fallback only.
                    oov_forms.append(form)
    if not forms:
        forms.extend(oov_forms)
    if not forms and string in lookup_table:
        # Table values are lists of lemmas; take the first one.
        forms.append(lookup_table[string][0])
    if not forms:
        forms.append(string)
    return list(set(forms))
8 changes: 5 additions & 3 deletions spacy/lang/nl/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,9 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...lookups import Lookups
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups, get_lemma_tables
from ...util import update_exc, add_lookups


class DutchDefaults(Language.Defaults):
Expand All @@ -29,8 +30,9 @@ class DutchDefaults(Language.Defaults):

@classmethod
def create_lemmatizer(cls, nlp=None, lookups=None):
    """Create a DutchLemmatizer backed by a Lookups instance.

    nlp: unused here; kept for the Language.Defaults factory signature.
    lookups: Lookups containing the lemma tables; a fresh, empty Lookups
        is used when None so the lemmatizer can always be constructed.
    """
    if lookups is None:
        lookups = Lookups()
    return DutchLemmatizer(lookups)


class Dutch(Language):
Expand Down
Loading

0 comments on commit cf65a80

Please sign in to comment.