Skip to content

Commit

Permalink
maintenance: code linting (#130)
Browse files: browse the repository at this point in the history
* maintenance: code linting

* further linting
  • Loading branch information
adbar authored May 23, 2024
1 parent 7fd3de6 commit 2caac80
Show file tree
Hide file tree
Showing 33 changed files with 62 additions and 174 deletions.
6 changes: 3 additions & 3 deletions simplemma/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,11 @@


from .language_detector import LanguageDetector, in_target_language, langdetect
from .lemmatizer import Lemmatizer, lemmatize, lemma_iterator, text_lemmatizer, is_known
from .tokenizer import Tokenizer, RegexTokenizer, simple_tokenizer
from .lemmatizer import Lemmatizer, is_known, lemma_iterator, lemmatize, text_lemmatizer
from .token_sampler import (
TokenSampler,
BaseTokenSampler,
MostCommonTokenSampler,
RelaxedMostCommonTokenSampler,
TokenSampler,
)
from .tokenizer import RegexTokenizer, Tokenizer, simple_tokenizer
4 changes: 2 additions & 2 deletions simplemma/language_detector.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,11 @@
from operator import itemgetter
from typing import Dict, List, Tuple, Union

from .strategies import LemmatizationStrategy, DefaultStrategy
from .strategies import DefaultStrategy, LemmatizationStrategy
from .token_sampler import (
TokenSampler,
MostCommonTokenSampler,
RelaxedMostCommonTokenSampler,
TokenSampler,
)
from .utils import validate_lang_input

Expand Down
9 changes: 4 additions & 5 deletions simplemma/lemmatizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,18 +10,17 @@
"""

from functools import lru_cache
from typing import Any, List, Iterator, Tuple, Union

from typing import Any, Iterator, List, Tuple, Union

from .strategies import (
LemmatizationStrategy,
DefaultDictionaryFactory,
DefaultStrategy,
DictionaryLookupStrategy,
DefaultDictionaryFactory,
LemmatizationFallbackStrategy,
LemmatizationStrategy,
ToLowercaseFallbackStrategy,
)
from .tokenizer import Tokenizer, RegexTokenizer
from .tokenizer import RegexTokenizer, Tokenizer
from .utils import validate_lang_input

PUNCTUATION = {".", "?", "!", "…", "¿", "¡"}
Expand Down
18 changes: 8 additions & 10 deletions simplemma/strategies/__init__.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,14 @@
"""Simplemma strategies module"""

from .dictionaries import DictionaryFactory, DefaultDictionaryFactory

from .lemmatization_strategy import LemmatizationStrategy
from .dictionary_lookup import DictionaryLookupStrategy
from .hyphen_removal import HyphenRemovalStrategy
from .rules import RulesStrategy
from .prefix_decomposition import PrefixDecompositionStrategy
from .greedy_dictionary_lookup import GreedyDictionaryLookupStrategy
from .affix_decomposition import AffixDecompositionStrategy
from .default import DefaultStrategy

from .dictionaries import DefaultDictionaryFactory, DictionaryFactory
from .dictionary_lookup import DictionaryLookupStrategy
from .fallback.lemmatization_fallback_strategy import LemmatizationFallbackStrategy
from .fallback.to_lowercase import ToLowercaseFallbackStrategy
from .fallback.raise_error import RaiseErrorFallbackStrategy
from .fallback.to_lowercase import ToLowercaseFallbackStrategy
from .greedy_dictionary_lookup import GreedyDictionaryLookupStrategy
from .hyphen_removal import HyphenRemovalStrategy
from .lemmatization_strategy import LemmatizationStrategy
from .prefix_decomposition import PrefixDecompositionStrategy
from .rules import RulesStrategy
6 changes: 3 additions & 3 deletions simplemma/strategies/affix_decomposition.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@

from typing import Optional

from .lemmatization_strategy import LemmatizationStrategy
from .dictionary_lookup import DictionaryLookupStrategy
from .greedy_dictionary_lookup import GreedyDictionaryLookupStrategy, SHORTER_GREEDY
from .greedy_dictionary_lookup import SHORTER_GREEDY, GreedyDictionaryLookupStrategy
from .lemmatization_strategy import LemmatizationStrategy

# TODO: This custom behavior has to be simplified before it becomes unmaintainable
LONGER_AFFIXES = {"et", "fi", "hu", "lt"}
Expand Down Expand Up @@ -76,7 +76,7 @@ def get_lemma(self, token: str, lang: str) -> Optional[str]:
Optional[str]: The lemma of the token if found, or None otherwise.
"""
limit = 6 if lang in SHORTER_GREEDY else 8
if (not self._greedy and not lang in AFFIX_LANGS) or len(token) <= limit:
if (not self._greedy and lang not in AFFIX_LANGS) or len(token) <= limit:
return None

# define parameters
Expand Down
10 changes: 5 additions & 5 deletions simplemma/strategies/default.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,14 @@

from typing import Optional

from .dictionaries.dictionary_factory import DictionaryFactory, DefaultDictionaryFactory
from .lemmatization_strategy import LemmatizationStrategy
from .affix_decomposition import AffixDecompositionStrategy
from .dictionaries.dictionary_factory import DefaultDictionaryFactory, DictionaryFactory
from .dictionary_lookup import DictionaryLookupStrategy
from .greedy_dictionary_lookup import GreedyDictionaryLookupStrategy
from .hyphen_removal import HyphenRemovalStrategy
from .rules import RulesStrategy
from .lemmatization_strategy import LemmatizationStrategy
from .prefix_decomposition import PrefixDecompositionStrategy
from .greedy_dictionary_lookup import GreedyDictionaryLookupStrategy
from .affix_decomposition import AffixDecompositionStrategy
from .rules import RulesStrategy


class DefaultStrategy(LemmatizationStrategy):
Expand Down
1 change: 0 additions & 1 deletion simplemma/strategies/defaultprefixes/ru.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import re


RUSSIAN_PREFIXES = [
"гидро",
"за",
Expand Down
1 change: 0 additions & 1 deletion simplemma/strategies/defaultrules/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
from .pl import apply_pl
from .ru import apply_ru


DEFAULT_RULES: Dict[str, Callable[[str], Optional[str]]] = {
"de": apply_de,
"en": apply_en,
Expand Down
72 changes: 0 additions & 72 deletions simplemma/strategies/defaultrules/de.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
import re

from typing import Optional


NOUN_ENDINGS_DE = re.compile(
r"(?:erei|heit|keit|ion|schaft|tät|[^jlz]ung)(en)?$|"
r"(?:euse|icen|logie)(n)?$|"
Expand All @@ -28,76 +26,6 @@
ENDING_CHARS_DE = {"e", "m", "n", "r", "s"}
ENDING_DE = re.compile(r"(?:e|em|er|es)$")

# 2-letter prefixes are theoretically already accounted for by the current AFFIXLEN parameter
GERMAN_PREFIXES = [
"ab",
"an",
"auf",
"aus",
"be",
"bei",
"da",
"dar",
"darin",
"davor",
"durch",
"ein",
"ent",
"entgegen",
"er",
"gegen",
"heim",
"her",
"herab",
"heran",
"herauf",
"heraus",
"herbei",
"herein",
"herum",
"herunter",
"hervor",
"hin",
"hinab",
"hinauf",
"hinaus",
"hinein",
"hinten",
"hinter",
"hinunter",
"hinweg",
"hinzu",
"innen",
"los",
"miss",
"mit",
"nach",
"neben",
"nieder",
"ran",
"raus",
"rein",
"rum",
"runter",
"über",
"um",
"unter",
"ver",
"vor",
"voran",
"voraus",
"vorbei",
"vorher",
"vorüber",
"weg",
"weiter",
"wieder",
"zer",
"zu",
]

DE_PREFIX_REGEX = re.compile(r"^(" + "|".join(GERMAN_PREFIXES) + ")(?!zu)")


def apply_de(token: str) -> Optional[str]:
"Apply pre-defined rules for German."
Expand Down
2 changes: 0 additions & 2 deletions simplemma/strategies/defaultrules/fi.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,8 @@
import re

from typing import Optional

from .generic import apply_rules


DEFAULT_RULES = {
# -minen nouns, ä/ö/y + a/o/u
# https://en.wiktionary.org/wiki/-minen
Expand Down
2 changes: 0 additions & 2 deletions simplemma/strategies/defaultrules/pl.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,8 @@
import re

from typing import Optional

from .generic import apply_rules


DEFAULT_RULES = {
re.compile(r"(?:ościach|ościami|ościom)$"): "ość", # removed: "ością", "ości"
re.compile(
Expand Down
26 changes: 0 additions & 26 deletions simplemma/strategies/defaultrules/ru.py
Original file line number Diff line number Diff line change
@@ -1,40 +1,14 @@
import re

from typing import Optional

from .generic import apply_rules


RUSSIAN_PREFIXES = [
"гидро",
"за",
"контр",
"много",
"микро",
"недо",
"пере",
"под",
"пред",
"при",
"про",
"радио",
"раз",
"рас",
"само",
"экстра",
"электро",
]


DEFAULT_RULES = {
re.compile(r"(?:ости|остью|остей|остям|остями|остях)$"): "ость",
re.compile(r"(?:ства|ств|ству|ствам|ством|ствами|стве|ствах)$"): "ство",
}


RU_PREFIX_REGEX = re.compile(r"^(" + "|".join(RUSSIAN_PREFIXES) + ")")


def apply_ru(token: str) -> Optional[str]:
"Apply pre-defined rules for Russian."
if token.endswith("ё"):
Expand Down
2 changes: 1 addition & 1 deletion simplemma/strategies/dictionaries/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
"""Dictionary-based lemmatization strategy."""

from .dictionary_factory import DictionaryFactory, DefaultDictionaryFactory
from .dictionary_factory import DefaultDictionaryFactory, DictionaryFactory
3 changes: 1 addition & 2 deletions simplemma/strategies/dictionaries/dictionary_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,11 @@
import pickle
import sys
from abc import abstractmethod
from functools import lru_cache
from os import listdir, path
from pathlib import Path
from typing import Dict

from functools import lru_cache

if sys.version_info >= (3, 8):
from typing import Protocol
else:
Expand Down
2 changes: 1 addition & 1 deletion simplemma/strategies/dictionary_lookup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

from typing import Optional

from .dictionaries.dictionary_factory import DictionaryFactory, DefaultDictionaryFactory
from .dictionaries.dictionary_factory import DefaultDictionaryFactory, DictionaryFactory
from .lemmatization_strategy import LemmatizationStrategy


Expand Down
2 changes: 1 addition & 1 deletion simplemma/strategies/fallback/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""Simplemma fallback strategies module"""

from .lemmatization_fallback_strategy import LemmatizationFallbackStrategy
from .to_lowercase import ToLowercaseFallbackStrategy
from .raise_error import RaiseErrorFallbackStrategy
from .to_lowercase import ToLowercaseFallbackStrategy
6 changes: 2 additions & 4 deletions simplemma/strategies/greedy_dictionary_lookup.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,9 @@
It provides lemmatization using a greedy dictionary lookup strategy.
"""

from typing import Optional

from .lemmatization_strategy import LemmatizationStrategy
from .dictionaries.dictionary_factory import DictionaryFactory, DefaultDictionaryFactory
from ..utils import levenshtein_dist
from .dictionaries.dictionary_factory import DefaultDictionaryFactory, DictionaryFactory
from .lemmatization_strategy import LemmatizationStrategy

SHORTER_GREEDY = {"bg", "et", "fi"}

Expand Down
1 change: 0 additions & 1 deletion simplemma/strategies/hyphen_removal.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
from .dictionary_lookup import DictionaryLookupStrategy
from .lemmatization_strategy import LemmatizationStrategy


HYPHENS = {"-", "_"}
HYPHENS_FOR_REGEX = "".join(HYPHENS)
HYPHEN_REGEX = re.compile(rf"([{HYPHENS_FOR_REGEX}])")
Expand Down
2 changes: 1 addition & 1 deletion simplemma/strategies/lemmatization_strategy.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
"""

import sys
from typing import Optional
from abc import abstractmethod
from typing import Optional

if sys.version_info >= (3, 8):
from typing import Protocol
Expand Down
3 changes: 1 addition & 2 deletions simplemma/strategies/prefix_decomposition.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,8 @@

from typing import Dict, Optional, Pattern

from .dictionary_lookup import DictionaryLookupStrategy
from .defaultprefixes import DEFAULT_KNOWN_PREFIXES

from .dictionary_lookup import DictionaryLookupStrategy
from .lemmatization_strategy import LemmatizationStrategy


Expand Down
1 change: 0 additions & 1 deletion simplemma/strategies/rules.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
from typing import Callable, Dict, Optional

from .defaultrules import DEFAULT_RULES

from .lemmatization_strategy import LemmatizationStrategy


Expand Down
5 changes: 3 additions & 2 deletions simplemma/token_sampler.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,10 @@
import re
import sys
from abc import ABC, abstractmethod
from typing import Iterable, List
from collections import Counter
from .tokenizer import Tokenizer, RegexTokenizer
from typing import Iterable, List

from .tokenizer import RegexTokenizer, Tokenizer

if sys.version_info >= (3, 8):
from typing import Protocol
Expand Down
1 change: 0 additions & 1 deletion simplemma/tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@

import re
import sys

from abc import abstractmethod
from typing import Iterator, List, Pattern

Expand Down
Loading

0 comments on commit 2caac80

Please sign in to comment.