Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

use binary strings in dictionaries to save memory #128

Merged
merged 9 commits into from
May 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file modified simplemma/strategies/dictionaries/data/ast.plzma
Binary file not shown.
Binary file modified simplemma/strategies/dictionaries/data/bg.plzma
Binary file not shown.
Binary file modified simplemma/strategies/dictionaries/data/ca.plzma
Binary file not shown.
Binary file modified simplemma/strategies/dictionaries/data/cs.plzma
Binary file not shown.
Binary file modified simplemma/strategies/dictionaries/data/cy.plzma
Binary file not shown.
Binary file modified simplemma/strategies/dictionaries/data/da.plzma
Binary file not shown.
Binary file modified simplemma/strategies/dictionaries/data/de.plzma
Binary file not shown.
Binary file modified simplemma/strategies/dictionaries/data/el.plzma
Binary file not shown.
Binary file modified simplemma/strategies/dictionaries/data/en.plzma
Binary file not shown.
Binary file modified simplemma/strategies/dictionaries/data/enm.plzma
Binary file not shown.
Binary file modified simplemma/strategies/dictionaries/data/es.plzma
Binary file not shown.
Binary file modified simplemma/strategies/dictionaries/data/et.plzma
Binary file not shown.
Binary file modified simplemma/strategies/dictionaries/data/fa.plzma
Binary file not shown.
Binary file modified simplemma/strategies/dictionaries/data/fi.plzma
Binary file not shown.
Binary file modified simplemma/strategies/dictionaries/data/fr.plzma
Binary file not shown.
Binary file modified simplemma/strategies/dictionaries/data/ga.plzma
Binary file not shown.
Binary file modified simplemma/strategies/dictionaries/data/gd.plzma
Binary file not shown.
Binary file modified simplemma/strategies/dictionaries/data/gl.plzma
Binary file not shown.
Binary file modified simplemma/strategies/dictionaries/data/gv.plzma
Binary file not shown.
Binary file modified simplemma/strategies/dictionaries/data/hbs.plzma
Binary file not shown.
Binary file modified simplemma/strategies/dictionaries/data/hi.plzma
Binary file not shown.
Binary file modified simplemma/strategies/dictionaries/data/hu.plzma
Binary file not shown.
Binary file modified simplemma/strategies/dictionaries/data/hy.plzma
Binary file not shown.
Binary file modified simplemma/strategies/dictionaries/data/id.plzma
Binary file not shown.
Binary file modified simplemma/strategies/dictionaries/data/is.plzma
Binary file not shown.
Binary file modified simplemma/strategies/dictionaries/data/it.plzma
Binary file not shown.
Binary file modified simplemma/strategies/dictionaries/data/ka.plzma
Binary file not shown.
Binary file modified simplemma/strategies/dictionaries/data/la.plzma
Binary file not shown.
Binary file modified simplemma/strategies/dictionaries/data/lb.plzma
Binary file not shown.
Binary file modified simplemma/strategies/dictionaries/data/lt.plzma
Binary file not shown.
Binary file modified simplemma/strategies/dictionaries/data/lv.plzma
Binary file not shown.
Binary file modified simplemma/strategies/dictionaries/data/mk.plzma
Binary file not shown.
Binary file modified simplemma/strategies/dictionaries/data/ms.plzma
Binary file not shown.
Binary file modified simplemma/strategies/dictionaries/data/nb.plzma
Binary file not shown.
Binary file modified simplemma/strategies/dictionaries/data/nl.plzma
Binary file not shown.
Binary file modified simplemma/strategies/dictionaries/data/nn.plzma
Binary file not shown.
Binary file modified simplemma/strategies/dictionaries/data/pl.plzma
Binary file not shown.
Binary file modified simplemma/strategies/dictionaries/data/pt.plzma
Binary file not shown.
Binary file modified simplemma/strategies/dictionaries/data/ro.plzma
Binary file not shown.
Binary file modified simplemma/strategies/dictionaries/data/ru.plzma
Binary file not shown.
Binary file modified simplemma/strategies/dictionaries/data/se.plzma
Binary file not shown.
Binary file modified simplemma/strategies/dictionaries/data/sk.plzma
Binary file not shown.
Binary file modified simplemma/strategies/dictionaries/data/sl.plzma
Binary file not shown.
Binary file modified simplemma/strategies/dictionaries/data/sq.plzma
Binary file not shown.
Binary file modified simplemma/strategies/dictionaries/data/sv.plzma
Binary file not shown.
Binary file modified simplemma/strategies/dictionaries/data/sw.plzma
Binary file not shown.
Binary file modified simplemma/strategies/dictionaries/data/tl.plzma
Binary file not shown.
Binary file modified simplemma/strategies/dictionaries/data/tr.plzma
Binary file not shown.
Binary file modified simplemma/strategies/dictionaries/data/uk.plzma
Binary file not shown.
10 changes: 5 additions & 5 deletions simplemma/strategies/dictionaries/dictionary_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from functools import lru_cache
from os import listdir, path
from pathlib import Path
from typing import Dict
from typing import ByteString, Dict

if sys.version_info >= (3, 8):
from typing import Protocol
Expand All @@ -30,7 +30,7 @@
]


def _load_dictionary_from_disk(langcode: str) -> Dict[str, str]:
def _load_dictionary_from_disk(langcode: str) -> Dict[ByteString, ByteString]:
"""
Load a dictionary from disk.

Expand Down Expand Up @@ -68,7 +68,7 @@ class DictionaryFactory(Protocol):
def get_dictionary(
self,
lang: str,
) -> Dict[str, str]:
) -> Dict[ByteString, ByteString]:
"""
Get the dictionary for a specific language.

Expand Down Expand Up @@ -102,15 +102,15 @@ def __init__(self, cache_max_size: int = 8):
cache_max_size (int): The maximum size of the cache for loaded dictionaries.
Defaults to `8`.
"""
self._data: Dict[str, Dict[str, str]] = {}
self._data: Dict[str, Dict[ByteString, ByteString]] = {}
self._load_dictionary_from_disk = lru_cache(maxsize=cache_max_size)(
_load_dictionary_from_disk
)

def get_dictionary(
self,
lang: str,
) -> Dict[str, str]:
) -> Dict[ByteString, ByteString]:
"""
Get the dictionary for a specific language.

Expand Down
16 changes: 12 additions & 4 deletions simplemma/strategies/dictionary_lookup.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
It provides lemmatization using dictionary lookup.
"""

from typing import Optional
from typing import ByteString, Dict, Optional

from .dictionaries.dictionary_factory import DefaultDictionaryFactory, DictionaryFactory
from .lemmatization_strategy import LemmatizationStrategy
Expand All @@ -26,6 +26,13 @@ def __init__(
"""
self._dictionary_factory = dictionary_factory

def _get(
self, token: str, dictionary: Dict[ByteString, ByteString]
) -> Optional[str]:
"Convenience function to handle bytestring to string conversion."
result = dictionary.get(token.encode("utf-8"))
return result.decode("utf-8") if result else None # type: ignore[union-attr]

def get_lemma(self, token: str, lang: str) -> Optional[str]:
"""
Get Lemma using Dictionary Lookup
Expand All @@ -43,8 +50,9 @@ def get_lemma(self, token: str, lang: str) -> Optional[str]:
"""
# Search the language data, reverse case to extend coverage.
dictionary = self._dictionary_factory.get_dictionary(lang)
if token in dictionary:
return dictionary[token]
result = self._get(token, dictionary)
if result:
return result
# Try upper or lowercase.
token = token.lower() if token[0].isupper() else token.capitalize()
return dictionary.get(token)
return self._get(token, dictionary)
4 changes: 2 additions & 2 deletions simplemma/strategies/greedy_dictionary_lookup.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ def get_lemma(self, token: str, lang: str) -> str:
return token

dictionary = self._dictionary_factory.get_dictionary(lang)
candidate = token
candidate = token.encode("utf-8")
for _ in range(self._steps):
if candidate not in dictionary:
break
Expand All @@ -73,4 +73,4 @@ def get_lemma(self, token: str, lang: str) -> str:

candidate = new_candidate

return candidate
return candidate.decode("utf-8")
8 changes: 6 additions & 2 deletions simplemma/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
- [validate_lang_input][simplemma.utils.validate_lang_input]: Validates the language input and ensures it is a valid tuple.
"""

from typing import Tuple, Union
from typing import ByteString, Tuple, Union


def validate_lang_input(lang: Union[str, Tuple[str, ...]]) -> Tuple[str]:
Expand All @@ -31,7 +31,9 @@ def validate_lang_input(lang: Union[str, Tuple[str, ...]]) -> Tuple[str]:
return lang # type: ignore[return-value]


def levenshtein_dist(str1: str, str2: str) -> int:
def levenshtein_dist(
first: Union[ByteString, str], second: Union[ByteString, str]
) -> int:
"""
Calculate the Levenshtein distance between two strings.

Expand All @@ -47,6 +49,8 @@ def levenshtein_dist(str1: str, str2: str) -> int:
int: The Levenshtein distance between the two strings.

"""
str1 = first.encode("utf-8") if isinstance(first, str) else first
str2 = second.encode("utf-8") if isinstance(second, str) else second
# inspired by this noticeably faster code:
# https://gist.github.com/p-hash/9e0f9904ce7947c133308fbe48fe032b
if str1 == str2:
Expand Down
5 changes: 3 additions & 2 deletions tests/test_dictionary_pickler.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,9 @@ def test_logic() -> None:
# different order
mydict = dictionary_pickler._read_dict(testfile, "es", silent=True)
assert len(mydict) == 5
assert mydict["closeones"] == "closeone"
assert mydict[b"closeones"] == b"closeone"
item = sorted(mydict.keys(), reverse=True)[0]
assert item == "valid-word"
assert item == b"valid-word"

# file I/O
assert dictionary_pickler._determine_path("lists", "de").endswith("de.txt")
Expand All @@ -37,3 +37,4 @@ def test_logic() -> None:
listpath = os.path.join(TEST_DIR, "data")
os_handle, temp_outputfile = tempfile.mkstemp(suffix=".pkl", text=True)
dictionary_pickler._pickle_dict("zz", listpath, temp_outputfile)
dictionary_pickler._pickle_dict("zz", listpath, in_place=True)
6 changes: 3 additions & 3 deletions tests/test_lemmatizer.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""Tests for `simplemma` package."""

from typing import Dict
from typing import ByteString, Dict

import pytest

Expand All @@ -17,8 +17,8 @@ class CustomDictionaryFactory(DictionaryFactory):
def get_dictionary(
self,
lang: str,
) -> Dict[str, str]:
return {"testing": "the test works!!"}
) -> Dict[ByteString, ByteString]:
return {b"testing": b"the test works!!"}

assert (
Lemmatizer(
Expand Down
27 changes: 19 additions & 8 deletions training/dictionary_pickler.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
import re
from operator import itemgetter
from pathlib import Path
from typing import Dict, List, Optional
from typing import ByteString, Dict, List, Optional

import simplemma
from simplemma.strategies.defaultrules import DEFAULT_RULES
Expand Down Expand Up @@ -49,7 +49,9 @@ def _determine_path(listpath: str, langcode: str) -> str:
return str(Path(__file__).parent / filename)


def _read_dict(filepath: str, langcode: str, silent: bool) -> Dict[str, str]:
def _read_dict(
filepath: str, langcode: str, silent: bool
) -> Dict[ByteString, ByteString]:
mydict: Dict[str, str] = {}
myadditions: List[str] = []
i: int = 0
Expand Down Expand Up @@ -80,8 +82,8 @@ def _read_dict(filepath: str, langcode: str, silent: bool) -> Dict[str, str]:
# print line if the rule is wrong
if (
len(columns[1]) > 6
and columns[1] != columns[0]
and langcode in DEFAULT_RULES
and columns[1] != columns[0]
):
rule = DEFAULT_RULES[langcode](columns[1])
if rule is not None and rule != columns[1]:
Expand Down Expand Up @@ -119,32 +121,41 @@ def _read_dict(filepath: str, langcode: str, silent: bool) -> Dict[str, str]:
for word in myadditions:
mydict[word] = word
LOGGER.debug("%s %s", langcode, i)
return dict(sorted(mydict.items()))
# sort and convert to bytestrings
return {k.encode("utf-8"): v.encode("utf-8") for k, v in sorted(mydict.items())}


def _load_dict(
langcode: str, listpath: str = "lists", silent: bool = True
) -> Dict[str, str]:
) -> Dict[ByteString, ByteString]:
filepath = _determine_path(listpath, langcode)
return _read_dict(filepath, langcode, silent)


def _pickle_dict(
langcode: str, listpath: str = "lists", filepath: Optional[str] = None
langcode: str = "en",
listpath: str = "lists",
filepath: Optional[str] = None,
in_place: bool = False,
) -> None:
mydict = _load_dict(langcode, listpath)
# sort dictionary to help saving space during compression
if langcode not in ("lt", "sw"):
mydict = dict(sorted(mydict.items(), key=itemgetter(1)))
if filepath is None:
filename = f"strategies/dictionaries/data/{langcode}.plzma"
filepath = str(Path(simplemma.__file__).parent / filename)
directory = (
Path(simplemma.__file__).parent
if in_place
else Path(__file__).parent.parent / "simplemma"
)
filepath = str(directory / filename)
with lzma.open(filepath, "wb") as filehandle: # , filters=my_filters, preset=9
pickle.dump(mydict, filehandle, protocol=4)
LOGGER.debug("%s %s", langcode, len(mydict))


if __name__ == "__main__":
logging.basicConfig(level=logging.DEBUG)
for listcode in SUPPORTED_LANGUAGES:
for listcode in sorted(SUPPORTED_LANGUAGES):
_pickle_dict(listcode)
Loading