
Commit

roundup before release 0.8.2
adbar committed Sep 5, 2022
1 parent 781e6a9 commit 8ff2546
Showing 6 changed files with 92 additions and 61 deletions.
7 changes: 7 additions & 0 deletions HISTORY.rst
@@ -3,6 +3,13 @@ History
=======


0.8.2
-----

* languages added: Albanian, Hindi, Icelandic, Malay, Middle English, Northern Sámi, Nynorsk, Serbo-Croatian, Swahili, Tagalog
* fix for slow language detection introduced in 0.7.0


0.8.1
-----

124 changes: 67 additions & 57 deletions README.rst
@@ -39,7 +39,7 @@ In modern natural language processing (NLP), this task is often indirectly tackl

With its comparatively small footprint it is especially useful when speed and simplicity matter, in low-resource contexts, for educational purposes, or as a baseline system for lemmatization and morphological analysis.

Currently, 38 languages are partly or fully supported (see table below).
Currently, 48 languages are partly or fully supported (see table below).


Installation
@@ -171,7 +171,7 @@ Bug reports over the `issues page <https://github.com/adbar/simplemma/issues>`_
Language detection
~~~~~~~~~~~~~~~~~~

Language detection works by providing a text and a tuple ``lang`` consisting of a series of languages of interest. Scores between 0 and 1 are returned.
Language detection works by providing a text and a tuple ``lang`` consisting of a series of languages of interest. Scores between 0 and 1 are returned.

The ``lang_detector()`` function returns a list of language codes along with scores and adds "unk" for unknown or out-of-vocabulary words. The latter can also be calculated by using the function ``in_target_language()`` which returns a ratio.

@@ -183,7 +183,7 @@ The ``lang_detector()`` function returns a list of language codes along with sco
>>> lang_detector('"Moderní studie narazily na několik tajemství." Extracted from Wikipedia.', lang=("cs", "sk"))
[('cs', 0.625), ('unk', 0.375), ('sk', 0.125)]
# proportion of known words
>>> in_target_language("opera post physica posita (τὰ μετὰ τὰ φυσικά)", lang=("la",))
>>> in_target_language("opera post physica posita (τὰ μετὰ τὰ φυσικά)", lang="la")
0.5
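For illustration, the known-word ratio behind ``in_target_language()`` can be sketched in plain Python. The mini-lexicon and the ``token_ratio()`` helper below are hypothetical stand-ins for this example, not the library's actual data or API:

```python
import re

# Hypothetical mini-lexicon: a handful of Latin word forms.
LEXICON = {"la": {"opera", "post", "physica", "posita"}}

def token_ratio(text, lang):
    """Return the share of alphabetic tokens found in the lexicon."""
    # Extract Unicode letter sequences (Greek script included).
    tokens = [t.lower() for t in re.findall(r"[^\W\d_]+", text)]
    if not tokens:
        return 0.0
    known = sum(1 for t in tokens if t in LEXICON[lang])
    return known / len(tokens)

print(token_ratio("opera post physica posita (τὰ μετὰ τὰ φυσικά)", "la"))
# 0.5 — four of the eight tokens are in the toy lexicon
```

Half of the tokens are Latin words present in the lexicon, which mirrors the 0.5 ratio in the example above.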
@@ -193,57 +193,69 @@ Supported languages
The following languages are available using their `ISO 639-1 code <https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes>`_:


====== ================== =========== ===== =========================================================================
Available languages (2022-04-06)
---------------------------------------------------------------------------------------------------------------------
Code Language Words (10³) Acc. Comments
====== ================== =========== ===== =========================================================================
``bg`` Bulgarian 213
``ca`` Catalan 579
``cs`` Czech 187 0.88 on UD CS-PDT
``cy`` Welsh 360
``da`` Danish 554 0.92 on UD DA-DDT, alternative: `lemmy <https://github.com/sorenlind/lemmy>`_
``de`` German 682 0.95 on UD DE-GSD, see also `German-NLP list <https://github.com/adbar/German-NLP#Lemmatization>`_
``el`` Greek 183 0.88 on UD EL-GDT
``en`` English 136 0.94 on UD EN-GUM, alternative: `LemmInflect <https://github.com/bjascob/LemmInflect>`_
``es`` Spanish 720 0.94 on UD ES-GSD
``et`` Estonian 133 low coverage
``fa`` Persian 10 low coverage, potential issues
``fi`` Finnish 2,106 evaluation and alternatives: see `this benchmark <https://github.com/aajanki/finnish-pos-accuracy>`_
``fr`` French 217 0.94 on UD FR-GSD
``ga`` Irish 383
``gd`` Gaelic 48
``gl`` Galician 384
``gv`` Manx 62
``hu`` Hungarian 458
``hy`` Armenian 323
``id`` Indonesian 17 0.91 on UD ID-CSUI
``it`` Italian 333 0.93 on UD IT-ISDT
``ka`` Georgian 65
``la`` Latin 850
``lb`` Luxembourgish 305
``lt`` Lithuanian 247
``lv`` Latvian 168
``mk`` Macedonian 57
``nb`` Norwegian (Bokmål) 617
``nl`` Dutch 254 0.91 on UD-NL-Alpino
``pl`` Polish 3,733 0.91 on UD-PL-PDB
``pt`` Portuguese 933 0.92 on UD-PT-GSD
``ro`` Romanian 311
``ru`` Russian 607 alternative: `pymorphy2 <https://github.com/kmike/pymorphy2/>`_
``sk`` Slovak 846 0.92 on UD SK-SNK
``sl`` Slovenian 97 low coverage
``sv`` Swedish 658 alternative: `lemmy <https://github.com/sorenlind/lemmy>`_
``tr`` Turkish 1,333 0.88 on UD-TR-Boun
``uk`` Ukrainian 190 alternative: `pymorphy2 <https://github.com/kmike/pymorphy2/>`_
====== ================== =========== ===== =========================================================================
======= ==================== =========== ===== ========================================================================
Available languages (2022-09-05)
-----------------------------------------------------------------------------------------------------------------------
Code Language Forms (10³) Acc. Comments
======= ==================== =========== ===== ========================================================================
``bg`` Bulgarian 213
``ca`` Catalan 579
``cs`` Czech 187 0.88 on UD CS-PDT
``cy`` Welsh 360
``da`` Danish 554 0.92 on UD DA-DDT, alternative: `lemmy <https://github.com/sorenlind/lemmy>`_
``de`` German 682 0.95 on UD DE-GSD, see also `German-NLP list <https://github.com/adbar/German-NLP#Lemmatization>`_
``el`` Greek 183 0.88 on UD EL-GDT
``en`` English 136 0.94 on UD EN-GUM, alternative: `LemmInflect <https://github.com/bjascob/LemmInflect>`_
``enm`` Middle English 38
``es`` Spanish 720 0.94 on UD ES-GSD
``et`` Estonian 133 low coverage
``fa`` Persian 10 experimental
``fi`` Finnish 2,106 evaluation and alternatives: see `this benchmark <https://github.com/aajanki/finnish-pos-accuracy>`_
``fr`` French 217 0.94 on UD FR-GSD
``ga`` Irish 383
``gd`` Gaelic 48
``gl`` Galician 384
``gv`` Manx 62
``hbs`` Serbo-Croatian 838 Croatian and Serbian lists to be added later
``hi`` Hindi 58 experimental
``hu`` Hungarian 458
``hy`` Armenian 323
``id`` Indonesian 17 0.91 on UD ID-CSUI
``is`` Icelandic 175
``it`` Italian 333 0.93 on UD IT-ISDT
``ka`` Georgian 65
``la`` Latin 850
``lb`` Luxembourgish 305
``lt`` Lithuanian 247
``lv`` Latvian 168
``mk`` Macedonian 57
``ms`` Malay 14
``nb`` Norwegian (Bokmål) 617
``nl`` Dutch 254 0.91 on UD-NL-Alpino
``nn`` Norwegian (Nynorsk)
``pl`` Polish 3,733 0.91 on UD-PL-PDB
``pt`` Portuguese 933 0.92 on UD-PT-GSD
``ro`` Romanian 311
``ru`` Russian 607 alternative: `pymorphy2 <https://github.com/kmike/pymorphy2/>`_
``se`` Northern Sámi 113 experimental
``sk`` Slovak 846 0.92 on UD SK-SNK
``sl`` Slovene 136
``sq`` Albanian 35
``sv`` Swedish 658 alternative: `lemmy <https://github.com/sorenlind/lemmy>`_
``sw`` Swahili 10 experimental
``tl`` Tagalog 33 experimental
``tr`` Turkish 1,333 0.88 on UD-TR-Boun
``uk`` Ukrainian 190 alternative: `pymorphy2 <https://github.com/kmike/pymorphy2/>`_
======= ==================== =========== ===== ========================================================================


*Low coverage* mentions mean that one would probably be better off with a language-specific library, but *simplemma* will still work to a limited extent. Open-source alternatives for Python are referenced where possible.

The scores are calculated on `Universal Dependencies <https://universaldependencies.org/>`_ treebanks, on single word tokens (including some contractions but not merged prepositions); they describe to what extent simplemma can accurately map tokens to their lemma form. They can be reproduced using the script ``udscore.py`` in the ``tests/`` folder.
*Experimental* mentions indicate that the language remains untested or that there could be issues with the underlying data or lemmatization process.

This library is particularly relevant for the lemmatization of less frequent words. Its performance in this case is only incidentally captured by the benchmark above.
The scores are calculated on `Universal Dependencies <https://universaldependencies.org/>`_ treebanks, on single word tokens (including some contractions but not merged prepositions); they describe to what extent simplemma can accurately map tokens to their lemma form. They can be reproduced by concatenating all available UD files and using the script ``udscore.py`` in the ``tests/`` folder.
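The evaluation idea can be sketched as follows. The identity ``lemmatize()`` stub and the two-line sample treebank below are placeholders standing in for simplemma and a full UD file; the real ``udscore.py`` script handles many more cases:

```python
# Sketch of the accuracy computation on a CoNLL-U treebank: compare the
# lemmatizer's output (column 2 = form) against the gold lemma (column 3).
SAMPLE = """\
# sent_id = 1
1\tdogs\tdog\tNOUN\t_\t_\t0\t_\t_\t_
2\tbark\tbark\tVERB\t_\t_\t0\t_\t_\t_
"""

def lemmatize(token, lang="en"):
    return token  # identity placeholder for the actual lemmatizer

def ud_accuracy(conllu_text, lang="en"):
    total = correct = 0
    for line in conllu_text.splitlines():
        if not line or line.startswith("#"):
            continue  # skip comments and blank lines
        cols = line.split("\t")
        if "-" in cols[0] or "." in cols[0]:
            continue  # skip multiword token ranges and empty nodes
        form, gold = cols[1], cols[2]
        total += 1
        if lemmatize(form, lang) == gold:
            correct += 1
    return correct / total if total else 0.0

print(ud_accuracy(SAMPLE))
# 0.5 — the identity stub gets "bark" right but misses "dogs" -> "dog"
```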

This library is particularly relevant for the lemmatization of less frequent words. Its performance in this case is only incidentally captured by the benchmark above. In some languages, a fixed number of words such as pronouns can be further mapped by hand to enhance performance.
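Such hand-written mappings can be pictured as an override table consulted before the regular dictionary lookup. The entries below are invented for illustration and do not reflect simplemma's internal data:

```python
# Hypothetical closed-class overrides, applied before dictionary lookup.
OVERRIDES = {"en": {"me": "I", "him": "he", "her": "she"}}
# Hypothetical regular lemmatization dictionary.
DICTIONARY = {"en": {"dogs": "dog", "went": "go"}}

def lemmatize(token, lang):
    low = token.lower()
    if low in OVERRIDES[lang]:
        # a fixed number of words (e.g. pronouns) mapped by hand
        return OVERRIDES[lang][low]
    # fall back to the regular lookup, or return the token unchanged
    return DICTIONARY[lang].get(low, token)

print(lemmatize("him", "en"))   # he  — hand-mapped pronoun
print(lemmatize("dogs", "en"))  # dog — regular dictionary lookup
```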


Speed
@@ -275,21 +287,19 @@ Roadmap
- [ ] Integrate optional, more complex models?


Credits
-------
Credits and licenses
--------------------

Software under MIT license, for the linguistic information databases see ``licenses`` folder.

The surface lookups (non-greedy mode) use lemmatization lists taken from various sources:
The surface lookups (non-greedy mode) use lemmatization lists derived from various sources, ordered by relative importance:

- `Lemmatization lists <https://github.com/michmech/lemmatization-lists>`_ by Michal Měchura (Open Database License)
- Wiktionary entries packaged by the `Kaikki project <https://kaikki.org/>`_
- `FreeLing project <https://github.com/TALP-UPC/FreeLing>`_
- `spaCy lookups data <https://github.com/explosion/spacy-lookups-data/tree/master/spacy_lookups_data/data>`_
- Wiktionary entries parsed by the `Kaikki project <https://kaikki.org/>`_
- `spaCy lookups data <https://github.com/explosion/spacy-lookups-data>`_
- `Unimorph Project <https://unimorph.github.io/>`_
- `Wikinflection corpus <https://github.com/lenakmeth/Wikinflection-Corpus>`_ by Eleni Metheniti (CC BY 4.0 License)
- `Unimorph Project <http://unimorph.ethz.ch/languages>`_

This rule-based approach relying on flexion and lemmatization dictionaries is still used today in popular libraries such as `spaCy <https://spacy.io/usage/adding-languages#lemmatizer>`_.
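A minimal sketch of this kind of dictionary lookup with a suffix-rule fallback, using invented rules and entries rather than simplemma's own data or algorithm:

```python
# Toy flexion dictionary mapping inflected forms to lemmas.
DICTIONARY = {"walked": "walk", "cities": "city"}
# Toy suffix-rewriting rules tried when direct lookup fails.
SUFFIX_RULES = [("ies", "y"), ("ed", ""), ("s", "")]

def lemmatize(token):
    low = token.lower()
    if low in DICTIONARY:
        return DICTIONARY[low]  # direct dictionary lookup first
    for suffix, repl in SUFFIX_RULES:
        if low.endswith(suffix) and len(low) > len(suffix) + 2:
            candidate = low[: -len(suffix)] + repl
            # accept the candidate only if it is a known lemma
            if candidate in DICTIONARY.values():
                return candidate
    return low  # unknown word: returned as-is

print(lemmatize("walked"))  # walk — found in the dictionary
print(lemmatize("walks"))   # walk — derived via the "s" suffix rule
```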


Contributions
11 changes: 10 additions & 1 deletion setup.py
@@ -48,7 +48,7 @@ def get_version(package):
author="Adrien Barbaresi",
author_email="[email protected]",
python_requires=">=3.6",
classifiers=[
classifiers=[ # https://pypi.org/classifiers/
"Development Status :: 3 - Alpha",
"Intended Audience :: Developers",
"Intended Audience :: Education",
@@ -57,6 +57,7 @@ def get_version(package):
"License :: OSI Approved :: MIT License",
"Natural Language :: Bulgarian",
"Natural Language :: Catalan",
"Natural Language :: Croatian",
"Natural Language :: Czech",
"Natural Language :: Danish",
"Natural Language :: Dutch",
@@ -66,14 +67,17 @@ def get_version(package):
"Natural Language :: Galician",
"Natural Language :: German",
"Natural Language :: Greek",
"Natural Language :: Hindi",
"Natural Language :: Hungarian",
"Natural Language :: Icelandic",
"Natural Language :: Indonesian",
"Natural Language :: Irish",
"Natural Language :: Italian",
"Natural Language :: Latin",
"Natural Language :: Latvian",
"Natural Language :: Lithuanian",
"Natural Language :: Macedonian",
"Natural Language :: Malay",
"Natural Language :: Norwegian",
"Natural Language :: Polish",
"Natural Language :: Portuguese",
@@ -83,8 +87,10 @@ def get_version(package):
"Natural Language :: Slovenian",
"Natural Language :: Spanish",
"Natural Language :: Swedish",
"Natural Language :: Thai",
"Natural Language :: Turkish",
"Natural Language :: Ukrainian",
"Operating System :: OS Independent",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.6",
"Programming Language :: Python :: 3.7",
@@ -93,7 +99,10 @@ def get_version(package):
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Topic :: Scientific/Engineering :: Information Analysis",
"Topic :: Software Development :: Internationalization",
"Topic :: Software Development :: Localization",
"Topic :: Text Processing :: Linguistic",
"Typing :: Typed",
],
description="A simple multilingual lemmatizer for Python.",
install_requires=requirements,
3 changes: 2 additions & 1 deletion simplemma/__init__.py
@@ -4,8 +4,9 @@
__author__ = "Adrien Barbaresi"
__email__ = "[email protected]"
__license__ = "MIT"
__version__ = "0.8.1"
__version__ = "0.8.2"


from .langdetect import in_target_language, lang_detector
from .simplemma import lemmatize, lemma_iterator, text_lemmatizer, is_known
from .tokenizer import simple_tokenizer
4 changes: 4 additions & 0 deletions tests/test_langdetect.py
@@ -33,3 +33,7 @@ def test_detection():
)
== 0.5
)
assert (
in_target_language("opera post physica posita (τὰ μετὰ τὰ φυσικά)", lang="la")
== 0.5
)
4 changes: 2 additions & 2 deletions tests/udscore.py
@@ -8,7 +8,7 @@

data_files = [
("bg", "tests/UD/bg-btb-all.conllu"),
# ('cs', 'tests/UD/cs-pdt-all.conllu'),
# ("cs", "tests/UD/cs-pdt-all.conllu"), # longer to process
("da", "tests/UD/da-ddt-all.conllu"),
("de", "tests/UD/de-gsd-all.conllu"),
("el", "tests/UD/el-gdt-all.conllu"),
@@ -18,6 +18,7 @@
("fi", "tests/UD/fi-tdt-all.conllu"),
("fr", "tests/UD/fr-gsd-all.conllu"),
("ga", "tests/UD/ga-idt-all.conllu"),
("hi", "tests/UD/hi-hdtb-all.conllu"),
("hu", "tests/UD/hu-szeged-all.conllu"),
("hy", "tests/UD/hy-armtdp-all.conllu"),
("id", "tests/UD/id-csui-all.conllu"),
@@ -37,7 +38,6 @@
# doesn't work: right-to-left?
# data_files = [
# ('he', 'tests/UD/he-htb-all.conllu'),
# ('hi', 'tests/UD/hi-hdtb-all.conllu'),
# ('ur', 'tests/UD/ur-udtb-all.conllu'),
# ]

