From fc27ae8860eb86631ca4e3e090f00de5c1a3106e Mon Sep 17 00:00:00 2001 From: Diego Date: Thu, 8 Apr 2021 19:08:27 -0400 Subject: [PATCH 01/39] Add progress bar to lmf.load (#46) --- wn/_add.py | 24 ++++++++++++------------ wn/lmf.py | 30 +++++++++++++++++++++++++++--- wn/util.py | 15 ++++++++++----- 3 files changed, 49 insertions(+), 20 deletions(-) diff --git a/wn/_add.py b/wn/_add.py index 5053cb4..769266d 100644 --- a/wn/_add.py +++ b/wn/_add.py @@ -89,6 +89,7 @@ def add( if progress_handler is None: progress_handler = ProgressHandler progress = progress_handler(message='Database') + progress_lmf = progress_handler(message='Process XML') logger.info('adding project to database') logger.info(' database: %s', wn.config.database_path) @@ -97,7 +98,7 @@ def add( try: for package in iterpackages(source): if package.type == _WORDNET: - _add_lmf(package.resource_file(), progress) + _add_lmf(package.resource_file(), progress, progress_lmf) elif package.type == _ILI: _add_ili(package.resource_file(), progress) else: @@ -109,6 +110,7 @@ def add( def _add_lmf( source, progress: ProgressHandler, + progress_lmf: ProgressHandler, ) -> None: with connect() as conn: cur = conn.cursor() @@ -138,10 +140,16 @@ def _add_lmf( _update_lookup_tables(all_infos, cur) progress.flash(f'Reading {source!s}') - for lexicon, info in zip(lmf.load(source), all_infos): + + total_items = sum(_sum_counts(info) for info in all_infos) + progress_lmf.set(count=0, + total=total_items, + refresh_interval=50, + status='Reading XML') + + for lexicon, info in zip(lmf.load(source, progress_lmf), all_infos): if 'skip' in info: continue - progress.set(count=0, total=_sum_counts(info)) synsets = lexicon.synsets entries = lexicon.lexical_entries @@ -188,15 +196,7 @@ def _precheck(source, cur): def _sum_counts(info) -> int: counts = info['counts'] - return sum(counts.get(name, 0) for name in - ('LexicalEntry', 'ExternalLexicalEntry', - 'Lemma', 'Form', 'Pronunciation', 'Tag', - 'Sense', 'ExternalSense', - 'SenseRelation', 'Example', 'Count', - 'SyntacticBehaviour', - 'Synset', 'ExternalSynset', - 'Definition', # 'ILIDefinition', - 'SynsetRelation')) + return sum(counts.get(name, 0) for name in lmf.LEXICON_INFO_ATTRIBUTES) def _update_lookup_tables(all_infos, cur): diff --git a/wn/lmf.py b/wn/lmf.py index a1c098e..ac102ef 100644 --- a/wn/lmf.py +++ b/wn/lmf.py @@ -31,6 +31,17 @@ PARTS_OF_SPEECH, LEXICOGRAPHER_FILES, ) +from wn.util import ProgressHandler + + +LEXICON_INFO_ATTRIBUTES = ('LexicalEntry', 'ExternalLexicalEntry', + 'Lemma', 'Form', 'Pronunciation', 'Tag', + 'Sense', 'ExternalSense', + 'SenseRelation', 'Example', 'Count', + 'SyntacticBehaviour', + 'Synset', 'ExternalSynset', + 'Definition', # 'ILIDefinition', + 'SynsetRelation') class LMFError(wn.Error): @@ -80,8 +91,11 @@ class LMFWarning(Warning): class XMLEventIterator: """etree.iterparse() event iterator with lookahead""" - def __init__(self, iterator: Iterator[Tuple[str, ET.Element]]): + def __init__(self, + iterator: Iterator[Tuple[str, ET.Element]], + progress: Optional[ProgressHandler]): self.iterator = iterator + self._progress = progress self._next = next(iterator, (None, None)) def __iter__(self): @@ -89,7 +103,10 @@ def __iter__(self): def __next__(self): _next = self._next + event, elem = _next if _next == (None, None): + if self._progress: + self._progress.set(status="Completed") raise StopIteration self._next = next(self.iterator, (None, None)) return _next @@ -114,6 +131,8 @@ def end(self, *tags: str) -> ET.Element: raise LMFError(f'expected , got 
<{elem.tag}>') if elem.tag not in tags: raise LMFError(f'expected , got ') + if self._progress and elem.tag in LEXICON_INFO_ATTRIBUTES: + self._progress.update() return elem @@ -550,7 +569,10 @@ def start(name, attrs): return infos -def load(source: AnyPath) -> LexicalResource: +def load( + source: AnyPath, + progress: Optional[ProgressHandler] = None +) -> LexicalResource: """Load wordnets encoded in the WN-LMF format. Args: @@ -561,7 +583,9 @@ def load(source: AnyPath) -> LexicalResource: with source.open('rb') as fh: version = _read_header(fh) - events = XMLEventIterator(ET.iterparse(source, events=('start', 'end'))) + events = XMLEventIterator( + ET.iterparse(source, events=('start', 'end')), + progress) root = events.start('LexicalResource') lexicons: List[Lexicon] = [] diff --git a/wn/util.py b/wn/util.py index 55a4959..2268162 100644 --- a/wn/util.py +++ b/wn/util.py @@ -1,5 +1,4 @@ """Wn utility classes.""" - from typing import TextIO import sys @@ -31,6 +30,7 @@ def __init__( message: str = '', count: int = 0, total: int = 0, + refresh_interval: int = 1, unit: str = '', status: str = '', file: TextIO = sys.stderr, @@ -39,10 +39,12 @@ def __init__( self.kwargs = { 'count': count, 'total': total, + 'refresh_interval': refresh_interval, 'message': message, 'unit': unit, 'status': status, } + self._refresh_quota: int = refresh_interval def update(self, n: int = 1) -> None: """Update the counter with the increment value *n*. @@ -103,10 +105,13 @@ class ProgressBar(ProgressHandler): def update(self, n: int = 1) -> None: """Increment the count by *n* and print the reformatted bar.""" self.kwargs['count'] += n # type: ignore - s = self.format() - if self.file: - print('\r\033[K', end='', file=self.file) - print(s, end='', file=self.file) + self._refresh_quota -= n + if self._refresh_quota <= 0: + self._refresh_quota = self.kwargs['refresh_interval'] # type: ignore + s = self.format() + if self.file: + print('\r\033[K', end='', file=self.file) + print(s, end='', file=self.file) def format(self) -> str: """Format and return the progress bar. 
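
A rough usage sketch of the throttled progress bar added in this patch; the
message, total, and refresh interval below are made up, and the redraw
cadence is governed entirely by refresh_interval:

import sys
from wn.util import ProgressBar

bar = ProgressBar(
    message='Process XML ',
    total=100_000,
    refresh_interval=50,  # redraw at most once per 50 increments
    file=sys.stderr,
)
for _ in range(100_000):
    bar.update()          # cheap call; the bar only reprints when the quota runs out
bar.flash('done')
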
From 62e8f7db09486f25c5e5526f8c07a44e33156cdd Mon Sep 17 00:00:00 2001 From: Michael Wayne Goodman Date: Mon, 12 Apr 2021 13:19:33 +0800 Subject: [PATCH 02/39] Reduce work when scanning lexicons --- wn/_add.py | 19 +++++++++++++------ wn/lmf.py | 11 ++++------- 2 files changed, 17 insertions(+), 13 deletions(-) diff --git a/wn/_add.py b/wn/_add.py index 769266d..b1bf1a6 100644 --- a/wn/_add.py +++ b/wn/_add.py @@ -136,9 +136,6 @@ def _add_lmf( return # all clear, try to add them - progress.flash('Updating lookup tables') - _update_lookup_tables(all_infos, cur) - progress.flash(f'Reading {source!s}') total_items = sum(_sum_counts(info) for info in all_infos) @@ -150,6 +147,10 @@ def _add_lmf( for lexicon, info in zip(lmf.load(source, progress_lmf), all_infos): if 'skip' in info: continue + + progress.flash('Updating lookup tables') + _update_lookup_tables(lexicon, cur) + progress.set(count=0, total=_sum_counts(info)) synsets = lexicon.synsets entries = lexicon.lexical_entries @@ -199,11 +200,17 @@ def _sum_counts(info) -> int: return sum(counts.get(name, 0) for name in lmf.LEXICON_INFO_ATTRIBUTES) -def _update_lookup_tables(all_infos, cur): - reltypes = set(rt for info in all_infos for rt in info['relations']) +def _update_lookup_tables(lexicon, cur): + reltypes = set(rel.type + for ss in lexicon.synsets + for rel in ss.relations) + reltypes.update(rel.type + for e in lexicon.lexical_entries + for s in e.senses + for rel in s.relations) cur.executemany('INSERT OR IGNORE INTO relation_types VALUES (null,?)', [(rt,) for rt in sorted(reltypes)]) - lexfiles = set(lf for info in all_infos for lf in info['lexfiles']) + lexfiles = set(ss.lexfile for ss in lexicon.synsets) - {None} cur.executemany('INSERT OR IGNORE INTO lexfiles VALUES (null,?)', [(lf,) for lf in sorted(lexfiles)]) diff --git a/wn/lmf.py b/wn/lmf.py index ac102ef..817b09b 100644 --- a/wn/lmf.py +++ b/wn/lmf.py @@ -545,18 +545,15 @@ def scan_lexicons(source: AnyPath) -> List[Dict]: def start(name, attrs): if name in ('Lexicon', 'LexiconExtension'): attrs['counts'] = {} - attrs['relations'] = set() - attrs['lexfiles'] = set() infos.append(attrs) elif name == 'Extends': infos[-1]['extends'] = attrs['id'], attrs['version'] elif infos: - if name in ('SynsetRelation', 'SenseRelation'): - infos[-1]['relations'].add(attrs['relType']) - elif name == 'Synset' and 'lexfile' in attrs: - infos[-1]['lexfiles'].add(attrs['lexfile']) counts = infos[-1]['counts'] - counts[name] = counts.get(name, 0) + 1 + if name in counts: + counts[name] += 1 + else: + counts[name] = 1 p = xml.parsers.expat.ParserCreate() p.StartElementHandler = start From db479369bcc9ee2b8ec808e5e0cdbcce5fec32d3 Mon Sep 17 00:00:00 2001 From: Michael Wayne Goodman Date: Mon, 12 Apr 2021 13:50:43 +0800 Subject: [PATCH 03/39] Optimize lmf.load() progress bar a bit --- wn/_add.py | 2 +- wn/lmf.py | 31 +++++++++++++++++-------------- 2 files changed, 18 insertions(+), 15 deletions(-) diff --git a/wn/_add.py b/wn/_add.py index b1bf1a6..a9dca01 100644 --- a/wn/_add.py +++ b/wn/_add.py @@ -141,7 +141,7 @@ def _add_lmf( total_items = sum(_sum_counts(info) for info in all_infos) progress_lmf.set(count=0, total=total_items, - refresh_interval=50, + refresh_interval=10000, status='Reading XML') for lexicon, info in zip(lmf.load(source, progress_lmf), all_infos): diff --git a/wn/lmf.py b/wn/lmf.py index 817b09b..4d35487 100644 --- a/wn/lmf.py +++ b/wn/lmf.py @@ -34,14 +34,16 @@ from wn.util import ProgressHandler -LEXICON_INFO_ATTRIBUTES = ('LexicalEntry', 'ExternalLexicalEntry', - 'Lemma', 
'Form', 'Pronunciation', 'Tag', - 'Sense', 'ExternalSense', - 'SenseRelation', 'Example', 'Count', - 'SyntacticBehaviour', - 'Synset', 'ExternalSynset', - 'Definition', # 'ILIDefinition', - 'SynsetRelation') +LEXICON_INFO_ATTRIBUTES = { + 'LexicalEntry', 'ExternalLexicalEntry', + 'Lemma', 'Form', 'Pronunciation', 'Tag', + 'Sense', 'ExternalSense', + 'SenseRelation', 'Example', 'Count', + 'SyntacticBehaviour', + 'Synset', 'ExternalSynset', + 'Definition', # 'ILIDefinition', + 'SynsetRelation' +} class LMFError(wn.Error): @@ -91,9 +93,11 @@ class LMFWarning(Warning): class XMLEventIterator: """etree.iterparse() event iterator with lookahead""" - def __init__(self, - iterator: Iterator[Tuple[str, ET.Element]], - progress: Optional[ProgressHandler]): + def __init__( + self, + iterator: Iterator[Tuple[str, ET.Element]], + progress: ProgressHandler + ): self.iterator = iterator self._progress = progress self._next = next(iterator, (None, None)) @@ -105,8 +109,7 @@ def __next__(self): _next = self._next event, elem = _next if _next == (None, None): - if self._progress: - self._progress.set(status="Completed") + self._progress.set(status="Completed") raise StopIteration self._next = next(self.iterator, (None, None)) return _next @@ -131,7 +134,7 @@ def end(self, *tags: str) -> ET.Element: raise LMFError(f'expected , got <{elem.tag}>') if elem.tag not in tags: raise LMFError(f'expected , got ') - if self._progress and elem.tag in LEXICON_INFO_ATTRIBUTES: + if elem.tag in LEXICON_INFO_ATTRIBUTES: self._progress.update() return elem From bf25707c66d4d5060fb2bca1e4e81bd3f92c1844 Mon Sep 17 00:00:00 2001 From: Michael Wayne Goodman Date: Tue, 13 Apr 2021 09:34:08 +0800 Subject: [PATCH 04/39] Improve messaging on XML-load progress bar --- wn/_add.py | 7 ++----- wn/lmf.py | 13 +++++++++---- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/wn/_add.py b/wn/_add.py index a9dca01..1db75d0 100644 --- a/wn/_add.py +++ b/wn/_add.py @@ -89,7 +89,7 @@ def add( if progress_handler is None: progress_handler = ProgressHandler progress = progress_handler(message='Database') - progress_lmf = progress_handler(message='Process XML') + progress_lmf = progress_handler(message='Read') logger.info('adding project to database') logger.info(' database: %s', wn.config.database_path) @@ -139,10 +139,7 @@ def _add_lmf( progress.flash(f'Reading {source!s}') total_items = sum(_sum_counts(info) for info in all_infos) - progress_lmf.set(count=0, - total=total_items, - refresh_interval=10000, - status='Reading XML') + progress_lmf.set(count=0, total=total_items, refresh_interval=10000) for lexicon, info in zip(lmf.load(source, progress_lmf), all_infos): if 'skip' in info: diff --git a/wn/lmf.py b/wn/lmf.py index 4d35487..b59e5ff 100644 --- a/wn/lmf.py +++ b/wn/lmf.py @@ -99,7 +99,7 @@ def __init__( progress: ProgressHandler ): self.iterator = iterator - self._progress = progress + self.progress = progress self._next = next(iterator, (None, None)) def __iter__(self): @@ -109,7 +109,7 @@ def __next__(self): _next = self._next event, elem = _next if _next == (None, None): - self._progress.set(status="Completed") + self.progress.set(status="Complete") raise StopIteration self._next = next(self.iterator, (None, None)) return _next @@ -135,7 +135,7 @@ def end(self, *tags: str) -> ET.Element: if elem.tag not in tags: raise LMFError(f'expected , got ') if elem.tag in LEXICON_INFO_ATTRIBUTES: - self._progress.update() + self.progress.update() return elem @@ -585,7 +585,8 @@ def load( events = XMLEventIterator( 
ET.iterparse(source, events=('start', 'end')), - progress) + progress + ) root = events.start('LexicalResource') lexicons: List[Lexicon] = [] @@ -614,9 +615,13 @@ def _load_lexicon(events, version) -> Lexicon: requires.append(_load_dependency(events, 'Requires')) attrs = lex_root.attrib + events.progress.set(message=f'Read {attrs["id"]}:{attrs["version"]}') + + events.progress.set(status='Lexical Entries') entries, frames, sbmap = _load_lexical_entries( events, extension, version, lex_root ) + events.progress.set(status='Synsets') synsets = _load_synsets( events, extension, version, lex_root ) From 210a73613f8086aebc2cbb18e9f43e4578847805 Mon Sep 17 00:00:00 2001 From: Michael Wayne Goodman Date: Tue, 13 Apr 2021 10:49:03 +0800 Subject: [PATCH 05/39] Make lmf.load() API more consistent Other long-running functions take a progress_handler class instead of an instantiated handler. Now lmf.load() does the same. This means that lmf.load() counts the things for its progress bar with a simple str.count() over the file contents, which is 10x faster than scan_lexicons(). --- wn/_add.py | 10 +++------- wn/lmf.py | 17 +++++++++++++---- 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/wn/_add.py b/wn/_add.py index 1db75d0..5666569 100644 --- a/wn/_add.py +++ b/wn/_add.py @@ -89,7 +89,6 @@ def add( if progress_handler is None: progress_handler = ProgressHandler progress = progress_handler(message='Database') - progress_lmf = progress_handler(message='Read') logger.info('adding project to database') logger.info(' database: %s', wn.config.database_path) @@ -98,7 +97,7 @@ def add( try: for package in iterpackages(source): if package.type == _WORDNET: - _add_lmf(package.resource_file(), progress, progress_lmf) + _add_lmf(package.resource_file(), progress, progress_handler) elif package.type == _ILI: _add_ili(package.resource_file(), progress) else: @@ -110,7 +109,7 @@ def add( def _add_lmf( source, progress: ProgressHandler, - progress_lmf: ProgressHandler, + progress_handler: Type[ProgressHandler], ) -> None: with connect() as conn: cur = conn.cursor() @@ -138,10 +137,7 @@ def _add_lmf( # all clear, try to add them progress.flash(f'Reading {source!s}') - total_items = sum(_sum_counts(info) for info in all_infos) - progress_lmf.set(count=0, total=total_items, refresh_interval=10000) - - for lexicon, info in zip(lmf.load(source, progress_lmf), all_infos): + for lexicon, info in zip(lmf.load(source, progress_handler), all_infos): if 'skip' in info: continue diff --git a/wn/lmf.py b/wn/lmf.py index b59e5ff..1f880ca 100644 --- a/wn/lmf.py +++ b/wn/lmf.py @@ -4,6 +4,7 @@ """ from typing import ( + Type, Container, List, Tuple, @@ -31,7 +32,7 @@ PARTS_OF_SPEECH, LEXICOGRAPHER_FILES, ) -from wn.util import ProgressHandler +from wn.util import ProgressHandler, ProgressBar LEXICON_INFO_ATTRIBUTES = { @@ -134,8 +135,7 @@ def end(self, *tags: str) -> ET.Element: raise LMFError(f'expected , got <{elem.tag}>') if elem.tag not in tags: raise LMFError(f'expected , got ') - if elem.tag in LEXICON_INFO_ATTRIBUTES: - self.progress.update() + self.progress.update() return elem @@ -571,18 +571,27 @@ def start(name, attrs): def load( source: AnyPath, - progress: Optional[ProgressHandler] = None + progress_handler: Optional[Type[ProgressHandler]] = ProgressBar ) -> LexicalResource: """Load wordnets encoded in the WN-LMF format. 
Args: source: path to a WN-LMF file """ + if progress_handler is None: + progress_handler = ProgressHandler + source = Path(source).expanduser() with source.open('rb') as fh: version = _read_header(fh) + # _read_header() only reads the first 2 lines + remainder = fh.read() + total_elements = remainder.count(b'') + progress = progress_handler( + message='Read', total=total_elements, refresh_interval=10000 + ) events = XMLEventIterator( ET.iterparse(source, events=('start', 'end')), progress From 9f000feaab74061a140c5c3f8ab4a35f15ef39fe Mon Sep 17 00:00:00 2001 From: Michael Wayne Goodman Date: Tue, 13 Apr 2021 10:53:34 +0800 Subject: [PATCH 06/39] Fix forced refreshes of ProgressBar The new refresh_interval on ProgressBar meant that bar.update(0) (including bar.set(...)) no longer triggered a refresh. Now the default refresh_interval is 0 and the update() method takes a force=True argument to ensure the indicator is refreshed. --- wn/util.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/wn/util.py b/wn/util.py index 2268162..2d50acc 100644 --- a/wn/util.py +++ b/wn/util.py @@ -30,7 +30,7 @@ def __init__( message: str = '', count: int = 0, total: int = 0, - refresh_interval: int = 1, + refresh_interval: int = 0, unit: str = '', status: str = '', file: TextIO = sys.stderr, @@ -46,13 +46,16 @@ def __init__( } self._refresh_quota: int = refresh_interval - def update(self, n: int = 1) -> None: + def update(self, n: int = 1, force: bool = False) -> None: """Update the counter with the increment value *n*. This method should update the ``count`` key of :attr:`kwargs` with the increment value *n*. After this, it is expected to update some user-facing progress indicator. + If *force* is :python:`True`, any indicator will be refreshed + regardless of the value of the refresh interval. + """ self.kwargs['count'] += n # type: ignore @@ -65,7 +68,7 @@ def set(self, **kwargs) -> None: """ self.kwargs.update(**kwargs) - self.update(0) + self.update(0, force=True) def flash(self, message: str) -> None: """Issue a message unrelated to the current counter. @@ -102,11 +105,11 @@ class ProgressBar(ProgressHandler): #: The default formatting template. 
FMT = '\r{message}{bar}{counter}{status}' - def update(self, n: int = 1) -> None: + def update(self, n: int = 1, force: bool = False) -> None: """Increment the count by *n* and print the reformatted bar.""" self.kwargs['count'] += n # type: ignore self._refresh_quota -= n - if self._refresh_quota <= 0: + if force or self._refresh_quota <= 0: self._refresh_quota = self.kwargs['refresh_interval'] # type: ignore s = self.format() if self.file: From 0a4a3576209a4a1d682c6ce7dc282a9041795719 Mon Sep 17 00:00:00 2001 From: Michael Wayne Goodman Date: Tue, 13 Apr 2021 11:06:25 +0800 Subject: [PATCH 07/39] Add CHANGELOG entries --- CHANGELOG.md | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index f39466a..d0d3bf2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,17 @@ ## [Unreleased] +### Changed + +* `wn.lmf.load()` now takes a `progress_handler` parameter ([#46]) +* `wn.lmf.scan_lexicons()` no longer returns sets of relation types or + lexfiles; `wn.add()` now gets these from loaded lexicons instead +* `wn.util.ProgressHandler` + - Now has a `refresh_interval` parameter; updates only trigger a + refresh after the counter hits the threshold set by the interval + - The `update()` method now takes a `force` parameter to trigger a + refresh regardless of the refresh interval + ## [v0.6.2] From 987cd2aceeee6310682f8573dfc1a2b137714435 Mon Sep 17 00:00:00 2001 From: Michael Wayne Goodman Date: Wed, 14 Apr 2021 12:49:40 +0800 Subject: [PATCH 08/39] Fix #105: Add support for approximate word search --- CHANGELOG.md | 11 +++ wn/_core.py | 52 ++++++++++++-- wn/_queries.py | 190 ++++++++++++++++++++++++++++--------------------- wn/_types.py | 5 +- 4 files changed, 170 insertions(+), 88 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d0d3bf2..7b6f5a5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,11 @@ ## [Unreleased] +### Added + +* Support for approximate word searches; on by default, configurable + only by instantiating a `wn.Wordnet` object ([#105]) + ### Changed * `wn.lmf.load()` now takes a `progress_handler` parameter ([#46]) @@ -12,6 +17,11 @@ refresh after the counter hits the threshold set by the interval - The `update()` method now takes a `force` parameter to trigger a refresh regardless of the refresh interval +* `wn.Wordnet` + - Initialization now takes a `normalize` parameter ([#105]) + - `Wordnet.words()`, `Wordnet.senses()` and `Wordnet.synsets()` now + use a specified normalization function to expand queries on word + forms ([#105]) ## [v0.6.2] @@ -339,6 +349,7 @@ abandoned, but this is an entirely new codebase. [#15]: https://github.com/goodmami/wn/issues/15 [#17]: https://github.com/goodmami/wn/issues/17 [#23]: https://github.com/goodmami/wn/issues/23 +[#46]: https://github.com/goodmami/wn/issues/46 [#47]: https://github.com/goodmami/wn/issues/47 [#58]: https://github.com/goodmami/wn/issues/58 [#59]: https://github.com/goodmami/wn/issues/59 diff --git a/wn/_core.py b/wn/_core.py index 534d84e..d88aafc 100644 --- a/wn/_core.py +++ b/wn/_core.py @@ -3,8 +3,8 @@ import warnings import wn -from wn._types import Metadata -from wn._util import flatten +from wn._types import Metadata, NormalizeFunction +from wn._util import flatten, normalize_form from wn._db import NON_ROWID from wn._queries import ( find_lexicons, @@ -1041,13 +1041,30 @@ class Wordnet: second space-separated list of lexicon specifiers which are used for traversing relations, but not as the results of queries. 
+ The *normalize* argument takes a function that normalizes word + forms in order to expand the search. The default function + downcases the word and removes diacritics via NFKD_ normalization + so that, for example, searching for *san josé* in the English + WordNet will find the entry for *San Jose*. Setting *normalize* to + :python:`None` disables normalization and forces exact-match + searching. + + .. _NFKD: https://en.wikipedia.org/wiki/Unicode_equivalence#Normal_forms + """ __slots__ = ('_lexicons', '_lexicon_ids', '_expanded', '_expanded_ids', - '_default_mode') + '_default_mode', '_normalize') __module__ = 'wn' - def __init__(self, lexicon: str = None, *, lang: str = None, expand: str = None): + def __init__( + self, + lexicon: str = None, + *, + lang: str = None, + expand: str = None, + normalize: Optional[NormalizeFunction] = normalize_form, + ): # default mode means any lexicon is searched or expanded upon, # but relation traversals only target the source's lexicon self._default_mode = (not lexicon and not lang) @@ -1081,6 +1098,8 @@ def __init__(self, lexicon: str = None, *, lang: str = None, expand: str = None) self._expanded = tuple(map(_to_lexicon, find_lexicons(lexicon=expand))) self._expanded_ids: Tuple[int, ...] = tuple(lx._id for lx in self._expanded) + self._normalize = normalize + def lexicons(self): """Return the list of lexicons covered by this wordnet.""" return self._lexicons @@ -1106,7 +1125,10 @@ def words(self, form: str = None, pos: str = None) -> List[Word]: restricts words by their part of speech. """ - iterable = find_entries(form=form, pos=pos, lexicon_rowids=self._lexicon_ids) + forms = _expand_form(form, self._normalize) + iterable = find_entries( + forms=forms, pos=pos, lexicon_rowids=self._lexicon_ids + ) return [Word(*word_data, self) for word_data in iterable] def synset(self, id: str) -> Synset: @@ -1131,8 +1153,9 @@ def synsets( select a unique synset within a single lexicon. """ + forms = _expand_form(form, self._normalize) iterable = find_synsets( - form=form, pos=pos, ili=ili, lexicon_rowids=self._lexicon_ids, + forms=forms, pos=pos, ili=ili, lexicon_rowids=self._lexicon_ids, ) return [Synset(*synset_data, self) for synset_data in iterable] @@ -1153,7 +1176,10 @@ def senses(self, form: str = None, pos: str = None) -> List[Sense]: *pos* restricts senses by their word's part of speech. """ - iterable = find_senses(form=form, pos=pos, lexicon_rowids=self._lexicon_ids) + forms = _expand_form(form, self._normalize) + iterable = find_senses( + forms=forms, pos=pos, lexicon_rowids=self._lexicon_ids + ) return [Sense(*sense_data, self) for sense_data in iterable] def ili(self, id: str) -> ILI: @@ -1185,6 +1211,18 @@ def _to_lexicon(data) -> Lexicon: ) +def _expand_form( + form: Optional[str], + normalize: Optional[NormalizeFunction] +) -> Optional[List[str]]: + if form is None: + return None + forms = [form] + if normalize: + forms.extend([normalize(f) for f in forms]) + return forms + + def projects() -> List[Dict]: """Return the list of indexed projects. 
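
# An illustrative sketch of the kind of normalizer the *normalize* parameter
# described above assumes: downcase, then strip diacritics via NFKD
# decomposition, so that 'san josé' also matches 'San Jose'.  The actual
# normalize_form imported from wn._util may differ in detail.
import unicodedata

def normalize(form: str) -> str:
    decomposed = unicodedata.normalize('NFKD', form.lower())
    return ''.join(c for c in decomposed if not unicodedata.combining(c))

assert normalize('San José') == 'san jose'
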
diff --git a/wn/_queries.py b/wn/_queries.py index bc9b254..80dc508 100644 --- a/wn/_queries.py +++ b/wn/_queries.py @@ -3,7 +3,7 @@ """ from typing import ( - Optional, Any, Dict, Set, List, Tuple, Collection, Iterator, Sequence + Optional, Dict, Set, List, Tuple, Collection, Iterator, Sequence ) import itertools import sqlite3 @@ -281,131 +281,160 @@ def find_proposed_ilis( def find_entries( id: str = None, - form: str = None, + forms: Sequence[str] = None, pos: str = None, lexicon_rowids: Sequence[int] = None, ) -> Iterator[_Word]: conn = connect() - query_parts = [ - 'SELECT DISTINCT e.lexicon_rowid, e.rowid, e.id, e.pos,' - ' f.form, f.id, f.script, f.rowid', - ' FROM entries AS e', - ' JOIN forms AS f ON f.entry_rowid = e.rowid', - ] - - params: Dict[str, Any] = {'id': id, 'form': form, 'pos': pos} + cte = '' + params: List = [] conditions = [] if id: - conditions.append('e.id = :id') - if form: - conditions.append('e.rowid IN' - ' (SELECT entry_rowid FROM forms WHERE form = :form)') + conditions.append('e.id = ?') + params.append(id) + if forms: + cte = f'WITH wordforms(s) AS (VALUES {_vs(forms)})' + conditions.append(''' + e.rowid IN + (SELECT entry_rowid + FROM forms + WHERE form IN wordforms + OR normalized_form IN wordforms) + '''.strip()) + params.extend(forms) if pos: - conditions.append('e.pos = :pos') + conditions.append('e.pos = ?') + params.append(pos) if lexicon_rowids: - kws = {f'lex{i}': rowid for i, rowid in enumerate(lexicon_rowids, 1)} - params.update(kws) - conditions.append(f'e.lexicon_rowid IN ({_kws(kws)})') + conditions.append(f'e.lexicon_rowid IN ({_qs(lexicon_rowids)})') + params.extend(lexicon_rowids) + condition = '' if conditions: - query_parts.append(' WHERE ' + '\n AND '.join(conditions)) + condition = 'WHERE ' + '\n AND '.join(conditions) - query_parts.append(' ORDER BY e.rowid, e.id, f.rank') + query = f''' + {cte} + SELECT DISTINCT e.lexicon_rowid, e.rowid, e.id, e.pos, + f.form, f.id, f.script, f.rowid + FROM entries AS e + JOIN forms AS f ON f.entry_rowid = e.rowid + {condition} + ORDER BY e.rowid, e.id, f.rank + ''' - query = '\n'.join(query_parts) rows: Iterator[ Tuple[int, int, str, str, str, Optional[str], Optional[str], int] ] = conn.execute(query, params) groupby = itertools.groupby for key, group in groupby(rows, lambda row: row[0:4]): lexid, rowid, id, pos = key - forms = [(row[4], row[5], row[6], row[7]) for row in group] - yield (id, pos, forms, lexid, rowid) + wordforms = [(row[4], row[5], row[6], row[7]) for row in group] + yield (id, pos, wordforms, lexid, rowid) def find_senses( - id: str = None, - form: str = None, - pos: str = None, - lexicon_rowids: Sequence[int] = None, + id: str = None, + forms: Sequence[str] = None, + pos: str = None, + lexicon_rowids: Sequence[int] = None, ) -> Iterator[_Sense]: conn = connect() - query_parts = [ - 'SELECT DISTINCT s.id, e.id, ss.id, s.lexicon_rowid, s.rowid' - ' FROM senses AS s' - ' JOIN entries AS e ON e.rowid = s.entry_rowid' - ' JOIN synsets AS ss ON ss.rowid = s.synset_rowid' - ] - - params: Dict[str, Any] = {'id': id, 'form': form, 'pos': pos} + cte = '' + params: List = [] conditions = [] if id: - conditions.append('s.id = :id') - if form: - conditions.append('s.entry_rowid IN' - ' (SELECT entry_rowid FROM forms WHERE form = :form)') + conditions.append('s.id = ?') + params.append(id) + if forms: + cte = f'WITH wordforms(s) AS (VALUES {_vs(forms)})' + conditions.append(''' + s.entry_rowid IN + (SELECT entry_rowid + FROM forms + WHERE form IN wordforms + OR normalized_form IN wordforms) + 
'''.strip()) + params.extend(forms) if pos: - conditions.append('e.pos = :pos') + conditions.append('e.pos = ?') + params.append(pos) if lexicon_rowids: - kws = {f'lex{i}': rowid for i, rowid in enumerate(lexicon_rowids, 1)} - params.update(kws) - conditions.append(f's.lexicon_rowid IN ({_kws(kws)})') + conditions.append(f's.lexicon_rowid IN ({_qs(lexicon_rowids)})') + params.extend(lexicon_rowids) + condition = '' if conditions: - query_parts.append(' WHERE ' + '\n AND '.join(conditions)) + condition = 'WHERE ' + '\n AND '.join(conditions) + + query = f''' + {cte} + SELECT DISTINCT s.id, e.id, ss.id, s.lexicon_rowid, s.rowid + FROM senses AS s + JOIN entries AS e ON e.rowid = s.entry_rowid + JOIN synsets AS ss ON ss.rowid = s.synset_rowid + {condition} + ''' - query = '\n'.join(query_parts) rows: Iterator[_Sense] = conn.execute(query, params) yield from rows def find_synsets( - id: str = None, - form: str = None, - pos: str = None, - ili: str = None, - lexicon_rowids: Sequence[int] = None, + id: str = None, + forms: Sequence[str] = None, + pos: str = None, + ili: str = None, + lexicon_rowids: Sequence[int] = None, ) -> Iterator[_Synset]: conn = connect() - query_parts = [ - 'SELECT DISTINCT ss.id, ss.pos,', - ' (SELECT ilis.id FROM ilis WHERE ilis.rowid=ss.ili_rowid),', - ' ss.lexicon_rowid, ss.rowid', - ' FROM synsets AS ss', - ] - - params: Dict[str, Any] = {'id': id, 'form': form, 'pos': pos, 'ili': ili} + cte = '' + join = '' conditions = [] + order = '' + params: List = [] if id: - conditions.append('ss.id = :id') - if form: - query_parts.extend([ - ' JOIN (SELECT _s.synset_rowid, _s.entry_rowid, _s.entry_rank', - ' FROM senses AS _s', - ' JOIN forms AS f', - ' ON f.entry_rowid = _s.entry_rowid', - ' WHERE f.form = :form) AS s', - ' ON s.synset_rowid = ss.rowid', - ]) + conditions.append('ss.id = ?') + params.append(id) + if forms: + cte = f'WITH wordforms(s) AS (VALUES {_vs(forms)})' + join = '''\ + JOIN (SELECT _s.entry_rowid, _s.synset_rowid, _s.entry_rank + FROM forms AS f + JOIN senses AS _s ON _s.entry_rowid = f.entry_rowid + WHERE f.form IN wordforms + OR f.normalized_form IN wordforms) AS s + ON s.synset_rowid = ss.rowid + '''.strip() + params.extend(forms) + order = 'ORDER BY s.entry_rowid, s.entry_rank' if pos: - conditions.append('ss.pos = :pos') + conditions.append('ss.pos = ?') + params.append(pos) if ili: - query_parts.extend([ - ' JOIN (SELECT _i.rowid FROM ilis AS _i WHERE _i.id=:ili) AS ili', - ' ON ss.ili_rowid = ili.rowid', - ]) + conditions.append( + 'ss.ili_rowid IN (SELECT ilis.rowid FROM ilis WHERE ilis.id = ?)' + ) + params.append(ili) if lexicon_rowids: - kws = {f'lex{i}': rowid for i, rowid in enumerate(lexicon_rowids, 1)} - params.update(kws) - conditions.append(f'ss.lexicon_rowid IN ({_kws(kws)})') + conditions.append(f'ss.lexicon_rowid IN ({_qs(lexicon_rowids)})') + params.extend(lexicon_rowids) + condition = '' if conditions: - query_parts.append(' WHERE ' + '\n AND '.join(conditions)) + condition = 'WHERE ' + '\n AND '.join(conditions) - if form: - query_parts.append(' ORDER BY s.entry_rowid, s.entry_rank') + query = f''' + {cte} + SELECT DISTINCT ss.id, ss.pos, + (SELECT ilis.id FROM ilis WHERE ilis.rowid=ss.ili_rowid), + ss.lexicon_rowid, ss.rowid + FROM synsets AS ss + {join} + {condition} + {order} + ''' - query = '\n'.join(query_parts) rows: Iterator[_Synset] = conn.execute(query, params) yield from rows @@ -743,4 +772,5 @@ def get_lexfile(synset_rowid: int) -> Optional[str]: def _qs(xs: Collection) -> str: return ','.join('?' 
* len(xs)) +def _vs(xs: Collection) -> str: return ','.join(['(?)'] * len(xs)) def _kws(xs: Collection) -> str: return ','.join(f':{x}' for x in xs) diff --git a/wn/_types.py b/wn/_types.py index a589f90..a2709f6 100644 --- a/wn/_types.py +++ b/wn/_types.py @@ -1,5 +1,5 @@ -from typing import Union, Mapping, Sequence, Dict, Any +from typing import Union, Callable, Mapping, Sequence, Dict, Any from pathlib import Path # For functions taking a filesystem path as a str or a pathlib.Path @@ -10,3 +10,6 @@ # User-facing metadata representation Metadata = Dict[str, Any] + +# A function that returns a normalized word form for a given word form +NormalizeFunction = Callable[[str], str] From 0818a4c3c85a65eaaab175a44fa24cd412f4e6cf Mon Sep 17 00:00:00 2001 From: Michael Wayne Goodman Date: Wed, 14 Apr 2021 17:16:39 +0800 Subject: [PATCH 09/39] Fix #19: Add implementation of Morphy Some things are not done, however: * processing of multi-word expressions, hyphenation, -ful, etc. * interfacing with wordnet queries * unit tests --- CHANGELOG.md | 2 + docs/api/wn.morphy.rst | 7 +++ docs/index.rst | 1 + wn/morphy.py | 131 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 141 insertions(+) create mode 100644 docs/api/wn.morphy.rst create mode 100644 wn/morphy.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 7b6f5a5..f6538c0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,7 @@ * Support for approximate word searches; on by default, configurable only by instantiating a `wn.Wordnet` object ([#105]) +* `wn.morphy` ([#19]) ### Changed @@ -348,6 +349,7 @@ abandoned, but this is an entirely new codebase. [#7]: https://github.com/goodmami/wn/issues/7 [#15]: https://github.com/goodmami/wn/issues/15 [#17]: https://github.com/goodmami/wn/issues/17 +[#19]: https://github.com/goodmami/wn/issues/19 [#23]: https://github.com/goodmami/wn/issues/23 [#46]: https://github.com/goodmami/wn/issues/46 [#47]: https://github.com/goodmami/wn/issues/47 diff --git a/docs/api/wn.morphy.rst b/docs/api/wn.morphy.rst new file mode 100644 index 0000000..cb0371e --- /dev/null +++ b/docs/api/wn.morphy.rst @@ -0,0 +1,7 @@ + +wn.morphy +========= + +.. automodule:: wn.morphy + +.. autoclass:: Morphy diff --git a/docs/index.rst b/docs/index.rst index 48dcc15..f4b3e19 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -58,6 +58,7 @@ Contents api/wn.rst api/wn.constants.rst api/wn.lmf.rst + api/wn.morphy.rst api/wn.project.rst api/wn.similarity.rst api/wn.util.rst diff --git a/wn/morphy.py b/wn/morphy.py new file mode 100644 index 0000000..6db4d79 --- /dev/null +++ b/wn/morphy.py @@ -0,0 +1,131 @@ + +"""An implementation of the Morphy lemmatization system for English. + +.. seealso:: + + The Princeton WordNet `documentation + `_ for the + original implementation. 
+ +""" + +from typing import Iterator, Dict, List, Tuple +import warnings + +import wn +from wn.constants import NOUN, VERB, ADJ, ADJ_SAT, ADV + +POSExceptionMap = Dict[str, List[str]] +ExceptionMap = Dict[str, POSExceptionMap] + +POS_LIST = [NOUN, VERB, ADJ, ADJ_SAT, ADV] + +DETACHMENT_RULES: Dict[str, List[Tuple[str, str]]] = { + NOUN: [ + ("s", ""), + ("ces", "x"), # added + ("ses", "s"), + ("ves", "f"), # added + ("ives", "ife"), # added + ("xes", "x"), + ("xes", "xis"), # added + ("zes", "z"), + ("ches", "ch"), + ("shes", "sh"), + ("men", "man"), + ("ies", "y"), + ], + VERB: [ + ("s", ""), + ("ies", "y"), + ("es", "e"), + ("es", ""), + ("ed", "e"), + ("ed", ""), + ("ing", "e"), + ("ing", ""), + ], + ADJ: [ + ("er", ""), + ("est", ""), + ("er", "e"), + ("est", "e"), + ], + ADV: [], +} +DETACHMENT_RULES[ADJ_SAT] = DETACHMENT_RULES[ADJ] + + +class Morphy: + """The Morphy lemmatizer class. + + Arguments: + wordnet: optional :class:`wn.Wordnet` instance + + Example: + + >>> import wn + >>> from wn.morphy import Morphy + >>> m = Morphy() + >>> list(m('axes')) + ['axes', 'axe', 'ax', 'axis'] + >>> list(m('geese')) + ['geese'] + >>> m = Morphy(wn.Wordnet('ewn:2020')) + >>> list(m('axes')) + ['axes', 'axe', 'ax', 'axis'] + >>> list(m('geese')) + ['geese', 'goose'] + """ + + def __init__(self, wordnet: wn.Wordnet = None): + self._wordnet = wordnet + if wordnet and any(lex.language != 'en' for lex in wordnet.lexicons()): + warnings.warn( + 'Morphy is not intended for use with non-English wordnets', + wn.WnWarning + ) + self._exceptions = _build_exception_map(wordnet) + + def __call__(self, form: str, pos: str = None) -> Iterator[str]: + if pos is None: + poslist = POS_LIST + elif pos not in POS_LIST: + raise wn.Error(f'unsupported or invalid part of speech: {pos}') + else: + poslist = [pos] + + seen = set() + for p in poslist: + forms = _iterforms(form, p, self._exceptions[p]) + # from Python 3.7, the following is simply: + # yield from iter(set(forms)) + for other in forms: + if other not in seen: + seen.add(other) + yield other + + +def _build_exception_map(wordnet: wn.Wordnet = None) -> ExceptionMap: + exceptions: ExceptionMap = {pos: {} for pos in POS_LIST} + if wordnet: + for word in wordnet.words(): + pos_exc = exceptions[word.pos] + lemma, *others = word.forms() + for other in others: + if other in pos_exc: + pos_exc[other].append(lemma) + else: + pos_exc[other] = [lemma] + return exceptions + + +def _iterforms(form: str, pos: str, exceptions: POSExceptionMap) -> Iterator[str]: + yield form + + rules = DETACHMENT_RULES[pos] + yield from iter(exceptions.get(form, [])) + + for suffix, repl in rules: + if form.endswith(suffix): + yield f'{form[:-len(suffix)]}{repl}' From b57454f955b476cdd6acbb5b89d2d748a7bd369d Mon Sep 17 00:00:00 2001 From: Michael Wayne Goodman Date: Mon, 19 Apr 2021 14:31:58 +0800 Subject: [PATCH 10/39] Rename 'normalize' to 'normalizer'; tweak behavior Now setting `normalizer=None` will cause searches to not even consult the `normalized_form` column, and also the form is only normalized if the search with the original form returned no results. 
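
Roughly, the intended lookup flow is the following (a simplified sketch, not
the actual _find_helper code in the diff below):

from typing import Callable, List, Optional

def lookup(form: str,
           query: Callable[..., List],
           normalizer: Optional[Callable[[str], str]]) -> List:
    # pass 1: the form as given; normalized columns are only consulted
    # when a normalizer is configured at all
    results = query(forms=[form], normalized=bool(normalizer))
    if not results and normalizer:
        # pass 2: only on a miss, retry with the normalized form
        results = query(forms=[normalizer(form)], normalized=True)
    return results
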
--- CHANGELOG.md | 2 +- wn/_core.py | 88 +++++++++++++++++++++++++++++++------------------- wn/_queries.py | 29 +++++++++-------- 3 files changed, 71 insertions(+), 48 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f6538c0..ee2b09f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,7 +19,7 @@ - The `update()` method now takes a `force` parameter to trigger a refresh regardless of the refresh interval * `wn.Wordnet` - - Initialization now takes a `normalize` parameter ([#105]) + - Initialization now takes a `normalizer` parameter ([#105]) - `Wordnet.words()`, `Wordnet.senses()` and `Wordnet.synsets()` now use a specified normalization function to expand queries on word forms ([#105]) diff --git a/wn/_core.py b/wn/_core.py index d88aafc..c6b195c 100644 --- a/wn/_core.py +++ b/wn/_core.py @@ -1,5 +1,7 @@ -from typing import TypeVar, Optional, List, Tuple, Dict, Set, Iterator +from typing import ( + Type, TypeVar, Callable, Optional, List, Tuple, Dict, Set, Iterator +) import warnings import wn @@ -1022,6 +1024,10 @@ def translate(self, lexicon: str = None, *, lang: str = None) -> List['Sense']: for t_sense in t_synset.senses()] +# Useful for factory functions of Word, Sense, or Synset +C = TypeVar('C', Word, Sense, Synset) + + class Wordnet: """Class for interacting with wordnet data. @@ -1041,12 +1047,12 @@ class Wordnet: second space-separated list of lexicon specifiers which are used for traversing relations, but not as the results of queries. - The *normalize* argument takes a function that normalizes word + The *normalizer* argument takes a function that normalizes word forms in order to expand the search. The default function downcases the word and removes diacritics via NFKD_ normalization so that, for example, searching for *san josé* in the English - WordNet will find the entry for *San Jose*. Setting *normalize* to - :python:`None` disables normalization and forces exact-match + WordNet will find the entry for *San Jose*. Setting *normalizer* + to :python:`None` disables normalization and forces exact-match searching. .. _NFKD: https://en.wikipedia.org/wiki/Unicode_equivalence#Normal_forms @@ -1054,7 +1060,7 @@ class Wordnet: """ __slots__ = ('_lexicons', '_lexicon_ids', '_expanded', '_expanded_ids', - '_default_mode', '_normalize') + '_default_mode', '_normalizer') __module__ = 'wn' def __init__( @@ -1063,7 +1069,7 @@ def __init__( *, lang: str = None, expand: str = None, - normalize: Optional[NormalizeFunction] = normalize_form, + normalizer: Optional[NormalizeFunction] = normalize_form, ): # default mode means any lexicon is searched or expanded upon, # but relation traversals only target the source's lexicon @@ -1098,7 +1104,7 @@ def __init__( self._expanded = tuple(map(_to_lexicon, find_lexicons(lexicon=expand))) self._expanded_ids: Tuple[int, ...] = tuple(lx._id for lx in self._expanded) - self._normalize = normalize + self._normalizer = normalizer def lexicons(self): """Return the list of lexicons covered by this wordnet.""" @@ -1125,11 +1131,7 @@ def words(self, form: str = None, pos: str = None) -> List[Word]: restricts words by their part of speech. 
""" - forms = _expand_form(form, self._normalize) - iterable = find_entries( - forms=forms, pos=pos, lexicon_rowids=self._lexicon_ids - ) - return [Word(*word_data, self) for word_data in iterable] + return self._find_helper(Word, find_entries, form, pos) def synset(self, id: str) -> Synset: """Return the first synset in this wordnet with identifier *id*.""" @@ -1153,11 +1155,7 @@ def synsets( select a unique synset within a single lexicon. """ - forms = _expand_form(form, self._normalize) - iterable = find_synsets( - forms=forms, pos=pos, ili=ili, lexicon_rowids=self._lexicon_ids, - ) - return [Synset(*synset_data, self) for synset_data in iterable] + return self._find_helper(Synset, find_synsets, form, pos, ili=ili) def sense(self, id: str) -> Sense: """Return the first sense in this wordnet with identifier *id*.""" @@ -1176,11 +1174,7 @@ def senses(self, form: str = None, pos: str = None) -> List[Sense]: *pos* restricts senses by their word's part of speech. """ - forms = _expand_form(form, self._normalize) - iterable = find_senses( - forms=forms, pos=pos, lexicon_rowids=self._lexicon_ids - ) - return [Sense(*sense_data, self) for sense_data in iterable] + return self._find_helper(Sense, find_senses, form, pos) def ili(self, id: str) -> ILI: """Return the first ILI in this wordnet with identifer *id*.""" @@ -1194,6 +1188,44 @@ def ilis(self, status: str = None) -> List[ILI]: iterable = find_ilis(status=status, lexicon_rowids=self._lexicon_ids) return [ILI(*ili_data) for ili_data in iterable] + def _find_helper( + self, + cls: Type[C], + query_func: Callable, + form: Optional[str], + pos: Optional[str], + ili: str = None + ) -> List[C]: + """Return the list of matching wordnet entities. + + If the wordnet has a normalizer and the search includes a word + form, the original word form is searched against both the + original and normalized columns in the database. Then, if no + results are found, the search is repeated with the normalized + form. If the wordnet does not have a normalizer, only exact + string matches are used. + + """ + normalize = self._normalizer + forms = [form] if form else None + kwargs = {'pos': pos, 'lexicon_rowids': self._lexicon_ids} + + if ili is not None: + kwargs['ili'] = ili + results = [cls(*data, self) # type: ignore + for data + in query_func(forms=forms, normalized=bool(normalize), **kwargs)] + + if not results and forms and normalize: + normforms = [normalize(f) for f in forms] + results.extend( + cls(*data, self) # type: ignore + for data + in query_func(forms=normforms, normalized=True, **kwargs) + ) + + return results + def _to_lexicon(data) -> Lexicon: rowid, id, label, language, email, license, version, url, citation, logo = data @@ -1211,18 +1243,6 @@ def _to_lexicon(data) -> Lexicon: ) -def _expand_form( - form: Optional[str], - normalize: Optional[NormalizeFunction] -) -> Optional[List[str]]: - if form is None: - return None - forms = [form] - if normalize: - forms.extend([normalize(f) for f in forms]) - return forms - - def projects() -> List[Dict]: """Return the list of indexed projects. 
diff --git a/wn/_queries.py b/wn/_queries.py index 80dc508..17f02f5 100644 --- a/wn/_queries.py +++ b/wn/_queries.py @@ -280,10 +280,11 @@ def find_proposed_ilis( def find_entries( - id: str = None, - forms: Sequence[str] = None, - pos: str = None, - lexicon_rowids: Sequence[int] = None, + id: str = None, + forms: Sequence[str] = None, + pos: str = None, + lexicon_rowids: Sequence[int] = None, + normalized: bool = False, ) -> Iterator[_Word]: conn = connect() cte = '' @@ -294,12 +295,12 @@ def find_entries( params.append(id) if forms: cte = f'WITH wordforms(s) AS (VALUES {_vs(forms)})' - conditions.append(''' + or_norm = 'OR normalized_form IN wordforms' if normalized else '' + conditions.append(f''' e.rowid IN (SELECT entry_rowid FROM forms - WHERE form IN wordforms - OR normalized_form IN wordforms) + WHERE form IN wordforms {or_norm}) '''.strip()) params.extend(forms) if pos: @@ -338,6 +339,7 @@ def find_senses( forms: Sequence[str] = None, pos: str = None, lexicon_rowids: Sequence[int] = None, + normalized: bool = False, ) -> Iterator[_Sense]: conn = connect() cte = '' @@ -348,12 +350,12 @@ def find_senses( params.append(id) if forms: cte = f'WITH wordforms(s) AS (VALUES {_vs(forms)})' - conditions.append(''' + or_norm = 'OR normalized_form IN wordforms' if normalized else '' + conditions.append(f''' s.entry_rowid IN (SELECT entry_rowid FROM forms - WHERE form IN wordforms - OR normalized_form IN wordforms) + WHERE form IN wordforms {or_norm}) '''.strip()) params.extend(forms) if pos: @@ -386,6 +388,7 @@ def find_synsets( pos: str = None, ili: str = None, lexicon_rowids: Sequence[int] = None, + normalized: bool = False, ) -> Iterator[_Synset]: conn = connect() cte = '' @@ -398,12 +401,12 @@ def find_synsets( params.append(id) if forms: cte = f'WITH wordforms(s) AS (VALUES {_vs(forms)})' - join = '''\ + or_norm = 'OR normalized_form IN wordforms' if normalized else '' + join = f'''\ JOIN (SELECT _s.entry_rowid, _s.synset_rowid, _s.entry_rank FROM forms AS f JOIN senses AS _s ON _s.entry_rowid = f.entry_rowid - WHERE f.form IN wordforms - OR f.normalized_form IN wordforms) AS s + WHERE f.form IN wordforms {or_norm}) AS s ON s.synset_rowid = ss.rowid '''.strip() params.extend(forms) From 4ce4459b61ae5ed649ab3c3903b409747c8ee9fe Mon Sep 17 00:00:00 2001 From: Michael Wayne Goodman Date: Thu, 22 Apr 2021 17:28:38 +0800 Subject: [PATCH 11/39] Fix #8: Support for lemmatizers on Wordnet objects --- CHANGELOG.md | 8 ++- docs/api/wn.morphy.rst | 28 +++++++++ docs/api/wn.rst | 6 ++ wn/__init__.py | 2 + wn/_core.py | 73 ++++++++++++++++++++++- wn/_queries.py | 12 +++- wn/morphy.py | 132 ++++++++++++++++++++++++++--------------- 7 files changed, 204 insertions(+), 57 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ee2b09f..6f0eac0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ * Support for approximate word searches; on by default, configurable only by instantiating a `wn.Wordnet` object ([#105]) * `wn.morphy` ([#19]) +* `wn.Wordnet.lemmatizer` attribute ([#8]) +* `wn.Lemmatizer` class ([#8]) ### Changed @@ -20,9 +22,10 @@ refresh regardless of the refresh interval * `wn.Wordnet` - Initialization now takes a `normalizer` parameter ([#105]) + - Initialization now takes a `lemmatizer_class` parameter ([#8]) - `Wordnet.words()`, `Wordnet.senses()` and `Wordnet.synsets()` now - use a specified normalization function to expand queries on word - forms ([#105]) + use any specified `wn.Lemmatizer` instance or normalization + function to expand queries on word forms ([#105]) ## 
[v0.6.2] @@ -347,6 +350,7 @@ abandoned, but this is an entirely new codebase. [unreleased]: ../../tree/main [#7]: https://github.com/goodmami/wn/issues/7 +[#8]: https://github.com/goodmami/wn/issues/8 [#15]: https://github.com/goodmami/wn/issues/15 [#17]: https://github.com/goodmami/wn/issues/17 [#19]: https://github.com/goodmami/wn/issues/19 diff --git a/docs/api/wn.morphy.rst b/docs/api/wn.morphy.rst index cb0371e..f4a3b65 100644 --- a/docs/api/wn.morphy.rst +++ b/docs/api/wn.morphy.rst @@ -4,4 +4,32 @@ wn.morphy .. automodule:: wn.morphy +System Flags +------------ + +The following flags may be passed to the ``system`` parameter of +:class:`Morphy` to adjust the patterns and behaviors it uses. Note +that in order to use these flags, the Morphy instance must be assigned +to the :class:`wn.Wordnet` instances after initialization: + +>>> import wn +>>> from wn import morphy +>>> pwn = wn.Wordnet("pwn:3.0") +>>> pwn.lemmatizer = morphy.Morphy(pwn, system=morphy.NLTK) + +.. data:: PWN + + Use the behavior implemented in the Princeton WordNet. + +.. data:: NLTK + + Use the behavior implemented in the NLTK. + +.. data:: WN + + Use the behavior created for Wn. + +The Morphy Class +---------------- + .. autoclass:: Morphy diff --git a/docs/api/wn.rst b/docs/api/wn.rst index 242c5c1..a11d988 100644 --- a/docs/api/wn.rst +++ b/docs/api/wn.rst @@ -278,6 +278,12 @@ The Lexicon Class .. automethod:: extensions +The Lemmatizer Class +-------------------- + +.. autoclass:: Lemmatizer + + The wn.config Object -------------------- diff --git a/wn/__init__.py b/wn/__init__.py index 2fe67e7..59569dd 100644 --- a/wn/__init__.py +++ b/wn/__init__.py @@ -29,6 +29,7 @@ 'ili', 'ilis', 'ILI', + 'Lemmatizer', 'Error', 'DatabaseError', 'WnWarning', @@ -46,6 +47,7 @@ sense, senses, Sense, Count, synset, synsets, Synset, ili, ilis, ILI, + Lemmatizer, Wordnet ) diff --git a/wn/_core.py b/wn/_core.py index c6b195c..2cbff77 100644 --- a/wn/_core.py +++ b/wn/_core.py @@ -1024,11 +1024,48 @@ def translate(self, lexicon: str = None, *, lang: str = None) -> List['Sense']: for t_sense in t_synset.senses()] +class Lemmatizer: + """Class for expanding queries over word forms. + + This class, while intended for morphological lemmatization, is + more broadly construed as a tool for expanding word forms so they + better match the lemmas in a lexicon. This particular class serves + as the default lemmatizer for any wordnet, but users are expected + to use a subclass of this one for any custom behavior, such as + that provided by :class:`wn.morphy.Morphy`. + + The default behavior provided by this class does two things: (1) + yields the original word form unchanged, and (2) declares that + the word form may be searched against non-lemmatic forms in the + lexicon as well. + + Arguments: + wordnet: An instance of a :class:`Wordnet` + Attributes: + search_all_forms: When :python:`False`, word forms are only + searched against lemmas in the database, otherwise they + are searched against all word forms. + + """ + + __slots__ = '_wordnet', + __module__ = 'wn' + + search_all_forms = True + + def __init__(self, wordnet: 'Wordnet'): + self._wordnet = wordnet + + def __call__(self, form: str, pos: str = None) -> Iterator[str]: + yield form + + # Useful for factory functions of Word, Sense, or Synset C = TypeVar('C', Word, Sense, Synset) class Wordnet: + """Class for interacting with wordnet data. 
A wordnet object acts essentially as a filter by first selecting @@ -1055,12 +1092,29 @@ class Wordnet: to :python:`None` disables normalization and forces exact-match searching. + The *lemmatizer_class* argument may be :python:`None`, which + disables lemmatizer-based query expansion, or a subclass of + :class:`Lemmatizer` which is instantiated with the + :class:`Wordnet` object as its argument. During the lemmatizer's + instantiation, the :attr:`Wordnet.lemmatizer` attribute will be + :python:`None`. That is, the relevant initialization code is + equivalent to: + + .. code-block:: python + + self.lemmatizer = None + if lemmatizer_class: + self.lemmatizer = lemmatizer_class(self) + .. _NFKD: https://en.wikipedia.org/wiki/Unicode_equivalence#Normal_forms + Attributes: + lemmatizer: A :class:`Lemmatizer` instance or :python:`None` + """ __slots__ = ('_lexicons', '_lexicon_ids', '_expanded', '_expanded_ids', - '_default_mode', '_normalizer') + '_default_mode', '_normalizer', 'lemmatizer') __module__ = 'wn' def __init__( @@ -1070,6 +1124,7 @@ def __init__( lang: str = None, expand: str = None, normalizer: Optional[NormalizeFunction] = normalize_form, + lemmatizer_class: Optional[Type[Lemmatizer]] = Lemmatizer, ): # default mode means any lexicon is searched or expanded upon, # but relation traversals only target the source's lexicon @@ -1105,6 +1160,9 @@ def __init__( self._expanded_ids: Tuple[int, ...] = tuple(lx._id for lx in self._expanded) self._normalizer = normalizer + self.lemmatizer = None # needs to be initialized before Lemmatizer + if lemmatizer_class: + self.lemmatizer = lemmatizer_class(self) def lexicons(self): """Return the list of lexicons covered by this wordnet.""" @@ -1207,8 +1265,17 @@ def _find_helper( """ normalize = self._normalizer - forms = [form] if form else None - kwargs = {'pos': pos, 'lexicon_rowids': self._lexicon_ids} + lemmatizer = self.lemmatizer + forms: Optional[List[str]] = None + if form: + forms = list(lemmatizer(form, pos)) if lemmatizer else [form] + if not forms: + return [] # lemmatizer found nothing + kwargs = { + 'pos': pos, + 'lexicon_rowids': self._lexicon_ids, + 'search_all_forms': getattr(lemmatizer, 'search_all_forms', False), + } if ili is not None: kwargs['ili'] = ili diff --git a/wn/_queries.py b/wn/_queries.py index 17f02f5..87a8d3c 100644 --- a/wn/_queries.py +++ b/wn/_queries.py @@ -285,6 +285,7 @@ def find_entries( pos: str = None, lexicon_rowids: Sequence[int] = None, normalized: bool = False, + search_all_forms: bool = False, ) -> Iterator[_Word]: conn = connect() cte = '' @@ -296,11 +297,12 @@ def find_entries( if forms: cte = f'WITH wordforms(s) AS (VALUES {_vs(forms)})' or_norm = 'OR normalized_form IN wordforms' if normalized else '' + and_rank = '' if search_all_forms else 'AND rank = 0' conditions.append(f''' e.rowid IN (SELECT entry_rowid FROM forms - WHERE form IN wordforms {or_norm}) + WHERE (form IN wordforms {or_norm}) {and_rank}) '''.strip()) params.extend(forms) if pos: @@ -340,6 +342,7 @@ def find_senses( pos: str = None, lexicon_rowids: Sequence[int] = None, normalized: bool = False, + search_all_forms: bool = False, ) -> Iterator[_Sense]: conn = connect() cte = '' @@ -351,11 +354,12 @@ def find_senses( if forms: cte = f'WITH wordforms(s) AS (VALUES {_vs(forms)})' or_norm = 'OR normalized_form IN wordforms' if normalized else '' + and_rank = '' if search_all_forms else 'AND rank = 0' conditions.append(f''' s.entry_rowid IN (SELECT entry_rowid FROM forms - WHERE form IN wordforms {or_norm}) + WHERE (form IN wordforms 
{or_norm}) {and_rank}) '''.strip()) params.extend(forms) if pos: @@ -389,6 +393,7 @@ def find_synsets( ili: str = None, lexicon_rowids: Sequence[int] = None, normalized: bool = False, + search_all_forms: bool = False, ) -> Iterator[_Synset]: conn = connect() cte = '' @@ -402,11 +407,12 @@ def find_synsets( if forms: cte = f'WITH wordforms(s) AS (VALUES {_vs(forms)})' or_norm = 'OR normalized_form IN wordforms' if normalized else '' + and_rank = '' if search_all_forms else 'AND rank = 0' join = f'''\ JOIN (SELECT _s.entry_rowid, _s.synset_rowid, _s.entry_rank FROM forms AS f JOIN senses AS _s ON _s.entry_rowid = f.entry_rowid - WHERE f.form IN wordforms {or_norm}) AS s + WHERE (f.form IN wordforms {or_norm}) {and_rank}) AS s ON s.synset_rowid = ss.rowid '''.strip() params.extend(forms) diff --git a/wn/morphy.py b/wn/morphy.py index 6db4d79..20bd3bd 100644 --- a/wn/morphy.py +++ b/wn/morphy.py @@ -9,54 +9,76 @@ """ -from typing import Iterator, Dict, List, Tuple +from typing import Iterator, Dict, Set, List, Tuple import warnings +from enum import Flag, auto import wn from wn.constants import NOUN, VERB, ADJ, ADJ_SAT, ADV -POSExceptionMap = Dict[str, List[str]] +POSExceptionMap = Dict[str, Set[str]] ExceptionMap = Dict[str, POSExceptionMap] POS_LIST = [NOUN, VERB, ADJ, ADJ_SAT, ADV] +ALL_LEMMAS = '' # assumption: no alternative form should be the empty string -DETACHMENT_RULES: Dict[str, List[Tuple[str, str]]] = { + +class System(Flag): + """Flags to track suffix rules in various implementations of Morphy. + + These are available at the module level, as well (e.g., `morphy.PWN`). + """ + PWN = auto() + NLTK = auto() + WN = auto() + ALL = PWN | NLTK | WN + + +PWN = System.PWN +NLTK = System.NLTK +WN = System.WN +_ALL = System.ALL + + +Rule = Tuple[str, str, System] + +DETACHMENT_RULES: Dict[str, List[Rule]] = { NOUN: [ - ("s", ""), - ("ces", "x"), # added - ("ses", "s"), - ("ves", "f"), # added - ("ives", "ife"), # added - ("xes", "x"), - ("xes", "xis"), # added - ("zes", "z"), - ("ches", "ch"), - ("shes", "sh"), - ("men", "man"), - ("ies", "y"), + ("s", "", _ALL), + ("ces", "x", WN), + ("ses", "s", _ALL), + ("ves", "f", NLTK | WN), + ("ives", "ife", WN), + ("xes", "x", _ALL), + ("xes", "xis", WN), + ("zes", "z", _ALL), + ("ches", "ch", _ALL), + ("shes", "sh", _ALL), + ("men", "man", _ALL), + ("ies", "y", _ALL), ], VERB: [ - ("s", ""), - ("ies", "y"), - ("es", "e"), - ("es", ""), - ("ed", "e"), - ("ed", ""), - ("ing", "e"), - ("ing", ""), + ("s", "", _ALL), + ("ies", "y", _ALL), + ("es", "e", _ALL), + ("es", "", _ALL), + ("ed", "e", _ALL), + ("ed", "", _ALL), + ("ing", "e", _ALL), + ("ing", "", _ALL), ], ADJ: [ - ("er", ""), - ("est", ""), - ("er", "e"), - ("est", "e"), + ("er", "", _ALL), + ("est", "", _ALL), + ("er", "e", _ALL), + ("est", "e", _ALL), ], ADV: [], } DETACHMENT_RULES[ADJ_SAT] = DETACHMENT_RULES[ADJ] -class Morphy: +class Morphy(wn.Lemmatizer): """The Morphy lemmatizer class. 
Arguments: @@ -66,25 +88,26 @@ class Morphy: >>> import wn >>> from wn.morphy import Morphy - >>> m = Morphy() - >>> list(m('axes')) - ['axes', 'axe', 'ax', 'axis'] - >>> list(m('geese')) - ['geese'] - >>> m = Morphy(wn.Wordnet('ewn:2020')) + >>> ewn = wn.Wordnet('ewn:2020') + >>> m = Morphy(ewn) >>> list(m('axes')) - ['axes', 'axe', 'ax', 'axis'] + ['axe', 'ax', 'axis'] >>> list(m('geese')) - ['geese', 'goose'] + ['goose'] """ - def __init__(self, wordnet: wn.Wordnet = None): - self._wordnet = wordnet - if wordnet and any(lex.language != 'en' for lex in wordnet.lexicons()): + search_all_forms = False + + def __init__(self, wordnet: wn.Wordnet, system: System = WN): + if any(lex.language != 'en' for lex in wordnet.lexicons()): warnings.warn( 'Morphy is not intended for use with non-English wordnets', wn.WnWarning ) + self._wordnet = wordnet + self._system = system + self._rules = {pos: [rule for rule in rules if rule[2] & system] + for pos, rules in DETACHMENT_RULES.items()} self._exceptions = _build_exception_map(wordnet) def __call__(self, form: str, pos: str = None) -> Iterator[str]: @@ -97,7 +120,7 @@ def __call__(self, form: str, pos: str = None) -> Iterator[str]: seen = set() for p in poslist: - forms = _iterforms(form, p, self._exceptions[p]) + forms = _iterforms(form, self._rules[p], self._exceptions[p]) # from Python 3.7, the following is simply: # yield from iter(set(forms)) for other in forms: @@ -106,26 +129,37 @@ def __call__(self, form: str, pos: str = None) -> Iterator[str]: yield other -def _build_exception_map(wordnet: wn.Wordnet = None) -> ExceptionMap: - exceptions: ExceptionMap = {pos: {} for pos in POS_LIST} +def _build_exception_map(wordnet: wn.Wordnet) -> ExceptionMap: + exceptions: ExceptionMap = {pos: {ALL_LEMMAS: set()} for pos in POS_LIST} if wordnet: for word in wordnet.words(): pos_exc = exceptions[word.pos] lemma, *others = word.forms() + # store every lemma whether it has other forms or not + pos_exc[ALL_LEMMAS].add(lemma) + # those with other forms map to the original lemmas for other in others: if other in pos_exc: - pos_exc[other].append(lemma) + pos_exc[other].add(lemma) else: - pos_exc[other] = [lemma] + pos_exc[other] = {lemma} return exceptions -def _iterforms(form: str, pos: str, exceptions: POSExceptionMap) -> Iterator[str]: - yield form +def _iterforms( + form: str, + rules: List[Rule], + exceptions: POSExceptionMap +) -> Iterator[str]: + pos_lemmas = exceptions[ALL_LEMMAS] + + if form in pos_lemmas: + yield form - rules = DETACHMENT_RULES[pos] yield from iter(exceptions.get(form, [])) - for suffix, repl in rules: + for suffix, repl, _ in rules: if form.endswith(suffix): - yield f'{form[:-len(suffix)]}{repl}' + _form = f'{form[:-len(suffix)]}{repl}' + if _form in pos_lemmas: + yield _form From eec1ff6357c3da3e7ba0657af4b0da81a5852985 Mon Sep 17 00:00:00 2001 From: Michael Wayne Goodman Date: Fri, 23 Apr 2021 10:36:30 +0800 Subject: [PATCH 12/39] Avoid lemmatizing with full suppletion (ies -> y) --- wn/morphy.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/wn/morphy.py b/wn/morphy.py index 20bd3bd..ad34c5b 100644 --- a/wn/morphy.py +++ b/wn/morphy.py @@ -159,7 +159,8 @@ def _iterforms( yield from iter(exceptions.get(form, [])) for suffix, repl, _ in rules: - if form.endswith(suffix): + # avoid applying rules that perform full suppletion + if form.endswith(suffix) and len(suffix) < len(form): _form = f'{form[:-len(suffix)]}{repl}' if _form in pos_lemmas: yield _form From e2de18e696dc9a6e57930c52f36ed70f2db1ce34 Mon Sep 17 
00:00:00 2001 From: Michael Wayne Goodman Date: Tue, 27 Apr 2021 15:42:20 +0800 Subject: [PATCH 13/39] Tighten up wn.Lemmatizer and Morphy a bit --- wn/_core.py | 4 +- wn/morphy.py | 106 +++++++++++++++++++++++++-------------------------- 2 files changed, 54 insertions(+), 56 deletions(-) diff --git a/wn/_core.py b/wn/_core.py index 2cbff77..db1f2f7 100644 --- a/wn/_core.py +++ b/wn/_core.py @@ -6,6 +6,7 @@ import wn from wn._types import Metadata, NormalizeFunction +from wn.constants import PARTS_OF_SPEECH from wn._util import flatten, normalize_form from wn._db import NON_ROWID from wn._queries import ( @@ -1052,11 +1053,12 @@ class Lemmatizer: __module__ = 'wn' search_all_forms = True + parts_of_speech: Collection[str] = PARTS_OF_SPEECH def __init__(self, wordnet: 'Wordnet'): self._wordnet = wordnet - def __call__(self, form: str, pos: str = None) -> Iterator[str]: + def __call__(self, form: str, pos: str) -> Iterator[str]: yield form diff --git a/wn/morphy.py b/wn/morphy.py index ad34c5b..c5052bb 100644 --- a/wn/morphy.py +++ b/wn/morphy.py @@ -19,9 +19,6 @@ POSExceptionMap = Dict[str, Set[str]] ExceptionMap = Dict[str, POSExceptionMap] -POS_LIST = [NOUN, VERB, ADJ, ADJ_SAT, ADV] -ALL_LEMMAS = '' # assumption: no alternative form should be the empty string - class System(Flag): """Flags to track suffix rules in various implementations of Morphy. @@ -87,16 +84,18 @@ class Morphy(wn.Lemmatizer): Example: >>> import wn + >>> from wn.constants import NOUN >>> from wn.morphy import Morphy >>> ewn = wn.Wordnet('ewn:2020') >>> m = Morphy(ewn) - >>> list(m('axes')) + >>> list(m('axes', NOUN)) ['axe', 'ax', 'axis'] - >>> list(m('geese')) + >>> list(m('geese', NOUN)) ['goose'] """ search_all_forms = False + parts_of_speech = {NOUN, VERB, ADJ, ADJ_SAT, ADV} def __init__(self, wordnet: wn.Wordnet, system: System = WN): if any(lex.language != 'en' for lex in wordnet.lexicons()): @@ -106,61 +105,58 @@ def __init__(self, wordnet: wn.Wordnet, system: System = WN): ) self._wordnet = wordnet self._system = system - self._rules = {pos: [rule for rule in rules if rule[2] & system] - for pos, rules in DETACHMENT_RULES.items()} - self._exceptions = _build_exception_map(wordnet) - - def __call__(self, form: str, pos: str = None) -> Iterator[str]: - if pos is None: - poslist = POS_LIST - elif pos not in POS_LIST: - raise wn.Error(f'unsupported or invalid part of speech: {pos}') - else: - poslist = [pos] - - seen = set() - for p in poslist: - forms = _iterforms(form, self._rules[p], self._exceptions[p]) - # from Python 3.7, the following is simply: - # yield from iter(set(forms)) - for other in forms: - if other not in seen: - seen.add(other) - yield other - - -def _build_exception_map(wordnet: wn.Wordnet) -> ExceptionMap: - exceptions: ExceptionMap = {pos: {ALL_LEMMAS: set()} for pos in POS_LIST} - if wordnet: - for word in wordnet.words(): - pos_exc = exceptions[word.pos] + self._rules = { + pos: [rule for rule in rules if rule[2] & system] + for pos, rules in DETACHMENT_RULES.items() + } + self._exceptions: ExceptionMap = { + pos: {} for pos in self.parts_of_speech + } + self._all_lemmas: Dict[str, Set[str]] = { + pos: set() for pos in self.parts_of_speech + } + self._build() + + def __call__(self, form: str, pos: str) -> Iterator[str]: + if pos not in self.parts_of_speech: + return + + exceptions = self._exceptions[pos] + rules = self._rules[pos] + pos_lemmas = self._all_lemmas[pos] + + # original lemma + if form in pos_lemmas: + yield form + + seen = set() # don't yield the same form more than once per 
pos + + # lemmas from exceptions + for _form in exceptions.get(form, []): + seen.add(_form) + yield _form + + # lemmas from morphological detachment + for suffix, repl, _ in rules: + # avoid applying rules that perform full suppletion + if form.endswith(suffix) and len(suffix) < len(form): + _form = f'{form[:-len(suffix)]}{repl}' + if _form in pos_lemmas and _form not in seen: + seen.add(_form) + yield _form + + def _build(self) -> None: + exceptions = self._exceptions + all_lemmas = self._all_lemmas + for word in self._wordnet.words(): + pos = word.pos + pos_exc = exceptions[pos] lemma, *others = word.forms() # store every lemma whether it has other forms or not - pos_exc[ALL_LEMMAS].add(lemma) + all_lemmas[pos].add(lemma) # those with other forms map to the original lemmas for other in others: if other in pos_exc: pos_exc[other].add(lemma) else: pos_exc[other] = {lemma} - return exceptions - - -def _iterforms( - form: str, - rules: List[Rule], - exceptions: POSExceptionMap -) -> Iterator[str]: - pos_lemmas = exceptions[ALL_LEMMAS] - - if form in pos_lemmas: - yield form - - yield from iter(exceptions.get(form, [])) - - for suffix, repl, _ in rules: - # avoid applying rules that perform full suppletion - if form.endswith(suffix) and len(suffix) < len(form): - _form = f'{form[:-len(suffix)]}{repl}' - if _form in pos_lemmas: - yield _form From 35f2cee4fda5d68402dbc4a3c6b9ed880ec256ba Mon Sep 17 00:00:00 2001 From: Michael Wayne Goodman Date: Tue, 27 Apr 2021 15:44:29 +0800 Subject: [PATCH 14/39] Add LemmatizerInstance type; tidy up Wordnet class --- wn/_core.py | 143 ++++++++++++++++++++++++++++++++------------------- wn/_types.py | 6 ++- 2 files changed, 95 insertions(+), 54 deletions(-) diff --git a/wn/_core.py b/wn/_core.py index db1f2f7..7f90383 100644 --- a/wn/_core.py +++ b/wn/_core.py @@ -1,11 +1,11 @@ from typing import ( - Type, TypeVar, Callable, Optional, List, Tuple, Dict, Set, Iterator + Type, TypeVar, Callable, Optional, List, Tuple, Dict, Set, Iterator, Collection ) import warnings import wn -from wn._types import Metadata, NormalizeFunction +from wn._types import Metadata, NormalizeFunction, LemmatizerInstance from wn.constants import PARTS_OF_SPEECH from wn._util import flatten, normalize_form from wn._db import NON_ROWID @@ -1084,7 +1084,9 @@ class Wordnet: wordnet, namely the Princeton WordNet, and then relying on the larger wordnet for structural relations. An *expand* argument is a second space-separated list of lexicon specifiers which are used - for traversing relations, but not as the results of queries. + for traversing relations, but not as the results of + queries. Setting *expand* to an empty string (:python:`expand=''`) + disables expand lexicons. The *normalizer* argument takes a function that normalizes word forms in order to expand the search. The default function @@ -1162,7 +1164,8 @@ def __init__( self._expanded_ids: Tuple[int, ...] = tuple(lx._id for lx in self._expanded) self._normalizer = normalizer - self.lemmatizer = None # needs to be initialized before Lemmatizer + + self.lemmatizer: Optional[LemmatizerInstance] = None if lemmatizer_class: self.lemmatizer = lemmatizer_class(self) @@ -1191,7 +1194,7 @@ def words(self, form: str = None, pos: str = None) -> List[Word]: restricts words by their part of speech. 
""" - return self._find_helper(Word, find_entries, form, pos) + return _find_helper(self, Word, find_entries, form, pos) def synset(self, id: str) -> Synset: """Return the first synset in this wordnet with identifier *id*.""" @@ -1215,7 +1218,7 @@ def synsets( select a unique synset within a single lexicon. """ - return self._find_helper(Synset, find_synsets, form, pos, ili=ili) + return _find_helper(self, Synset, find_synsets, form, pos, ili=ili) def sense(self, id: str) -> Sense: """Return the first sense in this wordnet with identifier *id*.""" @@ -1234,7 +1237,7 @@ def senses(self, form: str = None, pos: str = None) -> List[Sense]: *pos* restricts senses by their word's part of speech. """ - return self._find_helper(Sense, find_senses, form, pos) + return _find_helper(self, Sense, find_senses, form, pos) def ili(self, id: str) -> ILI: """Return the first ILI in this wordnet with identifer *id*.""" @@ -1245,55 +1248,13 @@ def ili(self, id: str) -> ILI: raise wn.Error(f'no such ILI: {id}') def ilis(self, status: str = None) -> List[ILI]: - iterable = find_ilis(status=status, lexicon_rowids=self._lexicon_ids) - return [ILI(*ili_data) for ili_data in iterable] + """Return the list of ILIs in this wordnet. - def _find_helper( - self, - cls: Type[C], - query_func: Callable, - form: Optional[str], - pos: Optional[str], - ili: str = None - ) -> List[C]: - """Return the list of matching wordnet entities. - - If the wordnet has a normalizer and the search includes a word - form, the original word form is searched against both the - original and normalized columns in the database. Then, if no - results are found, the search is repeated with the normalized - form. If the wordnet does not have a normalizer, only exact - string matches are used. + If *status* is given, only return ILIs with a matching status. """ - normalize = self._normalizer - lemmatizer = self.lemmatizer - forms: Optional[List[str]] = None - if form: - forms = list(lemmatizer(form, pos)) if lemmatizer else [form] - if not forms: - return [] # lemmatizer found nothing - kwargs = { - 'pos': pos, - 'lexicon_rowids': self._lexicon_ids, - 'search_all_forms': getattr(lemmatizer, 'search_all_forms', False), - } - - if ili is not None: - kwargs['ili'] = ili - results = [cls(*data, self) # type: ignore - for data - in query_func(forms=forms, normalized=bool(normalize), **kwargs)] - - if not results and forms and normalize: - normforms = [normalize(f) for f in forms] - results.extend( - cls(*data, self) # type: ignore - for data - in query_func(forms=normforms, normalized=True, **kwargs) - ) - - return results + iterable = find_ilis(status=status, lexicon_rowids=self._lexicon_ids) + return [ILI(*ili_data) for ili_data in iterable] def _to_lexicon(data) -> Lexicon: @@ -1312,6 +1273,82 @@ def _to_lexicon(data) -> Lexicon: ) +def _find_helper( + w: Wordnet, + cls: Type[C], + query_func: Callable, + form: Optional[str], + pos: Optional[str], + ili: str = None +) -> List[C]: + """Return the list of matching wordnet entities. + + If the wordnet has a normalizer and the search includes a word + form, the original word form is searched against both the + original and normalized columns in the database. Then, if no + results are found, the search is repeated with the normalized + form. If the wordnet does not have a normalizer, only exact + string matches are used. 
+ + """ + kwargs: Dict = { + 'lexicon_rowids': w._lexicon_ids, + } + if ili is not None: + kwargs['ili'] = ili + + # easy case is when there is no form + if form is None: + return [cls(*data, w) # type: ignore + for data in query_func(pos=pos, **kwargs)] + + # if there's a form, we may need to lemmatize and normalize + lemmatizer = w.lemmatizer + normalize = w._normalizer + kwargs['search_all_forms'] = getattr(lemmatizer, 'search_all_forms', False) + kwargs['normalized'] = bool(normalize) + + forms = _get_forms(form, pos, lemmatizer) + + results = [ + cls(*data, w) # type: ignore + for _pos, _forms in forms.items() + for data in query_func(forms=_forms, pos=_pos, **kwargs) + ] + if not results and normalize: + results = [ + cls(*data, w) # type: ignore + for _pos, _forms in forms.items() + for data in query_func( + forms=[normalize(f) for f in _forms], pos=_pos, **kwargs + ) + ] + return results + + +def _get_forms( + form: str, + pos: Optional[str], + lemmatizer: Optional[LemmatizerInstance] +) -> Dict[Optional[str], List[str]]: + # special case for default Lemmatizer class + if not lemmatizer or lemmatizer.__class__ is Lemmatizer: + return {pos: [form]} + + pos_set: Set[str] + if pos is None: + pos_set = getattr(lemmatizer, 'parts_of_speech', PARTS_OF_SPEECH) + else: + pos_set = {pos} + + forms: Dict[Optional[str], List[str]] = {} + for _pos in pos_set: + _forms = list(lemmatizer(form, _pos)) + if _forms: + forms[_pos] = _forms + return forms + + def projects() -> List[Dict]: """Return the list of indexed projects. diff --git a/wn/_types.py b/wn/_types.py index a2709f6..c427aa4 100644 --- a/wn/_types.py +++ b/wn/_types.py @@ -1,5 +1,5 @@ -from typing import Union, Callable, Mapping, Sequence, Dict, Any +from typing import Union, Callable, Mapping, Sequence, Dict, Any, Iterator from pathlib import Path # For functions taking a filesystem path as a str or a pathlib.Path @@ -13,3 +13,7 @@ # A function that returns a normalized word form for a given word form NormalizeFunction = Callable[[str], str] + +# A callable class or function that yields lemmatized words for a +# given word form and part of speech +LemmatizerInstance = Callable[[str, str], Iterator[str]] From 0e759132b9f31b5b46392c9bffbb23dba8b84fa6 Mon Sep 17 00:00:00 2001 From: Michael Wayne Goodman Date: Tue, 27 Apr 2021 15:45:17 +0800 Subject: [PATCH 15/39] Add tests for normalizers, lemmatizers, and Morphy --- tests/data/mini-lmf-1.0.xml | 1 + tests/morphy_test.py | 17 +++++++++ tests/wordnet_test.py | 72 +++++++++++++++++++++++++++++++++++++ 3 files changed, 90 insertions(+) create mode 100644 tests/morphy_test.py create mode 100644 tests/wordnet_test.py diff --git a/tests/data/mini-lmf-1.0.xml b/tests/data/mini-lmf-1.0.xml index e54564a..a29da4e 100644 --- a/tests/data/mini-lmf-1.0.xml +++ b/tests/data/mini-lmf-1.0.xml @@ -81,6 +81,7 @@ Spanish: +
diff --git a/tests/morphy_test.py b/tests/morphy_test.py new file mode 100644 index 0000000..5c29f17 --- /dev/null +++ b/tests/morphy_test.py @@ -0,0 +1,17 @@ + +import pytest + +import wn +from wn.morphy import Morphy + + +@pytest.mark.usefixtures('mini_db') +def test_morphy(): + w = wn.Wordnet('test-en:1', lemmatizer_class=Morphy) + m = w.lemmatizer + assert list(m('examples', 'n')) == ['example'] + assert list(m('examples', 'v')) == [] + assert list(m('exemplifying', 'n')) == [] + assert list(m('exemplifying', 'v')) == ['exemplify'] + assert list(m('data', 'n')) == ['datum'] + assert list(m('datums', 'n')) == ['datum'] # expected false positive diff --git a/tests/wordnet_test.py b/tests/wordnet_test.py new file mode 100644 index 0000000..2102c57 --- /dev/null +++ b/tests/wordnet_test.py @@ -0,0 +1,72 @@ + +import pytest + +import wn + + +@pytest.mark.usefixtures('mini_db_1_1') +def test_wordnet_lexicons(): + en = wn.Wordnet('test-en') + assert len(en.lexicons()) == 1 + assert len(en.expanded_lexicons()) == 0 + + en1 = wn.Wordnet('test-en:1') + assert en.lexicons() == en1.lexicons() + assert en.expanded_lexicons() == en1.expanded_lexicons() + + en2 = wn.Wordnet(lang='en') + assert len(en2.lexicons()) == 2 + assert len(en2.expanded_lexicons()) == 0 + + es = wn.Wordnet('test-es') + assert len(es.lexicons()) == 1 + assert len(es.expanded_lexicons()) == 0 + + es2 = wn.Wordnet('test-es', expand='test-en') + assert len(es2.lexicons()) == 1 + assert len(es2.expanded_lexicons()) == 1 + + ja = wn.Wordnet('test-ja') + assert len(ja.lexicons()) == 1 + assert len(ja.expanded_lexicons()) == 1 + + ja2 = wn.Wordnet('test-ja', expand='') + assert len(ja2.lexicons()) == 1 + assert len(ja2.expanded_lexicons()) == 0 + + +@pytest.mark.usefixtures('mini_db') +def test_wordnet_normalize(): + es = wn.Wordnet('test-es') + assert es.words('Informacion') == es.words('información') + assert es.words('ínfórmácíón') == es.words('información') + es = wn.Wordnet('test-es', normalizer=None) + assert es.words('informacion') == [] + assert es.words('Información') == [] + + # The following doesn't necessarily work because any non-None + # normalizer causes the normalized form column to be tested with + # the original form + # es = wn.Wordnet('test-es', normalizer=str.lower) + # assert es.words('informacion') == [] + # assert es.words('Información') == es.words('información') + + +@pytest.mark.usefixtures('mini_db') +def test_wordnet_lemmatize(): + # default lemmatizer compares alternative forms + en = wn.Wordnet('test-en') + assert en.words('examples') == [] + assert en.words('exemplifying') == en.words('exemplify') + assert en.words('data') == en.words('datum') + + en = wn.Wordnet('test-en', lemmatizer_class=None) + assert en.words('examples') == [] + assert en.words('exemplifying') == [] + assert en.words('data') == [] + + en = wn.Wordnet('test-en') + en.lemmatizer = None + assert en.words('examples') == [] + assert en.words('exemplifying') == [] + assert en.words('data') == [] From 30ec401a1f8187623175bf1252fc8ea5d1e34c42 Mon Sep 17 00:00:00 2001 From: Michael Wayne Goodman Date: Wed, 28 Apr 2021 10:15:01 +0800 Subject: [PATCH 16/39] Rename lemmatizer_class param to lemmatizer Also allow it to take Lemmatizer instances or similar callables. 
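As a rough sketch of what the renamed parameter accepts after this change (the 'ewn:2020' specifier also appears in the docs and is only illustrative; it must be an installed lexicon):

    >>> import wn
    >>> from wn.morphy import Morphy
    >>> w = wn.Wordnet('ewn:2020', lemmatizer=Morphy)  # a Lemmatizer subclass is instantiated with w
    >>> def identity(form, pos):                       # or any similar callable, used as-is
    ...     yield form
    >>> w = wn.Wordnet('ewn:2020', lemmatizer=identity)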
--- CHANGELOG.md | 2 +- tests/morphy_test.py | 2 +- tests/wordnet_test.py | 15 ++++++++++++- wn/_core.py | 51 +++++++++++++++++++++++++++++-------------- 4 files changed, 51 insertions(+), 19 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6f0eac0..b56d4c9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,7 +22,7 @@ refresh regardless of the refresh interval * `wn.Wordnet` - Initialization now takes a `normalizer` parameter ([#105]) - - Initialization now takes a `lemmatizer_class` parameter ([#8]) + - Initialization now takes a `lemmatizer` parameter ([#8]) - `Wordnet.words()`, `Wordnet.senses()` and `Wordnet.synsets()` now use any specified `wn.Lemmatizer` instance or normalization function to expand queries on word forms ([#105]) diff --git a/tests/morphy_test.py b/tests/morphy_test.py index 5c29f17..80b3c8e 100644 --- a/tests/morphy_test.py +++ b/tests/morphy_test.py @@ -7,7 +7,7 @@ @pytest.mark.usefixtures('mini_db') def test_morphy(): - w = wn.Wordnet('test-en:1', lemmatizer_class=Morphy) + w = wn.Wordnet('test-en:1', lemmatizer=Morphy) m = w.lemmatizer assert list(m('examples', 'n')) == ['example'] assert list(m('examples', 'v')) == [] diff --git a/tests/wordnet_test.py b/tests/wordnet_test.py index 2102c57..6889672 100644 --- a/tests/wordnet_test.py +++ b/tests/wordnet_test.py @@ -60,7 +60,7 @@ def test_wordnet_lemmatize(): assert en.words('exemplifying') == en.words('exemplify') assert en.words('data') == en.words('datum') - en = wn.Wordnet('test-en', lemmatizer_class=None) + en = wn.Wordnet('test-en', lemmatizer=None) assert en.words('examples') == [] assert en.words('exemplifying') == [] assert en.words('data') == [] @@ -70,3 +70,16 @@ def test_wordnet_lemmatize(): assert en.words('examples') == [] assert en.words('exemplifying') == [] assert en.words('data') == [] + + def morphy_lite(form, pos): + yield form + if pos == 'n' and form.endswith('s'): + yield form[:-1] + + en = wn.Wordnet('test-en', lemmatizer=morphy_lite) + assert en.words('examples') == en.words('example') + assert en.words('exemplifying') == [] + assert en.words('data') == [] + + morphy_lite.search_all_forms = True + assert en.words('data') == en.words('datum') diff --git a/wn/_core.py b/wn/_core.py index 7f90383..8ee0997 100644 --- a/wn/_core.py +++ b/wn/_core.py @@ -1,6 +1,16 @@ from typing import ( - Type, TypeVar, Callable, Optional, List, Tuple, Dict, Set, Iterator, Collection + Union, + Type, + TypeVar, + Callable, + Optional, + List, + Tuple, + Dict, + Set, + Iterator, + Collection, ) import warnings @@ -1035,6 +1045,12 @@ class Lemmatizer: to use a subclass of this one for any custom behavior, such as that provided by :class:`wn.morphy.Morphy`. + Instances of this class and subclasses are callables with a + signature equivalent to the following: + + def lemmatize(form: str, pos: str) -> Iterator[str]: + ... + The default behavior provided by this class does two things: (1) yields the original word form unchanged, and (2) declares that the word form may be searched against non-lemmatic forms in the @@ -1046,6 +1062,8 @@ class Lemmatizer: search_all_forms: When :python:`False`, word forms are only searched against lemmas in the database, otherwise they are searched against all word forms. + parts_of_speech: A collection of the parts of speech relevant + for the lemmatizer. """ @@ -1096,19 +1114,13 @@ class Wordnet: to :python:`None` disables normalization and forces exact-match searching. 
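For illustration, the normalization tests added earlier in this series (tests/wordnet_test.py) exercise exactly this behavior against the mini Spanish test lexicon:

    >>> es = wn.Wordnet('test-es')                   # default normalizer
    >>> es.words('Informacion') == es.words('información')
    True
    >>> es = wn.Wordnet('test-es', normalizer=None)  # exact-match searching only
    >>> es.words('Información')
    []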
- The *lemmatizer_class* argument may be :python:`None`, which - disables lemmatizer-based query expansion, or a subclass of - :class:`Lemmatizer` which is instantiated with the - :class:`Wordnet` object as its argument. During the lemmatizer's + The *lemmatizer* argument may be :python:`None`, which disables + lemmatizer-based query expansion, a subclass of + :class:`Lemmatizer` which will be instantiated with the + :class:`Wordnet` object as its argument (during this instantiation, the :attr:`Wordnet.lemmatizer` attribute will be - :python:`None`. That is, the relevant initialization code is - equivalent to: - - .. code-block:: python - - self.lemmatizer = None - if lemmatizer_class: - self.lemmatizer = lemmatizer_class(self) + :python:`None`), or a callable matching the call signature of a + :class:`Lemmatizer` instance. .. _NFKD: https://en.wikipedia.org/wiki/Unicode_equivalence#Normal_forms @@ -1128,7 +1140,9 @@ def __init__( lang: str = None, expand: str = None, normalizer: Optional[NormalizeFunction] = normalize_form, - lemmatizer_class: Optional[Type[Lemmatizer]] = Lemmatizer, + lemmatizer: Optional[ + Union[Type[Lemmatizer], LemmatizerInstance] + ] = Lemmatizer, ): # default mode means any lexicon is searched or expanded upon, # but relation traversals only target the source's lexicon @@ -1165,9 +1179,14 @@ def __init__( self._normalizer = normalizer + # setting the lemmatizer should be done last as its + # instantiation may use the Wordnet object self.lemmatizer: Optional[LemmatizerInstance] = None - if lemmatizer_class: - self.lemmatizer = lemmatizer_class(self) + if lemmatizer: + if isinstance(lemmatizer, type): + self.lemmatizer = lemmatizer(self) + else: + self.lemmatizer = lemmatizer def lexicons(self): """Return the list of lexicons covered by this wordnet.""" From 1c56c2eb8d75880934d6c4a818f7c15f770a9c11 Mon Sep 17 00:00:00 2001 From: Michael Wayne Goodman Date: Wed, 28 Apr 2021 10:15:45 +0800 Subject: [PATCH 17/39] Back off failed lemmatization to original form This is to account for the (currently hypothetical) case where a user searches for a word with a part of speech not covered by the lemmatizer, but where the word exists in the database. For example, the Morphy lemmatizer only covers nouns, verbs, adjectives, and adverbs, so if an English wordnet ever includes other parts of speech (prepositions, conjunctions, phrases, etc.) it would yield nothing and queries would therefore return nothing. Now, if the lemmatizer yields nothing, the original form and part of speech are used. 
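In effect, the query-side expansion now behaves like this simplified sketch of _get_forms (the names expand and pos_list are illustrative, not part of the module):

    def expand(form, pos, lemmatizer, pos_list):
        forms = {}
        for p in (pos_list if pos is None else [pos]):
            candidates = list(lemmatizer(form, p))  # may be empty for uncovered parts of speech
            if candidates:
                forms[p] = candidates
        if not forms:
            forms[pos] = [form]  # last resort: fall back to the original form and part of speech
        return forms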
--- wn/_core.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/wn/_core.py b/wn/_core.py index 8ee0997..d7efd3b 100644 --- a/wn/_core.py +++ b/wn/_core.py @@ -1365,6 +1365,12 @@ def _get_forms( _forms = list(lemmatizer(form, _pos)) if _forms: forms[_pos] = _forms + + # if the lemmatizer cannot find anything, just return the original + # word and pos as a last resort + if not forms: + forms[pos] = [form] + return forms From c4e7e3712f488bf628e12a94130665ce1148b01c Mon Sep 17 00:00:00 2001 From: Michael Wayne Goodman Date: Mon, 10 May 2021 11:06:14 +0800 Subject: [PATCH 18/39] Fix #117: use correct parameter name in Synset.ili --- CHANGELOG.md | 5 +++++ tests/secondary_query_test.py | 10 ++++++++++ wn/_core.py | 2 +- 3 files changed, 16 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b56d4c9..cfa5f2e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -27,6 +27,10 @@ use any specified `wn.Lemmatizer` instance or normalization function to expand queries on word forms ([#105]) +### Fixed + +* `wn.Synset.ili` for proposed ILIs now works again (#117) + ## [v0.6.2] @@ -390,3 +394,4 @@ abandoned, but this is an entirely new codebase. [#105]: https://github.com/goodmami/wn/issues/105 [#106]: https://github.com/goodmami/wn/issues/106 [#108]: https://github.com/goodmami/wn/issues/108 +[#117]: https://github.com/goodmami/wn/issues/117 diff --git a/tests/secondary_query_test.py b/tests/secondary_query_test.py index 39ae397..9ac9e97 100644 --- a/tests/secondary_query_test.py +++ b/tests/secondary_query_test.py @@ -83,6 +83,16 @@ def test_synset_lemmas(): assert wn.synset('test-es-0003-v').lemmas() == ['ejemplificar', 'ilustrar'] +@pytest.mark.usefixtures('mini_db') +def test_synset_ili(): + assert isinstance(wn.synset('test-en-0001-n').ili, wn.ILI) + assert wn.synset('test-en-0001-n').ili.id == 'i67447' + assert wn.synset('test-en-0001-n').ili.status == 'presupposed' + assert wn.synset('test-en-0005-n-fake').ili is None + assert wn.synset('test-en-0007-v').ili.id is None + assert wn.synset('test-en-0007-v').ili.status == 'proposed' + + @pytest.mark.usefixtures('mini_db') def test_synset_definition(): assert wn.synset('test-en-0001-n').definition() == 'something that informs' diff --git a/wn/_core.py b/wn/_core.py index d7efd3b..45a0a88 100644 --- a/wn/_core.py +++ b/wn/_core.py @@ -508,7 +508,7 @@ def ili(self): if self._ili: row = next(find_ilis(id=self._ili), None) else: - row = next(find_proposed_ilis(synset_id=self._id), None) + row = next(find_proposed_ilis(synset_rowid=self._id), None) if row is not None: return ILI(*row) return None From b6d231a7d3b702720ad9cd0e2a7bcf2f5bd87e81 Mon Sep 17 00:00:00 2001 From: Michael Wayne Goodman Date: Tue, 11 May 2021 13:39:49 +0800 Subject: [PATCH 19/39] Simplify wn.morphy and lemmatization I'm revising the plan for lemmatization such that the Wordnet class takes only a callable for the `lemmatizer` parameter and doesn't perform any instantiation of the lemmatizer. The `Lemmatizer` class is therefore unnecessary and is removed. Morphy now has two modes: uninitialized and initialized. Initialized is as before, but uninitialized only performs the de-suffixing and does not validate the lemmas. As such, it is more suitable for applications requiring fast startup, such as web servers. Also, lemmatizer functions now return a mapping of parts of speech to sets of lemmas instead of yielding lemmas only. This allows it to take `None` for a part of speech and do something reasonable. 
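Under the revised contract a lemmatizer is just a callable returning such a mapping; a minimal sketch modeled on the morphy_lite helper in the updated wordnet tests later in this patch (the name suffix_s_lemmatizer is illustrative):

    def suffix_s_lemmatizer(form, pos=None):
        # map parts of speech (or None) to sets of candidate lemmas
        result = {pos: {form}}
        if pos in ('n', None) and form.endswith('s'):
            result.setdefault('n', set()).add(form[:-1])
        return result

It can then be passed directly, e.g. wn.Wordnet('ewn:2020', lemmatizer=suffix_s_lemmatizer).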
--- CHANGELOG.md | 5 +- docs/api/wn.morphy.rst | 98 ++++++++++++++++++---- docs/api/wn.rst | 6 -- tests/morphy_test.py | 43 +++++++--- tests/wordnet_test.py | 21 ++--- wn/__init__.py | 2 - wn/_core.py | 138 ++++++++----------------------- wn/_types.py | 16 ++-- wn/morphy.py | 183 +++++++++++++++++++++-------------------- 9 files changed, 266 insertions(+), 246 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index cfa5f2e..a4014d6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,7 +8,6 @@ only by instantiating a `wn.Wordnet` object ([#105]) * `wn.morphy` ([#19]) * `wn.Wordnet.lemmatizer` attribute ([#8]) -* `wn.Lemmatizer` class ([#8]) ### Changed @@ -24,8 +23,8 @@ - Initialization now takes a `normalizer` parameter ([#105]) - Initialization now takes a `lemmatizer` parameter ([#8]) - `Wordnet.words()`, `Wordnet.senses()` and `Wordnet.synsets()` now - use any specified `wn.Lemmatizer` instance or normalization - function to expand queries on word forms ([#105]) + use any specified lemmatization or normalization functions to + expand queries on word forms ([#105]) ### Fixed diff --git a/docs/api/wn.morphy.rst b/docs/api/wn.morphy.rst index f4a3b65..def3ab4 100644 --- a/docs/api/wn.morphy.rst +++ b/docs/api/wn.morphy.rst @@ -4,30 +4,98 @@ wn.morphy .. automodule:: wn.morphy -System Flags ------------- +.. seealso:: -The following flags may be passed to the ``system`` parameter of -:class:`Morphy` to adjust the patterns and behaviors it uses. Note -that in order to use these flags, the Morphy instance must be assigned -to the :class:`wn.Wordnet` instances after initialization: + The Princeton WordNet `documentation + `_ for the + original implementation of Morphy. + + +Initialized and Uninitialized Morphy +------------------------------------ + +There are two ways of using Morphy in Wn: initialized and +uninitialized. + +Unintialized Morphy is a simple callable that returns lemma +*candidates* for some given wordform. That is, the results might not +be valid lemmas, but this is not a problem in practice because +subsequent queries against the database will filter out the invalid +ones. This callable is obtained by creating a :class:`Morphy` object +with no arguments: ->>> import wn >>> from wn import morphy ->>> pwn = wn.Wordnet("pwn:3.0") ->>> pwn.lemmatizer = morphy.Morphy(pwn, system=morphy.NLTK) +>>> m = morphy.Morphy() + +As an uninitialized Morphy cannot predict which lemmas in the result +are valid, it always returns the original form and any transformations +it can find for each part of speech: + +>>> m('lemmata', pos='n') # exceptional form +{'n': {'lemmata'}} +>>> m('lemmas', pos='n') # regular morphology with part-of-speech +{'n': {'lemma', 'lemmas'}} +>>> m('lemmas') # regular morphology for any part-of-speech +{None: {'lemmas'}, 'n': {'lemma'}, 'v': {'lemma'}} +>>> m('wolves') # invalid forms may be returned +{None: {'wolves'}, 'n': {'wolf', 'wolve'}, 'v': {'wolve', 'wolv'}} + + +This lemmatizer can also be used with a :class:`wn.Wordnet` object to +expand queries: + +>>> import wn +>>> ewn = wn.Wordnet('ewn:2020') +>>> ewn.words('lemmas') +[] +>>> ewn = wn.Wordnet('ewn:2020', lemmatizer=morphy.Morphy()) +>>> ewn.words('lemmas') +[Word('ewn-lemma-n')] + +An initialized Morphy is created with a :class:`wn.Wordnet` object as +its argument. It then uses the wordnet to build lists of valid lemmas +and exceptional forms (this takes a few seconds). 
Once this is done, +it will only return lemmas it knows about: + +>>> ewn = wn.Wordnet('ewn:2020') +>>> m = morphy.Morphy(ewn) +>>> m('lemmata', pos='n') # exceptional form +{'n': {'lemma'}} +>>> m('lemmas', pos='n') # regular morphology with part-of-speech +{'n': {'lemma'}} +>>> m('lemmas') # regular morphology for any part-of-speech +{'n': {'lemma'}} +>>> m('wolves') # invalid forms are pre-filtered +{'n': {'wolf'}} + +In order to use an initialized Morphy lemmatizer with a +:class:`wn.Wordnet` object, it must be assigned to the object after +creation: + +>>> ewn = wn.Wordnet('ewn:2020') # default: lemmatizer=None +>>> ewn.words('lemmas') +[] +>>> ewn.lemmatizer = morphy.Morphy(ewn) +>>> ewn.words('lemmas') +[Word('ewn-lemma-n')] + +There is little to no difference in the results obtained from a +:class:`wn.Wordnet` object using an initialized or uninitialized +:class:`Morphy` object, but there may be slightly different +performance profiles for future queries. -.. data:: PWN - Use the behavior implemented in the Princeton WordNet. +Default Morphy Lemmatizer +------------------------- -.. data:: NLTK +As a convenience, an uninitialized Morphy lemmatizer is provided in +this module via the :data:`morphy` member. - Use the behavior implemented in the NLTK. +.. data:: morphy -.. data:: WN + A :class:`Morphy` object created without a :class:`wn.Wordnet` + object. - Use the behavior created for Wn. The Morphy Class ---------------- diff --git a/docs/api/wn.rst b/docs/api/wn.rst index a11d988..242c5c1 100644 --- a/docs/api/wn.rst +++ b/docs/api/wn.rst @@ -278,12 +278,6 @@ The Lexicon Class .. automethod:: extensions -The Lemmatizer Class --------------------- - -.. autoclass:: Lemmatizer - - The wn.config Object -------------------- diff --git a/tests/morphy_test.py b/tests/morphy_test.py index 80b3c8e..64baa92 100644 --- a/tests/morphy_test.py +++ b/tests/morphy_test.py @@ -2,16 +2,39 @@ import pytest import wn -from wn.morphy import Morphy +from wn import morphy + + +def test_morphy_uninitialized(): + # An unintialized Morphy isn't very bright, but it starts up + # fast. It relies on the database to filter bad items. 
+ m = morphy.Morphy() + assert m('example', 'n') == {'n': {'example'}} + assert m('examples', 'n') == {'n': {'examples', 'example'}} + assert m('examples', 'v') == {'v': {'examples', 'example', 'exampl'}} + assert m('exemplifying', 'n') == {'n': {'exemplifying'}} + assert m('exemplifying', 'v') == {'v': {'exemplifying', 'exemplify', 'exemplifye'}} + assert m('data', 'n') == {'n': {'data'}} + assert m('datums', 'n') == {'n': {'datums', 'datum'}} # expected false positive + assert m('examples', None) == {None: {'examples'}, + 'n': {'example'}, + 'v': {'example', 'exampl'}} + assert m('exemplifying', None) == {None: {'exemplifying'}, + 'v': {'exemplify', 'exemplifye'}} + assert m('data', None) == {None: {'data'}} @pytest.mark.usefixtures('mini_db') -def test_morphy(): - w = wn.Wordnet('test-en:1', lemmatizer=Morphy) - m = w.lemmatizer - assert list(m('examples', 'n')) == ['example'] - assert list(m('examples', 'v')) == [] - assert list(m('exemplifying', 'n')) == [] - assert list(m('exemplifying', 'v')) == ['exemplify'] - assert list(m('data', 'n')) == ['datum'] - assert list(m('datums', 'n')) == ['datum'] # expected false positive +def test_morphy_initialized(): + w = wn.Wordnet('test-en:1') + m = morphy.Morphy(wordnet=w) + assert m('example', 'n') == {'n': {'example'}} + assert m('examples', 'n') == {'n': {'example'}} + assert m('examples', 'v') == {} + assert m('exemplifying', 'n') == {} + assert m('exemplifying', 'v') == {'v': {'exemplify'}} + assert m('data', 'n') == {'n': {'datum'}} + assert m('datums', 'n') == {'n': {'datum'}} # expected false positive + assert m('examples', None) == {'n': {'example'}} + assert m('exemplifying', None) == {'v': {'exemplify'}} + assert m('data', None) == {'n': {'datum'}} diff --git a/tests/wordnet_test.py b/tests/wordnet_test.py index 6889672..a659115 100644 --- a/tests/wordnet_test.py +++ b/tests/wordnet_test.py @@ -60,26 +60,23 @@ def test_wordnet_lemmatize(): assert en.words('exemplifying') == en.words('exemplify') assert en.words('data') == en.words('datum') - en = wn.Wordnet('test-en', lemmatizer=None) - assert en.words('examples') == [] - assert en.words('exemplifying') == [] - assert en.words('data') == [] - - en = wn.Wordnet('test-en') - en.lemmatizer = None + en = wn.Wordnet('test-en', search_all_forms=False) assert en.words('examples') == [] assert en.words('exemplifying') == [] assert en.words('data') == [] def morphy_lite(form, pos): - yield form - if pos == 'n' and form.endswith('s'): - yield form[:-1] + result = {pos: {form}} + if pos in ('n', None) and form.endswith('s'): + result.setdefault('n', set()).add(form[:-1]) + return result - en = wn.Wordnet('test-en', lemmatizer=morphy_lite) + en = wn.Wordnet('test-en', lemmatizer=morphy_lite, search_all_forms=False) + assert en.words('examples', pos='n') == en.words('example') assert en.words('examples') == en.words('example') assert en.words('exemplifying') == [] assert en.words('data') == [] - morphy_lite.search_all_forms = True + en = wn.Wordnet('test-en', lemmatizer=morphy_lite, search_all_forms=True) assert en.words('data') == en.words('datum') + assert en.words('exemplifying') == en.words('exemplify') diff --git a/wn/__init__.py b/wn/__init__.py index 59569dd..2fe67e7 100644 --- a/wn/__init__.py +++ b/wn/__init__.py @@ -29,7 +29,6 @@ 'ili', 'ilis', 'ILI', - 'Lemmatizer', 'Error', 'DatabaseError', 'WnWarning', @@ -47,7 +46,6 @@ sense, senses, Sense, Count, synset, synsets, Synset, ili, ilis, ILI, - Lemmatizer, Wordnet ) diff --git a/wn/_core.py b/wn/_core.py index 45a0a88..7952322 100644 
--- a/wn/_core.py +++ b/wn/_core.py @@ -1,6 +1,5 @@ from typing import ( - Union, Type, TypeVar, Callable, @@ -10,13 +9,15 @@ Dict, Set, Iterator, - Collection, ) import warnings import wn -from wn._types import Metadata, NormalizeFunction, LemmatizerInstance -from wn.constants import PARTS_OF_SPEECH +from wn._types import ( + Metadata, + NormalizeFunction, + LemmatizeFunction, +) from wn._util import flatten, normalize_form from wn._db import NON_ROWID from wn._queries import ( @@ -1035,51 +1036,6 @@ def translate(self, lexicon: str = None, *, lang: str = None) -> List['Sense']: for t_sense in t_synset.senses()] -class Lemmatizer: - """Class for expanding queries over word forms. - - This class, while intended for morphological lemmatization, is - more broadly construed as a tool for expanding word forms so they - better match the lemmas in a lexicon. This particular class serves - as the default lemmatizer for any wordnet, but users are expected - to use a subclass of this one for any custom behavior, such as - that provided by :class:`wn.morphy.Morphy`. - - Instances of this class and subclasses are callables with a - signature equivalent to the following: - - def lemmatize(form: str, pos: str) -> Iterator[str]: - ... - - The default behavior provided by this class does two things: (1) - yields the original word form unchanged, and (2) declares that - the word form may be searched against non-lemmatic forms in the - lexicon as well. - - Arguments: - wordnet: An instance of a :class:`Wordnet` - Attributes: - search_all_forms: When :python:`False`, word forms are only - searched against lemmas in the database, otherwise they - are searched against all word forms. - parts_of_speech: A collection of the parts of speech relevant - for the lemmatizer. - - """ - - __slots__ = '_wordnet', - __module__ = 'wn' - - search_all_forms = True - parts_of_speech: Collection[str] = PARTS_OF_SPEECH - - def __init__(self, wordnet: 'Wordnet'): - self._wordnet = wordnet - - def __call__(self, form: str, pos: str) -> Iterator[str]: - yield form - - # Useful for factory functions of Word, Sense, or Synset C = TypeVar('C', Word, Sense, Synset) @@ -1114,23 +1070,32 @@ class Wordnet: to :python:`None` disables normalization and forces exact-match searching. - The *lemmatizer* argument may be :python:`None`, which disables - lemmatizer-based query expansion, a subclass of - :class:`Lemmatizer` which will be instantiated with the - :class:`Wordnet` object as its argument (during this - instantiation, the :attr:`Wordnet.lemmatizer` attribute will be - :python:`None`), or a callable matching the call signature of a - :class:`Lemmatizer` instance. - + The *lemmatizer* argument may be :python:`None`, which is the + default and disables lemmatizer-based query expansion, or a + callable that takes a word form and optional part of speech and + returns base forms of the original word. To support lemmatizers + that use the wordnet for instantiation, such as :mod:`wn.morphy`, + the lemmatizer may be assigned to the :attr:`lemmatizer` attribute + after creation. + + If the *search_all_forms* argument is :python:`True` (the + default), searches of word forms consider all forms in the + lexicon; if :python:`False`, only lemmas are searched. Non-lemma + forms may include, depending on the lexicon, morphological + exceptions, alternate scripts or spellings, etc. + + .. _BCP 47: https://en.wikipedia.org/wiki/IETF_language_tag .. 
_NFKD: https://en.wikipedia.org/wiki/Unicode_equivalence#Normal_forms Attributes: - lemmatizer: A :class:`Lemmatizer` instance or :python:`None` + + lemmatizer: A lemmatization function or :python:`None`. """ __slots__ = ('_lexicons', '_lexicon_ids', '_expanded', '_expanded_ids', - '_default_mode', '_normalizer', 'lemmatizer') + '_default_mode', '_normalizer', 'lemmatizer', + '_search_all_forms',) __module__ = 'wn' def __init__( @@ -1140,9 +1105,8 @@ def __init__( lang: str = None, expand: str = None, normalizer: Optional[NormalizeFunction] = normalize_form, - lemmatizer: Optional[ - Union[Type[Lemmatizer], LemmatizerInstance] - ] = Lemmatizer, + lemmatizer: Optional[LemmatizeFunction] = None, + search_all_forms: bool = True, ): # default mode means any lexicon is searched or expanded upon, # but relation traversals only target the source's lexicon @@ -1178,15 +1142,8 @@ def __init__( self._expanded_ids: Tuple[int, ...] = tuple(lx._id for lx in self._expanded) self._normalizer = normalizer - - # setting the lemmatizer should be done last as its - # instantiation may use the Wordnet object - self.lemmatizer: Optional[LemmatizerInstance] = None - if lemmatizer: - if isinstance(lemmatizer, type): - self.lemmatizer = lemmatizer(self) - else: - self.lemmatizer = lemmatizer + self.lemmatizer = lemmatizer + self._search_all_forms = search_all_forms def lexicons(self): """Return the list of lexicons covered by this wordnet.""" @@ -1312,6 +1269,7 @@ def _find_helper( """ kwargs: Dict = { 'lexicon_rowids': w._lexicon_ids, + 'search_all_forms': w._search_all_forms, } if ili is not None: kwargs['ili'] = ili @@ -1322,12 +1280,15 @@ def _find_helper( for data in query_func(pos=pos, **kwargs)] # if there's a form, we may need to lemmatize and normalize - lemmatizer = w.lemmatizer + lemmatize = w.lemmatizer normalize = w._normalizer - kwargs['search_all_forms'] = getattr(lemmatizer, 'search_all_forms', False) kwargs['normalized'] = bool(normalize) - forms = _get_forms(form, pos, lemmatizer) + forms = lemmatize(form, pos) if lemmatize else {} + # if no lemmatizer or word not covered by lemmatizer, back off to + # the original form and pos + if not forms: + forms = {pos: {form}} results = [ cls(*data, w) # type: ignore @@ -1345,35 +1306,6 @@ def _find_helper( return results -def _get_forms( - form: str, - pos: Optional[str], - lemmatizer: Optional[LemmatizerInstance] -) -> Dict[Optional[str], List[str]]: - # special case for default Lemmatizer class - if not lemmatizer or lemmatizer.__class__ is Lemmatizer: - return {pos: [form]} - - pos_set: Set[str] - if pos is None: - pos_set = getattr(lemmatizer, 'parts_of_speech', PARTS_OF_SPEECH) - else: - pos_set = {pos} - - forms: Dict[Optional[str], List[str]] = {} - for _pos in pos_set: - _forms = list(lemmatizer(form, _pos)) - if _forms: - forms[_pos] = _forms - - # if the lemmatizer cannot find anything, just return the original - # word and pos as a last resort - if not forms: - forms[pos] = [form] - - return forms - - def projects() -> List[Dict]: """Return the list of indexed projects. 
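For illustration, the new search_all_forms switch is exercised by the updated wordnet_test.py earlier in this patch; against the mini English test lexicon it behaves roughly like:

    >>> en = wn.Wordnet('test-en')                          # search_all_forms=True by default
    >>> en.words('data') == en.words('datum')               # secondary forms are searched
    True
    >>> en = wn.Wordnet('test-en', search_all_forms=False)  # only lemmas are searched
    >>> en.words('data')
    []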
diff --git a/wn/_types.py b/wn/_types.py index c427aa4..ef153fb 100644 --- a/wn/_types.py +++ b/wn/_types.py @@ -1,5 +1,7 @@ -from typing import Union, Callable, Mapping, Sequence, Dict, Any, Iterator +from typing import ( + Optional, Union, Callable, Mapping, Sequence, Dict, Set, Any, +) from pathlib import Path # For functions taking a filesystem path as a str or a pathlib.Path @@ -11,9 +13,13 @@ # User-facing metadata representation Metadata = Dict[str, Any] -# A function that returns a normalized word form for a given word form +# A callable that returns a normalized word form for a given word form NormalizeFunction = Callable[[str], str] -# A callable class or function that yields lemmatized words for a -# given word form and part of speech -LemmatizerInstance = Callable[[str, str], Iterator[str]] +# Lemmatization returns a mapping of parts of speech (or None) to +# lists of wordforms that are potential lemmas for some query word +LemmatizeResult = Dict[Optional[str], Set[str]] + +# A callable that returns a LemmatizationResult for a given word form +# and optional part of speech +LemmatizeFunction = Callable[[str, Optional[str]], LemmatizeResult] diff --git a/wn/morphy.py b/wn/morphy.py index c5052bb..6f7eeca 100644 --- a/wn/morphy.py +++ b/wn/morphy.py @@ -1,53 +1,44 @@ -"""An implementation of the Morphy lemmatization system for English. - -.. seealso:: - - The Princeton WordNet `documentation - `_ for the - original implementation. +"""A simple English lemmatizer that finds and removes known suffixes. """ -from typing import Iterator, Dict, Set, List, Tuple -import warnings +from typing import Optional, Dict, Set, List, Tuple from enum import Flag, auto import wn -from wn.constants import NOUN, VERB, ADJ, ADJ_SAT, ADV +from wn._types import LemmatizeResult +from wn.constants import NOUN, VERB, ADJ, ADJ_SAT, ADV, PARTS_OF_SPEECH POSExceptionMap = Dict[str, Set[str]] ExceptionMap = Dict[str, POSExceptionMap] -class System(Flag): - """Flags to track suffix rules in various implementations of Morphy. - - These are available at the module level, as well (e.g., `morphy.PWN`). - """ +class _System(Flag): + """Flags to track suffix rules in various implementations of Morphy.""" PWN = auto() NLTK = auto() WN = auto() ALL = PWN | NLTK | WN -PWN = System.PWN -NLTK = System.NLTK -WN = System.WN -_ALL = System.ALL +_PWN = _System.PWN +_NLTK = _System.NLTK +_WN = _System.WN +_ALL = _System.ALL -Rule = Tuple[str, str, System] +Rule = Tuple[str, str, _System] DETACHMENT_RULES: Dict[str, List[Rule]] = { NOUN: [ ("s", "", _ALL), - ("ces", "x", WN), + ("ces", "x", _WN), ("ses", "s", _ALL), - ("ves", "f", NLTK | WN), - ("ives", "ife", WN), + ("ves", "f", _NLTK | _WN), + ("ives", "ife", _WN), ("xes", "x", _ALL), - ("xes", "xis", WN), + ("xes", "xis", _WN), ("zes", "z", _ALL), ("ches", "ch", _ALL), ("shes", "sh", _ALL), @@ -75,88 +66,100 @@ class System(Flag): DETACHMENT_RULES[ADJ_SAT] = DETACHMENT_RULES[ADJ] -class Morphy(wn.Lemmatizer): +class Morphy: """The Morphy lemmatizer class. + Objects of this class are callables that take a wordform and an + optional part of speech and return a dictionary mapping parts of + speech to lemmas. If objects of this class are not created with a + :class:`wn.Wordnet` object, the returned lemmas may be invalid. 
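A quick illustration of that caveat, based on the assertions in the updated morphy tests earlier in this patch (test-en:1 is the mini lexicon from the test suite):

    m = Morphy()                                 # uninitialized
    assert m('datums', 'n') == {'n': {'datums', 'datum'}}  # keeps an invalid candidate
    m = Morphy(wordnet=wn.Wordnet('test-en:1'))  # initialized
    assert m('datums', 'n') == {'n': {'datum'}}            # filtered against known lemmas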
+ Arguments: wordnet: optional :class:`wn.Wordnet` instance Example: >>> import wn - >>> from wn.constants import NOUN >>> from wn.morphy import Morphy >>> ewn = wn.Wordnet('ewn:2020') >>> m = Morphy(ewn) - >>> list(m('axes', NOUN)) - ['axe', 'ax', 'axis'] - >>> list(m('geese', NOUN)) - ['goose'] + >>> m('axes', pos='n') + {'n': {'axe', 'ax', 'axis'}} + >>> m('geese', pos='n') + {'n': {'goose'}} + >>> m('gooses') + {'n': {'goose'}, 'v': {'goose'}} + >>> m('goosing') + {'v': {'goose'}} + """ - search_all_forms = False - parts_of_speech = {NOUN, VERB, ADJ, ADJ_SAT, ADV} - - def __init__(self, wordnet: wn.Wordnet, system: System = WN): - if any(lex.language != 'en' for lex in wordnet.lexicons()): - warnings.warn( - 'Morphy is not intended for use with non-English wordnets', - wn.WnWarning - ) - self._wordnet = wordnet - self._system = system + def __init__(self, wordnet: Optional[wn.Wordnet] = None): self._rules = { - pos: [rule for rule in rules if rule[2] & system] + pos: [rule for rule in rules if rule[2] & _System.WN] for pos, rules in DETACHMENT_RULES.items() } - self._exceptions: ExceptionMap = { - pos: {} for pos in self.parts_of_speech - } - self._all_lemmas: Dict[str, Set[str]] = { - pos: set() for pos in self.parts_of_speech - } - self._build() - - def __call__(self, form: str, pos: str) -> Iterator[str]: - if pos not in self.parts_of_speech: - return - - exceptions = self._exceptions[pos] - rules = self._rules[pos] - pos_lemmas = self._all_lemmas[pos] - - # original lemma - if form in pos_lemmas: - yield form + exceptions: ExceptionMap = {pos: {} for pos in PARTS_OF_SPEECH} + all_lemmas: Dict[str, Set[str]] = {pos: set() for pos in PARTS_OF_SPEECH} + if wordnet: + for word in wordnet.words(): + pos = word.pos + pos_exc = exceptions[pos] + lemma, *others = word.forms() + # store every lemma whether it has other forms or not + all_lemmas[pos].add(lemma) + # those with other forms map to the original lemmas + for other in others: + if other in pos_exc: + pos_exc[other].add(lemma) + else: + pos_exc[other] = {lemma} + self._initialized = True + else: + self._initialized = False + self._exceptions = exceptions + self._all_lemmas = all_lemmas + + def __call__(self, form: str, pos: Optional[str] = None) -> LemmatizeResult: + result = {} + if not self._initialized: + result[pos] = {form} # always include original when not initialized + + if pos is None: + pos_list = list(DETACHMENT_RULES) + elif pos in DETACHMENT_RULES: + pos_list = [pos] + else: + pos_list = [] # not handled by morphy + + no_pos_forms = result.get(None, set()) # avoid unnecessary duplicates + for _pos in pos_list: + candidates = self._morphstr(form, _pos) - no_pos_forms + if candidates: + result.setdefault(_pos, set()).update(candidates) + + return result + + def _morphstr(self, form: str, pos: str) -> Set[str]: + candidates: Set[str] = set() + + initialized = self._initialized + if initialized: + all_lemmas = self._all_lemmas[pos] + if form in all_lemmas: + candidates.add(form) + candidates.update(self._exceptions[pos].get(form, set())) + else: + all_lemmas = set() + + for suffix, repl, _ in self._rules[pos]: + # avoid applying rules that perform full suppletion + if form.endswith(suffix) and len(suffix) < len(form): + candidate = f'{form[:-len(suffix)]}{repl}' + if not initialized or candidate in all_lemmas: + candidates.add(candidate) - seen = set() # don't yield the same form more than once per pos + return candidates - # lemmas from exceptions - for _form in exceptions.get(form, []): - seen.add(_form) - yield _form - 
# lemmas from morphological detachment - for suffix, repl, _ in rules: - # avoid applying rules that perform full suppletion - if form.endswith(suffix) and len(suffix) < len(form): - _form = f'{form[:-len(suffix)]}{repl}' - if _form in pos_lemmas and _form not in seen: - seen.add(_form) - yield _form - - def _build(self) -> None: - exceptions = self._exceptions - all_lemmas = self._all_lemmas - for word in self._wordnet.words(): - pos = word.pos - pos_exc = exceptions[pos] - lemma, *others = word.forms() - # store every lemma whether it has other forms or not - all_lemmas[pos].add(lemma) - # those with other forms map to the original lemmas - for other in others: - if other in pos_exc: - pos_exc[other].add(lemma) - else: - pos_exc[other] = {lemma} +morphy = Morphy() From 66831dd91e18ff7eaaf972d3ed535d520e209095 Mon Sep 17 00:00:00 2001 From: Michael Wayne Goodman Date: Tue, 11 May 2021 13:43:42 +0800 Subject: [PATCH 20/39] Various documentation improvments. --- docs/conf.py | 2 ++ docs/guides/lexicons.rst | 12 +++++++----- wn/_add.py | 8 +++++--- wn/_core.py | 10 +++++----- 4 files changed, 19 insertions(+), 13 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 65e4e43..14b492f 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -58,6 +58,8 @@ :class: highlight """ +smartquotes = False + # -- Options for HTML output ------------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for diff --git a/docs/guides/lexicons.rst b/docs/guides/lexicons.rst index 5337b9b..46d78cc 100644 --- a/docs/guides/lexicons.rst +++ b/docs/guides/lexicons.rst @@ -88,7 +88,11 @@ Adding Local Lexicons Lexicons can be added from local files with :py:func:`wn.add`: ->>> wn.add('~/data/omw/nobwn/') +>>> wn.add('~/data/omw/nob/nobwn.xml') + +Or, with the directory as a package: + +>>> wn.add('~/data/omw/nob/') Listing Installed Lexicons -------------------------- @@ -112,11 +116,9 @@ Lexicons can be removed from the database with :py:func:`wn.remove`: Note that this removes a single lexicon and not a project, so if, for instance, you've installed a multi-lexicon project like ``omw``, you -will need to remove each lexicon individually: +will need to remove each lexicon individually or use a star specifier: ->>> for lex in wn.lexicons(): -... if lex.version == '1.3+omw': -... wn.remove(f'{lex.id}:{lex.version}') +>>> wn.remove('*:1.3+omw') WN-LMF Files, Packages, and Collections --------------------------------------- diff --git a/wn/_add.py b/wn/_add.py index 5666569..94cd6ff 100644 --- a/wn/_add.py +++ b/wn/_add.py @@ -667,13 +667,15 @@ def remove( The *lexicon* argument is a :ref:`lexicon specifier `. Note that this removes a lexicon and not a project, so the lexicons of projects containing multiple lexicons - will need to be removed individually. + will need to be removed individually or, if applicable, a star + specifier. The *progress_handler* parameter takes a subclass of :class:`wn.util.ProgressHandler`. An instance of the class will be created, used, and closed by this function. 
- >>> wn.remove('ewn:2019') + >>> wn.remove('ewn:2019') # removes a single lexicon + >>> wn.remove('*:1.3+omw') # removes all lexicons with version 1.3+omw """ if progress_handler is None: @@ -697,7 +699,7 @@ def remove( extra = f' (and {len(extensions)} extension(s))' if extensions else '' progress.set(status=f'{spec}', count=0) conn.execute('DELETE from lexicons WHERE rowid = ?', (rowid,)) - progress.flash(f'Removed {spec}{extra}') + progress.flash(f'Removed {spec}{extra}\n') finally: progress.close() diff --git a/wn/_core.py b/wn/_core.py index 7952322..3290944 100644 --- a/wn/_core.py +++ b/wn/_core.py @@ -1046,11 +1046,11 @@ class Wordnet: A wordnet object acts essentially as a filter by first selecting matching lexicons and then searching only within those lexicons - for later queries. On instantiation, a *lang* argument is a BCP47 - language code that restricts the selected lexicons to those whose - language matches the given code. A *lexicon* argument is a + for later queries. On instantiation, a *lang* argument is a `BCP + 47`_ language code that restricts the selected lexicons to those + whose language matches the given code. A *lexicon* argument is a space-separated list of lexicon specifiers that more directly - select lexicons by their ID and version; this is preferable when + selects lexicons by their ID and version; this is preferable when there are multiple lexicons in the same language or multiple version with the same ID. @@ -1062,7 +1062,7 @@ class Wordnet: queries. Setting *expand* to an empty string (:python:`expand=''`) disables expand lexicons. - The *normalizer* argument takes a function that normalizes word + The *normalizer* argument takes a callable that normalizes word forms in order to expand the search. The default function downcases the word and removes diacritics via NFKD_ normalization so that, for example, searching for *san josé* in the English From 7ca62c8e82a3d9670d187feee331a555573cf356 Mon Sep 17 00:00:00 2001 From: Michael Wayne Goodman Date: Mon, 3 May 2021 17:05:29 +0800 Subject: [PATCH 21/39] Add initial web API Part of #116 --- pyproject.toml | 3 + wn/web.py | 183 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 186 insertions(+) create mode 100644 wn/web.py diff --git a/pyproject.toml b/pyproject.toml index 9ff9979..ce14232 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,6 +35,9 @@ classifiers = [ Documentation = "https://wn.readthedocs.io" [tool.flit.metadata.requires-extra] +web = [ + "starlette", +] test = [ "pytest", "mypy", diff --git a/wn/web.py b/wn/web.py new file mode 100644 index 0000000..afc806f --- /dev/null +++ b/wn/web.py @@ -0,0 +1,183 @@ + +from functools import wraps + +from starlette.applications import Starlette +from starlette.responses import JSONResponse +from starlette.routing import Route + +import wn + +DEFAULT_PAGINATION_LIMIT = 50 + + +def paginate(name, proto): + + def paginate_wrapper(func): + + @wraps(func) + async def _paginate_wrapper(request): + query = request.query_params + offset = abs(int(query.get('offset', 0))) + limit = abs(int(query.get('limit', DEFAULT_PAGINATION_LIMIT))) + + obj = await func(request) + + seq = obj[name] + obj[name] = [proto(x, request) for x in seq[offset:offset+limit]] + obj['offset'] = offset + obj['limit'] = limit + obj['total'] = len(seq) + + return JSONResponse(obj) + + return _paginate_wrapper + + return paginate_wrapper + + +# Data-making functions + +def make_lexicon(lex, request): + return { + 'id': lex.id, + 'version': lex.version, + 'label': 
lex.label, + 'language': lex.language, + 'license': lex.license, + 'synsets': request.url_for('synsets', lexicon=lex.specifier()), + } + + +def make_word(w, request): + return { + 'id': w.id, + 'pos': w.pos, + 'lemma': w.lemma(), + 'forms': w.forms(), + 'senses': [ + request.url_for('sense', lexicon=s.lexicon().specifier(), sense=s.id) + for s in w.senses() + ], + 'lexicon': request.url_for( + 'lexicons', lexicon=w.lexicon().specifier() + ), + } + + +def make_sense(s, request): + word = s.word() + synset = s.synset() + return { + 'id': s.id, + 'word': request.url_for( + 'word', lexicon=word.lexicon().specifier(), word=word.id + ), + 'synset': request.url_for( + 'synset', lexicon=synset.lexicon().specifier(), synset=synset.id + ), + 'lexicon': request.url_for( + 'lexicons', lexicon=s.lexicon().specifier() + ), + } + + +def make_synset(ss, request): + return { + 'id': ss.id, + 'pos': ss.pos, + 'ili': ss._ili, + 'members': [ + request.url_for('sense', lexicon=s.lexicon().specifier(), sense=s.id) + for s in ss.senses() + ], + # 'relations': [], + 'lexicon': request.url_for( + 'lexicons', lexicon=ss.lexicon().specifier() + ), + } + + +@paginate('lexicons', make_lexicon) +async def lexicons(request): + query = request.query_params + lexicon = request.path_params.get('lexicon') or query.get('lexicon') + _lexicons = wn.lexicons( + lexicon=lexicon, + lang=query.get('lang'), + ) + return {'lexicons': _lexicons} + + +@paginate('words', make_word) +async def words(request): + query = request.query_params + lexicon = request.path_params.get('lexicon') or query.get('lexicon') + _words = wn.words( + form=query.get('form'), + pos=query.get('pos'), + lexicon=lexicon, + lang=query.get('lang'), + ) + return {'words': _words} + + +async def word(request): + path_params = request.path_params + word = wn.word(path_params['word'], lexicon=path_params['lexicon']) + return JSONResponse({'word': make_word(word, request)}) + + +@paginate('senses', make_sense) +async def senses(request): + query = request.query_params + lexicon = request.path_params.get('lexicon') or query.get('lexicon') + _senses = wn.senses( + form=query.get('form'), + pos=query.get('pos'), + lexicon=lexicon, + lang=query.get('lang'), + ) + return {'senses': _senses} + + +async def sense(request): + path_params = request.path_params + sense = wn.sense(path_params['sense'], lexicon=path_params['lexicon']) + return JSONResponse({'sense': make_sense(sense, request)}) + + +@paginate('synsets', make_synset) +async def synsets(request): + query = request.query_params + lexicon = request.path_params.get('lexicon') or query.get('lexicon') + _synsets = wn.synsets( + form=query.get('form'), + pos=query.get('pos'), + ili=query.get('ili'), + lexicon=lexicon, + lang=query.get('lang'), + ) + return {'synsets': _synsets} + + +async def synset(request): + path_params = request.path_params + synset = wn.synset(path_params['synset'], lexicon=path_params['lexicon']) + return JSONResponse({'synset': make_synset(synset, request)}) + + +routes = [ + Route('/lexicons', endpoint=lexicons), + Route('/lexicons/{lexicon}', endpoint=lexicons), + Route('/lexicons/{lexicon}/words', endpoint=words), + Route('/lexicons/{lexicon}/words/{word}', endpoint=word), + Route('/lexicons/{lexicon}/senses', endpoint=senses), + Route('/lexicons/{lexicon}/senses/{sense}', endpoint=sense), + Route('/lexicons/{lexicon}/synsets', endpoint=synsets), + Route('/lexicons/{lexicon}/synsets/{synset}', endpoint=synset), + Route('/words', endpoint=words), + Route('/senses', endpoint=senses), + 
Route('/synsets', endpoint=synsets), +] + +app = Starlette(debug=True, routes=routes) From 4dc3bdc5f079c364a6179ae015432b5e20c5624f Mon Sep 17 00:00:00 2001 From: Michael Wayne Goodman Date: Fri, 28 May 2021 10:45:17 +0800 Subject: [PATCH 22/39] Follow JSON-API spec, add single lexicon endpoint --- wn/web.py | 217 ++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 160 insertions(+), 57 deletions(-) diff --git a/wn/web.py b/wn/web.py index afc806f..2f1c89a 100644 --- a/wn/web.py +++ b/wn/web.py @@ -1,5 +1,6 @@ from functools import wraps +from urllib.parse import urlsplit, parse_qs, urlencode from starlette.applications import Starlette from starlette.responses import JSONResponse @@ -10,23 +11,30 @@ DEFAULT_PAGINATION_LIMIT = 50 -def paginate(name, proto): +def paginate(proto): def paginate_wrapper(func): @wraps(func) async def _paginate_wrapper(request): - query = request.query_params - offset = abs(int(query.get('offset', 0))) - limit = abs(int(query.get('limit', DEFAULT_PAGINATION_LIMIT))) + url = str(request.url) + query = dict(request.query_params) + offset = abs(int(query.pop('offset', 0))) + limit = abs(int(query.pop('limit', DEFAULT_PAGINATION_LIMIT))) obj = await func(request) - - seq = obj[name] - obj[name] = [proto(x, request) for x in seq[offset:offset+limit]] - obj['offset'] = offset - obj['limit'] = limit - obj['total'] = len(seq) + total = len(obj['data']) + + links = { + 'first': replace_query_params(url, offset=0), + 'last': replace_query_params(url, offset=(total//limit)*limit), + 'prev': replace_query_params(url, offset=max(0, offset - limit)), + 'next': replace_query_params(url, offset=offset + limit), + } + obj['data'] = [proto(x, request) + for x in obj['data'][offset:offset+limit]] + obj.setdefault('meta', {}).update(total=total) + obj.setdefault('links', {}).update(links) return JSONResponse(obj) @@ -35,80 +43,175 @@ async def _paginate_wrapper(request): return paginate_wrapper +def replace_query_params(url: str, **params) -> str: + u = urlsplit(url) + q = parse_qs(u.query) + q.update(params) + qs = urlencode(q, doseq=True) + return u._replace(query=qs).geturl() + + # Data-making functions +def _url_for_obj(request, name, obj, lexicon=None): + if lexicon is None: + lexicon = obj.lexicon().specifier() + kwargs = { + 'lexicon': lexicon, + name: obj.id + } + return request.url_for(name, **kwargs) + + def make_lexicon(lex, request): + spec = lex.specifier() return { 'id': lex.id, - 'version': lex.version, - 'label': lex.label, - 'language': lex.language, - 'license': lex.license, - 'synsets': request.url_for('synsets', lexicon=lex.specifier()), + 'type': 'lexicon', + 'attributes': { + 'version': lex.version, + 'label': lex.label, + 'language': lex.language, + 'license': lex.license, + }, + 'links': { + 'self': request.url_for('lexicon', lexicon=spec) + }, + 'relationships': { + 'words': { + 'links': {'related': request.url_for('words', lexicon=spec)}, + }, + 'synsets': { + 'links': {'related': request.url_for('synsets', lexicon=spec)}, + }, + 'senses': { + 'links': {'related': request.url_for('senses', lexicon=spec)}, + }, + } } def make_word(w, request): + lex_spec = w.lexicon().specifier() + senses = w.senses() + synsets = w.synsets() return { 'id': w.id, - 'pos': w.pos, - 'lemma': w.lemma(), - 'forms': w.forms(), - 'senses': [ - request.url_for('sense', lexicon=s.lexicon().specifier(), sense=s.id) - for s in w.senses() - ], - 'lexicon': request.url_for( - 'lexicons', lexicon=w.lexicon().specifier() - ), + 'type': 'word', + 'attributes': { + 'pos': 
w.pos, + 'lemma': w.lemma(), + 'forms': w.forms(), + }, + 'links': { + 'self': _url_for_obj(request, 'word', w, lexicon=lex_spec) + }, + 'relationships': { + 'senses': {'data': [{'type': 'sense', 'id': s.id} for s in senses]}, + 'synsets': {'data': [{'type': 'synset', 'id': ss.id} for ss in synsets]}, + 'lexicon': { + 'links': {'related': request.url_for('lexicon', lexicon=lex_spec)} + } + }, + 'included': [ + {'type': 'sense', + 'id': s.id, + 'links': {'self': _url_for_obj(request, 'sense', s)}} + for s in senses + ] + [ + {'type': 'synset', + 'id': ss.id, + 'links': {'self': _url_for_obj(request, 'synset', ss)}} + for ss in synsets + ] } def make_sense(s, request): - word = s.word() - synset = s.synset() + lex_spec = s.lexicon().specifier() + w = s.word() + ss = s.synset() return { 'id': s.id, - 'word': request.url_for( - 'word', lexicon=word.lexicon().specifier(), word=word.id - ), - 'synset': request.url_for( - 'synset', lexicon=synset.lexicon().specifier(), synset=synset.id - ), - 'lexicon': request.url_for( - 'lexicons', lexicon=s.lexicon().specifier() - ), + 'type': 'sense', + 'links': { + 'self': _url_for_obj(request, 'sense', s, lexicon=lex_spec) + }, + 'relationships': { + 'word': {'data': {'type': 'word', 'id': w.id}}, + 'synset': {'data': {'type': 'synset', 'id': ss.id}}, + 'lexicon': { + 'links': {'related': request.url_for('lexicon', lexicon=lex_spec)} + } + }, + 'included': [ + {'type': 'word', + 'id': w.id, + 'attributes': { + 'pos': w.pos, + 'lemma': w.lemma(), + 'forms': w.forms(), + }, + 'links': {'self': _url_for_obj(request, 'word', w)}}, + {'type': 'synset', + 'id': ss.id, + 'links': {'self': _url_for_obj(request, 'synset', ss)}} + ] } def make_synset(ss, request): + lex_spec = ss.lexicon().specifier() + senses = ss.senses() + words = ss.words() return { 'id': ss.id, - 'pos': ss.pos, - 'ili': ss._ili, - 'members': [ - request.url_for('sense', lexicon=s.lexicon().specifier(), sense=s.id) - for s in ss.senses() - ], - # 'relations': [], - 'lexicon': request.url_for( - 'lexicons', lexicon=ss.lexicon().specifier() - ), + 'type': 'synset', + 'attributes': { + 'pos': ss.pos, + 'ili': ss._ili, + }, + 'links': { + 'self': _url_for_obj(request, 'synset', ss, lexicon=lex_spec) + }, + 'relationships': { + 'members': {'data': [{'type': 'sense', 'id': s.id} for s in senses]}, + 'words': {'data': [{'type': 'word', 'id': w.id} for w in words]}, + 'lexicon': { + 'links': {'related': request.url_for('lexicon', lexicon=lex_spec)} + } + }, + 'included': [ + {'type': 'sense', + 'id': s.id, + 'links': {'self': _url_for_obj(request, 'sense', s)}} + for s in senses + ] + [ + {'type': 'word', + 'id': w.id, + 'links': {'self': _url_for_obj(request, 'word', w)}} + for w in words + ] } -@paginate('lexicons', make_lexicon) +@paginate(make_lexicon) async def lexicons(request): query = request.query_params - lexicon = request.path_params.get('lexicon') or query.get('lexicon') _lexicons = wn.lexicons( - lexicon=lexicon, + lexicon=query.get('lexicon'), lang=query.get('lang'), ) - return {'lexicons': _lexicons} + return {'data': _lexicons} + + +async def lexicon(request): + path_params = request.path_params + lex = wn.lexicons(lexicon=path_params['lexicon'])[0] + return JSONResponse({'data': make_lexicon(lex, request)}) -@paginate('words', make_word) +@paginate(make_word) async def words(request): query = request.query_params lexicon = request.path_params.get('lexicon') or query.get('lexicon') @@ -118,7 +221,7 @@ async def words(request): lexicon=lexicon, lang=query.get('lang'), ) - return {'words': 
_words} + return {'data': _words} async def word(request): @@ -127,7 +230,7 @@ async def word(request): return JSONResponse({'word': make_word(word, request)}) -@paginate('senses', make_sense) +@paginate(make_sense) async def senses(request): query = request.query_params lexicon = request.path_params.get('lexicon') or query.get('lexicon') @@ -137,7 +240,7 @@ async def senses(request): lexicon=lexicon, lang=query.get('lang'), ) - return {'senses': _senses} + return {'data': _senses} async def sense(request): @@ -146,7 +249,7 @@ async def sense(request): return JSONResponse({'sense': make_sense(sense, request)}) -@paginate('synsets', make_synset) +@paginate(make_synset) async def synsets(request): query = request.query_params lexicon = request.path_params.get('lexicon') or query.get('lexicon') @@ -157,7 +260,7 @@ async def synsets(request): lexicon=lexicon, lang=query.get('lang'), ) - return {'synsets': _synsets} + return {'data': _synsets} async def synset(request): @@ -168,7 +271,7 @@ async def synset(request): routes = [ Route('/lexicons', endpoint=lexicons), - Route('/lexicons/{lexicon}', endpoint=lexicons), + Route('/lexicons/{lexicon}', endpoint=lexicon), Route('/lexicons/{lexicon}/words', endpoint=words), Route('/lexicons/{lexicon}/words/{word}', endpoint=word), Route('/lexicons/{lexicon}/senses', endpoint=senses), From 7bb4a288a438cb2b67e1f33a8456ef56bd5e3335 Mon Sep 17 00:00:00 2001 From: Michael Wayne Goodman Date: Fri, 28 May 2021 18:21:51 +0800 Subject: [PATCH 23/39] Add Synset.relations() method This is like Synset.get_related() but returns a mapping of relation names to lists of related synsets. --- tests/relations_test.py | 9 +++++++++ wn/_core.py | 36 +++++++++++++++++++++++++----------- 2 files changed, 34 insertions(+), 11 deletions(-) diff --git a/tests/relations_test.py b/tests/relations_test.py index 7d71251..79366c1 100644 --- a/tests/relations_test.py +++ b/tests/relations_test.py @@ -40,6 +40,15 @@ def test_synset_hypernyms_expand_specified(): ] +@pytest.mark.usefixtures('mini_db') +def test_synset_relations(): + w = wn.Wordnet(lang='en') + assert w.synset('test-en-0002-n').relations() == { + 'hypernym': [w.synset('test-en-0001-n')], + 'hyponym': [w.synset('test-en-0004-n')] + } + + @pytest.mark.usefixtures('mini_db_1_1') def test_extension_relations(): # default mode diff --git a/wn/_core.py b/wn/_core.py index 3290944..64fd2b4 100644 --- a/wn/_core.py +++ b/wn/_core.py @@ -8,6 +8,7 @@ Tuple, Dict, Set, + Sequence, Iterator, ) import warnings @@ -597,15 +598,27 @@ def lemmas(self) -> List[Form]: """ return [w.lemma() for w in self.words()] + def relations(self, *args: str) -> Dict[str, List['Synset']]: + d: Dict[str, List['Synset']] = {} + for relname, ss in self._get_relations(args): + if relname in d: + d[relname].append(ss) + else: + d[relname] = [ss] + return d + def get_related(self, *args: str) -> List['Synset']: - targets: List['Synset'] = [] + return [ss for _, ss in self._get_relations(args)] + + def _get_relations(self, args: Sequence[str]) -> List[Tuple[str, 'Synset']]: + targets: List[Tuple[str, 'Synset']] = [] lexids = self._get_lexicon_ids() # first get relations from the current lexicon(s) if self._id != NON_ROWID: - relations = get_synset_relations({self._id}, args, lexids) - targets.extend(Synset(*row[2:], self._wordnet) + relations = list(get_synset_relations({self._id}, args, lexids)) + targets.extend((row[0], Synset(*row[2:], self._wordnet)) for row in relations if row[5] in lexids) @@ -616,25 +629,26 @@ def get_related(self, *args: str) -> 
List['Synset']: # get expanded relation expss = find_synsets(ili=self._ili, lexicon_rowids=expids) rowids = {rowid for _, _, _, _, rowid in expss} - {self._id, NON_ROWID} - relations = get_synset_relations(rowids, args, expids) + relations = list(get_synset_relations(rowids, args, expids)) ilis = {row[4] for row in relations} - {None} # map back to target lexicons - seen = {ss._id for ss in targets} + seen = {ss._id for _, ss in targets} for row in get_synsets_for_ilis(ilis, lexicon_rowids=lexids): if row[-1] not in seen: - targets.append(Synset(*row, self._wordnet)) + targets.append((row[0], Synset(*row, self._wordnet))) # add empty synsets for ILIs without a target in lexids - for ili in (ilis - {tgt._ili for tgt in targets}): - targets.append( - Synset.empty( + unseen_ilis = ilis - {tgt._ili for _, tgt in targets} + for rel_row in relations: + if rel_row[4] in unseen_ilis: + ss = Synset.empty( id=_INFERRED_SYNSET, - ili=ili, + ili=rel_row[4], _lexid=self._lexid, _wordnet=self._wordnet ) - ) + targets.append((rel_row[0], ss)) return targets From 5119d5bc2a89c57f6a0e8b764fbf8d22237abe9b Mon Sep 17 00:00:00 2001 From: Michael Wayne Goodman Date: Sat, 29 May 2021 18:21:31 +0800 Subject: [PATCH 24/39] Fix bug with Sense.get_related() with no relations --- tests/relations_test.py | 8 ++++++++ wn/_queries.py | 2 +- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/tests/relations_test.py b/tests/relations_test.py index 79366c1..d6a7223 100644 --- a/tests/relations_test.py +++ b/tests/relations_test.py @@ -49,6 +49,14 @@ def test_synset_relations(): } +@pytest.mark.usefixtures('mini_db') +def test_sense_get_related(): + w = wn.Wordnet('test-en') + assert w.sense('test-en-example-n-0002-01').get_related() == [ + w.sense('test-en-exemplify-v-0003-01') + ] + + @pytest.mark.usefixtures('mini_db_1_1') def test_extension_relations(): # default mode diff --git a/wn/_queries.py b/wn/_queries.py index 87a8d3c..94595b1 100644 --- a/wn/_queries.py +++ b/wn/_queries.py @@ -619,7 +619,7 @@ def get_sense_relations( ) -> Iterator[_Sense_Relation]: params: List = [] constraint = '' - if '*' not in relation_types: + if relation_types and '*' not in relation_types: constraint = f'WHERE type IN ({_qs(relation_types)})' params.extend(relation_types) params.append(source_rowid) From 0bbeb5e28d42e65f7310c4330da20fd6ca9fe641 Mon Sep 17 00:00:00 2001 From: Michael Wayne Goodman Date: Sat, 29 May 2021 18:22:48 +0800 Subject: [PATCH 25/39] Add Sense.relations() and some docstrings --- tests/relations_test.py | 8 +++++ wn/_core.py | 67 +++++++++++++++++++++++++++++++++++++++-- 2 files changed, 73 insertions(+), 2 deletions(-) diff --git a/tests/relations_test.py b/tests/relations_test.py index d6a7223..735de79 100644 --- a/tests/relations_test.py +++ b/tests/relations_test.py @@ -57,6 +57,14 @@ def test_sense_get_related(): ] +@pytest.mark.usefixtures('mini_db') +def test_sense_relations(): + w = wn.Wordnet('test-en') + assert w.sense('test-en-example-n-0002-01').relations() == { + 'derivation': [w.sense('test-en-exemplify-v-0003-01')] + } + + @pytest.mark.usefixtures('mini_db_1_1') def test_extension_relations(): # default mode diff --git a/wn/_core.py b/wn/_core.py index 64fd2b4..8ee1d18 100644 --- a/wn/_core.py +++ b/wn/_core.py @@ -440,6 +440,9 @@ def __init__( super().__init__(_lexid=_lexid, _id=_id, _wordnet=_wordnet) self.id = id + def relations(self: T, *args: str) -> Dict[str, List[T]]: + raise NotImplementedError + def get_related(self: T, *args: str) -> List[T]: raise NotImplementedError @@ -599,6 
+602,26 @@ def lemmas(self) -> List[Form]: return [w.lemma() for w in self.words()] def relations(self, *args: str) -> Dict[str, List['Synset']]: + """Return a mapping of relation names to lists of synsets. + + One or more relation names may be given as positional + arguments to restrict the relations returned. If no such + arguments are given, all relations starting from the synset + are returned. + + See :meth:`get_related` for getting a flat list of related + synsets. + + Example: + + >>> button_rels = wn.synsets('button')[0].relations() + >>> for relname, sslist in button_rels.items(): + ... print(relname, [ss.lemmas() for ss in sslist]) + ... + hypernym [['fixing', 'holdfast', 'fastener', 'fastening']] + hyponym [['coat button'], ['shirt button']] + + """ d: Dict[str, List['Synset']] = {} for relname, ss in self._get_relations(args): if relname in d: @@ -608,6 +631,23 @@ def relations(self, *args: str) -> Dict[str, List['Synset']]: return d def get_related(self, *args: str) -> List['Synset']: + """Return the list of related synsets. + + One or more relation names may be given as positional + arguments to restrict the relations returned. If no such + arguments are given, all relations starting from the synset + are returned. + + This method does not preserve the relation names that lead to + the related synsets. For a mapping of relation names to + related synsets, see :meth:`relations`. + + Example: + + >>> fulcrum = wn.synsets('fulcrum')[0] + >>> [ss.lemmas() for ss in fulcrum.get_related()] + [['pin', 'pivot'], ['lever']] + """ return [ss for _, ss in self._get_relations(args)] def _get_relations(self, args: Sequence[str]) -> List[Tuple[str, 'Synset']]: @@ -998,6 +1038,26 @@ def metadata(self) -> Metadata: """Return the sense's metadata.""" return get_metadata(self._id, 'senses') + def relations(self, *args: str) -> Dict[str, List['Sense']]: + """Return a mapping of relation names to lists of senses. + + One or more relation names may be given as positional + arguments to restrict the relations returned. If no such + arguments are given, all relations starting from the sense + are returned. + + See :meth:`get_related` for getting a flat list of related + senses. + + """ + d: Dict[str, List['Sense']] = {} + for relname, s in self._get_relations(args): + if relname in d: + d[relname].append(s) + else: + d[relname] = [s] + return d + def get_related(self, *args: str) -> List['Sense']: """Return a list of related senses. 
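A short sketch of how the grouped and flat relation views added above might be used together (assuming an English lexicon such as ``ewn:2020`` is installed; the lexicon name and the queried word are illustrative, not taken from this patch):

.. code-block:: python

   import wn

   en = wn.Wordnet('ewn:2020')  # assumed lexicon; any installed English wordnet works
   ss = en.synsets('button')[0]

   # relations() groups related synsets by relation name...
   for relname, targets in ss.relations('hypernym', 'hyponym').items():
       print(relname, [t.lemmas() for t in targets])

   # ...while get_related() returns the same targets as a flat list,
   # discarding the relation names.
   flat = ss.get_related('hypernym', 'hyponym')
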
@@ -1015,10 +1075,13 @@ def get_related(self, *args: str) -> List['Sense']: incoherent """ + return [s for _, s in self._get_relations(args)] + + def _get_relations(self, args: Sequence[str]) -> List[Tuple[str, 'Sense']]: lexids = self._get_lexicon_ids() iterable = get_sense_relations(self._id, args, lexids) - return [Sense(sid, eid, ssid, lexid, rowid, self._wordnet) - for _, _, sid, eid, ssid, lexid, rowid in iterable + return [(relname, Sense(sid, eid, ssid, lexid, rowid, self._wordnet)) + for relname, _, sid, eid, ssid, lexid, rowid in iterable if lexids is None or lexid in lexids] def get_related_synsets(self, *args: str) -> List[Synset]: From c0594f3e97c2fce705f6bb380c45eb3ceac7926c Mon Sep 17 00:00:00 2001 From: Michael Wayne Goodman Date: Sun, 30 May 2021 10:38:17 +0800 Subject: [PATCH 26/39] Add sense/synset relations to web API And some typing of the functions --- wn/web.py | 149 +++++++++++++++++++++++++++--------------------------- 1 file changed, 75 insertions(+), 74 deletions(-) diff --git a/wn/web.py b/wn/web.py index 2f1c89a..cf1015b 100644 --- a/wn/web.py +++ b/wn/web.py @@ -1,10 +1,12 @@ +from typing import Optional, Union from functools import wraps from urllib.parse import urlsplit, parse_qs, urlencode -from starlette.applications import Starlette -from starlette.responses import JSONResponse -from starlette.routing import Route +from starlette.applications import Starlette # type: ignore +from starlette.responses import JSONResponse # type: ignore +from starlette.routing import Route # type: ignore +from starlette.requests import Request # type: ignore import wn @@ -53,7 +55,12 @@ def replace_query_params(url: str, **params) -> str: # Data-making functions -def _url_for_obj(request, name, obj, lexicon=None): +def _url_for_obj( + request: Request, + name: str, + obj: Union[wn.Word, wn.Sense, wn.Synset], + lexicon: Optional[str] = None, +) -> str: if lexicon is None: lexicon = obj.lexicon().specifier() kwargs = { @@ -63,7 +70,7 @@ def _url_for_obj(request, name, obj, lexicon=None): return request.url_for(name, **kwargs) -def make_lexicon(lex, request): +def make_lexicon(lex: wn.Lexicon, request: Request) -> dict: spec = lex.specifier() return { 'id': lex.id, @@ -91,11 +98,9 @@ def make_lexicon(lex, request): } -def make_word(w, request): +def make_word(w: wn.Word, request: Request, basic: bool = False) -> dict: lex_spec = w.lexicon().specifier() - senses = w.senses() - synsets = w.synsets() - return { + d: dict = { 'id': w.id, 'type': 'word', 'attributes': { @@ -105,66 +110,60 @@ def make_word(w, request): }, 'links': { 'self': _url_for_obj(request, 'word', w, lexicon=lex_spec) - }, - 'relationships': { - 'senses': {'data': [{'type': 'sense', 'id': s.id} for s in senses]}, - 'synsets': {'data': [{'type': 'synset', 'id': ss.id} for ss in synsets]}, - 'lexicon': { - 'links': {'related': request.url_for('lexicon', lexicon=lex_spec)} - } - }, - 'included': [ - {'type': 'sense', - 'id': s.id, - 'links': {'self': _url_for_obj(request, 'sense', s)}} - for s in senses - ] + [ - {'type': 'synset', - 'id': ss.id, - 'links': {'self': _url_for_obj(request, 'synset', ss)}} - for ss in synsets - ] + } } + if not basic: + senses = w.senses() + synsets = w.synsets() + lex_link = request.url_for('lexicon', lexicon=lex_spec) + d.update({ + 'relationships': { + 'senses': {'data': [dict(type='sense', id=s.id) for s in senses]}, + 'synsets': {'data': [dict(type='synset', id=ss.id) for ss in synsets]}, + 'lexicon': {'links': {'related': lex_link}} + }, + 'included': ( + [make_sense(s, 
request, basic=True) for s in senses] + + [make_synset(ss, request, basic=True) for ss in synsets] + ) + }) + return d -def make_sense(s, request): +def make_sense(s: wn.Sense, request: Request, basic: bool = False) -> dict: lex_spec = s.lexicon().specifier() - w = s.word() - ss = s.synset() - return { + d: dict = { 'id': s.id, 'type': 'sense', 'links': { 'self': _url_for_obj(request, 'sense', s, lexicon=lex_spec) - }, - 'relationships': { + } + } + if not basic: + w = s.word() + ss = s.synset() + lex_link = request.url_for('lexicon', lexicon=lex_spec) + relationships: dict = { 'word': {'data': {'type': 'word', 'id': w.id}}, 'synset': {'data': {'type': 'synset', 'id': ss.id}}, - 'lexicon': { - 'links': {'related': request.url_for('lexicon', lexicon=lex_spec)} - } - }, - 'included': [ - {'type': 'word', - 'id': w.id, - 'attributes': { - 'pos': w.pos, - 'lemma': w.lemma(), - 'forms': w.forms(), - }, - 'links': {'self': _url_for_obj(request, 'word', w)}}, - {'type': 'synset', - 'id': ss.id, - 'links': {'self': _url_for_obj(request, 'synset', ss)}} + 'lexicon': {'links': {'related': lex_link}} + } + included = [ + make_word(w, request, basic=True), + make_synset(ss, request, basic=True) ] - } + for relname, slist in s.relations().items(): + relationships[relname] = { + 'data': [dict(type='sense', id=_s.id) for _s in slist] + } + included.extend([make_sense(_s, request, basic=True) for _s in slist]) + d.update({'relationships': relationships, 'included': included}) + return d -def make_synset(ss, request): +def make_synset(ss: wn.Synset, request: Request, basic: bool = False) -> dict: lex_spec = ss.lexicon().specifier() - senses = ss.senses() - words = ss.words() - return { + d: dict = { 'id': ss.id, 'type': 'synset', 'attributes': { @@ -173,26 +172,28 @@ def make_synset(ss, request): }, 'links': { 'self': _url_for_obj(request, 'synset', ss, lexicon=lex_spec) - }, - 'relationships': { - 'members': {'data': [{'type': 'sense', 'id': s.id} for s in senses]}, - 'words': {'data': [{'type': 'word', 'id': w.id} for w in words]}, - 'lexicon': { - 'links': {'related': request.url_for('lexicon', lexicon=lex_spec)} - } - }, - 'included': [ - {'type': 'sense', - 'id': s.id, - 'links': {'self': _url_for_obj(request, 'sense', s)}} - for s in senses - ] + [ - {'type': 'word', - 'id': w.id, - 'links': {'self': _url_for_obj(request, 'word', w)}} - for w in words - ] + } } + if not basic: + senses = ss.senses() + words = ss.words() + lex_link = request.url_for('lexicon', lexicon=lex_spec) + relationships: dict = { + 'members': {'data': [dict(type='sense', id=s.id) for s in senses]}, + 'words': {'data': [dict(type='word', id=w.id) for w in words]}, + 'lexicon': {'links': {'related': lex_link}} + } + included = ( + [make_sense(s, request, basic=True) for s in senses] + + [make_word(w, request, basic=True) for w in words] + ) + for relname, sslist in ss.relations().items(): + relationships[relname] = { + 'data': [dict(type='synset', id=_s.id) for _s in sslist] + } + included.extend([make_synset(_s, request, basic=True) for _s in sslist]) + d.update({'relationships': relationships, 'included': included}) + return d @paginate(make_lexicon) From 41efbc3ea02546fe8170e61c34edfcac80b834b0 Mon Sep 17 00:00:00 2001 From: Michael Wayne Goodman Date: Sun, 30 May 2021 10:38:40 +0800 Subject: [PATCH 27/39] Make pagination follow the JSON-API suggestion --- wn/web.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/wn/web.py b/wn/web.py index cf1015b..a9476f8 100644 --- a/wn/web.py +++ 
b/wn/web.py @@ -18,20 +18,23 @@ def paginate(proto): def paginate_wrapper(func): @wraps(func) - async def _paginate_wrapper(request): + async def _paginate_wrapper(request: Request) -> JSONResponse: url = str(request.url) query = dict(request.query_params) - offset = abs(int(query.pop('offset', 0))) - limit = abs(int(query.pop('limit', DEFAULT_PAGINATION_LIMIT))) + offset = abs(int(query.pop('page[offset]', 0))) + limit = abs(int(query.pop('page[limit]', DEFAULT_PAGINATION_LIMIT))) obj = await func(request) total = len(obj['data']) + last = (total//limit)*limit + prev = max(0, offset - limit) + next = offset + limit links = { - 'first': replace_query_params(url, offset=0), - 'last': replace_query_params(url, offset=(total//limit)*limit), - 'prev': replace_query_params(url, offset=max(0, offset - limit)), - 'next': replace_query_params(url, offset=offset + limit), + 'first': replace_query_params(url, **{'page[offset]': 0}), + 'last': replace_query_params(url, **{'page[offset]': last}), + 'prev': replace_query_params(url, **{'page[offset]': prev}), + 'next': replace_query_params(url, **{'page[offset]': next}), } obj['data'] = [proto(x, request) for x in obj['data'][offset:offset+limit]] From 220fde0ad140f9541d899b1fdd825d61bc77a010 Mon Sep 17 00:00:00 2001 From: Michael Wayne Goodman Date: Mon, 31 May 2021 14:37:01 +0800 Subject: [PATCH 28/39] wn.web: reduce prominence of senses, other fixes --- wn/web.py | 60 ++++++++++++++++++++++++++++--------------------------- 1 file changed, 31 insertions(+), 29 deletions(-) diff --git a/wn/web.py b/wn/web.py index a9476f8..1cecd56 100644 --- a/wn/web.py +++ b/wn/web.py @@ -76,9 +76,10 @@ def _url_for_obj( def make_lexicon(lex: wn.Lexicon, request: Request) -> dict: spec = lex.specifier() return { - 'id': lex.id, + 'id': spec, 'type': 'lexicon', 'attributes': { + # cannot have 'id' as JSON:API disallows it 'version': lex.version, 'label': lex.label, 'language': lex.language, @@ -116,19 +117,16 @@ def make_word(w: wn.Word, request: Request, basic: bool = False) -> dict: } } if not basic: - senses = w.senses() synsets = w.synsets() lex_link = request.url_for('lexicon', lexicon=lex_spec) + senses_link = request.url_for('senses', word=w.id, lexicon=lex_spec) d.update({ 'relationships': { - 'senses': {'data': [dict(type='sense', id=s.id) for s in senses]}, + 'senses': {'links': {'related': senses_link}}, 'synsets': {'data': [dict(type='synset', id=ss.id) for ss in synsets]}, 'lexicon': {'links': {'related': lex_link}} }, - 'included': ( - [make_sense(s, request, basic=True) for s in senses] - + [make_synset(ss, request, basic=True) for ss in synsets] - ) + 'included': [make_synset(ss, request, basic=True) for ss in synsets] }) return d @@ -146,15 +144,14 @@ def make_sense(s: wn.Sense, request: Request, basic: bool = False) -> dict: w = s.word() ss = s.synset() lex_link = request.url_for('lexicon', lexicon=lex_spec) + word_link = request.url_for('word', word=w.id, lexicon=lex_spec) + synset_link = request.url_for('synset', synset=ss.id, lexicon=lex_spec) relationships: dict = { - 'word': {'data': {'type': 'word', 'id': w.id}}, - 'synset': {'data': {'type': 'synset', 'id': ss.id}}, + 'word': {'links': {'related': word_link}}, + 'synset': {'links': {'related': synset_link}}, 'lexicon': {'links': {'related': lex_link}} } - included = [ - make_word(w, request, basic=True), - make_synset(ss, request, basic=True) - ] + included = [] for relname, slist in s.relations().items(): relationships[relname] = { 'data': [dict(type='sense', id=_s.id) for _s in slist] @@ 
-178,18 +175,15 @@ def make_synset(ss: wn.Synset, request: Request, basic: bool = False) -> dict: } } if not basic: - senses = ss.senses() words = ss.words() lex_link = request.url_for('lexicon', lexicon=lex_spec) + members_link = request.url_for('senses', synset=ss.id, lexicon=lex_spec) relationships: dict = { - 'members': {'data': [dict(type='sense', id=s.id) for s in senses]}, + 'members': {'links': {'related': members_link}}, 'words': {'data': [dict(type='word', id=w.id) for w in words]}, 'lexicon': {'links': {'related': lex_link}} } - included = ( - [make_sense(s, request, basic=True) for s in senses] - + [make_word(w, request, basic=True) for w in words] - ) + included = [make_word(w, request, basic=True) for w in words] for relname, sslist in ss.relations().items(): relationships[relname] = { 'data': [dict(type='synset', id=_s.id) for _s in sslist] @@ -231,26 +225,32 @@ async def words(request): async def word(request): path_params = request.path_params word = wn.word(path_params['word'], lexicon=path_params['lexicon']) - return JSONResponse({'word': make_word(word, request)}) + return JSONResponse({'data': make_word(word, request)}) @paginate(make_sense) async def senses(request): query = request.query_params - lexicon = request.path_params.get('lexicon') or query.get('lexicon') - _senses = wn.senses( - form=query.get('form'), - pos=query.get('pos'), - lexicon=lexicon, - lang=query.get('lang'), - ) + path = request.path_params + lexicon = path.get('lexicon') or query.get('lexicon') + if 'word' in path: + _senses = wn.word(path['word'], lexicon=lexicon).senses() + elif 'synset' in path: + _senses = wn.synset(path['synset'], lexicon=lexicon).members() + else: + _senses = wn.senses( + form=query.get('form'), + pos=query.get('pos'), + lexicon=lexicon, + lang=query.get('lang'), + ) return {'data': _senses} async def sense(request): path_params = request.path_params sense = wn.sense(path_params['sense'], lexicon=path_params['lexicon']) - return JSONResponse({'sense': make_sense(sense, request)}) + return JSONResponse({'data': make_sense(sense, request)}) @paginate(make_synset) @@ -270,7 +270,7 @@ async def synsets(request): async def synset(request): path_params = request.path_params synset = wn.synset(path_params['synset'], lexicon=path_params['lexicon']) - return JSONResponse({'synset': make_synset(synset, request)}) + return JSONResponse({'data': make_synset(synset, request)}) routes = [ @@ -278,10 +278,12 @@ async def synset(request): Route('/lexicons/{lexicon}', endpoint=lexicon), Route('/lexicons/{lexicon}/words', endpoint=words), Route('/lexicons/{lexicon}/words/{word}', endpoint=word), + Route('/lexicons/{lexicon}/words/{word}/senses', endpoint=senses), Route('/lexicons/{lexicon}/senses', endpoint=senses), Route('/lexicons/{lexicon}/senses/{sense}', endpoint=sense), Route('/lexicons/{lexicon}/synsets', endpoint=synsets), Route('/lexicons/{lexicon}/synsets/{synset}', endpoint=synset), + Route('/lexicons/{lexicon}/synsets/{synset}/members', endpoint=senses), Route('/words', endpoint=words), Route('/senses', endpoint=senses), Route('/synsets', endpoint=synsets), From 77feb41e1aa296d936a7957f0f995b957096ae63 Mon Sep 17 00:00:00 2001 From: Michael Wayne Goodman Date: Mon, 31 May 2021 14:38:09 +0800 Subject: [PATCH 29/39] wn.web: Only included relevant pagination links --- wn/web.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/wn/web.py b/wn/web.py index 1cecd56..5960c95 100644 --- a/wn/web.py +++ b/wn/web.py @@ -26,20 +26,22 @@ async def 
_paginate_wrapper(request: Request) -> JSONResponse: obj = await func(request) total = len(obj['data']) - last = (total//limit)*limit prev = max(0, offset - limit) next = offset + limit + last = (total//limit)*limit - links = { - 'first': replace_query_params(url, **{'page[offset]': 0}), - 'last': replace_query_params(url, **{'page[offset]': last}), - 'prev': replace_query_params(url, **{'page[offset]': prev}), - 'next': replace_query_params(url, **{'page[offset]': next}), - } - obj['data'] = [proto(x, request) - for x in obj['data'][offset:offset+limit]] + obj['data'] = [proto(x, request) for x in obj['data'][offset:offset+limit]] obj.setdefault('meta', {}).update(total=total) - obj.setdefault('links', {}).update(links) + + links = {} + if offset > 0: + links['first'] = replace_query_params(url, **{'page[offset]': 0}) + links['prev'] = replace_query_params(url, **{'page[offset]': prev}) + if next < total: + links['next'] = replace_query_params(url, **{'page[offset]': next}) + links['last'] = replace_query_params(url, **{'page[offset]': last}) + if links: + obj.setdefault('links', {}).update(links) return JSONResponse(obj) From 23158c0ad624fc5fc2aebf50df02f5fa7a5809a7 Mon Sep 17 00:00:00 2001 From: Michael Wayne Goodman Date: Mon, 31 May 2021 14:38:45 +0800 Subject: [PATCH 30/39] Fix some Furo/Sphinx dark-mode issues --- docs/conf.py | 2 +- docs/requirements.txt | 2 +- pyproject.toml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 14b492f..d94b404 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -82,7 +82,7 @@ html_logo = "_static/wn-logo.svg" pygments_style = 'manni' -pygments_dark_style = 'manni' +pygments_dark_style = 'monokai' # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, diff --git a/docs/requirements.txt b/docs/requirements.txt index cc8b60d..dd976fe 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,4 +1,4 @@ -sphinx +sphinx>=3.5,<4 furo sphinx-copybutton . diff --git a/pyproject.toml b/pyproject.toml index ce14232..dbe944f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -45,7 +45,7 @@ test = [ "nox", ] doc = [ - "sphinx", + "sphinx>=3.5,<4", "furo", "sphinx-copybutton", ] From af9852d645ca53d1e5379b01d1458486a79dd531 Mon Sep 17 00:00:00 2001 From: Michael Wayne Goodman Date: Mon, 31 May 2021 14:39:11 +0800 Subject: [PATCH 31/39] Documentation for wn.web --- docs/api/wn.constants.rst | 2 + docs/api/wn.web.rst | 438 ++++++++++++++++++++++++++++++++++++++ docs/guides/basic.rst | 4 +- docs/index.rst | 1 + wn/web.py | 2 + 5 files changed, 446 insertions(+), 1 deletion(-) create mode 100644 docs/api/wn.web.rst diff --git a/docs/api/wn.constants.rst b/docs/api/wn.constants.rst index 2112965..bc8420c 100644 --- a/docs/api/wn.constants.rst +++ b/docs/api/wn.constants.rst @@ -140,6 +140,8 @@ Sense Relations - ``other`` +.. _parts-of-speech: + Parts of Speech --------------- diff --git a/docs/api/wn.web.rst b/docs/api/wn.web.rst new file mode 100644 index 0000000..c93eb95 --- /dev/null +++ b/docs/api/wn.web.rst @@ -0,0 +1,438 @@ + +wn.web +====== + +.. automodule:: wn.web + +This module provides a RESTful API with `JSON:API`_ responses to +queries against a Wn database. This API implements the primary queries +of the Python API (see :ref:`primary-queries`). 
For instance, to +search all words in the ``ewn:2020`` lexicon with the form *jet* and +part-of-speech *v*, we can perform the following query:: + + /lexicons/ewn:2020/words?form=jet&pos=v + +This query would return the following response (abbreviated): + +.. code-block:: javascript + + { + "data": [ + { + "id": "ewn-jet-v", + "type": "word", + "attributes": { + "pos": "v", + "lemma": "jet", + "forms": ["jet", "jetted", "jetting"] + }, + "links": { + "self": "http://example.com/lexicons/ewn:2020/words/ewn-jet-v" + }, + "relationships": { + "senses": { + "links": {"related": "http://example.com/lexicons/ewn:2020/words/ewn-jet-v/senses"} + }, + "synsets": { + "data": [ + {"type": "synset", "id": "ewn-01518922-v"}, + {"type": "synset", "id": "ewn-01946093-v"} + ] + }, + "lexicon": { + "links": {"related": "http://example.com/lexicons/ewn:2020"} + } + }, + "included": [ + { + "id": "ewn-01518922-v", + "type": "synset", + "attributes": {"pos": "v", "ili": "i29306"}, + "links": {"self": "http://example.com/lexicons/ewn:2020/synsets/ewn-01518922-v"} + }, + { + "id": "ewn-01946093-v", + "type": "synset", + "attributes": {"pos": "v", "ili": "i31432"}, + "links": {"self": "http://example.com/lexicons/ewn:2020/synsets/ewn-01946093-v"} + } + ] + } + ], + "meta": {"total": 1} + } + +Currently, only ``GET`` requests are handled. + +.. _JSON\:API: https://jsonapi.org + + +Installing Dependencies +----------------------- + +By default, Wn does not install the requirements needed for this +module. Install them with the ``[web]`` extra: + +.. code-block:: bash + + $ pip install wn[web] + + +Running and Deploying the Server +-------------------------------- + +This module does not provide an ASGI server, so one will need to be +installed and ran separately. Any ASGI-compliant server should +work. + +For example, the `Uvicorn `_ server may be +used directly (preferably with the ``--reload`` option for hot +reloading) for local development: + +.. code-block:: bash + + $ uvicorn --reload wn.web:app + +For production, see Uvicorn's `documentation about deployment +`_. + + +Requests +-------- + +API Endpoints +''''''''''''' + +The module provides the following endpoints: + +.. 
table:: + :width: 100% + + ====================================== ======================================================== + Endpoint Description + ====================================== ======================================================== + ``/words`` List words in all available lexicons + ``/senses`` List senses in all available lexicons + ``/synsets`` List synsets in all available lexicons + ``/lexicons`` List available lexicons + ``/lexicons/:lex`` Get lexicon with specifier ``:lex`` + ``/lexicons/:lex/words`` List words for lexicon with specifier ``:lex`` + ``/lexicons/:lex/words/:id/senses`` List senses for word ``:id`` in lexicon ``:lex`` + ``/lexicons/:lex/words/:id`` Get word with ID ``:id`` in lexicon ``:lex`` + ``/lexicons/:lex/senses`` List senses for lexicon with specifier ``:lex`` + ``/lexicons/:lex/senses/:id`` Get sense with ID ``:id`` in lexicon ``:lex`` + ``/lexicons/:lex/synsets`` List synsets for lexicon with specifier ``:lex`` + ``/lexicons/:lex/synsets/:id`` Get synset with ID ``:id`` in lexicon ``:lex`` + ``/lexicons/:lex/synsets/:id/members`` Get member senses for synset ``:id`` in lexicon ``:lex`` + ====================================== ======================================================== + + +Query Parameters +'''''''''''''''' + +``lang`` +~~~~~~~~ + +Specifies the language in `BCP 47`_ of the lexicon(s) from which +results are returned. + +.. _BCP 47: https://en.wikipedia.org/wiki/IETF_language_tag + +Example:: + + /words?lang=fr + +Valid for:: + + /lexicons + /words + /senses + /synsets + +``form`` +~~~~~~~~ + +Specifies the word form of the objects that are returned. + +Example:: + + /words?form=chat + +Valid for:: + + /words + /senses + /synsets + /lexicon/:lex/words + /lexicon/:lex/senses + /lexicon/:lex/synsets + +``pos`` +~~~~~~~ + +Specifies the part-of-speech of the objects that are returned. Valid +values are given in :ref:`parts-of-speech`. + +Example:: + + /words?pos=v + +Valid for:: + + /words + /senses + /synsets + /lexicon/:lex/words + /lexicon/:lex/senses + /lexicon/:lex/synsets + +``ili`` +~~~~~~~ + +Specifies the interlingual index of a synset. + +Example:: + + /synsets?ili=i57031 + +Valid for:: + + /synsets + /lexicon/:lex/synsets + +``page[offset]`` and ``page[limit]`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Used for pagination: ``page[offset]`` specifies the starting index of +a set of results, and ``page[limit]`` specifies how many results from +the offset will be returned. + +Example:: + + /words?page[offset]=150 + +Valid for:: + + /words + /senses + /synsets + /lexicon/:lex/words + /lexicon/:lex/senses + /lexicon/:lex/synsets + + +Responses +--------- + +Responses are JSON data following the `JSON:API`_ specification. A +full description of JSON:API is left to the linked specification, but +a brief walkthrough is provided here. First, the top-level structure +of "to-one" responses (e.g., getting a single synset) is: + +.. code-block:: javascript + + { + "data": { ... }, // primary response data as a JSON object + "meta": { ... } // metadata for the response + } + +For "to-many" responses (e.g., getting a list of matching synsets), it +is the same as above except the ``data`` key maps to an array and it +includes pagination links: + +.. code-block:: javascript + + { + "data": [{ ... }, ...], // primary response data as an array of objects + "links": { ... }, // pagination links + "meta": { ... 
} // metadata; e.g., total number of results + } + + +Resource Objects +'''''''''''''''' + +Each JSON:API *resource object* (the primary data given by the +``data`` key) has the following structure: + +.. code-block:: javascript + + { + "id": "...", // Lexicon specifier or entity ID + "type": "...", // "lexicon", "word", "sense", or "synset" + "attributes": { ... }, // Basic resource information + "links": { "self": ... }, // URL for this specific resource + "relationships": { ... }, // Word senses, synset members, other relations + "included": [ ... ], // Data for related resources + } + + +Lexicons +~~~~~~~~ + +.. code-block:: javascript + + { + "id": "ewn:2020", + "type": "lexicon", + "attributes": { + "version": "2020", + "label": "English WordNet", + "language": "en", + "license": "https://creativecommons.org/licenses/by/4.0/" + }, + "links": {"self": "http://example.com/lexicons/ewn:2020"}, + "relationships": { + "words": {"links": {"related": "http://example.com/lexicons/ewn:2020/words"}}, + "synsets": {"links": {"related": "http://example.com/lexicons/ewn:2020/synsets"}}, + "senses": {"links": {"related": "http://example.com/lexicons/ewn:2020/senses"}} + } + } + + +Words +~~~~~ + +.. code-block:: javascript + + { + "id": "ewn-brick-v", + "type": "word", + "attributes": {"pos": "v", "lemma": "brick", "forms": ["brick"]}, + "links": {"self": "http://example.com/lexicons/ewn:2020/words/ewn-brick-v"}, + "relationships": { + "senses": {"links": {"related": "http://example.com/lexicons/ewn:2020/words/ewn-brick-v/senses"}}, + "synsets": {"data": [{"type": "synset", "id": "ewn-90011761-v"}]}, + "lexicon": {"links": {"related": "http://example.com/lexicons/ewn:2020"}} + }, + "included": [ + { + "id": "ewn-90011761-v", + "type": "synset", + "attributes": {"pos": "v", "ili": null}, + "links": {"self": "http://example.com/lexicons/ewn:2020/synsets/ewn-90011761-v"} + } + ] + } + + +Senses +~~~~~~ + +.. code-block:: javascript + + { + "id": "ewn-explain-v-00941308-01", + "type": "sense", + "links": {"self": "http://example.com/lexicons/ewn:2020/senses/ewn-explain-v-00941308-01"}, + "relationships": { + "word": {"links": {"related": "http://example.com/lexicons/ewn:2020/words/ewn-explain-v"}}, + "synset": {"links": {"related": "http://example.com/lexicons/ewn:2020/synsets/ewn-00941308-v"}}, + "lexicon": {"links": {"related": "http://example.com/lexicons/ewn:2020"}}, + "derivation": { + "data": [ + {"type": "sense", "id": "ewn-explanatory-s-01327635-01"}, + {"type": "sense", "id": "ewn-explanation-n-07247081-01"} + ] + } + }, + "included": [ + { + "id": "ewn-explanatory-s-01327635-01", + "type": "sense", + "links": {"self": "http://example.com/lexicons/ewn:2020/senses/ewn-explanatory-s-01327635-01"} + }, + { + "id": "ewn-explanation-n-07247081-01", + "type": "sense", + "links": {"self": "http://example.com/lexicons/ewn:2020/senses/ewn-explanation-n-07247081-01"} + } + ] + } + + +Synsets +~~~~~~~ + +.. 
code-block:: javascript + + { + "id": "ewn-03204585-n", + "type": "synset", + "attributes": {"pos": "n", "ili": "i52917"}, + "links": {"self": "http://example.com/lexicons/ewn:2020/synsets/ewn-03204585-n"}, + "relationships": { + "members": {"links": {"related": "http://example.com/lexicons/ewn:2020/synsets/ewn-03204585-n/members"}}, + "words": { + "data": [ + {"type": "word", "id": "ewn-dory-n"}, + {"type": "word", "id": "ewn-rowboat-n"}, + {"type": "word", "id": "ewn-dinghy-n"} + ] + }, + "lexicon": {"links": {"related": "http://example.com/lexicons/ewn:2020"}}, + "hypernym": {"data": [{"type": "synset", "id": "ewn-04252125-n"}]}, + "mero_part": { + "data": [ + {"type": "synset", "id": "ewn-03911849-n"}, + {"type": "synset", "id": "ewn-04439177-n"} + ] + }, + "hyponym": { + "data": [ + {"type": "synset", "id": "ewn-04122550-n"}, + {"type": "synset", "id": "ewn-04584425-n"} + ] + } + }, + "included": [ + { + "id": "ewn-dory-n", + "type": "word", + "attributes": {"pos": "n", "lemma": "dory", "forms": ["dory"]}, + "links": {"self": "http://example.com/lexicons/ewn:2020/words/ewn-dory-n"} + }, + { + "id": "ewn-rowboat-n", + "type": "word", + "attributes": {"pos": "n", "lemma": "rowboat", "forms": ["rowboat"]}, + "links": {"self": "http://example.com/lexicons/ewn:2020/words/ewn-rowboat-n"} + }, + { + "id": "ewn-dinghy-n", + "type": "word", + "attributes": {"pos": "n", "lemma": "dinghy", "forms": ["dinghy"]}, + "links": {"self": "http://example.com/lexicons/ewn:2020/words/ewn-dinghy-n"} + }, + { + "id": "ewn-04252125-n", + "type": "synset", + "attributes": {"pos": "n", "ili": "i59107"}, + "links": {"self": "http://example.com/lexicons/ewn:2020/synsets/ewn-04252125-n"} + }, + { + "id": "ewn-03911849-n", + "type": "synset", + "attributes": {"pos": "n", "ili": "i57094"}, + "links": {"self": "http://example.com/lexicons/ewn:2020/synsets/ewn-03911849-n"} + }, + { + "id": "ewn-04439177-n", + "type": "synset", + "attributes": {"pos": "n", "ili": "i60240"}, + "links": {"self": "http://example.com/lexicons/ewn:2020/synsets/ewn-04439177-n"} + }, + { + "id": "ewn-04122550-n", + "type": "synset", + "attributes": {"pos": "n", "ili": "i58319"}, + "links": {"self": "http://example.com/lexicons/ewn:2020/synsets/ewn-04122550-n"} + }, + { + "id": "ewn-04584425-n", + "type": "synset", + "attributes": {"pos": "n", "ili": "i61103"}, + "links": {"self": "http://example.com/lexicons/ewn:2020/synsets/ewn-04584425-n"} + } + ] + } diff --git a/docs/guides/basic.rst b/docs/guides/basic.rst index 2bb57b2..14214b3 100644 --- a/docs/guides/basic.rst +++ b/docs/guides/basic.rst @@ -50,6 +50,8 @@ In fact, the simple queries above implicitly create such a lexicons. +.. _primary-queries: + Primary Queries --------------- @@ -226,7 +228,7 @@ Filtering by Language --------------------- The ``lang`` parameter of :func:`wn.words()`, :func:`wn.senses()`, -:func:`wn.synsets()`, and :class:`~wn.Wordnet` allows a single `BCP-47 +:func:`wn.synsets()`, and :class:`~wn.Wordnet` allows a single `BCP 47 `_ language code. When this parameter is used, only entries in the specified language will be returned. 
diff --git a/docs/index.rst b/docs/index.rst index f4b3e19..cd23e6d 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -62,3 +62,4 @@ Contents api/wn.project.rst api/wn.similarity.rst api/wn.util.rst + api/wn.web.rst diff --git a/wn/web.py b/wn/web.py index 5960c95..adca6f7 100644 --- a/wn/web.py +++ b/wn/web.py @@ -1,4 +1,6 @@ +"""Web interface for Wn databases.""" + from typing import Optional, Union from functools import wraps from urllib.parse import urlsplit, parse_qs, urlencode From 6818860c978d8b961d81d6f39f7faaff314bce72 Mon Sep 17 00:00:00 2001 From: Michael Wayne Goodman Date: Mon, 31 May 2021 14:43:07 +0800 Subject: [PATCH 32/39] Add forgotten CHANGELOG entries Resolve #82 Resolve #116 --- CHANGELOG.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index a4014d6..a42661a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,9 @@ only by instantiating a `wn.Wordnet` object ([#105]) * `wn.morphy` ([#19]) * `wn.Wordnet.lemmatizer` attribute ([#8]) +* `wn.web` ([#116]) +* `wn.Sense.relations()` ([#82]) +* `wn.Synset.relations()` ([#82]) ### Changed @@ -379,6 +382,7 @@ abandoned, but this is an entirely new codebase. [#78]: https://github.com/goodmami/wn/issues/78 [#79]: https://github.com/goodmami/wn/issues/79 [#81]: https://github.com/goodmami/wn/issues/81 +[#82]: https://github.com/goodmami/wn/issues/82 [#83]: https://github.com/goodmami/wn/issues/83 [#86]: https://github.com/goodmami/wn/issues/86 [#87]: https://github.com/goodmami/wn/issues/87 @@ -393,4 +397,5 @@ abandoned, but this is an entirely new codebase. [#105]: https://github.com/goodmami/wn/issues/105 [#106]: https://github.com/goodmami/wn/issues/106 [#108]: https://github.com/goodmami/wn/issues/108 +[#116]: https://github.com/goodmami/wn/issues/116 [#117]: https://github.com/goodmami/wn/issues/117 From c4a581d41aad67b8f3489f952930291645c8796f Mon Sep 17 00:00:00 2001 From: Michael Wayne Goodman Date: Mon, 7 Jun 2021 23:00:23 +0800 Subject: [PATCH 33/39] Restructure documentation hierarchhy Project-level "guides" are now top-level in the documentation. --- docs/api/wn.rst | 2 +- docs/{guides => }/cli.rst | 0 docs/index.rst | 4 ++-- docs/{guides => }/setup.rst | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) rename docs/{guides => }/cli.rst (100%) rename docs/{guides => }/setup.rst (98%) diff --git a/docs/api/wn.rst b/docs/api/wn.rst index 242c5c1..3084b65 100644 --- a/docs/api/wn.rst +++ b/docs/api/wn.rst @@ -286,7 +286,7 @@ Wn's data storage and retrieval can be configured through the .. seealso:: - :doc:`../guides/setup` describes how to configure Wn using the + :doc:`../setup` describes how to configure Wn using the :data:`wn.config` instance. .. autodata:: config diff --git a/docs/guides/cli.rst b/docs/cli.rst similarity index 100% rename from docs/guides/cli.rst rename to docs/cli.rst diff --git a/docs/index.rst b/docs/index.rst index cd23e6d..b28f248 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -36,14 +36,14 @@ Contents .. toctree:: :maxdepth: 2 + setup.rst + cli.rst faq.rst .. toctree:: :caption: Guides :maxdepth: 2 - guides/setup.rst - guides/cli.rst guides/lexicons.rst guides/basic.rst guides/interlingual.rst diff --git a/docs/guides/setup.rst b/docs/setup.rst similarity index 98% rename from docs/guides/setup.rst rename to docs/setup.rst index d14642a..0655701 100644 --- a/docs/guides/setup.rst +++ b/docs/setup.rst @@ -4,7 +4,7 @@ Installation and Configuration .. seealso:: This guide is for installing and configuring the Wn software. 
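For example, assuming a French lexicon is installed, a language-restricted query might look like the following sketch (the word *chat* is illustrative; the results depend entirely on which lexicons are installed):

.. code-block:: python

   import wn

   # Only entries from lexicons whose language matches the BCP 47 code
   # 'fr' are considered; an empty list is returned if no French
   # lexicon is installed.
   words = wn.words('chat', lang='fr')
   synsets = wn.synsets('chat', pos='n', lang='fr')
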
For - adding lexicons to the database, see :doc:`lexicons`. + adding lexicons to the database, see :doc:`guides/lexicons`. Installation From 0403193d5a39d700e0c1b1fe2bf2a5973925a9c6 Mon Sep 17 00:00:00 2001 From: Michael Wayne Goodman Date: Wed, 9 Jun 2021 19:11:05 +0800 Subject: [PATCH 34/39] Add lemmatization and normalization docs --- docs/api/wn.morphy.rst | 7 +- docs/guides/lemmatization.rst | 269 ++++++++++++++++++++++++++++++++++ docs/index.rst | 1 + 3 files changed, 275 insertions(+), 2 deletions(-) create mode 100644 docs/guides/lemmatization.rst diff --git a/docs/api/wn.morphy.rst b/docs/api/wn.morphy.rst index def3ab4..5675e28 100644 --- a/docs/api/wn.morphy.rst +++ b/docs/api/wn.morphy.rst @@ -7,8 +7,11 @@ wn.morphy .. seealso:: The Princeton WordNet `documentation - `_ for the - original implementation of Morphy. + `_ describes + the original implementation of Morphy. + + The :doc:`../guides/lemmatization` guide describes how Wn handles + lemmatization in general. Initialized and Uninitialized Morphy diff --git a/docs/guides/lemmatization.rst b/docs/guides/lemmatization.rst new file mode 100644 index 0000000..c111b7d --- /dev/null +++ b/docs/guides/lemmatization.rst @@ -0,0 +1,269 @@ + +Lemmatization and Normalization +=============================== + +Wn provides two methods for expanding queries: lemmatization_ and +normalization_\ . Wn also has a setting that allows `alternative forms +`_ stored in the database to be included in +queries. + +.. seealso:: + + The :mod:`wn.morphy` module is a basic English lemmatizer included + with Wn. + +.. _lemmatization: + +Lemmatization +------------- + +When querying a wordnet with wordforms from natural language text, it +is important to be able to find entries for inflected forms as the +database generally contains only lemmatic forms, or *lemmas* (or +*lemmata*, if you prefer irregular plurals). + +>>> import wn +>>> ewn = wn.Wordnet('ewn:2020') +>>> ewn.words('plurals') # no results +[] +>>> ewn.words('plural') +[Word('ewn-plural-n'), Word('ewn-plural-a')] + +Lemmas are sometimes called *citation forms* or *dictionary forms* as +they are often used as the head words in dictionary entries. In +Natural Language Processing (NLP), *lemmatization* is a technique +where a possibly inflected word form is transformed to yield a +lemma. In Wn, this concept is generalized somewhat to mean a +transformation that yields a form matching wordforms stored in the +database. For example, the English word *sparrows* is the plural +inflection of *sparrow*, while the word *leaves* is ambiguous between +the plural inflection of the nouns *leaf* and *leave* and the +3rd-person singular inflection of the verb *leave*. + +For tasks where high-accuracy is needed, wrapping the wordnet queries +with external tools that handle tokenization, lemmatization, and +part-of-speech tagging will likely yield the best results as this +method can make use of word context. That is, something like this: + +.. code-block:: python + + for lemma, pos in fancy_shmancy_analysis(corpus): + synsets = w.synsets(lemma, pos=pos) + +For modest needs, however, Wn provides a way to integrate basic +lemmatization directly into the queries. 
+ +Lemmatization in Wn works as follows: if a :class:`wn.Wordnet` object +is instantiated with a *lemmatizer* argument, then queries involving +wordforms (e.g., :meth:`wn.Wordnet.words`, :meth:`wn.Wordnet.senses`, +:meth:`wn.Wordnet.synsets`) will first lemmatize the wordform and then +check all resulting wordforms and parts of speech against the +database as successive queries. + +Lemmatization Functions +''''''''''''''''''''''' + +The *lemmatizer* argument of :class:`wn.Wordnet` is a callable that +takes two string arguments: (1) the original wordform, and (2) a +part-of-speech or :python:`None`. It returns a dictionary mapping +parts-of-speech to sets of lemmatized wordforms. The signature is as +follows: + +.. code-block:: python + + lemmatizer(s: str, pos: Optional[str]) -> Dict[Optional[str], Set[str]] + +The part-of-speech may be used by the function to determine which +morphological rules to apply. If the given part-of-speech is +:python:`None`, then it is not specified and any rule may apply. A +lemmatizer that only deinflects should not change any specified +part-of-speech, but this is not a requirement, and a function could be +provided that undoes derivational morphology (e.g., *democratic* → +*democracy*). + +Querying With Lemmatization +''''''''''''''''''''''''''' + +As the needs of lemmatization differs from one language to another, Wn +does not provide a lemmatizer by default, and therefore it is +unavailable to the convenience functions :func:`wn.words`, +:func:`wn.senses`, and :func:`wn.synsets`. A lemmatizer can be added +to a :class:`wn.Wordnet` object. For example, using :mod:`wn.morphy`: + +>>> import wn +>>> from wn.morphy import Morphy +>>> ewn = wn.Wordnet('ewn:2020', lemmatizer=Morphy()) +>>> ewn.words('sparrows') +[Word('ewn-sparrow-n')] +>>> ewn.words('leaves') +[Word('ewn-leaf-n'), Word('ewn-leave-n'), Word('ewn-leave-v')] + + +Querying Without Lemmatization +'''''''''''''''''''''''''''''' + +When lemmatization is not used, inflected terms may not return any +results: + +>>> ewn = wn.Wordnet('ewn:2020') +>>> ewn.words('sparrows') +[] +>>> ewn.words('leaves') +[] + +Depending on the lexicon, there may be situations where results are +returned for inflected lemmas, such as when the inflected form is +lexicalized as its own entry: + +>>> ewn.words('glasses') +[Word('ewn-glasses-n')] + +Or if the lexicon lists the inflected form as an alternative form. For +example, the English Wordnet lists irregular inflections as +alternative forms: + +>>> ewn.words('lemmata') +[Word('ewn-lemma-n')] + +See below for excluding alternative forms from such queries. + +.. _alternative-forms: + +Alternative Forms in the Database +--------------------------------- + +A lexicon may include alternative forms in addition to lemmas for each +word, and by default these are included in queries. What exactly is +included as an alternative form depends on the lexicon. The English +Wordnet, for example, adds irregular inflections (or "exceptional +forms"), while the Japanese Wordnet includes the same word in multiple +orthographies (original, hiragana, katakana, and two romanizations). 
+For the English Wordnet, this means that you might get basic +lemmatization for irregular forms only: + +>>> ewn = wn.Wordnet('ewn:2020') +>>> ewn.words('learnt', pos='v') +[Word('ewn-learn-v')] +>>> ewn.words('learned', pos='v') +[] + +If this is undesirable, the alternative forms can be excluded from +queries with the *search_all_forms* parameter: + +>>> ewn = wn.Wordnet('ewn:2020', search_all_forms=False) +>>> ewn.words('learnt', pos='v') +[] +>>> ewn.words('learned', pos='v') +[] + + +.. _normalization: + +Normalization +------------- + +While lemmatization deals with morphological variants of words, +normalization handles minor orthographic variants. Normalized forms, +however, may be invalid as wordforms in the target language, and as +such they are only used behind the scenes for query expansion and not +presented to users. For instance, a user might attempt to look up +*résumé* in the English wordnet, but the wordnet only contains the +form without diacritics: *resume*. With strict string matching, the +entry would not be found using the wordform in the query. By +normalizing the query word, the entry can be found. Similarly in the +Spanish wordnet, *año* (year) and *ano* (anus) are two different +words. A user who types *año* likely does not want to get results for +*ano*, but one who types *ano* may be a non-Spanish speaker who is +unaware of the missing diacritic or does not have an input method that +allows them to type the diacritic, so this query would return both +entries by matching against the normalized forms in the database. Wn +handles all of these use cases. + +When a lexicon is added to the database, potentially two wordforms are +inserted for every one in the lexicon: the original wordform and a +normalized form. When querying against the database, the original +query string is first compared with the original wordforms and, if +normalization is enabled, with the normalized forms in the database as +well. If this first attempt yields no results and if normalization is +enabled, the query string is normalized and tried again. + +Normalization Functions +''''''''''''''''''''''' + +The normalized form is obtained from a *normalizer* function, passed +as an argument to :class:`wn.Wordnet`, that takes a single string +argument and returns a string. That is, a function with the following +signature: + +.. code-block:: python + + normalizer(s: str) -> str + +While custom *normalizer* functions could be used, in practice the +choice is either the default normalizer or :python:`None`. The default +normalizer works by downcasing the string and performing NFKD_ +normalization to remove diacritics. If the normalized form is the same +as the original, only the original is inserted into the database. + +.. table:: Examples of normalization + :align: center + + ============= =============== + Original Form Normalized Form + ============= =============== + résumé resume + año ano + San José san jose + ハラペーニョ ハラヘーニョ + ============= =============== + +.. _NFKD: https://en.wikipedia.org/wiki/Unicode_equivalence#Normal_forms + +Querying With Normalization +''''''''''''''''''''''''''' + +By default, normalization is enabled when a :class:`wn.Wordnet` is +created. Enabling normalization does two things: it allows queries to +check the original wordform in the query against the normalized forms +in the database and, if no results are returned in the first step, it +allows the queried wordform to be normalized as a back-off technique. 
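+
+For intuition, the default normalizer described above can be
+approximated with a few lines of standard-library code. This is only an
+approximation for illustration, not Wn's actual implementation:
+
+.. code-block:: python
+
+    import unicodedata
+
+    def approximate_normalizer(s: str) -> str:
+        # Downcase, decompose with NFKD, then drop combining marks
+        # (i.e., diacritics).
+        decomposed = unicodedata.normalize('NFKD', s.lower())
+        return ''.join(c for c in decomposed if not unicodedata.combining(c))
+
+    approximate_normalizer('résumé')    # 'resume'
+    approximate_normalizer('San José')  # 'san jose'
+
+The queries below show the effect of normalization on lookup: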
+ +>>> ewn = wn.Wordnet('ewn:2020') +>>> ewn.words('résumé') +[Word('ewn-resume-v'), Word('ewn-resume-n')] +>>> spa = wn.Wordnet('spawn:1.3+omw') +>>> spa.words('año') +[Word('spawn-lex57514')] +>>> spa.words('ano') +[Word('spawn-lex34109'), Word('spawn-lex57514')] + +.. note:: + + Users may supply a custom *normalizer* function to the + :class:`wn.Wordnet` object, but currently this is discouraged as + the result is unlikely to match normalized forms in the database + and there is not yet a way to customize the normalization of forms + added to the database. + +Querying Without Normalization +'''''''''''''''''''''''''''''' + +Normalization can be disabled by passing :python:`None` as the +argument of the *normalizer* parameter of :class:`wn.Wordnet`. The +queried wordform will not be checked against normalized forms in the +database and neither will it be normalized as a back-off technique. + +>>> ewn = wn.Wordnet('ewn:2020', normalizer=None) +>>> ewn.words('résumé') +[] +>>> spa = wn.Wordnet('spawn:1.3+omw', normalizer=None) +>>> spa.words('año') +[Word('spawn-lex57514')] +>>> spa.words('ano') +[Word('spawn-lex34109')] + +.. note:: + + It is not possible to disable normalization for the convenience + functions :func:`wn.words`, :func:`wn.senses`, and + :func:`wn.synsets`. diff --git a/docs/index.rst b/docs/index.rst index b28f248..6156b4b 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -48,6 +48,7 @@ Contents guides/basic.rst guides/interlingual.rst guides/wordnet.rst + guides/lemmatization.rst guides/nltk-migration.rst .. toctree:: From 0640a230a0f3e13bb61fefd86dbefcca3d221b15 Mon Sep 17 00:00:00 2001 From: Michael Wayne Goodman Date: Wed, 9 Jun 2021 19:11:37 +0800 Subject: [PATCH 35/39] Partially enable smartquotes in docs --- docs/conf.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/conf.py b/docs/conf.py index d94b404..1635b79 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -58,7 +58,8 @@ :class: highlight """ -smartquotes = False +# smartquotes = False +smartquotes_action = 'De' # D = en- and em-dash; e = ellipsis # -- Options for HTML output ------------------------------------------------- From 55a2c74e700a66e20a9129756719172fe2faa81c Mon Sep 17 00:00:00 2001 From: Michael Wayne Goodman Date: Wed, 9 Jun 2021 19:16:28 +0800 Subject: [PATCH 36/39] Add missing CHANGELOG entry Fixes #115 --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index a42661a..70d0b23 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -25,6 +25,7 @@ * `wn.Wordnet` - Initialization now takes a `normalizer` parameter ([#105]) - Initialization now takes a `lemmatizer` parameter ([#8]) + - Initialization now takes a `search_all_forms` parameter ([#115]) - `Wordnet.words()`, `Wordnet.senses()` and `Wordnet.synsets()` now use any specified lemmatization or normalization functions to expand queries on word forms ([#105]) @@ -397,5 +398,6 @@ abandoned, but this is an entirely new codebase. 
[#105]: https://github.com/goodmami/wn/issues/105 [#106]: https://github.com/goodmami/wn/issues/106 [#108]: https://github.com/goodmami/wn/issues/108 +[#115]: https://github.com/goodmami/wn/issues/115 [#116]: https://github.com/goodmami/wn/issues/116 [#117]: https://github.com/goodmami/wn/issues/117 From 591b7597ad82549e8485aace9b78721d6a5c65ef Mon Sep 17 00:00:00 2001 From: Michael Wayne Goodman Date: Wed, 9 Jun 2021 19:36:20 +0800 Subject: [PATCH 37/39] Update docs for wn.web --- docs/api/wn.web.rst | 39 ++++++++++++++++----------------------- docs/setup.rst | 11 +++++++++-- 2 files changed, 25 insertions(+), 25 deletions(-) diff --git a/docs/api/wn.web.rst b/docs/api/wn.web.rst index c93eb95..18e199a 100644 --- a/docs/api/wn.web.rst +++ b/docs/api/wn.web.rst @@ -12,7 +12,7 @@ part-of-speech *v*, we can perform the following query:: /lexicons/ewn:2020/words?form=jet&pos=v -This query would return the following response (abbreviated): +This query would return the following response: .. code-block:: javascript @@ -86,8 +86,8 @@ installed and ran separately. Any ASGI-compliant server should work. For example, the `Uvicorn `_ server may be -used directly (preferably with the ``--reload`` option for hot -reloading) for local development: +used directly for local development, optionally with the ``--reload`` +option for hot reloading: .. code-block:: bash @@ -97,11 +97,8 @@ For production, see Uvicorn's `documentation about deployment `_. -Requests --------- - -API Endpoints -''''''''''''' +Requests: API Endpoints +----------------------- The module provides the following endpoints: @@ -127,11 +124,11 @@ The module provides the following endpoints: ====================================== ======================================================== -Query Parameters -'''''''''''''''' +Requests: Query Parameters +-------------------------- ``lang`` -~~~~~~~~ +'''''''' Specifies the language in `BCP 47`_ of the lexicon(s) from which results are returned. @@ -150,7 +147,7 @@ Valid for:: /synsets ``form`` -~~~~~~~~ +'''''''' Specifies the word form of the objects that are returned. @@ -168,7 +165,7 @@ Valid for:: /lexicon/:lex/synsets ``pos`` -~~~~~~~ +''''''' Specifies the part-of-speech of the objects that are returned. Valid values are given in :ref:`parts-of-speech`. @@ -187,7 +184,7 @@ Valid for:: /lexicon/:lex/synsets ``ili`` -~~~~~~~ +''''''' Specifies the interlingual index of a synset. @@ -201,7 +198,7 @@ Valid for:: /lexicon/:lex/synsets ``page[offset]`` and ``page[limit]`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +'''''''''''''''''''''''''''''''''''' Used for pagination: ``page[offset]`` specifies the starting index of a set of results, and ``page[limit]`` specifies how many results from @@ -248,10 +245,6 @@ includes pagination links: "meta": { ... } // metadata; e.g., total number of results } - -Resource Objects -'''''''''''''''' - Each JSON:API *resource object* (the primary data given by the ``data`` key) has the following structure: @@ -268,7 +261,7 @@ Each JSON:API *resource object* (the primary data given by the Lexicons -~~~~~~~~ +'''''''' .. code-block:: javascript @@ -291,7 +284,7 @@ Lexicons Words -~~~~~ +''''' .. code-block:: javascript @@ -317,7 +310,7 @@ Words Senses -~~~~~~ +'''''' .. code-block:: javascript @@ -352,7 +345,7 @@ Senses Synsets -~~~~~~~ +''''''' .. 
code-block:: javascript diff --git a/docs/setup.rst b/docs/setup.rst index 0655701..9fcff25 100644 --- a/docs/setup.rst +++ b/docs/setup.rst @@ -7,8 +7,8 @@ Installation and Configuration adding lexicons to the database, see :doc:`guides/lexicons`. -Installation ------------- +Installing from PyPI +-------------------- Install the latest release from `PyPI `_: @@ -16,6 +16,13 @@ Install the latest release from `PyPI `_: pip install wn +To get the dependencies for the :mod:`wn.web` module, use the ``web`` +installation extra: + +.. code-block:: bash + + pip install wn[web] + The Data Directory ------------------ From be74a7f8f4dbb38bff22347c897360961ef88b93 Mon Sep 17 00:00:00 2001 From: Michael Wayne Goodman Date: Wed, 9 Jun 2021 19:40:08 +0800 Subject: [PATCH 38/39] Bump version for 0.7.0 --- CHANGELOG.md | 5 +++++ wn/__init__.py | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 70d0b23..8537634 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,10 @@ ## [Unreleased] +## [v0.7.0] + +**Release date: 2021-06-09** + ### Added * Support for approximate word searches; on by default, configurable @@ -343,6 +347,7 @@ the https://github.com/nltk/wordnet/ code which had been effectively abandoned, but this is an entirely new codebase. +[v0.7.0]: ../../releases/tag/v0.7.0 [v0.6.2]: ../../releases/tag/v0.6.2 [v0.6.1]: ../../releases/tag/v0.6.1 [v0.6.0]: ../../releases/tag/v0.6.0 diff --git a/wn/__init__.py b/wn/__init__.py index 2fe67e7..6135dbc 100644 --- a/wn/__init__.py +++ b/wn/__init__.py @@ -49,4 +49,4 @@ Wordnet ) -__version__ = '0.6.2' +__version__ = '0.7.0' From 9ed1502011308331738c985864893140c2f4127b Mon Sep 17 00:00:00 2001 From: Michael Wayne Goodman Date: Wed, 9 Jun 2021 19:55:22 +0800 Subject: [PATCH 39/39] Add typing stubs for mypy checks --- noxfile.py | 3 +-- pyproject.toml | 3 +++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/noxfile.py b/noxfile.py index 7e09b74..8580878 100644 --- a/noxfile.py +++ b/noxfile.py @@ -3,8 +3,7 @@ @nox.session def lint(session): - session.install('flake8', 'mypy') - session.install('.') + session.install('.[test]') session.run('flake8', '--max-line-length', '88', 'wn') session.run('mypy', 'wn') diff --git a/pyproject.toml b/pyproject.toml index dbe944f..e913130 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -43,6 +43,9 @@ test = [ "mypy", "flake8", "nox", + # typing stubs + 'types-requests', + 'types-toml', ] doc = [ "sphinx>=3.5,<4",