From f8fe78d98523bc92e7a24d2f014f967189e62f9e Mon Sep 17 00:00:00 2001 From: Michael Wayne Goodman Date: Thu, 29 Apr 2021 12:52:08 +0800 Subject: [PATCH 01/20] Initial sources for information content --- docs/api/wn.ic.rst | 7 ++++ docs/index.rst | 1 + wn/ic.py | 82 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 90 insertions(+) create mode 100644 docs/api/wn.ic.rst create mode 100644 wn/ic.py diff --git a/docs/api/wn.ic.rst b/docs/api/wn.ic.rst new file mode 100644 index 0000000..25f72ff --- /dev/null +++ b/docs/api/wn.ic.rst @@ -0,0 +1,7 @@ + +wn.ic +===== + +.. automodule:: wn.ic + +.. autofunction:: compute diff --git a/docs/index.rst b/docs/index.rst index 6156b4b..877b0f4 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -58,6 +58,7 @@ Contents api/wn.rst api/wn.constants.rst + api/wn.ic.rst api/wn.lmf.rst api/wn.morphy.rst api/wn.project.rst diff --git a/wn/ic.py b/wn/ic.py new file mode 100644 index 0000000..a75b098 --- /dev/null +++ b/wn/ic.py @@ -0,0 +1,82 @@ + +r"""Information Content + +The Information Content (IC) of a concept (synset) is a measure of its +specificity computed from the wordnet's taxonomy structure and corpus +frequencies. It is defined by Resnik 1995 ([RES95]_), following +information theory, as the negative log-probability of a concept: + +.. math:: + + IC(c) = -\log{p(c)} + +A concept's probability is the empirical probability over a corpus: + +.. math:: + + p(c) = \frac{\text{freq}(c)}{N} + +Here, :math:`N` is the total count of words of the same category as +concept :math:`c` ([RES95]_ only considered nouns) where each word has +some representation in the wordnet, and :math:`\text{freq}` is defined +as the sum of corpus counts of words in :math:`\text{words}(c)`, which +is the set of words subsumed by concept :math:`c`: + +.. math:: + + \text{freq}(c) = \sum_{n \in \text{words}(c)}{\text{count}(n)} + + +.. [RES95] Resnik, Philip. "Using information content to evaluate + semantic similarity." In Proceedings of the 14th International + Joint Conference on Artificial Intelligence (IJCAI-95), Montreal, + Canada, pp. 448-453. 1995. 
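+
+As a worked example of the formulas above (the counts are
+hypothetical): if :math:`\text{freq}(c) = 20` and :math:`N = 1000`,
+then :math:`p(c) = 0.02` and
+:math:`IC(c) = -\log{0.02} \approx 3.91` (using the natural
+logarithm).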
+ +""" + +from typing import Iterable, Dict +from collections import Counter + +from wn._core import Synset, Wordnet +from wn.constants import NOUN, VERB, ADJ, ADV, ADJ_SAT + +# Just use a subset of all available parts of speech +IC_PARTS_OF_SPEECH = frozenset((NOUN, VERB, ADJ, ADV)) + + +Corpus = Iterable[str] +IC = Dict[str, Dict[Synset, float]] # {pos: {synset: ic}} + + +def compute( + corpus: Corpus, + wordnet: Wordnet, + distribute_weight: bool = True, + smoothing: float = 1.0 +) -> IC: + + counts = Counter(corpus) + + # intialize with the smoothing value + ic: IC = {pos: {synset: smoothing + for synset in wordnet.synsets(pos=pos)} + for pos in IC_PARTS_OF_SPEECH} + # pretend ADJ_SAT is just ADJ + for synset in wordnet.synsets(pos=ADJ_SAT): + ic[ADJ][synset] = smoothing + + for word, count in counts.items(): + synsets = wordnet.synsets(word) + num = len(synsets) + if num == 0: + continue + weight = count / num if distribute_weight else count + for synset in synsets: + pos = synset.pos + if pos == ADJ_SAT: + pos = ADJ + if pos not in IC_PARTS_OF_SPEECH: + continue + ic[pos][synset] += weight + + return ic From 240eb91dafade391a5abc4ca3f840a98e107300f Mon Sep 17 00:00:00 2001 From: Michael Wayne Goodman Date: Mon, 7 Jun 2021 15:45:03 +0800 Subject: [PATCH 02/20] Fix #119: Add wn.util.synset_id_formatter --- docs/api/wn.util.rst | 2 ++ tests/util_test.py | 10 ++++++++++ wn/util.py | 33 ++++++++++++++++++++++++++++++++- 3 files changed, 44 insertions(+), 1 deletion(-) create mode 100644 tests/util_test.py diff --git a/docs/api/wn.util.rst b/docs/api/wn.util.rst index 7246103..8bfde5d 100644 --- a/docs/api/wn.util.rst +++ b/docs/api/wn.util.rst @@ -3,6 +3,8 @@ wn.util .. automodule:: wn.util +.. autofunction:: synset_id_formatter + .. autoclass:: ProgressHandler :members: diff --git a/tests/util_test.py b/tests/util_test.py new file mode 100644 index 0000000..515d01f --- /dev/null +++ b/tests/util_test.py @@ -0,0 +1,10 @@ + +from wn import util + + +def test_synset_id_formatter(): + f = util.synset_id_formatter + assert f()(prefix='xyz', offset=123, pos='n') == 'xyz-00000123-n' + assert f(prefix='xyz')(offset=123, pos='n') == 'xyz-00000123-n' + assert f(prefix='xyz', pos='n')(offset=123) == 'xyz-00000123-n' + assert f('abc-{offset}-{pos}')(offset=1, pos='v') == 'abc-1-v' diff --git a/wn/util.py b/wn/util.py index 2d50acc..afe45ac 100644 --- a/wn/util.py +++ b/wn/util.py @@ -1,8 +1,39 @@ """Wn utility classes.""" -from typing import TextIO +from typing import TextIO, Callable import sys +def synset_id_formatter( + fmt: str = '{prefix}-{offset:08}-{pos}', + **kwargs +) -> Callable: + """Return a function for formatting synset ids. + + The *fmt* argument can be customized. It will be formatted using + any other keyword arguments given to this function and any given + to the resulting function. By default, the format string expects a + ``prefix`` string argument for the namespace (such as a lexicon + id), an ``offset`` integer argument (such as a WNDB offset), and a + ``pos`` string argument. + + Arguments: + fmt: A Python format string + **kwargs: Keyword arguments for the format string. + + Example: + + >>> pwn_synset_id = synset_id_formatter(prefix='pwn') + >>> pwn_synset_id(offset=1174, pos='n') + 'pwn-00001174-n' + + """ + + def format_synset_id(**_kwargs) -> str: + return fmt.format(**kwargs, **_kwargs) + + return format_synset_id + + class ProgressHandler: """An interface for updating progress in long-running processes. 
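A quick sketch of how the new helper composes keyword arguments; the
'oewn' and 'xyz' prefixes below are purely illustrative and not tied
to any particular lexicon:

    from wn.util import synset_id_formatter

    # bind the prefix now; supply offset and pos per call
    make_id = synset_id_formatter(prefix='oewn')
    assert make_id(offset=1740, pos='n') == 'oewn-00001740-n'

    # a custom format string may also be given
    short_id = synset_id_formatter('{prefix}-{offset}-{pos}', prefix='xyz')
    assert short_id(offset=9, pos='v') == 'xyz-9-v'
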
From a4aa995ee846d8a00abc1cb00c6c65e66e7507b8 Mon Sep 17 00:00:00 2001 From: Michael Wayne Goodman Date: Mon, 7 Jun 2021 15:46:15 +0800 Subject: [PATCH 03/20] Resolve #40: Add wn.ic information content There is some lacking to make this a complete solution, such as documentation explaining how IC is really only valid for a particular version of a wordnet, ways to download the IC files, etc., but this implements functions to load and compute IC weights and to calculate synset probabilities and IC values, and these functions are tested and documented. --- docs/api/wn.ic.rst | 62 +++++++++ tests/data/mini-lmf-1.0.xml | 4 +- tests/ic_test.py | 120 +++++++++++++++++ tests/secondary_query_test.py | 2 +- wn/ic.py | 236 ++++++++++++++++++++++++++-------- 5 files changed, 369 insertions(+), 55 deletions(-) create mode 100644 tests/ic_test.py diff --git a/docs/api/wn.ic.rst b/docs/api/wn.ic.rst index 25f72ff..6dc8aed 100644 --- a/docs/api/wn.ic.rst +++ b/docs/api/wn.ic.rst @@ -4,4 +4,66 @@ wn.ic .. automodule:: wn.ic +Description +----------- + +The Information Content (IC) of a concept (synset) is a measure of its +specificity computed from the wordnet's taxonomy structure and corpus +frequencies. It is defined by Resnik 1995 ([RES95]_), following +information theory, as the negative log-probability of a concept: + +.. math:: + + IC(c) = -\log{p(c)} + +A concept's probability is the empirical probability over a corpus: + +.. math:: + + p(c) = \frac{\text{freq}(c)}{N} + +Here, :math:`N` is the total count of words of the same category as +concept :math:`c` ([RES95]_ only considered nouns) where each word has +some representation in the wordnet, and :math:`\text{freq}` is defined +as the sum of corpus counts of words in :math:`\text{words}(c)`, which +is the set of words subsumed by concept :math:`c`: + +.. math:: + + \text{freq}(c) = \sum_{n \in \text{words}(c)}{\text{count}(n)} + +That is, the frequency of a concept like **stone fruit** is not the +number of occurrences of *stone fruit* or *stone fruits*, but also +includes the counts of *almond*, *almonds*, *peach*, etc. In +algorithmic terms, when encountering a word, the counts of the synsets +of the word and all of the synsets' taxonomic ancestors are +incremented. + +It is common for :math:`\text{freq}` to not contain actual frequencies +but instead weights. These weights are calculated as the word +frequency divided by the number of synsets for that word. + + +.. [RES95] Resnik, Philip. "Using information content to evaluate + semantic similarity." In Proceedings of the 14th International + Joint Conference on Artificial Intelligence (IJCAI-95), Montreal, + Canada, pp. 448-453. 1995. + + +Calculating Information Content +------------------------------- + +.. autofunction:: information_content +.. autofunction:: synset_probability + + +Computing Corpus Weights +------------------------ + .. autofunction:: compute + + +Reading Pre-computed Information Content Files +---------------------------------------------- + +.. 
autofunction:: load diff --git a/tests/data/mini-lmf-1.0.xml b/tests/data/mini-lmf-1.0.xml index a29da4e..46a467c 100644 --- a/tests/data/mini-lmf-1.0.xml +++ b/tests/data/mini-lmf-1.0.xml @@ -50,7 +50,7 @@ Spanish: - + @@ -119,7 +119,7 @@ Spanish: - + a sample that is random diff --git a/tests/ic_test.py b/tests/ic_test.py new file mode 100644 index 0000000..34c95db --- /dev/null +++ b/tests/ic_test.py @@ -0,0 +1,120 @@ + +import pytest + +import wn +from wn.constants import (NOUN, VERB, ADJ, ADV) +from wn.util import synset_id_formatter +import wn.ic + + +def synsets(w: wn.Wordnet): + return { + 'information': w.synset('test-en-0001-n'), + 'illustration_example': w.synset('test-en-0002-n'), + 'sample': w.synset('test-en-0004-n'), + 'random_sample': w.synset('test-en-0005-n'), + 'random_sample2': w.synset('test-en-0008-n'), # no hypernyms + 'datum': w.synset('test-en-0006-n'), + 'illustrate_exemplify': w.synset('test-en-0003-v'), + 'resignate': w.synset('test-en-0007-v'), + } + + +words = [ + 'For', 'example', ':', 'random sample', '.', + 'This', 'will', 'illustrate', 'and', 'exemplify', '.', + 'A', 'sample', 'of', 'data', '.', +] + + +@pytest.mark.usefixtures('mini_db') +def test_compute_nodistribute_nosmoothing(): + w = wn.Wordnet('test-en:1') + ss = synsets(w) + assert wn.ic.compute(words, w, distribute_weight=False, smoothing=0) == { + NOUN: { + ss['information']: 4.0, + ss['illustration_example']: 3.0, + ss['sample']: 2.0, + ss['random_sample']: 1.0, + ss['random_sample2']: 1.0, + ss['datum']: 1.0, + None: 5.0, + }, + VERB: { + ss['illustrate_exemplify']: 2.0, + ss['resignate']: 0.0, + None: 2.0, + }, + ADJ: {None: 0.0}, + ADV: {None: 0.0}, + } + + +@pytest.mark.usefixtures('mini_db') +def test_compute_nodistribute_smoothing(): + w = wn.Wordnet('test-en:1') + ss = synsets(w) + assert wn.ic.compute(words, w, distribute_weight=False, smoothing=1.0) == { + NOUN: { + ss['information']: 5.0, + ss['illustration_example']: 4.0, + ss['sample']: 3.0, + ss['random_sample']: 2.0, + ss['random_sample2']: 2.0, + ss['datum']: 2.0, + None: 6.0, + }, + VERB: { + ss['illustrate_exemplify']: 3.0, + ss['resignate']: 1.0, + None: 3.0, + }, + ADJ: {None: 1.0}, + ADV: {None: 1.0}, + } + + +@pytest.mark.usefixtures('mini_db') +def test_compute_distribute_smoothing(): + w = wn.Wordnet('test-en:1') + ss = synsets(w) + assert wn.ic.compute(words, w, distribute_weight=True, smoothing=1.0) == { + NOUN: { + ss['information']: 4.5, + ss['illustration_example']: 3.5, + ss['sample']: 2.5, + ss['random_sample']: 1.5, + ss['random_sample2']: 1.5, + ss['datum']: 2.0, + None: 5.0, + }, + VERB: { + ss['illustrate_exemplify']: 3.0, + ss['resignate']: 1.0, + None: 3.0, + }, + ADJ: {None: 1.0}, + ADV: {None: 1.0}, + } + + +@pytest.mark.usefixtures('mini_db') +def test_load(tmp_path): + w = wn.Wordnet('test-en:1') + icpath = tmp_path / 'foo.dat' + icpath.write_text( + 'wnver:1234567890AbCdEf\n' + '1n 4.0 ROOT\n' + '2n 3.0\n' + '4n 2.0\n' + '5n 1.0\n' + '8n 1.0 ROOT\n' + '6n 1.0\n' + '3v 2.0 ROOT\n' + '7v 0.0 ROOT\n' + ) + + get_synset_id = synset_id_formatter('test-en-{offset:04}-{pos}') + assert (wn.ic.load(icpath, w, get_synset_id=get_synset_id) + == wn.ic.compute(words, w, distribute_weight=False, smoothing=0.0)) diff --git a/tests/secondary_query_test.py b/tests/secondary_query_test.py index 9ac9e97..20beb2f 100644 --- a/tests/secondary_query_test.py +++ b/tests/secondary_query_test.py @@ -88,7 +88,7 @@ def test_synset_ili(): assert isinstance(wn.synset('test-en-0001-n').ili, wn.ILI) assert 
wn.synset('test-en-0001-n').ili.id == 'i67447' assert wn.synset('test-en-0001-n').ili.status == 'presupposed' - assert wn.synset('test-en-0005-n-fake').ili is None + assert wn.synset('test-en-0008-n').ili is None assert wn.synset('test-en-0007-v').ili.id is None assert wn.synset('test-en-0007-v').ili.status == 'proposed' diff --git a/wn/ic.py b/wn/ic.py index a75b098..0fa67b8 100644 --- a/wn/ic.py +++ b/wn/ic.py @@ -1,82 +1,214 @@ -r"""Information Content +r"""Information Content""" -The Information Content (IC) of a concept (synset) is a measure of its -specificity computed from the wordnet's taxonomy structure and corpus -frequencies. It is defined by Resnik 1995 ([RES95]_), following -information theory, as the negative log-probability of a concept: - -.. math:: - - IC(c) = -\log{p(c)} - -A concept's probability is the empirical probability over a corpus: - -.. math:: - - p(c) = \frac{\text{freq}(c)}{N} - -Here, :math:`N` is the total count of words of the same category as -concept :math:`c` ([RES95]_ only considered nouns) where each word has -some representation in the wordnet, and :math:`\text{freq}` is defined -as the sum of corpus counts of words in :math:`\text{words}(c)`, which -is the set of words subsumed by concept :math:`c`: - -.. math:: - - \text{freq}(c) = \sum_{n \in \text{words}(c)}{\text{count}(n)} - - -.. [RES95] Resnik, Philip. "Using information content to evaluate - semantic similarity." In Proceedings of the 14th International - Joint Conference on Artificial Intelligence (IJCAI-95), Montreal, - Canada, pp. 448-453. 1995. - -""" - -from typing import Iterable, Dict +from typing import ( + Callable, Optional, Iterator, Iterable, Dict, List, Tuple, Set, TextIO +) +from pathlib import Path from collections import Counter +from math import log +from wn._types import AnyPath from wn._core import Synset, Wordnet from wn.constants import NOUN, VERB, ADJ, ADV, ADJ_SAT +from wn.util import synset_id_formatter + # Just use a subset of all available parts of speech IC_PARTS_OF_SPEECH = frozenset((NOUN, VERB, ADJ, ADV)) +Freq = Dict[str, Dict[Optional[Synset], float]] -Corpus = Iterable[str] -IC = Dict[str, Dict[Synset, float]] # {pos: {synset: ic}} +def information_content(synset: Synset, freq: Freq) -> float: + """Calculate the Information Content value for a synset.""" + return -log(synset_probability(synset, freq)) -def compute( - corpus: Corpus, - wordnet: Wordnet, - distribute_weight: bool = True, - smoothing: float = 1.0 -) -> IC: +def synset_probability(synset: Synset, freq: Freq) -> float: + """Calculate the synset probability.""" + pos = synset.pos + return freq[pos][synset] / freq[pos][None] - counts = Counter(corpus) - # intialize with the smoothing value - ic: IC = {pos: {synset: smoothing - for synset in wordnet.synsets(pos=pos)} - for pos in IC_PARTS_OF_SPEECH} +def _initialize( + wordnet: Wordnet, + smoothing: float = 0.0, +) -> Freq: + """Populate an Information Content weight mapping to a smoothing value. + + All synsets in *wordnet* are inserted into the dictionary and + mapped to *smoothing*. 
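+
+    Each part-of-speech mapping also gets a ``None`` key, likewise
+    initialized to *smoothing*, which accumulates the total weight
+    for that part of speech and serves as the denominator in
+    :func:`synset_probability`.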
+ + """ + freq: Freq = { + pos: {synset: smoothing for synset in wordnet.synsets(pos=pos)} + for pos in IC_PARTS_OF_SPEECH + } # pretend ADJ_SAT is just ADJ for synset in wordnet.synsets(pos=ADJ_SAT): - ic[ADJ][synset] = smoothing + freq[ADJ][synset] = smoothing + # also initialize totals (when synset is None) for each part-of-speech + for pos in IC_PARTS_OF_SPEECH: + freq[pos][None] = smoothing + return freq + +def compute( + corpus: Iterable[str], + wordnet: Wordnet, + distribute_weight: bool = True, + smoothing: float = 0.0 +) -> Freq: + """Compute Information Content weights from a corpus. + + Arguments: + corpus: An iterable of string tokens. This is a flat list of + words and the order does not matter. Tokens may be single + words or multiple words separated by a space. + + wordnet: An instantiated :class:`wn.Wordnet` object, used to + look up synsets from words. + + distribute_weight: If :python:`True`, the counts for a word + are divided evenly among all synsets for the word. + + smoothing: The initial value given to each synset. + + Example: + >>> import wn, wn.ic, wn.morphy + >>> ewn = wn.Wordnet('ewn:2020', lemmatizer=wn.morphy.morphy) + >>> freq = wn.ic.compute(["Dogs", "run", ".", "Cats", "sleep", "."], ewn) + >>> dog = ewn.synsets('dog', pos='n')[0] + >>> cat = ewn.synsets('cat', pos='n')[0] + >>> frog = ewn.synsets('frog', pos='n')[0] + >>> freq['n'][dog] + 1.125 + >>> freq['n'][cat] + 1.1 + >>> freq['n'][frog] # no occurrence; smoothing value only + 1.0 + >>> ancestor = dog.lowest_common_hypernyms(cat)[0] # 'carnivore' + >>> freq['n'][ancestor] + 1.3250000000000002 + """ + freq = _initialize(wordnet, smoothing=smoothing) + counts = Counter(corpus) + + hypernym_cache: Dict[Synset, List[Synset]] = {} for word, count in counts.items(): synsets = wordnet.synsets(word) num = len(synsets) if num == 0: continue - weight = count / num if distribute_weight else count + + weight = float(count / num if distribute_weight else count) + for synset in synsets: pos = synset.pos if pos == ADJ_SAT: pos = ADJ if pos not in IC_PARTS_OF_SPEECH: continue - ic[pos][synset] += weight - return ic + freq[pos][None] += weight + + # The following while-loop is equivalent to: + # + # freq[pos][synset] += weight + # for path in synset.hypernym_paths(): + # for ss in path: + # freq[pos][ss] += weight + # + # ...but it caches hypernym lookups for speed + + agenda: List[Tuple[Synset, Set[Synset]]] = [(synset, set())] + while agenda: + ss, seen = agenda.pop() + + # avoid cycles + if ss in seen: + continue + + freq[pos][ss] += weight + + if ss not in hypernym_cache: + hypernym_cache[ss] = ss.hypernyms() + agenda.extend((hyp, seen | {ss}) for hyp in hypernym_cache[ss]) + + return freq + + +def load( + source: AnyPath, + wordnet: Wordnet, + get_synset_id: Optional[Callable] = None, +) -> Freq: + """Load an Information Content mapping from a file. + + The *source* argument is a path to an Information Content (IC) + file as used by the WordNet::Similarity Perl module or the + NLTK. The *wordnet* argument is a :class:`wn.Wordnet` instance + **with synset identifiers matching the offsets in the IC file**. A + :class:`wn.Error` is raised if *wordnet* does not have exactly one + lexicon. + + The *get_synset_id* argument should be a callable that returns a + valid synset ID when called as follows: + + .. code-block:: python + + get_synset_id(offset=offset, pos=pos) + + The integer ``offset`` and string ``pos`` arguments come from the + offsets and parts-of-speech parsed from the IC file. 
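+
+    For instance, a callable for a hypothetical lexicon whose synset
+    identifiers are period-delimited might look like:
+
+    .. code-block:: python
+
+        # 'xyz' is a made-up lexicon id, purely for illustration
+        def get_synset_id(*, offset: int, pos: str) -> str:
+            return f'xyz.{offset:08}.{pos}'
+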
If + *get_synset_id* is :python:`None`, a default function is created + with :func:`wn.util.synset_id_formatter` as follows: + + .. code-block:: python + + get_synset_id = synset_id_formatter(prefix=wordnet.lexicons()[0].id) + + Example: + + >>> import wn, wn.ic + >>> pwn = wn.Wordnet('pwn:3.0') + >>> path = '~/nltk_data/corpora/wordnet_ic/ic-brown.dat' + >>> freq = wn.ic.load(path, pwn) + + """ + assert len(wordnet.lexicons()) == 1 + lexid = wordnet.lexicons()[0].id + if get_synset_id is None: + get_synset_id = synset_id_formatter(prefix=lexid) + + freq = _initialize(wordnet, smoothing=0.0) + + source = Path(source).expanduser() + with source.open() as icfile: + for offset, pos, weight, is_root in _parse_ic_file(icfile): + ssid = get_synset_id(offset=offset, pos=pos) + synset = wordnet.synset(ssid) + freq[pos][synset] = weight + if is_root: + freq[pos][None] += weight + return freq + + +def _parse_ic_file(icfile: TextIO) -> Iterator[Tuple[int, str, float, bool]]: + """Parse the Information Content file. + + A sample of the format is:: + + wnver::eOS9lXC6GvMWznF1wkZofDdtbBU + 1740n 1915712 ROOT + 1930n 859272 + 2137n 1055337 + + """ + next(icfile) # skip header + for line in icfile: + ssinfo, value, *isroot = line.split() + yield (int(ssinfo[:-1]), + ssinfo[-1], + float(value), + bool(isroot)) From 06873e79b4f4c765775d618ec8cd2f6f651b78c2 Mon Sep 17 00:00:00 2001 From: Michael Wayne Goodman Date: Thu, 17 Jun 2021 22:32:06 +0800 Subject: [PATCH 04/20] Info Content weight mapping now hashes synset IDs Before it hashed the synset objects, which was noticeably slower. This also means that loading IC from a file no longer checks if the synset exists in the wordnet before storing the weight. --- tests/data/mini-lmf-1.0.xml | 1 + tests/ic_test.py | 72 ++++++++++++++++++------------------- wn/ic.py | 30 ++++++++-------- 3 files changed, 50 insertions(+), 53 deletions(-) diff --git a/tests/data/mini-lmf-1.0.xml b/tests/data/mini-lmf-1.0.xml index 46a467c..48663ac 100644 --- a/tests/data/mini-lmf-1.0.xml +++ b/tests/data/mini-lmf-1.0.xml @@ -6,6 +6,7 @@ with the following words and hypernym/derivation relations: English: - information ⊃ (example, illustration) ⊃ sample ⊃ random sample +- information ⊃ datum - example ⊳ exemplify - illustration ⊳ illustrate - resignate diff --git a/tests/ic_test.py b/tests/ic_test.py index 34c95db..0f299fe 100644 --- a/tests/ic_test.py +++ b/tests/ic_test.py @@ -7,17 +7,16 @@ import wn.ic -def synsets(w: wn.Wordnet): - return { - 'information': w.synset('test-en-0001-n'), - 'illustration_example': w.synset('test-en-0002-n'), - 'sample': w.synset('test-en-0004-n'), - 'random_sample': w.synset('test-en-0005-n'), - 'random_sample2': w.synset('test-en-0008-n'), # no hypernyms - 'datum': w.synset('test-en-0006-n'), - 'illustrate_exemplify': w.synset('test-en-0003-v'), - 'resignate': w.synset('test-en-0007-v'), - } +synset_id = { + 'information': 'test-en-0001-n', + 'illustration_example': 'test-en-0002-n', + 'sample': 'test-en-0004-n', + 'random_sample': 'test-en-0005-n', + 'random_sample2': 'test-en-0008-n', # no hypernyms + 'datum': 'test-en-0006-n', + 'illustrate_exemplify': 'test-en-0003-v', + 'resignate': 'test-en-0007-v', +} words = [ @@ -30,20 +29,19 @@ def synsets(w: wn.Wordnet): @pytest.mark.usefixtures('mini_db') def test_compute_nodistribute_nosmoothing(): w = wn.Wordnet('test-en:1') - ss = synsets(w) assert wn.ic.compute(words, w, distribute_weight=False, smoothing=0) == { NOUN: { - ss['information']: 4.0, - ss['illustration_example']: 3.0, - ss['sample']: 
2.0, - ss['random_sample']: 1.0, - ss['random_sample2']: 1.0, - ss['datum']: 1.0, + synset_id['information']: 4.0, + synset_id['illustration_example']: 3.0, + synset_id['sample']: 2.0, + synset_id['random_sample']: 1.0, + synset_id['random_sample2']: 1.0, + synset_id['datum']: 1.0, None: 5.0, }, VERB: { - ss['illustrate_exemplify']: 2.0, - ss['resignate']: 0.0, + synset_id['illustrate_exemplify']: 2.0, + synset_id['resignate']: 0.0, None: 2.0, }, ADJ: {None: 0.0}, @@ -54,20 +52,19 @@ def test_compute_nodistribute_nosmoothing(): @pytest.mark.usefixtures('mini_db') def test_compute_nodistribute_smoothing(): w = wn.Wordnet('test-en:1') - ss = synsets(w) assert wn.ic.compute(words, w, distribute_weight=False, smoothing=1.0) == { NOUN: { - ss['information']: 5.0, - ss['illustration_example']: 4.0, - ss['sample']: 3.0, - ss['random_sample']: 2.0, - ss['random_sample2']: 2.0, - ss['datum']: 2.0, + synset_id['information']: 5.0, + synset_id['illustration_example']: 4.0, + synset_id['sample']: 3.0, + synset_id['random_sample']: 2.0, + synset_id['random_sample2']: 2.0, + synset_id['datum']: 2.0, None: 6.0, }, VERB: { - ss['illustrate_exemplify']: 3.0, - ss['resignate']: 1.0, + synset_id['illustrate_exemplify']: 3.0, + synset_id['resignate']: 1.0, None: 3.0, }, ADJ: {None: 1.0}, @@ -78,20 +75,19 @@ def test_compute_nodistribute_smoothing(): @pytest.mark.usefixtures('mini_db') def test_compute_distribute_smoothing(): w = wn.Wordnet('test-en:1') - ss = synsets(w) assert wn.ic.compute(words, w, distribute_weight=True, smoothing=1.0) == { NOUN: { - ss['information']: 4.5, - ss['illustration_example']: 3.5, - ss['sample']: 2.5, - ss['random_sample']: 1.5, - ss['random_sample2']: 1.5, - ss['datum']: 2.0, + synset_id['information']: 4.5, + synset_id['illustration_example']: 3.5, + synset_id['sample']: 2.5, + synset_id['random_sample']: 1.5, + synset_id['random_sample2']: 1.5, + synset_id['datum']: 2.0, None: 5.0, }, VERB: { - ss['illustrate_exemplify']: 3.0, - ss['resignate']: 1.0, + synset_id['illustrate_exemplify']: 3.0, + synset_id['resignate']: 1.0, None: 3.0, }, ADJ: {None: 1.0}, diff --git a/wn/ic.py b/wn/ic.py index 0fa67b8..7191f97 100644 --- a/wn/ic.py +++ b/wn/ic.py @@ -16,7 +16,7 @@ # Just use a subset of all available parts of speech IC_PARTS_OF_SPEECH = frozenset((NOUN, VERB, ADJ, ADV)) -Freq = Dict[str, Dict[Optional[Synset], float]] +Freq = Dict[str, Dict[Optional[str], float]] def information_content(synset: Synset, freq: Freq) -> float: @@ -26,8 +26,8 @@ def information_content(synset: Synset, freq: Freq) -> float: def synset_probability(synset: Synset, freq: Freq) -> float: """Calculate the synset probability.""" - pos = synset.pos - return freq[pos][synset] / freq[pos][None] + pos_freq = freq[synset.pos] + return pos_freq[synset.id] / pos_freq[None] def _initialize( @@ -41,12 +41,12 @@ def _initialize( """ freq: Freq = { - pos: {synset: smoothing for synset in wordnet.synsets(pos=pos)} + pos: {synset.id: smoothing for synset in wordnet.synsets(pos=pos)} for pos in IC_PARTS_OF_SPEECH } # pretend ADJ_SAT is just ADJ for synset in wordnet.synsets(pos=ADJ_SAT): - freq[ADJ][synset] = smoothing + freq[ADJ][synset.id] = smoothing # also initialize totals (when synset is None) for each part-of-speech for pos in IC_PARTS_OF_SPEECH: freq[pos][None] = smoothing @@ -81,14 +81,14 @@ def compute( >>> dog = ewn.synsets('dog', pos='n')[0] >>> cat = ewn.synsets('cat', pos='n')[0] >>> frog = ewn.synsets('frog', pos='n')[0] - >>> freq['n'][dog] + >>> freq['n'][dog.id] 1.125 - >>> freq['n'][cat] + >>> 
freq['n'][cat.id] 1.1 - >>> freq['n'][frog] # no occurrence; smoothing value only + >>> freq['n'][frog.id] # no occurrence; smoothing value only 1.0 - >>> ancestor = dog.lowest_common_hypernyms(cat)[0] # 'carnivore' - >>> freq['n'][ancestor] + >>> carnivore = dog.lowest_common_hypernyms(cat)[0] + >>> freq['n'][carnivore.id] 1.3250000000000002 """ freq = _initialize(wordnet, smoothing=smoothing) @@ -114,10 +114,10 @@ def compute( # The following while-loop is equivalent to: # - # freq[pos][synset] += weight + # freq[pos][synset.id] += weight # for path in synset.hypernym_paths(): # for ss in path: - # freq[pos][ss] += weight + # freq[pos][ss.id] += weight # # ...but it caches hypernym lookups for speed @@ -129,7 +129,7 @@ def compute( if ss in seen: continue - freq[pos][ss] += weight + freq[pos][ss.id] += weight if ss not in hypernym_cache: hypernym_cache[ss] = ss.hypernyms() @@ -187,8 +187,8 @@ def load( with source.open() as icfile: for offset, pos, weight, is_root in _parse_ic_file(icfile): ssid = get_synset_id(offset=offset, pos=pos) - synset = wordnet.synset(ssid) - freq[pos][synset] = weight + # synset = wordnet.synset(ssid) + freq[pos][ssid] = weight if is_root: freq[pos][None] += weight return freq From 77f1a25af5e46e767f247237ba2bba39a7109741 Mon Sep 17 00:00:00 2001 From: Michael Wayne Goodman Date: Thu, 17 Jun 2021 22:34:57 +0800 Subject: [PATCH 05/20] Check if IC file exists before initializing dict Just a usability improvement so it fails fast when the path is wrong. --- wn/ic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wn/ic.py b/wn/ic.py index 7191f97..f35d472 100644 --- a/wn/ic.py +++ b/wn/ic.py @@ -176,6 +176,7 @@ def load( >>> freq = wn.ic.load(path, pwn) """ + source = Path(source).expanduser().resolve(strict=True) assert len(wordnet.lexicons()) == 1 lexid = wordnet.lexicons()[0].id if get_synset_id is None: @@ -183,7 +184,6 @@ def load( freq = _initialize(wordnet, smoothing=0.0) - source = Path(source).expanduser() with source.open() as icfile: for offset, pos, weight, is_root in _parse_ic_file(icfile): ssid = get_synset_id(offset=offset, pos=pos) From 7aa8e0acf6fde4b342cb5f96a5afe334d44adfa6 Mon Sep 17 00:00:00 2001 From: Michael Wayne Goodman Date: Fri, 18 Jun 2021 00:03:28 +0800 Subject: [PATCH 06/20] Add some docs for wn.ic --- docs/api/wn.ic.rst | 10 ++++++++++ wn/ic.py | 18 ++++++++++++++++-- 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/docs/api/wn.ic.rst b/docs/api/wn.ic.rst index 6dc8aed..40da6f8 100644 --- a/docs/api/wn.ic.rst +++ b/docs/api/wn.ic.rst @@ -43,6 +43,16 @@ It is common for :math:`\text{freq}` to not contain actual frequencies but instead weights. These weights are calculated as the word frequency divided by the number of synsets for that word. +.. note:: + + The term *information content* can be ambiguous. It sometimes + refers to the result of the :func:`information_content` function, + but is also used to refer to the corpus frequencies/weights in the + data structure returned by :func:`load` or :func:`compute`, as + these weights are the basis of the value computed by + :func:`information_content`. The Wn documentation tries to + consistently refer to former as the *information content value* and + the latter as *information content weights*. .. [RES95] Resnik, Philip. "Using information content to evaluate semantic similarity." 
In Proceedings of the 14th International diff --git a/wn/ic.py b/wn/ic.py index f35d472..a85693f 100644 --- a/wn/ic.py +++ b/wn/ic.py @@ -20,12 +20,26 @@ def information_content(synset: Synset, freq: Freq) -> float: - """Calculate the Information Content value for a synset.""" + """Calculate the Information Content value for a synset. + + The information content of a synset is the negative log of the + synset probability (see :func:`synset_probability`). + + """ return -log(synset_probability(synset, freq)) def synset_probability(synset: Synset, freq: Freq) -> float: - """Calculate the synset probability.""" + """Calculate the synset probability. + + The synset probability is defined as freq(ss)/N where freq(ss) is + the IC weight for the synset and N is the total IC weight for all + synsets with the same part of speech. + + Note: this function is not generally used directly, but indirectly + through :func:`information_content`. + + """ pos_freq = freq[synset.pos] return pos_freq[synset.id] / pos_freq[None] From 825171cce6bc661211a8ded8a1ebc77d14ae4557 Mon Sep 17 00:00:00 2001 From: Michael Wayne Goodman Date: Mon, 21 Jun 2021 15:52:41 +0800 Subject: [PATCH 07/20] Update IC documentation; default smoothing = 1.0 --- docs/api/wn.ic.rst | 128 +++++++++++++++++++++++++++++++++++++-------- wn/ic.py | 39 ++++++-------- 2 files changed, 123 insertions(+), 44 deletions(-) diff --git a/docs/api/wn.ic.rst b/docs/api/wn.ic.rst index 40da6f8..ffb6e0e 100644 --- a/docs/api/wn.ic.rst +++ b/docs/api/wn.ic.rst @@ -4,8 +4,30 @@ wn.ic .. automodule:: wn.ic -Description ------------ +The mathematical formulae for information content are defined in +`Formal Description`_, and the corresponding Python API function are +described in `Calculating Information Content`_. These functions +require information content weights obtained either by `computing them +from a corpus `_, or by `loading +pre-computed weights from a file `_. + +.. note:: + + The term *information content* can be ambiguous. It often, and most + accurately, refers to the result of the :func:`information_content` + function (:math:`\text{IC}(c)` in the mathematical notation), but + is also sometimes used to refer to the corpus frequencies/weights + (:math:`\text{freq}(c)` in the mathematical notation) returned by + :func:`load` or :func:`compute`, as these weights are the basis of + the value computed by :func:`information_content`. The Wn + documentation tries to consistently refer to former as the + *information content value*, or just *information content*, and the + latter as *information content weights*, or *weights*. + + +Formal Description +------------------ The Information Content (IC) of a concept (synset) is a measure of its specificity computed from the wordnet's taxonomy structure and corpus @@ -14,7 +36,7 @@ information theory, as the negative log-probability of a concept: .. math:: - IC(c) = -\log{p(c)} + \text{IC}(c) = -\log{p(c)} A concept's probability is the empirical probability over a corpus: @@ -30,29 +52,17 @@ is the set of words subsumed by concept :math:`c`: .. math:: - \text{freq}(c) = \sum_{n \in \text{words}(c)}{\text{count}(n)} - -That is, the frequency of a concept like **stone fruit** is not the -number of occurrences of *stone fruit* or *stone fruits*, but also -includes the counts of *almond*, *almonds*, *peach*, etc. In -algorithmic terms, when encountering a word, the counts of the synsets -of the word and all of the synsets' taxonomic ancestors are -incremented. 
+ \text{freq}(c) = \sum_{w \in \text{words}(c)}{\text{count}(w)} It is common for :math:`\text{freq}` to not contain actual frequencies -but instead weights. These weights are calculated as the word -frequency divided by the number of synsets for that word. +but instead weights distributed evenly among the synsets for a +word. These weights are calculated as the word frequency divided by +the number of synsets for the word: -.. note:: +.. math:: - The term *information content* can be ambiguous. It sometimes - refers to the result of the :func:`information_content` function, - but is also used to refer to the corpus frequencies/weights in the - data structure returned by :func:`load` or :func:`compute`, as - these weights are the basis of the value computed by - :func:`information_content`. The Wn documentation tries to - consistently refer to former as the *information content value* and - the latter as *information content weights*. + \text{freq}_{\text{distributed}}(c) + = \sum_{w \in \text{words}(c)}{\frac{\text{count}(w)}{|\text{synsets}(w)|}} .. [RES95] Resnik, Philip. "Using information content to evaluate semantic similarity." In Proceedings of the 14th International @@ -60,6 +70,29 @@ frequency divided by the number of synsets for that word. Canada, pp. 448-453. 1995. +Example +------- + +In the Princeton WordNet, the frequency of a concept like **stone +fruit** is not the number of occurrences of *stone fruit*, but also +includes the counts of the words for its hyponyms (*almond*, *olive*, +etc.) and other taxonomic descendants (*Jordan almond*, *green olive*, +etc.). The word *almond* has two synsets: one for the fruit or nut, +another for the plant. Thus, if the word *almond* is encountered +:math:`n` times in a corpus, then the weight (either the frequency +:math:`n` or distributed weight :math:`\frac{n}{2}`) is added to the +total weights for both synsets and to those of their ancestors, but +not for descendant synsets, such as for **Jordan almond**. The fruit/nut +synset of almond has two hypernym paths which converge on **fruit**: + +1. **almond** ⊃ **stone fruit** ⊃ **fruit** +2. **almond** ⊃ **nut** ⊃ **seed** ⊃ **fruit** + +The weight is added to each ancestor (**stone fruit**, **nut**, +**seed**, **fruit**, ...) once. That is, the weight is not added to +the convergent ancestor for **fruit** twice, but only once. + + Calculating Information Content ------------------------------- @@ -70,10 +103,61 @@ Calculating Information Content Computing Corpus Weights ------------------------ +If pre-computed weights are not available for a wordnet or for some +domain, they can be computed given a corpus and a wordnet. + +The corpus is an iterable of words. For large corpora it may help to +use a generator for this iterable, but the entire vocabulary (i.e., +unique words and counts) will be held at once in memory. Multi-word +expressions are also possible if they exist in the wordnet. For +instance, the Princeton WordNet has *stone fruit*, with a single space +delimiting the words, as an entry. + +The :class:`wn.Wordnet` object must be instantiated with a single +lexicon, although it may have expand-lexicons for relation +traversal. For best results, the wordnet should use a lemmatizer to +help it deal with inflected wordforms from running text. + .. 
autofunction:: compute Reading Pre-computed Information Content Files ---------------------------------------------- +The :func:`load` function reads pre-computed information content +weights files as used by the `WordNet::Similarity +`_ Perl module or the `NLTK +`_ Python package. These files are computed for +a specific version of a wordnet using the synset offsets from the +`WNDB `_ format, +which Wn does not use. These offsets therefore must be converted into +an identifier that matches those used by the wordnet. By default, +:func:`load` uses the lexicon identifier from its *wordnet* argument +with synset offsets (padded with 0s to make 8 digits) and +parts-of-speech from the weights file to format an identifier, such as +``pwn-00001174-n``. For wordnets that use a different identifier +scheme, the *get_synset_id* parameter of :func:`load` can be given a +callable created with :func:`wn.util.synset_id_formatter`. It can also +be given another callable with the same signature as shown below: + +.. code-block:: python + + get_synset_id(*, offset: int, pos: str) -> str + +.. warning:: + + The weights files are only valid for the version of wordnet for + which they were created. Files created for the Princeton WordNet + 3.0 do not work for the Princeton WordNet 3.1 because the offsets + used in its identifiers are different, although the *get_synset_id* + parameter of :func:`load` could be given a function that performs a + suitable mapping. Some `Open Multilingual Wordnet + `_ wordnets use the Princeton + WordNet 3.0 offsets in their identifiers and can therefore + technically use the weights, but this usage is discouraged because + the distributional properties of text in another language and the + structure of the other wordnet will not be compatible with that of + the Princeton WordNet. For these cases, it is recommended to + compute new weights using :func:`compute`. + .. autofunction:: load diff --git a/wn/ic.py b/wn/ic.py index a85693f..68868df 100644 --- a/wn/ic.py +++ b/wn/ic.py @@ -1,5 +1,8 @@ -r"""Information Content""" +"""Information Content is a corpus-based metrics of synset or sense +specificity. + +""" from typing import ( Callable, Optional, Iterator, Iterable, Dict, List, Tuple, Set, TextIO @@ -46,7 +49,7 @@ def synset_probability(synset: Synset, freq: Freq) -> float: def _initialize( wordnet: Wordnet, - smoothing: float = 0.0, + smoothing: float, ) -> Freq: """Populate an Information Content weight mapping to a smoothing value. @@ -71,7 +74,7 @@ def compute( corpus: Iterable[str], wordnet: Wordnet, distribute_weight: bool = True, - smoothing: float = 0.0 + smoothing: float = 1.0 ) -> Freq: """Compute Information Content weights from a corpus. @@ -105,7 +108,7 @@ def compute( >>> freq['n'][carnivore.id] 1.3250000000000002 """ - freq = _initialize(wordnet, smoothing=smoothing) + freq = _initialize(wordnet, smoothing) counts = Counter(corpus) hypernym_cache: Dict[Synset, List[Synset]] = {} @@ -159,28 +162,20 @@ def load( ) -> Freq: """Load an Information Content mapping from a file. - The *source* argument is a path to an Information Content (IC) - file as used by the WordNet::Similarity Perl module or the - NLTK. The *wordnet* argument is a :class:`wn.Wordnet` instance - **with synset identifiers matching the offsets in the IC file**. A - :class:`wn.Error` is raised if *wordnet* does not have exactly one - lexicon. - - The *get_synset_id* argument should be a callable that returns a - valid synset ID when called as follows: + Arguments: - .. 
code-block:: python + source: A path to an information content weights file. - get_synset_id(offset=offset, pos=pos) + wordnet: A :class:`wn.Wordnet` instance with synset + identifiers matching the offsets in the weights file. - The integer ``offset`` and string ``pos`` arguments come from the - offsets and parts-of-speech parsed from the IC file. If - *get_synset_id* is :python:`None`, a default function is created - with :func:`wn.util.synset_id_formatter` as follows: + get_synset_id: A callable that takes a synset offset and part + of speech and returns a synset ID valid in *wordnet*. - .. code-block:: python + Raises: - get_synset_id = synset_id_formatter(prefix=wordnet.lexicons()[0].id) + :class:`wn.Error`: If *wordnet* does not have exactly one + lexicon. Example: @@ -196,7 +191,7 @@ def load( if get_synset_id is None: get_synset_id = synset_id_formatter(prefix=lexid) - freq = _initialize(wordnet, smoothing=0.0) + freq = _initialize(wordnet, 0.0) with source.open() as icfile: for offset, pos, weight, is_root in _parse_ic_file(icfile): From 7280bcfcb3a64ceb76e44ca0fde38845aacc3c22 Mon Sep 17 00:00:00 2001 From: Michael Wayne Goodman Date: Thu, 24 Jun 2021 12:39:01 +0800 Subject: [PATCH 08/20] Close #125: Add wn.taxonomy module --- CHANGELOG.md | 13 ++ docs/api/wn.rst | 32 +++- docs/api/wn.taxonomy.rst | 66 ++++++++ docs/index.rst | 1 + tests/paths_test.py | 64 ------- tests/taxonomy_test.py | 113 +++++++++++++ wn/_core.py | 172 +++---------------- wn/taxonomy.py | 351 +++++++++++++++++++++++++++++++++++++++ 8 files changed, 590 insertions(+), 222 deletions(-) create mode 100644 docs/api/wn.taxonomy.rst delete mode 100644 tests/paths_test.py create mode 100644 tests/taxonomy_test.py create mode 100644 wn/taxonomy.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 8537634..027f194 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,17 @@ ## [Unreleased] +### Added + +* `wn.ic` module ([#40] +* `wn.taxonomy` module ([#125]) + +### Changed + +* Taxonomy methods on `wn.Synset` are moved to `wn.taxonomy`, but + shortcut methods remain for compatibility ([#125]). + + ## [v0.7.0] **Release date: 2021-06-09** @@ -367,6 +378,7 @@ abandoned, but this is an entirely new codebase. [#17]: https://github.com/goodmami/wn/issues/17 [#19]: https://github.com/goodmami/wn/issues/19 [#23]: https://github.com/goodmami/wn/issues/23 +[#40]: https://github.com/goodmami/wn/issues/40 [#46]: https://github.com/goodmami/wn/issues/46 [#47]: https://github.com/goodmami/wn/issues/47 [#58]: https://github.com/goodmami/wn/issues/58 @@ -406,3 +418,4 @@ abandoned, but this is an entirely new codebase. [#115]: https://github.com/goodmami/wn/issues/115 [#116]: https://github.com/goodmami/wn/issues/116 [#117]: https://github.com/goodmami/wn/issues/117 +[#125]: https://github.com/goodmami/wn/issues/125 diff --git a/docs/api/wn.rst b/docs/api/wn.rst index 3084b65..18c8a2a 100644 --- a/docs/api/wn.rst +++ b/docs/api/wn.rst @@ -218,17 +218,37 @@ The Synset Class .. automethod:: hyponyms .. automethod:: holonyms .. automethod:: meronyms - .. automethod:: hypernym_paths - .. automethod:: min_depth - .. automethod:: max_depth - .. automethod:: shortest_path - .. automethod:: common_hypernyms - .. automethod:: lowest_common_hypernyms .. automethod:: get_related .. automethod:: closure .. automethod:: relation_paths .. automethod:: translate + .. The taxonomy methods below have been moved to wn.taxonomy + + .. method:: hypernym_paths(simulate_root=False) + + Shortcut for :func:`wn.taxonomy.hypernym_paths`. + + .. 
method:: min_depth(simulate_root=False) + + Shortcut for :func:`wn.taxonomy.min_depth`. + + .. method:: max_depth(simulate_root=False) + + Shortcut for :func:`wn.taxonomy.max_depth`. + + .. method:: shortest_path(other, simulate_root=False) + + Shortcut for :func:`wn.taxonomy.shortest_path`. + + .. method:: common_hypernyms(other, simulate_root=False) + + Shortcut for :func:`wn.taxonomy.common_hypernyms`. + + .. method:: lowest_common_hypernyms(other, simulate_root=False) + + Shortcut for :func:`wn.taxonomy.lowest_common_hypernyms`. + The ILI Class ------------- diff --git a/docs/api/wn.taxonomy.rst b/docs/api/wn.taxonomy.rst new file mode 100644 index 0000000..41ac25f --- /dev/null +++ b/docs/api/wn.taxonomy.rst @@ -0,0 +1,66 @@ + +wn.taxonomy +=========== + +.. automodule:: wn.taxonomy + + +Overview +-------- + +Among the valid synset relations for wordnets (see +:data:`wn.constants.SYNSET_RELATIONS`), those used for describing +*is-a* `taxonomies `_ are +given special treatment and they are generally the most +well-developed relations in any wordnet. Typically these are the +``hypernym`` and ``hyponym`` relations, which encode *is-a-type-of* +relationships (e.g., a *hermit crab* is a type of *decapod*, which is +a type of *crustacean*, etc.). They also include ``instance_hypernym`` +and ``instance_hyponym``, which encode *is-an-instance-of* +relationships (e.g., *Oregon* is an instance of *American state*). + +The taxonomy forms a multiply-inheriting hierarchy with the synsets as +nodes. In the English wordnets, such as the Princeton WordNet, nearly +all nominal synsets form such a hierarchy with single root node, while +verbal synsets form many smaller hierarchies without a common +root. Other wordnets may have different properties, but as many are +based off of the Princeton WordNet, they tend to follow this +structure. + +Functions to find paths within the taxonomies form the basis of all +:mod:`wordnet similarity measures `. For instance, the +:ref:`leacock-chodorow-similarity` measure uses both +:func:`shortest_path` and (indirectly) :func:`taxonomy_depth`. + + +Wordnet-level Functions +----------------------- + +Root and leaf synsets in the taxonomy are those with no ancestors +(``hypernym``, ``instance_hypernym``, etc.) or hyponyms (``hyponym``, +``instance_hyponym``, etc.), respectively. + +Finding root and leaf synsets +''''''''''''''''''''''''''''' + +.. autofunction:: roots +.. autofunction:: leaves + +Computing the taxonomy depth +'''''''''''''''''''''''''''' + +The taxonomy depth is the maximum depth from a root node to a leaf +node within synsets for a particular part of speech. + +.. autofunction:: taxonomy_depth + + +Synset-level Functions +---------------------- + +.. autofunction:: hypernym_paths +.. autofunction:: min_depth +.. autofunction:: max_depth +.. autofunction:: shortest_path +.. autofunction:: common_hypernyms +.. 
autofunction:: lowest_common_hypernyms diff --git a/docs/index.rst b/docs/index.rst index 877b0f4..bc52e04 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -63,5 +63,6 @@ Contents api/wn.morphy.rst api/wn.project.rst api/wn.similarity.rst + api/wn.taxonomy.rst api/wn.util.rst api/wn.web.rst diff --git a/tests/paths_test.py b/tests/paths_test.py deleted file mode 100644 index b8af6dc..0000000 --- a/tests/paths_test.py +++ /dev/null @@ -1,64 +0,0 @@ - -import pytest - -import wn - - -@pytest.mark.usefixtures('mini_db') -def test_hypernym_paths(): - information = wn.synsets('information')[0] - example = wn.synsets('example')[0] - sample = wn.synsets('sample')[0] - random_sample = wn.synsets('random sample')[0] - assert information.hypernym_paths() == [] - assert example.hypernym_paths() == [[information]] - assert sample.hypernym_paths() == [[example, information]] - assert random_sample.hypernym_paths() == [[sample, example, information]] - - -@pytest.mark.usefixtures('mini_db') -def test_interlingual_hypernym_paths(): - información = wn.synsets('información')[0] - ejemplo = wn.synsets('ejemplo')[0] - inferred = wn.Synset.empty('*INFERRED*') - muestra_aleatoria = wn.synsets('muestra aleatoria')[0] - assert información.hypernym_paths() == [] - assert ejemplo.hypernym_paths() == [[información]] - assert muestra_aleatoria.hypernym_paths() == [[inferred, ejemplo, información]] - - -@pytest.mark.usefixtures('mini_db') -def test_shortest_path(): - information = wn.synsets('information')[0] - example = wn.synsets('example')[0] - sample = wn.synsets('sample')[0] - random_sample = wn.synsets('random sample')[0] - datum = wn.synsets('datum')[0] - exemplify = wn.synsets('exemplify')[0] - inferred_root = wn.Synset.empty('*INFERRED*') - assert information.shortest_path(information) == [] - assert information.shortest_path(datum) == [datum] - assert information.shortest_path(sample) == [example, sample] - assert sample.shortest_path(information) == [example, information] - assert random_sample.shortest_path(datum) == [sample, example, information, datum] - with pytest.raises(wn.Error): - example.shortest_path(exemplify) - assert example.shortest_path(exemplify, simulate_root=True) == [ - information, inferred_root, exemplify - ] - - -@pytest.mark.usefixtures('mini_db') -def test_min_depth(): - assert wn.synsets('information')[0].min_depth() == 0 - assert wn.synsets('example')[0].min_depth() == 1 - assert wn.synsets('sample')[0].min_depth() == 2 - assert wn.synsets('random sample')[0].min_depth() == 3 - - -@pytest.mark.usefixtures('mini_db') -def test_max_depth(): - assert wn.synsets('information')[0].max_depth() == 0 - assert wn.synsets('example')[0].max_depth() == 1 - assert wn.synsets('sample')[0].max_depth() == 2 - assert wn.synsets('random sample')[0].max_depth() == 3 diff --git a/tests/taxonomy_test.py b/tests/taxonomy_test.py new file mode 100644 index 0000000..55f6071 --- /dev/null +++ b/tests/taxonomy_test.py @@ -0,0 +1,113 @@ + +import pytest + +import wn +from wn.taxonomy import ( + roots, + leaves, + taxonomy_depth, + hypernym_paths, + min_depth, + max_depth, + shortest_path, + common_hypernyms, + lowest_common_hypernyms, +) + + +@pytest.mark.usefixtures('mini_db') +def test_roots(): + en = wn.Wordnet('test-en') + assert set(roots(en, pos='n')) == {en.synset('test-en-0001-n'), + en.synset('test-en-0008-n')} + assert set(roots(en, pos='v')) == {en.synset('test-en-0003-v'), + en.synset('test-en-0007-v')} + assert roots(en, pos='a') == [] + assert set(roots(en)) == set(roots(en, pos='n') + 
roots(en, pos='v')) + + # with no expand relations and no relation of its own, every + # synset looks like a root + es = wn.Wordnet('test-es') + assert set(roots(es, pos='n')) == {es.synset('test-es-0001-n'), + es.synset('test-es-0002-n'), + es.synset('test-es-0005-n')} + + es = wn.Wordnet('test-es', expand='test-en') + assert roots(es, pos='n') == [es.synset('test-es-0001-n')] + + +@pytest.mark.usefixtures('mini_db') +def test_leaves(): + en = wn.Wordnet('test-en') + assert set(leaves(en, pos='n')) == {en.synset('test-en-0005-n'), + en.synset('test-en-0006-n'), + en.synset('test-en-0008-n')} + assert set(leaves(en, pos='v')) == {en.synset('test-en-0003-v'), + en.synset('test-en-0007-v')} + + +@pytest.mark.usefixtures('mini_db') +def test_taxonomy_depth(): + en = wn.Wordnet('test-en') + assert taxonomy_depth(en, pos='n') == 3 + assert taxonomy_depth(en, pos='v') == 0 + + +@pytest.mark.usefixtures('mini_db') +def test_hypernym_paths(): + information = wn.synsets('information')[0] + example = wn.synsets('example')[0] + sample = wn.synsets('sample')[0] + random_sample = wn.synsets('random sample')[0] + assert hypernym_paths(information) == [] + assert hypernym_paths(example) == [[information]] + assert hypernym_paths(sample) == [[example, information]] + assert hypernym_paths(random_sample) == [[sample, example, information]] + + +@pytest.mark.usefixtures('mini_db') +def test_interlingual_hypernym_paths(): + información = wn.synsets('información')[0] + ejemplo = wn.synsets('ejemplo')[0] + inferred = wn.Synset.empty('*INFERRED*') + muestra_aleatoria = wn.synsets('muestra aleatoria')[0] + assert hypernym_paths(información) == [] + assert hypernym_paths(ejemplo) == [[información]] + assert hypernym_paths(muestra_aleatoria) == [[inferred, ejemplo, información]] + + +@pytest.mark.usefixtures('mini_db') +def test_shortest_path(): + information = wn.synsets('information')[0] + example = wn.synsets('example')[0] + sample = wn.synsets('sample')[0] + random_sample = wn.synsets('random sample')[0] + datum = wn.synsets('datum')[0] + exemplify = wn.synsets('exemplify')[0] + inferred_root = wn.Synset.empty('*INFERRED*') + assert shortest_path(information, information) == [] + assert shortest_path(information, datum) == [datum] + assert shortest_path(information, sample) == [example, sample] + assert shortest_path(sample, information) == [example, information] + assert shortest_path(random_sample, datum) == [sample, example, information, datum] + with pytest.raises(wn.Error): + shortest_path(example, exemplify) + assert shortest_path(example, exemplify, simulate_root=True) == [ + information, inferred_root, exemplify + ] + + +@pytest.mark.usefixtures('mini_db') +def test_min_depth(): + assert min_depth(wn.synsets('information')[0]) == 0 + assert min_depth(wn.synsets('example')[0]) == 1 + assert min_depth(wn.synsets('sample')[0]) == 2 + assert min_depth(wn.synsets('random sample')[0]) == 3 + + +@pytest.mark.usefixtures('mini_db') +def test_max_depth(): + assert max_depth(wn.synsets('information')[0]) == 0 + assert max_depth(wn.synsets('example')[0]) == 1 + assert max_depth(wn.synsets('sample')[0]) == 2 + assert max_depth(wn.synsets('random sample')[0]) == 3 diff --git a/wn/_core.py b/wn/_core.py index 8ee1d18..1aa4186 100644 --- a/wn/_core.py +++ b/wn/_core.py @@ -19,7 +19,7 @@ NormalizeFunction, LemmatizeFunction, ) -from wn._util import flatten, normalize_form +from wn._util import normalize_form from wn._db import NON_ROWID from wn._queries import ( find_lexicons, @@ -50,8 +50,8 @@ get_sense_counts, 
get_lexfile, ) +from wn import taxonomy -_FAKE_ROOT = '*ROOT*' _INFERRED_SYNSET = '*INFERRED*' @@ -692,172 +692,41 @@ def _get_relations(self, args: Sequence[str]) -> List[Tuple[str, 'Synset']]: return targets - def _hypernym_paths( - self, simulate_root: bool, include_self: bool - ) -> List[List['Synset']]: - paths = list(self.relation_paths('hypernym', 'instance_hypernym')) - if include_self: - paths = [[self] + path for path in paths] or [[self]] - if simulate_root and self.id != _FAKE_ROOT: - root = Synset.empty( - id=_FAKE_ROOT, _lexid=self._lexid, _wordnet=self._wordnet - ) - paths = [path + [root] for path in paths] or [[root]] - return paths - def hypernym_paths(self, simulate_root: bool = False) -> List[List['Synset']]: - """Return the list of hypernym paths to a root synset. - - Example: - - >>> for path in wn.synsets('dog', pos='n')[0].hypernym_paths(): - ... for i, ss in enumerate(path): - ... print(' ' * i, ss, ss.lemmas()[0]) - ... - Synset('pwn-02083346-n') canine - Synset('pwn-02075296-n') carnivore - Synset('pwn-01886756-n') eutherian mammal - Synset('pwn-01861778-n') mammalian - Synset('pwn-01471682-n') craniate - Synset('pwn-01466257-n') chordate - Synset('pwn-00015388-n') animal - Synset('pwn-00004475-n') organism - Synset('pwn-00004258-n') animate thing - Synset('pwn-00003553-n') unit - Synset('pwn-00002684-n') object - Synset('pwn-00001930-n') physical entity - Synset('pwn-00001740-n') entity - Synset('pwn-01317541-n') domesticated animal - Synset('pwn-00015388-n') animal - Synset('pwn-00004475-n') organism - Synset('pwn-00004258-n') animate thing - Synset('pwn-00003553-n') unit - Synset('pwn-00002684-n') object - Synset('pwn-00001930-n') physical entity - Synset('pwn-00001740-n') entity - - """ - return self._hypernym_paths(simulate_root, False) + """Return the list of hypernym paths to a root synset.""" + return taxonomy.hypernym_paths(self, simulate_root=simulate_root) def min_depth(self, simulate_root: bool = False) -> int: - """Return the minimum taxonomy depth of the synset. - - Example: - - >>> wn.synsets('dog', pos='n')[0].min_depth() - 8 - - """ - return min( - (len(path) for path in self.hypernym_paths(simulate_root=simulate_root)), - default=0 - ) + """Return the minimum taxonomy depth of the synset.""" + return taxonomy.min_depth(self, simulate_root=simulate_root) def max_depth(self, simulate_root: bool = False) -> int: - """Return the maximum taxonomy depth of the synset. - - Example: - - >>> wn.synsets('dog', pos='n')[0].max_depth() - 13 - - """ - return max( - (len(path) for path in self.hypernym_paths(simulate_root=simulate_root)), - default=0 - ) - - def _shortest_hyp_paths( - self, other: 'Synset', simulate_root: bool - ) -> Dict[Tuple['Synset', int], List['Synset']]: - if self == other: - return {(self, 0): []} - - from_self = self._hypernym_paths(simulate_root, True) - from_other = other._hypernym_paths(simulate_root, True) - common = set(flatten(from_self)).intersection(flatten(from_other)) - - if not common: - return {} - - # Compute depths of common hypernyms from their distances. - # Doing this now avoid more expensive lookups later. 
- depths: Dict['Synset', int] = {} - # subpaths accumulates paths to common hypernyms from both sides - subpaths: Dict['Synset', Tuple[List[List['Synset']], List[List['Synset']]]] - subpaths = {ss: ([], []) for ss in common} - for which, paths in (0, from_self), (1, from_other): - for path in paths: - for dist, ss in enumerate(path): - if ss in common: - # self or other subpath to ss (not including ss) - subpaths[ss][which].append(path[:dist + 1]) - # keep maximum depth - depth = len(path) - dist - 1 - if ss not in depths or depths[ss] < depth: - depths[ss] = depth - - shortest: Dict[Tuple['Synset', int], List['Synset']] = {} - for ss in common: - from_self_subpaths, from_other_subpaths = subpaths[ss] - shortest_from_self = min(from_self_subpaths, key=len) - # for the other path, we need to reverse it and remove the pivot synset - shortest_from_other = min(from_other_subpaths, key=len)[-2::-1] - shortest[(ss, depths[ss])] = shortest_from_self + shortest_from_other - - return shortest + """Return the maximum taxonomy depth of the synset.""" + return taxonomy.max_depth(self, simulate_root=simulate_root) def shortest_path( self, other: 'Synset', simulate_root: bool = False ) -> List['Synset']: - """Return the shortest path from the synset to the *other* synset. - - Arguments: - other: endpoint synset of the path - simulate_root: if :python:`True`, ensure any two synsets - are always connected by positing a fake root node - - """ - pathmap = self._shortest_hyp_paths(other, simulate_root) - key = min(pathmap, key=lambda key: len(pathmap[key]), default=None) - if key is None: - raise wn.Error(f'no path between {self!r} and {other!r}') - return pathmap[key][1:] + """Return the shortest path from the synset to the *other* synset.""" + return taxonomy.shortest_path( + self, other, simulate_root=simulate_root + ) def common_hypernyms( self, other: 'Synset', simulate_root: bool = False ) -> List['Synset']: - """Return the common hypernyms for the current and *other* synsets. - - Arguments: - other: synset that is a hyponym of any shared hypernyms - simulate_root: if :python:`True`, ensure any two synsets - always share a hypernym by positing a fake root node - - """ - from_self = self._hypernym_paths(simulate_root, True) - from_other = other._hypernym_paths(simulate_root, True) - common = set(flatten(from_self)).intersection(flatten(from_other)) - return sorted(common) + """Return the common hypernyms for the current and *other* synsets.""" + return taxonomy.common_hypernyms( + self, other, simulate_root=simulate_root + ) def lowest_common_hypernyms( self, other: 'Synset', simulate_root: bool = False ) -> List['Synset']: - """Return the common hypernyms furthest from the root. - - Arguments: - other: synset that is a hyponym of any shared hypernyms - simulate_root: if :python:`True`, ensure any two synsets - always share a hypernym by positing a fake root node - - """ - pathmap = self._shortest_hyp_paths(other, simulate_root) - # keys of pathmap are (synset, depth_of_synset) - max_depth: int = max([depth for _, depth in pathmap], default=-1) - if max_depth == -1: - return [] - else: - return [ss for ss, d in pathmap if d == max_depth] + """Return the common hypernyms furthest from the root.""" + return taxonomy.lowest_common_hypernyms( + self, other, simulate_root=simulate_root + ) def holonyms(self) -> List['Synset']: """Return the list of synsets related by any holonym relation. 
@@ -932,7 +801,6 @@ def translate(self, lexicon: str = None, *, lang: str = None) -> List['Synset']: ['spider'] """ - ili = self._ili if not ili: return [] diff --git a/wn/taxonomy.py b/wn/taxonomy.py new file mode 100644 index 0000000..61894d5 --- /dev/null +++ b/wn/taxonomy.py @@ -0,0 +1,351 @@ + +"""Functions for working with hypernym/hyponym taxonomies.""" + +from typing import Optional, Tuple, List, Set, Dict, TYPE_CHECKING + +import wn +from wn.constants import ADJ, ADJ_SAT +from wn._util import flatten +from wn import _core + +if TYPE_CHECKING: + from wn._core import Wordnet, Synset + + +_FAKE_ROOT = '*ROOT*' + + +def roots(wordnet: 'Wordnet', pos: Optional[str] = None) -> List['Synset']: + """Return the list of root synsets in *wordnet*. + + Arguments: + + wordnet: The wordnet from which root synsets are found. + + pos: If given, only return synsets with the specified part of + speech. + + Example: + + >>> import wn, wn.taxonomy + >>> ewn = wn.Wordnet('ewn:2020') + >>> len(wn.taxonomy.roots(ewn, pos='v')) + 573 + + + """ + return [ss for ss in _synsets_for_pos(wordnet, pos) if not ss.hypernyms()] + + +def leaves(wordnet: 'Wordnet', pos: Optional[str] = None) -> List['Synset']: + """Return the list of leaf synsets in *wordnet*. + + Arguments: + + wordnet: The wordnet from which leaf synsets are found. + + pos: If given, only return synsets with the specified part of + speech. + + Example: + + >>> import wn, wn.taxonomy + >>> ewn = wn.Wordnet('ewn:2020') + >>> len(wn.taxonomy.leaves(ewn, pos='v')) + 10525 + + """ + return [ss for ss in _synsets_for_pos(wordnet, pos) if not ss.hyponyms()] + + +def taxonomy_depth(wordnet: 'Wordnet', pos: str) -> int: + """Return the list of leaf synsets in *wordnet*. + + Arguments: + + wordnet: The wordnet for which the taxonomy depth will be + calculated. + + pos: The part of speech for which the taxonomy depth will be + calculated. + + Example: + + >>> import wn, wn.taxonomy + >>> ewn = wn.Wordnet('ewn:2020') + >>> wn.taxonomy.taxonomy_depth(ewn, 'n') + 19 + + """ + seen: Set['Synset'] = set() + depth = 0 + for ss in _synsets_for_pos(wordnet, pos): + if all(hyp in seen for hyp in ss.hypernyms()): + continue + paths = ss.hypernym_paths() + if paths: + depth = max(depth, max(len(path) for path in paths)) + seen.update(hyp for path in paths for hyp in path) + return depth + + +def _synsets_for_pos(wordnet: 'Wordnet', pos: Optional[str]) -> List['Synset']: + """Get the list of synsets for a part of speech. If *pos* is 'a' or + 's', also include those for the other. + + """ + synsets = wordnet.synsets(pos=pos) + if pos == ADJ: + synsets.extend(wordnet.synsets(pos=ADJ_SAT)) + elif pos == ADJ_SAT: + synsets.extend(wordnet.synsets(pos=ADJ)) + return synsets + + +def _hypernym_paths( + synset: 'Synset', simulate_root: bool, include_self: bool +) -> List[List['Synset']]: + paths = list(synset.relation_paths('hypernym', 'instance_hypernym')) + if include_self: + paths = [[synset] + path for path in paths] or [[synset]] + if simulate_root and synset.id != _FAKE_ROOT: + root = _core.Synset.empty( + id=_FAKE_ROOT, _lexid=synset._lexid, _wordnet=synset._wordnet + ) + paths = [path + [root] for path in paths] or [[root]] + return paths + + +def hypernym_paths( + synset: 'Synset', + simulate_root: bool = False +) -> List[List['Synset']]: + """Return the list of hypernym paths to a root synset. + + Arguments: + + synset: The starting synset for paths to a root. + + simulate_root: If :python:`True`, find the path to a simulated + root node. 
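The module-level helpers introduced above (`roots`, `leaves`, `taxonomy_depth`) are all derived from hypernym relations. As a rough sketch of the idea behind `taxonomy_depth` — the real function also merges adjective/satellite synsets and skips hypernyms it has already seen to avoid redundant path lookups:

```python
def naive_taxonomy_depth(wordnet, pos):
    # Longest hypernym path over all synsets of the given part of speech.
    depth = 0
    for ss in wordnet.synsets(pos=pos):
        for path in ss.hypernym_paths():
            depth = max(depth, len(path))
    return depth
```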
+ + Example: + + >>> import wn, wn.taxonomy + >>> dog = wn.synsets('dog', pos='n')[0] + >>> for path in wn.taxonomy.hypernym_paths(dog): + ... for i, ss in enumerate(path): + ... print(' ' * i, ss, ss.lemmas()[0]) + ... + Synset('pwn-02083346-n') canine + Synset('pwn-02075296-n') carnivore + Synset('pwn-01886756-n') eutherian mammal + Synset('pwn-01861778-n') mammalian + Synset('pwn-01471682-n') craniate + Synset('pwn-01466257-n') chordate + Synset('pwn-00015388-n') animal + Synset('pwn-00004475-n') organism + Synset('pwn-00004258-n') animate thing + Synset('pwn-00003553-n') unit + Synset('pwn-00002684-n') object + Synset('pwn-00001930-n') physical entity + Synset('pwn-00001740-n') entity + Synset('pwn-01317541-n') domesticated animal + Synset('pwn-00015388-n') animal + Synset('pwn-00004475-n') organism + Synset('pwn-00004258-n') animate thing + Synset('pwn-00003553-n') unit + Synset('pwn-00002684-n') object + Synset('pwn-00001930-n') physical entity + Synset('pwn-00001740-n') entity + + """ + return _hypernym_paths(synset, simulate_root, False) + + +def min_depth(synset: 'Synset', simulate_root: bool = False) -> int: + """Return the minimum taxonomy depth of the synset. + + Arguments: + + synset: The starting synset for paths to a root. + + simulate_root: If :python:`True`, find the depth to a + simulated root node. + + Example: + + >>> import wn, wn.taxonomy + >>> dog = wn.synsets('dog', pos='n')[0] + >>> wn.taxonomy.min_depth(dog) + 8 + + """ + return min( + (len(path) for path in synset.hypernym_paths(simulate_root=simulate_root)), + default=0 + ) + + +def max_depth(synset: 'Synset', simulate_root: bool = False) -> int: + """Return the maximum taxonomy depth of the synset. + + Arguments: + + synset: The starting synset for paths to a root. + + simulate_root: If :python:`True`, find the depth to a + simulated root node. + + Example: + + >>> import wn, wn.taxonomy + >>> dog = wn.synsets('dog', pos='n')[0] + >>> wn.taxonomy.max_depth(dog) + 13 + + """ + return max( + (len(path) for path in synset.hypernym_paths(simulate_root=simulate_root)), + default=0 + ) + + +def _shortest_hyp_paths( + synset: 'Synset', other: 'Synset', simulate_root: bool +) -> Dict[Tuple['Synset', int], List['Synset']]: + if synset == other: + return {(synset, 0): []} + + from_self = _hypernym_paths(synset, simulate_root, True) + from_other = _hypernym_paths(other, simulate_root, True) + common = set(flatten(from_self)).intersection(flatten(from_other)) + + if not common: + return {} + + # Compute depths of common hypernyms from their distances. + # Doing this now avoid more expensive lookups later. 
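To make the depth bookkeeping in `_shortest_hyp_paths` concrete, here is the arithmetic on a hypothetical `include_self=True` path (names borrowed from the mini test lexicon; illustration only):

```python
path = ['random sample', 'sample', 'example', 'information']  # path[-1] is a root
dist = path.index('information')   # 3 links from the starting synset
depth = len(path) - dist - 1       # 0: a root has depth 0 on this path
# 'example' sits at dist == 2, so its depth here is 1; the maximum over all paths is kept.
```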
+ depths: Dict['Synset', int] = {} + # subpaths accumulates paths to common hypernyms from both sides + subpaths: Dict['Synset', Tuple[List[List['Synset']], List[List['Synset']]]] + subpaths = {ss: ([], []) for ss in common} + for which, paths in (0, from_self), (1, from_other): + for path in paths: + for dist, ss in enumerate(path): + if ss in common: + # synset or other subpath to ss (not including ss) + subpaths[ss][which].append(path[:dist + 1]) + # keep maximum depth + depth = len(path) - dist - 1 + if ss not in depths or depths[ss] < depth: + depths[ss] = depth + + shortest: Dict[Tuple['Synset', int], List['Synset']] = {} + for ss in common: + from_self_subpaths, from_other_subpaths = subpaths[ss] + shortest_from_self = min(from_self_subpaths, key=len) + # for the other path, we need to reverse it and remove the pivot synset + shortest_from_other = min(from_other_subpaths, key=len)[-2::-1] + shortest[(ss, depths[ss])] = shortest_from_self + shortest_from_other + + return shortest + + +def shortest_path( + synset: 'Synset', other: 'Synset', simulate_root: bool = False +) -> List['Synset']: + """Return the shortest path from *synset* to the *other* synset. + + Arguments: + other: endpoint synset of the path + simulate_root: if :python:`True`, ensure any two synsets + are always connected by positing a fake root node + + Example: + + >>> import wn, wn.taxonomy + >>> dog = ewn.synsets('dog', pos='n')[0] + >>> squirrel = ewn.synsets('squirrel', pos='n')[0] + >>> for ss in wn.taxonomy.shortest_path(dog, squirrel): + ... print(ss.lemmas()) + ... + ['canine', 'canid'] + ['carnivore'] + ['eutherian mammal', 'placental', 'placental mammal', 'eutherian'] + ['rodent', 'gnawer'] + ['squirrel'] + + """ + pathmap = _shortest_hyp_paths(synset, other, simulate_root) + key = min(pathmap, key=lambda key: len(pathmap[key]), default=None) + if key is None: + raise wn.Error(f'no path between {synset!r} and {other!r}') + return pathmap[key][1:] + + +def common_hypernyms( + synset: 'Synset', other: 'Synset', simulate_root: bool = False +) -> List['Synset']: + """Return the common hypernyms for the current and *other* synsets. + + Arguments: + other: synset that is a hyponym of any shared hypernyms + simulate_root: if :python:`True`, ensure any two synsets + always share a hypernym by positing a fake root node + + Example: + + >>> import wn, wn.taxonomy + >>> dog = ewn.synsets('dog', pos='n')[0] + >>> squirrel = ewn.synsets('squirrel', pos='n')[0] + >>> for ss in wn.taxonomy.common_hypernyms(dog, squirrel): + ... print(ss.lemmas()) + ... + ['entity'] + ['physical entity'] + ['object', 'physical object'] + ['unit', 'whole'] + ['animate thing', 'living thing'] + ['organism', 'being'] + ['fauna', 'beast', 'animate being', 'brute', 'creature', 'animal'] + ['chordate'] + ['craniate', 'vertebrate'] + ['mammalian', 'mammal'] + ['eutherian mammal', 'placental', 'placental mammal', 'eutherian'] + + """ + from_self = _hypernym_paths(synset, simulate_root, True) + from_other = _hypernym_paths(other, simulate_root, True) + common = set(flatten(from_self)).intersection(flatten(from_other)) + return sorted(common) + + +def lowest_common_hypernyms( + synset: 'Synset', other: 'Synset', simulate_root: bool = False +) -> List['Synset']: + """Return the common hypernyms furthest from the root. 
+ + Arguments: + other: synset that is a hyponym of any shared hypernyms + simulate_root: if :python:`True`, ensure any two synsets + always share a hypernym by positing a fake root node + + Example: + + >>> import wn, wn.taxonomy + >>> dog = ewn.synsets('dog', pos='n')[0] + >>> squirrel = ewn.synsets('squirrel', pos='n')[0] + >>> len(wn.taxonomy.lowest_common_hypernyms(dog, squirrel)) + 1 + >>> wn.taxonomy.lowest_common_hypernyms(dog, squirrel)[0].lemmas() + ['eutherian mammal', 'placental', 'placental mammal', 'eutherian'] + + """ + pathmap = _shortest_hyp_paths(synset, other, simulate_root) + # keys of pathmap are (synset, depth_of_synset) + max_depth: int = max([depth for _, depth in pathmap], default=-1) + if max_depth == -1: + return [] + else: + return [ss for ss, d in pathmap if d == max_depth] From 545f7e229dcd9b41120c4a0235f11465c4e9cb0d Mon Sep 17 00:00:00 2001 From: Michael Wayne Goodman Date: Thu, 24 Jun 2021 12:54:01 +0800 Subject: [PATCH 09/20] Improve documentation for similarity metrics --- docs/api/wn.similarity.rst | 79 ++++++++++++++++++++++++++++++- wn/similarity.py | 95 ++++++++++++++++++++++++++++++++------ 2 files changed, 160 insertions(+), 14 deletions(-) diff --git a/docs/api/wn.similarity.rst b/docs/api/wn.similarity.rst index b031776..b3e9c7a 100644 --- a/docs/api/wn.similarity.rst +++ b/docs/api/wn.similarity.rst @@ -3,6 +3,83 @@ wn.similarity .. automodule:: wn.similarity +Taxonomy-based Metrics +---------------------- + +The `Path `_, `Leacock-Chodorow `_, and `Wu-Palmer `_ similarity +metrics work by finding path distances in the hypernym/hyponym +taxonomy. As such, they are most useful when the synsets are, in fact, +arranged in a taxonomy. For the Princeton WordNet and derivative +wordnets, synsets for nouns and verbs are arranged taxonomically: the +nouns mostly form a single structure with a single root while verbs +form many smaller structures with many roots. Synsets for the other +parts of speech do not use hypernym/hyponym relations at all. This +situation may be different for other wordnet projects or future +versions of the English wordnets. + +The similarity metrics tend to fail when the synsets are not connected +by some path. When the synsets are in different parts of speech, or +even in separate lexicons, this failure is acceptable and +expected. But for cases like the verbs in the Princeton WordNet, it +might be more useful to pretend that there is some unique root for all +verbs so as to create a path connecting any two of them. For this +purpose, the *simulate_root* parameter is available on the +:func:`path`, :func:`lch`, and :func:`wup` functions, where it is +passed on to calls to :meth:`wn.Synset.shortest_path` and +:meth:`wn.Synset.lowest_common_hypernyms`. Setting *simulate_root* to +:python:`True` can, however, give surprising results if the words are +from a different lexicon. Currently, computing similarity for synsets +from a different part of speech raises an error. + + +Path Similarity +''''''''''''''' + +When :math:`p` is the length of the shortest path between two synsets, +the path similarity is: + +.. math:: + + \frac{1}{p + 1} + +The similarity score ranges between 0.0 and 1.0, where the higher the +score is, the more similar the synsets are. The score is 1.0 when a +synset is compared to itself, and 0.0 when there is no path between +the two synsets (i.e., the path distance is infinite). + .. autofunction:: path -.. autofunction:: wup + + +.. 
_leacock-chodorow-similarity: + +Leacock-Chodorow Similarity +''''''''''''''''''''''''''' + +When :math:`p` is the length of the shortest path between two synsets +and :math:`d` is the maximum taxonomy depth, the Leacock-Chodorow +similarity is: + +.. math:: + + -\text{log}(\frac{p + 1}{2d}) + .. autofunction:: lch + + +Wu-Palmer Similarity +'''''''''''''''''''' + +When *LCS* is the lowest common hypernym (also called "least common +subsumer") between two synsets, :math:`i` is the shortest path +distance from the first synset to *LCS*, :math:`j` is the shortest +path distance from the second synset to *LCS*, and :math:`k` is the +number of nodes (distance + 1) from *LCS* to the root node, then the +Wu-Palmer similarity is: + +.. math:: + + \frac{2k}{i + j + 2k} + +.. autofunction:: wup + diff --git a/wn/similarity.py b/wn/similarity.py index a87dbd3..5a9dc93 100644 --- a/wn/similarity.py +++ b/wn/similarity.py @@ -1,4 +1,6 @@ +"""Synset similarity metrics.""" + import math import wn @@ -8,23 +10,61 @@ def path(synset1, synset2): """Return the Path similarity of *synset1* and *synset2*. - When :math:`d` is the length of the shortest path from *synset1* - to *synset2*, the path similarity is: :math:`\\frac{1}{d + 1}` + Arguments: + synset1: The first synset to compare. + synset2: The second synset to compare. + simulate_root: When :python:`True`, a fake root node connects + all other roots; default: :python:`False`. - """ - distance = len(synset1.shortest_path(synset2, simulate_root=True)) + Example: + >>> import wn + >>> from wn.similarity import path + >>> ewn = wn.Wordnet('ewn:2020') + >>> spatula = ewn.synsets('spatula')[0] + >>> path(spatula, ewn.synsets('pancake')[0]) + 0.058823529411764705 + >>> path(spatula, ewn.synsets('utensil')[0]) + 0.2 + >>> path(spatula, spatula) + 1.0 + >>> flip = ewn.synsets('flip', pos='v')[0] + >>> turn_over = ewn.synsets('turn over', pos='v')[0] + >>> path(flip, turn_over) + 0.0 + >>> path(flip, turn_over, simulate_root=True) + 0.16666666666666666 + + """ return 1 / (distance + 1) def wup(synset1: Synset, synset2: Synset) -> float: """Return the Wu-Palmer similarity of *synset1* and *synset2*. - When *lch* is the lowest common hypernym for *synset1* and - *synset2*, *n1* is the shortest path distance from *synset1* to - *lch*, *n2* is the shortest path distance from *synset2* to *lch*, - and *n3* is the number of nodes (distance + 1) from *lch* to the - root node, then the Wu-Palmer similarity is: - :math:`\\frac{2(n3)}{n1 + n2 + 2(n3)}` + Arguments: + synset1: The first synset to compare. + synset2: The second synset to compare. + simulate_root: When :python:`True`, a fake root node connects + all other roots; default: :python:`False`. + + Raises: + wn.Error: When no path connects the *synset1* and *synset2*. 
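Tying the Path similarity formula above to the API: the score is the reciprocal of one plus the shortest-path length. A rough consistency check (assumes the `ewn:2020` lexicon used in the examples is installed and the final form of `path()` from later in this series):

```python
import wn
from wn.similarity import path

ewn = wn.Wordnet('ewn:2020')
spatula = ewn.synsets('spatula')[0]
utensil = ewn.synsets('utensil')[0]

p = len(spatula.shortest_path(utensil))   # number of edges between the synsets
assert path(spatula, utensil) == 1 / (p + 1)
```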
+ + Example: + >>> import wn + >>> from wn.similarity import wup + >>> ewn = wn.Wordnet('ewn:2020') + >>> spatula = ewn.synsets('spatula')[0] + >>> wup(spatula, ewn.synsets('pancake')[0]) + 0.2 + >>> wup(spatula, ewn.synsets('utensil')[0]) + 0.8 + >>> wup(spatula, spatula) + 1.0 + >>> flip = ewn.synsets('flip', pos='v')[0] + >>> turn_over = ewn.synsets('turn over', pos='v')[0] + >>> wup(flip, turn_over, simulate_root=True) + 0.2857142857142857 """ lch = synset1.lowest_common_hypernyms(synset2, simulate_root=True)[0] @@ -32,11 +72,40 @@ def wup(synset1: Synset, synset2: Synset) -> float: n1 = len(synset1.shortest_path(lch, simulate_root=True)) n2 = len(synset2.shortest_path(lch, simulate_root=True)) return (2 * n3) / (n1 + n2 + 2 * n3) +def lch( + synset1: Synset, + synset2: Synset, + max_depth: int, + simulate_root: bool = False +) -> float: + """Return the Leacock-Chodorow similarity between *synset1* and *synset2*. + + Arguments: + synset1: The first synset to compare. + synset2: The second synset to compare. + max_depth: The taxonomy depth (see :func:`wn.taxonomy.taxonomy_depth`) + simulate_root: When :python:`True`, a fake root node connects + all other roots; default: :python:`False`. + Example: + >>> import wn, wn.taxonomy + >>> from wn.similarity import lch + >>> ewn = wn.Wordnet('ewn:2020') + >>> n_depth = wn.taxonomy.taxonomy_depth(ewn, 'n') + >>> spatula = ewn.synsets('spatula')[0] + >>> lch(spatula, ewn.synsets('pancake')[0], n_depth) + 0.8043728156701697 + >>> lch(spatula, ewn.synsets('utensil')[0], n_depth) + 2.0281482472922856 + >>> lch(spatula, spatula, n_depth) + 3.6375861597263857 + >>> v_depth = taxonomy.taxonomy_depth(ewn, 'v') + >>> flip = ewn.synsets('flip', pos='v')[0] + >>> turn_over = ewn.synsets('turn over', pos='v')[0] + >>> lch(flip, turn_over, v_depth, simulate_root=True) + 1.3862943611198906 -def lch(synset1: Synset, synset2: Synset, max_depth: int = 0) -> float: - """Return the Leacock-Chodorow similarity of *synset1* and *synset2*.""" - distance = len(synset1.shortest_path(synset2, simulate_root=True)) + """ if max_depth <= 0: raise wn.Error('max_depth must be greater than 0') return -math.log((distance + 1) / (2 * max_depth)) From 0e44b93d28b57395d9fe464d676ac1f198f2d8e1 Mon Sep 17 00:00:00 2001 From: Michael Wayne Goodman Date: Thu, 24 Jun 2021 13:55:27 +0800 Subject: [PATCH 10/20] Similarity metrics raise error on different pos This commit also improves tests and type annotations for the similarity metrics. --- CHANGELOG.md | 2 ++ tests/data/mini-lmf-1.0.xml | 1 + tests/similarity_test.py | 52 +++++++++++++++++++++++++++++++++++-- wn/similarity.py | 34 +++++++++++++++++++----- 4 files changed, 80 insertions(+), 9 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 027f194..24f4a8c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,8 @@ * Taxonomy methods on `wn.Synset` are moved to `wn.taxonomy`, but shortcut methods remain for compatibility ([#125]). +* Similarity metrics in `wn.similarity` now raise an error when + synsets come from different parts of speech. 
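A small sketch of the behaviour described in the changelog entry above: comparing synsets from different parts of speech now raises `wn.Error` rather than returning a score (assumes the `ewn:2020` lexicon is installed):

```python
import wn
from wn import similarity as sim

ewn = wn.Wordnet('ewn:2020')
example = ewn.synsets('example', pos='n')[0]
exemplify = ewn.synsets('exemplify', pos='v')[0]

try:
    sim.path(example, exemplify)
except wn.Error as exc:
    print(exc)   # synsets must have the same part of speech
```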
## [v0.7.0] diff --git a/tests/data/mini-lmf-1.0.xml b/tests/data/mini-lmf-1.0.xml index 48663ac..b85e551 100644 --- a/tests/data/mini-lmf-1.0.xml +++ b/tests/data/mini-lmf-1.0.xml @@ -7,6 +7,7 @@ with the following words and hypernym/derivation relations: English: - information ⊃ (example, illustration) ⊃ sample ⊃ random sample - information ⊃ datum +- random sample (second synset) - example ⊳ exemplify - illustration ⊳ illustrate - resignate diff --git a/tests/similarity_test.py b/tests/similarity_test.py index 56fd2dd..a719d90 100644 --- a/tests/similarity_test.py +++ b/tests/similarity_test.py @@ -1,8 +1,11 @@ +from math import log + import pytest import wn from wn import similarity as sim +from wn.taxonomy import taxonomy_depth @pytest.mark.usefixtures('mini_db') @@ -11,6 +14,7 @@ def test_path(): example = wn.synsets('example')[0] sample = wn.synsets('sample')[0] random_sample = wn.synsets('random sample')[0] + random_sample2 = wn.synsets('random sample')[1] datum = wn.synsets('datum')[0] exemplify = wn.synsets('exemplify')[0] assert sim.path(information, information) == 1/1 @@ -18,7 +22,13 @@ def test_path(): assert sim.path(information, sample) == 1/3 assert sim.path(information, random_sample) == 1/4 assert sim.path(random_sample, datum) == 1/5 - assert sim.path(example, exemplify) == 1/4 + assert sim.path(random_sample2, datum) == 0 + assert sim.path(random_sample2, datum, simulate_root=True) == 1/4 + assert sim.path(random_sample, random_sample2, simulate_root=True) == 1/6 + with pytest.raises(wn.Error): + sim.path(example, exemplify) + with pytest.raises(wn.Error): + sim.wup(example, exemplify, simulate_root=True) @pytest.mark.usefixtures('mini_db') @@ -27,6 +37,7 @@ def test_wup(): example = wn.synsets('example')[0] sample = wn.synsets('sample')[0] random_sample = wn.synsets('random sample')[0] + random_sample2 = wn.synsets('random sample')[1] datum = wn.synsets('datum')[0] exemplify = wn.synsets('exemplify')[0] assert sim.wup(information, information) == (2*1) / (0+0+2*1) @@ -34,4 +45,41 @@ def test_wup(): assert sim.wup(information, sample) == (2*1) / (0+2+2*1) assert sim.wup(information, random_sample) == (2*1) / (0+3+2*1) assert sim.wup(random_sample, datum) == (2*1) / (3+1+2*1) - assert sim.wup(example, exemplify) == (2*1) / (2+1+2*1) + with pytest.raises(wn.Error): + assert sim.wup(random_sample2, datum) + assert (sim.wup(random_sample2, datum, simulate_root=True) + == (2*1) / (1+2+2*1)) + assert (sim.wup(random_sample, random_sample2, simulate_root=True) + == (2*1) / (4+1+2*1)) + with pytest.raises(wn.Error): + sim.wup(example, exemplify) + with pytest.raises(wn.Error): + sim.wup(example, exemplify, simulate_root=True) + + +@pytest.mark.usefixtures('mini_db') +def test_lch(): + w = wn.Wordnet('test-en') + d_n = taxonomy_depth(w, 'n') + information = w.synsets('information')[0] + example = w.synsets('example')[0] + sample = w.synsets('sample')[0] + random_sample = w.synsets('random sample')[0] + random_sample2 = wn.synsets('random sample')[1] + datum = w.synsets('datum')[0] + exemplify = w.synsets('exemplify')[0] + assert sim.lch(information, information, d_n) == -log((0+1) / (2*d_n)) + assert sim.lch(information, example, d_n) == -log((1+1) / (2*d_n)) + assert sim.lch(information, sample, d_n) == -log((2+1) / (2*d_n)) + assert sim.lch(information, random_sample, d_n) == -log((3+1) / (2*d_n)) + assert sim.lch(random_sample, datum, d_n) == -log((4+1) / (2*d_n)) + with pytest.raises(wn.Error): + assert sim.lch(random_sample2, datum, d_n) + assert (sim.lch(random_sample2, 
datum, d_n, simulate_root=True) + == -log((3+1) / (2*d_n))) + assert (sim.lch(random_sample, random_sample2, d_n, simulate_root=True) + == -log((5+1) / (2*d_n))) + with pytest.raises(wn.Error): + sim.lch(example, exemplify, d_n) + with pytest.raises(wn.Error): + sim.lch(example, exemplify, d_n, simulate_root=True) diff --git a/wn/similarity.py b/wn/similarity.py index 5a9dc93..cfaf201 100644 --- a/wn/similarity.py +++ b/wn/similarity.py @@ -4,10 +4,11 @@ import math import wn +from wn.constants import ADJ, ADJ_SAT from wn._core import Synset -def path(synset1, synset2): +def path(synset1: Synset, synset2: Synset, simulate_root: bool = False) -> float: """Return the Path similarity of *synset1* and *synset2*. Arguments: @@ -35,10 +36,17 @@ def path(synset1, synset2): 0.16666666666666666 """ + _check_if_pos_compatible(synset1.pos, synset2.pos) + try: + path = synset1.shortest_path(synset2, simulate_root=simulate_root) + except wn.Error: + distance = float('inf') + else: + distance = len(path) return 1 / (distance + 1) -def wup(synset1: Synset, synset2: Synset) -> float: +def wup(synset1: Synset, synset2: Synset, simulate_root=False) -> float: """Return the Wu-Palmer similarity of *synset1* and *synset2*. Arguments: @@ -67,11 +75,15 @@ def wup(synset1: Synset, synset2: Synset) -> float: 0.2857142857142857 """ - lch = synset1.lowest_common_hypernyms(synset2, simulate_root=True)[0] - n3 = lch.max_depth() + 1 - n1 = len(synset1.shortest_path(lch, simulate_root=True)) - n2 = len(synset2.shortest_path(lch, simulate_root=True)) - return (2 * n3) / (n1 + n2 + 2 * n3) + _check_if_pos_compatible(synset1.pos, synset2.pos) + lcs_list = _least_common_subsumers(synset1, synset2, simulate_root) + lcs = lcs_list[0] + i = len(synset1.shortest_path(lcs, simulate_root=simulate_root)) + j = len(synset2.shortest_path(lcs, simulate_root=simulate_root)) + k = lcs.max_depth() + 1 + return (2*k) / (i + j + 2*k) + + def lch( synset1: Synset, synset2: Synset, @@ -106,6 +118,14 @@ def lch( 1.3862943611198906 """ + _check_if_pos_compatible(synset1.pos, synset2.pos) + distance = len(synset1.shortest_path(synset2, simulate_root=simulate_root)) if max_depth <= 0: raise wn.Error('max_depth must be greater than 0') return -math.log((distance + 1) / (2 * max_depth)) + +def _check_if_pos_compatible(pos1: str, pos2: str) -> None: + _pos1 = ADJ if pos1 == ADJ_SAT else pos1 + _pos2 = ADJ if pos2 == ADJ_SAT else pos2 + if _pos1 != _pos2: + raise wn.Error('synsets must have the same part of speech') From b4c074b3c14eccfe470375ef3283be3e43b4e3d9 Mon Sep 17 00:00:00 2001 From: Michael Wayne Goodman Date: Thu, 24 Jun 2021 13:59:29 +0800 Subject: [PATCH 11/20] Close #122: Add Resnik similarity --- CHANGELOG.md | 2 ++ docs/api/wn.similarity.rst | 33 +++++++++++++++++++++++++- wn/similarity.py | 47 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 81 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 24f4a8c..4fac245 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,7 @@ * `wn.ic` module ([#40] * `wn.taxonomy` module ([#125]) +* `wn.similarity.res` Resnik similarity ([#122]) ### Changed @@ -420,4 +421,5 @@ abandoned, but this is an entirely new codebase. 
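The part-of-speech check added above treats adjectives (`'a'`) and satellite adjectives (`'s'`) as the same class; everything else must match exactly. A standalone sketch of that rule:

```python
from wn.constants import ADJ, ADJ_SAT

def pos_compatible(pos1: str, pos2: str) -> bool:
    p1 = ADJ if pos1 == ADJ_SAT else pos1
    p2 = ADJ if pos2 == ADJ_SAT else pos2
    return p1 == p2

assert pos_compatible(ADJ, ADJ_SAT)       # 'a' and 's' are interchangeable
assert not pos_compatible('n', 'v')       # noun vs. verb would raise wn.Error
```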
[#115]: https://github.com/goodmami/wn/issues/115 [#116]: https://github.com/goodmami/wn/issues/116 [#117]: https://github.com/goodmami/wn/issues/117 +[#122]: https://github.com/goodmami/wn/issues/122 [#125]: https://github.com/goodmami/wn/issues/125 diff --git a/docs/api/wn.similarity.rst b/docs/api/wn.similarity.rst index b3e9c7a..1e3e5cc 100644 --- a/docs/api/wn.similarity.rst +++ b/docs/api/wn.similarity.rst @@ -62,7 +62,7 @@ similarity is: .. math:: - -\text{log}(\frac{p + 1}{2d}) + -\text{log}\left(\frac{p + 1}{2d}\right) .. autofunction:: lch @@ -83,3 +83,34 @@ Wu-Palmer similarity is: .. autofunction:: wup + +Information Content-based Metrics +--------------------------------- + +The `Resnik `_ similarity metric works by +computing the information content of the lowest common hypernyms of +the two synsets being compared. It therefore requires information +content weights (see :mod:`wn.ic`), and the value returned therefore +changes depending on the weights used. + + +Resnik Similarity +''''''''''''''''' + +The Resnik similarity is the maximum information content value of the +common subsumers (hypernym ancestors) of the two synsets. Formally it +is defined as follows, where :math:`c_1` and :math:`c_2` are the two +synsets being compared. + +.. math:: + + \text{max}_{c \in \text{S}(c_1, c_2)} \text{IC}(c) + +Since a synset's information content is always equal or greater than +the information content of its hypernyms, :math:`S(c_1, c_2)` above is +more efficiently computed using the lowest common hypernyms instead of +all common hypernyms. + +.. autofunction:: res + + diff --git a/wn/similarity.py b/wn/similarity.py index cfaf201..3bddc56 100644 --- a/wn/similarity.py +++ b/wn/similarity.py @@ -1,11 +1,13 @@ """Synset similarity metrics.""" +from typing import List import math import wn from wn.constants import ADJ, ADJ_SAT from wn._core import Synset +from wn.ic import Freq, information_content def path(synset1: Synset, synset2: Synset, simulate_root: bool = False) -> float: @@ -124,6 +126,51 @@ def lch( raise wn.Error('max_depth must be greater than 0') return -math.log((distance + 1) / (2 * max_depth)) + +def res(synset1: Synset, synset2: Synset, ic: Freq) -> float: + """Return the Resnik similarity between *synset1* and *synset2*. + + Arguments: + synset1: The first synset to compare. + synset2: The second synset to compare. + ic: Information Content weights. 
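Putting the pieces for Resnik similarity together: load IC weights, find the lowest common hypernyms, and compare their information content with the returned score. A sketch assuming the `pwn:3.0` lexicon and the NLTK Brown IC file from the docstring example below are available:

```python
import wn
import wn.ic
from wn.ic import information_content
from wn.similarity import res

pwn = wn.Wordnet('pwn:3.0')
ic = wn.ic.load('~/nltk_data/corpora/wordnet_ic/ic-brown.dat', pwn)

spatula = pwn.synsets('spatula')[0]
pancake = pwn.synsets('pancake')[0]
for lcs in spatula.lowest_common_hypernyms(pancake):
    print(lcs, information_content(lcs, ic))
# res() returns the information content of the lowest common hypernym the metric selects.
print(res(spatula, pancake, ic))
```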
+ + Example: + >>> import wn, wn.ic, wn.taxonomy + >>> from wn.similarity import res + >>> pwn = wn.Wordnet('pwn:3.0') + >>> ic = wn.ic.load('~/nltk_data/corpora/wordnet_ic/ic-brown.dat', pwn) + >>> spatula = pwn.synsets('spatula')[0] + >>> res(spatula, pwn.synsets('pancake')[0], ic) + 0.8017591149538994 + >>> res(spatula, pwn.synsets('utensil')[0], ic) + 5.87738923441087 + + """ + _check_if_pos_compatible(synset1.pos, synset2.pos) + lcs = _most_informative_lcs(synset1, synset2, ic) + return information_content(lcs, ic) + + +# Helper functions + +def _least_common_subsumers( + synset1: Synset, + synset2: Synset, + simulate_root: bool +) -> List[Synset]: + lcs = synset1.lowest_common_hypernyms(synset2, simulate_root=simulate_root) + if not lcs: + raise wn.Error(f'no common hypernyms for {synset1!r} and {synset2!r}') + return lcs + + +def _most_informative_lcs(synset1: Synset, synset2: Synset, ic: Freq) -> Synset: + pos_ic = ic[synset1.pos] + lcs = _least_common_subsumers(synset1, synset2, False) + return max(lcs, key=lambda ss: pos_ic[ss.id]) + + def _check_if_pos_compatible(pos1: str, pos2: str) -> None: _pos1 = ADJ if pos1 == ADJ_SAT else pos1 _pos2 = ADJ if pos2 == ADJ_SAT else pos2 From f762f497d96f3cd949d8556e92db53a9f8d1ecf7 Mon Sep 17 00:00:00 2001 From: Michael Wayne Goodman Date: Thu, 24 Jun 2021 14:43:06 +0800 Subject: [PATCH 12/20] Close #123: Add Jiang-Conrath similarity --- CHANGELOG.md | 2 ++ docs/api/wn.similarity.rst | 21 +++++++++++++++++++++ wn/similarity.py | 33 +++++++++++++++++++++++++++++++++ 3 files changed, 56 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4fac245..472f536 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ * `wn.ic` module ([#40] * `wn.taxonomy` module ([#125]) * `wn.similarity.res` Resnik similarity ([#122]) +* `wn.similarity.jcn` Jiang-Conrath similarity ([#123]) ### Changed @@ -422,4 +423,5 @@ abandoned, but this is an entirely new codebase. [#116]: https://github.com/goodmami/wn/issues/116 [#117]: https://github.com/goodmami/wn/issues/117 [#122]: https://github.com/goodmami/wn/issues/122 +[#123]: https://github.com/goodmami/wn/issues/123 [#125]: https://github.com/goodmami/wn/issues/125 diff --git a/docs/api/wn.similarity.rst b/docs/api/wn.similarity.rst index 1e3e5cc..da2bc08 100644 --- a/docs/api/wn.similarity.rst +++ b/docs/api/wn.similarity.rst @@ -114,3 +114,24 @@ all common hypernyms. .. autofunction:: res +Jiang-Conrath Similarity +'''''''''''''''''''''''' + +The Jiang-Conrath similarity metric (`link to paper +`_) combines the ideas +of the taxonomy-based and information content-based metrics. It is +defined as follows, where :math:`c_1` and :math:`c_2` are the two +synsets being compared and :math:`c_0` is the lowest common hypernym +of the two with the highest information content weight: + +.. math:: + + \frac{1}{\text{IC}(c_1) + \text{IC}(c_2) + \text{IC}(c_0)} + +This equation is the simplified form given in the paper were several +parameterized terms are cancelled out because the full form is not +often used in practice. + +.. autofunction:: jcn + + diff --git a/wn/similarity.py b/wn/similarity.py index 3bddc56..39e1d06 100644 --- a/wn/similarity.py +++ b/wn/similarity.py @@ -152,6 +152,39 @@ def res(synset1: Synset, synset2: Synset, ic: Freq) -> float: return information_content(lcs, ic) +def jcn(synset1: Synset, synset2: Synset, ic: Freq) -> float: + """Return the Jiang-Conrath similarity of two synsets. + + Arguments: + synset1: The first synset to compare. + synset2: The second synset to compare. 
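A worked illustration of the Jiang-Conrath computation as it is implemented in `jcn()` below, using made-up IC weights (the weights are purely hypothetical):

```python
ic_c1, ic_c2, ic_lcs = 6.0, 5.0, 2.0
jcn_value = 1 / (ic_c1 + ic_c2 - 2 * ic_lcs)   # 1 / 7 ≈ 0.1429
# Special cases handled by jcn(): if all three IC values are zero the score is 0.0,
# and if ic_c1 + ic_c2 == 2 * ic_lcs the score is float('inf').
```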
+ ic: Information Content weights. + + Example: + >>> import wn, wn.ic, wn.taxonomy + >>> from wn.similarity import jcn + >>> pwn = wn.Wordnet('pwn:3.0') + >>> ic = wn.ic.load('~/nltk_data/corpora/wordnet_ic/ic-brown.dat', pwn) + >>> spatula = pwn.synsets('spatula')[0] + >>> jcn(spatula, pwn.synsets('pancake')[0], ic) + 0.04061799236354239 + >>> jcn(spatula, pwn.synsets('utensil')[0], ic) + 0.10794048564613007 + + """ + _check_if_pos_compatible(synset1.pos, synset2.pos) + ic1 = information_content(synset1, ic) + ic2 = information_content(synset2, ic) + lcs = _most_informative_lcs(synset1, synset2, ic) + ic_lcs = information_content(lcs, ic) + if ic1 == ic2 == ic_lcs == 0: + return 0 + elif ic1 + ic2 == 2 * ic_lcs: + return float('inf') + else: + return 1 / (ic1 + ic2 - 2 * ic_lcs) + + # Helper functions def _least_common_subsumers( From 87b898548868b86a443ddef762d2e252f5eacec3 Mon Sep 17 00:00:00 2001 From: Michael Wayne Goodman Date: Thu, 24 Jun 2021 14:55:57 +0800 Subject: [PATCH 13/20] Close #124: Add Lin similarity --- CHANGELOG.md | 2 ++ docs/api/wn.similarity.rst | 14 ++++++++++++++ wn/similarity.py | 29 +++++++++++++++++++++++++++++ 3 files changed, 45 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 472f536..f641af7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ * `wn.taxonomy` module ([#125]) * `wn.similarity.res` Resnik similarity ([#122]) * `wn.similarity.jcn` Jiang-Conrath similarity ([#123]) +* `wn.similarity.lin` Lin similarity ([#124]) ### Changed @@ -424,4 +425,5 @@ abandoned, but this is an entirely new codebase. [#117]: https://github.com/goodmami/wn/issues/117 [#122]: https://github.com/goodmami/wn/issues/122 [#123]: https://github.com/goodmami/wn/issues/123 +[#124]: https://github.com/goodmami/wn/issues/124 [#125]: https://github.com/goodmami/wn/issues/125 diff --git a/docs/api/wn.similarity.rst b/docs/api/wn.similarity.rst index da2bc08..f901078 100644 --- a/docs/api/wn.similarity.rst +++ b/docs/api/wn.similarity.rst @@ -135,3 +135,17 @@ often used in practice. .. autofunction:: jcn +Lin Similarity +'''''''''''''' + +Another formulation of information content-based similarity is the Lin +metric (`Lin 1997 `_), +which is defined as follows, where :math:`c_1` and :math:`c_2` are the +two synsets being compared and :math:`c_0` is the lowest common +hypernym with the highest information content weight: + +.. math:: + + \frac{2(\text{IC}(c_0))}{\text{IC}(c_1) + \text{IC}(c_0)} + +.. autofunction:: lin diff --git a/wn/similarity.py b/wn/similarity.py index 39e1d06..976fe40 100644 --- a/wn/similarity.py +++ b/wn/similarity.py @@ -185,6 +185,35 @@ def jcn(synset1: Synset, synset2: Synset, ic: Freq) -> float: return 1 / (ic1 + ic2 - 2 * ic_lcs) +def lin(synset1: Synset, synset2: Synset, ic: Freq) -> float: + """Return the Lin similarity of two synsets. + + Arguments: + synset1: The first synset to compare. + synset2: The second synset to compare. + ic: Information Content weights. 
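The Lin score combines the same quantities, dividing twice the IC of the chosen common hypernym by the sum of the two synsets' IC values. Continuing the made-up weights from the Jiang-Conrath illustration:

```python
ic_c1, ic_c2, ic_lcs = 6.0, 5.0, 2.0
lin_value = (2 * ic_lcs) / (ic_c1 + ic_c2)   # 4 / 11 ≈ 0.3636
# lin() returns 0.0 when either synset's IC is zero, e.g. for an unseen root synset.
```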
+ + Example: + >>> import wn, wn.ic, wn.taxonomy + >>> from wn.similarity import lin + >>> pwn = wn.Wordnet('pwn:3.0') + >>> ic = wn.ic.load('~/nltk_data/corpora/wordnet_ic/ic-brown.dat', pwn) + >>> spatula = pwn.synsets('spatula')[0] + >>> lin(spatula, pwn.synsets('pancake')[0], ic) + 0.061148956278604116 + >>> lin(spatula, pwn.synsets('utensil')[0], ic) + 0.5592415686750427 + + """ + _check_if_pos_compatible(synset1.pos, synset2.pos) + lcs = _most_informative_lcs(synset1, synset2, ic) + ic1 = information_content(synset1, ic) + ic2 = information_content(synset2, ic) + if ic1 == 0 or ic2 == 0: + return 0.0 + return 2 * information_content(lcs, ic) / (ic1 + ic2) + + # Helper functions def _least_common_subsumers( From b3e94227cf39d3d5ed7f6a7d27c408c062ce0f34 Mon Sep 17 00:00:00 2001 From: Michael Wayne Goodman Date: Thu, 24 Jun 2021 14:56:12 +0800 Subject: [PATCH 14/20] Fix links to Resnik, Jiang-Conrath papers --- docs/api/wn.similarity.rst | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/docs/api/wn.similarity.rst b/docs/api/wn.similarity.rst index f901078..c18bb46 100644 --- a/docs/api/wn.similarity.rst +++ b/docs/api/wn.similarity.rst @@ -97,10 +97,11 @@ changes depending on the weights used. Resnik Similarity ''''''''''''''''' -The Resnik similarity is the maximum information content value of the -common subsumers (hypernym ancestors) of the two synsets. Formally it -is defined as follows, where :math:`c_1` and :math:`c_2` are the two -synsets being compared. +The Resnik similarity (`Resnik 1995 +`_) is the maximum +information content value of the common subsumers (hypernym ancestors) +of the two synsets. Formally it is defined as follows, where +:math:`c_1` and :math:`c_2` are the two synsets being compared. .. math:: @@ -117,7 +118,7 @@ all common hypernyms. Jiang-Conrath Similarity '''''''''''''''''''''''' -The Jiang-Conrath similarity metric (`link to paper +The Jiang-Conrath similarity metric (`Jiang and Conrath, 1997 `_) combines the ideas of the taxonomy-based and information content-based metrics. It is defined as follows, where :math:`c_1` and :math:`c_2` are the two From 4f37711ad890025627625263a73c23e2f5f6c132 Mon Sep 17 00:00:00 2001 From: Michael Wayne Goodman Date: Thu, 24 Jun 2021 15:02:19 +0800 Subject: [PATCH 15/20] Fix linting errors. --- wn/taxonomy.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/wn/taxonomy.py b/wn/taxonomy.py index 61894d5..08bab62 100644 --- a/wn/taxonomy.py +++ b/wn/taxonomy.py @@ -269,7 +269,7 @@ def shortest_path( >>> squirrel = ewn.synsets('squirrel', pos='n')[0] >>> for ss in wn.taxonomy.shortest_path(dog, squirrel): ... print(ss.lemmas()) - ... + ... ['canine', 'canid'] ['carnivore'] ['eutherian mammal', 'placental', 'placental mammal', 'eutherian'] @@ -301,7 +301,7 @@ def common_hypernyms( >>> squirrel = ewn.synsets('squirrel', pos='n')[0] >>> for ss in wn.taxonomy.common_hypernyms(dog, squirrel): ... print(ss.lemmas()) - ... + ... 
['entity'] ['physical entity'] ['object', 'physical object'] From 0dcb0bcc0db59c5e048e290f0bf16731b141a9b8 Mon Sep 17 00:00:00 2001 From: Michael Wayne Goodman Date: Thu, 24 Jun 2021 15:06:45 +0800 Subject: [PATCH 16/20] Update documentation header for IC similarity --- docs/api/wn.similarity.rst | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/docs/api/wn.similarity.rst b/docs/api/wn.similarity.rst index c18bb46..c70682c 100644 --- a/docs/api/wn.similarity.rst +++ b/docs/api/wn.similarity.rst @@ -87,11 +87,12 @@ Wu-Palmer similarity is: Information Content-based Metrics --------------------------------- -The `Resnik `_ similarity metric works by -computing the information content of the lowest common hypernyms of -the two synsets being compared. It therefore requires information -content weights (see :mod:`wn.ic`), and the value returned therefore -changes depending on the weights used. +The `Resnik `_, `Jiang-Conrath `_, and `Lin `_ similarity metrics work +by computing the information content of the synsets and/or that of +their lowest common hypernyms. They therefore require information +content weights (see :mod:`wn.ic`), and the values returned +necessarily depend on the weights used. Resnik Similarity From c034daaa104df6cb3876dcdbcdf1a3aa6dae4b2d Mon Sep 17 00:00:00 2001 From: Michael Wayne Goodman Date: Sat, 3 Jul 2021 01:03:23 +0800 Subject: [PATCH 17/20] Add tests and improve docs for IC similarity --- docs/api/wn.similarity.rst | 18 +++- tests/ic_test.py | 19 ++++ tests/similarity_test.py | 182 ++++++++++++++++++++++++++----------- 3 files changed, 167 insertions(+), 52 deletions(-) diff --git a/docs/api/wn.similarity.rst b/docs/api/wn.similarity.rst index c70682c..9547a7b 100644 --- a/docs/api/wn.similarity.rst +++ b/docs/api/wn.similarity.rst @@ -128,12 +128,25 @@ of the two with the highest information content weight: .. math:: - \frac{1}{\text{IC}(c_1) + \text{IC}(c_2) + \text{IC}(c_0)} + \frac{1}{\text{IC}(c_1) + \text{IC}(c_2) - 2(\text{IC}(c_0))} This equation is the simplified form given in the paper were several parameterized terms are cancelled out because the full form is not often used in practice. +There are two special cases: + +1. If the information content of :math:`c_0`, :math:`c_1`, and + :math:`c_2` are all zero, the metric returns zero. This occurs when + both :math:`c_1` and :math:`c_2` are the root node, but it can also + occur if the synsets did not occur in the corpus and the smoothing + value was set to zero. + +2. Otherwise if :math:`c_1 + c_2 = 2c_0`, the metric returns + infinity. This occurs when the two synsets are the same, one is a + descendant of the other, etc., such that they have the same + frequency as each other and as their lowest common hypernym. + .. autofunction:: jcn @@ -150,4 +163,7 @@ hypernym with the highest information content weight: \frac{2(\text{IC}(c_0))}{\text{IC}(c_1) + \text{IC}(c_0)} +One special case is if either synset has an information content value +of zero, in which case the metric returns zero. + .. 
autofunction:: lin diff --git a/tests/ic_test.py b/tests/ic_test.py index 0f299fe..cf9d3eb 100644 --- a/tests/ic_test.py +++ b/tests/ic_test.py @@ -1,4 +1,6 @@ +from math import log + import pytest import wn @@ -114,3 +116,20 @@ def test_load(tmp_path): get_synset_id = synset_id_formatter('test-en-{offset:04}-{pos}') assert (wn.ic.load(icpath, w, get_synset_id=get_synset_id) == wn.ic.compute(words, w, distribute_weight=False, smoothing=0.0)) + + +@pytest.mark.usefixtures('mini_db') +def test_information_content(): + w = wn.Wordnet('test-en:1') + ic = wn.ic.compute(words, w) + info = w.synsets('information')[0] + samp = w.synsets('sample')[0] + # info is a root but not the only one, so its IC is not 0.0 + assert wn.ic.information_content(info, ic) == -log( + ic['n'][info.id] + / ic['n'][None] + ) + assert wn.ic.information_content(samp, ic) == -log( + ic['n'][samp.id] + / ic['n'][None] + ) diff --git a/tests/similarity_test.py b/tests/similarity_test.py index a719d90..94257e7 100644 --- a/tests/similarity_test.py +++ b/tests/similarity_test.py @@ -6,80 +6,160 @@ import wn from wn import similarity as sim from wn.taxonomy import taxonomy_depth +from wn.ic import information_content as infocont + + +def get_synsets(w): + return { + 'information': w.synset('test-en-0001-n'), + 'example': w.synset('test-en-0002-n'), + 'sample': w.synset('test-en-0004-n'), + 'random sample': w.synset('test-en-0005-n'), + 'random sample2': w.synset('test-en-0008-n'), + 'datum': w.synset('test-en-0006-n'), + 'exemplify': w.synset('test-en-0003-v'), + } + + +# some fake information content; computed using: +# words = ['example', 'example', 'sample', 'random sample', 'illustrate'] +# ic = compute(words, wn.Wordnet('test-en'), distribute_weight=False) + +ic = { + 'n': {'test-en-0001-n': 5.0, # information + 'test-en-0002-n': 5.0, # example, illustration + 'test-en-0004-n': 3.0, # sample + 'test-en-0005-n': 2.0, # random sample + 'test-en-0008-n': 2.0, # random sample 2 + 'test-en-0006-n': 1.0, # datum + None: 6.0}, + 'v': {'test-en-0003-v': 2.0, # exemplify, illustrate + 'test-en-0007-v': 1.0, # resignate + None: 2.0}, + 'a': {None: 1.0}, + 'r': {None: 1.0} +} @pytest.mark.usefixtures('mini_db') def test_path(): - information = wn.synsets('information')[0] - example = wn.synsets('example')[0] - sample = wn.synsets('sample')[0] - random_sample = wn.synsets('random sample')[0] - random_sample2 = wn.synsets('random sample')[1] - datum = wn.synsets('datum')[0] - exemplify = wn.synsets('exemplify')[0] - assert sim.path(information, information) == 1/1 - assert sim.path(information, example) == 1/2 - assert sim.path(information, sample) == 1/3 - assert sim.path(information, random_sample) == 1/4 - assert sim.path(random_sample, datum) == 1/5 - assert sim.path(random_sample2, datum) == 0 - assert sim.path(random_sample2, datum, simulate_root=True) == 1/4 - assert sim.path(random_sample, random_sample2, simulate_root=True) == 1/6 + ss = get_synsets(wn.Wordnet('test-en')) + assert sim.path(ss['information'], ss['information']) == 1/1 + assert sim.path(ss['information'], ss['example']) == 1/2 + assert sim.path(ss['information'], ss['sample']) == 1/3 + assert sim.path(ss['information'], ss['random sample']) == 1/4 + assert sim.path(ss['random sample'], ss['datum']) == 1/5 + assert sim.path(ss['random sample2'], ss['datum']) == 0 + assert sim.path(ss['random sample2'], ss['datum'], simulate_root=True) == 1/4 + assert sim.path(ss['random sample'], ss['random sample2'], simulate_root=True) == 1/6 with pytest.raises(wn.Error): - 
sim.path(example, exemplify) + sim.path(ss['example'], ss['exemplify']) with pytest.raises(wn.Error): - sim.wup(example, exemplify, simulate_root=True) + sim.wup(ss['example'], ss['exemplify'], simulate_root=True) @pytest.mark.usefixtures('mini_db') def test_wup(): - information = wn.synsets('information')[0] - example = wn.synsets('example')[0] - sample = wn.synsets('sample')[0] - random_sample = wn.synsets('random sample')[0] - random_sample2 = wn.synsets('random sample')[1] - datum = wn.synsets('datum')[0] - exemplify = wn.synsets('exemplify')[0] - assert sim.wup(information, information) == (2*1) / (0+0+2*1) - assert sim.wup(information, example) == (2*1) / (0+1+2*1) - assert sim.wup(information, sample) == (2*1) / (0+2+2*1) - assert sim.wup(information, random_sample) == (2*1) / (0+3+2*1) - assert sim.wup(random_sample, datum) == (2*1) / (3+1+2*1) + ss = get_synsets(wn.Wordnet('test-en')) + assert sim.wup(ss['information'], ss['information']) == (2*1) / (0+0+2*1) + assert sim.wup(ss['information'], ss['example']) == (2*1) / (0+1+2*1) + assert sim.wup(ss['information'], ss['sample']) == (2*1) / (0+2+2*1) + assert sim.wup(ss['information'], ss['random sample']) == (2*1) / (0+3+2*1) + assert sim.wup(ss['random sample'], ss['datum']) == (2*1) / (3+1+2*1) with pytest.raises(wn.Error): - assert sim.wup(random_sample2, datum) - assert (sim.wup(random_sample2, datum, simulate_root=True) + assert sim.wup(ss['random sample2'], ss['datum']) + assert (sim.wup(ss['random sample2'], ss['datum'], simulate_root=True) == (2*1) / (1+2+2*1)) - assert (sim.wup(random_sample, random_sample2, simulate_root=True) + assert (sim.wup(ss['random sample'], ss['random sample2'], simulate_root=True) == (2*1) / (4+1+2*1)) with pytest.raises(wn.Error): - sim.wup(example, exemplify) + sim.wup(ss['example'], ss['exemplify']) with pytest.raises(wn.Error): - sim.wup(example, exemplify, simulate_root=True) + sim.wup(ss['example'], ss['exemplify'], simulate_root=True) @pytest.mark.usefixtures('mini_db') def test_lch(): w = wn.Wordnet('test-en') + ss = get_synsets(w) d_n = taxonomy_depth(w, 'n') - information = w.synsets('information')[0] - example = w.synsets('example')[0] - sample = w.synsets('sample')[0] - random_sample = w.synsets('random sample')[0] - random_sample2 = wn.synsets('random sample')[1] - datum = w.synsets('datum')[0] - exemplify = w.synsets('exemplify')[0] - assert sim.lch(information, information, d_n) == -log((0+1) / (2*d_n)) - assert sim.lch(information, example, d_n) == -log((1+1) / (2*d_n)) - assert sim.lch(information, sample, d_n) == -log((2+1) / (2*d_n)) - assert sim.lch(information, random_sample, d_n) == -log((3+1) / (2*d_n)) - assert sim.lch(random_sample, datum, d_n) == -log((4+1) / (2*d_n)) + assert sim.lch(ss['information'], ss['information'], d_n) == -log((0+1) / (2*d_n)) + assert sim.lch(ss['information'], ss['example'], d_n) == -log((1+1) / (2*d_n)) + assert sim.lch(ss['information'], ss['sample'], d_n) == -log((2+1) / (2*d_n)) + assert sim.lch(ss['information'], ss['random sample'], d_n) == -log((3+1) / (2*d_n)) + assert sim.lch(ss['random sample'], ss['datum'], d_n) == -log((4+1) / (2*d_n)) with pytest.raises(wn.Error): - assert sim.lch(random_sample2, datum, d_n) - assert (sim.lch(random_sample2, datum, d_n, simulate_root=True) + assert sim.lch(ss['random sample2'], ss['datum'], d_n) + assert (sim.lch(ss['random sample2'], ss['datum'], d_n, simulate_root=True) == -log((3+1) / (2*d_n))) - assert (sim.lch(random_sample, random_sample2, d_n, simulate_root=True) + assert 
(sim.lch(ss['random sample'], ss['random sample2'], d_n, simulate_root=True) == -log((5+1) / (2*d_n))) with pytest.raises(wn.Error): - sim.lch(example, exemplify, d_n) + sim.lch(ss['example'], ss['exemplify'], d_n) + with pytest.raises(wn.Error): + sim.lch(ss['example'], ss['exemplify'], d_n, simulate_root=True) + + +@pytest.mark.usefixtures('mini_db') +def test_res(): + w = wn.Wordnet('test-en') + ss = get_synsets(w) + assert (sim.res(ss['information'], ss['information'], ic) + == infocont(ss['information'], ic)) + assert (sim.res(ss['information'], ss['example'], ic) + == infocont(ss['information'], ic)) + assert (sim.res(ss['information'], ss['sample'], ic) + == infocont(ss['information'], ic)) + assert (sim.res(ss['information'], ss['random sample'], ic) + == infocont(ss['information'], ic)) + assert (sim.res(ss['random sample'], ss['datum'], ic) + == infocont(ss['information'], ic)) + with pytest.raises(wn.Error): + sim.res(ss['random sample2'], ss['datum'], ic) + with pytest.raises(wn.Error): + sim.res(ss['example'], ss['exemplify'], ic) + + +@pytest.mark.usefixtures('mini_db') +def test_jcn(): + w = wn.Wordnet('test-en') + ss = get_synsets(w) + info_ic = infocont(ss['information'], ic) + assert (sim.jcn(ss['information'], ss['information'], ic) + == float('inf')) + assert (sim.jcn(ss['information'], ss['example'], ic) + == float('inf')) + assert (sim.jcn(ss['information'], ss['sample'], ic) + == 1 / ((info_ic + infocont(ss['sample'], ic)) - 2 * info_ic)) + assert (sim.jcn(ss['information'], ss['random sample'], ic) + == 1 / ((info_ic + infocont(ss['random sample'], ic)) - 2 * info_ic)) + assert (sim.jcn(ss['random sample'], ss['datum'], ic) + == 1 / ( + (infocont(ss['random sample'], ic) + infocont(ss['datum'], ic)) + - 2 * info_ic)) + with pytest.raises(wn.Error): + sim.jcn(ss['random sample2'], ss['datum'], ic) + with pytest.raises(wn.Error): + sim.jcn(ss['example'], ss['exemplify'], ic) + + +@pytest.mark.usefixtures('mini_db') +def test_lin(): + w = wn.Wordnet('test-en') + ss = get_synsets(w) + info_ic = infocont(ss['information'], ic) + assert (sim.lin(ss['information'], ss['information'], ic) + == 1.0) + assert (sim.lin(ss['information'], ss['example'], ic) + == 1.0) + assert (sim.lin(ss['information'], ss['sample'], ic) + == (2 * info_ic) / (info_ic + infocont(ss['sample'], ic))) + assert (sim.lin(ss['information'], ss['random sample'], ic) + == (2 * info_ic) / (info_ic + infocont(ss['random sample'], ic))) + assert (sim.lin(ss['random sample'], ss['datum'], ic) + == ((2 * info_ic) + / (infocont(ss['random sample'], ic) + infocont(ss['datum'], ic)))) + with pytest.raises(wn.Error): + sim.lin(ss['random sample2'], ss['datum'], ic) with pytest.raises(wn.Error): - sim.lch(example, exemplify, d_n, simulate_root=True) + sim.lin(ss['example'], ss['exemplify'], ic) From d1fd87a0e024c4a5dfa31d10e9677aa89a554870 Mon Sep 17 00:00:00 2001 From: Michael Wayne Goodman Date: Sat, 3 Jul 2021 01:03:55 +0800 Subject: [PATCH 18/20] Add doc stubs for relations() on Sense/Synset --- docs/api/wn.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/api/wn.rst b/docs/api/wn.rst index 18c8a2a..3a004ab 100644 --- a/docs/api/wn.rst +++ b/docs/api/wn.rst @@ -168,6 +168,7 @@ The Sense Class .. automethod:: frames .. automethod:: counts .. automethod:: metadata + .. automethod:: relations .. automethod:: get_related .. automethod:: get_related_synsets .. automethod:: closure @@ -218,6 +219,7 @@ The Synset Class .. automethod:: hyponyms .. automethod:: holonyms .. automethod:: meronyms + .. 
automethod:: relations .. automethod:: get_related .. automethod:: closure .. automethod:: relation_paths From f48515f7dbc21f35902ad4f6a13ce66ac1ccf045 Mon Sep 17 00:00:00 2001 From: Michael Wayne Goodman Date: Wed, 7 Jul 2021 00:17:37 +0800 Subject: [PATCH 19/20] Update CONTRIBUTING, README, CHANGELOG for v0.8.0 --- CHANGELOG.md | 7 +++++++ CONTRIBUTING.md | 46 ++++++++++++++++++++++++++++++++++++++-------- README.md | 7 ------- 3 files changed, 45 insertions(+), 15 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f641af7..ec79b88 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ * `wn.similarity.res` Resnik similarity ([#122]) * `wn.similarity.jcn` Jiang-Conrath similarity ([#123]) * `wn.similarity.lin` Lin similarity ([#124]) +* `wn.util.synset_id_formatter` ([#119]) ### Changed @@ -78,6 +79,10 @@ **Release date: 2021-03-04** +**Notice:** This release introduces backwards-incompatible changes to +the schema that require users upgrading from previous versions to +rebuild their database. + ### Added * For WN-LMF 1.0 support ([#65]) @@ -363,6 +368,7 @@ the https://github.com/nltk/wordnet/ code which had been effectively abandoned, but this is an entirely new codebase. +[v0.8.0]: ../../releases/tag/v0.8.0 [v0.7.0]: ../../releases/tag/v0.7.0 [v0.6.2]: ../../releases/tag/v0.6.2 [v0.6.1]: ../../releases/tag/v0.6.1 @@ -423,6 +429,7 @@ abandoned, but this is an entirely new codebase. [#115]: https://github.com/goodmami/wn/issues/115 [#116]: https://github.com/goodmami/wn/issues/116 [#117]: https://github.com/goodmami/wn/issues/117 +[#119]: https://github.com/goodmami/wn/issues/119 [#122]: https://github.com/goodmami/wn/issues/122 [#123]: https://github.com/goodmami/wn/issues/123 [#124]: https://github.com/goodmami/wn/issues/124 diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 4029259..daf1952 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -5,6 +5,7 @@ Thanks for helping to make Wn better! **Quick Links:** - [Report a bug or request a features](https://github.com/goodmami/wn/issues/new) +- [Ask a question](https://github.com/goodmami/wn/discussions) - [View documentation](https://wn.readthedocs.io/) **Developer Information:** @@ -14,20 +15,25 @@ Thanks for helping to make Wn better! - Changelog: [keep a changelog](https://keepachangelog.com/en/1.0.0/) - Documentation framework: [Sphinx](https://www.sphinx-doc.org/) - Docstring style: [Google Python Style Guide](https://google.github.io/styleguide/pyguide.html#38-comments-and-docstrings) (via [sphinx.ext.napoleon](https://www.sphinx-doc.org/en/master/usage/extensions/napoleon.html)) -- Testing framework: [pytest](https://pytest.org/) -- Packaging framework: [flit](https://flit.readthedocs.io/en/latest/) -- Coding style: [PEP-8](https://www.python.org/dev/peps/pep-0008/) +- Testing automation: [nox](https://nox.thea.codes) +- Unit/regression testing: [pytest](https://pytest.org/) +- Packaging framework: [Flit](https://flit.readthedocs.io/en/latest/) +- Coding style: [PEP-8](https://www.python.org/dev/peps/pep-0008/) (via [Flake8](https://flake8.pycqa.org/)) - Type checking: [Mypy](http://mypy-lang.org/) ## Get Help -Confused about wordnets? See the [Global Wordnet Association -Documentation](https://globalwordnet.github.io/gwadoc/) +Confused about wordnets in general? See the [Global Wordnet +Association Documentation](https://globalwordnet.github.io/gwadoc/) -Having trouble with using Wn? [Raise an +Confused about using Wn or wish to share some tips? 
[Start a +discussion](https://github.com/goodmami/wn/discussions) + +Encountering a problem with Wn or wish to propose a new features? [Raise an issue](https://github.com/goodmami/wn/issues/new) + ## Report a Bug When reporting a bug, please provide enough information for someone to @@ -35,7 +41,17 @@ reproduce the problem. This might include the version of Python you're running, the version of Wn you have installed, the wordnet lexicons you have installed, and possibly the platform (Linux, Windows, macOS) you're on. Please give a minimal working example that illustrates the -problem. +problem. For example: + +> I'm using Wn 0.7.0 with Python 3.8 on Linux and [description of +> problem...]. Here's what I have tried: +> +> ```pycon +> >>> import wn +> >>> # some code +> ... # some result or error +> ``` + ## Request a Feature @@ -47,4 +63,18 @@ would address. See the "developer information" above for a brief description of guidelines and conventions used in Wn. If you have a fix, please -submit a pull request to the `main` branch. +submit a pull request to the `main` branch. In general, every pull +request should have an associated issue. + +Developers should install Wn locally from source using +[Flit](https://flit.readthedocs.io/en/latest/). Flit may be installed +system-wide or within a virtual environment: + +```bash +$ pip install flit +$ flit install -s +``` + +The `-s` option tells Flit to use symbolic links to install Wn, +similar to pip's -e editable installs. This allows one to edit source +files and use the changes without having to reinstall Wn each time. diff --git a/README.md b/README.md index ec12290..59821cb 100644 --- a/README.md +++ b/README.md @@ -19,13 +19,6 @@ --- -**Notice for users upgrading to v0.6:** Version v0.6.0 introduced -changes to the database schema that require the user to rebuild their -database. Please [raise an -issue](https://github.com/goodmami/wn/issues/new) if you need help. - ---- - Wn is a Python library for exploring information in wordnets. Install it from PyPI: From ec976619a035bb386b75ee9293d5632f99979306 Mon Sep 17 00:00:00 2001 From: Michael Wayne Goodman Date: Wed, 7 Jul 2021 00:18:38 +0800 Subject: [PATCH 20/20] Bump version for v0.8.0 --- CHANGELOG.md | 4 ++++ wn/__init__.py | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ec79b88..50955ef 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,10 @@ ## [Unreleased] +## [v0.8.0] + +**Release date: 2021-07-07** + ### Added * `wn.ic` module ([#40] diff --git a/wn/__init__.py b/wn/__init__.py index 6135dbc..a42dbd2 100644 --- a/wn/__init__.py +++ b/wn/__init__.py @@ -49,4 +49,4 @@ Wordnet ) -__version__ = '0.7.0' +__version__ = '0.8.0'
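After the version bump, a quick way to confirm which release is installed (sketch; assumes a pip-based install):

```python
# $ pip install -U wn
import wn
print(wn.__version__)   # '0.8.0' for this release
```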