Skip to content

Commit

Permalink
Pronunciations refactoring
Browse files Browse the repository at this point in the history
I reviewed completely how pronunciations were retrieved in order
to tackle that task, including changing where pronunciations were
looked for: before we were using the whole word wikicode, now we
only use `head_sections` wikicode (faster, and more efficient).

It has the following nice side-effects (on top of having
locale-specific formatting like `\...\` for French, `[...]` for German, etc.):

- `EN`, `FR`: now supports multiple cross-section pronunciations
  (like ones from Middle English combined with ones from Old English
  sections, for instance on English data)
- `FR`: pronunciations outside the French section are no longer taken
  into account (such as values from Gaulois or other locale sections,
  which was incorrect)
- `PT`: a lot more pronunciations are correctly found now
- `SV`: multiple pronunciations are now supported
- `RU`: it will be way easier to tackle #1376 then
  • Loading branch information
BoboTiG committed Sep 2, 2022
1 parent 77cb802 commit 278c4c2
Show file tree
Hide file tree
Showing 29 changed files with 409 additions and 169 deletions.
17 changes: 0 additions & 17 deletions tests/test_2_render.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,23 +3,6 @@
import pytest

from wikidict import render
from wikidict.lang import pronunciation


@pytest.mark.parametrize(
    "regexp, code, expected",
    [
        # Catalan samples: the expected list holds a single value, without slashes.
        (pronunciation["ca"], "{{ca-pron|/as/}}", ["as"]),
        (pronunciation["ca"], "{{ca-pron|or=/əɫ/}}", ["əɫ"]),
        (pronunciation["ca"], "{{ca-pron|or=/əɫ/|occ=/eɫ/}}", ["əɫ"]),
        (pronunciation["ca"], "{{ca-pron|q=àton|or=/əɫ/|occ=/eɫ/|rima=}}", ["əɫ"]),
        # English samples: one or several values, from one or several templates.
        (pronunciation["en"], "{{IPA|en|/ʌs/}}", ["ʌs"]),
        (pronunciation["en"], "{{IPA|en|/ʌs/}}, {{IPA|en|/ʌz/}}", ["ʌs", "ʌz"]),
        (pronunciation["en"], "{{IPA|en|/ʌs/|/ʌz/}}", ["ʌs", "ʌz"]),
    ],
)
def test_find_pronunciations(regexp, code, expected):
    """Check `render.find_pronunciations()` against each wikicode sample.

    Every case passes the wikicode and the per-locale entry of
    `pronunciation`, then compares the result with the expected list.
    """
    found = render.find_pronunciations(code, regexp)
    assert found == expected


def test_simple():
Expand Down
25 changes: 20 additions & 5 deletions tests/test_ca.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,30 @@
import pytest

from wikidict.lang.ca import find_pronunciations
from wikidict.render import parse_word
from wikidict.utils import process_templates


@pytest.mark.parametrize(
    "code, expected",
    [
        # No wikicode at all: an empty list is expected.
        ("", []),
        ("{{ca-pron|/as/}}", ["/as/"]),
        ("{{ca-pron|or=/əɫ/}}", ["/əɫ/"]),
        # When both `or=` and `occ=` are present, only the `or=` value is expected.
        ("{{ca-pron|or=/əɫ/|occ=/eɫ/}}", ["/əɫ/"]),
        ("{{ca-pron|q=àton|or=/əɫ/|occ=/eɫ/|rima=}}", ["/əɫ/"]),
    ],
)
def test_find_pronunciations(code, expected):
    """Check the Catalan `find_pronunciations()` against each wikicode sample.

    Expected values keep their surrounding slashes (`/.../`).
    """
    found = find_pronunciations(code)
    assert found == expected


@pytest.mark.parametrize(
"word, pronunciations, gender, etymology, definitions",
[
(
"-ass-",
["as"],
["/as/"],
"",
["Del sufix <i>-às</i> amb valor augmentatiu."],
["Infix que afegeix un matís augmentatiu."],
Expand All @@ -25,7 +40,7 @@
),
(
"AFI",
["ˈa.fi"],
["/ˈa.fi/"],
"",
["sigles"],
[
Expand Down Expand Up @@ -95,7 +110,7 @@
),
(
"cas",
["ˈkas"],
["/ˈkas/"],
"m",
["Del llatí <i>casus</i>."],
[
Expand Down Expand Up @@ -181,7 +196,7 @@
),
(
"el",
["əɫ"],
["/əɫ/"],
"f",
[],
[
Expand All @@ -200,7 +215,7 @@
["Cobert per a protegir plantes del vent o del fred extrem."],
),
("Mn.", [], "", [], ["mossèn com a tractament davant el nom"]),
("PMF", ["ˌpeˈe.məˌe.fə"], "", [], ["Preguntes Més Freqüents."]),
("PMF", ["/ˌpeˈe.məˌe.fə/"], "", [], ["Preguntes Més Freqüents."]),
("pen", [], "", [], []),
(
"si",
Expand Down
25 changes: 22 additions & 3 deletions tests/test_de.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,42 @@
import pytest

from wikidict.lang.de import find_pronunciations
from wikidict.render import parse_word
from wikidict.utils import process_templates


@pytest.mark.parametrize(
    "code, expected",
    [
        # No wikicode at all: an empty list is expected.
        ("", []),
        # A single `Lautschrift` template.
        (
            ":{{IPA}} {{Lautschrift|ˈʁɪndɐˌsteːk}}",
            ["[ˈʁɪndɐˌsteːk]"],
        ),
        # Several `Lautschrift` templates on one line: one entry each, same order.
        (
            ":{{IPA}} {{Lautschrift|ˈʁɪndɐˌsteːk}}, {{Lautschrift|ˈʁɪndɐˌʃteːk}}, {{Lautschrift|ˈʁɪndɐˌsteɪ̯k}}",
            ["[ˈʁɪndɐˌsteːk]", "[ˈʁɪndɐˌʃteːk]", "[ˈʁɪndɐˌsteɪ̯k]"],
        ),
    ],
)
def test_find_pronunciations(code, expected):
    """Check the German `find_pronunciations()` against each wikicode sample.

    Expected values are wrapped in square brackets (`[...]`).
    """
    found = find_pronunciations(code)
    assert found == expected


@pytest.mark.parametrize(
"word, pronunciations, gender, etymology, definitions, variants",
[
(
"CIA",
["siːaɪ̯ˈɛɪ̯"],
["[siːaɪ̯ˈɛɪ̯]"],
"mf",
["Abkürzung von Central Intelligence Agency"],
["US-amerikanischer Auslandsnachrichtendienst"],
[],
),
(
"volley",
["ˈvɔli", "ˈvɔle", "ˈvɔlɛɪ̯"],
["[ˈvɔli]", "[ˈvɔle]", "[ˈvɔlɛɪ̯]"],
"",
[
"Dem seit 1960 im Duden lexikalisierten Wort liegt die englische Kollokation <i>at/on the <i>volley</i></i> ‚aus der Luft‘ zugrunde.", # noqa
Expand All @@ -27,7 +46,7 @@
],
[],
),
("trage", ["ˈtʁaːɡə"], "", [], [], ["tragen"]),
("trage", ["[ˈtʁaːɡə]"], "", [], [], ["tragen"]),
("daß", [], "", [], [], ["dass"]),
],
)
Expand Down
13 changes: 13 additions & 0 deletions tests/test_el.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,22 @@
import pytest

from wikidict.lang.el import find_pronunciations
from wikidict.render import parse_word
from wikidict.utils import process_templates


@pytest.mark.parametrize(
    "code, expected",
    [
        # No wikicode at all: an empty list is expected.
        ("", []),
        # The `γλ=el` argument appears after the value in one sample and
        # before it in the other; neither expected list includes it.
        ("{{ΔΦΑ|tɾeˈlos|γλ=el}}", ["tɾeˈlos"]),
        ("{{ΔΦΑ|γλ=el|ˈni.xta}}", ["ˈni.xta"]),
    ],
)
def test_find_pronunciations(code, expected):
    """Check the Greek `find_pronunciations()` against each wikicode sample.

    Expected values are bare, with no surrounding delimiters.
    """
    found = find_pronunciations(code)
    assert found == expected


@pytest.mark.parametrize(
"word, pronunciations, gender, etymology, definitions, variants",
[
Expand Down
50 changes: 38 additions & 12 deletions tests/test_en.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,31 @@
import pytest

from wikidict.lang.en import find_pronunciations
from wikidict.render import parse_word
from wikidict.utils import process_templates


@pytest.mark.parametrize(
    "code, expected",
    [
        # No wikicode at all: an empty list is expected.
        ("", []),
        ("{{IPA|en|/ʌs/}}", ["/ʌs/"]),
        # A value repeated within one template, or across two templates,
        # is expected only once.
        ("{{IPA|en|/ʌs/|/ʌs/}}", ["/ʌs/"]),
        ("{{IPA|en|/ʌs/}} {{IPA|en|/ʌs/}}", ["/ʌs/"]),
        # Distinct values are all expected, in order of appearance.
        ("{{IPA|en|/ʌs/}}, {{IPA|en|/ʌz/}}", ["/ʌs/", "/ʌz/"]),
        ("{{IPA|en|/ʌs/|/ʌz/}}", ["/ʌs/", "/ʌz/"]),
    ],
)
def test_find_pronunciations(code, expected):
    """Check the English `find_pronunciations()` against each wikicode sample.

    Expected values keep their surrounding slashes (`/.../`).
    """
    found = find_pronunciations(code)
    assert found == expected


@pytest.mark.parametrize(
"word, pronunciations, etymology, definitions",
[
(
"ab",
["æb"],
["/æb/"],
["Abbreviation of <b>abdominal</b> <b>muscles</b>."],
[
"<i>(informal)</i> abdominal muscle. <small>[Mid 20<sup>th</sup> century.]</small>",
Expand All @@ -23,7 +39,7 @@
),
(
"cum",
["kʌm", "kʊm"],
["/kʌm/", "/kʊm/"],
["Learned borrowing from Latin <i>cum</i> (“with”)."],
[
"<i>Used in indicating a thing with two roles, functions, or natures, or a thing that has changed from one to another.</i>", # noqa
Expand All @@ -39,7 +55,7 @@
),
(
"efficient",
["ɪˈfɪʃənt"],
["/ɪˈfɪʃənt/", "/əˈfɪʃənt/"],
[
"1398, “making,” from Old French, from Latin <i>efficientem</i>, nominative <i>efficiēns</i>, participle of <i>efficere</i> (“work out, accomplish”) (see <b>effect</b>). Meaning “productive, skilled” is from 1787. <i>Efficiency apartment</i> is first recorded 1930, American English." # noqa
],
Expand All @@ -53,7 +69,7 @@
),
(
"it's",
["ɪts"],
["/ɪts/"],
["Contraction of ‘it is’ or ‘it has’."],
[
"<i>Contraction of</i> <b>it is</b>.",
Expand All @@ -65,7 +81,7 @@
),
(
"Mars",
["ˈmɑːz"],
["/ˈmɑːz/", "/ˈmɑɹz/"],
[
"From Middle English <i>Mars</i>, from Latin <i>Mārs</i> (“god of war”), from older Latin (older than 75 <small>B.C.E.</small>) <i>Māvors</i>. <i>𐌌𐌀𐌌𐌄𐌓𐌔</i> was his Oscan name. He was also known as <i>Marmor</i>, <i>Marmar</i> and <i>Maris</i>, the latter from the Etruscan deity Maris." # noqa
],
Expand All @@ -79,7 +95,7 @@
),
(
"portmanteau",
["pɔːtˈmæn.təʊ"],
["/pɔːtˈmæn.təʊ/", "/pɔːɹtˈmæntoʊ/", "/ˌpɔːɹtmænˈtoʊ/"],
[
"Middle French <i>portemanteau</i> (“coat stand”), from <i>porte</i> (“carry”) + <i>manteau</i> (“coat”)." # noqa
],
Expand All @@ -95,7 +111,7 @@
),
(
"someone",
["ˈsʌmwʌn"],
["/ˈsʌmwʌn/"],
["From <i>some</i>&nbsp;+&nbsp;<i>one</i>."],
[
"some person.",
Expand All @@ -105,7 +121,7 @@
),
(
"the",
["ˈðiː"],
["/ˈðiː/", "/ˈðʌ/", "/ði/", "/ðɪ/", "/ðə/"],
[
"From Middle English <i>þe</i>, from Old English <i>þē</i> <i>m</i> (“the, that”, demonstrative pronoun), a late variant of <i>sē</i>, the <i>s-</i> (which occurred in the masculine and feminine nominative singular only) having been replaced by the <i>þ-</i> from the oblique stem.", # noqa
"Originally neutral nominative, in Middle English it superseded all previous Old English nominative forms (<i>sē</i> <i>m</i>, <i>sēo</i> <i>f</i>, <i>þæt</i> <i>n</i>, <i>þā</i> <i>p</i>); <i>sē</i> is from Proto-West Germanic <i>*siz</i>, from Proto-Germanic <i>*sa</i>, ultimately from Proto-Indo-European <i>*só</i>.", # noqa
Expand All @@ -129,7 +145,7 @@
),
(
"um",
["ʌm", "əːm"],
["/ʌm/", "/əːm/"],
["Onomatopoeic."],
[
"<i>Expression of hesitation, uncertainty or space filler in conversation</i>. See uh.",
Expand All @@ -141,7 +157,7 @@
),
(
"us",
["ʌs", "ʌz"],
["/ʌs/", "/ʌz/", "/əs/", "/əz/"],
[
"From Middle English <i>us</i>, from Old English <i>ūs</i> (“us”, dative personal pronoun), from Proto-Germanic <i>*uns</i> (“us”), from Proto-Indo-European <i>*ne-</i>, <i>*nō-</i>, <i>*n-ge-</i>, <i>*n̥smé</i> (“us”). Cognate with Saterland Frisian <i>uus</i> (“us”), West Frisian <i>us</i>, <i>ús</i> (“us”), Low German <i>us</i> (“us”), Dutch <i>ons</i> (“us”), German <i>uns</i> (“us”), Danish <i>os</i> (“us”), Latin <i>nōs</i> (“we, us”)." # noqa
],
Expand All @@ -157,7 +173,17 @@
),
(
"water",
["ˈwɔːtə"],
[
"/ˈwɔːtə/",
"/ˈwɔtər/",
"/ˈwɒtə/",
"/ˈwɒtəɹ/",
"/ˈwɔtəɹ/",
"/ˈwɑtəɹ/",
"/ˈwʊtəɹ/",
"/ˈwoːtə/",
"/ˈwætəɹ/",
],
[
"From Middle English <i>water</i>, from Old English <i>wæter</i> (“water”), from Proto-West Germanic <i>*watar</i>, from Proto-Germanic <i>*watōr</i> (“water”), from Proto-Indo-European <i>*wódr̥</i> (“water”).", # noqa
"Cognate with cf, North Frisian <i>weeter</i> (“water”), Saterland Frisian <i>Woater</i> (“water”), West Frisian <i>wetter</i> (“water”), Dutch <i>water</i> (“water”), Low German <i>Water</i> (“water”), German <i>Wasser</i>, Old Norse <i>vatn</i> (Swedish <i>vatten</i> (“water”), Danish <i>vand</i> (“water”), Norwegian Bokmål <i>vann</i> (“water”), Norwegian Nynorsk and Icelandic <i>vatn</i> (“water”)), Old Irish <i>coin fodorne</i> (“otters”, literally “water-dogs”), Latin <i>unda</i> (“wave”), Lithuanian <i>vanduõ</i> (“water”), Russian <i>вода́</i> (<i>voda</i>, “water”), Albanian <i>ujë</i> (“water”), Ancient Greek <i>ὕδωρ</i> (“water”), Armenian <i>գետ</i> (<i>get</i>, “river”), Sanskrit <i>उदन्</i> (<i>udán</i>, “wave, water”), Hittite <i>𒉿𒀀𒋻</i> (<i>wa-a-tar</i>).", # noqa
Expand Down Expand Up @@ -198,7 +224,7 @@
),
(
"word",
["wɜːd"],
["/wɜːd/", "/wɝd/"],
[
"From Middle English <i>word</i>, from Old English <i>word</i>, from Proto-West Germanic <i>*word</i>, from Proto-Germanic <i>*wurdą</i>, from Proto-Indo-European <i>*wr̥dʰh₁om</i>. Doublet of <i>verb</i> and <i>verve</i>; further related to <b>vrata</b>." # noqa
],
Expand Down
Loading

0 comments on commit 278c4c2

Please sign in to comment.