Refactored lexicons
HAKSOAT committed Jun 22, 2020
1 parent e9bd355 commit f3489da
Showing 4 changed files with 6 additions and 41 deletions.
5 changes: 2 additions & 3 deletions deltas/tokenizers/__init__.py
@@ -16,7 +16,6 @@
from .tokenizer import Tokenizer, RegexTokenizer
from .token import Token
from .text_split import text_split
-from .wikitext_split import wikitext_split, wikitext_split_cjk
+from .wikitext_split import wikitext_split

-__all__ = [Tokenizer, RegexTokenizer, Token, text_split, wikitext_split,
-           wikitext_split_cjk]
+__all__ = [Tokenizer, RegexTokenizer, Token, text_split, wikitext_split]
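
With wikitext_split_cjk dropped from the exports, wikitext_split is the only tokenizer the package publishes. A minimal sketch of the import surface after this commit, assuming the package is installed (the sample wikitext string is invented):

# Post-refactor import sketch; the sample wikitext string is invented.
from deltas.tokenizers import wikitext_split

tokens = list(wikitext_split.tokenize("Some [[wiki]] text."))
for token in tokens:
    print(repr(token), token.type)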
30 changes: 1 addition & 29 deletions deltas/tokenizers/tests/test_wikitext_split.py
@@ -1,6 +1,6 @@
from nose.tools import eq_

-from ..wikitext_split import wikitext_split, wikitext_split_cjk
+from ..wikitext_split import wikitext_split


def test_wikitext_split():
@@ -150,13 +150,6 @@ def test_wikitext_split():
eq_(token, s)
eq_(token.type, t)

-    tokens = list(wikitext_split_cjk.tokenize(input))
-
-    for token, (s, t) in zip(tokens, expected):
-        print(repr(token), (s, t))
-        eq_(token, s)
-        eq_(token.type, t)
-

def test_arabic():
input = "يرجع الأمويون في نسبهم إلى أميَّة بن عبد شمس من قبيلة قريش."
@@ -192,13 +185,6 @@ def test_arabic():
eq_(token, s)
eq_(token.type, t)

-    tokens = list(wikitext_split_cjk.tokenize(input))
-
-    for token, (s, t) in zip(tokens, expected):
-        print(repr(token), (s, t))
-        eq_(token, s)
-        eq_(token.type, t)
-

def test_hebrew():
input = 'דגל קנדה הידוע בכינויו "דגל עלה האדר" (או המייפל) אומץ בשנת ' + \
@@ -256,13 +242,6 @@ def test_hebrew():
eq_(token, s)
eq_(token.type, t)

-    tokens = list(wikitext_split_cjk.tokenize(input))
-
-    for token, (s, t) in zip(tokens, expected):
-        print(repr(token), (s, t))
-        eq_(token, s)
-        eq_(token.type, t)
-

def test_hindi():
input = 'वसा अर्थात चिकनाई शरीर को क्रियाशील बनाए रखने मे सहयोग करती है।'
@@ -298,10 +277,3 @@ def test_hindi():
print(repr(token), (s, t))
eq_(token, s)
eq_(token.type, t)

-    tokens = list(wikitext_split_cjk.tokenize(input))
-
-    for token, (s, t) in zip(tokens, expected):
-        print(repr(token), (s, t))
-        eq_(token, s)
-        eq_(token.type, t)
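
With the duplicated wikitext_split_cjk loops removed, every test asserts against the single wikitext_split tokenizer. A hypothetical nose-style spot-check in the same spirit, relying only on the 'cjk' type this commit adds to the shared lexicon (the test name and input string are illustrative, not part of the suite):

# Hypothetical spot-check, not part of the actual test module.
from nose.tools import ok_

from deltas.tokenizers import wikitext_split


def test_cjk_type_spot_check():
    # With 'cjk' in the shared LEXICON, CJK input should yield at least
    # one token typed 'cjk' from the single wikitext_split tokenizer.
    tokens = list(wikitext_split.tokenize("東京"))
    ok_(any(token.type == 'cjk' for token in tokens))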
10 changes: 2 additions & 8 deletions deltas/tokenizers/wikitext_split.py
@@ -13,7 +13,6 @@
r'(?:(?:' + '|'.join(SLASHED_PROTO) + r')\:)?\/\/' +
r')' + ADDRESS
)
-# re.compile(url, re.U).match("https://website.gov?param=value")

devangari_word = r'\u0901-\u0963'
arabic_word = r'\u0601-\u061A' + \
@@ -48,6 +47,7 @@
("italic", r"''"),
('japan_punct', r'[\u3000-\u303F]'),
('word', word),
+    ('cjk', cjk),
('tab_open', r'\{\|'),
('tab_close', r'\|\}'),
('dbrack_open', r'\[\['),
@@ -72,10 +72,4 @@
("etc", r"."),
]

-LEXICON_LATIN = LEXICON.copy()
-LEXICON_LATIN.insert(-2, ('cjk', cjk))
-wikitext_split = RegexTokenizer(LEXICON_LATIN)
-
-LEXICON_CJK = LEXICON.copy()
-LEXICON_CJK.insert(0, ('cjk', cjk))
-wikitext_split_cjk = RegexTokenizer(LEXICON_CJK)
+wikitext_split = RegexTokenizer(LEXICON)
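
The copy-and-insert pattern (LEXICON.copy() plus insert()) gives way to a single 'cjk' entry in the shared LEXICON, so one RegexTokenizer now serves Latin and CJK text alike. A rough usage sketch under that assumption (sample strings invented; exact token boundaries depend on the lexicon's regexes):

# Rough sketch: one tokenizer for both scripts after this commit.
# Sample strings are invented; token boundaries depend on the lexicon.
from deltas.tokenizers import wikitext_split

for text in ["[[Tokyo]] is big.", "[[東京]]は大きい。"]:
    for token in wikitext_split.tokenize(text):
        print(repr(token), token.type)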
2 changes: 1 addition & 1 deletion test-requirements.txt
@@ -2,4 +2,4 @@ pytest
nose
flake8
codecov
-pytest-cov
+pytest-cov<=2.6.0
