Refactored lexicons
HAKSOAT committed Jun 22, 2020
1 parent e9bd355 commit f3489da
Showing 4 changed files with 6 additions and 41 deletions.
5 changes: 2 additions & 3 deletions deltas/tokenizers/__init__.py
@@ -16,7 +16,6 @@
from .tokenizer import Tokenizer, RegexTokenizer
from .token import Token
from .text_split import text_split
-from .wikitext_split import wikitext_split, wikitext_split_cjk
+from .wikitext_split import wikitext_split

-__all__ = [Tokenizer, RegexTokenizer, Token, text_split, wikitext_split,
-           wikitext_split_cjk]
+__all__ = [Tokenizer, RegexTokenizer, Token, text_split, wikitext_split]
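
With wikitext_split_cjk dropped from the exports, wikitext_split is the only tokenizer the package publishes. A minimal sketch of the import surface after this commit, assuming the package is installed (the sample wikitext string is invented):

# Post-refactor import sketch; the sample wikitext string is invented.
from deltas.tokenizers import wikitext_split

tokens = list(wikitext_split.tokenize("Some [[wiki]] text."))
for token in tokens:
    print(repr(token), token.type)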
30 changes: 1 addition & 29 deletions deltas/tokenizers/tests/test_wikitext_split.py
@@ -1,6 +1,6 @@
from nose.tools import eq_

-from ..wikitext_split import wikitext_split, wikitext_split_cjk
+from ..wikitext_split import wikitext_split


def test_wikitext_split():
@@ -150,13 +150,6 @@ def test_wikitext_split():
eq_(token, s)
eq_(token.type, t)

-    tokens = list(wikitext_split_cjk.tokenize(input))
-
-    for token, (s, t) in zip(tokens, expected):
-        print(repr(token), (s, t))
-        eq_(token, s)
-        eq_(token.type, t)
-

def test_arabic():
input = "يرجع الأمويون في نسبهم إلى أميَّة بن عبد شمس من قبيلة قريش."
@@ -192,13 +185,6 @@ def test_arabic():
eq_(token, s)
eq_(token.type, t)

-    tokens = list(wikitext_split_cjk.tokenize(input))
-
-    for token, (s, t) in zip(tokens, expected):
-        print(repr(token), (s, t))
-        eq_(token, s)
-        eq_(token.type, t)
-

def test_hebrew():
input = 'דגל קנדה הידוע בכינויו "דגל עלה האדר" (או המייפל) אומץ בשנת ' + \
@@ -256,13 +242,6 @@ def test_hebrew():
eq_(token, s)
eq_(token.type, t)

-    tokens = list(wikitext_split_cjk.tokenize(input))
-
-    for token, (s, t) in zip(tokens, expected):
-        print(repr(token), (s, t))
-        eq_(token, s)
-        eq_(token.type, t)
-

def test_hindi():
input = 'वसा अर्थात चिकनाई शरीर को क्रियाशील बनाए रखने मे सहयोग करती है।'
@@ -298,10 +277,3 @@ def test_hindi():
print(repr(token), (s, t))
eq_(token, s)
eq_(token.type, t)

-    tokens = list(wikitext_split_cjk.tokenize(input))
-
-    for token, (s, t) in zip(tokens, expected):
-        print(repr(token), (s, t))
-        eq_(token, s)
-        eq_(token.type, t)
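
With the duplicated wikitext_split_cjk loops removed, every test asserts against the single wikitext_split tokenizer. A hypothetical nose-style spot-check in the same spirit, relying only on the 'cjk' type this commit adds to the shared lexicon (the test name and input string are illustrative, not part of the suite):

# Hypothetical spot-check, not part of the actual test module.
from nose.tools import ok_

from deltas.tokenizers import wikitext_split


def test_cjk_type_spot_check():
    # With 'cjk' in the shared LEXICON, CJK input should yield at least
    # one token typed 'cjk' from the single wikitext_split tokenizer.
    tokens = list(wikitext_split.tokenize("東京"))
    ok_(any(token.type == 'cjk' for token in tokens))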
10 changes: 2 additions & 8 deletions deltas/tokenizers/wikitext_split.py
@@ -13,7 +13,6 @@
r'(?:(?:' + '|'.join(SLASHED_PROTO) + r')\:)?\/\/' +
r')' + ADDRESS
)
-# re.compile(url, re.U).match("https://website.gov?param=value")

devangari_word = r'\u0901-\u0963'
arabic_word = r'\u0601-\u061A' + \
@@ -48,6 +47,7 @@
("italic", r"''"),
('japan_punct', r'[\u3000-\u303F]'),
('word', word),
+    ('cjk', cjk),
('tab_open', r'\{\|'),
('tab_close', r'\|\}'),
('dbrack_open', r'\[\['),
@@ -72,10 +72,4 @@
("etc", r"."),
]

-LEXICON_LATIN = LEXICON.copy()
-LEXICON_LATIN.insert(-2, ('cjk', cjk))
-wikitext_split = RegexTokenizer(LEXICON_LATIN)
-
-LEXICON_CJK = LEXICON.copy()
-LEXICON_CJK.insert(0, ('cjk', cjk))
-wikitext_split_cjk = RegexTokenizer(LEXICON_CJK)
+wikitext_split = RegexTokenizer(LEXICON)
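
The copy-and-insert pattern (LEXICON.copy() plus insert()) gives way to a single 'cjk' entry in the shared LEXICON, so one RegexTokenizer now serves Latin and CJK text alike. A rough usage sketch under that assumption (sample strings invented; exact token boundaries depend on the lexicon's regexes):

# Rough sketch: one tokenizer for both scripts after this commit.
# Sample strings are invented; token boundaries depend on the lexicon.
from deltas.tokenizers import wikitext_split

for text in ["[[Tokyo]] is big.", "[[東京]]は大きい。"]:
    for token in wikitext_split.tokenize(text):
        print(repr(token), token.type)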
2 changes: 1 addition & 1 deletion test-requirements.txt
@@ -2,4 +2,4 @@ pytest
nose
flake8
codecov
-pytest-cov
+pytest-cov<=2.6.0
