From e9bd3552a0de4d9d030734aaa77ac8fb11cce016 Mon Sep 17 00:00:00 2001
From: Habeeb Shopeju
Date: Fri, 12 Jun 2020 23:54:17 +0100
Subject: [PATCH 1/2] Created lexicons for cjk and non-cjk texts

---
 deltas/tokenizers/__init__.py               |  5 +-
 .../tokenizers/tests/test_wikitext_split.py | 30 +++++++++++-
 deltas/tokenizers/wikitext_split.py         | 49 ++++++-------------
 3 files changed, 47 insertions(+), 37 deletions(-)

diff --git a/deltas/tokenizers/__init__.py b/deltas/tokenizers/__init__.py
index dae5a46..5005d2a 100644
--- a/deltas/tokenizers/__init__.py
+++ b/deltas/tokenizers/__init__.py
@@ -16,6 +16,7 @@
 from .tokenizer import Tokenizer, RegexTokenizer
 from .token import Token
 from .text_split import text_split
-from .wikitext_split import wikitext_split
+from .wikitext_split import wikitext_split, wikitext_split_cjk
 
-__all__ = [Tokenizer, RegexTokenizer, Token, text_split, wikitext_split]
+__all__ = [Tokenizer, RegexTokenizer, Token, text_split, wikitext_split,
+           wikitext_split_cjk]
diff --git a/deltas/tokenizers/tests/test_wikitext_split.py b/deltas/tokenizers/tests/test_wikitext_split.py
index 38032d8..14edb60 100644
--- a/deltas/tokenizers/tests/test_wikitext_split.py
+++ b/deltas/tokenizers/tests/test_wikitext_split.py
@@ -1,6 +1,6 @@
 from nose.tools import eq_
 
-from ..wikitext_split import wikitext_split
+from ..wikitext_split import wikitext_split, wikitext_split_cjk
 
 
 def test_wikitext_split():
@@ -150,6 +150,13 @@ def test_wikitext_split():
         eq_(token, s)
         eq_(token.type, t)
 
+    tokens = list(wikitext_split_cjk.tokenize(input))
+
+    for token, (s, t) in zip(tokens, expected):
+        print(repr(token), (s, t))
+        eq_(token, s)
+        eq_(token.type, t)
+
 
 def test_arabic():
     input = "يرجع الأمويون في نسبهم إلى أميَّة بن عبد شمس من قبيلة قريش."
@@ -185,6 +192,13 @@ def test_arabic():
         eq_(token, s)
         eq_(token.type, t)
 
+    tokens = list(wikitext_split_cjk.tokenize(input))
+
+    for token, (s, t) in zip(tokens, expected):
+        print(repr(token), (s, t))
+        eq_(token, s)
+        eq_(token.type, t)
+
 
 def test_hebrew():
     input = 'דגל קנדה הידוע בכינויו "דגל עלה האדר" (או המייפל) אומץ בשנת ' + \
@@ -242,6 +256,13 @@ def test_hebrew():
         eq_(token, s)
         eq_(token.type, t)
 
+    tokens = list(wikitext_split_cjk.tokenize(input))
+
+    for token, (s, t) in zip(tokens, expected):
+        print(repr(token), (s, t))
+        eq_(token, s)
+        eq_(token.type, t)
+
 
 def test_hindi():
     input = 'वसा अर्थात चिकनाई शरीर को क्रियाशील बनाए रखने मे सहयोग करती है।'
@@ -277,3 +298,10 @@ def test_hindi():
         print(repr(token), (s, t))
         eq_(token, s)
         eq_(token.type, t)
+
+    tokens = list(wikitext_split_cjk.tokenize(input))
+
+    for token, (s, t) in zip(tokens, expected):
+        print(repr(token), (s, t))
+        eq_(token, s)
+        eq_(token.type, t)
diff --git a/deltas/tokenizers/wikitext_split.py b/deltas/tokenizers/wikitext_split.py
index e9e6f0f..5524044 100644
--- a/deltas/tokenizers/wikitext_split.py
+++ b/deltas/tokenizers/wikitext_split.py
@@ -15,44 +15,20 @@
 )
 # re.compile(url, re.U).match("https://website.gov?param=value")
 
-# Matches Chinese, Japanese and Korean characters.
-cjk = (
-    r'[' +
-    r'\u4E00-\u62FF' +  # noqa Unified Ideographs
-    r'\u6300-\u77FF' +
-    r'\u7800-\u8CFF' +
-    r'\u8D00-\u9FCC' +
-    r'\u3400-\u4DFF' +  # Unified Ideographs Ext A
-    r'\U00020000-\U000215FF' +  # Unified Ideographs Ext. B
-    r'\U00021600-\U000230FF' +
-    r'\U00023100-\U000245FF' +
-    r'\U00024600-\U000260FF' +
-    r'\U00026100-\U000275FF' +
-    r'\U00027600-\U000290FF' +
-    r'\U00029100-\U0002A6DF' +
-    r'\uF900-\uFAFF' +  # Compatibility Ideographs
-    r'\U0002F800-\U0002FA1F' +  # Compatibility Ideographs Suppl.
-    r'\u3041-\u3096' +  # Hiragana
-    r'\u30A0-\u30FF' +  # Katakana
-    r'\u3400-\u4DB5' +  # Kanji
-    r'\u4E00-\u9FCB' +
-    r'\uF900-\uFA6A' +
-    r'\u2E80-\u2FD5' +  # Kanji radicals
-    r'\uFF5F-\uFF9F' +  # Katakana and Punctuation (Half Width)
-    r'\u31F0-\u31FF' +  # Miscellaneous Japanese Symbols and Characters
-    r'\u3220-\u3243' +
-    r'\u3280-\u337F'
-    r']'
-)
-
 devangari_word = r'\u0901-\u0963'
 arabic_word = r'\u0601-\u061A' + \
               r'\u061C-\u0669' + \
               r'\u06D5-\u06EF'
 bengali_word = r'\u0980-\u09FF'
-combined_word = devangari_word + arabic_word + bengali_word
+korean_word = r'\uac00-\ud7a3'
+
+combined_word = devangari_word + arabic_word + bengali_word + korean_word
 
-word = r'(?:[^\W\d]|[' + combined_word + r'])' + \
+cjk_re = r'\u3040-\u30ff' + r'\u4e00-\u9FFF'
+
+cjk = r'[' + cjk_re + ']'
+
+word = r'(?:[^\W\d' + cjk_re + r']|[' + combined_word + r'])' + \
        r'[\w' + combined_word + r']*' + \
        r'(?:[\'’](?:[\w' + combined_word + r']+|(?=(?:$|\s))))*'
 
@@ -71,7 +47,6 @@
     ("bold", r"'''"),
     ("italic", r"''"),
     ('japan_punct', r'[\u3000-\u303F]'),
-    ('cjk', cjk),
     ('word', word),
     ('tab_open', r'\{\|'),
     ('tab_close', r'\|\}'),
@@ -97,4 +72,10 @@
     ("etc", r"."),
 ]
 
-wikitext_split = RegexTokenizer(LEXICON)
+LEXICON_LATIN = LEXICON.copy()
+LEXICON_LATIN.insert(-2, ('cjk', cjk))
+wikitext_split = RegexTokenizer(LEXICON_LATIN)
+
+LEXICON_CJK = LEXICON.copy()
+LEXICON_CJK.insert(0, ('cjk', cjk))
+wikitext_split_cjk = RegexTokenizer(LEXICON_CJK)

From df6225b11ed7c46f9d5086aab180cacc2c629f15 Mon Sep 17 00:00:00 2001
From: Habeeb Shopeju
Date: Mon, 22 Jun 2020 19:59:06 +0100
Subject: [PATCH 2/2] Refactored lexicons

---
 .travis.yml                                 |  2 +-
 deltas/tokenizers/__init__.py               |  5 ++--
 .../tokenizers/tests/test_wikitext_split.py | 30 +------------------
 deltas/tokenizers/wikitext_split.py         | 10 ++-----
 4 files changed, 6 insertions(+), 41 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 77df471..869daaf 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -14,7 +14,7 @@ before_install: {}
 install:
   - pip install -r requirements.txt
 # - pip install -r docs/requirements.txt
-  - pip install -r test-requirements.txt
+  - pip install -r test-requirements.txt --upgrade
   - pip install twine
 script:
   - flake8 . --max-line-length=85 --exclude=.svn,CVS,.bzr,.hg,.git,__pycache__,.tox,.eggs,*.egg,docs
diff --git a/deltas/tokenizers/__init__.py b/deltas/tokenizers/__init__.py
index 5005d2a..dae5a46 100644
--- a/deltas/tokenizers/__init__.py
+++ b/deltas/tokenizers/__init__.py
@@ -16,7 +16,6 @@
 from .tokenizer import Tokenizer, RegexTokenizer
 from .token import Token
 from .text_split import text_split
-from .wikitext_split import wikitext_split, wikitext_split_cjk
+from .wikitext_split import wikitext_split
 
-__all__ = [Tokenizer, RegexTokenizer, Token, text_split, wikitext_split,
-           wikitext_split_cjk]
+__all__ = [Tokenizer, RegexTokenizer, Token, text_split, wikitext_split]
diff --git a/deltas/tokenizers/tests/test_wikitext_split.py b/deltas/tokenizers/tests/test_wikitext_split.py
index 14edb60..38032d8 100644
--- a/deltas/tokenizers/tests/test_wikitext_split.py
+++ b/deltas/tokenizers/tests/test_wikitext_split.py
@@ -1,6 +1,6 @@
 from nose.tools import eq_
 
-from ..wikitext_split import wikitext_split, wikitext_split_cjk
+from ..wikitext_split import wikitext_split
 
 
 def test_wikitext_split():
@@ -150,13 +150,6 @@ def test_wikitext_split():
         eq_(token, s)
         eq_(token.type, t)
 
-    tokens = list(wikitext_split_cjk.tokenize(input))
-
-    for token, (s, t) in zip(tokens, expected):
-        print(repr(token), (s, t))
-        eq_(token, s)
-        eq_(token.type, t)
-
 
 def test_arabic():
     input = "يرجع الأمويون في نسبهم إلى أميَّة بن عبد شمس من قبيلة قريش."
@@ -192,13 +185,6 @@ def test_arabic():
         eq_(token, s)
         eq_(token.type, t)
 
-    tokens = list(wikitext_split_cjk.tokenize(input))
-
-    for token, (s, t) in zip(tokens, expected):
-        print(repr(token), (s, t))
-        eq_(token, s)
-        eq_(token.type, t)
-
 
 def test_hebrew():
     input = 'דגל קנדה הידוע בכינויו "דגל עלה האדר" (או המייפל) אומץ בשנת ' + \
@@ -256,13 +242,6 @@ def test_hebrew():
         eq_(token, s)
         eq_(token.type, t)
 
-    tokens = list(wikitext_split_cjk.tokenize(input))
-
-    for token, (s, t) in zip(tokens, expected):
-        print(repr(token), (s, t))
-        eq_(token, s)
-        eq_(token.type, t)
-
 
 def test_hindi():
     input = 'वसा अर्थात चिकनाई शरीर को क्रियाशील बनाए रखने मे सहयोग करती है।'
@@ -298,10 +277,3 @@ def test_hindi():
         print(repr(token), (s, t))
         eq_(token, s)
         eq_(token.type, t)
-
-    tokens = list(wikitext_split_cjk.tokenize(input))
-
-    for token, (s, t) in zip(tokens, expected):
-        print(repr(token), (s, t))
-        eq_(token, s)
-        eq_(token.type, t)
diff --git a/deltas/tokenizers/wikitext_split.py b/deltas/tokenizers/wikitext_split.py
index 5524044..22ab166 100644
--- a/deltas/tokenizers/wikitext_split.py
+++ b/deltas/tokenizers/wikitext_split.py
@@ -13,7 +13,6 @@
     r'(?:(?:' + '|'.join(SLASHED_PROTO) + r')\:)?\/\/' +
     r')' + ADDRESS
 )
-# re.compile(url, re.U).match("https://website.gov?param=value")
 
 devangari_word = r'\u0901-\u0963'
 arabic_word = r'\u0601-\u061A' + \
@@ -48,6 +47,7 @@
     ("italic", r"''"),
     ('japan_punct', r'[\u3000-\u303F]'),
     ('word', word),
+    ('cjk', cjk),
     ('tab_open', r'\{\|'),
     ('tab_close', r'\|\}'),
     ('dbrack_open', r'\[\['),
@@ -72,10 +72,4 @@
     ("etc", r"."),
 ]
 
-LEXICON_LATIN = LEXICON.copy()
-LEXICON_LATIN.insert(-2, ('cjk', cjk))
-wikitext_split = RegexTokenizer(LEXICON_LATIN)
-
-LEXICON_CJK = LEXICON.copy()
-LEXICON_CJK.insert(0, ('cjk', cjk))
-wikitext_split_cjk = RegexTokenizer(LEXICON_CJK)
+wikitext_split = RegexTokenizer(LEXICON)
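Note for reviewers (not part of the patches above): a minimal sketch of how the lexicon behaves once [PATCH 2/2] is applied, assuming a local deltas checkout with this series applied is importable. The sample string and the token types mentioned in the comments are illustrative only; nothing below is asserted by the test suite.

# Minimal usage sketch, assuming the patched deltas package is on the path.
from deltas.tokenizers import wikitext_split

# Hypothetical mixed Latin/CJK input, chosen only for illustration.
text = "Tokyo (東京) is the capital of Japan."

for token in wikitext_split.tokenize(text):
    # Each token carries a `type` attribute taken from the LEXICON entry that
    # matched, e.g. 'word' for "Tokyo" and 'cjk' for the CJK characters; since
    # the `cjk` pattern has no quantifier, CJK text splits one token per character.
    print(repr(token), token.type)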