From e9bd3552a0de4d9d030734aaa77ac8fb11cce016 Mon Sep 17 00:00:00 2001
From: Habeeb Shopeju
Date: Fri, 12 Jun 2020 23:54:17 +0100
Subject: [PATCH 1/2] Created lexicons for cjk and non-cjk texts

---
 deltas/tokenizers/__init__.py               |  5 +-
 .../tokenizers/tests/test_wikitext_split.py | 30 +++++++++++-
 deltas/tokenizers/wikitext_split.py         | 49 ++++++-------------
 3 files changed, 47 insertions(+), 37 deletions(-)

diff --git a/deltas/tokenizers/__init__.py b/deltas/tokenizers/__init__.py
index dae5a46..5005d2a 100644
--- a/deltas/tokenizers/__init__.py
+++ b/deltas/tokenizers/__init__.py
@@ -16,6 +16,7 @@
 from .tokenizer import Tokenizer, RegexTokenizer
 from .token import Token
 from .text_split import text_split
-from .wikitext_split import wikitext_split
+from .wikitext_split import wikitext_split, wikitext_split_cjk
 
-__all__ = [Tokenizer, RegexTokenizer, Token, text_split, wikitext_split]
+__all__ = [Tokenizer, RegexTokenizer, Token, text_split, wikitext_split,
+           wikitext_split_cjk]
diff --git a/deltas/tokenizers/tests/test_wikitext_split.py b/deltas/tokenizers/tests/test_wikitext_split.py
index 38032d8..14edb60 100644
--- a/deltas/tokenizers/tests/test_wikitext_split.py
+++ b/deltas/tokenizers/tests/test_wikitext_split.py
@@ -1,6 +1,6 @@
 from nose.tools import eq_
 
-from ..wikitext_split import wikitext_split
+from ..wikitext_split import wikitext_split, wikitext_split_cjk
 
 
 def test_wikitext_split():
@@ -150,6 +150,13 @@ def test_wikitext_split():
         eq_(token, s)
         eq_(token.type, t)
 
+    tokens = list(wikitext_split_cjk.tokenize(input))
+
+    for token, (s, t) in zip(tokens, expected):
+        print(repr(token), (s, t))
+        eq_(token, s)
+        eq_(token.type, t)
+
 
 def test_arabic():
     input = "يرجع الأمويون في نسبهم إلى أميَّة بن عبد شمس من قبيلة قريش."
@@ -185,6 +192,13 @@ def test_arabic():
         eq_(token, s)
         eq_(token.type, t)
 
+    tokens = list(wikitext_split_cjk.tokenize(input))
+
+    for token, (s, t) in zip(tokens, expected):
+        print(repr(token), (s, t))
+        eq_(token, s)
+        eq_(token.type, t)
+
 
 def test_hebrew():
     input = 'דגל קנדה הידוע בכינויו "דגל עלה האדר" (או המייפל) אומץ בשנת ' + \
@@ -242,6 +256,13 @@ def test_hebrew():
         eq_(token, s)
         eq_(token.type, t)
 
+    tokens = list(wikitext_split_cjk.tokenize(input))
+
+    for token, (s, t) in zip(tokens, expected):
+        print(repr(token), (s, t))
+        eq_(token, s)
+        eq_(token.type, t)
+
 
 def test_hindi():
     input = 'वसा अर्थात चिकनाई शरीर को क्रियाशील बनाए रखने मे सहयोग करती है।'
@@ -277,3 +298,10 @@ def test_hindi():
         print(repr(token), (s, t))
         eq_(token, s)
         eq_(token.type, t)
+
+    tokens = list(wikitext_split_cjk.tokenize(input))
+
+    for token, (s, t) in zip(tokens, expected):
+        print(repr(token), (s, t))
+        eq_(token, s)
+        eq_(token.type, t)
diff --git a/deltas/tokenizers/wikitext_split.py b/deltas/tokenizers/wikitext_split.py
index e9e6f0f..5524044 100644
--- a/deltas/tokenizers/wikitext_split.py
+++ b/deltas/tokenizers/wikitext_split.py
@@ -15,44 +15,20 @@
 )
 # re.compile(url, re.U).match("https://website.gov?param=value")
 
-# Matches Chinese, Japanese and Korean characters.
-cjk = (
-    r'[' +
-    r'\u4E00-\u62FF' +  # noqa Unified Ideographs
-    r'\u6300-\u77FF' +
-    r'\u7800-\u8CFF' +
-    r'\u8D00-\u9FCC' +
-    r'\u3400-\u4DFF' +  # Unified Ideographs Ext A
-    r'\U00020000-\U000215FF' +  # Unified Ideographs Ext. B
-    r'\U00021600-\U000230FF' +
-    r'\U00023100-\U000245FF' +
-    r'\U00024600-\U000260FF' +
-    r'\U00026100-\U000275FF' +
-    r'\U00027600-\U000290FF' +
-    r'\U00029100-\U0002A6DF' +
-    r'\uF900-\uFAFF' +  # Compatibility Ideographs
-    r'\U0002F800-\U0002FA1F' +  # Compatibility Ideographs Suppl.
-    r'\u3041-\u3096' +  # Hiragana
-    r'\u30A0-\u30FF' +  # Katakana
-    r'\u3400-\u4DB5' +  # Kanji
-    r'\u4E00-\u9FCB' +
-    r'\uF900-\uFA6A' +
-    r'\u2E80-\u2FD5' +  # Kanji radicals
-    r'\uFF5F-\uFF9F' +  # Katakana and Punctuation (Half Width)
-    r'\u31F0-\u31FF' +  # Miscellaneous Japanese Symbols and Characters
-    r'\u3220-\u3243' +
-    r'\u3280-\u337F'
-    r']'
-)
-
 devangari_word = r'\u0901-\u0963'
 arabic_word = r'\u0601-\u061A' + \
               r'\u061C-\u0669' + \
               r'\u06D5-\u06EF'
 bengali_word = r'\u0980-\u09FF'
-combined_word = devangari_word + arabic_word + bengali_word
+korean_word = r'\uac00-\ud7a3'
+
+combined_word = devangari_word + arabic_word + bengali_word + korean_word
 
-word = r'(?:[^\W\d]|[' + combined_word + r'])' + \
+cjk_re = r'\u3040-\u30ff' + r'\u4e00-\u9FFF'
+
+cjk = r'[' + cjk_re + ']'
+
+word = r'(?:[^\W\d' + cjk_re + r']|[' + combined_word + r'])' + \
        r'[\w' + combined_word + r']*' + \
        r'(?:[\'’](?:[\w' + combined_word + r']+|(?=(?:$|\s))))*'
 
@@ -71,7 +47,6 @@
     ("bold", r"'''"),
     ("italic", r"''"),
     ('japan_punct', r'[\u3000-\u303F]'),
-    ('cjk', cjk),
     ('word', word),
     ('tab_open', r'\{\|'),
     ('tab_close', r'\|\}'),
@@ -97,4 +72,10 @@
     ("etc", r"."),
 ]
 
-wikitext_split = RegexTokenizer(LEXICON)
+LEXICON_LATIN = LEXICON.copy()
+LEXICON_LATIN.insert(-2, ('cjk', cjk))
+wikitext_split = RegexTokenizer(LEXICON_LATIN)
+
+LEXICON_CJK = LEXICON.copy()
+LEXICON_CJK.insert(0, ('cjk', cjk))
+wikitext_split_cjk = RegexTokenizer(LEXICON_CJK)

From df6225b11ed7c46f9d5086aab180cacc2c629f15 Mon Sep 17 00:00:00 2001
From: Habeeb Shopeju
Date: Mon, 22 Jun 2020 19:59:06 +0100
Subject: [PATCH 2/2] Refactored lexicons

---
 .travis.yml                                 |  2 +-
 deltas/tokenizers/__init__.py               |  5 ++--
 .../tokenizers/tests/test_wikitext_split.py | 30 +------------------
 deltas/tokenizers/wikitext_split.py         | 10 ++-----
 4 files changed, 6 insertions(+), 41 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 77df471..869daaf 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -14,7 +14,7 @@ before_install: {}
 install:
   - pip install -r requirements.txt
 # - pip install -r docs/requirements.txt
-  - pip install -r test-requirements.txt
+  - pip install -r test-requirements.txt --upgrade
   - pip install twine
 script:
   - flake8 . --max-line-length=85 --exclude=.svn,CVS,.bzr,.hg,.git,__pycache__,.tox,.eggs,*.egg,docs
diff --git a/deltas/tokenizers/__init__.py b/deltas/tokenizers/__init__.py
index 5005d2a..dae5a46 100644
--- a/deltas/tokenizers/__init__.py
+++ b/deltas/tokenizers/__init__.py
@@ -16,7 +16,6 @@
 from .tokenizer import Tokenizer, RegexTokenizer
 from .token import Token
 from .text_split import text_split
-from .wikitext_split import wikitext_split, wikitext_split_cjk
+from .wikitext_split import wikitext_split
 
-__all__ = [Tokenizer, RegexTokenizer, Token, text_split, wikitext_split,
-           wikitext_split_cjk]
+__all__ = [Tokenizer, RegexTokenizer, Token, text_split, wikitext_split]
diff --git a/deltas/tokenizers/tests/test_wikitext_split.py b/deltas/tokenizers/tests/test_wikitext_split.py
index 14edb60..38032d8 100644
--- a/deltas/tokenizers/tests/test_wikitext_split.py
+++ b/deltas/tokenizers/tests/test_wikitext_split.py
@@ -1,6 +1,6 @@
 from nose.tools import eq_
 
-from ..wikitext_split import wikitext_split, wikitext_split_cjk
+from ..wikitext_split import wikitext_split
 
 
 def test_wikitext_split():
@@ -150,13 +150,6 @@ def test_wikitext_split():
         eq_(token, s)
         eq_(token.type, t)
 
-    tokens = list(wikitext_split_cjk.tokenize(input))
-
-    for token, (s, t) in zip(tokens, expected):
-        print(repr(token), (s, t))
-        eq_(token, s)
-        eq_(token.type, t)
-
 
 def test_arabic():
     input = "يرجع الأمويون في نسبهم إلى أميَّة بن عبد شمس من قبيلة قريش."
@@ -192,13 +185,6 @@ def test_arabic():
         eq_(token, s)
         eq_(token.type, t)
 
-    tokens = list(wikitext_split_cjk.tokenize(input))
-
-    for token, (s, t) in zip(tokens, expected):
-        print(repr(token), (s, t))
-        eq_(token, s)
-        eq_(token.type, t)
-
 
 def test_hebrew():
     input = 'דגל קנדה הידוע בכינויו "דגל עלה האדר" (או המייפל) אומץ בשנת ' + \
@@ -256,13 +242,6 @@ def test_hebrew():
         eq_(token, s)
         eq_(token.type, t)
 
-    tokens = list(wikitext_split_cjk.tokenize(input))
-
-    for token, (s, t) in zip(tokens, expected):
-        print(repr(token), (s, t))
-        eq_(token, s)
-        eq_(token.type, t)
-
 
 def test_hindi():
     input = 'वसा अर्थात चिकनाई शरीर को क्रियाशील बनाए रखने मे सहयोग करती है।'
@@ -298,10 +277,3 @@ def test_hindi():
         print(repr(token), (s, t))
         eq_(token, s)
         eq_(token.type, t)
-
-    tokens = list(wikitext_split_cjk.tokenize(input))
-
-    for token, (s, t) in zip(tokens, expected):
-        print(repr(token), (s, t))
-        eq_(token, s)
-        eq_(token.type, t)
diff --git a/deltas/tokenizers/wikitext_split.py b/deltas/tokenizers/wikitext_split.py
index 5524044..22ab166 100644
--- a/deltas/tokenizers/wikitext_split.py
+++ b/deltas/tokenizers/wikitext_split.py
@@ -13,7 +13,6 @@
     r'(?:(?:' + '|'.join(SLASHED_PROTO) + r')\:)?\/\/' +
     r')' + ADDRESS
 )
-# re.compile(url, re.U).match("https://website.gov?param=value")
 
 devangari_word = r'\u0901-\u0963'
 arabic_word = r'\u0601-\u061A' + \
@@ -48,6 +47,7 @@
     ("italic", r"''"),
     ('japan_punct', r'[\u3000-\u303F]'),
     ('word', word),
+    ('cjk', cjk),
     ('tab_open', r'\{\|'),
     ('tab_close', r'\|\}'),
     ('dbrack_open', r'\[\['),
@@ -72,10 +72,4 @@
     ("etc", r"."),
 ]
 
-LEXICON_LATIN = LEXICON.copy()
-LEXICON_LATIN.insert(-2, ('cjk', cjk))
-wikitext_split = RegexTokenizer(LEXICON_LATIN)
-
-LEXICON_CJK = LEXICON.copy()
-LEXICON_CJK.insert(0, ('cjk', cjk))
-wikitext_split_cjk = RegexTokenizer(LEXICON_CJK)
+wikitext_split = RegexTokenizer(LEXICON)
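Note for reviewers (not part of the patches above): a minimal sketch of how the lexicon behaves once [PATCH 2/2] is applied, assuming a local deltas checkout with this series applied is importable. The sample string and the token types mentioned in the comments are illustrative only; nothing below is asserted by the test suite.

# Minimal usage sketch, assuming the patched deltas package is on the path.
from deltas.tokenizers import wikitext_split

# Hypothetical mixed Latin/CJK input, chosen only for illustration.
text = "Tokyo (東京) is the capital of Japan."

for token in wikitext_split.tokenize(text):
    # Each token carries a `type` attribute taken from the LEXICON entry that
    # matched, e.g. 'word' for "Tokyo" and 'cjk' for the CJK characters; since
    # the `cjk` pattern has no quantifier, CJK text splits one token per character.
    print(repr(token), token.type)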