Merge pull request #37 from goru001/add_english_to_inltk

Fix English (en) tokenization bug
goru001 · Jan 17, 2020 · c39d9ba
2 parents f066b6b + 4119761
commit c39d9ba
Show file tree

Hide file tree

Showing 3 changed files with 6 additions and 3 deletions.
diff --git a/inltk/inltk.py b/inltk/inltk.py
@@ -143,7 +143,7 @@ def get_similar_sentences(sen: str, no_of_variations: int, language_code: str, d
     word_ids = [np.argpartition(-np.array(score), no_of_variations+1)[:no_of_variations+1] for score in scores]
     word_ids = [ids.tolist() for ids in word_ids]
     for i, ids in enumerate(word_ids):
-        word_ids[i].remove(token_ids[i])
+        word_ids[i] = [wid for wid in word_ids[i] if wid != token_ids[i]]
     # generating more variations than required so that we can then filter out the best ones
     buffer_multiplicity = 2
     new_sen_tokens = []

diff --git a/inltk/tokenizer.py b/inltk/tokenizer.py
@@ -35,7 +35,10 @@ def __init__(self, lang: str):
         self.tok = SpacyTokenizer(lang)
 
     def tokenizer(self, t: str) -> List[str]:
-        return self.tok.tokenizer(t)
+        tok = Tokenizer()
+        tokens = tok.process_text(t, self.tok)
+        tokens = [token for token in tokens if token not in defaults.text_spec_tok]
+        return tokens
 
     def numericalize(self, t: str):
         token_ids = self.tokenizer(t)

diff --git a/setup.py b/setup.py
@@ -5,7 +5,7 @@
 
 setuptools.setup(
     name="inltk",
-    version="0.8",
+    version="0.8.1",
     author="Gaurav",
     author_email="[email protected]",
     description="Natural Language Toolkit for Indian Languages (iNLTK)",