Skip to content

Commit

Permalink
Merge pull request #37 from goru001/add_english_to_inltk
Browse files (browse the repository at this point in the history)
fix en tokenization bug
  • Loading branch information
goru001 authored Jan 17, 2020
2 parents f066b6b + 4119761 commit c39d9ba
Show file tree
Hide file tree
Showing 3 changed files with 6 additions and 3 deletions.
2 changes: 1 addition & 1 deletion inltk/inltk.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ def get_similar_sentences(sen: str, no_of_variations: int, language_code: str, d
word_ids = [np.argpartition(-np.array(score), no_of_variations+1)[:no_of_variations+1] for score in scores]
word_ids = [ids.tolist() for ids in word_ids]
for i, ids in enumerate(word_ids):
- word_ids[i].remove(token_ids[i])
+ word_ids[i] = [wid for wid in word_ids[i] if wid != token_ids[i]]
# generating more variations than required so that we can then filter out the best ones
buffer_multiplicity = 2
new_sen_tokens = []
Expand Down
5 changes: 4 additions & 1 deletion inltk/tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,10 @@ def __init__(self, lang: str):
self.tok = SpacyTokenizer(lang)

def tokenizer(self, t: str) -> List[str]:
- return self.tok.tokenizer(t)
+ tok = Tokenizer()
+ tokens = tok.process_text(t, self.tok)
+ tokens = [token for token in tokens if token not in defaults.text_spec_tok]
+ return tokens

def numericalize(self, t: str):
token_ids = self.tokenizer(t)
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

setuptools.setup(
name="inltk",
- version="0.8",
+ version="0.8.1",
author="Gaurav",
author_email="[email protected]",
description="Natural Language Toolkit for Indian Languages (iNLTK)",
Expand Down

0 comments on commit c39d9ba

Please sign in to comment.