Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

internationalize word boundary checks #49

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 10 additions & 35 deletions flashtext/keyword.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import os
import string
import io
import re


class KeywordProcessor(object):
Expand All @@ -9,8 +10,6 @@ class KeywordProcessor(object):
Attributes:
_keyword (str): Used as key to store keywords in trie dictionary.
Defaults to '_keyword_'
non_word_boundaries (set(str)): Characters that will determine if the word is continuing.
Defaults to set([A-Za-z0-9_])
keyword_trie_dict (dict): Trie dict built character by character, that is used for lookup
Defaults to empty dictionary
case_sensitive (boolean): if the search algorithm should be case sensitive or not.
Expand All @@ -35,20 +34,16 @@ class KeywordProcessor(object):
* Idea came from this `Stack Overflow Question <https://stackoverflow.com/questions/44178449/regex-replace-is-taking-time-for-millions-of-documents-how-to-make-it-faster>`_.
"""

NON_WORD_CHAR_REGEX = re.compile(r'\W', re.UNICODE)


def __init__(self, case_sensitive=False):
"""
Args:
case_sensitive (boolean): Whether keyword search should be case sensitive.
Defaults to False
"""
self._keyword = '_keyword_'
self._white_space_chars = set(['.', '\t', '\n', '\a', ' ', ','])
try:
# python 2.x
self.non_word_boundaries = set(string.digits + string.letters + '_')
except AttributeError:
# python 3.x
self.non_word_boundaries = set(string.digits + string.ascii_letters + '_')
self.keyword_trie_dict = dict()
self.case_sensitive = case_sensitive
self._terms_in_trie = 0
Expand Down Expand Up @@ -202,26 +197,6 @@ def __iter__(self):
"""
raise NotImplementedError("Please use get_all_keywords() instead")

def set_non_word_boundaries(self, non_word_boundaries):
    """Replace the set of characters that are considered part of a word.

    Args:
        non_word_boundaries (iterable(str)):
            Characters that will be considered as part of word. Any
            iterable of characters is accepted (set, list, str, ...).

    The characters are copied into a fresh ``set`` so that a later
    ``add_non_word_boundary`` call does not mutate the caller's object,
    and so that non-set iterables still support ``add``.

    """
    # Copy instead of aliasing: the stored value is mutated in place later.
    self.non_word_boundaries = set(non_word_boundaries)

def add_non_word_boundary(self, character):
    """Register one more character as being part of a word.

    Args:
        character (char):
            The character to add to ``self.non_word_boundaries``.

    """
    word_chars = self.non_word_boundaries
    word_chars.add(character)

def add_keyword(self, keyword, clean_name=None):
"""To add one or more keywords to the dictionary
pass the keyword and the clean name it maps to.
Expand Down Expand Up @@ -482,7 +457,7 @@ def extract_keywords(self, sentence, span_info=False):
while idx < sentence_len:
char = sentence[idx]
# when we reach a character that might denote word end
if char not in self.non_word_boundaries:
if KeywordProcessor.NON_WORD_CHAR_REGEX.match(char):

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why the ugly direct reference to the class? Just use `self.NON_WORD_CHAR_REGEX`.


# if end is present in current_dict
if self._keyword in current_dict or char in current_dict:
Expand All @@ -502,7 +477,7 @@ def extract_keywords(self, sentence, span_info=False):
idy = idx + 1
while idy < sentence_len:
inner_char = sentence[idy]
if inner_char not in self.non_word_boundaries and self._keyword in current_dict_continued:
if KeywordProcessor.NON_WORD_CHAR_REGEX.match(inner_char) and self._keyword in current_dict_continued:
# update longest sequence found
longest_sequence_found = current_dict_continued[self._keyword]
sequence_end_pos = idy
Expand Down Expand Up @@ -540,7 +515,7 @@ def extract_keywords(self, sentence, span_info=False):
idy = idx + 1
while idy < sentence_len:
char = sentence[idy]
if char not in self.non_word_boundaries:
if KeywordProcessor.NON_WORD_CHAR_REGEX.match(char):
break
idy += 1
idx = idy
Expand Down Expand Up @@ -594,7 +569,7 @@ def replace_keywords(self, sentence):
char = sentence[idx]
current_word += orig_sentence[idx]
# when we reach a character that might denote word end
if char not in self.non_word_boundaries:
if KeywordProcessor.NON_WORD_CHAR_REGEX.match(char):
current_white_space = char
# if end is present in current_dict
if self._keyword in current_dict or char in current_dict:
Expand All @@ -615,7 +590,7 @@ def replace_keywords(self, sentence):
while idy < sentence_len:
inner_char = sentence[idy]
current_word_continued += orig_sentence[idy]
if inner_char not in self.non_word_boundaries and self._keyword in current_dict_continued:
if KeywordProcessor.NON_WORD_CHAR_REGEX.match(inner_char) and self._keyword in current_dict_continued:
# update longest sequence found
current_white_space = inner_char
longest_sequence_found = current_dict_continued[self._keyword]
Expand Down Expand Up @@ -663,7 +638,7 @@ def replace_keywords(self, sentence):
while idy < sentence_len:
char = sentence[idy]
current_word += orig_sentence[idy]
if char not in self.non_word_boundaries:
if KeywordProcessor.NON_WORD_CHAR_REGEX.match(char):
break
idy += 1
idx = idy
Expand Down