From 9b6b187b2b67ad279092d3f36f3dd4d64b8994a9 Mon Sep 17 00:00:00 2001 From: Alexander Seifert Date: Mon, 19 Mar 2018 23:44:25 +0100 Subject: [PATCH] internationalize word boundary checks --- flashtext/keyword.py | 45 ++++++++++---------------------------------- 1 file changed, 10 insertions(+), 35 deletions(-) diff --git a/flashtext/keyword.py b/flashtext/keyword.py index 8639a49..fe20444 100644 --- a/flashtext/keyword.py +++ b/flashtext/keyword.py @@ -1,6 +1,7 @@ import os import string import io +import re class KeywordProcessor(object): @@ -9,8 +10,6 @@ class KeywordProcessor(object): Attributes: _keyword (str): Used as key to store keywords in trie dictionary. Defaults to '_keyword_' - non_word_boundaries (set(str)): Characters that will determine if the word is continuing. - Defaults to set([A-Za-z0-9_]) keyword_trie_dict (dict): Trie dict built character by character, that is used for lookup Defaults to empty dictionary case_sensitive (boolean): if the search algorithm should be case sensitive or not. @@ -35,6 +34,9 @@ class KeywordProcessor(object): * Idea came from this `Stack Overflow Question `_. """ + NON_WORD_CHAR_REGEX = re.compile(r'\W', re.UNICODE) + + def __init__(self, case_sensitive=False): """ Args: @@ -42,13 +44,6 @@ def __init__(self, case_sensitive=False): Defaults to False """ self._keyword = '_keyword_' - self._white_space_chars = set(['.', '\t', '\n', '\a', ' ', ',']) - try: - # python 2.x - self.non_word_boundaries = set(string.digits + string.letters + '_') - except AttributeError: - # python 3.x - self.non_word_boundaries = set(string.digits + string.ascii_letters + '_') self.keyword_trie_dict = dict() self.case_sensitive = case_sensitive self._terms_in_trie = 0 @@ -202,26 +197,6 @@ def __iter__(self): """ raise NotImplementedError("Please use get_all_keywords() instead") - def set_non_word_boundaries(self, non_word_boundaries): - """set of characters that will be considered as part of word. - - Args: - non_word_boundaries (set(str)): - Set of characters that will be considered as part of word. - - """ - self.non_word_boundaries = non_word_boundaries - - def add_non_word_boundary(self, character): - """add a character that will be considered as part of word. - - Args: - character (char): - Character that will be considered as part of word. - - """ - self.non_word_boundaries.add(character) - def add_keyword(self, keyword, clean_name=None): """To add one or more keywords to the dictionary pass the keyword and the clean name it maps to. @@ -482,7 +457,7 @@ def extract_keywords(self, sentence, span_info=False): while idx < sentence_len: char = sentence[idx] # when we reach a character that might denote word end - if char not in self.non_word_boundaries: + if KeywordProcessor.NON_WORD_CHAR_REGEX.match(char): # if end is present in current_dict if self._keyword in current_dict or char in current_dict: @@ -502,7 +477,7 @@ def extract_keywords(self, sentence, span_info=False): idy = idx + 1 while idy < sentence_len: inner_char = sentence[idy] - if inner_char not in self.non_word_boundaries and self._keyword in current_dict_continued: + if KeywordProcessor.NON_WORD_CHAR_REGEX.match(inner_char) and self._keyword in current_dict_continued: # update longest sequence found longest_sequence_found = current_dict_continued[self._keyword] sequence_end_pos = idy @@ -540,7 +515,7 @@ def extract_keywords(self, sentence, span_info=False): idy = idx + 1 while idy < sentence_len: char = sentence[idy] - if char not in self.non_word_boundaries: + if KeywordProcessor.NON_WORD_CHAR_REGEX.match(char): break idy += 1 idx = idy @@ -594,7 +569,7 @@ def replace_keywords(self, sentence): char = sentence[idx] current_word += orig_sentence[idx] # when we reach whitespace - if char not in self.non_word_boundaries: + if KeywordProcessor.NON_WORD_CHAR_REGEX.match(char): current_white_space = char # if end is present in current_dict if self._keyword in current_dict or char in current_dict: @@ -615,7 +590,7 @@ def replace_keywords(self, sentence): while idy < sentence_len: inner_char = sentence[idy] current_word_continued += orig_sentence[idy] - if inner_char not in self.non_word_boundaries and self._keyword in current_dict_continued: + if KeywordProcessor.NON_WORD_CHAR_REGEX.match(inner_char) and self._keyword in current_dict_continued: # update longest sequence found current_white_space = inner_char longest_sequence_found = current_dict_continued[self._keyword] @@ -663,7 +638,7 @@ def replace_keywords(self, sentence): while idy < sentence_len: char = sentence[idy] current_word += orig_sentence[idy] - if char not in self.non_word_boundaries: + if KeywordProcessor.NON_WORD_CHAR_REGEX.match(char): break idy += 1 idx = idy