From 9b6b187b2b67ad279092d3f36f3dd4d64b8994a9 Mon Sep 17 00:00:00 2001
From: Alexander Seifert <alexander.seifert@gmail.com>
Date: Mon, 19 Mar 2018 23:44:25 +0100
Subject: [PATCH] internationalize word boundary checks

---
 flashtext/keyword.py | 45 ++++++++++----------------------------------
 1 file changed, 10 insertions(+), 35 deletions(-)

diff --git a/flashtext/keyword.py b/flashtext/keyword.py
index 8639a49..fe20444 100644
--- a/flashtext/keyword.py
+++ b/flashtext/keyword.py
@@ -1,6 +1,7 @@
 import os
 import string
 import io
+import re
 
 
 class KeywordProcessor(object):
@@ -9,8 +10,6 @@ class KeywordProcessor(object):
     Attributes:
         _keyword (str): Used as key to store keywords in trie dictionary.
             Defaults to '_keyword_'
-        non_word_boundaries (set(str)): Characters that will determine if the word is continuing.
-            Defaults to set([A-Za-z0-9_])
         keyword_trie_dict (dict): Trie dict built character by character, that is used for lookup
             Defaults to empty dictionary
         case_sensitive (boolean): if the search algorithm should be case sensitive or not.
@@ -35,6 +34,9 @@ class KeywordProcessor(object):
         * Idea came from this `Stack Overflow Question <https://stackoverflow.com/questions/44178449/regex-replace-is-taking-time-for-millions-of-documents-how-to-make-it-faster>`_.
     """
 
+    # Unicode-aware \W: a matching character marks a word boundary.
+    NON_WORD_CHAR_REGEX = re.compile(r'\W', re.UNICODE)
+
     def __init__(self, case_sensitive=False):
         """
         Args:
@@ -42,13 +44,6 @@ def __init__(self, case_sensitive=False):
                 Defaults to False
         """
         self._keyword = '_keyword_'
-        self._white_space_chars = set(['.', '\t', '\n', '\a', ' ', ','])
-        try:
-            # python 2.x
-            self.non_word_boundaries = set(string.digits + string.letters + '_')
-        except AttributeError:
-            # python 3.x
-            self.non_word_boundaries = set(string.digits + string.ascii_letters + '_')
         self.keyword_trie_dict = dict()
         self.case_sensitive = case_sensitive
         self._terms_in_trie = 0
@@ -202,26 +197,6 @@ def __iter__(self):
         """
         raise NotImplementedError("Please use get_all_keywords() instead")
 
-    def set_non_word_boundaries(self, non_word_boundaries):
-        """set of characters that will be considered as part of word.
-
-        Args:
-            non_word_boundaries (set(str)):
-                Set of characters that will be considered as part of word.
-
-        """
-        self.non_word_boundaries = non_word_boundaries
-
-    def add_non_word_boundary(self, character):
-        """add a character that will be considered as part of word.
-
-        Args:
-            character (char):
-                Character that will be considered as part of word.
-
-        """
-        self.non_word_boundaries.add(character)
-
     def add_keyword(self, keyword, clean_name=None):
         """To add one or more keywords to the dictionary
         pass the keyword and the clean name it maps to.
@@ -482,7 +457,7 @@ def extract_keywords(self, sentence, span_info=False):
         while idx < sentence_len:
             char = sentence[idx]
             # when we reach a character that might denote word end
-            if char not in self.non_word_boundaries:
+            if KeywordProcessor.NON_WORD_CHAR_REGEX.match(char):
 
                 # if end is present in current_dict
                 if self._keyword in current_dict or char in current_dict:
@@ -502,7 +477,7 @@ def extract_keywords(self, sentence, span_info=False):
                         idy = idx + 1
                         while idy < sentence_len:
                             inner_char = sentence[idy]
-                            if inner_char not in self.non_word_boundaries and self._keyword in current_dict_continued:
+                            if KeywordProcessor.NON_WORD_CHAR_REGEX.match(inner_char) and self._keyword in current_dict_continued:
                                 # update longest sequence found
                                 longest_sequence_found = current_dict_continued[self._keyword]
                                 sequence_end_pos = idy
@@ -540,7 +515,7 @@ def extract_keywords(self, sentence, span_info=False):
                 idy = idx + 1
                 while idy < sentence_len:
                     char = sentence[idy]
-                    if char not in self.non_word_boundaries:
+                    if KeywordProcessor.NON_WORD_CHAR_REGEX.match(char):
                         break
                     idy += 1
                 idx = idy
@@ -594,7 +569,7 @@ def replace_keywords(self, sentence):
             char = sentence[idx]
             current_word += orig_sentence[idx]
             # when we reach whitespace
-            if char not in self.non_word_boundaries:
+            if KeywordProcessor.NON_WORD_CHAR_REGEX.match(char):
                 current_white_space = char
                 # if end is present in current_dict
                 if self._keyword in current_dict or char in current_dict:
@@ -615,7 +590,7 @@ def replace_keywords(self, sentence):
                         while idy < sentence_len:
                             inner_char = sentence[idy]
                             current_word_continued += orig_sentence[idy]
-                            if inner_char not in self.non_word_boundaries and self._keyword in current_dict_continued:
+                            if KeywordProcessor.NON_WORD_CHAR_REGEX.match(inner_char) and self._keyword in current_dict_continued:
                                 # update longest sequence found
                                 current_white_space = inner_char
                                 longest_sequence_found = current_dict_continued[self._keyword]
@@ -663,7 +638,7 @@ def replace_keywords(self, sentence):
                 while idy < sentence_len:
                     char = sentence[idy]
                     current_word += orig_sentence[idy]
-                    if char not in self.non_word_boundaries:
+                    if KeywordProcessor.NON_WORD_CHAR_REGEX.match(char):
                         break
                     idy += 1
                 idx = idy