From 44325476321a829c2562f2e0f6f863030c2fe870 Mon Sep 17 00:00:00 2001 From: Arvind-raj06 Date: Sun, 31 Jan 2021 19:22:27 +0530 Subject: [PATCH 1/8] Adding suffix --- pydatastructs/trees/__init__.py | 8 +- pydatastructs/trees/suffix_tree.py | 260 +++++++++++++++++++ pydatastructs/trees/tests/test_suffixtree.py | 7 + 3 files changed, 274 insertions(+), 1 deletion(-) create mode 100644 pydatastructs/trees/suffix_tree.py create mode 100644 pydatastructs/trees/tests/test_suffixtree.py diff --git a/pydatastructs/trees/__init__.py b/pydatastructs/trees/__init__.py index 6b9df8a22..ad0c26c5b 100644 --- a/pydatastructs/trees/__init__.py +++ b/pydatastructs/trees/__init__.py @@ -4,7 +4,8 @@ binary_trees, m_ary_trees, space_partitioning_trees, - heaps + heaps, + suffix_tree ) from .binary_trees import ( @@ -38,3 +39,8 @@ BinomialHeap ) __all__.extend(heaps.__all__) + +from .suffix_tree import( + SuffixTree +) +__all__.extend(suffix_tree.__all__) diff --git a/pydatastructs/trees/suffix_tree.py b/pydatastructs/trees/suffix_tree.py new file mode 100644 index 000000000..30c8ecfb4 --- /dev/null +++ b/pydatastructs/trees/suffix_tree.py @@ -0,0 +1,260 @@ +__all__ = [ + 'SuffixTree' +] + +class Suffix_Node(): + + __slots__ = ['_suffix_link', 'transition_links', 'idx', 'depth', 'parent', 'generalized_idxs'] + + def __new__(cls, idx=-1, parentNode=None, depth=-1): + obj = object.__new__(cls) + obj._suffix_link = None + obj.transition_links = {} + obj.idx = idx + obj.depth = depth + obj.parent = parentNode + obj.generalized_idxs = {} + return obj + + def __str__(self): + return ("Suffix Node: idx:" + str(self.idx) + " depth:" + str(self.depth) + " transitons:" + str(list(self.transition_links.keys()))) + + def _add_suffix_link(self, snode): + self._suffix_link = snode + + def _get_suffix_link(self): + if self._suffix_link is not None: + return self._suffix_link + else: + return False + + def _get_transition_link(self, suffix): + return False if suffix not in self.transition_links else 
self.transition_links[suffix] + + def _add_transition_link(self, snode, suffix): + self.transition_links[suffix] = snode + + def _has_transition(self, suffix): + return suffix in self.transition_links + + def is_leaf(self): + return len(self.transition_links) == 0 + + def _traverse(self, f): + for node in self.transition_links.values(): + node._traverse(f) + f(self) + + def _get_leaves(self): + if self.is_leaf(): + return {self} + else: + return {x for n in self.transition_links.values() for x in n._get_leaves()} + +class SuffixTree(): + """ + Represents Suffix Tree. + + Examples + ======== + + >>> from pydatastructs.trees import SuffixTree as suffix + >>> s = suffix('hello') + >>> s.find('he') + 0 + >>> s.find_all('l') + {2, 3} + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Suffix_tree + .. [2] https://en.wikipedia.org/wiki/Generalized_suffix_tree + """ + + def __new__(cls, input=''): + obj = object.__new__(cls) + obj.root = Suffix_Node() + obj.root.depth = 0 + obj.root.idx = 0 + obj.root.parent = obj.root + obj.root._add_suffix_link(obj.root) + if not input == '': + obj.build(input) + return obj + + @classmethod + def methods(cls): + return ['__new__', '__str__', 'lcs', 'find', 'find_all'] + + def _check_input(self, input): + if isinstance(input, str): + return 'st' + elif isinstance(input, list): + if all(isinstance(item, str) for item in input): + return 'gst' + + raise ValueError("String argument should be of type String or a list of strings") + + def build(self, x): + type = self._check_input(x) + if type == 'st': + x += next(self._terminalSymbolsGenerator()) + self._build(x) + if type == 'gst': + self._build_generalized(x) + + def _build(self, x): + self.word = x + self._build_McCreight(x) + + def _build_McCreight(self, x): + u = self.root + d = 0 + for i in range(len(x)): + while u.depth == d and u._has_transition(x[d + i]): + u = u._get_transition_link(x[d + i]) + d = d + 1 + while d < u.depth and x[u.idx + d] == x[i + d]: + d = d + 1 
+ if d < u.depth: + u = self._create_node(x, u, d) + self._create_leaf(x, i, u, d) + if not u._get_suffix_link(): + self._compute_slink(x, u) + u = u._get_suffix_link() + d = d - 1 + if d < 0: + d = 0 + + def _create_node(self, x, u, d): + i = u.idx + p = u.parent + v = Suffix_Node(idx=i, depth=d) + v._add_transition_link(u, x[i + d]) + u.parent = v + p._add_transition_link(v, x[i + p.depth]) + v.parent = p + return v + + def _create_leaf(self, x, i, u, d): + w = Suffix_Node() + w.idx = i + w.depth = len(x) - i + u._add_transition_link(w, x[i + d]) + w.parent = u + return w + + def _compute_slink(self, x, u): + d = u.depth + v = u.parent._get_suffix_link() + while v.depth < d - 1: + v = v._get_transition_link(x[u.idx + v.depth + 1]) + if v.depth > d - 1: + v = self._create_node(x, v, d - 1) + u._add_suffix_link(v) + + def _build_generalized(self, xs): + terminal_gen = self._terminalSymbolsGenerator() + + _xs = ''.join([x + next(terminal_gen) for x in xs]) + self.word = _xs + self._generalized_word_starts(xs) + self._build(_xs) + self.root._traverse(self._label_generalized) + + def _label_generalized(self, node): + if node.is_leaf(): + x = {self._get_word_start_index(node.idx)} + else: + x = {n for ns in node.transition_links.values() for n in ns.generalized_idxs} + node.generalized_idxs = x + + def _get_word_start_index(self, idx): + i = 0 + for _idx in self.word_starts[1:]: + if idx < _idx: + return i + else: + i += 1 + return i + + def lcs(self, stringIdxs = -1): + if stringIdxs == -1 or not isinstance(stringIdxs, list): + stringIdxs = set(range(len(self.word_starts))) + else: + stringIdxs = set(stringIdxs) + deepestNode = self._find_lcs(self.root, stringIdxs) + start = deepestNode.idx + end = deepestNode.idx + deepestNode.depth + return self.word[start:end] + + def _find_lcs(self, node, stringIdxs): + nodes = [self._find_lcs(n, stringIdxs) + for n in node.transition_links.values() + if n.generalized_idxs.issuperset(stringIdxs)] + if nodes == []: + return node + 
deepestNode = max(nodes, key=lambda n: n.depth) + return deepestNode + + def _generalized_word_starts(self, xs): + self.word_starts = [] + i = 0 + for n in range(len(xs)): + self.word_starts.append(i) + i += len(xs[n]) + 1 + + def find(self, y): + node = self.root + while True: + edge = self._edgeLabel(node, node.parent) + if edge.startswith(y): + return node.idx + + i = 0 + while (i < len(edge) and edge[i] == y[0]): + y = y[1:] + i += 1 + + if i != 0: + if i == len(edge) and y != '': + pass + else: + return -1 + + node = node._get_transition_link(y[0]) + if not node: + return -1 + + def find_all(self, y): + node = self.root + while True: + edge = self._edgeLabel(node, node.parent) + if edge.startswith(y): + break + i = 0 + while (i < len(edge) and edge[i] == y[0]): + y = y[1:] + i += 1 + if i != 0: + if i == len(edge) and y != '': + pass + else: + return {} + node = node._get_transition_link(y[0]) + if not node: + return {} + + leaves = node._get_leaves() + return {n.idx for n in leaves} + + def _edgeLabel(self, node, parent): + return self.word[node.idx + parent.depth: node.idx + node.depth] + + def _terminalSymbolsGenerator(self): + UPPAs = list(list(range(0xE000, 0xF8FF+1)) + list(range(0xF0000, 0xFFFFD+1)) + list(range(0x100000, 0x10FFFD+1))) + for i in UPPAs: + yield (chr(i)) + + raise ValueError("To many input strings.") diff --git a/pydatastructs/trees/tests/test_suffixtree.py b/pydatastructs/trees/tests/test_suffixtree.py new file mode 100644 index 000000000..f662df0e1 --- /dev/null +++ b/pydatastructs/trees/tests/test_suffixtree.py @@ -0,0 +1,7 @@ +from pydatastructs import SuffixTree + +def test_suffixtree(): + s = SuffixTree("HelloworldHe") + assert s.find("Hel") == 0 + assert s.find_all("He") == {0, 10} + assert s.find("Win") == -1 From 598330be39066094e47629bc713f183e7a93f2ed Mon Sep 17 00:00:00 2001 From: Arvind-raj06 Date: Tue, 2 Feb 2021 12:13:05 +0530 Subject: [PATCH 2/8] Let's see --- pydatastructs/strings/__init__.py | 7 ++ .../{trees => 
strings}/suffix_tree.py | 77 ++++--------------- .../strings/tests/test_suffixtree.py | 19 +++++ pydatastructs/trees/__init__.py | 8 +- pydatastructs/trees/tests/test_suffixtree.py | 7 -- pydatastructs/utils/__init__.py | 3 +- pydatastructs/utils/misc_util.py | 60 ++++++++++++++- 7 files changed, 105 insertions(+), 76 deletions(-) rename pydatastructs/{trees => strings}/suffix_tree.py (75%) create mode 100644 pydatastructs/strings/tests/test_suffixtree.py delete mode 100644 pydatastructs/trees/tests/test_suffixtree.py diff --git a/pydatastructs/strings/__init__.py b/pydatastructs/strings/__init__.py index 1ee05158f..f84cbe4b2 100644 --- a/pydatastructs/strings/__init__.py +++ b/pydatastructs/strings/__init__.py @@ -6,3 +6,10 @@ ) __all__.extend(trie.__all__) + +from . import suffix_tree +from .suffix_tree import( + SuffixTree +) + +__all__.extend(suffix_tree.__all__) diff --git a/pydatastructs/trees/suffix_tree.py b/pydatastructs/strings/suffix_tree.py similarity index 75% rename from pydatastructs/trees/suffix_tree.py rename to pydatastructs/strings/suffix_tree.py index 30c8ecfb4..7f205c3a4 100644 --- a/pydatastructs/trees/suffix_tree.py +++ b/pydatastructs/strings/suffix_tree.py @@ -1,56 +1,9 @@ +from pydatastructs.utils.misc_util import SuffixNode + __all__ = [ 'SuffixTree' ] -class Suffix_Node(): - - __slots__ = ['_suffix_link', 'transition_links', 'idx', 'depth', 'parent', 'generalized_idxs'] - - def __new__(cls, idx=-1, parentNode=None, depth=-1): - obj = object.__new__(cls) - obj._suffix_link = None - obj.transition_links = {} - obj.idx = idx - obj.depth = depth - obj.parent = parentNode - obj.generalized_idxs = {} - return obj - - def __str__(self): - return ("Suffix Node: idx:" + str(self.idx) + " depth:" + str(self.depth) + " transitons:" + str(list(self.transition_links.keys()))) - - def _add_suffix_link(self, snode): - self._suffix_link = snode - - def _get_suffix_link(self): - if self._suffix_link is not None: - return self._suffix_link - else: - 
return False - - def _get_transition_link(self, suffix): - return False if suffix not in self.transition_links else self.transition_links[suffix] - - def _add_transition_link(self, snode, suffix): - self.transition_links[suffix] = snode - - def _has_transition(self, suffix): - return suffix in self.transition_links - - def is_leaf(self): - return len(self.transition_links) == 0 - - def _traverse(self, f): - for node in self.transition_links.values(): - node._traverse(f) - f(self) - - def _get_leaves(self): - if self.is_leaf(): - return {self} - else: - return {x for n in self.transition_links.values() for x in n._get_leaves()} - class SuffixTree(): """ Represents Suffix Tree. @@ -58,12 +11,18 @@ class SuffixTree(): Examples ======== - >>> from pydatastructs.trees import SuffixTree as suffix + >>> from pydatastructs.strings import SuffixTree as suffix >>> s = suffix('hello') >>> s.find('he') 0 >>> s.find_all('l') {2, 3} + >>> s.find('f') + -1 + >>> lt=["abeceda", "abecednik", "abeabecedabeabeced", "abecedaaaa", "aaabbbeeecceeeddaaaaabeceda"] + >>> s1 = suffix(lt) + >>> s1.lcs() + 'abeced' References ========== @@ -74,7 +33,7 @@ class SuffixTree(): def __new__(cls, input=''): obj = object.__new__(cls) - obj.root = Suffix_Node() + obj.root = SuffixNode() obj.root.depth = 0 obj.root.idx = 0 obj.root.parent = obj.root @@ -85,23 +44,22 @@ def __new__(cls, input=''): @classmethod def methods(cls): - return ['__new__', '__str__', 'lcs', 'find', 'find_all'] + return ['__new__', 'lcs', 'find', 'find_all'] def _check_input(self, input): if isinstance(input, str): - return 'st' + return 'str' elif isinstance(input, list): if all(isinstance(item, str) for item in input): - return 'gst' - + return 'list' raise ValueError("String argument should be of type String or a list of strings") def build(self, x): type = self._check_input(x) - if type == 'st': + if type == 'str': x += next(self._terminalSymbolsGenerator()) self._build(x) - if type == 'gst': + if type == 'list': 
self._build_generalized(x) def _build(self, x): @@ -130,7 +88,7 @@ def _build_McCreight(self, x): def _create_node(self, x, u, d): i = u.idx p = u.parent - v = Suffix_Node(idx=i, depth=d) + v = SuffixNode(idx=i, depth=d) v._add_transition_link(u, x[i + d]) u.parent = v p._add_transition_link(v, x[i + p.depth]) @@ -138,7 +96,7 @@ def _create_node(self, x, u, d): return v def _create_leaf(self, x, i, u, d): - w = Suffix_Node() + w = SuffixNode() w.idx = i w.depth = len(x) - i u._add_transition_link(w, x[i + d]) @@ -256,5 +214,4 @@ def _terminalSymbolsGenerator(self): UPPAs = list(list(range(0xE000, 0xF8FF+1)) + list(range(0xF0000, 0xFFFFD+1)) + list(range(0x100000, 0x10FFFD+1))) for i in UPPAs: yield (chr(i)) - raise ValueError("To many input strings.") diff --git a/pydatastructs/strings/tests/test_suffixtree.py b/pydatastructs/strings/tests/test_suffixtree.py new file mode 100644 index 000000000..4d07f2522 --- /dev/null +++ b/pydatastructs/strings/tests/test_suffixtree.py @@ -0,0 +1,19 @@ +from pydatastructs import SuffixTree +from pydatastructs.utils.raises_util import raises +import random, string + +def test_suffixtree(): + + s = SuffixTree("HelloworldHe") + assert s.find("Hel") == 0 + assert s.find_all("He") == {0, 10} + assert s.find("Win") == -1 + assert s.find_all("go") == {} + + f = ['integer', 'inteinteger', 'integralerint', 'iaingerntier', 'regetnerireg', 'reger'] + s = SuffixTree(f) + assert s.lcs() == 'er' + + assert raises(ValueError, lambda: SuffixTree(123)) + res = (100, 1, 0) + assert raises(ValueError, lambda: SuffixTree(res)) diff --git a/pydatastructs/trees/__init__.py b/pydatastructs/trees/__init__.py index ad0c26c5b..6b9df8a22 100644 --- a/pydatastructs/trees/__init__.py +++ b/pydatastructs/trees/__init__.py @@ -4,8 +4,7 @@ binary_trees, m_ary_trees, space_partitioning_trees, - heaps, - suffix_tree + heaps ) from .binary_trees import ( @@ -39,8 +38,3 @@ BinomialHeap ) __all__.extend(heaps.__all__) - -from .suffix_tree import( - SuffixTree -) 
-__all__.extend(suffix_tree.__all__) diff --git a/pydatastructs/trees/tests/test_suffixtree.py b/pydatastructs/trees/tests/test_suffixtree.py deleted file mode 100644 index f662df0e1..000000000 --- a/pydatastructs/trees/tests/test_suffixtree.py +++ /dev/null @@ -1,7 +0,0 @@ -from pydatastructs import SuffixTree - -def test_suffixtree(): - s = SuffixTree("HelloworldHe") - assert s.find("Hel") == 0 - assert s.find_all("He") == {0, 10} - assert s.find("Win") == -1 diff --git a/pydatastructs/utils/__init__.py b/pydatastructs/utils/__init__.py index da7ec6e7f..7aeffa86e 100644 --- a/pydatastructs/utils/__init__.py +++ b/pydatastructs/utils/__init__.py @@ -12,6 +12,7 @@ Set, CartesianTreeNode, RedBlackTreeNode, - TrieNode + TrieNode, + SuffixNode ) __all__.extend(misc_util.__all__) diff --git a/pydatastructs/utils/misc_util.py b/pydatastructs/utils/misc_util.py index ce449ddb2..82a0dea54 100644 --- a/pydatastructs/utils/misc_util.py +++ b/pydatastructs/utils/misc_util.py @@ -9,7 +9,8 @@ 'Set', 'CartesianTreeNode', 'RedBlackTreeNode', - 'TrieNode' + 'TrieNode', + 'SuffixNode' ] _check_type = lambda a, t: isinstance(a, t) @@ -446,3 +447,60 @@ def _comp(u, v, tcomp): return False else: return tcomp(u, v) + +class SuffixNode(Node): + """ + Represents nodes in the suffix tree data structure. + + Parameters + ========== + + string: The string to be stored in the tree. + Optional, by default None. + list: A list of strings to be stored in suffix tree. + Optional, by default None. 
+ """ + + __slots__ = ['_suffix_link', 'transition_links', 'idx', 'depth', 'parent', 'generalized_idxs'] + + def __new__(cls, idx=-1, parentNode=None, depth=-1): + obj = object.__new__(cls) + obj._suffix_link = None + obj.transition_links = {} + obj.idx = idx + obj.depth = depth + obj.parent = parentNode + obj.generalized_idxs = {} + return obj + + def _add_suffix_link(self, snode): + self._suffix_link = snode + + def _get_suffix_link(self): + if self._suffix_link is not None: + return self._suffix_link + else: + return False + + def _get_transition_link(self, suffix): + return False if suffix not in self.transition_links else self.transition_links[suffix] + + def _add_transition_link(self, snode, suffix): + self.transition_links[suffix] = snode + + def _has_transition(self, suffix): + return suffix in self.transition_links + + def is_leaf(self): + return len(self.transition_links) == 0 + + def _traverse(self, f): + for node in self.transition_links.values(): + node._traverse(f) + f(self) + + def _get_leaves(self): + if self.is_leaf(): + return {self} + else: + return {x for n in self.transition_links.values() for x in n._get_leaves()} From 391302d96cfef3a4f9ff9acde49a9d083a263576 Mon Sep 17 00:00:00 2001 From: Arvind-raj06 Date: Tue, 2 Feb 2021 12:23:01 +0530 Subject: [PATCH 3/8] Adding ref --- pydatastructs/strings/tests/test_suffixtree.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pydatastructs/strings/tests/test_suffixtree.py b/pydatastructs/strings/tests/test_suffixtree.py index 4d07f2522..d412c00e4 100644 --- a/pydatastructs/strings/tests/test_suffixtree.py +++ b/pydatastructs/strings/tests/test_suffixtree.py @@ -3,7 +3,13 @@ import random, string def test_suffixtree(): + """ + References + ========== + .. 
https://www.cise.ufl.edu/~sahni/dsaaj/enrich/c16/suffix.htm + """ + s = SuffixTree("HelloworldHe") assert s.find("Hel") == 0 assert s.find_all("He") == {0, 10} From 7fd9da7fc310950d2a3cf87ac4290d4d41e865b5 Mon Sep 17 00:00:00 2001 From: Arvind-raj06 Date: Tue, 2 Feb 2021 12:25:12 +0530 Subject: [PATCH 4/8] Update test_suffixtree.py --- pydatastructs/strings/tests/test_suffixtree.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pydatastructs/strings/tests/test_suffixtree.py b/pydatastructs/strings/tests/test_suffixtree.py index d412c00e4..bc8dea16d 100644 --- a/pydatastructs/strings/tests/test_suffixtree.py +++ b/pydatastructs/strings/tests/test_suffixtree.py @@ -9,7 +9,6 @@ def test_suffixtree(): .. https://www.cise.ufl.edu/~sahni/dsaaj/enrich/c16/suffix.htm """ - s = SuffixTree("HelloworldHe") assert s.find("Hel") == 0 assert s.find_all("He") == {0, 10} From 68ef22936cdda2da0c55b957f41e0359562d0abd Mon Sep 17 00:00:00 2001 From: Arvind-raj06 Date: Fri, 5 Feb 2021 11:28:40 +0530 Subject: [PATCH 5/8] Adding docs --- pydatastructs/strings/suffix_tree.py | 58 +++++++++++++++++++++++++++- 1 file changed, 57 insertions(+), 1 deletion(-) diff --git a/pydatastructs/strings/suffix_tree.py b/pydatastructs/strings/suffix_tree.py index 7f205c3a4..26f9e627c 100644 --- a/pydatastructs/strings/suffix_tree.py +++ b/pydatastructs/strings/suffix_tree.py @@ -55,6 +55,19 @@ def _check_input(self, input): raise ValueError("String argument should be of type String or a list of strings") def build(self, x): + """ + Builds the Suffix tree on the given input. 
+ + Parameters + ========== + + x: str or list of str + + Returns + ======= + + None + """ type = self._check_input(x) if type == 'str': x += next(self._terminalSymbolsGenerator()) @@ -114,7 +127,6 @@ def _compute_slink(self, x, u): def _build_generalized(self, xs): terminal_gen = self._terminalSymbolsGenerator() - _xs = ''.join([x + next(terminal_gen) for x in xs]) self.word = _xs self._generalized_word_starts(xs) @@ -138,6 +150,20 @@ def _get_word_start_index(self, idx): return i def lcs(self, stringIdxs = -1): + """ + Finds the Largest Common Substring of Strings provided in stringIdxs. + If stringIdxs is not provided, the LCS of all strings is returned. + + Parameters + ========== + + stringIdxs: int or list of int + + Returns + ======= + + Longest Common Substring + """ if stringIdxs == -1 or not isinstance(stringIdxs, list): stringIdxs = set(range(len(self.word_starts))) else: @@ -164,6 +190,21 @@ def _generalized_word_starts(self, xs): i += len(xs[n]) + 1 def find(self, y): + """ + Finds the starting position of the substring y in the string used for + building the Suffix tree. + + Parameters + ========== + + y: str + + Returns + ======= + + Index of the starting position of string y in the string used for building the Suffix tree + -1 if y is not a substring. + """ node = self.root while True: edge = self._edgeLabel(node, node.parent) @@ -186,6 +227,21 @@ def find(self, y): return -1 def find_all(self, y): + """ + Finds the starting position of the substring y in the string used for + building the Suffix tree. + + Parameters + ========== + + y: str + + Returns + ======= + + Set of Index of the starting positions of string y in the string used for building the Suffix tree + {} if y is not a substring. 
+ """ node = self.root while True: edge = self._edgeLabel(node, node.parent) From 57fd9f990bb11654db0e90ae803e0b27ac35fc7a Mon Sep 17 00:00:00 2001 From: Arvind-raj06 Date: Sun, 7 Feb 2021 18:26:41 +0530 Subject: [PATCH 6/8] Fixed lcs --- pydatastructs/strings/suffix_tree.py | 4 ++-- pydatastructs/strings/tests/test_suffixtree.py | 6 +++++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/pydatastructs/strings/suffix_tree.py b/pydatastructs/strings/suffix_tree.py index 26f9e627c..80a510910 100644 --- a/pydatastructs/strings/suffix_tree.py +++ b/pydatastructs/strings/suffix_tree.py @@ -21,7 +21,7 @@ class SuffixTree(): -1 >>> lt=["abeceda", "abecednik", "abeabecedabeabeced", "abecedaaaa", "aaabbbeeecceeeddaaaaabeceda"] >>> s1 = suffix(lt) - >>> s1.lcs() + >>> s1.longest_common_substring() 'abeced' References @@ -149,7 +149,7 @@ def _get_word_start_index(self, idx): i += 1 return i - def lcs(self, stringIdxs = -1): + def longest_common_substring(self, stringIdxs = -1): """ Finds the Largest Common Substring of Strings provided in stringIdxs. If stringIdxs is not provided, the LCS of all strings is returned. 
diff --git a/pydatastructs/strings/tests/test_suffixtree.py b/pydatastructs/strings/tests/test_suffixtree.py index bc8dea16d..2aabf143f 100644 --- a/pydatastructs/strings/tests/test_suffixtree.py +++ b/pydatastructs/strings/tests/test_suffixtree.py @@ -14,10 +14,14 @@ def test_suffixtree(): assert s.find_all("He") == {0, 10} assert s.find("Win") == -1 assert s.find_all("go") == {} + assert raises(AttributeError, lambda: s.longest_common_substring()) f = ['integer', 'inteinteger', 'integralerint', 'iaingerntier', 'regetnerireg', 'reger'] s = SuffixTree(f) - assert s.lcs() == 'er' + assert s.longest_common_substring() == 'er' + f = ['integer', 'inteinteger', 'integralerint', 'iainegerntier', 'regetnerireg', 'reger'] + s = SuffixTree(f) + assert s.longest_common_substring(7) == 'eg' assert raises(ValueError, lambda: SuffixTree(123)) res = (100, 1, 0) From cac71266da5cd912ed5359228c166c266d196c5b Mon Sep 17 00:00:00 2001 From: Arvind-raj06 Date: Fri, 12 Feb 2021 20:58:50 +0530 Subject: [PATCH 7/8] Fixing code --- pydatastructs/strings/suffix_tree.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pydatastructs/strings/suffix_tree.py b/pydatastructs/strings/suffix_tree.py index 80a510910..55e252779 100644 --- a/pydatastructs/strings/suffix_tree.py +++ b/pydatastructs/strings/suffix_tree.py @@ -4,7 +4,7 @@ 'SuffixTree' ] -class SuffixTree(): +class SuffixTree(object): """ Represents Suffix Tree. 
From b315c5e38d6dc3c2d9e2842b5d6c3960eca6a0f5 Mon Sep 17 00:00:00 2001 From: Arvind-raj06 Date: Sat, 13 Feb 2021 19:26:43 +0530 Subject: [PATCH 8/8] Fixing code and docs --- pydatastructs/strings/suffix_tree.py | 59 ++++++++++++++++++++++++---- 1 file changed, 51 insertions(+), 8 deletions(-) diff --git a/pydatastructs/strings/suffix_tree.py b/pydatastructs/strings/suffix_tree.py index 55e252779..8d9c4504f 100644 --- a/pydatastructs/strings/suffix_tree.py +++ b/pydatastructs/strings/suffix_tree.py @@ -47,6 +47,9 @@ def methods(cls): return ['__new__', 'lcs', 'find', 'find_all'] def _check_input(self, input): + """ + Check if the input is str or list of str. + """ if isinstance(input, str): return 'str' elif isinstance(input, list): @@ -70,12 +73,15 @@ def build(self, x): """ type = self._check_input(x) if type == 'str': - x += next(self._terminalSymbolsGenerator()) + x += next(self._terminal_symbols_generator()) self._build(x) if type == 'list': self._build_generalized(x) def _build(self, x): + """ + Builds suffix tree with string. + """ self.word = x self._build_McCreight(x) @@ -99,6 +105,10 @@ def _build_McCreight(self, x): d = 0 def _create_node(self, x, u, d): + """ + Creates node for the suffix tree + with transition links. + """ i = u.idx p = u.parent v = SuffixNode(idx=i, depth=d) @@ -109,6 +119,10 @@ def _create_node(self, x, u, d): return v def _create_leaf(self, x, i, u, d): + """ + Creates the leaf node for the + suffix tree. + """ w = SuffixNode() w.idx = i w.depth = len(x) - i @@ -126,7 +140,11 @@ def _compute_slink(self, x, u): u._add_suffix_link(v) def _build_generalized(self, xs): - terminal_gen = self._terminalSymbolsGenerator() + """ + Builds the generalized suffix tree with list + of string. 
+ """ + terminal_gen = self._terminal_symbols_generator() _xs = ''.join([x + next(terminal_gen) for x in xs]) self.word = _xs self._generalized_word_starts(xs) @@ -134,6 +152,10 @@ def _build_generalized(self, xs): self.root._traverse(self._label_generalized) def _label_generalized(self, node): + """ + Helper method that labels the nodes of GST with + indexes of strings found in their descendants. + """ if node.is_leaf(): x = {self._get_word_start_index(node.idx)} else: @@ -141,6 +163,10 @@ def _label_generalized(self, node): node.generalized_idxs = x def _get_word_start_index(self, idx): + """ + Helper method that returns the index of the + string based on node's starting index. + """ i = 0 for _idx in self.word_starts[1:]: if idx < _idx: @@ -174,6 +200,10 @@ def longest_common_substring(self, stringIdxs = -1): return self.word[start:end] def _find_lcs(self, node, stringIdxs): + """ + Helper method for longest common substring + of the labelled Generalized suffix tree. + """ nodes = [self._find_lcs(n, stringIdxs) for n in node.transition_links.values() if n.generalized_idxs.issuperset(stringIdxs)] @@ -183,6 +213,10 @@ def _find_lcs(self, node, stringIdxs): return deepestNode def _generalized_word_starts(self, xs): + """ + Helper method finding the starting indexes + of strings in Generalized suffix tree. 
+ """ self.word_starts = [] i = 0 for n in range(len(xs)): @@ -207,7 +241,7 @@ def find(self, y): """ node = self.root while True: - edge = self._edgeLabel(node, node.parent) + edge = self._edge_label(node, node.parent) if edge.startswith(y): return node.idx @@ -244,7 +278,7 @@ def find_all(self, y): """ node = self.root while True: - edge = self._edgeLabel(node, node.parent) + edge = self._edge_label(node, node.parent) if edge.startswith(y): break i = 0 @@ -263,11 +297,20 @@ def find_all(self, y): leaves = node._get_leaves() return {n.idx for n in leaves} - def _edgeLabel(self, node, parent): + def _edge_label(self, node, parent): + """ + Helper method returns the edge label + between a node and its parent. + """ return self.word[node.idx + parent.depth: node.idx + node.depth] - def _terminalSymbolsGenerator(self): - UPPAs = list(list(range(0xE000, 0xF8FF+1)) + list(range(0xF0000, 0xFFFFD+1)) + list(range(0x100000, 0x10FFFD+1))) - for i in UPPAs: + def _terminal_symbols_generator(self): + """ + Generator of unique terminal symbols used for building the Generalized Suffix Tree. + Unicode Private Use Area is used to ensure that terminal symbols are not part + of the input string. + """ + unicode = list(list(range(0xE000, 0xF8FF+1)) + list(range(0xF0000, 0xFFFFD+1)) + list(range(0x100000, 0x10FFFD+1))) + for i in unicode: yield (chr(i)) raise ValueError("To many input strings.")