Filter subtoken matches in merge_subtokens() (explosion#4539)
The `Matcher` in `merge_subtokens()` returns all possible contiguous
subsequences of `subtok` tokens, so for runs of two or more subtoks the
matches have to be filtered so that the retokenizer only merges the
longest, non-overlapping spans.
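
For illustration, a run of three `subtok` tokens produces a match for every contiguous subsequence, and `filter_spans()` keeps only the longest non-overlapping span. A minimal sketch using the sentence from the new test below, with the candidate spans written out by hand rather than produced by the `Matcher` (span construction mirrors the `doc[start : end + 1]` in the diff):

from spacy.tokens import Doc
from spacy.util import filter_spans
from spacy.vocab import Vocab

# "And", "a", "third" carry the subtok dependency in the test below, so the
# pattern [{"DEP": "subtok", "op": "+"}] yields six overlapping matches;
# after extending each match by one token, the candidate spans are:
doc = Doc(Vocab(), words=["And", "a", "third", "."])
spans = [doc[0:2], doc[0:3], doc[0:4], doc[1:3], doc[1:4], doc[2:4]]

# filter_spans() prefers longer spans and drops anything that overlaps a
# span already kept, so only the full four-token span reaches the retokenizer.
assert [s.text for s in filter_spans(spans)] == ["And a third ."]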
adrianeboyd authored and ines committed Oct 28, 2019
1 parent d5509e0 commit f2bfaa1
Showing 2 changed files with 26 additions and 1 deletion.
3 changes: 2 additions & 1 deletion spacy/pipeline/functions.py
@@ -3,6 +3,7 @@
 
 from ..language import component
 from ..matcher import Matcher
+from ..util import filter_spans
 
 
 @component(
@@ -60,7 +61,7 @@ def merge_subtokens(doc, label="subtok"):
     merger = Matcher(doc.vocab)
     merger.add("SUBTOK", None, [{"DEP": label, "op": "+"}])
     matches = merger(doc)
-    spans = [doc[start : end + 1] for _, start, end in matches]
+    spans = filter_spans([doc[start : end + 1] for _, start, end in matches])
     with doc.retokenize() as retokenizer:
         for span in spans:
             retokenizer.merge(span)
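Since `merge_subtokens` is a plain function component, it can be added to an existing pipeline directly. A minimal usage sketch, assuming a loaded model whose parser actually predicts `subtok` dependencies (the model name here is a placeholder, not a real package):

import spacy
from spacy.pipeline.functions import merge_subtokens

nlp = spacy.load("xx_model_with_subtoks")  # placeholder model name
nlp.add_pipe(merge_subtokens, after="parser")
doc = nlp("Some text whose parse contains subtok labels.")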
24 changes: 24 additions & 0 deletions spacy/tests/pipeline/test_functions.py
@@ -0,0 +1,24 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import pytest
+from spacy.pipeline.functions import merge_subtokens
+from ..util import get_doc
+
+
+@pytest.fixture
+def doc(en_tokenizer):
+    # fmt: off
+    text = "This is a sentence. This is another sentence. And a third."
+    heads = [1, 0, 1, -2, -3, 1, 0, 1, -2, -3, 1, 1, 1, 0]
+    deps = ["nsubj", "ROOT", "subtok", "attr", "punct", "nsubj", "ROOT",
+            "subtok", "attr", "punct", "subtok", "subtok", "subtok", "ROOT"]
+    # fmt: on
+    tokens = en_tokenizer(text)
+    return get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
+
+
+def test_merge_subtokens(doc):
+    doc = merge_subtokens(doc)
+    # get_doc() doesn't set spaces, so the result is "And a third ."
+    assert [t.text for t in doc] == ["This", "is", "a sentence", ".", "This", "is", "another sentence", ".", "And a third ."]
