Filter subtoken matches in merge_subtokens() (explosion#4539)
The `Matcher` in `merge_subtokens()` returns all possible contiguous
subsequences of `subtok` tokens, so for runs of two or more subtoks the
matches have to be filtered so that the retokenizer only merges the
longest, non-overlapping spans.
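
For illustration, a run of three `subtok` tokens produces a match for every contiguous subsequence, and `filter_spans()` keeps only the longest non-overlapping span. A minimal sketch using the sentence from the new test below, with the candidate spans written out by hand rather than produced by the `Matcher` (span construction mirrors the `doc[start : end + 1]` in the diff):

from spacy.tokens import Doc
from spacy.util import filter_spans
from spacy.vocab import Vocab

# "And", "a", "third" carry the subtok dependency in the test below, so the
# pattern [{"DEP": "subtok", "op": "+"}] yields six overlapping matches;
# after extending each match by one token, the candidate spans are:
doc = Doc(Vocab(), words=["And", "a", "third", "."])
spans = [doc[0:2], doc[0:3], doc[0:4], doc[1:3], doc[1:4], doc[2:4]]

# filter_spans() prefers longer spans and drops anything that overlaps a
# span already kept, so only the full four-token span reaches the retokenizer.
assert [s.text for s in filter_spans(spans)] == ["And a third ."]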
adrianeboyd authored and ines committed Oct 28, 2019
1 parent d5509e0 commit f2bfaa1
Showing 2 changed files with 26 additions and 1 deletion.
3 changes: 2 additions & 1 deletion spacy/pipeline/functions.py
@@ -3,6 +3,7 @@
 
 from ..language import component
 from ..matcher import Matcher
+from ..util import filter_spans
 
 
 @component(
@@ -60,7 +61,7 @@ def merge_subtokens(doc, label="subtok"):
     merger = Matcher(doc.vocab)
     merger.add("SUBTOK", None, [{"DEP": label, "op": "+"}])
     matches = merger(doc)
-    spans = [doc[start : end + 1] for _, start, end in matches]
+    spans = filter_spans([doc[start : end + 1] for _, start, end in matches])
     with doc.retokenize() as retokenizer:
         for span in spans:
             retokenizer.merge(span)
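Since `merge_subtokens` is a plain function component, it can be added to an existing pipeline directly. A minimal usage sketch, assuming a loaded model whose parser actually predicts `subtok` dependencies (the model name here is a placeholder, not a real package):

import spacy
from spacy.pipeline.functions import merge_subtokens

nlp = spacy.load("xx_model_with_subtoks")  # placeholder model name
nlp.add_pipe(merge_subtokens, after="parser")
doc = nlp("Some text whose parse contains subtok labels.")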
24 changes: 24 additions & 0 deletions spacy/tests/pipeline/test_functions.py
@@ -0,0 +1,24 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import pytest
+from spacy.pipeline.functions import merge_subtokens
+from ..util import get_doc
+
+
+@pytest.fixture
+def doc(en_tokenizer):
+    # fmt: off
+    text = "This is a sentence. This is another sentence. And a third."
+    heads = [1, 0, 1, -2, -3, 1, 0, 1, -2, -3, 1, 1, 1, 0]
+    deps = ["nsubj", "ROOT", "subtok", "attr", "punct", "nsubj", "ROOT",
+            "subtok", "attr", "punct", "subtok", "subtok", "subtok", "ROOT"]
+    # fmt: on
+    tokens = en_tokenizer(text)
+    return get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
+
+
+def test_merge_subtokens(doc):
+    doc = merge_subtokens(doc)
+    # get_doc() doesn't set spaces, so the result is "And a third ."
+    assert [t.text for t in doc] == ["This", "is", "a sentence", ".", "This", "is", "another sentence", ".", "And a third ."]
