Skip to content

Commit

Permalink
Fix realloc in retokenizer.split() (explosion#4606)
Browse files Browse the repository at this point in the history
Always realloc to a size larger than `doc.max_length` in
`retokenizer.split()` (or cymem will throw errors).
  • Loading branch information
adrianeboyd authored and honnibal committed Nov 11, 2019
1 parent f415e9b commit 91f89f9
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 1 deletion.
15 changes: 15 additions & 0 deletions spacy/tests/doc/test_retokenize_split.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,3 +183,18 @@ def test_doc_retokenizer_split_lex_attrs(en_vocab):
retokenizer.split(doc[0], ["Los", "Angeles"], heads, attrs=attrs)
assert doc[0].is_stop
assert not doc[1].is_stop


def test_doc_retokenizer_realloc(en_vocab):
    """#4604: splitting one token into more pieces than the doc has tokens must realloc cleanly."""
    text = "Hyperglycemic adverse events following antipsychotic drug administration in the"
    words = text.split()
    # Exercise the split twice: first without the final word, then with every word.
    for doc_words in (words[:-1], words):
        doc = Doc(en_vocab, words=doc_words)
        with doc.retokenize() as retokenizer:
            first = doc[0]
            # One subtoken per character, each attached to the original first token.
            pieces = list(first.text)
            heads = [(first, 0)] * len(first)
            retokenizer.split(doc[first.i], pieces, heads=heads)
2 changes: 1 addition & 1 deletion spacy/tokens/_retokenize.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -329,7 +329,7 @@ def _split(Doc doc, int token_index, orths, heads, attrs):
doc.c[i].head += offset
# Double doc.c max_length if necessary (until big enough for all new tokens)
while doc.length + nb_subtokens - 1 >= doc.max_length:
- doc._realloc(doc.length * 2)
+ doc._realloc(doc.max_length * 2)
# Move tokens after the split to create space for the new tokens
doc.length = len(doc) + nb_subtokens -1
to_process_tensor = (doc.tensor is not None and doc.tensor.size != 0)
Expand Down

0 comments on commit 91f89f9

Please sign in to comment.