diff --git a/stanza/tests/pipeline/test_tokenizer.py b/stanza/tests/pipeline/test_tokenizer.py
index c0dcc5fa81..949db54ac6 100644
--- a/stanza/tests/pipeline/test_tokenizer.py
+++ b/stanza/tests/pipeline/test_tokenizer.py
@@ -26,6 +26,32 @@
 ]>
 ]>
+]>
+]>
+]>
+]>
+]>
+]>
+]>
+""".strip()
+
+# spaCy doesn't have MWT
+EN_DOC_SPACY_TOKENS = """
+]>
+]>
+]>
+]>
+]>
+]>
+
+]>
+]>
+]>
+]>
+]>
+]>
+]>
+
 ]>
 ]>
 ]>
 
@@ -357,7 +383,7 @@ def test_spacy():
     # make sure the loaded tokenizer is actually spacy
     assert "SpacyTokenizer" == nlp.processors['tokenize']._variant.__class__.__name__
 
-    assert EN_DOC_GOLD_TOKENS == '\n\n'.join([sent.tokens_string() for sent in doc.sentences])
+    assert EN_DOC_SPACY_TOKENS == '\n\n'.join([sent.tokens_string() for sent in doc.sentences])
     assert all([doc.text[token._start_char: token._end_char] == token.text for sent in doc.sentences for token in sent.tokens])
 
 def test_spacy_no_ssplit():
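
For reviewers who want to reproduce the difference by hand, a minimal sketch follows (not part of the patch). It assumes stanza and spaCy with an English model are installed and that English models have been downloaded; the sample sentence and its output are illustrative, not the test's fixtures.

# Minimal sketch of exercising the spaCy-backed tokenizer under test;
# assumes `pip install stanza spacy` plus an English spaCy model, and
# `stanza.download('en')` has been run beforehand.
import stanza

# tokenize_with_spacy=True swaps in the spaCy tokenizer variant that
# test_spacy() checks for ("SpacyTokenizer").
nlp = stanza.Pipeline(lang='en', processors='tokenize', tokenize_with_spacy=True)
doc = nlp("Joe's favorite food is pizza.")  # sample text, not the test's EN_DOC

# spaCy has no multi-word tokens (MWT), so a clitic like "Joe's" comes back
# as two plain tokens rather than one MWT token spanning two words, which is
# why the test now compares against EN_DOC_SPACY_TOKENS instead of
# EN_DOC_GOLD_TOKENS.
print('\n\n'.join(sent.tokens_string() for sent in doc.sentences))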