diff --git a/stanza/models/tokenization/utils.py b/stanza/models/tokenization/utils.py
index c868d4321d..52096e0647 100644
--- a/stanza/models/tokenization/utils.py
+++ b/stanza/models/tokenization/utils.py
@@ -381,8 +381,13 @@ def postprocess_doc(doc, postprocessor, orig_text=None):
         corrected_words.append(sent_words)
         corrected_mwts.append(sent_mwts)
         corrected_expansions.append(sent_expansions)
+
+    # check postprocessor output
+    token_lens = [len(i) for i in corrected_words]
+    mwt_lens = [len(i) for i in corrected_mwts]
+    assert token_lens == mwt_lens, "Postprocessor returned token and MWT lists of different length! Token list lengths %s, MWT list lengths %s" % (token_lens, mwt_lens)
 
-    # recassemble document. offsets and oov shouldn't change
+    # reassemble document. offsets and oov shouldn't change
     doc = reassemble_doc_from_tokens(corrected_words, corrected_mwts, corrected_expansions, raw_text)
@@ -396,10 +401,10 @@ def reassemble_doc_from_tokens(tokens, mwts, expansions, raw_text):
     tokens : List[List[str]]
         A list of sentences, which includes string tokens.
     mwts : List[List[bool]]
-        Whether or not each of the tokens are MWTs to be analyzed by
-        the MWT raw.
-    mwts : List[List[List[str}]]
-        A list of possible expansions for MWTs
+        Whether or not each of the tokens are MWTs to be analyzed by the MWT system.
+    expansions : List[List[Optional[List[str]]]]
+        A list of possible expansions for MWTs, or None if no user-defined expansion
+        is given.
     parser_text : str
         The raw text off of which we can compare offsets.
diff --git a/stanza/pipeline/mwt_processor.py b/stanza/pipeline/mwt_processor.py
index 7eff70f2db..3fbfc6fd25 100644
--- a/stanza/pipeline/mwt_processor.py
+++ b/stanza/pipeline/mwt_processor.py
@@ -3,7 +3,6 @@
 """
 
 import io
-from stanza.resources.common import process_pipeline_parameters
 
 import torch
diff --git a/stanza/tests/tokenization/test_tokenize_utils.py b/stanza/tests/tokenization/test_tokenize_utils.py
index c8d2ab8ce7..a5db000046 100644
--- a/stanza/tests/tokenization/test_tokenize_utils.py
+++ b/stanza/tests/tokenization/test_tokenize_utils.py
@@ -148,15 +148,12 @@ def test_reassembly_reference_failures():
     text = "Joe Smith lives in California."
 
     with pytest.raises(ValueError):
-        utils.reassemble_doc_from_tokens(bad_addition_tokenization, bad_addition_mwts,
-                                         bad_addition_expansions, text)
+        utils.reassemble_doc_from_tokens(bad_addition_tokenization, bad_addition_mwts, bad_addition_expansions, text)
 
     with pytest.raises(ValueError):
-        utils.reassemble_doc_from_tokens(bad_inline_tokenization, bad_inline_mwts,
-                                         bad_inline_mwts, text)
+        utils.reassemble_doc_from_tokens(bad_inline_tokenization, bad_inline_mwts, bad_inline_mwts, text)
 
-    utils.reassemble_doc_from_tokens(good_tokenization, good_mwts,
-                                     good_expansions, text)
+    utils.reassemble_doc_from_tokens(good_tokenization, good_mwts, good_expansions, text)
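
For reference, a minimal sketch of the data shapes described by the updated docstring and checked by the new assertion; the sentence, token values, and variable names below are illustrative, not taken from the patch:

    from stanza.models.tokenization import utils

    text = "Joe Smith lives in California."

    # One inner list per sentence. The new assertion in postprocess_doc
    # fails unless each sentence's token list and MWT flag list have the
    # same length, i.e. len(tokens[i]) == len(mwts[i]) for every sentence.
    tokens = [["Joe", "Smith", "lives", "in", "California", "."]]
    mwts = [[False, False, False, False, False, False]]

    # One entry per token: a list of word strings to expand an MWT into,
    # or None when no user-defined expansion is given.
    expansions = [[None, None, None, None, None, None]]

    # Offsets are recovered by comparing tokens against the raw text;
    # tokens that cannot be matched in order raise a ValueError, as
    # exercised by test_reassembly_reference_failures above.
    doc = utils.reassemble_doc_from_tokens(tokens, mwts, expansions, text)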