fixes various typos and mishaps

stanfordnlp · Nov 5, 2023 · bc0e41c
1 parent d13e5d4
commit bc0e41c
Show file tree

Hide file tree

Showing 3 changed files with 13 additions and 12 deletions.
diff --git a/stanza/models/tokenization/utils.py b/stanza/models/tokenization/utils.py
@@ -381,8 +381,13 @@ def postprocess_doc(doc, postprocessor, orig_text=None):
         corrected_words.append(sent_words)
         corrected_mwts.append(sent_mwts)
         corrected_expansions.append(sent_expansions)
+
+    # check postprocessor output
+    token_lens = [len(i) for i in corrected_words]
+    mwt_lens = [len(i) for i in corrected_mwts]
+    assert token_lens == mwt_lens, "Postprocessor returned token and MWT lists of different length! Token list lengths %s, MWT list lengths %s" % (token_lens, mwt_lens)
 
-    # recassemble document. offsets and oov shouldn't change
+    # reassemble document. offsets and oov shouldn't change
     doc = reassemble_doc_from_tokens(corrected_words, corrected_mwts,
                                      corrected_expansions, raw_text)
 
@@ -396,10 +401,10 @@ def reassemble_doc_from_tokens(tokens, mwts, expansions, raw_text):
     tokens : List[List[str]]
         A list of sentences, which includes string tokens.
     mwts : List[List[bool]]
-        Whether or not each of the tokens are MWTs to be analyzed by
-        the MWT raw.
-    mwts : List[List[List[str}]]
-        A list of possible expansions for MWTs
+        Whether or not each of the tokens is an MWT to be analyzed by the MWT system.
+    expansions : List[List[Optional[List[str]]]]
+        A list of possible expansions for MWTs, or None if no user-defined expansion
+        is given.
     parser_text : str
         The raw text off of which we can compare offsets.
 

diff --git a/stanza/pipeline/mwt_processor.py b/stanza/pipeline/mwt_processor.py
@@ -3,7 +3,6 @@
 """
 
 import io
-from stanza.resources.common import process_pipeline_parameters
 
 import torch
 

diff --git a/stanza/tests/tokenization/test_tokenize_utils.py b/stanza/tests/tokenization/test_tokenize_utils.py
@@ -148,15 +148,12 @@ def test_reassembly_reference_failures():
     text = "Joe Smith lives in California."
 
     with pytest.raises(ValueError):
-        utils.reassemble_doc_from_tokens(bad_addition_tokenization, bad_addition_mwts,
-                                         bad_addition_expansions, text)
+        utils.reassemble_doc_from_tokens(bad_addition_tokenization, bad_addition_mwts, bad_addition_expansions, text)
 
     with pytest.raises(ValueError):
-        utils.reassemble_doc_from_tokens(bad_inline_tokenization, bad_inline_mwts,
-                                         bad_inline_mwts, text)
+        utils.reassemble_doc_from_tokens(bad_inline_tokenization, bad_inline_mwts, bad_inline_mwts, text)
 
-    utils.reassemble_doc_from_tokens(good_tokenization, good_mwts,
-                                     good_expansions, text)
+    utils.reassemble_doc_from_tokens(good_tokenization, good_mwts, good_expansions, text)
-Original file line number
+Diff line change
@@ Expand Up / @@ -3,7 +3,6 @@ @@
     """
     import io
-    from stanza.resources.common import process_pipeline_parameters
     import torch
@@ Expand Down @@