Skip to content

Commit

Permalink
fixes various typos and mishaps
Browse files Browse the repository at this point in the history
  • Loading branch information
Jemoka committed Nov 5, 2023
1 parent d13e5d4 commit bc0e41c
Show file tree
Hide file tree
Showing 3 changed files with 13 additions and 12 deletions.
15 changes: 10 additions & 5 deletions stanza/models/tokenization/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -381,8 +381,13 @@ def postprocess_doc(doc, postprocessor, orig_text=None):
corrected_words.append(sent_words)
corrected_mwts.append(sent_mwts)
corrected_expansions.append(sent_expansions)

# check postprocessor output
token_lens = [len(i) for i in corrected_words]
mwt_lens = [len(i) for i in corrected_mwts]
assert token_lens == mwt_lens, "Postprocessor returned token and MWT lists of different length! Token list lengths %s, MWT list lengths %s" % (token_lens, mwt_lens)

# recassemble document. offsets and oov shouldn't change
# reassemble document. offsets and oov shouldn't change
doc = reassemble_doc_from_tokens(corrected_words, corrected_mwts,
corrected_expansions, raw_text)

Expand All @@ -396,10 +401,10 @@ def reassemble_doc_from_tokens(tokens, mwts, expansions, raw_text):
tokens : List[List[str]]
A list of sentences, which includes string tokens.
mwts : List[List[bool]]
Whether or not each of the tokens are MWTs to be analyzed by
the MWT raw.
mwts : List[List[List[str}]]
A list of possible expansions for MWTs
Whether or not each of the tokens is an MWT to be analyzed by the MWT system.
expansions : List[List[Optional[List[str]]]]
A list of possible expansions for MWTs, or None if no user-defined expansion
is given.
raw_text : str
The raw text off of which we can compare offsets.
Expand Down
1 change: 0 additions & 1 deletion stanza/pipeline/mwt_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
"""

import io
from stanza.resources.common import process_pipeline_parameters

import torch

Expand Down
9 changes: 3 additions & 6 deletions stanza/tests/tokenization/test_tokenize_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,15 +148,12 @@ def test_reassembly_reference_failures():
text = "Joe Smith lives in California."

with pytest.raises(ValueError):
utils.reassemble_doc_from_tokens(bad_addition_tokenization, bad_addition_mwts,
bad_addition_expansions, text)
utils.reassemble_doc_from_tokens(bad_addition_tokenization, bad_addition_mwts, bad_addition_expansions, text)

with pytest.raises(ValueError):
utils.reassemble_doc_from_tokens(bad_inline_tokenization, bad_inline_mwts,
bad_inline_mwts, text)
utils.reassemble_doc_from_tokens(bad_inline_tokenization, bad_inline_mwts, bad_inline_mwts, text)

utils.reassemble_doc_from_tokens(good_tokenization, good_mwts,
good_expansions, text)
utils.reassemble_doc_from_tokens(good_tokenization, good_mwts, good_expansions, text)



Expand Down

0 comments on commit bc0e41c

Please sign in to comment.