diff --git a/stanza/models/common/doc.py b/stanza/models/common/doc.py index 8750cb2bea..3e33567b8c 100644 --- a/stanza/models/common/doc.py +++ b/stanza/models/common/doc.py @@ -1023,21 +1023,6 @@ def _is_null(self, value): def is_mwt(self): return len(self.words) > 1 - def space_after(self): - if self.misc and "SpaceAfter=No" in self.misc.split("|"): - return "" - if self.words[-1].misc and "SpaceAfter=No" in self.words[-1].misc.split("|"): - return "" - if self.misc: - for piece in self.misc.split("|"): - if piece.startswith("SpacesAfter="): - return piece.split("=", maxsplit=1)[1] - if self.words[-1].misc: - for piece in self.words[-1].misc.split("|"): - if piece.startswith("SpacesAfter="): - return piece.split("=", maxsplit=1)[1] - return " " - class Word(StanzaObject): """ A word class that stores attributes of a word. """ diff --git a/stanza/server/java_protobuf_requests.py b/stanza/server/java_protobuf_requests.py index fedccb755d..d825f00047 100644 --- a/stanza/server/java_protobuf_requests.py +++ b/stanza/server/java_protobuf_requests.py @@ -380,6 +380,9 @@ def misc_to_space_after(misc): elif misc_space[pos:pos+2] == '\\\\': spaces.append('\\') pos += 2 + elif misc_space[pos:pos+3] == '" "': + spaces.append(' ') + pos += 3 else: spaces.append(misc_space[pos]) pos += 1 diff --git a/stanza/server/ssurgeon.py b/stanza/server/ssurgeon.py index a6bae16760..268df4250f 100644 --- a/stanza/server/ssurgeon.py +++ b/stanza/server/ssurgeon.py @@ -184,9 +184,18 @@ def convert_response_to_doc(doc, semgrex_response): old_comments = list(sentence.comments) sentence = Sentence(mwt_tokens, doc) - token_text = [token.text if token_idx == len(sentence.tokens) - 1 - else token.text + token.space_after().replace("\n", "") - for token_idx, token in enumerate(sentence.tokens)] + token_text = [] + for token_idx, token in enumerate(sentence.tokens): + token_text.append(token.text) + if token_idx == len(sentence.tokens) - 1: + break + token_space_after = 
misc_to_space_after(token.misc) + if token_space_after == ' ': + # in some treebanks, the word might have more interesting + # space after annotations than the token + token_space_after = misc_to_space_after(token.words[-1].misc) + token_text.append(token_space_after) + sentence_text = "".join(token_text) for comment in old_comments: