Skip to content

Commit

Permalink
Another attempt at doing SpacesAfter in the #text
Browse files Browse the repository at this point in the history
  • Loading branch information
AngledLuffa committed Oct 28, 2023
1 parent 609806d commit 4256ae6
Show file tree
Hide file tree
Showing 3 changed files with 15 additions and 18 deletions.
15 changes: 0 additions & 15 deletions stanza/models/common/doc.py
Original file line number Diff line number Diff line change
Expand Up @@ -1023,21 +1023,6 @@ def _is_null(self, value):
def is_mwt(self):
    """Return True when this token is made up of more than one word."""
    word_count = len(self.words)
    return word_count > 1

def space_after(self):
    """Return the whitespace that follows this token in the original text.

    The token's own ``misc`` and the ``misc`` of its last word are both
    consulted; each is treated as a ``|``-separated list of annotations.

    Precedence:
      1. ``SpaceAfter=No`` on the token or on its last word -> ``""``
      2. ``SpacesAfter=<value>`` on the token, then on its last word ->
         the value with its escape sequences decoded to real characters
      3. otherwise a single space

    Returns:
        str: the literal whitespace following the token.
    """
    # Escape sequences used for SpacesAfter values (a value cannot contain
    # raw whitespace or '|', so those characters are written escaped).
    escapes = {"s": " ", "t": "\t", "n": "\n", "r": "\r", "p": "|", "\\": "\\"}

    def unescape(value):
        # Decode left to right so that an escaped backslash is never
        # re-read as the start of another escape sequence.
        decoded = []
        pos = 0
        while pos < len(value):
            following = value[pos + 1:pos + 2]
            if value[pos] == "\\" and following in escapes:
                decoded.append(escapes[following])
                pos += 2
            else:
                # Unknown or dangling backslashes are kept verbatim.
                decoded.append(value[pos])
                pos += 1
        return "".join(decoded)

    # A token with no words simply has no word-level annotation to check.
    last_word_misc = self.words[-1].misc if self.words else None
    annotation_lists = [misc.split("|")
                        for misc in (self.misc, last_word_misc)
                        if misc]

    # SpaceAfter=No anywhere wins over any SpacesAfter annotation.
    if any("SpaceAfter=No" in pieces for pieces in annotation_lists):
        return ""

    # Token-level SpacesAfter is checked before the last word's.
    for pieces in annotation_lists:
        for piece in pieces:
            if piece.startswith("SpacesAfter="):
                return unescape(piece.split("=", maxsplit=1)[1])
    return " "

class Word(StanzaObject):
""" A word class that stores attributes of a word.
"""
Expand Down
3 changes: 3 additions & 0 deletions stanza/server/java_protobuf_requests.py
Original file line number Diff line number Diff line change
Expand Up @@ -380,6 +380,9 @@ def misc_to_space_after(misc):
elif misc_space[pos:pos+2] == '\\\\':
spaces.append('\\')
pos += 2
elif misc_space[pos:pos+3] == '" "':
spaces.append(' ')
pos += 3
else:
spaces.append(misc_space[pos])
pos += 1
Expand Down
15 changes: 12 additions & 3 deletions stanza/server/ssurgeon.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,9 +184,18 @@ def convert_response_to_doc(doc, semgrex_response):
old_comments = list(sentence.comments)
sentence = Sentence(mwt_tokens, doc)

token_text = [token.text if token_idx == len(sentence.tokens) - 1
else token.text + token.space_after().replace("\n", "")
for token_idx, token in enumerate(sentence.tokens)]
token_text = []
for token_idx, token in enumerate(sentence.tokens):
token_text.append(token.text)
if token_idx == len(sentence.tokens) - 1:
break
token_space_after = misc_to_space_after(token.misc)
if token_space_after == ' ':
# in some treebanks, the word might have more interesting
# space after annotations than the token
token_space_after = misc_to_space_after(word.misc)
token_text.append(token_space_after)

sentence_text = "".join(token_text)

for comment in old_comments:
Expand Down

0 comments on commit 4256ae6

Please sign in to comment.