Skip to content

Commit

Permalink
Translate NBSP to and from its UD escape code (UniversalDependencies/…
Browse files Browse the repository at this point in the history
  • Loading branch information
AngledLuffa committed Nov 1, 2023
1 parent 3c5b625 commit 81a73a8
Show file tree
Hide file tree
Showing 2 changed files with 36 additions and 2 deletions.
6 changes: 4 additions & 2 deletions stanza/server/java_protobuf_requests.py
Original file line number Diff line number Diff line change
Expand Up @@ -329,6 +329,8 @@ def space_after_to_misc(space):
spaces.append('\\p')
elif char == '\\':
spaces.append('\\\\')
elif char == ' ':
spaces.append('\\u00A0')
else:
spaces.append(char)
space_after = "".join(spaces)
Expand Down Expand Up @@ -380,9 +382,9 @@ def misc_to_space_after(misc):
elif misc_space[pos:pos+2] == '\\\\':
spaces.append('\\')
pos += 2
elif misc_space[pos:pos+3] == '" "':
elif misc_space[pos:pos+6] == '\\u00A0':
spaces.append(' ')
pos += 3
pos += 6
else:
spaces.append(misc_space[pos])
pos += 1
Expand Down
32 changes: 32 additions & 0 deletions stanza/tests/server/test_java_protobuf_requests.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,3 +55,35 @@ def test_convert_networkx_graph():
edges = sorted(edges)
expected_edges = [(1, 'cc'), (3, 'obl'), (5, 'advmod'), (6, 'obl'), (7, 'punct')]
assert edges == expected_edges

# CoNLL-U fixture: a single English sentence in which token 13 ("have") is
# followed by a non-breaking space instead of an ordinary space.
#
# Every whitespace character that carries meaning is spelled as an escape so
# it cannot be silently normalized by an editor, a diff viewer or copy/paste:
#   * "\t" separates the ten CoNLL-U columns of each token line
#   * "\u00A0" is the actual NBSP character inside the "# text" comment
#   * "\\u00A0" (escaped backslash) in the MISC column is the six-character
#     UD escape code for NBSP, not the NBSP character itself
ENGLISH_NBSP_SAMPLE = """
# sent_id = newsgroup-groups.google.com_n3td3v_e874a1e5eb995654_ENG_20060120_052200-0011
# text = Please note that neither the e-mail address nor name of the sender have\u00A0been verified.
1\tPlease\tplease\tINTJ\tUH\t_\t2\tdiscourse\t_\t_
2\tnote\tnote\tVERB\tVB\tMood=Imp|VerbForm=Fin\t0\troot\t_\t_
3\tthat\tthat\tSCONJ\tIN\t_\t15\tmark\t_\t_
4\tneither\tneither\tCCONJ\tCC\t_\t7\tcc:preconj\t_\t_
5\tthe\tthe\tDET\tDT\tDefinite=Def|PronType=Art\t7\tdet\t_\t_
6\te-mail\te-mail\tNOUN\tNN\tNumber=Sing\t7\tcompound\t_\t_
7\taddress\taddress\tNOUN\tNN\tNumber=Sing\t15\tnsubj:pass\t_\t_
8\tnor\tnor\tCCONJ\tCC\t_\t9\tcc\t_\t_
9\tname\tname\tNOUN\tNN\tNumber=Sing\t7\tconj\t_\t_
10\tof\tof\tADP\tIN\t_\t12\tcase\t_\t_
11\tthe\tthe\tDET\tDT\tDefinite=Def|PronType=Art\t12\tdet\t_\t_
12\tsender\tsender\tNOUN\tNN\tNumber=Sing\t7\tnmod\t_\t_
13\thave\thave\tAUX\tVBP\tMood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin\t15\taux\t_\tSpacesAfter=\\u00A0
14\tbeen\tbe\tAUX\tVBN\tTense=Past|VerbForm=Part\t15\taux:pass\t_\t_
15\tverified\tverify\tVERB\tVBN\tTense=Past|VerbForm=Part|Voice=Pass\t2\tccomp\t_\tSpaceAfter=No
16\t.\t.\tPUNCT\t.\t_\t2\tpunct\t_\t_
""".lstrip()


def test_nbsp_doc():
    """
    Test that the space conversion methods will convert to and from NBSP
    """
    # The real non-breaking space character, written as an escape so that it
    # is visibly different from an ordinary space in the source.
    nbsp = "\u00A0"
    # The UD escape code for NBSP as it appears in the MISC column:
    # a literal backslash followed by "u00A0".
    escaped_misc = "SpacesAfter=\\u00A0"

    doc = CoNLL.conll2doc(input_str=ENGLISH_NBSP_SAMPLE)
    sentence = doc.sentences[0]
    # words is 0-indexed, so index 12 is token 13, "have".
    have = sentence.words[12]

    assert sentence.text == "Please note that neither the e-mail address nor name of the sender have\u00A0been verified."
    assert have.misc == escaped_misc
    # escape code -> character, and character -> escape code
    assert java_protobuf_requests.misc_to_space_after(have.misc) == nbsp
    assert java_protobuf_requests.space_after_to_misc(nbsp) == escaped_misc

0 comments on commit 81a73a8

Please sign in to comment.