Skip to content

Commit

Permalink
Restore the convert_dict method, as requested in #1329
Browse files Browse the repository at this point in the history
  • Loading branch information
AngledLuffa committed Jan 11, 2024
1 parent c4c3de5 commit bea2805
Show file tree
Hide file tree
Showing 2 changed files with 36 additions and 0 deletions.
19 changes: 19 additions & 0 deletions stanza/tests/common/test_data_conversion.py
Original file line number Diff line number Diff line change
Expand Up @@ -499,3 +499,22 @@ def test_read_multiple_doc_ids():
assert len(docs[1].sentences) == 1
assert len(docs[2].sentences) == 2

# Single-sentence CoNLL-U fixture ("This is a test") consumed by test_convert_dict.
# The literal starts with a newline for readability; .lstrip() removes it so the
# resulting string begins directly with the "# text" comment line.
# NOTE(review): the ten columns of each token row appear space-separated in this
# view, while CoNLL-U columns are normally tab-separated — confirm the file on
# disk uses real tab characters.
ENGLISH_TEST_SENTENCE = """
# text = This is a test
# sent_id = 0
1 This this PRON DT Number=Sing|PronType=Dem 4 nsubj _ start_char=0|end_char=4
2 is be AUX VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 4 cop _ start_char=5|end_char=7
3 a a DET DT Definite=Ind|PronType=Art 4 det _ start_char=8|end_char=9
4 test test NOUN NN Number=Sing 0 root _ start_char=10|end_char=14|SpaceAfter=No
""".lstrip()

def test_convert_dict():
    """
    Round-trip the English fixture through conll2doc -> to_dict -> convert_dict.

    The result should be one sentence whose rows are the ten CoNLL-U columns
    of each token, all as strings, with no comment lines.
    """
    # Note the MISC column of token 4: SpaceAfter=No is expected first here,
    # whereas the fixture text lists it after start_char/end_char.
    expected_rows = [
        ['1', 'This', 'this', 'PRON', 'DT', 'Number=Sing|PronType=Dem', '4', 'nsubj', '_', 'start_char=0|end_char=4'],
        ['2', 'is', 'be', 'AUX', 'VBZ', 'Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin', '4', 'cop', '_', 'start_char=5|end_char=7'],
        ['3', 'a', 'a', 'DET', 'DT', 'Definite=Ind|PronType=Art', '4', 'det', '_', 'start_char=8|end_char=9'],
        ['4', 'test', 'test', 'NOUN', 'NN', 'Number=Sing', '0', 'root', '_', 'SpaceAfter=No|start_char=10|end_char=14'],
    ]

    parsed = CoNLL.conll2doc(input_str=ENGLISH_TEST_SENTENCE)
    as_dict = parsed.to_dict()
    result = CoNLL.convert_dict(as_dict)

    # The outer list holds sentences; the fixture contains exactly one.
    assert result == [expected_rows]
17 changes: 17 additions & 0 deletions stanza/utils/conll.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,23 @@ def convert_conll(doc_conll):
doc_empty.append(sent_empty)
return doc_dict, doc_empty

@staticmethod
def convert_dict(doc_dict):
    """ Convert the dictionary format input data to the CoNLL-U format output data.

    This is the reverse function of `convert_conll`, but does not include sentence level annotations or comments.

    Can call this on a Document using `CoNLL.convert_dict(doc.to_dict())`

    Input: dictionary format data, which is a list of list of dictionaries for each token in each sentence in the data.
    Output: CoNLL-U format data as a list of list of list for each token in each sentence in the data.
            An empty list is returned when the document renders to no text at all.
    """
    doc = Document(doc_dict)
    # The "c" format spec renders the document as CoNLL-U text; sentences are
    # separated by a blank line, tokens by newlines, columns by tabs.
    text = "{:c}".format(doc)
    if not text:
        # "".split("\n\n") is [""], which would otherwise come back as a
        # bogus single sentence holding a single empty token: [[[""]]]
        return []
    doc_conll = [
        [token_line.split("\t") for token_line in sentence.split("\n")]
        for sentence in text.split("\n\n")
    ]
    return doc_conll

@staticmethod
def convert_conll_token(token_conll):
""" Convert the CoNLL-U format input token to the dictionary format output token.
Expand Down

0 comments on commit bea2805

Please sign in to comment.