From a1ea62c252d5fcd6f4b97acfc432b3951d62ef2d Mon Sep 17 00:00:00 2001 From: John Bauer Date: Wed, 29 Mar 2023 22:50:01 -0700 Subject: [PATCH] Send enhanced dependencies to Ssurgeon if the document has them. Attach the enhanced dependencies to the sentence after running Ssurgeon. Need to test this a bunch Do a better job of tracking if the words are the same Convert the empty words (if any) from the returned Ssurgeon and attach them to the Sentence --- stanza/models/common/doc.py | 8 +++ stanza/server/ssurgeon.py | 80 ++++++++++++++++++++++++++-- stanza/tests/server/test_ssurgeon.py | 15 +----- 3 files changed, 85 insertions(+), 18 deletions(-) diff --git a/stanza/models/common/doc.py b/stanza/models/common/doc.py index 3e33567b8c..b32d24582f 100644 --- a/stanza/models/common/doc.py +++ b/stanza/models/common/doc.py @@ -474,6 +474,14 @@ def has_enhanced_dependencies(self): """ return self._enhanced_dependencies is not None and len(self._enhanced_dependencies) > 0 + @property + def enhanced_dependencies(self): + graph = self._enhanced_dependencies + if graph is None: + graph = nx.MultiDiGraph() + self._enhanced_dependencies = graph + return graph + @property def index(self): """ diff --git a/stanza/server/ssurgeon.py b/stanza/server/ssurgeon.py index 3e62688786..ad66f974d1 100644 --- a/stanza/server/ssurgeon.py +++ b/stanza/server/ssurgeon.py @@ -9,6 +9,7 @@ import argparse import copy +import logging import os import re import sys @@ -19,8 +20,11 @@ from stanza.models.common.doc import ID, TEXT, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC, START_CHAR, END_CHAR, NER, Word, Token, Sentence +logger = logging.getLogger('stanza') + SSURGEON_JAVA = "edu.stanford.nlp.semgraph.semgrex.ssurgeon.ProcessSsurgeonRequest" + class SsurgeonEdit: def __init__(self, semgrex_pattern, ssurgeon_edits, ssurgeon_id=None, notes=None, language="UniversalEnglish"): # not a named tuple so we can have defaults without requiring a python upgrade @@ -79,6 +83,12 @@ def 
build_request(doc, ssurgeon_edits): java_protobuf_requests.add_word_to_graph(graph, word, sent_idx, word_idx) word_idx = word_idx + 1 + if sentence.has_enhanced_dependencies(): + graph = request.graph.add() + for token in sentence.tokens: + for word in token.words: + java_protobuf_requests.add_token(graph.token, word, token) + java_protobuf_requests.convert_networkx_graph(graph, sentence, sent_idx) except Exception as e: raise RuntimeError("Failed to process sentence {}:\n{:C}".format(sent_idx, sentence)) from e @@ -103,7 +113,12 @@ def process_doc_one_operation(doc, semgrex_pattern, ssurgeon_edits, ssurgeon_id= return send_ssurgeon_request(request) -def build_word_entry(word_index, graph_word): +def build_word_entry(graph_word): + if graph_word.emptyIndex: + word_index = (graph_word.index, graph_word.emptyIndex) + else: + word_index = graph_word.index + word_entry = { ID: word_index, TEXT: graph_word.word if graph_word.word else None, @@ -132,7 +147,13 @@ def build_word_entry(word_index, graph_word): def convert_response_to_doc(doc, semgrex_response): doc = copy.deepcopy(doc) try: - for sent_idx, (sentence, ssurgeon_result) in enumerate(zip(doc.sentences, semgrex_response.result)): + sent_idx = 0 + response_idx = 0 + while sent_idx < len(doc.sentences): + sentence = doc.sentences[sent_idx] + ssurgeon_result = semgrex_response.result[response_idx] + has_enhanced = sentence.has_enhanced_dependencies() + # EditNode is currently bugged... 
:/ # TODO: change this after next CoreNLP release (after 4.5.3) #if not ssurgeon_result.changed: @@ -141,7 +162,7 @@ def convert_response_to_doc(doc, semgrex_response): ssurgeon_graph = ssurgeon_result.graph tokens = [] for graph_node, graph_word in zip(ssurgeon_graph.node, ssurgeon_graph.token): - word_entry = build_word_entry(graph_node.index, graph_word) + word_entry = build_word_entry(graph_word) tokens.append(word_entry) tokens.sort(key=lambda x: x[ID]) for root in ssurgeon_graph.root: @@ -200,6 +221,57 @@ def convert_response_to_doc(doc, semgrex_response): doc.sentences[sent_idx] = sentence sentence.rebuild_dependencies() + + sent_idx += 1 + response_idx += 1 + + if has_enhanced: + enhanced_ssurgeon_graph = semgrex_response.result[response_idx].graph + response_idx += 1 + + enhanced_words_map = {} + for node_idx, node in enumerate(enhanced_ssurgeon_graph.node): + if node.emptyIndex: + continue + enhanced_words_map[node.index-1] = enhanced_ssurgeon_graph.token[node_idx].word + if any(expected_idx != idx for expected_idx, idx in enumerate(sorted(enhanced_words_map.keys()))): + logger.warning("Sentence %d had gap in indices of the enhanced graph!", sent_idx) + continue + enhanced_words = [] + for index, word in sorted(enhanced_words_map.items()): + enhanced_words.append(word) + if (len(sentence.words) != len(enhanced_words) or + any(word.text != enhanced_word for word, enhanced_word in zip(sentence.words, enhanced_words))): + logger.warning("Sentence %d had different words in the enhanced graph compared to the basic graph after running the ssurgeon!", sent_idx) + continue + # yay, the words match at a very basic level + # first need to add any extra words + empty_words = [] + for token in enhanced_ssurgeon_graph.token: + if not token.emptyIndex: + continue + word_entry = build_word_entry(token) + empty_words.append(Word(sentence, word_entry)) + sentence.empty_words = empty_words + + # next add the edges + for edge in enhanced_ssurgeon_graph.edge: + if edge.sourceEmpty: + 
source = (edge.source, edge.sourceEmpty) + else: + source = edge.source + if edge.targetEmpty: + target = (edge.target, edge.targetEmpty) + else: + target = edge.target + sentence.enhanced_dependencies.add_edge(source, target, edge.dep) + for root in enhanced_ssurgeon_graph.rootNode: + root = enhanced_ssurgeon_graph.node[root] + if root.emptyIndex: + root = (root.index, root.emptyIndex) + else: + root = root.index + sentence._enhanced_dependencies.add_edge(0, root, "root") except Exception as e: raise RuntimeError("Ssurgeon could not process sentence {}\nSsurgeon result:\n{}\nOriginal sentence:\n{:C}".format(sent_idx, ssurgeon_result, sentence)) from e return doc @@ -272,7 +344,7 @@ def main(): ssurgeon_edits = [SsurgeonEdit(args.semgrex, args.ssurgeon)] if args.input_file: - docs = [CoNLL.conll2doc(input_file=args.input_file)] + docs = [CoNLL.conll2doc(input_file=args.input_file, ignore_gapping=False)] outputs = [args.output_file] input_output = zip(docs, outputs) elif args.input_dir: diff --git a/stanza/tests/server/test_ssurgeon.py b/stanza/tests/server/test_ssurgeon.py index f18e9d74a8..2013cd4f27 100644 --- a/stanza/tests/server/test_ssurgeon.py +++ b/stanza/tests/server/test_ssurgeon.py @@ -141,19 +141,6 @@ def test_ssurgeon_become_mwt(): 6 ” " PUNCT '' _ 4 punct 4:punct _ """ -# TODO: also, we shouldn't lose the enhanced dependencies... 
-EXISTING_MWT_DOC_EXPECTED = """ -# sent_id = newsgroup-groups.google.com_GayMarriage_0ccbb50b41a5830b_ENG_20050321_181500-0005 -# text = One of “NCRC4ME’s” -1 One one NUM CD NumType=Card 0 root _ _ -2 of of ADP IN _ 4 case _ _ -3 “ " PUNCT `` _ 4 punct _ SpaceAfter=No -4-5 NCRC4ME’s _ _ _ _ _ _ _ SpaceAfter=No -4 NCRC4ME NCRC4ME PROPN NNP Number=Sing 1 compound _ _ -5 ’s 's PART POS _ 4 case _ _ -6 ” " PUNCT '' _ 4 punct _ _ -""" - def test_ssurgeon_existing_mwt_no_change(): """ Test that converting a document with an MWT works as expected @@ -176,7 +163,7 @@ def test_ssurgeon_existing_mwt_no_change(): updated_doc = ssurgeon.convert_response_to_doc(doc, ssurgeon_response) result = "{:C}".format(updated_doc) - compare_ignoring_whitespace(result, EXISTING_MWT_DOC_EXPECTED) + compare_ignoring_whitespace(result, EXISTING_MWT_DOC_INPUT) def check_empty_test(input_text, expected=None, echo=False): if expected is None: