Skip to content

Commit

Permalink
Send enhanced dependencies to Ssurgeon if the document has them.
Browse files Browse the repository at this point in the history
Attach the enhanced dependencies to the sentence after running Ssurgeon.
This still needs thorough testing

Do a better job of tracking whether the words are the same
Convert the empty words (if any) from the returned Ssurgeon and attach them to the Sentence
  • Loading branch information
AngledLuffa committed Oct 28, 2023
1 parent b0a227b commit 0f12d49
Show file tree
Hide file tree
Showing 3 changed files with 86 additions and 19 deletions.
8 changes: 8 additions & 0 deletions stanza/models/common/doc.py
Original file line number Diff line number Diff line change
Expand Up @@ -474,6 +474,14 @@ def has_enhanced_dependencies(self):
"""
return self._enhanced_dependencies is not None and len(self._enhanced_dependencies) > 0

@property
def enhanced_dependencies(self):
graph = self._enhanced_dependencies
if graph is None:
graph = nx.MultiDiGraph()
self._enhanced_dependencies = graph
return graph

@property
def index(self):
"""
Expand Down
82 changes: 77 additions & 5 deletions stanza/server/ssurgeon.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

import argparse
import copy
import logging
import os
import re
import sys
Expand All @@ -19,8 +20,11 @@

from stanza.models.common.doc import ID, TEXT, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC, START_CHAR, END_CHAR, NER, Word, Token, Sentence

logger = logging.getLogger('stanza')

SSURGEON_JAVA = "edu.stanford.nlp.semgraph.semgrex.ssurgeon.ProcessSsurgeonRequest"


class SsurgeonEdit:
def __init__(self, semgrex_pattern, ssurgeon_edits, ssurgeon_id=None, notes=None, language="UniversalEnglish"):
# not a named tuple so we can have defaults without requiring a python upgrade
Expand Down Expand Up @@ -79,6 +83,12 @@ def build_request(doc, ssurgeon_edits):
java_protobuf_requests.add_word_to_graph(graph, word, sent_idx, word_idx)

word_idx = word_idx + 1
if sentence.has_enhanced_dependencies():
graph = request.graph.add()
for token in sentence.tokens:
for word in token.words:
java_protobuf_requests.add_token(graph.token, word, token)
java_protobuf_requests.convert_networkx_graph(graph, sentence, sent_idx)
except Exception as e:
raise RuntimeError("Failed to process sentence {}:\n{:C}".format(sent_idx, sentence)) from e

Expand All @@ -103,7 +113,12 @@ def process_doc_one_operation(doc, semgrex_pattern, ssurgeon_edits, ssurgeon_id=

return send_ssurgeon_request(request)

def build_word_entry(word_index, graph_word):
def build_word_entry(graph_word):
if graph_word.emptyIndex:
word_index = (graph_word.index, graph_word.emptyIndex)
else:
word_index = graph_word.index

word_entry = {
ID: word_index,
TEXT: graph_word.word if graph_word.word else None,
Expand Down Expand Up @@ -132,7 +147,13 @@ def build_word_entry(word_index, graph_word):
def convert_response_to_doc(doc, semgrex_response):
doc = copy.deepcopy(doc)
try:
for sent_idx, (sentence, ssurgeon_result) in enumerate(zip(doc.sentences, semgrex_response.result)):
sent_idx = 0
response_idx = 0
while sent_idx < len(doc.sentences):
sentence = doc.sentences[sent_idx]
ssurgeon_result = semgrex_response.result[response_idx]
has_enhanced = sentence.has_enhanced_dependencies()

# EditNode is currently bugged... :/
# TODO: change this after next CoreNLP release (after 4.5.3)
#if not ssurgeon_result.changed:
Expand All @@ -141,7 +162,7 @@ def convert_response_to_doc(doc, semgrex_response):
ssurgeon_graph = ssurgeon_result.graph
tokens = []
for graph_node, graph_word in zip(ssurgeon_graph.node, ssurgeon_graph.token):
word_entry = build_word_entry(graph_node.index, graph_word)
word_entry = build_word_entry(graph_word)
tokens.append(word_entry)
tokens.sort(key=lambda x: x[ID])
for root in ssurgeon_graph.root:
Expand Down Expand Up @@ -207,6 +228,57 @@ def convert_response_to_doc(doc, semgrex_response):
doc.sentences[sent_idx] = sentence

sentence.rebuild_dependencies()

sent_idx += 1
response_idx += 1

if has_enhanced:
enhanced_ssurgeon_graph = semgrex_response.result[response_idx].graph
response_idx += 1

enhanced_words_map = {}
for node_idx, node in enumerate(enhanced_ssurgeon_graph.node):
if node.emptyIndex:
continue
enhanced_words_map[node.index-1] = enhanced_ssurgeon_graph.token[node_idx].word
if any(expected_idx != idx for expected_idx, idx in enumerate(sorted(enhanced_words_map.keys()))):
logger.warning("Sentence %d had gap in indices of the enhanced graph!")
continue
enhanced_words = []
for index, word in sorted(enhanced_words_map.items()):
enhanced_words.append(word)
if (len(sentence.words) != len(enhanced_words) or
any(word.text != enhanced_word for word, enhanced_word in zip(sentence.words, enhanced_words))):
logger.warning("Sentence %d had different words in the enhanced graph compared to the basic graph after running the ssurgeon!", sent_idx)
continue
# yay, the words match at a very basic level
# first need to add any extra words
empty_words = []
for token in enhanced_ssurgeon_graph.token:
if not token.emptyIndex:
continue
word_entry = build_word_entry(token)
empty_words.append(Word(sentence, word_entry))
sentence.empty_words = empty_words

# next add the edges
for edge in enhanced_ssurgeon_graph.edge:
if edge.sourceEmpty:
source = (edge.source, edge.sourceEmpty)
else:
source = edge.source
if edge.targetEmpty:
target = (edge.target, edge.targetEmpty)
else:
target = edge.target
sentence.enhanced_dependencies.add_edge(source, target, edge.dep)
for root in enhanced_ssurgeon_graph.rootNode:
root = enhanced_ssurgeon_graph.node[root]
if root.emptyIndex:
root = (root.index, root.emptyIndex)
else:
root = root.index
sentence.enhanced_dependencies.add_edge(0, root, "root")
except Exception as e:
raise RuntimeError("Ssurgeon could not process sentence {}\nSsurgeon result:\n{}\nOriginal sentence:\n{:C}".format(sent_idx, ssurgeon_result, sentence)) from e
return doc
Expand Down Expand Up @@ -279,7 +351,7 @@ def main():
ssurgeon_edits = [SsurgeonEdit(args.semgrex, args.ssurgeon)]

if args.input_file:
docs = [CoNLL.conll2doc(input_file=args.input_file)]
docs = [CoNLL.conll2doc(input_file=args.input_file, ignore_gapping=False)]
outputs = [args.output_file]
input_output = zip(docs, outputs)
elif args.input_dir:
Expand All @@ -295,7 +367,7 @@ def read_docs():
doc_path = os.path.join(args.input_dir, doc_filename)
output_path = os.path.join(args.output_dir, doc_filename)
print("Processing %s to %s" % (doc_path, output_path))
yield CoNLL.conll2doc(input_file=doc_path), output_path
yield CoNLL.conll2doc(input_file=doc_path, ignore_gapping=False), output_path
input_output = read_docs()
else:
docs = [CoNLL.conll2doc(input_str=SAMPLE_DOC)]
Expand Down
15 changes: 1 addition & 14 deletions stanza/tests/server/test_ssurgeon.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,19 +141,6 @@ def test_ssurgeon_become_mwt():
6 ” " PUNCT '' _ 4 punct 4:punct _
"""

# TODO: also, we shouldn't lose the enhanced dependencies...
EXISTING_MWT_DOC_EXPECTED = """
# sent_id = newsgroup-groups.google.com_GayMarriage_0ccbb50b41a5830b_ENG_20050321_181500-0005
# text = One of “NCRC4ME’s”
1 One one NUM CD NumType=Card 0 root _ _
2 of of ADP IN _ 4 case _ _
3 “ " PUNCT `` _ 4 punct _ SpaceAfter=No
4-5 NCRC4ME’s _ _ _ _ _ _ _ SpaceAfter=No
4 NCRC4ME NCRC4ME PROPN NNP Number=Sing 1 compound _ _
5 ’s 's PART POS _ 4 case _ _
6 ” " PUNCT '' _ 4 punct _ _
"""

def test_ssurgeon_existing_mwt_no_change():
"""
Test that converting a document with an MWT works as expected
Expand All @@ -176,7 +163,7 @@ def test_ssurgeon_existing_mwt_no_change():
updated_doc = ssurgeon.convert_response_to_doc(doc, ssurgeon_response)

result = "{:C}".format(updated_doc)
compare_ignoring_whitespace(result, EXISTING_MWT_DOC_EXPECTED)
compare_ignoring_whitespace(result, EXISTING_MWT_DOC_INPUT)

def check_empty_test(input_text, expected=None, echo=False):
if expected is None:
Expand Down

0 comments on commit 0f12d49

Please sign in to comment.