Skip to content

Commit

Permalink
Send enhanced dependencies to Ssurgeon if the document has them.
Browse files Browse the repository at this point in the history
Attach the enhanced dependencies to the sentence after running Ssurgeon.
This still needs thorough testing

Do a better job of tracking whether the words are the same
Convert the empty words (if any) from the returned Ssurgeon and attach them to the Sentence
  • Loading branch information
AngledLuffa committed Oct 28, 2023
1 parent b0a227b commit 0f12d49
Show file tree
Hide file tree
Showing 3 changed files with 86 additions and 19 deletions.
8 changes: 8 additions & 0 deletions stanza/models/common/doc.py
Original file line number Diff line number Diff line change
Expand Up @@ -474,6 +474,14 @@ def has_enhanced_dependencies(self):
"""
return self._enhanced_dependencies is not None and len(self._enhanced_dependencies) > 0

@property
def enhanced_dependencies(self):
graph = self._enhanced_dependencies
if graph is None:
graph = nx.MultiDiGraph()
self._enhanced_dependencies = graph
return graph

@property
def index(self):
"""
Expand Down
82 changes: 77 additions & 5 deletions stanza/server/ssurgeon.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

import argparse
import copy
import logging
import os
import re
import sys
Expand All @@ -19,8 +20,11 @@

from stanza.models.common.doc import ID, TEXT, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC, START_CHAR, END_CHAR, NER, Word, Token, Sentence

logger = logging.getLogger('stanza')

SSURGEON_JAVA = "edu.stanford.nlp.semgraph.semgrex.ssurgeon.ProcessSsurgeonRequest"


class SsurgeonEdit:
def __init__(self, semgrex_pattern, ssurgeon_edits, ssurgeon_id=None, notes=None, language="UniversalEnglish"):
# not a named tuple so we can have defaults without requiring a python upgrade
Expand Down Expand Up @@ -79,6 +83,12 @@ def build_request(doc, ssurgeon_edits):
java_protobuf_requests.add_word_to_graph(graph, word, sent_idx, word_idx)

word_idx = word_idx + 1
if sentence.has_enhanced_dependencies():
graph = request.graph.add()
for token in sentence.tokens:
for word in token.words:
java_protobuf_requests.add_token(graph.token, word, token)
java_protobuf_requests.convert_networkx_graph(graph, sentence, sent_idx)
except Exception as e:
raise RuntimeError("Failed to process sentence {}:\n{:C}".format(sent_idx, sentence)) from e

Expand All @@ -103,7 +113,12 @@ def process_doc_one_operation(doc, semgrex_pattern, ssurgeon_edits, ssurgeon_id=

return send_ssurgeon_request(request)

def build_word_entry(word_index, graph_word):
def build_word_entry(graph_word):
if graph_word.emptyIndex:
word_index = (graph_word.index, graph_word.emptyIndex)
else:
word_index = graph_word.index

word_entry = {
ID: word_index,
TEXT: graph_word.word if graph_word.word else None,
Expand Down Expand Up @@ -132,7 +147,13 @@ def build_word_entry(word_index, graph_word):
def convert_response_to_doc(doc, semgrex_response):
doc = copy.deepcopy(doc)
try:
for sent_idx, (sentence, ssurgeon_result) in enumerate(zip(doc.sentences, semgrex_response.result)):
sent_idx = 0
response_idx = 0
while sent_idx < len(doc.sentences):
sentence = doc.sentences[sent_idx]
ssurgeon_result = semgrex_response.result[response_idx]
has_enhanced = sentence.has_enhanced_dependencies()

# EditNode is currently bugged... :/
# TODO: change this after next CoreNLP release (after 4.5.3)
#if not ssurgeon_result.changed:
Expand All @@ -141,7 +162,7 @@ def convert_response_to_doc(doc, semgrex_response):
ssurgeon_graph = ssurgeon_result.graph
tokens = []
for graph_node, graph_word in zip(ssurgeon_graph.node, ssurgeon_graph.token):
word_entry = build_word_entry(graph_node.index, graph_word)
word_entry = build_word_entry(graph_word)
tokens.append(word_entry)
tokens.sort(key=lambda x: x[ID])
for root in ssurgeon_graph.root:
Expand Down Expand Up @@ -207,6 +228,57 @@ def convert_response_to_doc(doc, semgrex_response):
doc.sentences[sent_idx] = sentence

sentence.rebuild_dependencies()

sent_idx += 1
response_idx += 1

if has_enhanced:
enhanced_ssurgeon_graph = semgrex_response.result[response_idx].graph
response_idx += 1

enhanced_words_map = {}
for node_idx, node in enumerate(enhanced_ssurgeon_graph.node):
if node.emptyIndex:
continue
enhanced_words_map[node.index-1] = enhanced_ssurgeon_graph.token[node_idx].word
if any(expected_idx != idx for expected_idx, idx in enumerate(sorted(enhanced_words_map.keys()))):
logger.warning("Sentence %d had gap in indices of the enhanced graph!")
continue
enhanced_words = []
for index, word in sorted(enhanced_words_map.items()):
enhanced_words.append(word)
if (len(sentence.words) != len(enhanced_words) or
any(word.text != enhanced_word for word, enhanced_word in zip(sentence.words, enhanced_words))):
logger.warning("Sentence %d had different words in the enhanced graph compared to the basic graph after running the ssurgeon!", sent_idx)
continue
# yay, the words match at a very basic level
# first need to add any extra words
empty_words = []
for token in enhanced_ssurgeon_graph.token:
if not token.emptyIndex:
continue
word_entry = build_word_entry(token)
empty_words.append(Word(sentence, word_entry))
sentence.empty_words = empty_words

# next add the edges
for edge in enhanced_ssurgeon_graph.edge:
if edge.sourceEmpty:
source = (edge.source, edge.sourceEmpty)
else:
source = edge.source
if edge.targetEmpty:
target = (edge.target, edge.targetEmpty)
else:
target = edge.target
sentence.enhanced_dependencies.add_edge(source, target, edge.dep)
for root in enhanced_ssurgeon_graph.rootNode:
root = enhanced_ssurgeon_graph.node[root]
if root.emptyIndex:
root = (root.index, root.emptyIndex)
else:
root = root.index
sentence.enhanced_dependencies.add_edge(0, root, "root")
except Exception as e:
raise RuntimeError("Ssurgeon could not process sentence {}\nSsurgeon result:\n{}\nOriginal sentence:\n{:C}".format(sent_idx, ssurgeon_result, sentence)) from e
return doc
Expand Down Expand Up @@ -279,7 +351,7 @@ def main():
ssurgeon_edits = [SsurgeonEdit(args.semgrex, args.ssurgeon)]

if args.input_file:
docs = [CoNLL.conll2doc(input_file=args.input_file)]
docs = [CoNLL.conll2doc(input_file=args.input_file, ignore_gapping=False)]
outputs = [args.output_file]
input_output = zip(docs, outputs)
elif args.input_dir:
Expand All @@ -295,7 +367,7 @@ def read_docs():
doc_path = os.path.join(args.input_dir, doc_filename)
output_path = os.path.join(args.output_dir, doc_filename)
print("Processing %s to %s" % (doc_path, output_path))
yield CoNLL.conll2doc(input_file=doc_path), output_path
yield CoNLL.conll2doc(input_file=doc_path, ignore_gapping=False), output_path
input_output = read_docs()
else:
docs = [CoNLL.conll2doc(input_str=SAMPLE_DOC)]
Expand Down
15 changes: 1 addition & 14 deletions stanza/tests/server/test_ssurgeon.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,19 +141,6 @@ def test_ssurgeon_become_mwt():
6 ” " PUNCT '' _ 4 punct 4:punct _
"""

# TODO: also, we shouldn't lose the enhanced dependencies...
EXISTING_MWT_DOC_EXPECTED = """
# sent_id = newsgroup-groups.google.com_GayMarriage_0ccbb50b41a5830b_ENG_20050321_181500-0005
# text = One of “NCRC4ME’s”
1 One one NUM CD NumType=Card 0 root _ _
2 of of ADP IN _ 4 case _ _
3 “ " PUNCT `` _ 4 punct _ SpaceAfter=No
4-5 NCRC4ME’s _ _ _ _ _ _ _ SpaceAfter=No
4 NCRC4ME NCRC4ME PROPN NNP Number=Sing 1 compound _ _
5 ’s 's PART POS _ 4 case _ _
6 ” " PUNCT '' _ 4 punct _ _
"""

def test_ssurgeon_existing_mwt_no_change():
"""
Test that converting a document with an MWT works as expected
Expand All @@ -176,7 +163,7 @@ def test_ssurgeon_existing_mwt_no_change():
updated_doc = ssurgeon.convert_response_to_doc(doc, ssurgeon_response)

result = "{:C}".format(updated_doc)
compare_ignoring_whitespace(result, EXISTING_MWT_DOC_EXPECTED)
compare_ignoring_whitespace(result, EXISTING_MWT_DOC_INPUT)

def check_empty_test(input_text, expected=None, echo=False):
if expected is None:
Expand Down

0 comments on commit 0f12d49

Please sign in to comment.