From c4c3de5803f27843a5050e10ccae71b3fd9c45e9 Mon Sep 17 00:00:00 2001 From: John Bauer Date: Thu, 4 Jan 2024 22:01:52 -0800 Subject: [PATCH] Break ties using proper nouns, if those are part of a sentence (eg, pos was added) addresses https://github.com/stanfordnlp/stanza/issues/1326 --- stanza/pipeline/coref_processor.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/stanza/pipeline/coref_processor.py b/stanza/pipeline/coref_processor.py index 3b89cd60ca..aca18e8668 100644 --- a/stanza/pipeline/coref_processor.py +++ b/stanza/pipeline/coref_processor.py @@ -114,12 +114,27 @@ def process(self, document): # treat the longest span as the representative # break ties using the first one + # IF there is the POS processor, and it adds upos tags + # to the sentence, ties are broken first by maximum + # number of PROPN and then earliest in the document max_len = 0 best_span = None + max_propn = 0 for span_idx, span in enumerate(span_cluster): - if span[1] - span[0] > max_len: + sent_id = sent_ids[span[0]] + sentence = sentences[sent_id] + start_word = word_pos[span[0]] + end_word = word_pos[span[1]] + # very UD specific test for largest number of proper nouns in a mention + # will do nothing if POS is not active (they will all be None) + num_propn = sum(word.pos == 'PROPN' for word in sentence.words[start_word:end_word]) + print(span, num_propn) + + if ((span[1] - span[0] > max_len) or + span[1] - span[0] == max_len and num_propn > max_propn): max_len = span[1] - span[0] best_span = span_idx + max_propn = num_propn mentions = [] for span in span_cluster: