From c4c3de5803f27843a5050e10ccae71b3fd9c45e9 Mon Sep 17 00:00:00 2001
From: John Bauer <horatio@gmail.com>
Date: Thu, 4 Jan 2024 22:01:52 -0800
Subject: [PATCH] Break ties using proper nouns, if those are part of a
 sentence (eg, pos was added) addresses
 https://github.com/stanfordnlp/stanza/issues/1326

---
 stanza/pipeline/coref_processor.py | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/stanza/pipeline/coref_processor.py b/stanza/pipeline/coref_processor.py
index 3b89cd60ca..aca18e8668 100644
--- a/stanza/pipeline/coref_processor.py
+++ b/stanza/pipeline/coref_processor.py
@@ -114,12 +114,27 @@ def process(self, document):
 
             # treat the longest span as the representative
             # break ties using the first one
+            # IF there is the POS processor, and it adds upos tags
+            # to the sentence, ties are broken first by maximum
+            # number of PROPN words and then earliest in the document
             max_len = 0
             best_span = None
+            max_propn = 0
             for span_idx, span in enumerate(span_cluster):
-                if span[1] - span[0] > max_len:
+                sent_id = sent_ids[span[0]]
+                sentence = sentences[sent_id]
+                start_word = word_pos[span[0]]
+                end_word = word_pos[span[1]]
+                # very UD-specific test: count the proper nouns (pos == 'PROPN') in a mention
+                # has no effect if POS is not active (every word.pos will be None)
+                num_propn = sum(word.pos == 'PROPN' for word in sentence.words[start_word:end_word])
+                print(span, num_propn)
+
+                if ((span[1] - span[0] > max_len) or
+                    span[1] - span[0] == max_len and num_propn > max_propn):
                     max_len = span[1] - span[0]
                     best_span = span_idx
+                    max_propn = num_propn
 
             mentions = []
             for span in span_cluster: