Skip to content

Commit

Permalink
added missing relation parts
Browse files Browse the repository at this point in the history
  • Loading branch information
eriktks committed Jan 17, 2025
1 parent 31a737b commit 1ee9c98
Show file tree
Hide file tree
Showing 2 changed files with 91 additions and 28 deletions.
58 changes: 51 additions & 7 deletions orangecontrib/storynavigation/modules/purposeanalysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@
import storynavigation.modules.constants as constants
import storynavigation.modules.util as util

# to do
# 1 check if previous sentence can be returned as relation argument, sentence: daardoor ging het platteland
# 2 check if different orders need to be applied for different trigger words: omdat vs zodat
# 3 check key errors in output

class PurposeAnalyzer:
"""Class for extracting purpose from texts
Expand All @@ -15,6 +19,9 @@ class PurposeAnalyzer:
"""


PURPOSE_LABELS = ['PURPOSE', 'SCONJ', 'CONTEXT']


def __init__(self, language, story_elements, verb_frames, purpose_strategy, callback=None) -> None:
self.language = language
self.verb_frames = verb_frames
Expand All @@ -24,6 +31,7 @@ def __init__(self, language, story_elements, verb_frames, purpose_strategy, call
entities = self.__process_texts(story_elements_df, callback=callback)
sentence_offsets = self.__compute_sentence_offsets(story_elements_df)
entities_from_onsets = self.__convert_entities(entities, sentence_offsets)
entities_from_onsets = self.__add_missing_relation_parts(entities_from_onsets, sentence_offsets)
self.purpose_analysis = self.__sort_and_filter_results(entities_from_onsets)


Expand All @@ -48,6 +56,40 @@ def __compute_sentence_offsets(self, story_elements_df) -> pd.DataFrame:
return sentences_df[["storyid", "sentence_id", "char_offset"]].set_index(["storyid", "sentence_id"])


def __get_missing_label(self, entities_from_onsets, storyid, sentence_id):
    """Return the relation labels not yet present in one sentence of a story.

    Scans all entities recorded for *storyid* and collects the labels already
    assigned to the sentence *sentence_id*; the result lists the remaining
    PURPOSE_LABELS in their canonical order.
    """
    seen_labels = set()
    for entity in entities_from_onsets[storyid].values():
        if entity['sentence_id'] == sentence_id:
            seen_labels.add(entity['label_'])
    return [label for label in self.PURPOSE_LABELS if label not in seen_labels]


def __add_missing_relation_part(self, entities_from_onsets, sentence_offsets, storyid, sentence_id):
    """Back-fill the single missing relation label of a sentence with a placeholder.

    When exactly one of the PURPOSE_LABELS is absent from sentence
    *sentence_id*, the missing part is assumed to be the immediately preceding
    sentence: a placeholder entity is inserted at that sentence's character
    offset, spanning up to the start of the current sentence. Mutates
    entities_from_onsets in place; sentence 0 is never back-filled because it
    has no predecessor.

    Args:
        entities_from_onsets: dict storyid -> {char offset: entity dict}.
        sentence_offsets: DataFrame indexed by (storyid, sentence_id) with a
            "char_offset" column.
        storyid: id of the story being processed.
        sentence_id: id of the sentence missing one relation part.
    """
    if sentence_id > 0:
        missing_labels = self.__get_missing_label(entities_from_onsets, storyid, sentence_id)
        if len(missing_labels) == 1:
            # cast to int so dict keys stay plain ints even when pandas hands
            # back numpy integers
            char_id_start = int(sentence_offsets.loc[(storyid, sentence_id - 1)]["char_offset"])
            char_id_end = int(sentence_offsets.loc[(storyid, sentence_id)]["char_offset"]) - 1
            entities_from_onsets[storyid][char_id_start] = {
                'label_': missing_labels[0],
                'sentence_id': sentence_id,
                # placeholder text with the length of the previous sentence;
                # a string (not a list of chars) so it matches the string
                # 'text' values stored for all other entities
                'text': (char_id_end - char_id_start) * 'x'
            }


def __add_missing_relation_parts(self, entities_from_onsets, sentence_offsets) -> dict:
    """For each sentence carrying exactly two relation labels, add the third.

    Groups the labels of every story by sentence id; any sentence with two of
    the three PURPOSE_LABELS gets its missing part back-filled via
    __add_missing_relation_part. Returns the (mutated) entities dict.
    """
    for storyid, story_entities in entities_from_onsets.items():
        labels_per_sentence = {}
        for entity in story_entities.values():
            labels_per_sentence.setdefault(entity['sentence_id'], []).append(entity['label_'])
        for sentence_id, labels in labels_per_sentence.items():
            if len(labels) == 2:
                self.__add_missing_relation_part(entities_from_onsets, sentence_offsets, storyid, sentence_id)
    return entities_from_onsets


def __convert_entities(self, entities, sentence_offsets) -> dict:
entities_from_onsets = {}
for storyid, sentence_id, sentence_data in entities:
Expand Down Expand Up @@ -92,7 +134,7 @@ def __find_matching_dependencies(self, sentence_df, entity_start_id, head_start_
entity["spacy_lemma"] in verb_frame_verbs))


def __expand_purpose_phrase(self, sentence_df, sentence_entities, entity_start_id, head_start_id, entity_type="PURPOSE") -> None:
def __expand_purpose_phrase(self, sentence_df, sentence_entities, entity_start_id, head_start_id, entity_type="CONTEXT") -> None:
processed_ids = set()
child_entity_ids = self.__get_head_dependencies(sentence_df, entity_start_id, head_start_id)
head_start_id = self.__prepend_tokens_to_purpose_phrase(sentence_df, sentence_entities, head_start_id, child_entity_ids, processed_ids, entity_type)
Expand All @@ -102,7 +144,7 @@ def __expand_purpose_phrase(self, sentence_df, sentence_entities, entity_start_i
"skipping purpose word", sentence_df[child_entity_id]["spacy_lemma"], sentence_df[child_entity_id]["sentence"])


def __prepend_tokens_to_purpose_phrase(self, sentence_df, sentence_entities, head_start_id, child_entity_ids, processed_ids, entity_type="PURPOSE") -> None:
def __prepend_tokens_to_purpose_phrase(self, sentence_df, sentence_entities, head_start_id, child_entity_ids, processed_ids, entity_type="CONTEXT") -> None:
for child_entity_id in sorted(child_entity_ids, reverse=True):
if child_entity_id in processed_ids:
continue
Expand Down Expand Up @@ -143,6 +185,8 @@ def __process_sentence(self, sentence_dict) -> dict:
if sentence_dict[entity_start_id]["spacy_tag"] == "VERB":
self.__add_sentence_entity_verb(sentence_dict, sentence_entities, entity_start_id)
else:
if head_start_id == head_of_head_start_id:
print("overlapping relation parts!", sentence_dict[head_start_id]["token_text"])
self.__add_sentence_entity_sconj(sentence_dict, sentence_entities, entity_start_id, head_start_id, head_of_head_start_id)
except AttributeError as e:
self.__log_error("attribute error", e, token_data)
Expand All @@ -163,26 +207,26 @@ def __add_sentence_entity_sconj(self, sentence_dict, sentence_entities, entity_s
"sentence_id": sentence_id,
"text": entity["token_text"]}
sentence_entities[head_start_id] = {
"label_": "CAUSE",
"label_": "CONTEXT",
"sentence_id": sentence_id,
"text": sentence_dict[head_start_id]["token_text"]}
sentence_entities[head_of_head_start_id] = {
"label_": "EFFECT",
"label_": "PURPOSE",
"sentence_id": sentence_id,
"text": sentence_dict[head_of_head_start_id]["token_text"]}
self.__expand_purpose_phrase(sentence_dict, sentence_entities, entity_start_id, head_start_id)
if head_of_head_start_id != head_start_id:
self.__expand_purpose_phrase(sentence_dict, sentence_entities, head_start_id, head_of_head_start_id, entity_type="EFFECT")
self.__expand_purpose_phrase(sentence_dict, sentence_entities, head_start_id, head_of_head_start_id, entity_type="PURPOSE")


def __add_sentence_entity_verb(self, sentence_dict, sentence_entities, entity_start_id) -> None:
    """Register a matched verb token as a CONTEXT entity and expand its phrase.

    Reconstructed committed version: the scraped span interleaved the pre-
    and post-change diff lines; the committed code labels the verb CONTEXT
    (not PURPOSE).

    Args:
        sentence_dict: token start offset -> token data for one sentence.
        sentence_entities: output dict, start offset -> entity dict (mutated).
        entity_start_id: start offset of the matched verb token.
    """
    entity = sentence_dict[entity_start_id]
    sentence_id = entity["sentence_id"]
    sentence_entities[entity_start_id] = {
        "label_": "CONTEXT",
        "sentence_id": sentence_id,
        "text": entity["token_text"]}
    # grow the single verb token into the full context phrase
    self.__expand_purpose_phrase(sentence_dict, sentence_entities, entity_start_id, entity_start_id, entity_type="CONTEXT")


def __get_head_dependencies(self, sentence_df, entity_start_id, head_start_id) -> list:
Expand Down
61 changes: 40 additions & 21 deletions orangecontrib/storynavigation/widgets/OWSNPurposeAnalysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,13 +34,8 @@ class OWSNPurposeAnalysis(OWWidget, ConcurrentWidgetMixin):
autocommit = Setting(True)
language = 'nl'
n_segments = 1
verb_frames_file_name = os.path.join(
str(constants.PKG),
str(constants.RESOURCES_SUBPACKAGE),
("dutch" if language == "nl" else "english") + "_purpose_verbs.csv")
recent_files = [verb_frames_file_name]
entity_colors = {"PURPOSE": "salmon",
"EFFECT": "lightgreen",
entity_colors = {"CONTEXT": "salmon",
"PURPOSE": "lightgreen",
"SCONJ": "lightblue"}
dlgFormats = (
"All readable files ({});;".format(
Expand All @@ -65,16 +60,32 @@ def __init__(self):
size_policy = QSizePolicy(QSizePolicy.Maximum, QSizePolicy.Maximum)
self.controlArea.setSizePolicy(size_policy)
self.verb_frames = {}
self.purpose_strategy = constants.PURPOSE_STRATEGY_VERBS
self.read_verb_frames_file(self.verb_frames_file_name)

self.__initialize_strategy()
self.read_strategy_file(self.strategy_file_name)

self.__make_language_selection_menu()
self.__make_verb_frames_file_selection_menu()
self.__make_verb_frames_file_dialog()
self.__make_strategy_selection_menu()
self.__make_strategy_file_dialog()
self.__make_regexp_filter_dialog()
self.__make_document_viewer()


def __initialize_strategy(self):
    """Set up the bundled strategy files and default to the verbs strategy.

    Builds the resource paths of the language-specific trigger-word (sconj)
    and verb-frame strategy files, records them as the recent-file choices,
    and selects the verbs file/strategy as the initial configuration.
    """
    language_prefix = "dutch" if self.language == "nl" else "english"
    resource_parts = (str(constants.PKG), str(constants.RESOURCES_SUBPACKAGE))
    self.sconj_strategy_file_name = os.path.join(
        *resource_parts, language_prefix + "_purpose_triggers.csv")
    self.verbs_strategy_file_name = os.path.join(
        *resource_parts, language_prefix + "_purpose_verbs.csv")
    self.recent_strategy_files = [self.verbs_strategy_file_name,
                                  self.sconj_strategy_file_name]
    self.strategy_file_name = self.recent_strategy_files[0]
    self.purpose_strategy = constants.PURPOSE_STRATEGY_VERBS


def __make_language_selection_menu(self):
self.select_language_combo = gui.comboBox(
widget=self.controlArea,
Expand All @@ -90,7 +101,7 @@ def __make_language_selection_menu(self):
self.select_language_combo.setEnabled(True)


def __make_verb_frames_file_selection_menu(self):
def __make_strategy_selection_menu(self):
self.select_language_combo = gui.comboBox(
widget=self.controlArea,
master=self,
Expand All @@ -106,13 +117,13 @@ def __make_verb_frames_file_selection_menu(self):
self.select_language_combo.setEnabled(True)


def __make_verb_frames_file_dialog(self):
def __make_strategy_file_dialog(self):
# code copied from Corpus widget
fbox = gui.widgetBox(self.controlArea, "Verb frames file:", orientation=0)
self.file_widget = widgets.FileWidget(
recent_files=self.recent_files, icon_size=(16, 16),
on_open=self.read_verb_frames_file, dialog_format=self.dlgFormats,
dialog_title='Choose verb frames file',
recent_files=self.recent_strategy_files, icon_size=(16, 16),
on_open=self.read_strategy_file, dialog_format=self.dlgFormats,
dialog_title='Choose strategy file',
reload_label='Reload', browse_label='Browse',
allow_empty=False, minimal_width=150,
)
Expand Down Expand Up @@ -171,21 +182,29 @@ def refresh_search(self):
self.__visualize_text_data()


def read_strategy_file(self, strategy_file_name):
    """Load verb frames from a strategy file and derive the purpose strategy.

    Reconstructed committed version: the scraped span interleaved the old
    read_verb_frames_file lines with the new ones. Each line of the file is a
    comma-separated verb frame. File names containing "verb" select the verbs
    strategy; any other file selects the sconj (trigger word) strategy. Read
    errors are reported and leave self.verb_frames as parsed so far (best
    effort). Re-runs the analysis when story elements are already loaded.

    Args:
        strategy_file_name: path of the CSV strategy file to load.
    """
    self.strategy_file_name = strategy_file_name
    self.verb_frames = []
    try:
        verb_frames_lines = pathlib.Path(strategy_file_name).read_text(encoding="utf-8").strip().split("\n")
        for line in verb_frames_lines:
            self.verb_frames.append([token.strip() for token in line.strip().split(",")])
        if re.search("verb", strategy_file_name):
            self.purpose_strategy = constants.PURPOSE_STRATEGY_VERBS
        else:
            self.purpose_strategy = constants.PURPOSE_STRATEGY_SCONJ
    except Exception as e:
        # deliberate best-effort: report the problem and keep the widget alive
        print("read_strategy_file", str(e))
    if self.story_elements:
        self.reset_story_elements(self.story_elements)


def __process_purpose_strategy_change(self):
    """Reload the strategy file matching the newly selected purpose strategy.

    Reconstructed committed version: the scraped span still contained the
    stale read_verb_frames_file call. Strategy names containing "verb" map to
    the verb-frames file; every other strategy maps to the sconj
    (trigger-word) file.
    """
    if re.search("verb", self.purpose_strategy):
        self.strategy_file_name = self.verbs_strategy_file_name
    else:
        self.strategy_file_name = self.sconj_strategy_file_name
    self.read_strategy_file(self.strategy_file_name)


def get_selected_indexes(self) -> Set[int]:
Expand Down

0 comments on commit 1ee9c98

Please sign in to comment.