diff --git a/orangecontrib/storynavigation/modules/actoranalysis.py b/orangecontrib/storynavigation/modules/actoranalysis.py
index bf50ef3..70d6b7b 100644
--- a/orangecontrib/storynavigation/modules/actoranalysis.py
+++ b/orangecontrib/storynavigation/modules/actoranalysis.py
@@ -186,6 +186,7 @@ def __postag_sents(
             ents = [dict(t) for t in unique_tuples]

             doc = {"text": sentence, "ents": ents}
+            print('ACTORS.PY', doc)

             html += displacy.render(doc, style="ent", options=options, manual=True)
diff --git a/orangecontrib/storynavigation/modules/meansanalysis.py b/orangecontrib/storynavigation/modules/meansanalysis.py
index 0701f79..920a97d 100644
--- a/orangecontrib/storynavigation/modules/meansanalysis.py
+++ b/orangecontrib/storynavigation/modules/meansanalysis.py
@@ -28,12 +28,12 @@ def __init__(self, language, story_elements, verb_frames, means_strategy, callba

     def __convert_str_columns_to_ints(self, story_elements_df) -> None:
-        columns_to_convert = ["storyid", "sentence_id", "token_start_idx", "spacy_head_idx"]
+        columns_to_convert = ["storyid", "segment_id", "sentence_id", "token_start_idx", "spacy_head_idx"]
         story_elements_df[columns_to_convert] = story_elements_df[columns_to_convert].astype(int)

     def __compute_sentence_offsets(self, story_elements_df) -> pd.DataFrame:
-        sentences_df = story_elements_df.groupby(["storyid", "sentence_id"]).first().reset_index()[["storyid", "sentence_id", "sentence"]]
+        sentences_df = story_elements_df.groupby(["storyid", "sentence_id"]).first().reset_index()[["storyid", "segment_id", "sentence_id", "sentence"]]
         char_offsets = []
         last_sentence = ""
         for sentence_id, sentence in zip(sentences_df["sentence_id"],
@@ -45,7 +45,7 @@ def __compute_sentence_offsets(self, story_elements_df) -> pd.DataFrame:
             char_offsets.append(char_offset)
             last_sentence = sentence
         sentences_df["char_offset"] = char_offsets
-        return sentences_df[["storyid", "sentence_id", "char_offset"]].set_index(["storyid", "sentence_id"])
+        return sentences_df[["storyid", "segment_id", "sentence_id", "char_offset"]].set_index(["storyid", "sentence_id"])

     def __convert_entities(self, entities, sentence_offsets) -> dict:
@@ -113,6 +113,7 @@ def __prepend_tokens_to_means_phrase(self, sentence_df, sentence_entities, head_
             in_between_text = " " if entity_gap_size == 1 else ", "
             sentence_entities[child_entity_id] = {
                 "text": child_entity_text + in_between_text + sentence_entities[head_start_id]["text"],
+                "segment_id": sentence_df[child_entity_id]["segment_id"],
                 "sentence_id": sentence_df[child_entity_id]["sentence_id"],
                 "label_": "MEANS" }
             del sentence_entities[head_start_id]
@@ -153,17 +154,21 @@ def __process_sentence(self, sentence_dict) -> dict:

     def __add_sentence_entity(self, sentence_dict, sentence_entities, entity_start_id, head_start_id, head_of_head_start_id) -> None:
         entity = sentence_dict[entity_start_id]
+        segment_id = entity["segment_id"]
         sentence_id = entity["sentence_id"]
         sentence_entities[entity_start_id] = {
             "label_": "PREP",
+            "segment_id": segment_id,
             "sentence_id": sentence_id,
             "text": entity["token_text"]}
         sentence_entities[head_start_id] = {
             "label_": "MEANS",
+            "segment_id": segment_id,
             "sentence_id": sentence_id,
             "text": sentence_dict[head_start_id]["token_text"]}
         sentence_entities[head_of_head_start_id] = {
             "label_": "VERB",
+            "segment_id": segment_id,
             "sentence_id": sentence_id,
             "text": sentence_dict[head_of_head_start_id]["token_text"]}
         self.__expand_means_phrase(sentence_dict, sentence_entities, entity_start_id, head_start_id)
@@ -183,10 +188,10 @@ def __get_head_dependencies(self, sentence_df, entity_start_id, head_start_id) -

     def __sort_and_filter_results(self, entities) -> pd.DataFrame:
-        results = [(entity["text"], entity["label_"], storyid, entity["sentence_id"], char_id)
+        results = [(entity["text"], entity["label_"], storyid, entity["segment_id"], entity["sentence_id"], char_id)
                    for storyid, story_entities in entities.items()
                    for char_id, entity in story_entities.items()]
-        results_df = pd.DataFrame(results, columns=["text", "label", "storyid", "sentence_id", "character_id"])
+        results_df = pd.DataFrame(results, columns=["text", "label", "storyid", "segment_id", "sentence_id", "character_id"])
         results_df.sort_values(by=["storyid", "character_id"], inplace=True)
-        results_df["text_id"] = "ST" + results_df["storyid"].astype(str)
-        return results_df[["text", "label", "text_id", "sentence_id", "character_id"]].reset_index(drop=True)
+        results_df["text_id"] = results_df["storyid"].astype(int)
+        return results_df[["text", "label", "text_id", "segment_id", "sentence_id", "character_id"]].reset_index(drop=True)
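For orientation: the net effect of the `meansanalysis` changes is that every result row now carries a `segment_id`, and `text_id` becomes the integer `storyid` instead of the formatted string `"ST<n>"`, so downstream widgets can compare ids numerically (see the `OWSNPurposeAnalysis` hunks at the end of this patch). A quick illustration of the new frame shape, with made-up values:

```python
import pandas as pd

# Hypothetical extracted entities: (text, label, storyid, segment_id, sentence_id, char_id)
results = [
    ("sneed", "VERB", 0, 1, 4, 51),
    ("met een mes", "MEANS", 0, 1, 4, 57),
]
results_df = pd.DataFrame(
    results,
    columns=["text", "label", "storyid", "segment_id", "sentence_id", "character_id"])
results_df.sort_values(by=["storyid", "character_id"], inplace=True)

# text_id is now numeric, so filters like df["text_id"] == story_id need no "ST" prefix
results_df["text_id"] = results_df["storyid"].astype(int)
print(results_df[["text", "label", "text_id", "segment_id", "sentence_id", "character_id"]])
```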
diff --git a/orangecontrib/storynavigation/modules/purposeanalysis.py b/orangecontrib/storynavigation/modules/purposeanalysis.py
index 360541e..06e9f08 100644
--- a/orangecontrib/storynavigation/modules/purposeanalysis.py
+++ b/orangecontrib/storynavigation/modules/purposeanalysis.py
@@ -1,7 +1,7 @@
 import pandas as pd
 import storynavigation.modules.constants as constants
 import storynavigation.modules.util as util
-
+import sys

 class PurposeAnalyzer:
     """Class for extracting purpose from texts
@@ -36,12 +36,12 @@ def __init__(self, language, story_elements, verb_frames, purpose_strategy, call

     def __convert_str_columns_to_ints(self, story_elements_df) -> None:
-        columns_to_convert = ["storyid", "sentence_id", "token_start_idx", "spacy_head_idx"]
+        columns_to_convert = ["storyid", "sentence_id", "segment_id", "token_start_idx", "spacy_head_idx"]
         story_elements_df[columns_to_convert] = story_elements_df[columns_to_convert].astype(int)

     def __compute_sentence_offsets(self, story_elements_df) -> pd.DataFrame:
-        sentences_df = story_elements_df.groupby(["storyid", "sentence_id"]).first().reset_index()[["storyid", "sentence_id", "sentence"]]
+        sentences_df = story_elements_df.groupby(["storyid", "sentence_id"]).first().reset_index()[["storyid", "sentence_id", "segment_id", "sentence"]]
         char_offsets = []
         last_sentence = ""
         for sentence_id, sentence in zip(sentences_df["sentence_id"],
@@ -53,7 +53,7 @@ def __compute_sentence_offsets(self, story_elements_df) -> pd.DataFrame:
             char_offsets.append(char_offset)
             last_sentence = sentence
         sentences_df["char_offset"] = char_offsets
-        return sentences_df[["storyid", "sentence_id", "char_offset"]].set_index(["storyid", "sentence_id"])
+        return sentences_df[["storyid", "sentence_id", "segment_id", "char_offset"]].set_index(["storyid", "sentence_id"])

     def __get_missing_label(self, entities_from_onsets, storyid, sentence_id) -> list:
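Note the asymmetry in the changed `__compute_sentence_offsets` return value: `segment_id` is added as a payload column, but the index stays `(storyid, sentence_id)`, so the existing `.loc[(storyid, sentence_id)]` lookups elsewhere in the module keep working. A toy sketch of the resulting frame:

```python
import pandas as pd

# Toy equivalent of the frame returned by __compute_sentence_offsets
sentence_offsets = pd.DataFrame({
    "storyid":     [0, 0, 1],
    "sentence_id": [0, 1, 0],
    "segment_id":  [0, 1, 0],
    "char_offset": [0, 16, 0],
}).set_index(["storyid", "sentence_id"])

# The lookup pattern used throughout the analyzers is unchanged:
assert sentence_offsets.loc[(0, 1)]["char_offset"] == 16
# ...and the new column travels with it:
assert sentence_offsets.loc[(0, 1)]["segment_id"] == 1
```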
@@ -61,7 +61,7 @@ def __get_missing_label(self, entities_from_onsets, storyid, sentence_id) -> lis
         return [x for x in self.PURPOSE_LABELS if x not in labels_found]

-    def __add_missing_relation_part(self, entities_from_onsets, sentence_offsets, storyid, sentence_id, previous_sentence) -> None:
+    def __add_missing_relation_part(self, entities_from_onsets, sentence_offsets, storyid, sentence_id, segment_id, previous_sentence) -> None:
         missing_labels = self.__get_missing_label(entities_from_onsets, storyid, sentence_id)
         if len(missing_labels) == 1:
             char_id_start = sentence_offsets.loc[(storyid, sentence_id - 1)]["char_offset"]
@@ -69,6 +69,7 @@ def __add_missing_relation_part(self, entities_from_onsets, sentence_offsets, st
             entities_from_onsets[storyid][char_id_start] = {
                 'label_': missing_labels[0],
                 'sentence_id': sentence_id,
+                'segment_id': segment_id,
                 'text': previous_sentence
             }

@@ -79,6 +80,7 @@ def __add_missing_relation_parts(self, story_elements_df, entities_from_onsets,
         sentence_ids = {}
         for char_id in entities_from_onsets[storyid]:
             sentence_id = entities_from_onsets[storyid][char_id]['sentence_id']
+            segment_id = entities_from_onsets[storyid][char_id]['segment_id']
             label = entities_from_onsets[storyid][char_id]['label_']
             if sentence_id in sentence_ids:
                 sentence_ids[sentence_id].append(label)
@@ -90,13 +92,14 @@ def __add_missing_relation_parts(self, story_elements_df, entities_from_onsets,
                     sentence_offsets,
                     storyid,
                     sentence_id,
+                    segment_id,
                     sentences_df.loc[storyid, sentence_id - 1])
         return entities_from_onsets

     def __convert_entities(self, entities, sentence_offsets) -> dict:
         entities_from_onsets = {}
-        for storyid, sentence_id, sentence_data in entities:
+        for storyid, sentence_id, segment_id, sentence_data in entities:
             story_entities = entities_from_onsets.setdefault(storyid, {})
             char_offset_sentence = sentence_offsets.loc[(storyid, sentence_id)]["char_offset"]
             for token_start_id, token_data in sentence_data.items():
@@ -119,6 +122,7 @@ def __process_texts(self, story_elements_df, callback=None) -> list:
                 entities.append([
                     sentence_dict_index[0],
                     sentence_dict_index[1],
+                    sentence_entities[list(sentence_entities.keys())[0]]["segment_id"],
                     sentence_entities])
             if callback:
                 callback((100*(index + 1))/len(sentence_dict))
@@ -167,6 +171,7 @@ def __prepend_tokens_to_purpose_phrase(self, sentence_dict, sentence_entities, h
             sentence_entities[child_entity_id] = {
                 "text": child_entity_text + in_between_text + sentence_entities[head_start_id]["text"],
                 "sentence_id": sentence_dict[child_entity_id]["sentence_id"],
+                "segment_id": sentence_dict[child_entity_id]["segment_id"],
                 "label_": sentence_entities[head_start_id]['label_'] }
             del sentence_entities[head_start_id]
             head_start_id = child_entity_id
@@ -219,15 +224,18 @@ def __log_error(self, error_phrase, e, token_data) -> None:

     def __add_sentence_entity_adverb(self, sentence_dict, sentence_entities, entity_start_id, head_start_id, head_of_head_start_id) -> None:
         entity = sentence_dict[entity_start_id]
         sentence_id = entity["sentence_id"]
+        segment_id = entity["segment_id"]
         reversed_order = ([x[2] for x in self.verb_frames if x[1] == entity["token_text"].lower()] == ['yes'])
         head_label, head_of_head_label = ['PURPOSE', 'CONTEXT'] if reversed_order else ['CONTEXT', 'PURPOSE']
         sentence_entities[entity_start_id] = {
             "label_": "ADVERB",
             "sentence_id": sentence_id,
+            "segment_id": segment_id,
             "text": entity["token_text"]}
         sentence_entities[head_start_id] = {
             "label_": head_label,
             "sentence_id": sentence_id,
+            "segment_id": segment_id,
             "text": sentence_dict[head_start_id]["token_text"]}
         processed_ids = {entity_start_id, head_start_id, head_of_head_start_id}
         self.__expand_phrase(sentence_dict,
@@ -239,6 +247,7 @@ def __add_sentence_entity_adverb(self, sentence_dict, sentence_entities, entity_
         sentence_entities[head_of_head_start_id] = {
             "label_": head_of_head_label,
             "sentence_id": sentence_id,
+            "segment_id": segment_id,
             "text": sentence_dict[head_of_head_start_id]["token_text"]}
         self.__expand_phrase(sentence_dict,
                              sentence_entities,
@@ -250,9 +259,11 @@ def __add_sentence_entity_adverb(self, sentence_dict, sentence_entities, entity_

     def __add_sentence_entity_verb(self, sentence_dict, sentence_entities, entity_start_id) -> None:
         entity = sentence_dict[entity_start_id]
         sentence_id = entity["sentence_id"]
+        segment_id = entity["segment_id"]
         sentence_entities[entity_start_id] = {
             "label_": "PURPOSE",
             "sentence_id": sentence_id,
+            "segment_id": segment_id,
             "text": entity["token_text"]}
         self.__expand_phrase(sentence_dict, sentence_entities, entity_start_id, entity_start_id, processed_ids=set())
@@ -267,10 +278,10 @@ def __get_head_dependencies(self, sentence_dict, entity_start_id, head_start_id)

     def __sort_and_filter_results(self, entities) -> pd.DataFrame:
-        results = [(entity["text"], entity["label_"], storyid, entity["sentence_id"], char_id)
+        results = [(entity["text"], entity["label_"], storyid, entity["segment_id"], entity["sentence_id"], char_id)
                    for storyid, story_entities in entities.items()
                    for char_id, entity in story_entities.items()]
-        results_df = pd.DataFrame(results, columns=["text", "label", "storyid", "sentence_id", "character_id"])
+        results_df = pd.DataFrame(results, columns=["text", "label", "storyid", "segment_id", "sentence_id", "character_id"])
         results_df.sort_values(by=["storyid", "character_id"], inplace=True)
-        results_df["text_id"] = "ST" + results_df["storyid"].astype(str)
-        return results_df[["text", "label", "text_id", "sentence_id", "character_id"]].reset_index(drop=True)
+        results_df["text_id"] = results_df["storyid"].astype(int)
+        return results_df[["text", "label", "text_id", "segment_id", "sentence_id", "character_id"]].reset_index(drop=True)
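For context (the full method bodies sit outside these hunks): when a sentence yields only one half of the CONTEXT/PURPOSE pair, `__add_missing_relation_part` labels the entire previous sentence with the missing half, keyed by that sentence's character offset, and the patch threads `segment_id` into that synthesized entity as well. A rough sketch of the idea; the `labels_found` computation and the label inventory are assumptions, not code copied from the module:

```python
PURPOSE_LABELS = ["CONTEXT", "PURPOSE"]  # assumed label inventory

def add_missing_relation_part(entities_from_onsets, sentence_offsets,
                              storyid, sentence_id, segment_id, previous_sentence):
    # Which half of the relation did this sentence produce? (assumed logic)
    labels_found = [entity["label_"]
                    for entity in entities_from_onsets[storyid].values()
                    if entity["sentence_id"] == sentence_id]
    missing_labels = [x for x in PURPOSE_LABELS if x not in labels_found]
    if len(missing_labels) == 1:
        # Key the synthesized entity by the previous sentence's story-level offset
        char_id_start = sentence_offsets.loc[(storyid, sentence_id - 1)]["char_offset"]
        entities_from_onsets[storyid][char_id_start] = {
            "label_": missing_labels[0],
            "sentence_id": sentence_id,
            "segment_id": segment_id,  # newly threaded through in this patch
            "text": previous_sentence,
        }
```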
diff --git a/orangecontrib/storynavigation/modules/settinganalysis.py b/orangecontrib/storynavigation/modules/settinganalysis.py
index 731c0d3..fca21d5 100644
--- a/orangecontrib/storynavigation/modules/settinganalysis.py
+++ b/orangecontrib/storynavigation/modules/settinganalysis.py
@@ -23,7 +23,7 @@ class SettingAnalyzer:
     Args:
         language (str): ISO string of the language of the input text
        n_segments (int): Number of segments to split each text into
-        text_tuples (list): binary tuple: text (str) and storyid
+        text_tuples (list): ternary tuple: text (str), storyid, sentences (list)
         story_elements (list of lists): tokens with their Spacy analysis
         callback: function in widget to show the progress of this process
     """
@@ -54,6 +54,7 @@ def extract_entities_from_table(self, story_elements):
         story_elements_df = util.convert_orangetable_to_dataframe(story_elements)
         last_story_id = -1
         last_sentence = ""
+        last_sentence_id = -1
         char_offset = 0
         entities = []
         for index, row in story_elements_df.iterrows():
@@ -65,10 +66,12 @@ def extract_entities_from_table(self, story_elements):
                 last_sentence = ""
                 char_offset = 0
             sentence = row["sentence"]
-            if sentence != last_sentence:
+            sentence_id = row["sentence_id"]
+            if sentence_id != last_sentence_id or story_id != last_story_id:
                 if len(last_sentence) > 0:
                     char_offset += 1 + len(last_sentence)
                 last_sentence = sentence
+                last_sentence_id = sentence_id
             if row["spacy_ne"] == "O":
                 last_entity_class = "O"
                 last_entity_start_id = -1
@@ -77,7 +80,7 @@ def extract_entities_from_table(self, story_elements):
             entity_iob = re.sub("-.*$", "", row["spacy_ne"])
             entity_start_id = int(row["token_start_idx"]) + char_offset
             if entity_class != last_entity_class or entity_iob == "B" or story_id != last_story_id:
-                entities[-1][entity_start_id] = { "text": row["token_text"], "label_": entity_class }
+                entities[-1][entity_start_id] = {"text": row["token_text"], "label_": entity_class, "sentence_id": row["sentence_id"], "segment_id": row["segment_id"]}
                 last_entity_start_id = entity_start_id
                 last_entity_class = entity_class
             else:
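The key fix in `extract_entities_from_table`: sentence boundaries are now detected via `sentence_id` (plus story change) instead of comparing sentence text, which produced wrong character offsets whenever a story repeats a sentence verbatim; the commented-out block in `__analyze_text_with_list` below flags the same failure mode. A stripped-down before/after of the offset arithmetic:

```python
def offsets_by_text(sentences):
    """Old behavior: advance the offset only when the sentence text changes."""
    offsets, offset, last = [], 0, ""
    for s in sentences:
        if s != last:
            if len(last) > 0:
                offset += 1 + len(last)
            last = s
        offsets.append(offset)
    return offsets

def offsets_by_id(sentences):
    """New behavior: every new sentence id advances the offset."""
    offsets, offset = [], 0
    for i, s in enumerate(sentences):
        if i > 0:
            offset += 1 + len(sentences[i - 1])
        offsets.append(offset)
    return offsets

sents = ["Hij liep.", "Hij liep."]   # the same sentence twice in a row
print(offsets_by_text(sents))        # [0, 0]  -- second copy gets a stale offset
print(offsets_by_id(sents))          # [0, 10] -- offsets now advance correctly
```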
row["sentence_id"], "segment_id": row["segment_id"]} last_entity_start_id = entity_start_id last_entity_class = entity_class else: @@ -104,35 +107,53 @@ def __setup_required_nlp_resources(self, language): # self.entity_list = [line.split(",") for line in self.entity_list] def __sort_and_filter_results(self, results): - results = [(x[0], x[1], int(x[2]), x[3], x[4]) for x in results] - results_df = pd.DataFrame(results, columns=["text", "label", "text id", "character id", "location type"]).sort_values(by=["text id", "character id"]) - results_df.insert(3, "storyid", ["ST" + str(text_id) for text_id in results_df["text id"]]) - return results_df[["text", "label", "storyid", "character id", "location type"]].reset_index(drop=True) + results = [(x[0], x[1], int(x[2]), int(x[3]), int(x[4]), x[5], x[6]) for x in results] + results_df = pd.DataFrame(results, columns=["text", "label", "text_id", "sentence_id", "segment_id", "character_id", "location_type"]).sort_values(by=["text_id", "character_id"]) + return results_df[["text", "label", "text_id", "sentence_id", "segment_id", "character_id", "location_type"]].reset_index(drop=True) def __process_texts(self, nlp, text_tuples, entities, callback=None): results = [] index = 0 for entities_per_text in entities: - results.extend(self.__process_text(text_tuples[index][1], text_tuples[index][0], nlp, entities_per_text)) - if callback: - callback((100*(index+1)/len(entities))) + results.extend(self.__process_text(text_tuples[index][1], text_tuples[index][2], nlp, entities_per_text)) index += 1 + if callback: + callback(100*index/len(entities)) return self.__sort_and_filter_results(results) - def __analyze_text_with_list(self, text, nlp, user_defined_entities): + def __analyze_text_with_list(self, sentences, nlp, user_defined_entities): matcher = Matcher(nlp.vocab) for entity_group in self.ENTITY_GROUPS: patterns = [[{"lower": entity_token} for entity_token in entity_text.lower().split()] for entity_text, entity_label in list(user_defined_entities.items()) if entity_label in entity_group] matcher.add(entity_group[0], patterns) - tokens = nlp(text) - return {tokens[m[1]].idx: { + results = dict() + last_story_id = -1 + character_id = 0 + for story_id, sentence_id, segment_id, sentence_text in sentences: + if story_id != last_story_id: + last_story_id = story_id + character_id = 0 + tokens = nlp(sentence_text) + for m in matcher(tokens): + results[character_id + tokens[m[1]].idx] = { "text": " ".join([tokens[token_id].text for token_id in range(m[1], m[2])]), - "label_": nlp.vocab.strings[m[0]] - } for m in matcher(tokens)} + "label_": nlp.vocab.strings[m[0]], + "sentence_id": sentence_id, + "segment_id": segment_id + } + character_id += len(sentence_text) + 1 + return results + +# return {tokens[m[1]].idx: { +# "text": " ".join([tokens[token_id].text for token_id in range(m[1], m[2])]), +# "label_": nlp.vocab.strings[m[0]], +# "sentence_id": list(tokens.sents).index(tokens[m[1]:m[2]].sent), # might fail with duplicate sentences +# "segment_id": 99 # temporary filler +# } for m in matcher(tokens)} def __combine_analyses(self, spacy_analysis, list_analysis): @@ -146,13 +167,13 @@ def __combine_analyses(self, spacy_analysis, list_analysis): def __expand_locations(self, combined_analysis): for start in combined_analysis: - combined_analysis[start]["location type"] = "" + combined_analysis[start]["location_type"] = "" entities_to_add = {} for start in combined_analysis.keys(): if combined_analysis[start]["label_"] in self.LOCATION_LABELS: wikidata_info = 
diff --git a/orangecontrib/storynavigation/resources/dutch_entities.csv b/orangecontrib/storynavigation/resources/dutch_entities.csv
index 7166d21..44163f4 100644
--- a/orangecontrib/storynavigation/resources/dutch_entities.csv
+++ b/orangecontrib/storynavigation/resources/dutch_entities.csv
@@ -2,3 +2,4 @@ EVENT,coronacrisis
 EVENT,corona-crisis
 EVENT,corona crisis
 EVENT,Koningsdag
+EVENT,winter
diff --git a/orangecontrib/storynavigation/resources/dutch_verb_frames.csv b/orangecontrib/storynavigation/resources/dutch_verb_frames.csv
index 17e73b0..ada0d78 100644
--- a/orangecontrib/storynavigation/resources/dutch_verb_frames.csv
+++ b/orangecontrib/storynavigation/resources/dutch_verb_frames.csv
@@ -1,8 +1,30 @@
 verb,preposition
 bewaakt,met
+bewaken,met
 doen,via
 zijn,doordat
 gaan,doordat
 worden,omdat
 hebben,door
 slaan,met
+prikken,met
+snijden,met
+wieschen,met
+gooien,met
+lopen,met
+pakken,met
+kloppen,met
+uithalen,met
+rijden,met
+stampen,met
+besturen,met
+snoeren,met
+aanraken,met
+genezen,met
+vanééngereten,door
+roeren,met
+roepen,met
+grijpen,met
+dreigen,met
+afdrogen,met
+kammen,met
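The new `dutch_verb_frames.csv` rows whitelist more verb/preposition pairs as means constructions (e.g. "snijden met", to cut *with* something); the file mixes infinitives (`bewaken`) with inflected forms (`bewaakt`), which suggests lookups happen on lowercased surface forms rather than lemmas. How the table is consulted is not part of this patch; a plausible sketch, with the loader and lookup names being illustrative only:

```python
import csv

def load_verb_frames(path):
    """Read verb,preposition rows, skipping the header line."""
    with open(path, newline="", encoding="utf-8") as csvfile:
        reader = csv.reader(csvfile)
        next(reader)                         # skip the "verb,preposition" header
        return [row for row in reader]

def licenses_means(verb_frames, verb, preposition):
    """True if this verb/preposition pair is a known means frame."""
    return any(v == verb and p == preposition
               for v, p, *_ in verb_frames)  # *_ tolerates extra columns

# frames = load_verb_frames("orangecontrib/storynavigation/resources/dutch_verb_frames.csv")
# licenses_means(frames, "snijden", "met")   # -> True after this change
```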
diff --git a/orangecontrib/storynavigation/widgets/OWSNPurposeAnalysis.py b/orangecontrib/storynavigation/widgets/OWSNPurposeAnalysis.py
index fba6603..f986df8 100644
--- a/orangecontrib/storynavigation/widgets/OWSNPurposeAnalysis.py
+++ b/orangecontrib/storynavigation/widgets/OWSNPurposeAnalysis.py
@@ -318,7 +318,7 @@ def __add_entity_colors_to_story_text(self, story_text, story_id):
         first_id = sys.maxsize
         try:
             for index, row in self.analyzer.purpose_analysis.loc[
-                self.analyzer.purpose_analysis["text_id"] == "ST" + str(story_id)].iloc[::-1].iterrows():
+                self.analyzer.purpose_analysis["text_id"] == story_id].iloc[::-1].iterrows():
                 start = int(row["character_id"])
                 end = start + len(row["text"])
                 if end >= first_id:
@@ -341,7 +341,7 @@ def __visualize_text_data(self):
         html_text += self.__make_entity_bar_for_html()
         for story_text, story_id in self.text_tuples:
             if len(self.stories_selected) == 0 or int(story_id) in self.stories_selected:
-                story_text = self.__add_entity_colors_to_story_text(story_text, story_id)
+                story_text = self.__add_entity_colors_to_story_text(story_text, int(story_id))
                 html_text += "