import datetime
import os
import re

import numpy as np
import pandas as pd

# Columns the corpus CSV has to provide.
_REQUIRED_COLUMNS = ('begin', 'end', 'participant', 'utterance', 'source')

# Utterance values that stand for untranscribed or unintelligible material.
_UNKNOWN_UTTERANCES = [
    '[unk_utterance', '[unk_utterance]', '[unk_noise]', '[distortion]',
    '[background]', '[background] M', '[static]', 'untranscribed',
    '[noise]', '[inintel]', '[distorted]', 'tlyam kanəw',
]

# Non-talk conduct that is reported under its own bracketed label.
_OTHER_CONDUCT = frozenset([
    '[cough]', '[sneeze]', '[nod]', '[blow]', '[sigh]', '[yawn]', '[sniff]',
    '[clearsthroat]', '[lipsmack]', '[inhales]', '[groan]',
])

# Names of the per-utterance window measures, in the order they are computed.
_TRANSITION_COLUMNS = ['talk_all', 'talk_rel', 'load',
                       'turns_all', 'turns_rel', 'participants']


def _converttime(text):
    """Convert an ``hh:mm:ss.sss`` time string to whole milliseconds.

    Missing values are passed through as ``pd.NA``.
    """
    if pd.isna(text):
        return pd.NA
    hours, minutes, seconds = text.split(':')
    elapsed = datetime.timedelta(hours=int(hours), minutes=int(minutes),
                                 seconds=float(seconds))
    # Floor-dividing by one millisecond stays in exact integer arithmetic;
    # int(total_seconds() * 1000) can truncate e.g. 1.001 s to 1000 ms.
    return elapsed // datetime.timedelta(milliseconds=1)


def _getnature(utterance):
    """Classify an utterance as talk, laugh, breath or other conduct."""
    if pd.isna(utterance):
        return pd.NA
    if utterance == '[laugh]':
        return 'laugh'
    if utterance == '[breath]':
        return 'breath'
    if utterance in _OTHER_CONDUCT:
        return utterance
    return 'talk'


def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or ``pd.NA`` if it is undefined."""
    if pd.isna(numerator) or pd.isna(denominator) or denominator == 0:
        return pd.NA
    return numerator / denominator


def _window_measures(turns, begin, participant, lookback=10000, lookfwd=0):
    """Summarise the turns that begin shortly before one utterance.

    The window holds every turn whose begin lies between ``begin - lookback``
    and ``begin + lookfwd`` (milliseconds). A turn that began earlier than
    the lookback is left out even if it is still running.

    :param turns: dataframe with ``begin``, ``end``, ``duration`` and
        ``participant`` columns
    :param begin: begin of the current utterance in milliseconds, or NA
    :param participant: producer of the current utterance
    :return: list ``[talk_all, talk_rel, load, turns_all, turns_rel,
        participants]``
    """
    onset = turns['begin']
    in_window = (onset >= begin - lookback) & (onset <= begin + lookfwd)
    window = turns.loc[in_window]
    if window.empty:
        # Nothing can be measured for an utterance without timing.
        return [0, pd.NA, pd.NA, 0, pd.NA, 0]

    by_self = window['participant'] == participant
    # time spanned by the window, from first begin to last end
    stretch = window['end'].max() - window['begin'].min()
    # summed duration of all turns, and of the participant's own turns
    talk_all = window['duration'].sum()
    talk_self = window.loc[by_self, 'duration'].sum()
    turns_all = len(window.index)
    return [
        talk_all,
        _safe_ratio(talk_self, talk_all),
        # channel load: 1 = no empty space, > 1 overlap, < 1 silences
        _safe_ratio(talk_all, stretch),
        turns_all,
        int(by_self.sum()) / turns_all,
        window['participant'].nunique(),
    ]


def readcorpus(filename, langshort=None, langfull=None):
    """Read a corpus CSV and add turn and transition measures.

    The file must provide the columns ``begin``, ``end``, ``participant``,
    ``utterance`` and ``source``; ``begin`` and ``end`` hold ``hh:mm:ss.sss``
    time strings and are converted to milliseconds. Rows whose utterance has
    no content left once unknown markers, bracketed annotations and
    parentheses are removed are not part of the result.

    :param filename: path of the corpus CSV file; the corpus name is the
        file name without directory and without the ``.csv`` suffix
    :type filename: string
    :param langshort: short version of language name, defaults to the
        corpus name without digits
    :type langshort: string, optional
    :param langfull: full version of language name, defaults to the
        corpus name without digits
    :type langfull: string, optional

    :raises KeyError: if a required column is missing from the file
    :return: formatted dataframe of the language corpus, with the columns
        ``language``, ``langshort``, ``langfull``, ``corpus``, ``duration``,
        ``uid``, ``nature``, ``utterance_stripped``, ``nwords``, ``nchar``,
        ``n``, ``rank``, ``total``, ``frequency``, ``overlap``, ``priorby``,
        ``FTO``, ``overlapped``, ``talk_all``, ``talk_rel``, ``load``,
        ``turns_all``, ``turns_rel`` and ``participants`` added
    """
    df = pd.read_csv(filename)
    missing = [column for column in _REQUIRED_COLUMNS
               if column not in df.columns]
    if missing:
        raise KeyError(f"corpus file lacks required column(s): {missing}")

    # corpus name = bare file name; language = corpus name without digits
    corpus = os.path.basename(os.fspath(filename))
    if corpus.endswith('.csv'):
        corpus = corpus[:-len('.csv')]
    language = re.sub(r'[0-9]', '', corpus)
    df['language'] = language
    df['langshort'] = language if langshort is None else langshort
    df['langfull'] = language if langfull is None else langfull
    df['corpus'] = corpus

    # timing in milliseconds
    df['begin'] = df['begin'].apply(_converttime)
    df['end'] = df['end'].apply(_converttime)
    df['duration'] = df['end'] - df['begin']

    # Improbably long (more than 40 seconds) and negative durations lose all
    # their timing. The mask is computed once: blanking `duration` first
    # would hide the same rows from any later check.
    implausible = (df['duration'] > 40000) | (df['duration'] < 0)
    if implausible.any():
        for column in ('begin', 'end', 'duration'):
            # object dtype can hold pd.NA next to the integer values
            df[column] = df[column].astype(object)
            df.loc[implausible, column] = pd.NA

    # UID format: language-source number-turn number (within source)-begin.
    # Sources are numbered in order of first appearance; the padding widths
    # follow the number of sources and of distinct (source, utterance) pairs.
    source_width = len(str(len(df['source'].unique())))
    turn_width = len(str(len(df.groupby(['source', 'utterance']).size())))
    source_number = pd.factorize(df['source'])[0] + 1
    turn_number = df.groupby('source').cumcount() + 1
    df['uid'] = [
        f"{language}-{str(source).zfill(source_width)}"
        f"-{str(turn).zfill(turn_width)}"
        f"-{'NA' if pd.isna(begin) else begin}"
        for source, turn, begin in zip(source_number, turn_number,
                                       df['begin'])
    ]

    # set unknown utterances to NA
    df.loc[df['utterance'].isin(_UNKNOWN_UTTERANCES), 'utterance'] = pd.NA

    # get nature of utterance
    df['nature'] = df['utterance'].apply(_getnature)

    # Stripped version of the utterance: no bracketed annotations, no
    # parentheses or backslashes, no surrounding whitespace. The final
    # strip removes whitespace that a removed annotation leaves behind.
    stripped = df['utterance'].str.strip()
    stripped = stripped.str.replace(r'\[[^[]*\]', '', regex=True)
    stripped = stripped.str.replace(r'[\\(\\)]+', '', regex=True)
    df['utterance_stripped'] = stripped.str.strip()
    # set blank utterances to NA
    df.loc[df['utterance_stripped'] == '', 'utterance_stripped'] = pd.NA

    # All remaining measures are defined on utterances with content only.
    df = df.dropna(subset=['utterance_stripped']).reset_index(drop=True)

    # number of whitespace-separated words, and of characters without spaces
    df['nwords'] = df['utterance_stripped'].str.split().str.len()
    df['nchar'] = (df['utterance_stripped']
                   .str.replace(' ', '', regex=False).str.len())

    # frequency measures
    # how often the utterance occurs in the corpus
    df['n'] = (df.groupby('utterance')['utterance']
               .transform('count').astype(float))
    # dense rank of that count, most frequent first
    df['rank'] = df['n'].rank(method='dense', ascending=False)
    # sum of the per-row counts, used as the denominator of `frequency`
    df['total'] = df['n'].sum()
    df['frequency'] = df['n'] / df['total']

    begin = df['begin']
    end = df['end']
    participant = df['participant']

    # Overlap with up to four preceding rows: 'full' if the turn lies
    # strictly inside one of them, 'partial' if it starts during the
    # directly preceding row, NA otherwise.
    fully_inside = np.zeros(len(df), dtype=bool)
    for lag in range(1, 5):
        fully_inside |= np.asarray(
            (begin > begin.shift(lag)) & (end < end.shift(lag)), dtype=bool)
    starts_inside = np.asarray(
        (begin > begin.shift()) & (begin <= end.shift()), dtype=bool)
    df['overlap'] = np.select([fully_inside, starts_inside],
                              ['full', 'partial'], default=pd.NA)

    # Who produced the prior row: other, self, or self_during_other (the
    # prior row is by the same participant and was itself fully overlapped).
    # The first row has no prior and gets NA.
    df['priorby'] = np.select(
        [
            np.asarray(df.index == 0, dtype=bool),
            np.asarray(participant != participant.shift(), dtype=bool),
            np.asarray((df['overlap'].shift() == 'full')
                       & (participant.shift() == participant), dtype=bool),
        ],
        [pd.NA, 'other', 'self_during_other'],
        default='self')
    priorby = df['priorby']

    # FTO: begin of this turn minus the end of the most relevant prior turn
    # by another participant, which is not necessarily the prior row.
    # NOTE(review): "FTO" presumably abbreviates floor transfer offset --
    # confirm the intended expansion.
    #
    #   A [------------------]              [0--]
    #   B     [1-] [2--] [3--]  [4--]  [5--]
    #
    # The rules are tried in order and the first match wins. Rows without a
    # match, and rows next to missing timing, get no FTO.
    by_other = priorby == 'other'
    during_other = priorby == 'self_during_other'
    # begins less than 200 ms after the prior row began
    early = (begin - begin.shift()) < 200
    rules = [
        # early turn whose prior row is not itself preceded by other
        (by_other & early & (priorby.shift() != 'other'),
         begin - end.shift(2)),
        # early turn with other-turns stacked before it
        (by_other & early & (priorby.shift() != 'self')
         & (priorby.shift(2) == 'other'),
         begin - end.shift(3)),
        # own overlapped turn in between: look two rows back
        (during_other & (participant.shift(2) != participant),
         begin - end.shift(2)),
        # two own overlapped turns in between: look three rows back
        (during_other & (priorby.shift() == 'self_during_other'),
         begin - end.shift(3)),
        # default: the prior row is the relevant turn
        (by_other, begin - end.shift()),
    ]
    fto = pd.Series(
        np.select([np.asarray(rule, dtype=bool) for rule, _ in rules],
                  [np.asarray(offset) for _, offset in rules],
                  default=pd.NA),
        index=df.index, dtype=object)

    # Offsets of 10 s or more in either direction are set to NA: they are
    # implausible as a response to the end of the 'prior' turn and have to
    # be treated on their own terms rather than taken at face value.
    out_of_range = (fto > 9999) | (fto < -9999)
    df['FTO'] = np.where(out_of_range, pd.NA, fto)

    # whether the turn is overlapped by the row that follows it
    next_begin = begin.shift(-1)
    df['overlapped'] = np.where((begin < next_begin) & (end > next_begin),
                                'overlapped', pd.NA)

    # transition measures over the 10 s window before each utterance
    turns = df[['begin', 'end', 'duration', 'participant']]
    measures = [_window_measures(turns, turn_begin, producer)
                for turn_begin, producer in zip(begin, participant)]
    transitions = pd.DataFrame(measures, columns=_TRANSITION_COLUMNS,
                               index=df.index)
    return pd.concat([df, transitions], axis=1)
import datetime
import os
import re

import numpy as np
import pandas as pd

# Columns the corpus CSV has to provide.
_REQUIRED_COLUMNS = ('begin', 'end', 'participant', 'utterance', 'source')

# Utterance values that stand for untranscribed or unintelligible material.
_UNKNOWN_UTTERANCES = [
    '[unk_utterance', '[unk_utterance]', '[unk_noise]', '[distortion]',
    '[background]', '[background] M', '[static]', 'untranscribed',
    '[noise]', '[inintel]', '[distorted]', 'tlyam kanəw',
]

# Non-talk conduct that is reported under its own bracketed label.
_OTHER_CONDUCT = frozenset([
    '[cough]', '[sneeze]', '[nod]', '[blow]', '[sigh]', '[yawn]', '[sniff]',
    '[clearsthroat]', '[lipsmack]', '[inhales]', '[groan]',
])

# Names of the per-utterance window measures, in the order they are computed.
_TRANSITION_COLUMNS = ['talk_all', 'talk_rel', 'load',
                       'turns_all', 'turns_rel', 'participants']


def _converttime(text):
    """Convert an ``hh:mm:ss.sss`` time string to whole milliseconds.

    Missing values are passed through as ``pd.NA``.
    """
    if pd.isna(text):
        return pd.NA
    hours, minutes, seconds = text.split(':')
    elapsed = datetime.timedelta(hours=int(hours), minutes=int(minutes),
                                 seconds=float(seconds))
    # Floor-dividing by one millisecond stays in exact integer arithmetic;
    # int(total_seconds() * 1000) can truncate e.g. 1.001 s to 1000 ms.
    return elapsed // datetime.timedelta(milliseconds=1)


def _getnature(utterance):
    """Classify an utterance as talk, laugh, breath or other conduct."""
    if pd.isna(utterance):
        return pd.NA
    if utterance == '[laugh]':
        return 'laugh'
    if utterance == '[breath]':
        return 'breath'
    if utterance in _OTHER_CONDUCT:
        return utterance
    return 'talk'


def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or ``pd.NA`` if it is undefined."""
    if pd.isna(numerator) or pd.isna(denominator) or denominator == 0:
        return pd.NA
    return numerator / denominator


def _window_measures(turns, begin, participant, lookback=10000, lookfwd=0):
    """Summarise the turns that begin shortly before one utterance.

    The window holds every turn whose begin lies between ``begin - lookback``
    and ``begin + lookfwd`` (milliseconds). A turn that began earlier than
    the lookback is left out even if it is still running.

    :param turns: dataframe with ``begin``, ``end``, ``duration`` and
        ``participant`` columns
    :param begin: begin of the current utterance in milliseconds, or NA
    :param participant: producer of the current utterance
    :return: list ``[talk_all, talk_rel, load, turns_all, turns_rel,
        participants]``
    """
    onset = turns['begin']
    in_window = (onset >= begin - lookback) & (onset <= begin + lookfwd)
    window = turns.loc[in_window]
    if window.empty:
        # Nothing can be measured for an utterance without timing.
        return [0, pd.NA, pd.NA, 0, pd.NA, 0]

    by_self = window['participant'] == participant
    # time spanned by the window, from first begin to last end
    stretch = window['end'].max() - window['begin'].min()
    # summed duration of all turns, and of the participant's own turns
    talk_all = window['duration'].sum()
    talk_self = window.loc[by_self, 'duration'].sum()
    turns_all = len(window.index)
    return [
        talk_all,
        _safe_ratio(talk_self, talk_all),
        # channel load: 1 = no empty space, > 1 overlap, < 1 silences
        _safe_ratio(talk_all, stretch),
        turns_all,
        int(by_self.sum()) / turns_all,
        window['participant'].nunique(),
    ]


def readcorpus(filename, langshort=None, langfull=None):
    """Read a corpus CSV and add turn and transition measures.

    The file must provide the columns ``begin``, ``end``, ``participant``,
    ``utterance`` and ``source``; ``begin`` and ``end`` hold ``hh:mm:ss.sss``
    time strings and are converted to milliseconds. Rows whose utterance has
    no content left once unknown markers, bracketed annotations and
    parentheses are removed are not part of the result.

    :param filename: path of the corpus CSV file; the corpus name is the
        file name without directory and without the ``.csv`` suffix
    :type filename: string
    :param langshort: short version of language name, defaults to the
        corpus name without digits
    :type langshort: string, optional
    :param langfull: full version of language name, defaults to the
        corpus name without digits
    :type langfull: string, optional

    :raises KeyError: if a required column is missing from the file
    :return: formatted dataframe of the language corpus, with the columns
        ``language``, ``langshort``, ``langfull``, ``corpus``, ``duration``,
        ``uid``, ``nature``, ``utterance_stripped``, ``nwords``, ``nchar``,
        ``n``, ``rank``, ``total``, ``frequency``, ``overlap``, ``priorby``,
        ``FTO``, ``overlapped``, ``talk_all``, ``talk_rel``, ``load``,
        ``turns_all``, ``turns_rel`` and ``participants`` added
    """
    df = pd.read_csv(filename)
    missing = [column for column in _REQUIRED_COLUMNS
               if column not in df.columns]
    if missing:
        raise KeyError(f"corpus file lacks required column(s): {missing}")

    # corpus name = bare file name; language = corpus name without digits
    corpus = os.path.basename(os.fspath(filename))
    if corpus.endswith('.csv'):
        corpus = corpus[:-len('.csv')]
    language = re.sub(r'[0-9]', '', corpus)
    df['language'] = language
    df['langshort'] = language if langshort is None else langshort
    df['langfull'] = language if langfull is None else langfull
    df['corpus'] = corpus

    # timing in milliseconds
    df['begin'] = df['begin'].apply(_converttime)
    df['end'] = df['end'].apply(_converttime)
    df['duration'] = df['end'] - df['begin']

    # Improbably long (more than 40 seconds) and negative durations lose all
    # their timing. The mask is computed once: blanking `duration` first
    # would hide the same rows from any later check.
    implausible = (df['duration'] > 40000) | (df['duration'] < 0)
    if implausible.any():
        for column in ('begin', 'end', 'duration'):
            # object dtype can hold pd.NA next to the integer values
            df[column] = df[column].astype(object)
            df.loc[implausible, column] = pd.NA

    # UID format: language-source number-turn number (within source)-begin.
    # Sources are numbered in order of first appearance; the padding widths
    # follow the number of sources and of distinct (source, utterance) pairs.
    source_width = len(str(len(df['source'].unique())))
    turn_width = len(str(len(df.groupby(['source', 'utterance']).size())))
    source_number = pd.factorize(df['source'])[0] + 1
    turn_number = df.groupby('source').cumcount() + 1
    df['uid'] = [
        f"{language}-{str(source).zfill(source_width)}"
        f"-{str(turn).zfill(turn_width)}"
        f"-{'NA' if pd.isna(begin) else begin}"
        for source, turn, begin in zip(source_number, turn_number,
                                       df['begin'])
    ]

    # set unknown utterances to NA
    df.loc[df['utterance'].isin(_UNKNOWN_UTTERANCES), 'utterance'] = pd.NA

    # get nature of utterance
    df['nature'] = df['utterance'].apply(_getnature)

    # Stripped version of the utterance: no bracketed annotations, no
    # parentheses or backslashes, no surrounding whitespace. The final
    # strip removes whitespace that a removed annotation leaves behind.
    stripped = df['utterance'].str.strip()
    stripped = stripped.str.replace(r'\[[^[]*\]', '', regex=True)
    stripped = stripped.str.replace(r'[\\(\\)]+', '', regex=True)
    df['utterance_stripped'] = stripped.str.strip()
    # set blank utterances to NA
    df.loc[df['utterance_stripped'] == '', 'utterance_stripped'] = pd.NA

    # All remaining measures are defined on utterances with content only.
    df = df.dropna(subset=['utterance_stripped']).reset_index(drop=True)

    # number of whitespace-separated words, and of characters without spaces
    df['nwords'] = df['utterance_stripped'].str.split().str.len()
    df['nchar'] = (df['utterance_stripped']
                   .str.replace(' ', '', regex=False).str.len())

    # frequency measures
    # how often the utterance occurs in the corpus
    df['n'] = (df.groupby('utterance')['utterance']
               .transform('count').astype(float))
    # dense rank of that count, most frequent first
    df['rank'] = df['n'].rank(method='dense', ascending=False)
    # sum of the per-row counts, used as the denominator of `frequency`
    df['total'] = df['n'].sum()
    df['frequency'] = df['n'] / df['total']

    begin = df['begin']
    end = df['end']
    participant = df['participant']

    # Overlap with up to four preceding rows: 'full' if the turn lies
    # strictly inside one of them, 'partial' if it starts during the
    # directly preceding row, NA otherwise.
    fully_inside = np.zeros(len(df), dtype=bool)
    for lag in range(1, 5):
        fully_inside |= np.asarray(
            (begin > begin.shift(lag)) & (end < end.shift(lag)), dtype=bool)
    starts_inside = np.asarray(
        (begin > begin.shift()) & (begin <= end.shift()), dtype=bool)
    df['overlap'] = np.select([fully_inside, starts_inside],
                              ['full', 'partial'], default=pd.NA)

    # Who produced the prior row: other, self, or self_during_other (the
    # prior row is by the same participant and was itself fully overlapped).
    # The first row has no prior and gets NA.
    df['priorby'] = np.select(
        [
            np.asarray(df.index == 0, dtype=bool),
            np.asarray(participant != participant.shift(), dtype=bool),
            np.asarray((df['overlap'].shift() == 'full')
                       & (participant.shift() == participant), dtype=bool),
        ],
        [pd.NA, 'other', 'self_during_other'],
        default='self')
    priorby = df['priorby']

    # FTO: begin of this turn minus the end of the most relevant prior turn
    # by another participant, which is not necessarily the prior row.
    # NOTE(review): "FTO" presumably abbreviates floor transfer offset --
    # confirm the intended expansion.
    #
    #   A [------------------]              [0--]
    #   B     [1-] [2--] [3--]  [4--]  [5--]
    #
    # The rules are tried in order and the first match wins. Rows without a
    # match, and rows next to missing timing, get no FTO.
    by_other = priorby == 'other'
    during_other = priorby == 'self_during_other'
    # begins less than 200 ms after the prior row began
    early = (begin - begin.shift()) < 200
    rules = [
        # early turn whose prior row is not itself preceded by other
        (by_other & early & (priorby.shift() != 'other'),
         begin - end.shift(2)),
        # early turn with other-turns stacked before it
        (by_other & early & (priorby.shift() != 'self')
         & (priorby.shift(2) == 'other'),
         begin - end.shift(3)),
        # own overlapped turn in between: look two rows back
        (during_other & (participant.shift(2) != participant),
         begin - end.shift(2)),
        # two own overlapped turns in between: look three rows back
        (during_other & (priorby.shift() == 'self_during_other'),
         begin - end.shift(3)),
        # default: the prior row is the relevant turn
        (by_other, begin - end.shift()),
    ]
    fto = pd.Series(
        np.select([np.asarray(rule, dtype=bool) for rule, _ in rules],
                  [np.asarray(offset) for _, offset in rules],
                  default=pd.NA),
        index=df.index, dtype=object)

    # Offsets of 10 s or more in either direction are set to NA: they are
    # implausible as a response to the end of the 'prior' turn and have to
    # be treated on their own terms rather than taken at face value.
    out_of_range = (fto > 9999) | (fto < -9999)
    df['FTO'] = np.where(out_of_range, pd.NA, fto)

    # whether the turn is overlapped by the row that follows it
    next_begin = begin.shift(-1)
    df['overlapped'] = np.where((begin < next_begin) & (end > next_begin),
                                'overlapped', pd.NA)

    # transition measures over the 10 s window before each utterance
    turns = df[['begin', 'end', 'duration', 'participant']]
    measures = [_window_measures(turns, turn_begin, producer)
                for turn_begin, producer in zip(begin, participant)]
    transitions = pd.DataFrame(measures, columns=_TRANSITION_COLUMNS,
                               index=df.index)
    return pd.concat([df, transitions], axis=1)
instructions for disabling the bloody githook --- .githooks/pre-commit | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.githooks/pre-commit b/.githooks/pre-commit index d238978..0c277cf 100755 --- a/.githooks/pre-commit +++ b/.githooks/pre-commit @@ -2,6 +2,8 @@ ### To enable this githook, run: ### git config --local core.hooksPath .githooks +### to disable: +### git config --unset core.hooksPath echo "Script $0 triggered ..." From d2b11b054750f71b5eaf4dbd002a12a28b81b572 Mon Sep 17 00:00:00 2001 From: Barbara Vreede Date: Fri, 7 Apr 2023 19:34:04 +0200 Subject: [PATCH 04/53] move code from test to notebook --- notebooks/turntaking.ipynb | 69 ++++++++++++++++++++++++++++++++++++++ tests/test_turndynamics.py | 16 --------- 2 files changed, 69 insertions(+), 16 deletions(-) create mode 100644 notebooks/turntaking.ipynb diff --git a/notebooks/turntaking.ipynb b/notebooks/turntaking.ipynb new file mode 100644 index 0000000..4afe370 --- /dev/null +++ b/notebooks/turntaking.ipynb @@ -0,0 +1,69 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# to ensure that the modules can be imported, as they are located in a different folder, add the package root to the path:\n", + "\n", + "import sys\n", + "sys.path.insert(0, \"../\")\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sktalk.turndynamics as td" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "corpora_list = []\n", + "\n", + "metadatafile = pd.read_csv(\"_overview.csv\", encoding=\"ISO-8859-1\", sep=',')\n", + "corpora_for_d_latest = pd.DataFrame(columns=['corpus_path','langshort','langfull'])\n", + "# loop over csv files (language corpus)\n", + "for index, row in metadatafile.iterrows():\n", + " if row[\"ElPaCo_included\"] == \"yes\":\n", + " corpus_name = row[\"File_name\"]\n", + " 
corpus_path = './Elpaco dataset/'+corpus_name+'.csv'\n", + " langshort = row[\"langshort\"]\n", + " langfull = row[\"Langfull\"]\n", + " corpora_list.append([corpus_path, langshort,langfull])\n", + "\n", + "corpora_for_d_latest = pd.DataFrame(corpora_list, columns = ['language', 'langshort', 'langfull'])\n", + "\n", + "print(corpora_for_d_latest)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.10.6" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/tests/test_turndynamics.py b/tests/test_turndynamics.py index ec05d1b..e69de29 100644 --- a/tests/test_turndynamics.py +++ b/tests/test_turndynamics.py @@ -1,16 +0,0 @@ -corpora_list = [] - -metadatafile = pd.read_csv("_overview.csv", encoding="ISO-8859-1", sep=',') -corpora_for_d_latest = pd.DataFrame(columns=['corpus_path','langshort','langfull']) -# loop over csv files (language corpus) -for index, row in metadatafile.iterrows(): - if row["ElPaCo_included"] == "yes": - corpus_name = row["File_name"] - corpus_path = './Elpaco dataset/'+corpus_name+'.csv' - langshort = row["langshort"] - langfull = row["Langfull"] - corpora_list.append([corpus_path, langshort,langfull]) - -corpora_for_d_latest = pd.DataFrame(corpora_list, columns = ['language', 'langshort', 'langfull']) - -print(corpora_for_d_latest) \ No newline at end of file From da8ef4dc4a61d354db4291634cd1eec81ad64dda Mon Sep 17 00:00:00 2001 From: Barbara Vreede Date: Thu, 2 Nov 2023 12:46:33 +0100 Subject: [PATCH 05/53] remove notebook from repository --- notebooks/turntaking.ipynb | 69 -------------------------------------- 1 file changed, 69 deletions(-) delete mode 100644 notebooks/turntaking.ipynb diff --git a/notebooks/turntaking.ipynb b/notebooks/turntaking.ipynb deleted 
file mode 100644 index 4afe370..0000000 --- a/notebooks/turntaking.ipynb +++ /dev/null @@ -1,69 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# to ensure that the modules can be imported, as they are located in a different folder, add the package root to the path:\n", - "\n", - "import sys\n", - "sys.path.insert(0, \"../\")\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import sktalk.turndynamics as td" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "corpora_list = []\n", - "\n", - "metadatafile = pd.read_csv(\"_overview.csv\", encoding=\"ISO-8859-1\", sep=',')\n", - "corpora_for_d_latest = pd.DataFrame(columns=['corpus_path','langshort','langfull'])\n", - "# loop over csv files (language corpus)\n", - "for index, row in metadatafile.iterrows():\n", - " if row[\"ElPaCo_included\"] == \"yes\":\n", - " corpus_name = row[\"File_name\"]\n", - " corpus_path = './Elpaco dataset/'+corpus_name+'.csv'\n", - " langshort = row[\"langshort\"]\n", - " langfull = row[\"Langfull\"]\n", - " corpora_list.append([corpus_path, langshort,langfull])\n", - "\n", - "corpora_for_d_latest = pd.DataFrame(corpora_list, columns = ['language', 'langshort', 'langfull'])\n", - "\n", - "print(corpora_for_d_latest)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "name": "python", - "version": "3.10.6" - }, - "orig_nbformat": 4, - "vscode": { - "interpreter": { - "hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49" - } - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} From 1f77d4da14dc90af423759dc376af59ff530fcf8 Mon Sep 17 00:00:00 2001 From: Barbara Vreede Date: Fri, 3 Nov 2023 14:35:58 +0100 Subject: [PATCH 06/53] start adding utterance functions 
--- sktalk/corpus/utterance.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/sktalk/corpus/utterance.py b/sktalk/corpus/utterance.py index 14ba5ca..9f3d3cc 100644 --- a/sktalk/corpus/utterance.py +++ b/sktalk/corpus/utterance.py @@ -1,7 +1,11 @@ from dataclasses import asdict from dataclasses import dataclass +from math import nan from typing import Any + +from distutils.command import clean from .participant import Participant +from collections import Counter @dataclass @@ -21,3 +25,14 @@ def asdict(self): # TODO function: that prints summary of data, shows it to user # TODO function: create a pandas data frame with the utterances + + def getnchar(self): + clean_utt = self.utterance.replace(" ", "").strip() + char_count = Counter(clean_utt) + self.nchar = sum(char_count.values()) + self.length = len(clean_utt) + + def getnwords(self): + clean_utt = self.utterance.strip() + self.nwords = len(clean_utt.split(" ")) + From f76aaba5238270a78b83381493fb101e39a5dbb8 Mon Sep 17 00:00:00 2001 From: Barbara Vreede Date: Fri, 3 Nov 2023 18:50:15 +0100 Subject: [PATCH 07/53] add calculated fields to dataclass --- sktalk/corpus/utterance.py | 42 +++++++++++++++++++++------------- tests/corpus/test_utterance.py | 30 ++++++++++++++++++++++++ 2 files changed, 56 insertions(+), 16 deletions(-) create mode 100644 tests/corpus/test_utterance.py diff --git a/sktalk/corpus/utterance.py b/sktalk/corpus/utterance.py index 9f3d3cc..9a19232 100644 --- a/sktalk/corpus/utterance.py +++ b/sktalk/corpus/utterance.py @@ -1,11 +1,8 @@ +import re from dataclasses import asdict from dataclasses import dataclass -from math import nan from typing import Any - -from distutils.command import clean from .participant import Participant -from collections import Counter @dataclass @@ -17,22 +14,35 @@ class Utterance: end: str = None metadata: dict[str, Any] = None + def __post_init__(self): + # clean utterance: + # remove leading and trailing whitespace + self.utterance_clean = 
self.utterance.strip() + # remove square brackets and their contents, e.g. [laugh] + self.utterance_clean = re.sub(r'\[[^\]]*\]', '', self.utterance_clean) + # remove punctuation inside and outside of words + self.utterance_clean = re.sub(r'[^\w\s]', '', self.utterance_clean) + # remove numbers that are surrounded by spaces + self.utterance_clean = re.sub(r'\s[0-9]+\s', ' ', self.utterance_clean) + + # generate a list of words in the utterance + self.utterance_list = self.utterance_clean.split() + + # count words and characters + self.n_words = len(self.utterance_list) + self.n_characters = sum(len(word) for word in self.utterance_list) + def get_audio(self): pass def asdict(self): - return asdict(self) + utt_dict = asdict(self) + # add fields that are not part of the dataclass to the dictionary + newfields = [field for field in vars( + self) if field not in self.__dataclass_fields__] + utt_dict = utt_dict | {field: getattr( + self, field) for field in newfields} + return utt_dict # TODO function: that prints summary of data, shows it to user # TODO function: create a pandas data frame with the utterances - - def getnchar(self): - clean_utt = self.utterance.replace(" ", "").strip() - char_count = Counter(clean_utt) - self.nchar = sum(char_count.values()) - self.length = len(clean_utt) - - def getnwords(self): - clean_utt = self.utterance.strip() - self.nwords = len(clean_utt.split(" ")) - diff --git a/tests/corpus/test_utterance.py b/tests/corpus/test_utterance.py new file mode 100644 index 0000000..0ca0ead --- /dev/null +++ b/tests/corpus/test_utterance.py @@ -0,0 +1,30 @@ +import pytest +from sktalk.corpus.utterance import Utterance + + +class TestUtterance(): + @pytest.mark.parametrize("utt_in, nwords, nchars", [ + ("Hello world", 2, 10), + ("One", 1, 3), + ("", 0, 0), + ("Hello [laugh]", 1, 5), + ("[laugh] hello [laugh] [noise]!", 1, 5), + ("Hello 567 world", 2, 10), + ("He5lo wor4d", 2, 10), + ("zung1 ji3 jyut6", 3, 13), + ("我 我 是 上学 去, 我 现在 给 她 买 diaper@s 了 
.", 12, 20) + ]) + def test_postinit(self, utt_in, nwords, nchars): + utt = Utterance( + utterance=utt_in + ) + assert utt.n_words == nwords + assert utt.n_characters == nchars + + def test_asdict(self): + utt = Utterance( + utterance="Hello world" + ) + utt_dict = utt.asdict() + assert utt_dict["utterance"] == "Hello world" + assert utt_dict["n_words"] == 2 From 3736596fc0b31121a3ea6b13390b7637b276adbb Mon Sep 17 00:00:00 2001 From: Barbara Vreede Date: Tue, 7 Nov 2023 11:20:24 +0100 Subject: [PATCH 08/53] add python 3.6 to show that it breaks --- .github/workflows/build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 2f48cd8..0a1d7c5 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -17,7 +17,7 @@ jobs: fail-fast: false matrix: os: ['ubuntu-latest', 'windows-latest'] - python-version: ['3.9', '3.11'] + python-version: ['3.6', '3.9', '3.11'] steps: - uses: actions/checkout@v3 - name: Set up Python ${{ matrix.python-version }} From 99a18d21cd7690b8416638e1a1eab576510d453e Mon Sep 17 00:00:00 2001 From: Barbara Vreede Date: Tue, 7 Nov 2023 11:21:42 +0100 Subject: [PATCH 09/53] add python 3.8 to show that it breaks --- .github/workflows/build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 0a1d7c5..6536b25 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -17,7 +17,7 @@ jobs: fail-fast: false matrix: os: ['ubuntu-latest', 'windows-latest'] - python-version: ['3.6', '3.9', '3.11'] + python-version: ['3.8', '3.9', '3.11'] steps: - uses: actions/checkout@v3 - name: Set up Python ${{ matrix.python-version }} From 808e3e2b45dd37157169e530457a6ae829d74234 Mon Sep 17 00:00:00 2001 From: Barbara Vreede Date: Tue, 7 Nov 2023 11:22:42 +0100 Subject: [PATCH 10/53] add python 3.6 to show that it breaks --- .github/workflows/build.yml | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 6536b25..2b0a553 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -17,7 +17,7 @@ jobs: fail-fast: false matrix: os: ['ubuntu-latest', 'windows-latest'] - python-version: ['3.8', '3.9', '3.11'] + python-version: ['3.6.12', '3.9', '3.11'] steps: - uses: actions/checkout@v3 - name: Set up Python ${{ matrix.python-version }} From be49481370e607217c7a7ba60a15e738920f02d4 Mon Sep 17 00:00:00 2001 From: Barbara Vreede Date: Tue, 7 Nov 2023 11:25:54 +0100 Subject: [PATCH 11/53] add python 3.8 to show that it breaks --- .github/workflows/build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 2b0a553..6536b25 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -17,7 +17,7 @@ jobs: fail-fast: false matrix: os: ['ubuntu-latest', 'windows-latest'] - python-version: ['3.6.12', '3.9', '3.11'] + python-version: ['3.8', '3.9', '3.11'] steps: - uses: actions/checkout@v3 - name: Set up Python ${{ matrix.python-version }} From 5d8862b6fe0dab8fdf341b6e562a29dc6ce3de8e Mon Sep 17 00:00:00 2001 From: Barbara Vreede Date: Tue, 7 Nov 2023 11:41:09 +0100 Subject: [PATCH 12/53] remove earlier python versions again --- .github/workflows/build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 6536b25..2f48cd8 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -17,7 +17,7 @@ jobs: fail-fast: false matrix: os: ['ubuntu-latest', 'windows-latest'] - python-version: ['3.8', '3.9', '3.11'] + python-version: ['3.9', '3.11'] steps: - uses: actions/checkout@v3 - name: Set up Python ${{ matrix.python-version }} From 8d55ced95211831d66e9a7124b1b5f6dc5d18aae Mon Sep 17 00:00:00 2001 From: Barbara Vreede Date: Wed, 8 Nov 2023 14:23:22 +0100 
Subject: [PATCH 13/53] add fields to initial data class --- sktalk/corpus/utterance.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/sktalk/corpus/utterance.py b/sktalk/corpus/utterance.py index 9a19232..b7cebaf 100644 --- a/sktalk/corpus/utterance.py +++ b/sktalk/corpus/utterance.py @@ -13,6 +13,10 @@ class Utterance: begin: str = None end: str = None metadata: dict[str, Any] = None + utterance_clean: str = None + utterance_list: list[str] = None + n_words: int = None + n_characters: int = None def __post_init__(self): # clean utterance: @@ -37,11 +41,6 @@ def get_audio(self): def asdict(self): utt_dict = asdict(self) - # add fields that are not part of the dataclass to the dictionary - newfields = [field for field in vars( - self) if field not in self.__dataclass_fields__] - utt_dict = utt_dict | {field: getattr( - self, field) for field in newfields} return utt_dict # TODO function: that prints summary of data, shows it to user From ea95de36f881737cec759ac61fd264faed489b05 Mon Sep 17 00:00:00 2001 From: Barbara Vreede Date: Wed, 8 Nov 2023 14:41:43 +0100 Subject: [PATCH 14/53] implement until method calculating time differences between utterances --- sktalk/corpus/utterance.py | 4 ++++ tests/conftest.py | 6 ++++-- tests/corpus/test_utterance.py | 4 ++++ 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/sktalk/corpus/utterance.py b/sktalk/corpus/utterance.py index b7cebaf..853c956 100644 --- a/sktalk/corpus/utterance.py +++ b/sktalk/corpus/utterance.py @@ -43,5 +43,9 @@ def asdict(self): utt_dict = asdict(self) return utt_dict + def until(self, next_utt): + return next_utt.time[0] - self.time[1] + + # TODO function: that prints summary of data, shows it to user # TODO function: create a pandas data frame with the utterances diff --git a/tests/conftest.py b/tests/conftest.py index 4af2b6e..d483521 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -26,11 +26,13 @@ def convo_meta(): def convo_utts(): utterance1 = 
Utterance( utterance="Hello", - participant="A" + participant="A", + time = [0, 1000] ) utterance2 = Utterance( utterance="Monde", - participant="B" + participant="B", + time = [900, 1800] ) return [utterance1, utterance2] diff --git a/tests/corpus/test_utterance.py b/tests/corpus/test_utterance.py index 0ca0ead..deb9cd6 100644 --- a/tests/corpus/test_utterance.py +++ b/tests/corpus/test_utterance.py @@ -28,3 +28,7 @@ def test_asdict(self): utt_dict = utt.asdict() assert utt_dict["utterance"] == "Hello world" assert utt_dict["n_words"] == 2 + + def test_until(self, convo_utts): + utt1, utt2 = convo_utts + assert utt1.until(utt2) == -100 \ No newline at end of file From bf1bb60db2054e9c45122dae9d65dba86df25cc1 Mon Sep 17 00:00:00 2001 From: Barbara Vreede Date: Wed, 8 Nov 2023 15:24:48 +0100 Subject: [PATCH 15/53] inmplement subconversation and until next method at conversation object --- sktalk/corpus/conversation.py | 12 ++++++++++++ sktalk/corpus/utterance.py | 1 + tests/corpus/test_conversation.py | 3 +++ 3 files changed, 16 insertions(+) diff --git a/sktalk/corpus/conversation.py b/sktalk/corpus/conversation.py index 4ed7a5f..2dd5a74 100644 --- a/sktalk/corpus/conversation.py +++ b/sktalk/corpus/conversation.py @@ -68,3 +68,15 @@ def asdict(self): dict: dictionary containing Conversation metadata and Utterances """ return self._metadata | {"Utterances": [u.asdict() for u in self._utterances]} + + def subconversation(self, index): + # start with utterance + # obtain utterance context; search criteria may be time, or [i] + # create a new conversation object from this + return Conversation(self.utterances[index:index+2], self.metadata) + + @property + def until_next(self): + if len(self.utterances) != 2: + raise ValueError("Conversation must have 2 utterances") + return self.utterances[0].until(self.utterances[1]) \ No newline at end of file diff --git a/sktalk/corpus/utterance.py b/sktalk/corpus/utterance.py index 853c956..3590be7 100644 --- 
a/sktalk/corpus/utterance.py +++ b/sktalk/corpus/utterance.py @@ -17,6 +17,7 @@ class Utterance: utterance_list: list[str] = None n_words: int = None n_characters: int = None + time_to_next: int = None def __post_init__(self): # clean utterance: diff --git a/tests/corpus/test_conversation.py b/tests/corpus/test_conversation.py index 4d3cac7..6ea0a82 100644 --- a/tests/corpus/test_conversation.py +++ b/tests/corpus/test_conversation.py @@ -46,3 +46,6 @@ def test_write_json(self, my_convo, tmp_path, user_path, expected_path): my_convo_read = json.load(f) assert isinstance(my_convo_read, dict) assert my_convo_read == my_convo.asdict() + + def test_until(self, my_convo): + assert my_convo.subconversation(0).until_next == -100 \ No newline at end of file From 4edab6b3c71cfe36955695b8a34b677dbf7df2ea Mon Sep 17 00:00:00 2001 From: Barbara Vreede Date: Wed, 8 Nov 2023 15:26:08 +0100 Subject: [PATCH 16/53] autopep8 --- sktalk/corpus/conversation.py | 2 +- sktalk/corpus/utterance.py | 1 - tests/conftest.py | 4 ++-- tests/corpus/test_conversation.py | 2 +- tests/corpus/test_utterance.py | 2 +- 5 files changed, 5 insertions(+), 6 deletions(-) diff --git a/sktalk/corpus/conversation.py b/sktalk/corpus/conversation.py index 2dd5a74..a6fbce4 100644 --- a/sktalk/corpus/conversation.py +++ b/sktalk/corpus/conversation.py @@ -79,4 +79,4 @@ def subconversation(self, index): def until_next(self): if len(self.utterances) != 2: raise ValueError("Conversation must have 2 utterances") - return self.utterances[0].until(self.utterances[1]) \ No newline at end of file + return self.utterances[0].until(self.utterances[1]) diff --git a/sktalk/corpus/utterance.py b/sktalk/corpus/utterance.py index 3590be7..9286789 100644 --- a/sktalk/corpus/utterance.py +++ b/sktalk/corpus/utterance.py @@ -47,6 +47,5 @@ def asdict(self): def until(self, next_utt): return next_utt.time[0] - self.time[1] - # TODO function: that prints summary of data, shows it to user # TODO function: create a pandas data frame 
with the utterances diff --git a/tests/conftest.py b/tests/conftest.py index d483521..1998a63 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -27,12 +27,12 @@ def convo_utts(): utterance1 = Utterance( utterance="Hello", participant="A", - time = [0, 1000] + time=[0, 1000] ) utterance2 = Utterance( utterance="Monde", participant="B", - time = [900, 1800] + time=[900, 1800] ) return [utterance1, utterance2] diff --git a/tests/corpus/test_conversation.py b/tests/corpus/test_conversation.py index 6ea0a82..394b5f3 100644 --- a/tests/corpus/test_conversation.py +++ b/tests/corpus/test_conversation.py @@ -48,4 +48,4 @@ def test_write_json(self, my_convo, tmp_path, user_path, expected_path): assert my_convo_read == my_convo.asdict() def test_until(self, my_convo): - assert my_convo.subconversation(0).until_next == -100 \ No newline at end of file + assert my_convo.subconversation(0).until_next == -100 diff --git a/tests/corpus/test_utterance.py b/tests/corpus/test_utterance.py index deb9cd6..ed26e66 100644 --- a/tests/corpus/test_utterance.py +++ b/tests/corpus/test_utterance.py @@ -31,4 +31,4 @@ def test_asdict(self): def test_until(self, convo_utts): utt1, utt2 = convo_utts - assert utt1.until(utt2) == -100 \ No newline at end of file + assert utt1.until(utt2) == -100 From 31a98d35a806571144d70e03783001912b250922 Mon Sep 17 00:00:00 2001 From: Barbara Vreede Date: Thu, 9 Nov 2023 11:10:04 +0100 Subject: [PATCH 17/53] elaborate subconversation --- sktalk/corpus/conversation.py | 4 ++-- tests/corpus/test_conversation.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sktalk/corpus/conversation.py b/sktalk/corpus/conversation.py index a6fbce4..2467c27 100644 --- a/sktalk/corpus/conversation.py +++ b/sktalk/corpus/conversation.py @@ -69,11 +69,11 @@ def asdict(self): """ return self._metadata | {"Utterances": [u.asdict() for u in self._utterances]} - def subconversation(self, index): + def subconversation(self, index: int, before: int = 0, 
after: int = 0): # start with utterance # obtain utterance context; search criteria may be time, or [i] # create a new conversation object from this - return Conversation(self.utterances[index:index+2], self.metadata) + return Conversation(self.utterances[index-before:index+after+1], self.metadata) @property def until_next(self): diff --git a/tests/corpus/test_conversation.py b/tests/corpus/test_conversation.py index 394b5f3..9788a2c 100644 --- a/tests/corpus/test_conversation.py +++ b/tests/corpus/test_conversation.py @@ -48,4 +48,4 @@ def test_write_json(self, my_convo, tmp_path, user_path, expected_path): assert my_convo_read == my_convo.asdict() def test_until(self, my_convo): - assert my_convo.subconversation(0).until_next == -100 + assert my_convo.subconversation(index=0, after=1).until_next == -100 From 642c94110be0110526929369f536999e4e021d3d Mon Sep 17 00:00:00 2001 From: Barbara Vreede Date: Fri, 10 Nov 2023 10:09:59 +0100 Subject: [PATCH 18/53] move time processing to utterance --- sktalk/corpus/parsing/cha.py | 12 ------------ sktalk/corpus/parsing/parser.py | 13 ------------- sktalk/corpus/utterance.py | 29 ++++++++++++++++++++++++++++- tests/corpus/parsing/test_cha.py | 28 ---------------------------- tests/corpus/test_utterance.py | 29 +++++++++++++++++++++++++++++ 5 files changed, 57 insertions(+), 54 deletions(-) diff --git a/sktalk/corpus/parsing/cha.py b/sktalk/corpus/parsing/cha.py index 7027412..b2a70fc 100644 --- a/sktalk/corpus/parsing/cha.py +++ b/sktalk/corpus/parsing/cha.py @@ -29,24 +29,12 @@ def _to_utterance(chat_utterance) -> Utterance: time=chat_utterance.time_marks, utterance=str(chat_utterance.tiers), ) - utterance.begin, utterance.end = ChaFile._split_time(utterance.time) utterance.utterance = ChaFile._clean_utterance(utterance.utterance) return utterance def _extract_metadata(self): return self._pla_reader().headers()[0] - @staticmethod - def _split_time(time): - if time is None: - return None, None - begin, end = 
str(time).split(", ") - begin = begin.replace("(", "") - end = end.replace(")", "") - begin = InputFile._to_timestamp(begin) - end = InputFile._to_timestamp(end) - return (begin, end) - @staticmethod def _clean_utterance(utterance): utterance = str(utterance) diff --git a/sktalk/corpus/parsing/parser.py b/sktalk/corpus/parsing/parser.py index 039b05e..3d3d4fd 100644 --- a/sktalk/corpus/parsing/parser.py +++ b/sktalk/corpus/parsing/parser.py @@ -24,19 +24,6 @@ def metadata(self): def _extract_metadata(self): return {} - @staticmethod - def _to_timestamp(time_ms): - try: - time_ms = float(time_ms) - except ValueError: - return None - if time_ms > 86399999: - raise ValueError(f"timestamp {time_ms} exceeds 24h") - if time_ms < 0: - raise ValueError(f"timestamp {time_ms} negative") - time_dt = datetime.datetime.utcfromtimestamp(time_ms/1000) - return time_dt.strftime("%H:%M:%S.%f")[:-3] - @classmethod def download(cls, url): # noqa: W0613 # download diff --git a/sktalk/corpus/utterance.py b/sktalk/corpus/utterance.py index 9286789..1efb5c9 100644 --- a/sktalk/corpus/utterance.py +++ b/sktalk/corpus/utterance.py @@ -1,3 +1,4 @@ +import datetime import re from dataclasses import asdict from dataclasses import dataclass @@ -9,7 +10,7 @@ class Utterance: utterance: str participant: Participant = None - time: str = None + time: list = None begin: str = None end: str = None metadata: dict[str, Any] = None @@ -37,6 +38,9 @@ def __post_init__(self): self.n_words = len(self.utterance_list) self.n_characters = sum(len(word) for word in self.utterance_list) + # calculate timestamps + self.begin, self.end = self._split_time(self.time) + def get_audio(self): pass @@ -47,5 +51,28 @@ def asdict(self): def until(self, next_utt): return next_utt.time[0] - self.time[1] + def _split_time(self, time: list): + if time is None: + return None, None + begin, end = str(time).split(", ") + begin = begin.replace("(", "") + end = end.replace(")", "") + begin = self._to_timestamp(begin) + end = 
self._to_timestamp(end) + return (begin, end) + + @staticmethod + def _to_timestamp(time_ms): + try: + time_ms = float(time_ms) + except ValueError: + return None + if time_ms > 86399999: + raise ValueError(f"timestamp {time_ms} exceeds 24h") + if time_ms < 0: + raise ValueError(f"timestamp {time_ms} negative") + time_dt = datetime.datetime.utcfromtimestamp(time_ms/1000) + return time_dt.strftime("%H:%M:%S.%f")[:-3] + # TODO function: that prints summary of data, shows it to user # TODO function: create a pandas data frame with the utterances diff --git a/tests/corpus/parsing/test_cha.py b/tests/corpus/parsing/test_cha.py index 2cf10f0..06d470a 100644 --- a/tests/corpus/parsing/test_cha.py +++ b/tests/corpus/parsing/test_cha.py @@ -7,25 +7,6 @@ from sktalk.corpus.parsing.parser import InputFile -class TestParser: - milliseconds_timestamp = [ - ["0", "00:00:00.000"], - ["1706326", "00:28:26.326"], - ["222222", "00:03:42.222"], - ["None", None] - ] - - @pytest.mark.parametrize("milliseconds, timestamp", milliseconds_timestamp) - def test_to_timestamp(self, milliseconds, timestamp): - assert InputFile._to_timestamp(milliseconds) == timestamp # noqa: W0212 - - with pytest.raises(ValueError, match="exceeds 24h"): - InputFile._to_timestamp("987654321") # noqa: W0212 - - with pytest.raises(ValueError, match="negative"): - InputFile._to_timestamp("-1") # noqa: W0212 - - class TestChaFile: urls = [ "https://ca.talkbank.org/data-orig/GCSAusE/01.cha", @@ -62,15 +43,6 @@ def test_parse(self, download_file): assert language == ["eng"] # TODO assert that there are no empty utterances - def test_split_time(self): - time = "(1748070, 1751978)" - begin_end = ("00:29:08.070", "00:29:11.978") - assert ChaFile._split_time(time) == begin_end # noqa: W0212 - - time = None - begin_end = (None, None) - assert ChaFile._split_time(time) == begin_end # noqa: W0212 - unclean_clean = [ [ r"{'SAM': 'que (0.5) e(u) gosto \x151790561_1793421\x15 (0.2)→'}", diff --git 
a/tests/corpus/test_utterance.py b/tests/corpus/test_utterance.py index ed26e66..c456818 100644 --- a/tests/corpus/test_utterance.py +++ b/tests/corpus/test_utterance.py @@ -32,3 +32,32 @@ def test_asdict(self): def test_until(self, convo_utts): utt1, utt2 = convo_utts assert utt1.until(utt2) == -100 + + milliseconds_timestamp = [ + ["0", "00:00:00.000"], + ["1706326", "00:28:26.326"], + ["222222", "00:03:42.222"], + ["None", None] + ] + + @pytest.mark.parametrize("milliseconds, timestamp", milliseconds_timestamp) + def test_to_timestamp(self, milliseconds, timestamp): + utt = Utterance(utterance="") + assert utt._to_timestamp(milliseconds) == timestamp # noqa: W0212 + + def test_to_timestamp_errors(self): + utt = Utterance(utterance="") + with pytest.raises(ValueError, match="exceeds 24h"): + utt._to_timestamp("987654321") # noqa: W0212 + + with pytest.raises(ValueError, match="negative"): + Utterance._to_timestamp("-1") # noqa: W0212 + + time_begin_end = [[(1748070, 1751978), "00:29:08.070", "00:29:11.978"], + [None, None, None]] + + @pytest.mark.parametrize("time, begin, end", time_begin_end) + def test_split_time(self, time, begin, end): + utt = Utterance(utterance="", time=time) + assert utt.begin == begin + assert utt.end == end From 308ab94d07a5b71f8f918c63b401a6b8f972aeea Mon Sep 17 00:00:00 2001 From: Barbara Vreede Date: Fri, 10 Nov 2023 11:08:54 +0100 Subject: [PATCH 19/53] object oriented and small fixes --- sktalk/corpus/parsing/cha.py | 1 + sktalk/corpus/parsing/parser.py | 1 - sktalk/corpus/utterance.py | 17 ++++++++--------- tests/corpus/parsing/test_cha.py | 5 ++--- tests/corpus/test_utterance.py | 5 ++++- 5 files changed, 15 insertions(+), 14 deletions(-) diff --git a/sktalk/corpus/parsing/cha.py b/sktalk/corpus/parsing/cha.py index b2a70fc..defa2ea 100644 --- a/sktalk/corpus/parsing/cha.py +++ b/sktalk/corpus/parsing/cha.py @@ -30,6 +30,7 @@ def _to_utterance(chat_utterance) -> Utterance: utterance=str(chat_utterance.tiers), ) utterance.utterance 
= ChaFile._clean_utterance(utterance.utterance) + utterance.time = list(utterance.time) return utterance def _extract_metadata(self): diff --git a/sktalk/corpus/parsing/parser.py b/sktalk/corpus/parsing/parser.py index 3d3d4fd..119a3a6 100644 --- a/sktalk/corpus/parsing/parser.py +++ b/sktalk/corpus/parsing/parser.py @@ -1,5 +1,4 @@ import abc -import datetime from ..conversation import Conversation diff --git a/sktalk/corpus/utterance.py b/sktalk/corpus/utterance.py index 1efb5c9..d03d61d 100644 --- a/sktalk/corpus/utterance.py +++ b/sktalk/corpus/utterance.py @@ -51,15 +51,14 @@ def asdict(self): def until(self, next_utt): return next_utt.time[0] - self.time[1] - def _split_time(self, time: list): - if time is None: - return None, None - begin, end = str(time).split(", ") - begin = begin.replace("(", "") - end = end.replace(")", "") - begin = self._to_timestamp(begin) - end = self._to_timestamp(end) - return (begin, end) + def _split_time(self): + try: + begin, end = self.time + self.begin = self._to_timestamp(begin) + self.end = self._to_timestamp(end) + except (ValueError, TypeError): + self.begin = None + self.end = None @staticmethod def _to_timestamp(time_ms): diff --git a/tests/corpus/parsing/test_cha.py b/tests/corpus/parsing/test_cha.py index 06d470a..53fa736 100644 --- a/tests/corpus/parsing/test_cha.py +++ b/tests/corpus/parsing/test_cha.py @@ -2,9 +2,8 @@ import tempfile import pytest import requests -import sktalk +from sktalk.corpus.conversation import Conversation from sktalk.corpus.parsing.cha import ChaFile -from sktalk.corpus.parsing.parser import InputFile class TestChaFile: @@ -33,7 +32,7 @@ def download_file(self, request): @pytest.mark.parametrize("download_file", urls, indirect=True) def test_parse(self, download_file): parsed_cha = ChaFile(download_file).parse() - assert isinstance(parsed_cha, sktalk.corpus.conversation.Conversation) + assert isinstance(parsed_cha, Conversation) source = parsed_cha.metadata["source"] assert 
os.path.splitext(source)[1] == ".cha" assert parsed_cha.utterances[0].begin == "00:00:00.000" diff --git a/tests/corpus/test_utterance.py b/tests/corpus/test_utterance.py index c456818..11ea48b 100644 --- a/tests/corpus/test_utterance.py +++ b/tests/corpus/test_utterance.py @@ -51,9 +51,12 @@ def test_to_timestamp_errors(self): utt._to_timestamp("987654321") # noqa: W0212 with pytest.raises(ValueError, match="negative"): - Utterance._to_timestamp("-1") # noqa: W0212 + utt._to_timestamp("-1") # noqa: W0212 time_begin_end = [[(1748070, 1751978), "00:29:08.070", "00:29:11.978"], + [[1748070, 1751978], "00:29:08.070", "00:29:11.978"], + [[1], None, None], + [1, None, None], [None, None, None]] @pytest.mark.parametrize("time, begin, end", time_begin_end) From d5c62f96059dc6a9df41453e2a3073245448ebe0 Mon Sep 17 00:00:00 2001 From: Barbara Vreede Date: Fri, 10 Nov 2023 11:09:55 +0100 Subject: [PATCH 20/53] refactor post init --- sktalk/corpus/utterance.py | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/sktalk/corpus/utterance.py b/sktalk/corpus/utterance.py index d03d61d..db2ff5b 100644 --- a/sktalk/corpus/utterance.py +++ b/sktalk/corpus/utterance.py @@ -22,24 +22,22 @@ class Utterance: def __post_init__(self): # clean utterance: - # remove leading and trailing whitespace - self.utterance_clean = self.utterance.strip() - # remove square brackets and their contents, e.g. 
[laugh] - self.utterance_clean = re.sub(r'\[[^\]]*\]', '', self.utterance_clean) - # remove punctuation inside and outside of words - self.utterance_clean = re.sub(r'[^\w\s]', '', self.utterance_clean) - # remove numbers that are surrounded by spaces - self.utterance_clean = re.sub(r'\s[0-9]+\s', ' ', self.utterance_clean) + if not self.utterance_clean: + self._clean_utterance() # generate a list of words in the utterance - self.utterance_list = self.utterance_clean.split() + if not self.utterance_list: + self.utterance_list = self.utterance_clean.split() # count words and characters - self.n_words = len(self.utterance_list) - self.n_characters = sum(len(word) for word in self.utterance_list) + if not self.n_words: + self.n_words = len(self.utterance_list) + if not self.n_characters: + self.n_characters = sum(len(word) for word in self.utterance_list) # calculate timestamps - self.begin, self.end = self._split_time(self.time) + if not self.begin or not self.end: + self._split_time() def get_audio(self): pass @@ -48,6 +46,16 @@ def asdict(self): utt_dict = asdict(self) return utt_dict + def _clean_utterance(self): + # remove leading and trailing whitespace + self.utterance_clean = self.utterance.strip() + # remove square brackets and their contents, e.g. 
[laugh] + self.utterance_clean = re.sub(r'\[[^\]]*\]', '', self.utterance_clean) + # remove punctuation inside and outside of words + self.utterance_clean = re.sub(r'[^\w\s]', '', self.utterance_clean) + # remove numbers that are surrounded by spaces + self.utterance_clean = re.sub(r'\s[0-9]+\s', ' ', self.utterance_clean) + def until(self, next_utt): return next_utt.time[0] - self.time[1] From 12a3d3a58f82e640c90245c84b913cc482566fcb Mon Sep 17 00:00:00 2001 From: Barbara Vreede Date: Fri, 10 Nov 2023 16:03:05 +0100 Subject: [PATCH 21/53] add subconversation functionaliry --- sktalk/corpus/conversation.py | 24 +++++++++++- tests/conftest.py | 65 +++++++++++++++++++++++++------ tests/corpus/test_conversation.py | 65 ++++++++++++++++++++++++------- tests/corpus/test_corpus.py | 6 +-- tests/corpus/test_utterance.py | 26 +++++++------ 5 files changed, 145 insertions(+), 41 deletions(-) diff --git a/sktalk/corpus/conversation.py b/sktalk/corpus/conversation.py index 2467c27..3ef93c5 100644 --- a/sktalk/corpus/conversation.py +++ b/sktalk/corpus/conversation.py @@ -69,7 +69,29 @@ def asdict(self): """ return self._metadata | {"Utterances": [u.asdict() for u in self._utterances]} - def subconversation(self, index: int, before: int = 0, after: int = 0): + def subconversation(self, + index: int, + before: int = 0, + after: int = 0, + time_or_index: str = "index"): + # select utterance based on the value of utterance.begin + # type index = select utterance based on index + # type time = select utterance based on time + + # verify if index is within range: + if index < 0 or index >= len(self.utterances): + raise IndexError("Index out of range") + if time_or_index == "time": + begin = self.utterances[index].time[0] - before + end = self.utterances[index].time[1] + after + [u for u in self.utterances if u.time[0] >= begin and u.time[1] <= end] + elif time_or_index == "index": + # check if selection is within range + if index - before < 0 or index + after + 1 > 
len(self.utterances): + raise IndexError("Index out of range") + else: + raise ValueError("time_or_index must be either 'time' or 'index'") + # start with utterance # obtain utterance context; search criteria may be time, or [i] # create a new conversation object from this diff --git a/tests/conftest.py b/tests/conftest.py index 1998a63..61a2c03 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -24,21 +24,62 @@ def convo_meta(): @pytest.fixture def convo_utts(): - utterance1 = Utterance( - utterance="Hello", - participant="A", - time=[0, 1000] - ) - utterance2 = Utterance( - utterance="Monde", - participant="B", - time=[900, 1800] - ) - return [utterance1, utterance2] + return [ + Utterance( + utterance="Hello A", + participant="A", + time=[0, 1000] + ), + Utterance( + utterance="Monde B", + participant="B", + time=[900, 3500] + ), + Utterance( + utterance="Hello C", + participant="A", + time=[1000, 12000] + ), + Utterance( + utterance="Monde D", + participant="B", + time=[1200, 2000] + ), + Utterance( + utterance="Hello E", + participant="A", + time=[3500, 4500] + ), + Utterance( + utterance="Utterance U", + participant="B", + time=[5000, 8000] + ), + Utterance( + utterance="Monde F", + participant="B", + time=[5500, 7500] + ), + Utterance( + utterance="Hello G", + participant="A", + time=[7000, 9500] + ), + Utterance( + utterance="Monde H", + participant="B", + time=[9000, 12500] + ), + Utterance( + utterance="Hello I", + participant="A", + time=[12000, 13000] + ) + ] @pytest.fixture -def my_convo(convo_utts, convo_meta): +def convo(convo_utts, convo_meta): return Conversation(convo_utts, convo_meta) diff --git a/tests/corpus/test_conversation.py b/tests/corpus/test_conversation.py index 9788a2c..4c46c89 100644 --- a/tests/corpus/test_conversation.py +++ b/tests/corpus/test_conversation.py @@ -1,13 +1,14 @@ import json import os +from contextlib import nullcontext as does_not_raise import pytest from sktalk.corpus.conversation import Conversation class 
TestConversation: - def test_instantiate(self, my_convo, convo_utts, convo_meta): + def test_instantiate(self, convo, convo_utts, convo_meta): # test the conversation fixture - assert isinstance(my_convo, Conversation) + assert isinstance(convo, Conversation) # test instantiation of a new conversation with content new_convo = Conversation(utterances=convo_utts, metadata=convo_meta) @@ -27,25 +28,63 @@ def test_instantiate(self, my_convo, convo_utts, convo_meta): with pytest.warns(match="no Utterances"): Conversation(utterances=[]) - def test_asdict(self, my_convo): + def test_asdict(self, convo): """Verify content of dictionary based on conversation""" - convodict = my_convo.asdict() - assert convodict["Utterances"][0] == my_convo.utterances[0].asdict() - assert convodict["source"] == my_convo.metadata["source"] + convodict = convo.asdict() + assert convodict["Utterances"][0] == convo.utterances[0].asdict() + assert convodict["source"] == convo.metadata["source"] @pytest.mark.parametrize("user_path, expected_path", [ ("tmp_convo.json", "tmp_convo.json"), ("tmp_convo", "tmp_convo.json") ]) - def test_write_json(self, my_convo, tmp_path, user_path, expected_path): + def test_write_json(self, convo, tmp_path, user_path, expected_path): tmp_file = f"{str(tmp_path)}{os.sep}{user_path}" - my_convo.write_json(tmp_file) + convo.write_json(tmp_file) tmp_file_exp = f"{str(tmp_path)}{os.sep}{expected_path}" assert os.path.exists(tmp_file_exp) with open(tmp_file_exp, encoding='utf-8') as f: - my_convo_read = json.load(f) - assert isinstance(my_convo_read, dict) - assert my_convo_read == my_convo.asdict() + convo_read = json.load(f) + assert isinstance(convo_read, dict) + assert convo_read == convo.asdict() - def test_until(self, my_convo): - assert my_convo.subconversation(index=0, after=1).until_next == -100 + +class TestConversationMetrics: + @pytest.mark.parametrize("index, before, after, time_or_index, error", + [ + (0, 0, 1, "index", does_not_raise()), + (0, 1, 1, 
"index", pytest.raises(IndexError)), + (9, 1, 0, "index", does_not_raise()), + (9, 1, 1, "index", pytest.raises(IndexError)), + (0, 0, 0, "neither_time_nor_index", + pytest.raises(ValueError)) + ]) + def test_subconversation_errors(self, convo, index, before, after, time_or_index, error): + with error: + convo.subconversation(index=index, + before=before, + after=after, + time_or_index=time_or_index) + + @pytest.mark.parametrize("index, before, after, time_or_index, expected_length", + [ + (0, 0, 1, "index", 2), + (5, 2, 0, "index", 3), + (1, 1000, 0, "time", 2), + (5, 3000, 3000, "time", 7), + ]) + def test_subconversation(self, convo, index, before, after, time_or_index, expected_length): + sub = convo.subconversation(index=index, + before=before, + after=after, + time_or_index=time_or_index) + assert isinstance(sub, Conversation) + assert len(sub.utterances) == expected_length + + @pytest.mark.parametrize("index, before, after, time_or_index, expected", + [(0, 0, 1, "index", -100)]) + def test_until(self, convo, index, before, after, time_or_index, expected): + assert convo.subconversation(index=index, + before=before, + after=after, + time_or_index=time_or_index).until_next == expected diff --git a/tests/corpus/test_corpus.py b/tests/corpus/test_corpus.py index 66f3efb..1bd4349 100644 --- a/tests/corpus/test_corpus.py +++ b/tests/corpus/test_corpus.py @@ -34,10 +34,10 @@ def test_init(self, conversations, metadata, error): assert isinstance(corpus.conversations, list) assert conversations is None or corpus.conversations == conversations - def test_append(self, my_corpus, my_convo): + def test_append(self, my_corpus, convo): # conversation can be added to an existing corpus - my_corpus.append(my_convo) - assert my_corpus.conversations[-1] == my_convo + my_corpus.append(convo) + assert my_corpus.conversations[-1] == convo # it is not possible to add non-conversation objects to a corpus with pytest.raises(TypeError, match="type Conversation"): diff --git 
a/tests/corpus/test_utterance.py b/tests/corpus/test_utterance.py index 11ea48b..861e866 100644 --- a/tests/corpus/test_utterance.py +++ b/tests/corpus/test_utterance.py @@ -3,23 +3,25 @@ class TestUtterance(): - @pytest.mark.parametrize("utt_in, nwords, nchars", [ - ("Hello world", 2, 10), - ("One", 1, 3), - ("", 0, 0), - ("Hello [laugh]", 1, 5), - ("[laugh] hello [laugh] [noise]!", 1, 5), - ("Hello 567 world", 2, 10), - ("He5lo wor4d", 2, 10), - ("zung1 ji3 jyut6", 3, 13), - ("我 我 是 上学 去, 我 现在 给 她 买 diaper@s 了 .", 12, 20) + @pytest.mark.parametrize("utt_in, nwords, nchars, uttlist", [ + ("Hello world", 2, 10, ["Hello", "world"]), + ("One", 1, 3, ["One"]), + ("", 0, 0, []), + ("Hello [laugh]", 1, 5, ["Hello"]), + ("[laugh] hello [laugh] [noise]!", 1, 5, ["hello"]), + ("Hello 567 world", 2, 10, ["Hello", "world"]), + ("He5lo wor4d", 2, 10, ["He5lo", "wor4d"]), + ("zung1 ji3 jyut6", 3, 13, ["zung1", "ji3", "jyut6"]), + ("上学 去, 我 现在 diaper@s 了 .", 6, 14, + ["上学", "去", "我", "现在", "diapers", "了"]) ]) - def test_postinit(self, utt_in, nwords, nchars): + def test_postinit(self, utt_in, nwords, nchars, uttlist): utt = Utterance( utterance=utt_in ) assert utt.n_words == nwords assert utt.n_characters == nchars + assert utt.utterance_list == uttlist def test_asdict(self): utt = Utterance( @@ -30,7 +32,7 @@ def test_asdict(self): assert utt_dict["n_words"] == 2 def test_until(self, convo_utts): - utt1, utt2 = convo_utts + utt1, utt2 = convo_utts[:2] assert utt1.until(utt2) == -100 milliseconds_timestamp = [ From 1fb7323ac62b9caed0b66e912ce8b12252f3d402 Mon Sep 17 00:00:00 2001 From: Barbara Vreede Date: Tue, 14 Nov 2023 11:28:43 +0100 Subject: [PATCH 22/53] subconversation can select based on index or time --- sktalk/corpus/conversation.py | 60 ++++++++++++++++++++++--------- tests/conftest.py | 4 +-- tests/corpus/test_conversation.py | 26 +++++++++++--- 3 files changed, 66 insertions(+), 24 deletions(-) diff --git a/sktalk/corpus/conversation.py 
b/sktalk/corpus/conversation.py index 3ef93c5..383cb83 100644 --- a/sktalk/corpus/conversation.py +++ b/sktalk/corpus/conversation.py @@ -5,7 +5,7 @@ class Conversation(Writer): def __init__( - self, utterances: list["Utterance"], metadata: dict = None # noqa: F821 + self, utterances: list["Utterance"], metadata: dict | None = None # noqa: F821 ) -> None: """Representation of a transcribed conversation @@ -72,33 +72,59 @@ def asdict(self): def subconversation(self, index: int, before: int = 0, - after: int = 0, - time_or_index: str = "index"): - # select utterance based on the value of utterance.begin - # type index = select utterance based on index - # type time = select utterance based on time + after: int | None = None, + time_or_index: str = "index") -> "Conversation": + """Select utterances to provide context as a sub-conversation - # verify if index is within range: + Args: + index (int): The index of the utterance for which to provide context + before (int, optional): Either the number of utterances prior to indicated utterance, or the time in ms preceding the utterance's begin. Defaults to 0. + after (int, optional): Either the number of utterances after the indicated utterance, or the time in ms following the utterance's end. Defaults to None, which then assumes the same value as `before`. + time_or_index (str, optional): Use "time" to select based on time (in ms), or "index" to select a set number of utterances irrespective of timing. Defaults to "index". 
+ + Raises: + IndexError: Index provided must be within range of utterances + ValueError: time_or_index must be either "time" or "index" + + Returns: + Conversation: Conversation object containing a reduced set of utterances + """ if index < 0 or index >= len(self.utterances): raise IndexError("Index out of range") - if time_or_index == "time": + if after is None: + after = before + if time_or_index == "index": + # if before/after would exceed the bounds of the list, adjust + if index - before < 0: + before = index + if index + after + 1 > len(self.utterances): + after = len(self.utterances) - index - 1 + returned_utterances = self.utterances[index-before:index+after+1] + elif time_or_index == "time": begin = self.utterances[index].time[0] - before end = self.utterances[index].time[1] + after - [u for u in self.utterances if u.time[0] >= begin and u.time[1] <= end] - elif time_or_index == "index": - # check if selection is within range - if index - before < 0 or index + after + 1 > len(self.utterances): - raise IndexError("Index out of range") + returned_utterances = [ + u for u in self.utterances if self.overlap(begin, end, u.time)] else: raise ValueError("time_or_index must be either 'time' or 'index'") - # start with utterance - # obtain utterance context; search criteria may be time, or [i] - # create a new conversation object from this - return Conversation(self.utterances[index-before:index+after+1], self.metadata) + return Conversation(returned_utterances, self.metadata) @property def until_next(self): if len(self.utterances) != 2: raise ValueError("Conversation must have 2 utterances") return self.utterances[0].until(self.utterances[1]) + + @staticmethod + def overlap(begin: int, end: int, time: list): + # there is overlap if: + # time[0] falls between begin and end + # time[1] falls between and end + # time[0] is before begin and time[1] is after end + if begin <= time[0] <= end or begin <= time[1] <= end: + return True + elif time[0] <= begin and time[1] 
>= end: + return True + else: + return False diff --git a/tests/conftest.py b/tests/conftest.py index 61a2c03..32c30ae 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -38,12 +38,12 @@ def convo_utts(): Utterance( utterance="Hello C", participant="A", - time=[1000, 12000] + time=[1001, 12000] ), Utterance( utterance="Monde D", participant="B", - time=[1200, 2000] + time=[1200, 1999] ), Utterance( utterance="Hello E", diff --git a/tests/corpus/test_conversation.py b/tests/corpus/test_conversation.py index 4c46c89..d84c7e7 100644 --- a/tests/corpus/test_conversation.py +++ b/tests/corpus/test_conversation.py @@ -53,9 +53,8 @@ class TestConversationMetrics: @pytest.mark.parametrize("index, before, after, time_or_index, error", [ (0, 0, 1, "index", does_not_raise()), - (0, 1, 1, "index", pytest.raises(IndexError)), - (9, 1, 0, "index", does_not_raise()), - (9, 1, 1, "index", pytest.raises(IndexError)), + (20, 1, 1, "index", pytest.raises(IndexError)), + (0, 50, 50, "index", does_not_raise()), (0, 0, 0, "neither_time_nor_index", pytest.raises(ValueError)) ]) @@ -70,8 +69,11 @@ def test_subconversation_errors(self, convo, index, before, after, time_or_index [ (0, 0, 1, "index", 2), (5, 2, 0, "index", 3), - (1, 1000, 0, "time", 2), - (5, 3000, 3000, "time", 7), + (0, 2, 2, "index", 3), + (0, 2, None, "index", 3), + (0, 0, 0, "time", 2), # A, B + (5, 3000, 3000, "time", 7), # B,C,E,U,F,G,H + (5, 0, 0, "time", 4), # C, U, F, G ]) def test_subconversation(self, convo, index, before, after, time_or_index, expected_length): sub = convo.subconversation(index=index, @@ -88,3 +90,17 @@ def test_until(self, convo, index, before, after, time_or_index, expected): before=before, after=after, time_or_index=time_or_index).until_next == expected + + def test_overlap(self): + # entire utterance in window + assert Conversation.overlap(80, 120, [90, 110]) + # beginning of utterance in window + assert Conversation.overlap(80, 100, [90, 110]) + # end of utterance in window + assert 
Conversation.overlap(100, 120, [90, 110]) + # utterance covers window entirely + assert Conversation.overlap(95, 105, [90, 110]) + assert not Conversation.overlap( + 120, 140, [90, 110]) # utterance before window + assert not Conversation.overlap( + 70, 80, [90, 110]) # utterance after window From 51f1a9eb2faa7aebc7cfb7b0a0e9d4fc75c6aff6 Mon Sep 17 00:00:00 2001 From: Barbara Vreede Date: Tue, 14 Nov 2023 11:59:51 +0100 Subject: [PATCH 23/53] make linter happy --- sktalk/corpus/conversation.py | 21 ++++++++++++--------- tests/corpus/test_conversation.py | 13 +++++++------ 2 files changed, 19 insertions(+), 15 deletions(-) diff --git a/sktalk/corpus/conversation.py b/sktalk/corpus/conversation.py index 383cb83..48521f8 100644 --- a/sktalk/corpus/conversation.py +++ b/sktalk/corpus/conversation.py @@ -1,11 +1,12 @@ import warnings +from typing import Optional from .utterance import Utterance from .write.writer import Writer class Conversation(Writer): def __init__( - self, utterances: list["Utterance"], metadata: dict | None = None # noqa: F821 + self, utterances: list["Utterance"], metadata: Optional[dict] = None # noqa: F821 ) -> None: """Representation of a transcribed conversation @@ -72,15 +73,20 @@ def asdict(self): def subconversation(self, index: int, before: int = 0, - after: int | None = None, + after: Optional[int] = None, time_or_index: str = "index") -> "Conversation": """Select utterances to provide context as a sub-conversation Args: index (int): The index of the utterance for which to provide context - before (int, optional): Either the number of utterances prior to indicated utterance, or the time in ms preceding the utterance's begin. Defaults to 0. - after (int, optional): Either the number of utterances after the indicated utterance, or the time in ms following the utterance's end. Defaults to None, which then assumes the same value as `before`. 
- time_or_index (str, optional): Use "time" to select based on time (in ms), or "index" to select a set number of utterances irrespective of timing. Defaults to "index". + before (int, optional): Either the number of utterances prior to indicated utterance, + or the time in ms preceding the utterance's begin. Defaults to 0. + after (int, optional): Either the number of utterances after the indicated utterance, + or the time in ms following the utterance's end. Defaults to None, + which then assumes the same value as `before`. + time_or_index (str, optional): Use "time" to select based on time (in ms), or "index" + to select a set number of utterances irrespective of timing. + Defaults to "index". Raises: IndexError: Index provided must be within range of utterances @@ -124,7 +130,4 @@ def overlap(begin: int, end: int, time: list): # time[0] is before begin and time[1] is after end if begin <= time[0] <= end or begin <= time[1] <= end: return True - elif time[0] <= begin and time[1] >= end: - return True - else: - return False + return time[0] <= begin and time[1] >= end diff --git a/tests/corpus/test_conversation.py b/tests/corpus/test_conversation.py index d84c7e7..c8d6442 100644 --- a/tests/corpus/test_conversation.py +++ b/tests/corpus/test_conversation.py @@ -50,15 +50,16 @@ def test_write_json(self, convo, tmp_path, user_path, expected_path): class TestConversationMetrics: - @pytest.mark.parametrize("index, before, after, time_or_index, error", + @pytest.mark.parametrize("args, error", [ - (0, 0, 1, "index", does_not_raise()), - (20, 1, 1, "index", pytest.raises(IndexError)), - (0, 50, 50, "index", does_not_raise()), - (0, 0, 0, "neither_time_nor_index", + ([0, 0, 1, "index"], does_not_raise()), + ([20, 1, 1, "index"], pytest.raises(IndexError)), + ([0, 50, 50, "index"], does_not_raise()), + ([0, 0, 0, "neither_time_nor_index"], pytest.raises(ValueError)) ]) - def test_subconversation_errors(self, convo, index, before, after, time_or_index, error): + def 
test_subconversation_errors(self, convo, args, error): + index, before, after, time_or_index = args with error: convo.subconversation(index=index, before=before, From f7db0f66725e0be4099ac9bdcc9124eec15dba56 Mon Sep 17 00:00:00 2001 From: Barbara Vreede Date: Tue, 14 Nov 2023 12:14:22 +0100 Subject: [PATCH 24/53] also pack arguments for second subconversation test --- tests/corpus/test_conversation.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/tests/corpus/test_conversation.py b/tests/corpus/test_conversation.py index c8d6442..5cdba01 100644 --- a/tests/corpus/test_conversation.py +++ b/tests/corpus/test_conversation.py @@ -66,17 +66,18 @@ def test_subconversation_errors(self, convo, args, error): after=after, time_or_index=time_or_index) - @pytest.mark.parametrize("index, before, after, time_or_index, expected_length", + @pytest.mark.parametrize("args, expected_length", [ - (0, 0, 1, "index", 2), - (5, 2, 0, "index", 3), - (0, 2, 2, "index", 3), - (0, 2, None, "index", 3), - (0, 0, 0, "time", 2), # A, B - (5, 3000, 3000, "time", 7), # B,C,E,U,F,G,H - (5, 0, 0, "time", 4), # C, U, F, G + ([0, 0, 1, "index"], 2), + ([5, 2, 0, "index"], 3), + ([0, 2, 2, "index"], 3), + ([0, 2, None, "index"], 3), + ([0, 0, 0, "time"], 2), # A, B + ([5, 3000, 3000, "time"], 7), # B,C,E,U,F,G,H + ([5, 0, 0, "time"], 4), # C, U, F, G ]) - def test_subconversation(self, convo, index, before, after, time_or_index, expected_length): + def test_subconversation(self, convo, args, expected_length): + index, before, after, time_or_index = args sub = convo.subconversation(index=index, before=before, after=after, From 47dc536e05eedd11c31a8f8ad7d32c8baca4f420 Mon Sep 17 00:00:00 2001 From: Barbara Vreede Date: Tue, 14 Nov 2023 12:20:25 +0100 Subject: [PATCH 25/53] fix linting issues --- sktalk/corpus/utterance.py | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/sktalk/corpus/utterance.py 
b/sktalk/corpus/utterance.py index db2ff5b..0805305 100644 --- a/sktalk/corpus/utterance.py +++ b/sktalk/corpus/utterance.py @@ -3,22 +3,22 @@ from dataclasses import asdict from dataclasses import dataclass from typing import Any -from .participant import Participant +from typing import Optional @dataclass class Utterance: utterance: str - participant: Participant = None - time: list = None - begin: str = None - end: str = None - metadata: dict[str, Any] = None - utterance_clean: str = None - utterance_list: list[str] = None - n_words: int = None - n_characters: int = None - time_to_next: int = None + participant: Optional[str] = None + time: Optional[list] = None + begin: Optional[str] = None + end: Optional[str] = None + metadata: Optional[dict[str, Any]] = None + utterance_clean: Optional[str] = None + utterance_list: Optional[list[str]] = None + n_words: Optional[int] = None + n_characters: Optional[int] = None + time_to_next: Optional[int] = None def __post_init__(self): # clean utterance: @@ -43,8 +43,7 @@ def get_audio(self): pass def asdict(self): - utt_dict = asdict(self) - return utt_dict + return asdict(self) def _clean_utterance(self): # remove leading and trailing whitespace @@ -54,7 +53,7 @@ def _clean_utterance(self): # remove punctuation inside and outside of words self.utterance_clean = re.sub(r'[^\w\s]', '', self.utterance_clean) # remove numbers that are surrounded by spaces - self.utterance_clean = re.sub(r'\s[0-9]+\s', ' ', self.utterance_clean) + self.utterance_clean = re.sub(r'\s\d+\s', ' ', self.utterance_clean) def until(self, next_utt): return next_utt.time[0] - self.time[1] From aaff48af3f33087c4fdda9ad6d7dfa4e2518858f Mon Sep 17 00:00:00 2001 From: Barbara Vreede Date: Tue, 14 Nov 2023 13:08:42 +0100 Subject: [PATCH 26/53] add dyadic property --- sktalk/corpus/conversation.py | 5 +++++ tests/conftest.py | 2 +- tests/corpus/test_conversation.py | 7 +++++++ 3 files changed, 13 insertions(+), 1 deletion(-) diff --git 
a/sktalk/corpus/conversation.py b/sktalk/corpus/conversation.py index 48521f8..410f774 100644 --- a/sktalk/corpus/conversation.py +++ b/sktalk/corpus/conversation.py @@ -122,6 +122,11 @@ def until_next(self): raise ValueError("Conversation must have 2 utterances") return self.utterances[0].until(self.utterances[1]) + @property + def dyadic(self) -> bool: + participants = [u.participant for u in self.utterances] + return len(set(participants)) == 2 + @staticmethod def overlap(begin: int, end: int, time: list): # there is overlap if: diff --git a/tests/conftest.py b/tests/conftest.py index 32c30ae..cff9c06 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -72,7 +72,7 @@ def convo_utts(): ), Utterance( utterance="Hello I", - participant="A", + participant="C", time=[12000, 13000] ) ] diff --git a/tests/corpus/test_conversation.py b/tests/corpus/test_conversation.py index 5cdba01..a094c01 100644 --- a/tests/corpus/test_conversation.py +++ b/tests/corpus/test_conversation.py @@ -106,3 +106,10 @@ def test_overlap(self): 120, 140, [90, 110]) # utterance before window assert not Conversation.overlap( 70, 80, [90, 110]) # utterance after window + + def test_dyadic(self, convo): + assert not convo.dyadic + convo2 = convo.subconversation(0, 2) + assert convo2.dyadic + convo3 = convo.subconversation(0) + assert not convo3.dyadic \ No newline at end of file From 7ee747fbf5da3bc06cc795c092290ebd5ccbd4d9 Mon Sep 17 00:00:00 2001 From: Barbara Vreede Date: Thu, 16 Nov 2023 16:11:24 +0100 Subject: [PATCH 27/53] apply conversation wide calculations dyadic and time to nxt --- sktalk/corpus/conversation.py | 54 +++++++++++++++++++++++-------- sktalk/corpus/parsing/cha.py | 5 ++- sktalk/corpus/utterance.py | 1 + tests/conftest.py | 2 +- tests/corpus/test_conversation.py | 30 ++++++++++------- 5 files changed, 65 insertions(+), 27 deletions(-) diff --git a/sktalk/corpus/conversation.py b/sktalk/corpus/conversation.py index 410f774..9983f76 100644 --- 
a/sktalk/corpus/conversation.py +++ b/sktalk/corpus/conversation.py @@ -107,32 +107,60 @@ def subconversation(self, after = len(self.utterances) - index - 1 returned_utterances = self.utterances[index-before:index+after+1] elif time_or_index == "time": - begin = self.utterances[index].time[0] - before - end = self.utterances[index].time[1] + after - returned_utterances = [ - u for u in self.utterances if self.overlap(begin, end, u.time)] + try: + begin = self.utterances[index].time[0] - before + end = self.utterances[index].time[1] + after + returned_utterances = [ + u for u in self.utterances if self.overlap(begin, end, u.time)] + except (TypeError, IndexError): + return Conversation([], self.metadata) else: - raise ValueError("time_or_index must be either 'time' or 'index'") + raise ValueError( + "`time_or_index` must be either 'time' or 'index'") + # TODO should metadata be part of this? return Conversation(returned_utterances, self.metadata) - @property - def until_next(self): - if len(self.utterances) != 2: - raise ValueError("Conversation must have 2 utterances") - return self.utterances[0].until(self.utterances[1]) + def _time_to_next(self) -> int: + # if len(self.utterances) != 2: + # return None + try: + return self.utterances[0].until(self.utterances[1]) + except (TypeError, IndexError): + return None - @property - def dyadic(self) -> bool: + def _dyadic(self) -> bool: participants = [u.participant for u in self.utterances] return len(set(participants)) == 2 + CONVERSATION_FUNCTIONS = { + "dyadic": _dyadic, + "time_to_next": _time_to_next, + } + + def apply(self, field, **kwargs): + """ + Apply a function to each utterance in the conversation + + Args: + func (function): function to apply to each utterance + field (str): field to update + """ + func = self.CONVERSATION_FUNCTIONS[field] + + for index, utterance in enumerate(self.utterances): + sub = self.subconversation(index=index, **kwargs) + value = func(sub) + utterance.__setattr__(field, value) + 
@staticmethod def overlap(begin: int, end: int, time: list): # there is overlap if: # time[0] falls between begin and end # time[1] falls between and end # time[0] is before begin and time[1] is after end - if begin <= time[0] <= end or begin <= time[1] <= end: + if time is None: + return False + elif begin <= time[0] <= end or begin <= time[1] <= end: return True return time[0] <= begin and time[1] >= end diff --git a/sktalk/corpus/parsing/cha.py b/sktalk/corpus/parsing/cha.py index defa2ea..50999d1 100644 --- a/sktalk/corpus/parsing/cha.py +++ b/sktalk/corpus/parsing/cha.py @@ -30,7 +30,10 @@ def _to_utterance(chat_utterance) -> Utterance: utterance=str(chat_utterance.tiers), ) utterance.utterance = ChaFile._clean_utterance(utterance.utterance) - utterance.time = list(utterance.time) + try: + utterance.time = list(utterance.time) + except TypeError: + utterance.time = None return utterance def _extract_metadata(self): diff --git a/sktalk/corpus/utterance.py b/sktalk/corpus/utterance.py index 0805305..6712674 100644 --- a/sktalk/corpus/utterance.py +++ b/sktalk/corpus/utterance.py @@ -19,6 +19,7 @@ class Utterance: n_words: Optional[int] = None n_characters: Optional[int] = None time_to_next: Optional[int] = None + dyadic: Optional[bool] = None def __post_init__(self): # clean utterance: diff --git a/tests/conftest.py b/tests/conftest.py index cff9c06..857d59e 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -63,7 +63,7 @@ def convo_utts(): Utterance( utterance="Hello G", participant="A", - time=[7000, 9500] + time=None ), Utterance( utterance="Monde H", diff --git a/tests/corpus/test_conversation.py b/tests/corpus/test_conversation.py index a094c01..d3d5d6c 100644 --- a/tests/corpus/test_conversation.py +++ b/tests/corpus/test_conversation.py @@ -73,8 +73,8 @@ def test_subconversation_errors(self, convo, args, error): ([0, 2, 2, "index"], 3), ([0, 2, None, "index"], 3), ([0, 0, 0, "time"], 2), # A, B - ([5, 3000, 3000, "time"], 7), # B,C,E,U,F,G,H - ([5, 
0, 0, "time"], 4), # C, U, F, G + ([5, 3000, 3000, "time"], 6), # B,C,E,U,F,H + ([5, 0, 0, "time"], 3), # C, U, F ]) def test_subconversation(self, convo, args, expected_length): index, before, after, time_or_index = args @@ -85,13 +85,13 @@ def test_subconversation(self, convo, args, expected_length): assert isinstance(sub, Conversation) assert len(sub.utterances) == expected_length - @pytest.mark.parametrize("index, before, after, time_or_index, expected", - [(0, 0, 1, "index", -100)]) - def test_until(self, convo, index, before, after, time_or_index, expected): - assert convo.subconversation(index=index, - before=before, - after=after, - time_or_index=time_or_index).until_next == expected + # @pytest.mark.parametrize("index, before, after, time_or_index, expected", + # [(0, 0, 1, "index", -100)]) + # def test_until(self, convo, index, before, after, time_or_index, expected): + # assert convo.subconversation(index=index, + # before=before, + # after=after, + # time_or_index=time_or_index)._time_to_next == expected def test_overlap(self): # entire utterance in window @@ -108,8 +108,14 @@ def test_overlap(self): 70, 80, [90, 110]) # utterance after window def test_dyadic(self, convo): - assert not convo.dyadic + assert not convo._dyadic() convo2 = convo.subconversation(0, 2) - assert convo2.dyadic + assert convo2._dyadic() convo3 = convo.subconversation(0) - assert not convo3.dyadic \ No newline at end of file + assert not convo3._dyadic() + + def test_apply_dyadic(self, convo): + convo.apply("dyadic", before=1) + assert convo.utterances[0].dyadic + assert convo.utterances[2].dyadic + assert not convo.utterances[8].dyadic From c0593cb352488f507f9dc06039531ffa4a5be54a Mon Sep 17 00:00:00 2001 From: Barbara Vreede Date: Tue, 21 Nov 2023 17:03:20 +0100 Subject: [PATCH 28/53] subconversation is internal --- sktalk/corpus/conversation.py | 12 +++++++----- tests/corpus/test_conversation.py | 24 ++++++++---------------- 2 files changed, 15 insertions(+), 21 deletions(-) 
diff --git a/sktalk/corpus/conversation.py b/sktalk/corpus/conversation.py index 9983f76..0b90abf 100644 --- a/sktalk/corpus/conversation.py +++ b/sktalk/corpus/conversation.py @@ -70,11 +70,11 @@ def asdict(self): """ return self._metadata | {"Utterances": [u.asdict() for u in self._utterances]} - def subconversation(self, - index: int, - before: int = 0, - after: Optional[int] = None, - time_or_index: str = "index") -> "Conversation": + def _subconversation(self, + index: int, + before: int = 0, + after: Optional[int] = None, + time_or_index: str = "index") -> "Conversation": """Select utterances to provide context as a sub-conversation Args: @@ -95,6 +95,7 @@ def subconversation(self, Returns: Conversation: Conversation object containing a reduced set of utterances """ + # TODO consider adding parameter 'strict' that only returns utterances entirely inside the window if index < 0 or index >= len(self.utterances): raise IndexError("Index out of range") if after is None: @@ -117,6 +118,7 @@ def subconversation(self, else: raise ValueError( "`time_or_index` must be either 'time' or 'index'") + return Conversation(utterances=returned_utterances) # TODO should metadata be part of this? 
return Conversation(returned_utterances, self.metadata) diff --git a/tests/corpus/test_conversation.py b/tests/corpus/test_conversation.py index d3d5d6c..e90a284 100644 --- a/tests/corpus/test_conversation.py +++ b/tests/corpus/test_conversation.py @@ -61,10 +61,10 @@ class TestConversationMetrics: def test_subconversation_errors(self, convo, args, error): index, before, after, time_or_index = args with error: - convo.subconversation(index=index, - before=before, - after=after, - time_or_index=time_or_index) + convo._subconversation(index=index, + before=before, + after=after, + time_or_index=time_or_index) @pytest.mark.parametrize("args, expected_length", [ @@ -78,21 +78,13 @@ def test_subconversation_errors(self, convo, args, error): ]) def test_subconversation(self, convo, args, expected_length): index, before, after, time_or_index = args - sub = convo.subconversation(index=index, - before=before, - after=after, - time_or_index=time_or_index) + sub = convo._subconversation(index=index, + before=before, + after=after, + time_or_index=time_or_index) assert isinstance(sub, Conversation) assert len(sub.utterances) == expected_length - # @pytest.mark.parametrize("index, before, after, time_or_index, expected", - # [(0, 0, 1, "index", -100)]) - # def test_until(self, convo, index, before, after, time_or_index, expected): - # assert convo.subconversation(index=index, - # before=before, - # after=after, - # time_or_index=time_or_index)._time_to_next == expected - def test_overlap(self): # entire utterance in window assert Conversation.overlap(80, 120, [90, 110]) From 319ca968506de0bf459fd040a08524b92f30a5a5 Mon Sep 17 00:00:00 2001 From: Barbara Vreede Date: Tue, 21 Nov 2023 17:04:33 +0100 Subject: [PATCH 29/53] count number of participants --- sktalk/corpus/conversation.py | 28 ++++++++++------------------ tests/corpus/test_conversation.py | 18 ++++++------------ 2 files changed, 16 insertions(+), 30 deletions(-) diff --git a/sktalk/corpus/conversation.py 
b/sktalk/corpus/conversation.py index 0b90abf..2af1580 100644 --- a/sktalk/corpus/conversation.py +++ b/sktalk/corpus/conversation.py @@ -120,25 +120,17 @@ def _subconversation(self, "`time_or_index` must be either 'time' or 'index'") return Conversation(utterances=returned_utterances) - # TODO should metadata be part of this? - return Conversation(returned_utterances, self.metadata) - - def _time_to_next(self) -> int: - # if len(self.utterances) != 2: - # return None - try: - return self.utterances[0].until(self.utterances[1]) - except (TypeError, IndexError): - return None - - def _dyadic(self) -> bool: - participants = [u.participant for u in self.utterances] - return len(set(participants)) == 2 + def _count_participants(self) -> int: + """Count the number of participants in a conversation + + Importantly: if one of the utterances has no participant, it is counted + as a separate participant (None). - CONVERSATION_FUNCTIONS = { - "dyadic": _dyadic, - "time_to_next": _time_to_next, - } + Returns: + int: number of participants + """ + participants = [u.participant for u in self.utterances] + return len(set(participants)) def apply(self, field, **kwargs): """ diff --git a/tests/corpus/test_conversation.py b/tests/corpus/test_conversation.py index e90a284..b4cf81b 100644 --- a/tests/corpus/test_conversation.py +++ b/tests/corpus/test_conversation.py @@ -99,15 +99,9 @@ def test_overlap(self): assert not Conversation.overlap( 70, 80, [90, 110]) # utterance after window - def test_dyadic(self, convo): - assert not convo._dyadic() - convo2 = convo.subconversation(0, 2) - assert convo2._dyadic() - convo3 = convo.subconversation(0) - assert not convo3._dyadic() - - def test_apply_dyadic(self, convo): - convo.apply("dyadic", before=1) - assert convo.utterances[0].dyadic - assert convo.utterances[2].dyadic - assert not convo.utterances[8].dyadic + def test_count_participants(self, convo): + assert convo._count_participants() == 3 + convo2 = convo._subconversation(index=0, 
before=2) + assert convo2._count_participants() == 2 + convo3 = convo._subconversation(index=0) + assert convo3._count_participants() == 1 From b8ca53c43e45a0d46197c64d4ac1c49bfe1da912 Mon Sep 17 00:00:00 2001 From: Barbara Vreede Date: Tue, 21 Nov 2023 17:05:29 +0100 Subject: [PATCH 30/53] calculate FTO --- sktalk/corpus/conversation.py | 67 ++++++++++++++++++++++++++++++----- sktalk/corpus/utterance.py | 21 +++++++++++ 2 files changed, 80 insertions(+), 8 deletions(-) diff --git a/sktalk/corpus/conversation.py b/sktalk/corpus/conversation.py index 2af1580..ac0f904 100644 --- a/sktalk/corpus/conversation.py +++ b/sktalk/corpus/conversation.py @@ -132,20 +132,71 @@ def _count_participants(self) -> int: participants = [u.participant for u in self.utterances] return len(set(participants)) - def apply(self, field, **kwargs): + def _update(self, field: str, values: list, **kwargs): """ - Apply a function to each utterance in the conversation + Update the all utterances in the conversation with calculated values + + This function also stores relevant arguments in the Conversation metadata. 
Args: - func (function): function to apply to each utterance - field (str): field to update + field (str): field of the Utterance to update + values (list): list of values to update each utterance with + kwargs (dict): information about the calculation to store in the Conversation metadata """ - func = self.CONVERSATION_FUNCTIONS[field] + if len(values) != len(self.utterances): + raise ValueError( + "The number of values must match the number of utterances") + try: + self._metadata["Calculations"].update(field=kwargs) + except KeyError: + self._metadata = {"Calculations": {field: kwargs}} + for index, utterance in enumerate(self.utterances): + utterance.__setattr__(field, values[index]) + + def calculate_FTO(self, window: int = 10000, planning_buffer: int = 200, n_participants: int = 2): + """Calculate Floor Transfer Offset (FTO) per utterance + + FTO is defined as the difference between the time that a turn starts and the + end of the most relevant prior turn by the other participant, which is not + necessarily the prior utterance. + An utterance does not receive an FTO if there are preceding utterances + within the window that do not have timing information, or if it lacks + timing information itself. + + To be a relevant prior turn, the following conditions must be met, respective to utterance U: + - the utterance must be by another speaker than U + - the utterance by the other speaker must be the most recent utterance by that speaker + - the utterance must have started before utterance U, more than `planning_buffer` ms before. + - the utterance must be partly or entirely within the context window (`window` ms prior to the start of utterance U) + - within the context window, there must be a maximum of `n_participants` speakers. + + Args: + window (int, optional): _description_. Defaults to 10000. + planning_buffer (int, optional): _description_. Defaults to 200. + n_participants (int, optional): _description_. Defaults to 2. 
+ """ + values = [] for index, utterance in enumerate(self.utterances): - sub = self.subconversation(index=index, **kwargs) - value = func(sub) - utterance.__setattr__(field, value) + sub = self._subconversation( + index=index, + time_or_index="time", + before=window, + after=0) + if not 2 <= sub._count_participants() <= n_participants: + values.append(None) + continue + potentials = [ + u for u in sub.utterances if utterance._relevant_for_fto(u, planning_buffer)] + try: + relevant = potentials[-1] + values.append(utterance.until(relevant)) + except IndexError: + values.append(None) + self._update("FTO", values, + window=window, + planning_buffer=planning_buffer, + n_participants=n_participants) @staticmethod def overlap(begin: int, end: int, time: list): diff --git a/sktalk/corpus/utterance.py b/sktalk/corpus/utterance.py index 6712674..0f7b082 100644 --- a/sktalk/corpus/utterance.py +++ b/sktalk/corpus/utterance.py @@ -59,6 +59,27 @@ def _clean_utterance(self): def until(self, next_utt): return next_utt.time[0] - self.time[1] + def _relevant_for_fto(self, prior_utt, planning_buffer: int): + """Assess whether an utterance is potentially relevant to calculate FTO + + An utterance is potentially relevant for fto calculation if: + - the utterance `prior_utt` must be by another speaker + - the utterance `prior_utt` must have started before the utterance itself, more than `planning_buffer` ms before. 
+ + The planning buffer is the minimum time between a relevant preceding utterance and the utterance itself + + Args: + prior_utt (Utterance): utterance to assess + planning_buffer (int): buffer time (in ms) + + Returns: + bool: whether the utterance `prior_utt` meets the criteria and is potentially relevant for FTO calculation + """ + return ( + self.participant != prior_utt.participant + and self.time[0] - planning_buffer >= prior_utt.time[0] + ) + def _split_time(self): try: begin, end = self.time From 6a34489a4bacd0eb116579a2246e57b12a0a2148 Mon Sep 17 00:00:00 2001 From: Barbara Vreede Date: Tue, 21 Nov 2023 17:10:24 +0100 Subject: [PATCH 31/53] remove old code --- sktalk/turndynamics.py | 303 ------------------------------------- tests/test_turndynamics.py | 0 2 files changed, 303 deletions(-) delete mode 100644 sktalk/turndynamics.py delete mode 100644 tests/test_turndynamics.py diff --git a/sktalk/turndynamics.py b/sktalk/turndynamics.py deleted file mode 100644 index 5d4252e..0000000 --- a/sktalk/turndynamics.py +++ /dev/null @@ -1,303 +0,0 @@ -import os -import glob -import math -import re -import datetime -import grapheme -import numpy as np -import pandas as pd -from collections import Counter - -from tqdm.autonotebook import tqdm -from joblib import Parallel, delayed - - -def readcorpus(filename, langshort=None, langfull=None): - """ returns a formatted language corpus with turn and transition measures - - :param a: filename of the corpus - :type a: string - :param b: short version of language name, defaults to none - :type b: string, optional - :param c: full version of language name, defaults to none - :type c: string, optional - - :return: formatted dataframe of the language corpus - """ - # convert time strings to the ISO 8601 time format hh:mm:ss.sss - def _converttime(text): - if pd.isna(text) == True: - return pd.NA - else: - h, m, s = text.split(':') - return int(datetime.timedelta(hours=int(h), - minutes=int(m), - 
seconds=float(s)).total_seconds()*1000) - - # number of unique sources - def _getsourceindex(source): - return n_sources.index(source) + 1 - - # talk, laugh, breath, or other conduct classification - def _getnature(utterance): - if pd.isna(utterance) == True: - return pd.NA - if utterance == '[laugh]': - return 'laugh' - if utterance == '[breath]': - return 'breath' - if utterance in ['[cough]', '[sneeze]', '[nod]', '[blow]', '[sigh]', - '[yawn]', '[sniff]', '[clearsthroat]', - '[lipsmack]', '[inhales]', '[groan]']: - return utterance - else: - return 'talk' - - # count number of characters - def _getnchar(utterance): - if pd.isna(utterance) == True: - return pd.NA - else: - utterance = Counter(utterance.replace(" ", "")) - return sum(utterance.values()) - - # create a 'window' for each utterance - # The window looks at 10s prior the begin of the current utterance (lookback) - # Only turns that begin within this lookback are included - # in the window. This means that if the prior turn began later - # than 10s before the current utterance, then the prior turn is - # not included in the window. 
- def _createwindow(begin, participant): - lookback = 10000 - lookfwd = 0 - filter = (df_transitions['begin'] >= ( - begin - lookback)) & (df_transitions['begin'] <= (begin + lookfwd)) - window = df_transitions.loc[filter] - # identify who produced the utterance - window['turnby'] = np.where(window['participant'] == participant, 'self', - 'other') - # calculate duration of all turns in window - stretch = window['end'].max() - window['begin'].min() - # calculate sum of all turn durations - talk_all = window['duration'].sum() - # calculate amount of talk produced by the participant in relation - # to the total amount of talk in the window - try: - talk_rel = window.loc[window['turnby'] == - 'self']['duration'].sum() / talk_all - except ZeroDivisionError: - talk_rel = pd.NA - # calculate amount of loading of the channel - # (1 = no empty space > overlap, < silences) - load = talk_all / stretch - # calculate total amount of turns in this time window - turns_all = len(window.index) - # calculate amount of turns by this participant relative to turns by others - try: - turns_rel = ( - len(window[window['turnby'] == 'self'].index)) / turns_all - except ZeroDivisionError: - turns_rel = pd.NA - - participants = window['participant'].nunique() - # create list of all measures computed - measures = [talk_all, talk_rel, load, - turns_all, turns_rel, participants] - return measures - - df = pd.read_csv(filename) - filename = re.sub('.csv', "", filename) - filename = re.sub('\.\/ElPaCo Dataset\/', '', filename) - # filename = re.sub('ElPaCo dataset\/', '', filename) - df['language'] = re.sub("[0-9]", "", filename) - if langshort is not None: - df['langshort'] = langshort - else: - df['langshort'] = df['language'] - if langfull is not None: - df['langfull'] = langfull - else: - df['langfull'] = df['language'] - df['corpus'] = filename - df['begin'] = df['begin'].apply(_converttime) - df['end'] = df['end'].apply(_converttime) - - # calculate duration of the turn - df['duration'] = 
df['end'] - df['begin'] - - # define improbably long (more than 40 seconds) and negative durations - n_weird_durations = df.loc[( - (df['duration'] > 40000) | (df['duration'] < 0))] - - # set weird durations to NA under the ff columns: begin, end, and duration - df.loc[(df['duration'] > 40000) | ( - df['duration'] < 0), ['duration']] = pd.NA - df.loc[(df['duration'] > 40000) | (df['duration'] < 0), ['end']] = pd.NA - df.loc[(df['duration'] > 40000) | (df['duration'] < 0), ['begin']] = pd.NA - - # create UID - - # list of unique sources in the corpus - n_sources = df['source'].unique().tolist() - # length of the number of sources (i.e. 20 sources = 2 chars), for padding - x = len(str(len(n_sources))) - # length of the number of turns in a source - # (i.e. 100 conversations = 3 chars), for padding - y = len(str(len(df.groupby(['source', 'utterance']).size()))) - - # UID format: language-source number-turn number (within a source) - uidbegin = np.where(pd.isna(df['begin']) == - True, 'NA', df['begin'].astype(str)) - df['uid'] = df['language'] + '-' + (df['source'].apply(_getsourceindex)).astype(str).str.zfill( - x) + '-' + (df.groupby(['source']).cumcount() + 1).astype(str).str.zfill(y) + '-' + uidbegin - - # deal with "unknown utterance" content - na_strings = ['[unk_utterance', '[unk_noise]', '[distortion]', - '[background]', '[background] M', '[static]', 'untranscribed', - '[noise]', '[inintel]', '[distorted]', 'tlyam kanəw'] - - # set unknown utterances to NA - df.loc[(df['utterance'].isin(na_strings)), ['utterance']] = pd.NA - n_unknown = df['utterance'][df['utterance'].isin(na_strings)].count() - - # get nature of utterance - df['nature'] = df['utterance'].apply(_getnature) - - # create a stripped version of the utterance - df['utterance_stripped'] = df['utterance'].str.strip() - df['utterance_stripped'] = df['utterance_stripped'].str.replace(r'\[[^[]*\]', - '', regex=True) - df['utterance_stripped'] = df['utterance_stripped'].str.replace(r'[\\(\\)]+', - '', 
regex=True) - # set blank utterances to NA - df.loc[df['utterance_stripped'] == '', 'utterance_stripped'] = pd.NA - - # measure number of words by counting spaces - df['nwords'] = df['utterance_stripped'].str.count(' ') + 1 - - # measure number of characters - df['nchar'] = df['utterance_stripped'].apply(_getnchar) # .astype(float) - - # add turn and frequency rank measures - - # create a new dataframe without NA utterances (for easier calculations) - df_ranking = df.dropna(subset=['utterance_stripped']) - # count how frequent the utterance occurs in the corpus - df_ranking['n'] = df_ranking.groupby( - 'utterance')['utterance'].transform('count').astype(float) - # rank the frequency of the utterance - df_ranking['rank'] = df_ranking['n'].rank(method='dense', ascending=False) - # calculate total number of uttrances - df_ranking['total'] = df_ranking['n'].sum() - # calculate frequency of utterance in relation to the total number of utterances - df_ranking['frequency'] = df_ranking['n'] / df_ranking['total'] - # merge the new dataframe with the original dataframe - df = pd.merge(df, df_ranking) - - # categorize overlap, look at overlap with turns up to four positions down - # overlap can either be full or partial - # set to NA if no overlap is found - df['overlap'] = np.where((df['begin'] > df['begin'].shift(1)) & (df['end'] < df['end'].shift(1)) | - (df['begin'] > df['begin'].shift(2)) & (df['end'] < df['end'].shift(2)) | - (df['begin'] > df['begin'].shift(3)) & (df['end'] < df['end'].shift(3)) | - (df['begin'] > df['begin'].shift(4)) & ( - df['end'] < df['end'].shift(4)), - 'full', np.where((df['begin'] > df['begin'].shift()) & (df['begin'] <= df['end'].shift()), - 'partial', pd.NA)) - - # identify who produced the prior utterance: other, self, - # or self during other (if previous utterance by the same participant - # was fully overlapped by an utterance of a different pariticpant) - # the priorby of the first utterance in the corpus is set to NA - df['priorby'] = 
np.where(df['participant'].index == 0, pd.NA, - np.where(df['participant'] != df['participant'].shift(), - 'other', np.where((df['overlap'].shift() == 'full') & - (df['participant'].shift( - ) == df['participant']), - 'self_during_other', 'self' - ))) - - # calculate FTO (Flow Time Overlap) - # This refers to the duration of the overlap between the current utterance - # and the most relevant prior turn by other, which is not necessatily the - # prior row in the df. By default we only get 0, 1 and 5 right. Cases 2 - # and 3 are covered by a rule that looks at turns coming in early for which - # prior turn is by self but T-2 is by other. Some cases of 4 (but not all is - # covered by looking for turns that do not come in early but have a prior - # turn in overlap and look for the turn at T-2 by a different participant. - - # A turn doesn't receive an FTO if it follows a row in the db that doesn't - # have timing information, or if it is such a row. - - # A [------------------] [0--] - # B [1-] [2--] [3--] [4--] [5--] - - df['FTO'] = np.where((df['priorby'] == 'other') & (df['begin'] - df['begin'].shift() < 200) & - (df['priorby'].shift() != 'other'), df['begin'] - - df['end'].shift(2), - np.where((df['priorby'] == 'other') & - (df['begin'] - df['begin'].shift() < 200) & - (df['priorby'].shift() != 'self') & - df['priorby'].shift(2) == 'other', - df['begin'] - df['end'].shift(3), - np.where((df['priorby'] == 'self_during_other') & - (df['participant'].shift( - 2) != df['participant']), - df['begin'] - df['end'].shift(2), - np.where((df['priorby'] == 'self_during_other') & - (df['priorby'].shift() - == 'self_during_other'), - df['begin'] - - df['end'].shift(3), - np.where(df['priorby'] == 'other', - df['begin'] - - df['end'].shift(), - np.where(df['priorby'] == 'self', pd.NA, pd.NA - )))))) - - # identify whther a turn is overlapped by what succeeds it - # if not, set to NA - df['overlapped'] = np.where((df['begin'] < df['begin'].shift(-1)) & - (df['end'] > 
df['begin'].shift(-1)), 'overlapped', pd.NA) - - # set FTO to NA if it is higher than 10s or lower than -10s, on the - # grounds that (a) psycholinguistically it is implausible that these - # actually relate to the end of the 'prior', and (b) conversation - # analytically it is necessary to treat such cases on their - # own terms rather than take an FTO at face value - - df['FTO'] = np.where(df['FTO'] > 9999, pd.NA, np.where( - df['FTO'] < -9999, pd.NA, df['FTO'])) - # set FTO to NA if it is negative < -99999, on the - # grounds that (a) psycholinguistically it is - # impossible to relate to the end of the 'prior' turn, - # and (b) conversation analytically it is necessary - # to treat such cases on their own terms rather than - # take an FTO at face value - - # add transitions metadata - - # create new dataframe with only the relevant columns - df_transitions = df.copy() - df_transitions = df_transitions.drop(columns=['langshort', 'langfull', - 'corpus', 'nature', - 'utterance_stripped', - 'nwords', 'nchar', 'n', - 'rank', 'total', - 'frequency', 'overlap']) - - # put all the calculated transition measures into one column - df['transitions'] = df.apply(lambda x: _createwindow(x['begin'], - x['participant']), - axis=1) - - # split the list into six columns, one column representing each measure - df_split = pd.DataFrame(df['transitions'].tolist(), columns=['talk_all', 'talk_rel', 'load', - 'turns_all', 'turns_rel', 'participants']) - - # add transition measures to original df - df = pd.concat([df, df_split], axis=1) - # drop column containing list of transition measures - df = df.drop(columns='transitions') - - return df diff --git a/tests/test_turndynamics.py b/tests/test_turndynamics.py deleted file mode 100644 index e69de29..0000000 From c5c34302e70d391fe5141ac4a439ad33f6e7f95f Mon Sep 17 00:00:00 2001 From: Barbara Vreede Date: Tue, 21 Nov 2023 17:19:35 +0100 Subject: [PATCH 32/53] address linter comments --- sktalk/corpus/conversation.py | 8 ++++---- 
sktalk/corpus/utterance.py | 2 +- tests/corpus/test_conversation.py | 14 +++++++------- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/sktalk/corpus/conversation.py b/sktalk/corpus/conversation.py index ac0f904..6fd8be3 100644 --- a/sktalk/corpus/conversation.py +++ b/sktalk/corpus/conversation.py @@ -120,7 +120,7 @@ def _subconversation(self, "`time_or_index` must be either 'time' or 'index'") return Conversation(utterances=returned_utterances) - def _count_participants(self) -> int: + def count_participants(self) -> int: """Count the number of participants in a conversation Importantly: if one of the utterances has no participant, it is counted @@ -183,11 +183,11 @@ def calculate_FTO(self, window: int = 10000, planning_buffer: int = 200, n_parti time_or_index="time", before=window, after=0) - if not 2 <= sub._count_participants() <= n_participants: + if not 2 <= sub.count_participants() <= n_participants: values.append(None) continue potentials = [ - u for u in sub.utterances if utterance._relevant_for_fto(u, planning_buffer)] + u for u in sub.utterances if utterance.relevant_for_fto(u, planning_buffer)] try: relevant = potentials[-1] values.append(utterance.until(relevant)) @@ -206,6 +206,6 @@ def overlap(begin: int, end: int, time: list): # time[0] is before begin and time[1] is after end if time is None: return False - elif begin <= time[0] <= end or begin <= time[1] <= end: + if begin <= time[0] <= end or begin <= time[1] <= end: return True return time[0] <= begin and time[1] >= end diff --git a/sktalk/corpus/utterance.py b/sktalk/corpus/utterance.py index 0f7b082..f031812 100644 --- a/sktalk/corpus/utterance.py +++ b/sktalk/corpus/utterance.py @@ -59,7 +59,7 @@ def _clean_utterance(self): def until(self, next_utt): return next_utt.time[0] - self.time[1] - def _relevant_for_fto(self, prior_utt, planning_buffer: int): + def relevant_for_fto(self, prior_utt, planning_buffer: int): """Assess whether an utterance is potentially relevant to 
calculate FTO An utterance is potentially relevant for fto calculation if: diff --git a/tests/corpus/test_conversation.py b/tests/corpus/test_conversation.py index b4cf81b..56ae6ad 100644 --- a/tests/corpus/test_conversation.py +++ b/tests/corpus/test_conversation.py @@ -61,7 +61,7 @@ class TestConversationMetrics: def test_subconversation_errors(self, convo, args, error): index, before, after, time_or_index = args with error: - convo._subconversation(index=index, + convo._subconversation(index=index, #noqa W0212 before=before, after=after, time_or_index=time_or_index) @@ -78,7 +78,7 @@ def test_subconversation_errors(self, convo, args, error): ]) def test_subconversation(self, convo, args, expected_length): index, before, after, time_or_index = args - sub = convo._subconversation(index=index, + sub = convo._subconversation(index=index, #noqa W0212 before=before, after=after, time_or_index=time_or_index) @@ -100,8 +100,8 @@ def test_overlap(self): 70, 80, [90, 110]) # utterance after window def test_count_participants(self, convo): - assert convo._count_participants() == 3 - convo2 = convo._subconversation(index=0, before=2) - assert convo2._count_participants() == 2 - convo3 = convo._subconversation(index=0) - assert convo3._count_participants() == 1 + assert convo.count_participants() == 3 + convo2 = convo._subconversation(index=0, before=2) #noqa W0212 + assert convo2.count_participants() == 2 + convo3 = convo._subconversation(index=0) #noqa W0212 + assert convo3.count_participants() == 1 From 780206328fb62bdad0c982c93eb9473f02211dba Mon Sep 17 00:00:00 2001 From: Barbara Vreede Date: Tue, 21 Nov 2023 17:37:59 +0100 Subject: [PATCH 33/53] address linter issues and update fto calculation --- sktalk/corpus/conversation.py | 4 ++-- sktalk/corpus/utterance.py | 1 + tests/corpus/test_conversation.py | 8 ++++---- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/sktalk/corpus/conversation.py b/sktalk/corpus/conversation.py index 6fd8be3..39f241b 100644 --- 
a/sktalk/corpus/conversation.py +++ b/sktalk/corpus/conversation.py @@ -151,7 +151,7 @@ def _update(self, field: str, values: list, **kwargs): except KeyError: self._metadata = {"Calculations": {field: kwargs}} for index, utterance in enumerate(self.utterances): - utterance.__setattr__(field, values[index]) + setattr(utterance, field, values[index]) def calculate_FTO(self, window: int = 10000, planning_buffer: int = 200, n_participants: int = 2): """Calculate Floor Transfer Offset (FTO) per utterance @@ -190,7 +190,7 @@ def calculate_FTO(self, window: int = 10000, planning_buffer: int = 200, n_parti u for u in sub.utterances if utterance.relevant_for_fto(u, planning_buffer)] try: relevant = potentials[-1] - values.append(utterance.until(relevant)) + values.append(relevant.until(utterance)) except IndexError: values.append(None) self._update("FTO", values, diff --git a/sktalk/corpus/utterance.py b/sktalk/corpus/utterance.py index f031812..eda3024 100644 --- a/sktalk/corpus/utterance.py +++ b/sktalk/corpus/utterance.py @@ -20,6 +20,7 @@ class Utterance: n_characters: Optional[int] = None time_to_next: Optional[int] = None dyadic: Optional[bool] = None + FTO: Optional[int] = None def __post_init__(self): # clean utterance: diff --git a/tests/corpus/test_conversation.py b/tests/corpus/test_conversation.py index 56ae6ad..6de3a7b 100644 --- a/tests/corpus/test_conversation.py +++ b/tests/corpus/test_conversation.py @@ -61,7 +61,7 @@ class TestConversationMetrics: def test_subconversation_errors(self, convo, args, error): index, before, after, time_or_index = args with error: - convo._subconversation(index=index, #noqa W0212 + convo._subconversation(index=index, #noqa protected-access before=before, after=after, time_or_index=time_or_index) @@ -78,7 +78,7 @@ def test_subconversation_errors(self, convo, args, error): ]) def test_subconversation(self, convo, args, expected_length): index, before, after, time_or_index = args - sub = convo._subconversation(index=index, #noqa 
W0212 + sub = convo._subconversation(index=index, #noqa protected-access before=before, after=after, time_or_index=time_or_index) @@ -101,7 +101,7 @@ def test_overlap(self): def test_count_participants(self, convo): assert convo.count_participants() == 3 - convo2 = convo._subconversation(index=0, before=2) #noqa W0212 + convo2 = convo._subconversation(index=0, before=2) #noqa protected-access assert convo2.count_participants() == 2 - convo3 = convo._subconversation(index=0) #noqa W0212 + convo3 = convo._subconversation(index=0) #noqa protected-access assert convo3.count_participants() == 1 From 3bbeb001bdc9d7f6bab108ba52b4bdfb35d7b9d5 Mon Sep 17 00:00:00 2001 From: Barbara Vreede Date: Tue, 21 Nov 2023 17:40:37 +0100 Subject: [PATCH 34/53] fix noqa --- tests/corpus/test_conversation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/corpus/test_conversation.py b/tests/corpus/test_conversation.py index 6de3a7b..04d3504 100644 --- a/tests/corpus/test_conversation.py +++ b/tests/corpus/test_conversation.py @@ -61,7 +61,7 @@ class TestConversationMetrics: def test_subconversation_errors(self, convo, args, error): index, before, after, time_or_index = args with error: - convo._subconversation(index=index, #noqa protected-access + convo._subconversation(index=index, # noqa protected-access before=before, after=after, time_or_index=time_or_index) @@ -78,7 +78,7 @@ def test_subconversation_errors(self, convo, args, error): ]) def test_subconversation(self, convo, args, expected_length): index, before, after, time_or_index = args - sub = convo._subconversation(index=index, #noqa protected-access + sub = convo._subconversation(index=index, # noqa W0212 before=before, after=after, time_or_index=time_or_index) From cc91bbcfe849a2da31adb54779991172b08bed08 Mon Sep 17 00:00:00 2001 From: Barbara Vreede Date: Tue, 21 Nov 2023 17:42:51 +0100 Subject: [PATCH 35/53] fix noqa --- tests/corpus/test_conversation.py | 6 +++--- 1 file changed, 3 insertions(+), 
3 deletions(-) diff --git a/tests/corpus/test_conversation.py b/tests/corpus/test_conversation.py index 04d3504..b1121b9 100644 --- a/tests/corpus/test_conversation.py +++ b/tests/corpus/test_conversation.py @@ -61,7 +61,7 @@ class TestConversationMetrics: def test_subconversation_errors(self, convo, args, error): index, before, after, time_or_index = args with error: - convo._subconversation(index=index, # noqa protected-access + convo._subconversation(index=index, # noqa W0212 before=before, after=after, time_or_index=time_or_index) @@ -101,7 +101,7 @@ def test_overlap(self): def test_count_participants(self, convo): assert convo.count_participants() == 3 - convo2 = convo._subconversation(index=0, before=2) #noqa protected-access + convo2 = convo._subconversation(index=0, before=2) # noqa W0212 assert convo2.count_participants() == 2 - convo3 = convo._subconversation(index=0) #noqa protected-access + convo3 = convo._subconversation(index=0) # noqa W0212 assert convo3.count_participants() == 1 From 61a195098e4a9cde23a325eec8938060d513c409 Mon Sep 17 00:00:00 2001 From: Barbara Vreede Date: Tue, 21 Nov 2023 23:17:29 +0100 Subject: [PATCH 36/53] calculations update in metadata corrected --- sktalk/corpus/conversation.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/sktalk/corpus/conversation.py b/sktalk/corpus/conversation.py index 39f241b..5fc890f 100644 --- a/sktalk/corpus/conversation.py +++ b/sktalk/corpus/conversation.py @@ -118,7 +118,7 @@ def _subconversation(self, else: raise ValueError( "`time_or_index` must be either 'time' or 'index'") - return Conversation(utterances=returned_utterances) + return Conversation(utterances=returned_utterances) #TODO suppress warning about empty utterances def count_participants(self) -> int: """Count the number of participants in a conversation @@ -146,10 +146,11 @@ def _update(self, field: str, values: list, **kwargs): if len(values) != len(self.utterances): raise ValueError( "The number of 
values must match the number of utterances") + metadata = {field: kwargs} try: - self._metadata["Calculations"].update(field=kwargs) + self._metadata["Calculations"].update(metadata) except KeyError: - self._metadata = {"Calculations": {field: kwargs}} + self._metadata = {"Calculations": metadata} for index, utterance in enumerate(self.utterances): setattr(utterance, field, values[index]) From f79b5eaaeea50081e1997ced5849a30ac49d47c6 Mon Sep 17 00:00:00 2001 From: Barbara Vreede Date: Fri, 24 Nov 2023 16:26:02 +0100 Subject: [PATCH 37/53] allow warning supppression on empty conversations inside subconversation --- sktalk/corpus/conversation.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sktalk/corpus/conversation.py b/sktalk/corpus/conversation.py index 5fc890f..14e9da9 100644 --- a/sktalk/corpus/conversation.py +++ b/sktalk/corpus/conversation.py @@ -6,7 +6,7 @@ class Conversation(Writer): def __init__( - self, utterances: list["Utterance"], metadata: Optional[dict] = None # noqa: F821 + self, utterances: list["Utterance"], metadata: Optional[dict] = None, suppress_warnings: bool = False # noqa: F821 ) -> None: """Representation of a transcribed conversation @@ -27,7 +27,7 @@ def __init__( if not isinstance(utterance, Utterance): raise TypeError(errormsg) # The list can be empty. This would be weird and the user needs to be warned. 
- if not self._utterances: + if not self._utterances and not suppress_warnings: warnings.warn( "This conversation appears to be empty: no Utterances are read.") @@ -114,11 +114,11 @@ def _subconversation(self, returned_utterances = [ u for u in self.utterances if self.overlap(begin, end, u.time)] except (TypeError, IndexError): - return Conversation([], self.metadata) + return Conversation([], suppress_warnings=True) else: raise ValueError( "`time_or_index` must be either 'time' or 'index'") - return Conversation(utterances=returned_utterances) #TODO suppress warning about empty utterances + return Conversation(utterances=returned_utterances) def count_participants(self) -> int: """Count the number of participants in a conversation From d4c98803f6824eb75613c70953fa744eaeda57d8 Mon Sep 17 00:00:00 2001 From: Barbara Vreede Date: Fri, 24 Nov 2023 16:26:52 +0100 Subject: [PATCH 38/53] refer to hidden _utterances instead of property --- sktalk/corpus/conversation.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/sktalk/corpus/conversation.py b/sktalk/corpus/conversation.py index 14e9da9..c1efcf5 100644 --- a/sktalk/corpus/conversation.py +++ b/sktalk/corpus/conversation.py @@ -96,7 +96,7 @@ def _subconversation(self, Conversation: Conversation object containing a reduced set of utterances """ # TODO consider adding parameter 'strict' that only returns utterances entirely inside the window - if index < 0 or index >= len(self.utterances): + if index < 0 or index >= len(self._utterances): raise IndexError("Index out of range") if after is None: after = before @@ -104,15 +104,15 @@ def _subconversation(self, # if before/after would exceed the bounds of the list, adjust if index - before < 0: before = index - if index + after + 1 > len(self.utterances): - after = len(self.utterances) - index - 1 - returned_utterances = self.utterances[index-before:index+after+1] + if index + after + 1 > len(self._utterances): + after = len(self._utterances) 
- index - 1 + returned_utterances = self._utterances[index-before:index+after+1] elif time_or_index == "time": try: - begin = self.utterances[index].time[0] - before - end = self.utterances[index].time[1] + after + begin = self._utterances[index].time[0] - before + end = self._utterances[index].time[1] + after returned_utterances = [ - u for u in self.utterances if self.overlap(begin, end, u.time)] + u for u in self._utterances if self.overlap(begin, end, u.time)] except (TypeError, IndexError): return Conversation([], suppress_warnings=True) else: From 20c65db3af621473adc630754be62246676362a6 Mon Sep 17 00:00:00 2001 From: Barbara Vreede Date: Fri, 24 Nov 2023 16:27:37 +0100 Subject: [PATCH 39/53] allow participant counting to exclude None --- sktalk/corpus/conversation.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/sktalk/corpus/conversation.py b/sktalk/corpus/conversation.py index c1efcf5..ca724f7 100644 --- a/sktalk/corpus/conversation.py +++ b/sktalk/corpus/conversation.py @@ -120,16 +120,22 @@ def _subconversation(self, "`time_or_index` must be either 'time' or 'index'") return Conversation(utterances=returned_utterances) - def count_participants(self) -> int: + def count_participants(self, except_none: bool = False) -> int: """Count the number of participants in a conversation Importantly: if one of the utterances has no participant, it is counted - as a separate participant (None). + as a separate participant (None). If you want to exclude these, set + `except_none` to True. + + Args: + except_none (bool, optional): if `True`, utterances without a participant are not counted. Defaults to `False`. 
Returns: int: number of participants """ participants = [u.participant for u in self.utterances] + if except_none: + participants = [p for p in participants if p is not None] return len(set(participants)) def _update(self, field: str, values: list, **kwargs): From 00ad82a50f4499cb7226582bcf7572919b83e852 Mon Sep 17 00:00:00 2001 From: Barbara Vreede Date: Fri, 24 Nov 2023 16:27:55 +0100 Subject: [PATCH 40/53] add test for FTO calculation --- tests/conftest.py | 2 +- tests/corpus/test_conversation.py | 10 ++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/tests/conftest.py b/tests/conftest.py index 857d59e..b7d9ab5 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -57,7 +57,7 @@ def convo_utts(): ), Utterance( utterance="Monde F", - participant="B", + participant="C", time=[5500, 7500] ), Utterance( diff --git a/tests/corpus/test_conversation.py b/tests/corpus/test_conversation.py index b1121b9..2baf4e3 100644 --- a/tests/corpus/test_conversation.py +++ b/tests/corpus/test_conversation.py @@ -105,3 +105,13 @@ def test_count_participants(self, convo): assert convo2.count_participants() == 2 convo3 = convo._subconversation(index=0) # noqa W0212 assert convo3.count_participants() == 1 + + def test_calculate_FTO(self, convo): + convo.calculate_FTO() + assert convo.metadata["Calculations"]["FTO"] == { + "window": 10000, "planning_buffer": 200, "n_participants": 2} + convo.calculate_FTO(window=10) + assert convo.metadata["Calculations"]["FTO"] == { + "window": 10, "planning_buffer": 200, "n_participants": 2} + assert convo.utterances[0].FTO is None + assert convo.utterances[1].FTO == -100 From 90c4ac6286ec1895b476f6282cfb6b505971e100 Mon Sep 17 00:00:00 2001 From: Barbara Vreede Date: Fri, 24 Nov 2023 18:09:23 +0100 Subject: [PATCH 41/53] ensure participant count does not include future utterances --- sktalk/corpus/conversation.py | 45 ++++++++++++++++++++++--------- tests/conftest.py | 22 +++++++-------- tests/corpus/test_conversation.py | 6 
++++- 3 files changed, 48 insertions(+), 25 deletions(-) diff --git a/sktalk/corpus/conversation.py b/sktalk/corpus/conversation.py index ca724f7..7108f90 100644 --- a/sktalk/corpus/conversation.py +++ b/sktalk/corpus/conversation.py @@ -74,6 +74,7 @@ def _subconversation(self, index: int, before: int = 0, after: Optional[int] = None, + exclude_utterance_overlap: bool = False, time_or_index: str = "index") -> "Conversation": """Select utterances to provide context as a sub-conversation @@ -84,6 +85,10 @@ def _subconversation(self, after (int, optional): Either the number of utterances after the indicated utterance, or the time in ms following the utterance's end. Defaults to None, which then assumes the same value as `before`. + exclude_utterance_overlap (bool, optional): Only used when `time_or_index` is "time", + and either `before` or `after` is 0. If True, the duration of the + utterance itself is not used to identify overlapping utterances, and only + the window before or after the utterance is used. Defaults to False. time_or_index (str, optional): Use "time" to select based on time (in ms), or "index" to select a set number of utterances irrespective of timing. Defaults to "index". 
@@ -108,17 +113,26 @@ def _subconversation(self, after = len(self._utterances) - index - 1 returned_utterances = self._utterances[index-before:index+after+1] elif time_or_index == "time": - try: - begin = self._utterances[index].time[0] - before - end = self._utterances[index].time[1] + after - returned_utterances = [ - u for u in self._utterances if self.overlap(begin, end, u.time)] - except (TypeError, IndexError): - return Conversation([], suppress_warnings=True) + returned_utterances = self._subconversation_by_time( + index, before, after, exclude_utterance_overlap) else: raise ValueError( "`time_or_index` must be either 'time' or 'index'") - return Conversation(utterances=returned_utterances) + return Conversation(utterances=returned_utterances, suppress_warnings=True) + + def _subconversation_by_time(self, index, before, after, exclude_utterance_overlap): + try: + begin = self._utterances[index].time[0] - before + end = self._utterances[index].time[1] + after + if exclude_utterance_overlap and before == 0: # only overlap with window following utterance + begin = self._utterances[index].time[1] + elif exclude_utterance_overlap and after == 0: # only overlap with window preceding utterance + end = self._utterances[index].time[0] + returned_utterances = [ + u for u in self._utterances if self.overlap(begin, end, u.time) or u == self._utterances[index]] + except (TypeError, IndexError): + return [] + return returned_utterances def count_participants(self, except_none: bool = False) -> int: """Count the number of participants in a conversation @@ -175,13 +189,17 @@ def calculate_FTO(self, window: int = 10000, planning_buffer: int = 200, n_parti - the utterance must be by another speaker than U - the utterance by the other speaker must be the most recent utterance by that speaker - the utterance must have started before utterance U, more than `planning_buffer` ms before. 
- - the utterance must be partly or entirely within the context window (`window` ms prior to the start of utterance U) + - the utterance must be partly or entirely within the context window (`window` ms prior + to the start of utterance U) - within the context window, there must be a maximum of `n_participants` speakers. Args: - window (int, optional): _description_. Defaults to 10000. - planning_buffer (int, optional): _description_. Defaults to 200. - n_participants (int, optional): _description_. Defaults to 2. + window (int, optional): the time in ms prior to utterance in which a + relevant preceding utterance can be found. Defaults to 10000. + planning_buffer (int, optional): minimum speaking time in ms to allow for a response. + Defaults to 200. + n_participants (int, optional): maximum number of participants overlapping with + the utterance and preceding window. Defaults to 2. """ values = [] for index, utterance in enumerate(self.utterances): @@ -189,7 +207,8 @@ def calculate_FTO(self, window: int = 10000, planning_buffer: int = 200, n_parti index=index, time_or_index="time", before=window, - after=0) + after=0, + exclude_utterance_overlap=True) if not 2 <= sub.count_participants() <= n_participants: values.append(None) continue diff --git a/tests/conftest.py b/tests/conftest.py index b7d9ab5..d574f30 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -26,52 +26,52 @@ def convo_meta(): def convo_utts(): return [ Utterance( - utterance="Hello A", + utterance="0 utterance A", participant="A", time=[0, 1000] ), Utterance( - utterance="Monde B", + utterance="1 utterance B", participant="B", time=[900, 3500] ), Utterance( - utterance="Hello C", + utterance="2 utterance C", participant="A", time=[1001, 12000] ), Utterance( - utterance="Monde D", + utterance="3 utterance D", participant="B", time=[1200, 1999] ), Utterance( - utterance="Hello E", + utterance="4 utterance E", participant="A", time=[3500, 4500] ), Utterance( - utterance="Utterance U", + 
utterance="5 utterance U", participant="B", time=[5000, 8000] ), Utterance( - utterance="Monde F", + utterance="6 utterance F", participant="C", time=[5500, 7500] ), Utterance( - utterance="Hello G", - participant="A", + utterance="7 utterance G", + participant=None, time=None ), Utterance( - utterance="Monde H", + utterance="8 utterance H", participant="B", time=[9000, 12500] ), Utterance( - utterance="Hello I", + utterance="9 utterance I", participant="C", time=[12000, 13000] ) diff --git a/tests/corpus/test_conversation.py b/tests/corpus/test_conversation.py index 2baf4e3..88fee8f 100644 --- a/tests/corpus/test_conversation.py +++ b/tests/corpus/test_conversation.py @@ -100,7 +100,8 @@ def test_overlap(self): 70, 80, [90, 110]) # utterance after window def test_count_participants(self, convo): - assert convo.count_participants() == 3 + assert convo.count_participants() == 4 + assert convo.count_participants(except_none=True) == 3 convo2 = convo._subconversation(index=0, before=2) # noqa W0212 assert convo2.count_participants() == 2 convo3 = convo._subconversation(index=0) # noqa W0212 @@ -115,3 +116,6 @@ def test_calculate_FTO(self, convo): "window": 10, "planning_buffer": 200, "n_participants": 2} assert convo.utterances[0].FTO is None assert convo.utterances[1].FTO == -100 + assert convo.utterances[2].FTO == None + convo.calculate_FTO(planning_buffer=0) + assert convo.utterances[2].FTO == -2499 From 69912f6a02125ddf6c66894c9593d44ed96a1196 Mon Sep 17 00:00:00 2001 From: Barbara Vreede Date: Tue, 28 Nov 2023 10:55:21 +0100 Subject: [PATCH 42/53] split subconversation into two functions --- sktalk/corpus/conversation.py | 78 +++++++++++++++---------------- tests/corpus/test_conversation.py | 51 +++++++++++--------- 2 files changed, 68 insertions(+), 61 deletions(-) diff --git a/sktalk/corpus/conversation.py b/sktalk/corpus/conversation.py index 7108f90..a34574b 100644 --- a/sktalk/corpus/conversation.py +++ b/sktalk/corpus/conversation.py @@ -70,57 +70,58 @@ def 
asdict(self): """ return self._metadata | {"Utterances": [u.asdict() for u in self._utterances]} - def _subconversation(self, - index: int, - before: int = 0, - after: Optional[int] = None, - exclude_utterance_overlap: bool = False, - time_or_index: str = "index") -> "Conversation": + def _subconversation_by_index(self, + index: int, + before: int = 0, + after: Optional[int] = None) -> "Conversation": """Select utterances to provide context as a sub-conversation Args: index (int): The index of the utterance for which to provide context - before (int, optional): Either the number of utterances prior to indicated utterance, - or the time in ms preceding the utterance's begin. Defaults to 0. - after (int, optional): Either the number of utterances after the indicated utterance, - or the time in ms following the utterance's end. Defaults to None, - which then assumes the same value as `before`. - exclude_utterance_overlap (bool, optional): Only used when `time_or_index` is "time", - and either `before` or `after` is 0. If True, the duration of the - utterance itself is not used to identify overlapping utterances, and only - the window before or after the utterance is used. Defaults to False. - time_or_index (str, optional): Use "time" to select based on time (in ms), or "index" - to select a set number of utterances irrespective of timing. - Defaults to "index". + before (int, optional): The number of utterances prior to indicated utterance. Defaults to 0. + after (int, optional): The number of utterances after the indicated utterance. Defaults to None, + which then assumes the same value as `before`. 
Raises: IndexError: Index provided must be within range of utterances - ValueError: time_or_index must be either "time" or "index" Returns: - Conversation: Conversation object containing a reduced set of utterances + Conversation: Conversation object without metadata, containing a reduced set of utterances """ - # TODO consider adding parameter 'strict' that only returns utterances entirely inside the window if index < 0 or index >= len(self._utterances): raise IndexError("Index out of range") if after is None: after = before - if time_or_index == "index": - # if before/after would exceed the bounds of the list, adjust - if index - before < 0: - before = index - if index + after + 1 > len(self._utterances): - after = len(self._utterances) - index - 1 - returned_utterances = self._utterances[index-before:index+after+1] - elif time_or_index == "time": - returned_utterances = self._subconversation_by_time( - index, before, after, exclude_utterance_overlap) - else: - raise ValueError( - "`time_or_index` must be either 'time' or 'index'") + if index - before < 0: + before = index + if index + after + 1 > len(self._utterances): + after = len(self._utterances) - index - 1 + returned_utterances = self._utterances[index-before:index+after+1] return Conversation(utterances=returned_utterances, suppress_warnings=True) - def _subconversation_by_time(self, index, before, after, exclude_utterance_overlap): + def _subconversation_by_time(self, + index: int, + before: int = 0, + after: Optional[int] = None, + exclude_utterance_overlap: bool = False) -> "Conversation": + """Select utterances to provide context as a sub-conversation + + Args: + index (int): The index of the utterance for which to provide context + before (int, optional): The time in ms preceding the utterance's begin. Defaults to 0. + after (int, optional): The time in ms following the utterance's end. Defaults to None, + which then assumes the same value as `before`. 
+ exclude_utterance_overlap (bool, optional): If True, the duration of the + utterance itself is not used to identify overlapping utterances, and only + the window before or after the utterance is used. Defaults to False. + + Returns: + Conversation: Conversation object without metadata, containing a reduced set of utterances + """ + if index < 0 or index >= len(self._utterances): + raise IndexError("Index out of range") + if after is None: + after = before try: begin = self._utterances[index].time[0] - before end = self._utterances[index].time[1] + after @@ -131,8 +132,8 @@ def _subconversation_by_time(self, index, before, after, exclude_utterance_overl returned_utterances = [ u for u in self._utterances if self.overlap(begin, end, u.time) or u == self._utterances[index]] except (TypeError, IndexError): - return [] - return returned_utterances + returned_utterances = [] + return Conversation(utterances=returned_utterances, suppress_warnings=True) def count_participants(self, except_none: bool = False) -> int: """Count the number of participants in a conversation @@ -203,9 +204,8 @@ def calculate_FTO(self, window: int = 10000, planning_buffer: int = 200, n_parti """ values = [] for index, utterance in enumerate(self.utterances): - sub = self._subconversation( + sub = self._subconversation_by_time( index=index, - time_or_index="time", before=window, after=0, exclude_utterance_overlap=True) diff --git a/tests/corpus/test_conversation.py b/tests/corpus/test_conversation.py index 88fee8f..5b5b438 100644 --- a/tests/corpus/test_conversation.py +++ b/tests/corpus/test_conversation.py @@ -52,36 +52,43 @@ def test_write_json(self, convo, tmp_path, user_path, expected_path): class TestConversationMetrics: @pytest.mark.parametrize("args, error", [ - ([0, 0, 1, "index"], does_not_raise()), - ([20, 1, 1, "index"], pytest.raises(IndexError)), - ([0, 50, 50, "index"], does_not_raise()), - ([0, 0, 0, "neither_time_nor_index"], - pytest.raises(ValueError)) + ([0, 0, 1], 
does_not_raise()), + ([20, 1, 1], pytest.raises(IndexError)), + ([0, 50, 50], does_not_raise()) ]) def test_subconversation_errors(self, convo, args, error): - index, before, after, time_or_index = args + index, before, after = args with error: - convo._subconversation(index=index, # noqa W0212 + convo._subconversation_by_index(index=index, # noqa W0212 before=before, - after=after, - time_or_index=time_or_index) + after=after) @pytest.mark.parametrize("args, expected_length", [ - ([0, 0, 1, "index"], 2), - ([5, 2, 0, "index"], 3), - ([0, 2, 2, "index"], 3), - ([0, 2, None, "index"], 3), - ([0, 0, 0, "time"], 2), # A, B - ([5, 3000, 3000, "time"], 6), # B,C,E,U,F,H - ([5, 0, 0, "time"], 3), # C, U, F + ([0, 0, 1], 2), + ([5, 2, 0], 3), + ([0, 2, 2], 3), + ([0, 2, None], 3) + ]) + def test_subconversation_index(self, convo, args, expected_length): + index, before, after = args + sub = convo._subconversation_by_index(index=index, # noqa W0212 + before=before, + after=after) + assert isinstance(sub, Conversation) + assert len(sub.utterances) == expected_length + + @pytest.mark.parametrize("args, expected_length", + [ + ([0, 0, 0], 2), # A, B + ([5, 3000, 3000], 6), # B,C,E,U,F,H + ([5, 0, 0], 3), # C, U, F ]) def test_subconversation(self, convo, args, expected_length): - index, before, after, time_or_index = args - sub = convo._subconversation(index=index, # noqa W0212 + index, before, after = args + sub = convo._subconversation_by_time(index=index, # noqa W0212 before=before, - after=after, - time_or_index=time_or_index) + after=after) assert isinstance(sub, Conversation) assert len(sub.utterances) == expected_length @@ -102,9 +109,9 @@ def test_overlap(self): def test_count_participants(self, convo): assert convo.count_participants() == 4 assert convo.count_participants(except_none=True) == 3 - convo2 = convo._subconversation(index=0, before=2) # noqa W0212 + convo2 = convo._subconversation_by_index(index=0, before=2) # noqa W0212 assert convo2.count_participants() 
== 2 - convo3 = convo._subconversation(index=0) # noqa W0212 + convo3 = convo._subconversation_by_index(index=0) # noqa W0212 assert convo3.count_participants() == 1 def test_calculate_FTO(self, convo): From d62a868b014cc9333280725e8e0c7b81b53aa20b Mon Sep 17 00:00:00 2001 From: Barbara Vreede Date: Tue, 28 Nov 2023 10:57:25 +0100 Subject: [PATCH 43/53] fix linter issue --- tests/corpus/test_conversation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/corpus/test_conversation.py b/tests/corpus/test_conversation.py index 5b5b438..5aefcf4 100644 --- a/tests/corpus/test_conversation.py +++ b/tests/corpus/test_conversation.py @@ -123,6 +123,6 @@ def test_calculate_FTO(self, convo): "window": 10, "planning_buffer": 200, "n_participants": 2} assert convo.utterances[0].FTO is None assert convo.utterances[1].FTO == -100 - assert convo.utterances[2].FTO == None + assert convo.utterances[2].FTO is None convo.calculate_FTO(planning_buffer=0) assert convo.utterances[2].FTO == -2499 From 9229e8a49ce98b423944134ec0bd95adac8856a1 Mon Sep 17 00:00:00 2001 From: Barbara Vreede Date: Tue, 28 Nov 2023 11:25:56 +0100 Subject: [PATCH 44/53] update example notebook --- docs/notebooks/example.ipynb | 110 +++++++++++++++++++++++++++++++---- 1 file changed, 98 insertions(+), 12 deletions(-) diff --git a/docs/notebooks/example.ipynb b/docs/notebooks/example.ipynb index 06344d6..09c1268 100644 --- a/docs/notebooks/example.ipynb +++ b/docs/notebooks/example.ipynb @@ -63,7 +63,7 @@ { "data": { "text/plain": [ - "" + "" ] }, "execution_count": 2, @@ -92,16 +92,16 @@ { "data": { "text/plain": [ - "[Utterance(utterance='0', participant='S', time=(0, 1500), begin='00:00:00.000', end='00:00:01.500', metadata=None),\n", - " Utterance(utterance=\"mm I'm glad I saw you⇗\", participant='S', time=(1500, 2775), begin='00:00:01.500', end='00:00:02.775', metadata=None),\n", - " Utterance(utterance=\"I thought I'd lost you (0.3)\", participant='S', time=(2775, 3773), 
begin='00:00:02.775', end='00:00:03.773', metadata=None),\n", - " Utterance(utterance=\"⌈no I've been here for a whi:le⌉,\", participant='H', time=(4052, 5515), begin='00:00:04.052', end='00:00:05.515', metadata=None),\n", - " Utterance(utterance='⌊xxx⌋ (0.3)', participant='S', time=(4052, 5817), begin='00:00:04.052', end='00:00:05.817', metadata=None),\n", - " Utterance(utterance=\"⌊hm:: (.) if ʔI couldn't boʔrrow, (1.3) the second (0.2) book of readings fo:r\", participant='S', time=(6140, 9487), begin='00:00:06.140', end='00:00:09.487', metadata=None),\n", - " Utterance(utterance='commu:nicating acro-', participant='H', time=(12888, 14050), begin='00:00:12.888', end='00:00:14.050', metadata=None),\n", - " Utterance(utterance='no: for family gender and sexuality', participant='H', time=(14050, 17014), begin='00:00:14.050', end='00:00:17.014', metadata=None),\n", - " Utterance(utterance=\"+≋ ah: that's the second on is itʔ\", participant='S', time=(17014, 18611), begin='00:00:17.014', end='00:00:18.611', metadata=None),\n", - " Utterance(utterance=\"+≋ I think it's s⌈ame family gender⌉ has a second book\", participant='H', time=(18611, 21090), begin='00:00:18.611', end='00:00:21.090', metadata=None)]" + "[Utterance(utterance='0', participant='S', time=[0, 1500], begin='00:00:00.000', end='00:00:01.500', metadata=None, utterance_clean='S x150_1500x15', utterance_list=['S', 'x150_1500x15'], n_words=2, n_characters=13, time_to_next=None, dyadic=None, FTO=None),\n", + " Utterance(utterance=\"mm I'm glad I saw you⇗\", participant='S', time=[1500, 2775], begin='00:00:01.500', end='00:00:02.775', metadata=None, utterance_clean='S mm Im glad I saw you x151500_2775x15', utterance_list=['S', 'mm', 'Im', 'glad', 'I', 'saw', 'you', 'x151500_2775x15'], n_words=8, n_characters=31, time_to_next=None, dyadic=None, FTO=None),\n", + " Utterance(utterance=\"I thought I'd lost you (0.3)\", participant='S', time=[2775, 3773], begin='00:00:02.775', end='00:00:03.773', metadata=None, 
utterance_clean='S I thought Id lost you x152775_3773x15 x153773_4052x15', utterance_list=['S', 'I', 'thought', 'Id', 'lost', 'you', 'x152775_3773x15', 'x153773_4052x15'], n_words=8, n_characters=48, time_to_next=None, dyadic=None, FTO=None),\n", + " Utterance(utterance=\"⌈no I've been here for a whi:le⌉,\", participant='H', time=[4052, 5515], begin='00:00:04.052', end='00:00:05.515', metadata=None, utterance_clean='H no Ive been here for a while x154052_5515x15', utterance_list=['H', 'no', 'Ive', 'been', 'here', 'for', 'a', 'while', 'x154052_5515x15'], n_words=9, n_characters=38, time_to_next=None, dyadic=None, FTO=None),\n", + " Utterance(utterance='⌊xxx⌋ (0.3)', participant='S', time=[4052, 5817], begin='00:00:04.052', end='00:00:05.817', metadata=None, utterance_clean='S xxx x154052_5817x15 x155817_6140x15', utterance_list=['S', 'xxx', 'x154052_5817x15', 'x155817_6140x15'], n_words=4, n_characters=34, time_to_next=None, dyadic=None, FTO=None),\n", + " Utterance(utterance=\"⌊hm:: (.) if ʔI couldn't boʔrrow, (1.3) the second (0.2) book of readings fo:r\", participant='S', time=[6140, 9487], begin='00:00:06.140', end='00:00:09.487', metadata=None, utterance_clean='S hm if ʔI couldnt boʔrrow x156140_9487x15 the second book of readings for x159487_12888x15', utterance_list=['S', 'hm', 'if', 'ʔI', 'couldnt', 'boʔrrow', 'x156140_9487x15', 'the', 'second', 'book', 'of', 'readings', 'for', 'x159487_12888x15'], n_words=14, n_characters=78, time_to_next=None, dyadic=None, FTO=None),\n", + " Utterance(utterance='commu:nicating acro-', participant='H', time=[12888, 14050], begin='00:00:12.888', end='00:00:14.050', metadata=None, utterance_clean='H communicating acro x1512888_14050x15', utterance_list=['H', 'communicating', 'acro', 'x1512888_14050x15'], n_words=4, n_characters=35, time_to_next=None, dyadic=None, FTO=None),\n", + " Utterance(utterance='no: for family gender and sexuality', participant='H', time=[14050, 17014], begin='00:00:14.050', end='00:00:17.014', 
metadata=None, utterance_clean='H no for family gender and sexuality x1514050_17014x15', utterance_list=['H', 'no', 'for', 'family', 'gender', 'and', 'sexuality', 'x1514050_17014x15'], n_words=8, n_characters=47, time_to_next=None, dyadic=None, FTO=None),\n", + " Utterance(utterance=\"+≋ ah: that's the second on is itʔ\", participant='S', time=[17014, 18611], begin='00:00:17.014', end='00:00:18.611', metadata=None, utterance_clean='S ah thats the second on is itʔ x1517014_18611x15', utterance_list=['S', 'ah', 'thats', 'the', 'second', 'on', 'is', 'itʔ', 'x1517014_18611x15'], n_words=9, n_characters=41, time_to_next=None, dyadic=None, FTO=None),\n", + " Utterance(utterance=\"+≋ I think it's s⌈ame family gender⌉ has a second book\", participant='H', time=[18611, 21090], begin='00:00:18.611', end='00:00:21.090', metadata=None, utterance_clean='H I think its same family gender has a second book x1518611_21090x15', utterance_list=['H', 'I', 'think', 'its', 'same', 'family', 'gender', 'has', 'a', 'second', 'book', 'x1518611_21090x15'], n_words=12, n_characters=57, time_to_next=None, dyadic=None, FTO=None)]" ] }, "execution_count": 3, @@ -225,7 +225,7 @@ { "data": { "text/plain": [ - "[]" + "[]" ] }, "execution_count": 7, @@ -256,6 +256,92 @@ "source": [ "GCSAusE.write_json(path = \"CGSAusE.json\")\n" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Analyzing turn-taking dynamics\n", + "\n", + "When creating a `Conversation` object, a number of calculations and transformations are performed on the `Utterance` objects within.\n", + "For example, the number of words in each utterance is calculated, and stored under `Utterance.n_words`.\n", + "You can see this for a specific utterance as follows:" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "2" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + 
"cha01.utterances[0].n_words" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "More sophisticated calculations can be performed, but do not happen automatically.\n", + "An example of this is the calculation of the Floor Transfer Offset (FTO) per utterance.\n", + "FTO is defined as the difference between the time that a turn starts, and the end of the most relevant prior turn by the other participant.\n", + "If there is overlap between these turns, the FTO is negative.\n", + "If there is a pause between these utterances, the FTO is positive.\n", + "\n", + "We can calculate the FTOs of the utterances in a conversation:" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[0, 1500] S - FTO: None\n", + "[1500, 2775] S - FTO: None\n", + "[2775, 3773] S - FTO: None\n", + "[4052, 5515] H - FTO: 279\n", + "[4052, 5817] S - FTO: None\n", + "[6140, 9487] S - FTO: 625\n", + "[12888, 14050] H - FTO: 3401\n", + "[14050, 17014] H - FTO: 4563\n", + "[17014, 18611] S - FTO: 0\n", + "[18611, 21090] H - FTO: 0\n" + ] + } + ], + "source": [ + "cha01.calculate_FTO()\n", + "\n", + "for utterance in cha01.utterances[:10]:\n", + " print(f'{utterance.time} {utterance.participant} - FTO: {utterance.FTO}')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To determine which prior turn is the relevant turn for FTO calculation, the following criteria are used to find a relevant utterance prior to an utterance U:\n", + "\n", + "- the relevant utterance must be by another participant\n", + "- the relevant utterance must be the most recent utterance by that participant\n", + "- the relevant utterance must have started more than a specified number of ms before the start of U. 
This time defaults to 200 ms, but can be changed with the `planning_buffer` argument.\n", + "- the relevant utterance must be partly or entirely within the context window. The context window is defined as 10s (or 10000ms) prior to the utterance U. The size of this window can be changed with the `window` argument.\n", + "- within the context window, there must be a maximum of 2 speakers, which can be changed to 3 with the `n_participants` argument." + ] } ], "metadata": { From 574a7297063830d660aafd0f4351189f2b6f5728 Mon Sep 17 00:00:00 2001 From: Barbara Vreede Date: Wed, 29 Nov 2023 09:14:24 +0100 Subject: [PATCH 45/53] Update sktalk/corpus/conversation.py Co-authored-by: Ji Qi <92043159+jiqicn@users.noreply.github.com> --- sktalk/corpus/conversation.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/sktalk/corpus/conversation.py b/sktalk/corpus/conversation.py index a34574b..2cbb9ba 100644 --- a/sktalk/corpus/conversation.py +++ b/sktalk/corpus/conversation.py @@ -92,11 +92,9 @@ def _subconversation_by_index(self, raise IndexError("Index out of range") if after is None: after = before - if index - before < 0: - before = index - if index + after + 1 > len(self._utterances): - after = len(self._utterances) - index - 1 - returned_utterances = self._utterances[index-before:index+after+1] + left_bound = max(index-before, 0) + right_bound = min(index + after + 1, len(self._utterances)) + returned_utterances = self._utterances[left_bound:right_bound] return Conversation(utterances=returned_utterances, suppress_warnings=True) def _subconversation_by_time(self, From 473e7536a4af9d2bd13f092e1e30961fc1eb97bf Mon Sep 17 00:00:00 2001 From: Barbara Vreede Date: Wed, 29 Nov 2023 09:54:53 +0100 Subject: [PATCH 46/53] Update sktalk/corpus/conversation.py Co-authored-by: Ji Qi <92043159+jiqicn@users.noreply.github.com> --- sktalk/corpus/conversation.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/sktalk/corpus/conversation.py 
b/sktalk/corpus/conversation.py index 2cbb9ba..2817840 100644 --- a/sktalk/corpus/conversation.py +++ b/sktalk/corpus/conversation.py @@ -228,8 +228,4 @@ def overlap(begin: int, end: int, time: list): # time[0] falls between begin and end # time[1] falls between and end # time[0] is before begin and time[1] is after end - if time is None: - return False - if begin <= time[0] <= end or begin <= time[1] <= end: - return True - return time[0] <= begin and time[1] >= end + return bool(time) and end >= time[0] and begin <= time[1] From e076b6cbed680ba6e25824243ecbe3cd9d90497b Mon Sep 17 00:00:00 2001 From: Barbara Vreede Date: Wed, 29 Nov 2023 09:54:47 +0100 Subject: [PATCH 47/53] add comments re: error --- sktalk/corpus/conversation.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sktalk/corpus/conversation.py b/sktalk/corpus/conversation.py index 2817840..e021b1a 100644 --- a/sktalk/corpus/conversation.py +++ b/sktalk/corpus/conversation.py @@ -130,6 +130,9 @@ def _subconversation_by_time(self, returned_utterances = [ u for u in self._utterances if self.overlap(begin, end, u.time) or u == self._utterances[index]] except (TypeError, IndexError): + # if the utterance's timing is None, a TypeError is raised + # if the utterance has no time[0] or time[1], an IndexError is raised + # In both cases, there is missing timing information, so no data can be returned. 
returned_utterances = [] return Conversation(utterances=returned_utterances, suppress_warnings=True) From 3af95fc9a8c2ca6d3cf712d7262fbb03bf04fb5b Mon Sep 17 00:00:00 2001 From: Barbara Vreede Date: Wed, 29 Nov 2023 09:58:33 +0100 Subject: [PATCH 48/53] Update sktalk/corpus/conversation.py Co-authored-by: Ji Qi <92043159+jiqicn@users.noreply.github.com> --- sktalk/corpus/conversation.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/sktalk/corpus/conversation.py b/sktalk/corpus/conversation.py index e021b1a..6fc8afd 100644 --- a/sktalk/corpus/conversation.py +++ b/sktalk/corpus/conversation.py @@ -215,11 +215,7 @@ def calculate_FTO(self, window: int = 10000, planning_buffer: int = 200, n_parti continue potentials = [ u for u in sub.utterances if utterance.relevant_for_fto(u, planning_buffer)] - try: - relevant = potentials[-1] - values.append(relevant.until(utterance)) - except IndexError: - values.append(None) + values.append(potentials[-1].until(utterance) if potentials else None) self._update("FTO", values, window=window, planning_buffer=planning_buffer, From 4c7ac0c053fe6f5c1243ea928efc6c93260e29f7 Mon Sep 17 00:00:00 2001 From: Barbara Vreede Date: Thu, 30 Nov 2023 17:23:10 +0100 Subject: [PATCH 49/53] rewrite FTO calculation --- sktalk/corpus/conversation.py | 108 ++++++++++++++++++++---------- sktalk/corpus/utterance.py | 33 ++++----- tests/conftest.py | 2 +- tests/corpus/test_conversation.py | 34 ++++------ 4 files changed, 102 insertions(+), 75 deletions(-) diff --git a/sktalk/corpus/conversation.py b/sktalk/corpus/conversation.py index 6fc8afd..b8577b8 100644 --- a/sktalk/corpus/conversation.py +++ b/sktalk/corpus/conversation.py @@ -89,46 +89,55 @@ def _subconversation_by_index(self, Conversation: Conversation object without metadata, containing a reduced set of utterances """ if index < 0 or index >= len(self._utterances): - raise IndexError("Index out of range") + raise IndexError("Utterance index out of range") if after is 
None: after = before left_bound = max(index-before, 0) right_bound = min(index + after + 1, len(self._utterances)) - returned_utterances = self._utterances[left_bound:right_bound] - return Conversation(utterances=returned_utterances, suppress_warnings=True) + return Conversation(utterances=self._utterances[left_bound:right_bound], + suppress_warnings=True) def _subconversation_by_time(self, index: int, before: int = 0, - after: Optional[int] = None, + after: int = 0, exclude_utterance_overlap: bool = False) -> "Conversation": """Select utterances to provide context as a sub-conversation Args: index (int): The index of the utterance for which to provide context before (int, optional): The time in ms preceding the utterance's begin. Defaults to 0. - after (int, optional): The time in ms following the utterance's end. Defaults to None, - which then assumes the same value as `before`. + after (int, optional): The time in ms following the utterance's end. Defaults to 0 exclude_utterance_overlap (bool, optional): If True, the duration of the utterance itself is not used to identify overlapping utterances, and only the window before or after the utterance is used. Defaults to False. + If True, only one of `before` or `after` can be more than 0, as the window + for overlap will be limited to the window preceding or following the utterance. 
Returns: Conversation: Conversation object without metadata, containing a reduced set of utterances """ if index < 0 or index >= len(self._utterances): - raise IndexError("Index out of range") - if after is None: - after = before + raise IndexError("Utterance index out of range") + if exclude_utterance_overlap and before > 0 and after > 0: + raise ValueError( + "When utterance is excluded from overlap window, only one of before or after can be more than 0") try: begin = self._utterances[index].time[0] - before end = self._utterances[index].time[1] + after + left_bound, right_bound = None, None if exclude_utterance_overlap and before == 0: # only overlap with window following utterance begin = self._utterances[index].time[1] + left_bound = index elif exclude_utterance_overlap and after == 0: # only overlap with window preceding utterance end = self._utterances[index].time[0] - returned_utterances = [ - u for u in self._utterances if self.overlap(begin, end, u.time) or u == self._utterances[index]] + right_bound = index + 1 + indices = [i for i, u in enumerate( + self._utterances) if u._overlap([begin, end])] + left_bound = left_bound if bool(left_bound) else min(indices) + right_bound = right_bound if bool( + right_bound) else max(indices) + 1 + returned_utterances = self._utterances[left_bound:right_bound] except (TypeError, IndexError): # if the utterance's timing is None, a TypeError is raised # if the utterance has no time[0] or time[1], an IndexError is raised @@ -187,6 +196,32 @@ def calculate_FTO(self, window: int = 10000, planning_buffer: int = 200, n_parti within the window that do not have timing information, or if it lacks timing information itself. + Args: + window (int, optional): the time in ms prior to utterance in which a + relevant preceding utterance can be found. Defaults to 10000. + planning_buffer (int, optional): minimum speaking time in ms to allow for a response. + Defaults to 200. 
+ n_participants (int, optional): maximum number of participants overlapping with + the utterance and preceding window. Defaults to 2. + """ + values = [] + for index, utterance in enumerate(self.utterances): + relevant = self.relevant_prior_utterance( + index, window, planning_buffer, n_participants) + values.append(relevant.until(utterance) + if bool(relevant) else None) + self._update("FTO", values, + window=window, + planning_buffer=planning_buffer, + n_participants=n_participants) + + def relevant_prior_utterance(self, + index, + window=10000, + planning_buffer=200, + n_participants=2): + """Determine the most relevant prior utterance for a given utterance + To be a relevant prior turn, the following conditions must be met, respective to utterance U: - the utterance must be by another speaker than U - the utterance by the other speaker must be the most recent utterance by that speaker @@ -196,35 +231,38 @@ def calculate_FTO(self, window: int = 10000, planning_buffer: int = 200, n_parti - within the context window, there must be a maximum of `n_participants` speakers. Args: + index (int): index of the utterance to assess window (int, optional): the time in ms prior to utterance in which a relevant preceding utterance can be found. Defaults to 10000. planning_buffer (int, optional): minimum speaking time in ms to allow for a response. Defaults to 200. n_participants (int, optional): maximum number of participants overlapping with the utterance and preceding window. Defaults to 2. 
+ + Returns: + Utterance: the most relevant prior utterance, or None, if no relevant prior utterance can be identified """ - values = [] - for index, utterance in enumerate(self.utterances): - sub = self._subconversation_by_time( - index=index, - before=window, - after=0, - exclude_utterance_overlap=True) - if not 2 <= sub.count_participants() <= n_participants: - values.append(None) + sub = self._subconversation_by_time( + index=index, + before=window, + after=0, + exclude_utterance_overlap=True) + if not 2 <= sub.count_participants() <= n_participants: + return None + must_overlap = [] + for prior in sub.utterances[::-1]: + # if timing information is missing, stop looking for relevant utterances + if not bool(prior.time): + break + # if the utterance is by the same speaker, it is not relevant, + # but must overlap with potential relevant utterance + if self._utterances[index].same_speaker(prior): + must_overlap.append(prior) continue - potentials = [ - u for u in sub.utterances if utterance.relevant_for_fto(u, planning_buffer)] - values.append(potentials[-1].until(utterance) if potentials else None) - self._update("FTO", values, - window=window, - planning_buffer=planning_buffer, - n_participants=n_participants) - - @staticmethod - def overlap(begin: int, end: int, time: list): - # there is overlap if: - # time[0] falls between begin and end - # time[1] falls between and end - # time[0] is before begin and time[1] is after end - return bool(time) and end >= time[0] and begin <= time[1] + # the relevant utterance must precede utterance U more than planning buffer + if not self._utterances[index].precede_with_buffer(prior, planning_buffer): + continue + # verify that all utterances in must_overlap do so + if all(prior.overlap(utt) for utt in must_overlap): + return prior + return None diff --git a/sktalk/corpus/utterance.py b/sktalk/corpus/utterance.py index eda3024..b47846c 100644 --- a/sktalk/corpus/utterance.py +++ b/sktalk/corpus/utterance.py @@ -57,29 +57,24 
@@ def _clean_utterance(self): # remove numbers that are surrounded by spaces self.utterance_clean = re.sub(r'\s\d+\s', ' ', self.utterance_clean) - def until(self, next_utt): - return next_utt.time[0] - self.time[1] + def until(self, other): + return other.time[0] - self.time[1] - def relevant_for_fto(self, prior_utt, planning_buffer: int): - """Assess whether an utterance is potentially relevant to calculate FTO + def overlap(self, other): + return self._overlap(other.time) - An utterance is potentially relevant for fto calculation if: - - the utterance `prior_utt` must be by another speaker - - the utterance `prior_utt` must have started before the utterance itself, more than `planning_buffer` ms before. + def _overlap(self, time): + if not self.time or not time: + return False + return self.time[1] >= time[0] and self.time[0] <= time[1] - The planning buffer is the minimum time between a relevant preceding utterance and the utterance itself + def same_speaker(self, other): + return self.participant == other.participant - Args: - prior_utt (Utterance): utterance to assess - planning_buffer (int): buffer time (in ms) - - Returns: - bool: whether the utterance `prior_utt` meets the criteria and is potentially relevant for FTO calculation - """ - return ( - self.participant != prior_utt.participant - and self.time[0] - planning_buffer >= prior_utt.time[0] - ) + def precede_with_buffer(self, other, planning_buffer=200): + if not bool(self.time) or not bool(other.time): + return None + return self.time[0] - planning_buffer >= other.time[0] def _split_time(self): try: diff --git a/tests/conftest.py b/tests/conftest.py index d574f30..6c63e35 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -38,7 +38,7 @@ def convo_utts(): Utterance( utterance="2 utterance C", participant="A", - time=[1001, 12000] + time=[1001, 8500] ), Utterance( utterance="3 utterance D", diff --git a/tests/corpus/test_conversation.py b/tests/corpus/test_conversation.py index 5aefcf4..ef72af3 
100644 --- a/tests/corpus/test_conversation.py +++ b/tests/corpus/test_conversation.py @@ -80,32 +80,26 @@ def test_subconversation_index(self, convo, args, expected_length): @pytest.mark.parametrize("args, expected_length", [ - ([0, 0, 0], 2), # A, B - ([5, 3000, 3000], 6), # B,C,E,U,F,H - ([5, 0, 0], 3), # C, U, F + ([0, 0, 0, False], 2), # A, B + ([5, 3000, 3000, False], 8), # B-H + ([5, 0, 0, False], 5), # C-F + # no time window, only return U + ([5, 0, 0, True], 1), + # 7 has no timing + ([7, 1000, 1000, False], 0), + ([5, 0, 1500, True], 4), # U-H + ([5, 1000, 0, True], 4), # C-U ]) - def test_subconversation(self, convo, args, expected_length): - index, before, after = args + def test_subconversation_time(self, convo, args, expected_length): + index, before, after, exclude = args sub = convo._subconversation_by_time(index=index, # noqa W0212 before=before, - after=after) + after=after, + exclude_utterance_overlap=exclude + ) assert isinstance(sub, Conversation) assert len(sub.utterances) == expected_length - def test_overlap(self): - # entire utterance in window - assert Conversation.overlap(80, 120, [90, 110]) - # beginning of utterance in window - assert Conversation.overlap(80, 100, [90, 110]) - # end of utterance in window - assert Conversation.overlap(100, 120, [90, 110]) - # utterance covers window entirely - assert Conversation.overlap(95, 105, [90, 110]) - assert not Conversation.overlap( - 120, 140, [90, 110]) # utterance before window - assert not Conversation.overlap( - 70, 80, [90, 110]) # utterance after window - def test_count_participants(self, convo): assert convo.count_participants() == 4 assert convo.count_participants(except_none=True) == 3 From ada29cee5bbaed641dbd403d69d2037b6359286b Mon Sep 17 00:00:00 2001 From: Barbara Vreede Date: Thu, 30 Nov 2023 17:32:45 +0100 Subject: [PATCH 50/53] rename overlap function to make it available --- sktalk/corpus/conversation.py | 2 +- sktalk/corpus/utterance.py | 4 ++-- 2 files changed, 3 
insertions(+), 3 deletions(-) diff --git a/sktalk/corpus/conversation.py b/sktalk/corpus/conversation.py index b8577b8..f25d3e8 100644 --- a/sktalk/corpus/conversation.py +++ b/sktalk/corpus/conversation.py @@ -133,7 +133,7 @@ def _subconversation_by_time(self, end = self._utterances[index].time[0] right_bound = index + 1 indices = [i for i, u in enumerate( - self._utterances) if u._overlap([begin, end])] + self._utterances) if u.window_overlap([begin, end])] left_bound = left_bound if bool(left_bound) else min(indices) right_bound = right_bound if bool( right_bound) else max(indices) + 1 diff --git a/sktalk/corpus/utterance.py b/sktalk/corpus/utterance.py index b47846c..7a8959e 100644 --- a/sktalk/corpus/utterance.py +++ b/sktalk/corpus/utterance.py @@ -61,9 +61,9 @@ def until(self, other): return other.time[0] - self.time[1] def overlap(self, other): - return self._overlap(other.time) + return self.window_overlap(other.time) - def _overlap(self, time): + def window_overlap(self, time): if not self.time or not time: return False return self.time[1] >= time[0] and self.time[0] <= time[1] From 13829bbb07eb444c342e8b2a56b10a8653fa017c Mon Sep 17 00:00:00 2001 From: Barbara Vreede Date: Fri, 1 Dec 2023 12:54:33 +0100 Subject: [PATCH 51/53] update FTO calculation to account for partial overlap --- sktalk/corpus/conversation.py | 15 ++++--- sktalk/corpus/utterance.py | 15 ++++++- tests/conftest.py | 71 +++++++++++++++++++++++++++++++ tests/corpus/test_conversation.py | 52 ++++++++++++++++------ tests/corpus/test_utterance.py | 25 +++++++++++ 5 files changed, 160 insertions(+), 18 deletions(-) diff --git a/sktalk/corpus/conversation.py b/sktalk/corpus/conversation.py index f25d3e8..b49ecad 100644 --- a/sktalk/corpus/conversation.py +++ b/sktalk/corpus/conversation.py @@ -242,6 +242,9 @@ def relevant_prior_utterance(self, Returns: Utterance: the most relevant prior utterance, or None, if no relevant prior utterance can be identified """ + utterance_u = 
self._utterances[index] + if not bool(utterance_u.time) or not bool(utterance_u.participant): + return None sub = self._subconversation_by_time( index=index, before=window, @@ -251,18 +254,20 @@ def relevant_prior_utterance(self, return None must_overlap = [] for prior in sub.utterances[::-1]: - # if timing information is missing, stop looking for relevant utterances - if not bool(prior.time): + # if timing or participant information is missing, stop looking for relevant utterances + if not bool(prior.time) or not bool(prior.participant): break + if prior == utterance_u: + continue # if the utterance is by the same speaker, it is not relevant, # but must overlap with potential relevant utterance - if self._utterances[index].same_speaker(prior): + if utterance_u.same_speaker(prior): must_overlap.append(prior) continue # the relevant utterance must precede utterance U more than planning buffer - if not self._utterances[index].precede_with_buffer(prior, planning_buffer): + if not utterance_u.precede_with_buffer(prior, planning_buffer): continue # verify that all utterances in must_overlap do so - if all(prior.overlap(utt) for utt in must_overlap): + if all(utt.overlap_percentage(prior) == 100 for utt in must_overlap): return prior return None diff --git a/sktalk/corpus/utterance.py b/sktalk/corpus/utterance.py index 7a8959e..7ca3958 100644 --- a/sktalk/corpus/utterance.py +++ b/sktalk/corpus/utterance.py @@ -68,8 +68,21 @@ def window_overlap(self, time): return False return self.time[1] >= time[0] and self.time[0] <= time[1] + def overlap_percentage(self, other): + return self.window_overlap_percentage(other.time) + + def window_overlap_percentage(self, time): + if not self.time or not time: + return None + overlap = self.window_overlap(time) + if not overlap: + return 0 + overlap_duration = min( + self.time[1], time[1]) - max(self.time[0], time[0]) + return overlap_duration / (self.time[1] - self.time[0]) * 100 + def same_speaker(self, other): - return 
self.participant == other.participant + return self.participant == other.participant if bool(self.participant) and bool(other.participant) else None def precede_with_buffer(self, other, planning_buffer=200): if not bool(self.time) or not bool(other.time): diff --git a/tests/conftest.py b/tests/conftest.py index 6c63e35..116caa9 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -83,6 +83,77 @@ def convo(convo_utts, convo_meta): return Conversation(convo_utts, convo_meta) +@pytest.fixture +def utterances_for_fto(): + return [ + Utterance( + utterance="utt 0 - A", + participant="A", + time=[0, 1000] + ), + Utterance( + utterance="utt 1 - B", + participant="B", + time=[200, 300] + ), + Utterance( + utterance="utt 2 - B", + participant="B", + time=[400, 500] + ), + Utterance( + utterance="utt 3 - B", + participant="B", + time=[600, 900] + ), + Utterance( + utterance="utt 4 - B", + participant="B", + time=[1100, 1500] + ), + Utterance( + utterance="utt 5 - A", + participant="A", + time=None + ), + Utterance( + utterance="utt 6 - A", + participant="A", + time=[1300, 1800] + ), + Utterance( + utterance="utt 7 - B", + participant="B", + time=[1450, 1800] + ), + Utterance( + utterance="utt 8 - None", + participant=None, + time=[1850, 1900] + ), + Utterance( + utterance="utt 9 - A", + participant="A", + time=[1900, 2300] + ), + Utterance( + utterance="utt 10 - B", + participant="B", + time=[2200, 2400] + ), + Utterance( + utterance="utt 11 - B", + participant="B", + time=[2450, 2600] + ), + ] + + +@pytest.fixture +def convo_fto(utterances_for_fto, convo_meta): + return Conversation(utterances_for_fto, convo_meta) + + @pytest.fixture def my_corpus(): return Corpus(language="French", diff --git a/tests/corpus/test_conversation.py b/tests/corpus/test_conversation.py index ef72af3..e717920 100644 --- a/tests/corpus/test_conversation.py +++ b/tests/corpus/test_conversation.py @@ -108,15 +108,43 @@ def test_count_participants(self, convo): convo3 = 
convo._subconversation_by_index(index=0) # noqa W0212 assert convo3.count_participants() == 1 - def test_calculate_FTO(self, convo): - convo.calculate_FTO() - assert convo.metadata["Calculations"]["FTO"] == { - "window": 10000, "planning_buffer": 200, "n_participants": 2} - convo.calculate_FTO(window=10) - assert convo.metadata["Calculations"]["FTO"] == { - "window": 10, "planning_buffer": 200, "n_participants": 2} - assert convo.utterances[0].FTO is None - assert convo.utterances[1].FTO == -100 - assert convo.utterances[2].FTO is None - convo.calculate_FTO(planning_buffer=0) - assert convo.utterances[2].FTO == -2499 + @pytest.mark.parametrize("args, index, expected_fto", + [ + ([10000, 200, 2], 0, None), + ([10000, 200, 2], 1, -800), # from 1 to 0 + # no FTO possible in 1 person convo + ([10000, 200, 1], 1, None), + ([10000, 200, 2], 2, -600), # from 2 to 0 + ([10000, 200, 2], 3, -400), # from 3 to 0 + ([1, 200, 2], 3, -400), # 0 still overlaps + ([10000, 200, 2], 4, 100), # from 4 to 0 + # 0 does not fit in window + ([1, 200, 2], 4, None), + # timing info missing + ([10000, 200, 2], 5, None), + # timing info on previous missing + ([10000, 200, 2], 6, None), + # utterance starts <200ms after prior + ([10000, 200, 2], 7, None), + # planning buffer adjusted + ([10000, 100, 2], 7, -350), + # missing participant + ([10000, 200, 2], 8, None), + # missing participant in previous + ([10000, 200, 2], 9, None), + ([100, 200, 2], 10, -100), # fom 10 to 9 + # previous only has partial overlap + ([400, 200, 2], 11, None) + ]) + def test_calculate_FTO(self, convo_fto, args, index, expected_fto): + window, planning_buffer, n_participants = args + convo_fto.calculate_FTO(window, planning_buffer, n_participants) + + # metadata is updated + assert convo_fto.metadata["Calculations"]["FTO"] == { + "window": window, + "planning_buffer": planning_buffer, + "n_participants": n_participants} + + # utterance fto is calculated correctly + assert convo_fto.utterances[index].FTO == 
expected_fto diff --git a/tests/corpus/test_utterance.py b/tests/corpus/test_utterance.py index 861e866..7a0c535 100644 --- a/tests/corpus/test_utterance.py +++ b/tests/corpus/test_utterance.py @@ -35,6 +35,31 @@ def test_until(self, convo_utts): utt1, utt2 = convo_utts[:2] assert utt1.until(utt2) == -100 + @pytest.mark.parametrize("indices, expected", [ + ([0, 1], False), + ([0, 2], True), + ([0, 7], None), + ([1, 7], None), + ([7, 7], None) + ]) + def test_same_speaker(self, convo_utts, indices, expected): + utt1, utt2 = [convo_utts[i] for i in indices] + assert utt1.same_speaker(utt2) == expected + + @pytest.mark.parametrize("utterance_time, window, expected_percentage", [ + ([100, 200], [100, 200], 100), + ([100, 200], [150, 250], 50), + ([100, 200], [0, 400], 100), + ([0, 400], [100, 200], 25), + (None, [0, 300], None), + ([100, 200], None, None), + ([100, 200], [400, 500], 0) + ]) + def test_window_overlap_percentage(self, utterance_time, window, expected_percentage): + utterance = Utterance(utterance="text", time=utterance_time) + assert utterance.window_overlap_percentage( + window) == expected_percentage + milliseconds_timestamp = [ ["0", "00:00:00.000"], ["1706326", "00:28:26.326"], From 2f64c9404ce0907083bddec93d844e0a5e3f9039 Mon Sep 17 00:00:00 2001 From: Barbara Vreede Date: Fri, 1 Dec 2023 14:34:21 +0100 Subject: [PATCH 52/53] refactor overlap functions --- sktalk/corpus/utterance.py | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/sktalk/corpus/utterance.py b/sktalk/corpus/utterance.py index 7ca3958..aec6f05 100644 --- a/sktalk/corpus/utterance.py +++ b/sktalk/corpus/utterance.py @@ -64,22 +64,28 @@ def overlap(self, other): return self.window_overlap(other.time) def window_overlap(self, time): - if not self.time or not time: - return False + if not bool(self.time) or not bool(time): + return None return self.time[1] >= time[0] and self.time[0] <= time[1] + def overlap_duration(self, other): + return 
self.window_overlap_duration(other.time) + + def window_overlap_duration(self, time): + overlap = self.window_overlap(time) + if not bool(overlap): + return overlap if overlap is None else int(overlap) + return min(self.time[1], time[1]) - max(self.time[0], time[0]) + def overlap_percentage(self, other): return self.window_overlap_percentage(other.time) def window_overlap_percentage(self, time): - if not self.time or not time: - return None - overlap = self.window_overlap(time) - if not overlap: - return 0 - overlap_duration = min( - self.time[1], time[1]) - max(self.time[0], time[0]) - return overlap_duration / (self.time[1] - self.time[0]) * 100 + overlap_duration = self.window_overlap_duration(time) + if not bool(overlap_duration): + return overlap_duration + utterance_duration = self.time[1] - self.time[0] + return overlap_duration / utterance_duration * 100 def same_speaker(self, other): return self.participant == other.participant if bool(self.participant) and bool(other.participant) else None From b1e096f973a1b078b3d1b69bc16c241ca42e830a Mon Sep 17 00:00:00 2001 From: Barbara Vreede Date: Fri, 1 Dec 2023 14:36:43 +0100 Subject: [PATCH 53/53] Update sktalk/corpus/parsing/cha.py Co-authored-by: Ji Qi <92043159+jiqicn@users.noreply.github.com> --- sktalk/corpus/parsing/cha.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/sktalk/corpus/parsing/cha.py b/sktalk/corpus/parsing/cha.py index 50999d1..8e97258 100644 --- a/sktalk/corpus/parsing/cha.py +++ b/sktalk/corpus/parsing/cha.py @@ -30,10 +30,7 @@ def _to_utterance(chat_utterance) -> Utterance: utterance=str(chat_utterance.tiers), ) utterance.utterance = ChaFile._clean_utterance(utterance.utterance) - try: - utterance.time = list(utterance.time) - except TypeError: - utterance.time = None + utterance.time = list(utterance.time) if isinstance(utterance.time, (list, tuple)) else None return utterance def _extract_metadata(self):