diff --git a/convokit/model/corpus.py b/convokit/model/corpus.py index 0484894b..4c7df037 100644 --- a/convokit/model/corpus.py +++ b/convokit/model/corpus.py @@ -1,3 +1,5 @@ +from pandas import DataFrame +from tqdm import tqdm from typing import List, Collection, Callable, Set, Generator, Tuple, Optional, ValuesView, Union from .corpusHelper import * from convokit.util import deprecation, warn @@ -348,11 +350,11 @@ def get_utterances_dataframe(self, selector: Optional[Callable[[Utterance], bool Get a DataFrame of the utterances with fields and metadata attributes, with an optional selector that filters utterances that should be included. Edits to the DataFrame do not change the corpus in any way. - :param exclude_meta: whether to exclude metadata - :param selector: a (lambda) function that takes a Utterance and returns True or False (i.e. include / exclude). - By default, the selector includes all Utterances in the Corpus. - :return: a pandas DataFrame - """ + :param exclude_meta: whether to exclude metadata + :param selector: a (lambda) function that takes a Utterance and returns True or False (i.e. include / exclude). + By default, the selector includes all Utterances in the Corpus. + :return: a pandas DataFrame + """ return get_utterances_dataframe(self, selector, exclude_meta) def iter_conversations(self, selector: Optional[Callable[[Conversation], bool]] = lambda convo: True) -> Generator[ @@ -372,19 +374,19 @@ def get_conversations_dataframe(self, selector: Optional[Callable[[Conversation] exclude_meta: bool = False): """ Get a DataFrame of the conversations with fields and metadata attributes, with an optional selector that filters - for conversations that should be included. Edits to the DataFrame do not change the corpus in any way. + for conversations that should be included. Edits to the DataFrame do not change the corpus in any way. 
- :param exclude_meta: whether to exclude metadata - :param selector: a (lambda) function that takes a Conversation and returns True or False (i.e. include / exclude). - By default, the selector includes all Conversations in the Corpus. - :return: a pandas DataFrame - """ + :param exclude_meta: whether to exclude metadata + :param selector: a (lambda) function that takes a Conversation and returns True or False (i.e. include / exclude). + By default, the selector includes all Conversations in the Corpus. + :return: a pandas DataFrame + """ return get_conversations_dataframe(self, selector, exclude_meta) def iter_speakers(self, selector: Optional[Callable[[Speaker], bool]] = lambda speaker: True) -> \ Generator[Speaker, None, None]: """ - Get Speakers in the Corpus, with an optional selector that filters for Speakers that should be included + Get Speakers in the Corpus, with an optional selector that filters for Speakers that should be included :param selector: a (lambda) function that takes a Speaker and returns True or False (i.e. include / exclude). By default, the selector includes all Speakers in the Corpus. @@ -399,13 +401,13 @@ def get_speakers_dataframe(self, selector: Optional[Callable[[Speaker], bool]] = exclude_meta: bool = False): """ Get a DataFrame of the Speakers with fields and metadata attributes, with an optional selector that filters - Speakers that should be included. Edits to the DataFrame do not change the corpus in any way. + Speakers that should be included. Edits to the DataFrame do not change the corpus in any way. - :param exclude_meta: whether to exclude metadata - :param selector: selector: a (lambda) function that takes a Speaker and returns True or False - (i.e. include / exclude). By default, the selector includes all Speakers in the Corpus. - :return: a pandas DataFrame - """ + :param exclude_meta: whether to exclude metadata + :param selector: selector: a (lambda) function that takes a Speaker and returns True or False + (i.e. 
# --- convokit/model/corpus.py: static method of ``Corpus`` ---
@staticmethod
def from_pandas(speakers_df: DataFrame, utterances_df: DataFrame, conversations_df: DataFrame) -> 'Corpus':
    """
    Generate a Corpus from speakers, utterances and conversations dataframes.

    For each dataframe, if the 'id' column is absent, the dataframe index is used as the id.
    Columns whose name starts with 'meta.' are read as metadata fields.
    The dataframes passed in are never modified.

    :param speakers_df: speakers data, with metadata columns optional
    :param utterances_df: utterances data. Must contain the primary data fields 'speaker', 'timestamp',
        'conversation_id', 'reply_to' and 'text' (and 'id', unless the index serves as the id);
        metadata columns optional
    :param conversations_df: conversations data, with metadata columns optional
    :return: the generated Corpus
    :raises ValueError: if the utterances dataframe lacks one of the primary data fields
    """
    # primary fields expected in the utterances dataframe
    columns = ['speaker', 'id', 'timestamp', 'conversation_id', 'reply_to', 'text']

    for (df_type, df) in [('utterances', utterances_df), ('conversations', conversations_df),
                          ('speakers', speakers_df)]:
        if 'id' not in df.columns:
            # The index is read as the id further down; the caller's dataframe is left untouched.
            print(f'ID column is not present in {df_type} dataframe, generated ID column from dataframe index...')

    utterances_have_id = 'id' in utterances_df.columns

    # Explicit check instead of `assert`, which disappears under `python -O`.
    # 'id' is exempt because the index stands in for it when the column is absent.
    missing = [col for col in columns if col != 'id' and col not in utterances_df.columns]
    if missing:
        raise ValueError(f"Utterances dataframe must contain all primary data fields (missing: {missing})")

    utterance_meta_cols = extract_meta_from_df(utterances_df)

    # Built once up front so each utterance needs a dict lookup rather than a dataframe scan.
    speaker_meta_by_id = extract_meta_records_from_df(speakers_df)

    utterance_list = []
    for index, row in tqdm(utterances_df.iterrows(), total=len(utterances_df)):
        # extracting utterance metadata
        if utterance_meta_cols:
            metadata = {meta_col: row['meta.' + meta_col] for meta_col in utterance_meta_cols}
        else:
            metadata = None

        # Speakers without a row in speakers_df (or without metadata columns) get no metadata.
        # The dict is copied so Speaker objects never share one mutable mapping.
        speaker_meta = speaker_meta_by_id.get(row['speaker'])
        if speaker_meta is not None:
            speaker_meta = dict(speaker_meta)

        utterance_id = row['id'] if utterances_have_id else index

        utterance_list.append(Utterance(id=str(utterance_id),
                                        speaker=Speaker(id=row['speaker'], meta=speaker_meta),
                                        conversation_id=row['conversation_id'], reply_to=row['reply_to'],
                                        timestamp=row['timestamp'], text=row['text'],
                                        meta=metadata))

    # initializing corpus using utterance_list
    corpus_from_pandas = Corpus(utterances=utterance_list)

    # updating conversation metadata in corpus
    return add_conv_meta_df(conversations_df, corpus_from_pandas)


# --- convokit/model/corpusHelper.py ---
def extract_meta_from_df(df):
    """
    List the metadata keys present in a dataframe.

    A metadata column is one whose name starts with 'meta.'; the key is everything after that
    prefix, so keys that themselves contain dots are kept whole.

    :param df: a pandas DataFrame
    :return: list of metadata keys, in column order
    """
    prefix = 'meta.'
    return [col[len(prefix):] for col in df.columns
            if isinstance(col, str) and col.startswith(prefix)]


def extract_meta_records_from_df(df):
    """
    Map each id in a dataframe to a dict of its metadata.

    The id is read from the 'id' column, or from the index if that column is absent. If an id
    occurs on several rows, the first row is used. The dataframe is not modified.

    :param df: a pandas DataFrame with optional 'meta.'-prefixed columns
    :return: dict of id -> {metadata key: value}; empty if the dataframe has no metadata columns
    """
    meta_keys = extract_meta_from_df(df)
    if not meta_keys:
        return {}

    ids = df['id'] if 'id' in df.columns else df.index
    rows = df[['meta.' + key for key in meta_keys]].to_dict(orient='records')

    records = {}
    for entry_id, row in zip(ids, rows):
        if entry_id not in records:
            records[entry_id] = {key: row['meta.' + key] for key in meta_keys}
    return records


def add_conv_meta_df(conversations_df, corpus):
    """
    Copy conversation metadata from a dataframe onto the Conversations of a corpus.

    Conversations with no matching row in the dataframe are left unchanged, and so is every
    conversation when the dataframe has no metadata columns. The dataframe is not modified.

    :param conversations_df: conversations data; ids come from the 'id' column, or the index if absent
    :param corpus: the corpus whose conversations should be updated
    :return: the same corpus object, with conversation metadata updated
    """
    meta_by_convo_id = extract_meta_records_from_df(conversations_df)
    if not meta_by_convo_id:
        return corpus

    for convo in corpus.iter_conversations():
        conversation_meta = meta_by_convo_id.get(convo.get_id())
        if conversation_meta is not None:
            convo.meta.update(conversation_meta)

    return corpus
import unittest
from convokit.model import Utterance, Speaker, Corpus
from convokit import download


class CorpusFromPandas(unittest.TestCase):
    """Round-trip tests: export a Corpus to dataframes and rebuild it with ``Corpus.from_pandas``."""

    @classmethod
    def setUpClass(cls) -> None:
        # Download and rebuild once for the whole class; the tests only read from both corpora.
        cls.corpus = Corpus(download('subreddit-hey'))
        utt_df = cls.corpus.get_utterances_dataframe()
        convo_df = cls.corpus.get_conversations_dataframe()
        speaker_df = cls.corpus.get_speakers_dataframe()
        cls.new_corpus = Corpus.from_pandas(speaker_df, utt_df, convo_df)

    def test_reconstruction_stats(self):
        """
        Test that reconstructing the Corpus from outputted dataframes results in the same number of corpus components
        """
        self.assertEqual(len(self.new_corpus.speakers), len(self.corpus.speakers))
        self.assertEqual(len(self.new_corpus.conversations), len(self.corpus.conversations))
        self.assertEqual(len(self.new_corpus.utterances), len(self.corpus.utterances))

    def test_reconstruction_metadata(self):
        """
        Test that every reconstructed component carries the same metadata keys as the component
        with the same id in the source Corpus.
        """
        # Compare like with like: two independently drawn random objects would make the test
        # nondeterministic and would never pair an object with its own reconstruction.
        # NOTE(review): assumes corpus.utterances / .conversations / .speakers are dicts keyed by id.
        for utt_id, utt in self.corpus.utterances.items():
            self.assertEqual(set(utt.meta), set(self.new_corpus.utterances[utt_id].meta))
        for convo_id, convo in self.corpus.conversations.items():
            self.assertEqual(set(convo.meta), set(self.new_corpus.conversations[convo_id].meta))
        for speaker_id, speaker in self.corpus.speakers.items():
            self.assertEqual(set(speaker.meta), set(self.new_corpus.speakers[speaker_id].meta))

    def test_convo_reconstruction(self):
        """
        Test that every reconstructed conversation passes its own integrity check.
        """
        for convo in self.new_corpus.iter_conversations():
            self.assertTrue(convo.check_integrity(verbose=False))


if __name__ == '__main__':
    unittest.main()