Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added 'from_pandas' method to create Corpus from DataFrames. #136

Merged
merged 7 commits into from
Sep 27, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
104 changes: 82 additions & 22 deletions convokit/model/corpus.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from pandas import DataFrame
from tqdm import tqdm
from typing import List, Collection, Callable, Set, Generator, Tuple, Optional, ValuesView, Union
from .corpusHelper import *
from convokit.util import deprecation, warn
Expand Down Expand Up @@ -348,11 +350,11 @@ def get_utterances_dataframe(self, selector: Optional[Callable[[Utterance], bool
Get a DataFrame of the utterances with fields and metadata attributes, with an optional selector that filters
utterances that should be included. Edits to the DataFrame do not change the corpus in any way.

:param exclude_meta: whether to exclude metadata
:param selector: a (lambda) function that takes a Utterance and returns True or False (i.e. include / exclude).
By default, the selector includes all Utterances in the Corpus.
:return: a pandas DataFrame
"""
:param exclude_meta: whether to exclude metadata
:param selector: a (lambda) function that takes a Utterance and returns True or False (i.e. include / exclude).
By default, the selector includes all Utterances in the Corpus.
:return: a pandas DataFrame
"""
return get_utterances_dataframe(self, selector, exclude_meta)

def iter_conversations(self, selector: Optional[Callable[[Conversation], bool]] = lambda convo: True) -> Generator[
Expand All @@ -372,19 +374,19 @@ def get_conversations_dataframe(self, selector: Optional[Callable[[Conversation]
exclude_meta: bool = False):
"""
Get a DataFrame of the conversations with fields and metadata attributes, with an optional selector that filters
for conversations that should be included. Edits to the DataFrame do not change the corpus in any way.
for conversations that should be included. Edits to the DataFrame do not change the corpus in any way.

:param exclude_meta: whether to exclude metadata
:param selector: a (lambda) function that takes a Conversation and returns True or False (i.e. include / exclude).
By default, the selector includes all Conversations in the Corpus.
:return: a pandas DataFrame
"""
:param exclude_meta: whether to exclude metadata
:param selector: a (lambda) function that takes a Conversation and returns True or False (i.e. include / exclude).
By default, the selector includes all Conversations in the Corpus.
:return: a pandas DataFrame
"""
return get_conversations_dataframe(self, selector, exclude_meta)

def iter_speakers(self, selector: Optional[Callable[[Speaker], bool]] = lambda speaker: True) -> \
Generator[Speaker, None, None]:
"""
Get Speakers in the Corpus, with an optional selector that filters for Speakers that should be included
Get Speakers in the Corpus, with an optional selector that filters for Speakers that should be included

:param selector: a (lambda) function that takes a Speaker and returns True or False (i.e. include / exclude).
By default, the selector includes all Speakers in the Corpus.
Expand All @@ -399,13 +401,13 @@ def get_speakers_dataframe(self, selector: Optional[Callable[[Speaker], bool]] =
exclude_meta: bool = False):
"""
Get a DataFrame of the Speakers with fields and metadata attributes, with an optional selector that filters
Speakers that should be included. Edits to the DataFrame do not change the corpus in any way.
Speakers that should be included. Edits to the DataFrame do not change the corpus in any way.

:param exclude_meta: whether to exclude metadata
:param selector: selector: a (lambda) function that takes a Speaker and returns True or False
(i.e. include / exclude). By default, the selector includes all Speakers in the Corpus.
:return: a pandas DataFrame
"""
:param exclude_meta: whether to exclude metadata
:param selector: selector: a (lambda) function that takes a Speaker and returns True or False
(i.e. include / exclude). By default, the selector includes all Speakers in the Corpus.
:return: a pandas DataFrame
"""
return get_speakers_dataframe(self, selector, exclude_meta)

def iter_users(self, selector=lambda speaker: True):
Expand Down Expand Up @@ -494,11 +496,11 @@ def get_usernames(self, selector: Optional[Callable[[Speaker], bool]] = lambda u

def filter_conversations_by(self, selector: Callable[[Conversation], bool]):
"""
Mutate the corpus by filtering for a subset of Conversations within the Corpus.
Mutate the corpus by filtering for a subset of Conversations within the Corpus.

:param selector: function for selecting which Conversations to keep
:return: the mutated Corpus
"""
:param selector: function for selecting which Conversations to keep
:return: the mutated Corpus
"""

self.conversations = {convo_id: convo for convo_id, convo in self.conversations.items() if selector(convo)}
utt_ids = set([utt for convo in self.conversations.values() for utt in convo.get_utterance_ids()])
Expand Down Expand Up @@ -1269,6 +1271,64 @@ def get_full_attribute_table(self, speaker_convo_attrs, speaker_attrs=None, conv
c_df.columns = [x + convo_suffix for x in c_df.columns]
return uc_df.join(u_df, on='speaker').join(c_df, on='convo_id')

@staticmethod
def from_pandas(speakers_df: DataFrame, utterances_df: DataFrame, conversations_df: DataFrame) -> 'Corpus':
    """
    Generate a Corpus from speakers, utterances and conversations dataframes.

    For each dataframe, if the 'id' column is absent, the dataframe index is used as the id.
    Metadata is read from the columns reported by extract_meta_from_df, looked up under their
    'meta.'-prefixed names. The dataframes passed in are not modified.

    :param speakers_df: speakers data, with metadata optional. Utterances whose speaker has no
        row here get a Speaker without metadata.
    :param utterances_df: utterances data. All primary data fields ('speaker', 'id', 'timestamp',
        'conversation_id', 'reply_to', 'text') are expected, with metadata optional
    :param conversations_df: conversations data, with metadata optional
    :return: the generated Corpus
    :raises AssertionError: if utterances_df lacks any of the primary data fields
    """
    # list of all primary fields expected in the utterances dataframe
    required_columns = ['speaker', 'id', 'timestamp', 'conversation_id', 'reply_to', 'text']

    def with_id_column(df, df_type):
        # Return a frame that has an 'id' column. `assign` builds a new frame, so the
        # caller's dataframe is never altered.
        if 'id' in df.columns:
            return df
        print(f'ID column is not present in {df_type} dataframe, generated ID column from dataframe index...')
        return df.assign(id=df.index)

    utterances_df = with_id_column(utterances_df, 'utterances')
    conversations_df = with_id_column(conversations_df, 'conversations')
    speakers_df = with_id_column(speakers_df, 'speakers')

    # Raised explicitly (not via `assert`) so the check survives `python -O`; the exception
    # type is kept as AssertionError for backward compatibility.
    if not set(required_columns).issubset(utterances_df.columns):
        raise AssertionError("Utterances dataframe must contain all primary data fields")

    utterance_meta_cols = extract_meta_from_df(utterances_df)
    speaker_meta_cols = extract_meta_from_df(speakers_df)

    # Index speaker metadata by speaker id once, instead of filtering the whole speakers
    # dataframe for every utterance.
    speaker_meta_by_id = {}
    if speaker_meta_cols:
        prefixed_cols = ['meta.' + col for col in speaker_meta_cols]
        name_by_column = dict(zip(prefixed_cols, speaker_meta_cols))
        records = speakers_df[prefixed_cols].to_dict(orient='records')
        for speaker_id, record in zip(speakers_df['id'], records):
            # setdefault keeps the first row seen for a given speaker id
            speaker_meta_by_id.setdefault(
                speaker_id, {name_by_column[col]: value for col, value in record.items()}
            )

    utterance_list = []
    for _, row in tqdm(utterances_df.iterrows(), total=len(utterances_df)):
        # extracting utterance metadata
        if utterance_meta_cols:
            metadata = {meta_col: row['meta.' + meta_col] for meta_col in utterance_meta_cols}
        else:
            metadata = None

        # extracting speaker metadata; copied so that Speaker objects never share one dict
        speaker_meta = speaker_meta_by_id.get(row['speaker'])
        if speaker_meta is not None:
            speaker_meta = dict(speaker_meta)

        # adding utterance in utterance list
        utterance_list.append(Utterance(id=str(row['id']),
                                        speaker=Speaker(id=row['speaker'], meta=speaker_meta),
                                        conversation_id=row['conversation_id'],
                                        reply_to=row['reply_to'],
                                        timestamp=row['timestamp'],
                                        text=row['text'],
                                        meta=metadata))

    # initializing corpus using utterance_list
    corpus_from_pandas = Corpus(utterances=utterance_list)

    # updating conversation metadata in corpus; nothing to do when there are no metadata
    # columns. A shallow copy is handed over so that any relabelling done by the helper
    # cannot reach the caller's dataframe.
    if extract_meta_from_df(conversations_df):
        corpus_from_pandas = add_conv_meta_df(conversations_df.copy(deep=False), corpus_from_pandas)

    return corpus_from_pandas

# def __repr__(self):
# def __eq__(self, other):
# return True
17 changes: 16 additions & 1 deletion convokit/model/corpusHelper.py
Original file line number Diff line number Diff line change
Expand Up @@ -358,4 +358,19 @@ def dump_jsonlist_from_dict(entries, filename, index_key='id', value_key='value'
with open(filename, 'w') as f:
for k, v in entries.items():
json.dump({index_key: k, value_key: v}, f)
f.write('\n')
f.write('\n')

def extract_meta_from_df(df):
    """
    List the metadata field names encoded in a dataframe's column labels.

    A metadata column is one whose label is a string starting with the 'meta.' prefix; the
    field name is everything after that prefix (so 'meta.a.b' yields 'a.b'). Labels that
    merely begin with 'meta' (e.g. 'metadata') and non-string labels are ignored.

    :param df: a pandas DataFrame (or any iterable of column labels)
    :return: list of metadata field names, in column order, with the 'meta.' prefix removed
    """
    prefix = 'meta.'
    return [col[len(prefix):] for col in df if isinstance(col, str) and col.startswith(prefix)]

def add_conv_meta_df(conversations_df, corpus):
    """
    Copy conversation metadata from a dataframe onto the Conversations of a corpus.

    Metadata fields are the ones reported by extract_meta_from_df, read from the
    'meta.'-prefixed columns. Rows are matched to conversations through the 'id' column,
    or through the dataframe index when there is no 'id' column. Conversations without a
    matching row are left untouched, and the dataframe itself is not modified.

    :param conversations_df: conversations data, with metadata columns optional
    :param corpus: the Corpus whose Conversations should receive the metadata
    :return: the same Corpus, with conversation metadata updated
    """
    conv_meta_cols = extract_meta_from_df(conversations_df)
    if not conv_meta_cols:
        # no metadata columns: nothing to add
        return corpus

    prefixed_cols = ['meta.' + col for col in conv_meta_cols]
    name_by_column = dict(zip(prefixed_cols, conv_meta_cols))
    if 'id' in conversations_df.columns:
        convo_ids = conversations_df['id']
    else:
        convo_ids = conversations_df.index

    # Index the metadata by conversation id once, rather than filtering the dataframe
    # for every conversation.
    records = conversations_df[prefixed_cols].to_dict(orient='records')
    meta_by_convo_id = {}
    for convo_id, record in zip(convo_ids, records):
        # setdefault keeps the first row seen for a given conversation id
        meta_by_convo_id.setdefault(
            convo_id, {name_by_column[col]: value for col, value in record.items()}
        )

    for convo in corpus.iter_conversations():
        conversation_meta = meta_by_convo_id.get(convo.get_id())
        if conversation_meta is not None:
            convo.meta.update(conversation_meta)

    return corpus
32 changes: 32 additions & 0 deletions convokit/tests/general/test_from_pandas.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
import unittest
from convokit.model import Utterance, Speaker, Corpus
from convokit import download

class CorpusFromPandas(unittest.TestCase):
    """
    Round-trip checks: dump a Corpus to dataframes, rebuild it with Corpus.from_pandas and
    compare the rebuilt Corpus against the original.
    """

    @classmethod
    def setUpClass(cls) -> None:
        # Built once for the whole class: the tests only read from the two corpora, so
        # repeating the download and reconstruction before every test is unnecessary.
        cls.corpus = Corpus(download('subreddit-hey'))
        utt_df = cls.corpus.get_utterances_dataframe()
        convo_df = cls.corpus.get_conversations_dataframe()
        speaker_df = cls.corpus.get_speakers_dataframe()
        cls.new_corpus = Corpus.from_pandas(speaker_df, utt_df, convo_df)

    def test_reconstruction_stats(self):
        """
        Test that reconstructing the Corpus from outputted dataframes results in the same number of corpus components
        """
        self.assertEqual(len(self.new_corpus.speakers), len(self.corpus.speakers))
        self.assertEqual(len(self.new_corpus.conversations), len(self.corpus.conversations))
        self.assertEqual(len(self.new_corpus.utterances), len(self.corpus.utterances))

    def test_reconstruction_metadata(self):
        """
        Test that a component of the original Corpus and the component with the same id in the
        reconstructed Corpus carry the same metadata keys
        """
        # Look each randomly drawn component up by id in the rebuilt corpus, so that the
        # comparison is between corresponding objects rather than two unrelated draws.
        utt = self.corpus.random_utterance()
        self.assertEqual(set(utt.meta), set(self.new_corpus.get_utterance(utt.id).meta))

        convo = self.corpus.random_conversation()
        self.assertEqual(set(convo.meta), set(self.new_corpus.get_conversation(convo.id).meta))

        speaker = self.corpus.random_speaker()
        self.assertEqual(set(speaker.meta), set(self.new_corpus.get_speaker(speaker.id).meta))

    def test_convo_reconstruction(self):
        """
        Test that every reconstructed Conversation passes its integrity check
        """
        for convo in self.new_corpus.iter_conversations():
            self.assertTrue(convo.check_integrity(verbose=False))


if __name__ == '__main__':
    unittest.main()
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,5 @@ nltk>=3.4
dill>=0.2.9
torch>=1.2.0
clean-text>=0.1.1
joblib>=0.13.2
joblib>=0.13.2
tqdm