# preprocess the PolitiHop dataset
import os
import pickle
import pandas as pd
import ast
import tiktoken
from utils import *
train_path = "data/PolitiHop_data/politihop_train.tsv"
valid_path = "data/PolitiHop_data/politihop_valid.tsv"
test_path = "data/PolitiHop_data/politihop_test.tsv"
# load the three splits with pandas
train_df = pd.read_csv(train_path, sep="\t")
valid_df = pd.read_csv(valid_path, sep="\t")
test_df = pd.read_csv(test_path, sep="\t")
# the annotated_evidence column is stored as a stringified dict; parse it back into a dict
train_df["annotated_evidence"] = train_df["annotated_evidence"].apply(eval)
valid_df["annotated_evidence"] = valid_df["annotated_evidence"].apply(eval)
test_df["annotated_evidence"] = test_df["annotated_evidence"].apply(eval)
# merge train and dev instances: group annotation rows that share the same statement
def merge_dicts(dicts):
    """Merge a sequence of evidence-chain dicts into a single dict."""
    merged_dict = {}
    for d in dicts:
        merged_dict.update(d)
    return merged_dict

aggregation = {"article_id": "first", 'author': "first",
               'ruling': "first", 'url_sentences': "first", 'relevant_text_url_sentences': "first",
               'politifact_label': "first", 'annotated_label': "first", 'urls': "first",
               'annotated_urls': "first", "annotated_evidence": merge_dicts}
m_train_df = train_df.groupby("statement").agg(aggregation).reset_index()
m_valid_df = valid_df.groupby("statement").agg(aggregation).reset_index()
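# Illustrative sketch of the merge above (made-up values): two annotation rows for the same
# statement with annotated_evidence {'0': ['1', '2']} and {'1': ['5']} are collapsed into one
# row whose annotated_evidence is {'0': ['1', '2'], '1': ['5']}; the other columns keep the
# first row's value.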
# Preprocessing: concatenate splits; `dataset` uses the merged train/valid plus the raw test split
original_data = pd.concat([train_df, valid_df, test_df], axis=0, ignore_index=True)
dataset = pd.concat([m_train_df, m_valid_df, test_df], axis=0, ignore_index=True)
def get_unique_chains(chains):
    """
    Since there are not many instances, simply remove duplicate chains from the evidence set.
    """
    sets = set(map(tuple, list(chains.values())))
    unique_chains = {str(i): list(c) for i, c in enumerate(sets)}
    return unique_chains
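# Minimal sketch of the deduplication above (hypothetical ids):
#   get_unique_chains({'0': ['1', '2'], '1': ['1', '2'], '2': ['5']})
#   -> two unique chains, e.g. {'0': ['1', '2'], '1': ['5']}
# (the new chain ids are arbitrary because set iteration order is not guaranteed)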
def format_one_ins(sample):
    """Format one sample as input to GPT and return an input_dict.

    input_dict = {'article_id': sample['article_id'],
                  'claim': sample['statement'],
                  'label': sample['annotated_label'],
                  'passage': PASSAGE,
                  'chain_dict': chain_dict,
                  'prompt_dict': prompt_dict}
    prompt_dict = {'chain_id': <prompt of the chain>}
    Each item in prompt_dict contains one standalone reasoning chain.
    """
    CLAIM = f"Claim: {sample['statement']}"
    VERACITY = f"Veracity: {sample['annotated_label']}"
    # original reasoning chain dict {'chain_id': [list of evidence sentence numbers]}
    chains = sample["annotated_evidence"]
    # remove duplicate chains
    chain_dict = get_unique_chains(chains)
    # TEST original chain
    # unique_chains = chains
    ruling = ast.literal_eval(sample["ruling"])
    # number every ruling sentence so chains can refer to sentences by index
    PASSAGE = "\n".join([str(i) + ": " + sen for i, sen in enumerate(ruling)])
    PRE = f"{gpt_prompt_core}\n{CLAIM}\n{VERACITY}\nReasoning chain:\n"
    prompt_dict = {}  # {'chain_id': prompt for this chain}
    for k, v in chain_dict.items():
        CHAIN = f"chain {k}:\n"
        new_v = []
        for i in v:
            if i.isdigit():
                CHAIN += str(i) + ': ' + ruling[int(i)]
            else:
                # some evidence ids are comma-separated spans (e.g. "3,4"); keep only the first id
                i = int(i.split(",")[0])
                CHAIN += str(i) + ': ' + ruling[i]
            CHAIN += '\n'
            new_v.append(str(i))
        # normalize the chain's evidence ids to single sentence indices (as strings)
        chain_dict.update({k: new_v})
        # each PROMPT contains one standalone textual CHAIN
        prompt_dict[k] = PRE + CHAIN
    input_dict = {'article_id': sample['article_id'],
                  'claim': sample['statement'],
                  'label': sample['annotated_label'],
                  'passage': PASSAGE,
                  'chain_dict': chain_dict,
                  'prompt_dict': prompt_dict}
    return input_dict
if __name__ == '__main__':
    merged_dataset = [format_one_ins(dataset.iloc[i]) for i in range(len(dataset))]
    # save the merged dataset to file
    processed_path = 'data/TransExp_data'
    if not os.path.exists(processed_path):
        os.mkdir(processed_path)
    file_path = os.path.join(processed_path, 'merged_data.pickle')
    with open(file_path, 'wb') as file:
        pickle.dump(merged_dataset, file)
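# Usage sketch (not part of the preprocessing itself): the saved pickle is a list of
# input_dicts, one per (merged) statement, and can be loaded back like this:
#   with open('data/TransExp_data/merged_data.pickle', 'rb') as f:
#       merged_dataset = pickle.load(f)
#   print(merged_dataset[0]['prompt_dict'])  # one GPT prompt per reasoning chain of the first claim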