-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathvary_combo_rand.py
151 lines (114 loc) · 5.21 KB
/
vary_combo_rand.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
import torch
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from src.quere import ClosedEndedExplanationDataset, SquadExplanationDataset, OpenEndedExplanationDataset
from baselines.rep_dataset import RepDataset
from src.utils import train_linear_model, compute_ece, get_linear_results
def _load_dataset(dataset_name, llm, random=False):
    """Instantiate the explanation dataset for ``dataset_name``.

    Parameters
    ----------
    dataset_name : str
        One of "BooIQ", "HaluEval", "ToxicEval", "CommonsenseQA",
        "WinoGrande", "squad", "nq".
    llm : str
        LLM identifier forwarded to the dataset constructor.
    random : bool
        When True, load the random-sequence variant. ``random=True`` is
        only passed explicitly in that case, mirroring the original call
        signatures.

    Raises
    ------
    ValueError
        If ``dataset_name`` is not recognized. (Previously an unknown
        name fell through silently and crashed later with NameError.)
    """
    def _make(cls, *args):
        return cls(*args, random=True) if random else cls(*args)

    if dataset_name in ("BooIQ", "HaluEval", "ToxicEval", "CommonsenseQA"):
        return _make(ClosedEndedExplanationDataset, dataset_name, llm)
    if dataset_name == "WinoGrande":
        # NOTE(review): the original code omits the dataset-name argument
        # for WinoGrande — confirm the constructor supports this signature.
        return _make(ClosedEndedExplanationDataset, llm)
    if dataset_name == "squad":
        return _make(SquadExplanationDataset, llm)
    if dataset_name == "nq":
        return _make(OpenEndedExplanationDataset, llm)
    raise ValueError(f"Unknown dataset_name: {dataset_name!r}")


def vary_number_prompts_random(dataset_name, llm):
    """Plot test AUROC vs. number of combined elicitation/random prompts.

    For each of 20 seeds and each prompt count in ``range(2, 8)``, selects
    ``num_prompts`` random feature columns from the normal dataset and
    ``num_prompts`` from the random-sequence dataset, concatenates them,
    fits a balanced linear probe via ``get_linear_results``, and records
    the test AUROC. Saves mean-with-standard-error plots to
    ``figs/combo_<dataset_name>_<llm>.png`` and ``.pdf``.

    Parameters
    ----------
    dataset_name : str
        Dataset key (see ``_load_dataset``).
    llm : str
        LLM identifier; also embedded in the output file names.
    """
    dataset = _load_dataset(dataset_name, llm)
    random_dataset = _load_dataset(dataset_name, llm, random=True)

    train_data, train_labels, train_log_probs = \
        dataset.train_data, dataset.train_labels, dataset.train_log_probs
    test_data, test_labels, test_log_probs = \
        dataset.test_data, dataset.test_labels, dataset.test_log_probs
    train_pre_conf, train_post_conf = dataset.train_pre_confs, dataset.train_post_confs
    test_pre_conf, test_post_conf = dataset.test_pre_confs, dataset.test_post_confs

    # Flatten per-prompt features to (n_examples, -1); these feed the
    # commented-out "all features" variant below.
    train_pre_conf = train_pre_conf.reshape(len(train_data), -1)
    test_pre_conf = test_pre_conf.reshape(len(test_data), -1)
    train_post_conf = train_post_conf.reshape(len(train_data), -1)
    test_post_conf = test_post_conf.reshape(len(test_data), -1)
    train_log_probs = train_log_probs.reshape(len(train_data), -1)
    test_log_probs = test_log_probs.reshape(len(test_data), -1)

    train_random_data = random_dataset.train_data
    test_random_data = random_dataset.test_data
    print("train_random_data", train_random_data.shape)
    print("train_data", train_data.shape)

    seeds = range(20)
    num_prompt_list = range(2, 8, 1)

    # Alternative feature set: also append log-prob/confidence features.
    # train_data_all = np.concatenate([train_data, train_log_probs, train_pre_conf, train_post_conf], axis=1)
    # test_data_all = np.concatenate([test_data, test_log_probs, test_pre_conf, test_post_conf], axis=1)
    train_data_all = train_data
    test_data_all = test_data
    print(train_data.shape, test_data.shape)

    results = {s: [] for s in num_prompt_list}
    for seed in seeds:
        # Seed both RNGs so the column selection and the probe fit are
        # reproducible per seed.
        np.random.seed(seed)
        torch.manual_seed(seed)
        # Independent column shuffles for the normal vs. random features.
        random_idxs = np.random.permutation(train_data.shape[1])
        norm_idxs = np.random.permutation(train_data.shape[1])
        for num_prompts in num_prompt_list:
            random_ids = random_idxs[:num_prompts]
            norm_ids = norm_idxs[:num_prompts]
            train_data_subset = train_data_all[:, norm_ids]
            test_data_subset = test_data_all[:, norm_ids]
            train_random_data_subset = train_random_data[:, random_ids]
            test_random_data_subset = test_random_data[:, random_ids]
            all_train_data = np.concatenate([train_data_subset, train_random_data_subset], axis=1)
            all_test_data = np.concatenate([test_data_subset, test_random_data_subset], axis=1)
            # Fit the linear probe; only AUROC is used for the plot.
            _acc, _f1, _ece, auroc = get_linear_results(
                all_train_data, train_labels, all_test_data, test_labels,
                seed=seed, balanced=True)
            results[num_prompts].append(auroc)

    # Plot mean AUROC with a standard-error band across seeds.
    import matplotlib.pyplot as plt
    plt.figure()
    means = [np.mean(results[s]) for s in num_prompt_list]
    stds = [np.std(results[s]) / np.sqrt(len(seeds)) for s in num_prompt_list]
    print(means, stds)
    # plt.errorbar(num_prompt_list, means, yerr=stds)
    plt.plot(num_prompt_list, means, label="Combination of Elicitation Prompts and Random Sequences")
    plt.fill_between(num_prompt_list,
                     [m - s for m, s in zip(means, stds)],
                     [m + s for m, s in zip(means, stds)], alpha=0.2)
    plt.xlabel("Number of Random Sequences", fontsize=24)
    plt.xticks(fontsize=14)
    plt.yticks(fontsize=14)
    plt.ylabel("AUROC", fontsize=24)
    plt.legend()
    plt.tight_layout()
    plt.savefig("figs/combo_" + dataset_name + "_" + llm + ".png")
    plt.savefig("figs/combo_" + dataset_name + "_" + llm + ".pdf")
if __name__ == "__main__":
    # Fix the global RNG state up front so the run is reproducible.
    np.random.seed(0)
    torch.manual_seed(0)
    # Sweep prompt counts on SQuAD with the Llama-3 70B model.
    vary_number_prompts_random("squad", "llama3-70b")