-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy pathutils.py
419 lines (355 loc) · 16.8 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
import os
import pathlib
import random
import json
import numpy as np
from scipy import linalg
import sklearn.cluster
import torch
import transformers
from transformers import (
AutoTokenizer,
AutoModel
)
try:
from tqdm import tqdm
except ImportError:
# If tqdm is not available, provide a mock version of it
def tqdm(x):
return x
# ----- tools about configuring model and loading data ----- #
def read_data(file_path):
querys, answers = [], []
with open(file_path, 'r', encoding='utf-8') as fin:
for line in fin:
qa = line.split('\t')
query, answer = qa[:2] if len(qa) > 1 else (qa[0], '')
querys.append(query.strip())
answers.append(answer.strip())
return querys, answers
def get_model_configs(pretrained_model_path, is_chinese=False):
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_path)
model = AutoModel.from_pretrained(pretrained_model_path, return_dict=True)
return tokenizer, model
def get_embeddings(querys, answers, tokenizer, model, batch_size, use_cuda=True):
feats = []
model.eval()
if use_cuda:
model.to('cuda')
with torch.no_grad():
num_batches = len(querys) // batch_size
if len(querys) % batch_size > 0:
num_batches += 1
for i in tqdm(range(num_batches)):
query = querys[i*batch_size : (i+1)*batch_size]
answer = answers[i*batch_size : (i+1)*batch_size]
if len(query)== 0 or len(answer) == 0:
continue
inputs = tokenizer(query, answer, return_tensors='pt', padding=True)
if use_cuda:
inputs.to('cuda')
outputs = model(**inputs)
feats.append(outputs.last_hidden_state[:, 0].cpu().data)
feats = torch.cat(feats).numpy()
return feats
# ----- details on calculating FBD ----- #
def calculate_feature_statistics(feats):
"""Calculation of the statistics used by the FID.
Params:
-- feats : tensor of features with the shape [N, D]
Returns:
-- mu : The mean over samples of the activations of the pool_3 layer of
the inception model.
-- sigma : The covariance matrix of the activations of the pool_3 layer of
the inception model.
"""
mu = np.mean(feats, axis=0)
sigma = np.cov(feats, rowvar=False)
return mu, sigma
def calculate_frechet_distance(mu1, sigma1, mu2, sigma2, eps=1e-6):
"""Numpy implementation of the Frechet Distance.
The Frechet distance between two multivariate Gaussians X_1 ~ N(mu_1, C_1)
and X_2 ~ N(mu_2, C_2) is
d^2 = ||mu_1 - mu_2||^2 + Tr(C_1 + C_2 - 2*sqrt(C_1*C_2)).
Stable version by Dougal J. Sutherland.
Params:
-- mu1 : Numpy array containing the activations of a layer of the
inception net (like returned by the function 'get_predictions')
for generated samples.
-- mu2 : The sample mean over activations, precalculated on an
representative data set.
-- sigma1: The covariance matrix over activations for generated samples.
-- sigma2: The covariance matrix over activations, precalculated on an
representative data set.
Returns:
-- : The Frechet Distance.
"""
mu1 = np.atleast_1d(mu1)
mu2 = np.atleast_1d(mu2)
sigma1 = np.atleast_2d(sigma1)
sigma2 = np.atleast_2d(sigma2)
assert mu1.shape == mu2.shape, \
'Training and test mean vectors have different lengths'
assert sigma1.shape == sigma2.shape, \
'Training and test covariances have different dimensions'
diff = mu1 - mu2
# Product might be almost singular
covmean, _ = linalg.sqrtm(sigma1.dot(sigma2), disp=False)
if not np.isfinite(covmean).all():
msg = ('fid calculation produces singular product; '
'adding %s to diagonal of cov estimates') % eps
print(msg)
offset = np.eye(sigma1.shape[0]) * eps
covmean = linalg.sqrtm((sigma1 + offset).dot(sigma2 + offset))
# Numerical error might give slight imaginary component
if np.iscomplexobj(covmean):
if not np.allclose(np.diagonal(covmean).imag, 0, atol=1e-3):
m = np.max(np.abs(covmean.imag))
raise ValueError('Imaginary component {}'.format(m))
covmean = covmean.real
tr_covmean = np.trace(covmean)
return (diff.dot(diff) + np.trace(sigma1)
+ np.trace(sigma2) - 2 * tr_covmean)
# ----- details on calculating PRD ----- #
def compute_prd(eval_dist, ref_dist, num_angles=1001, epsilon=1e-10):
"""Computes the PRD curve for discrete distributions.
This function computes the PRD curve for the discrete distribution eval_dist
with respect to the reference distribution ref_dist. This implements the
algorithm in [arxiv.org/abs/1806.2281349]. The PRD will be computed for an
equiangular grid of num_angles values between [0, pi/2].
Args:
eval_dist: 1D NumPy array or list of floats with the probabilities of the
different states under the distribution to be evaluated.
ref_dist: 1D NumPy array or list of floats with the probabilities of the
different states under the reference distribution.
num_angles: Number of angles for which to compute PRD. Must be in [3, 1e6].
The default value is 1001.
epsilon: Angle for PRD computation in the edge cases 0 and pi/2. The PRD
will be computes for epsilon and pi/2-epsilon, respectively.
The default value is 1e-10.
Returns:
precision: NumPy array of shape [num_angles] with the precision for the
different ratios.
recall: NumPy array of shape [num_angles] with the recall for the different
ratios.
Raises:
ValueError: If not 0 < epsilon <= 0.1.
ValueError: If num_angles < 3.
"""
if not (epsilon > 0 and epsilon < 0.1):
raise ValueError('epsilon must be in (0, 0.1] but is %s.' % str(epsilon))
if not (num_angles >= 3 and num_angles <= 1e6):
raise ValueError('num_angles must be in [3, 1e6] but is %d.' % num_angles)
# Compute slopes for linearly spaced angles between [0, pi/2]
angles = np.linspace(epsilon, np.pi / 2 - epsilon, num=num_angles)
slopes = np.tan(angles)
# Broadcast slopes so that second dimension will be states of the distribution
slopes_2d = np.expand_dims(slopes, 1)
# Broadcast distributions so that first dimension represents the angles
ref_dist_2d = np.expand_dims(ref_dist, 0)
eval_dist_2d = np.expand_dims(eval_dist, 0)
# Compute precision and recall for all angles in one step via broadcasting
precision = np.minimum(ref_dist_2d * slopes_2d, eval_dist_2d).sum(axis=1)
recall = precision / slopes
# handle numerical instabilities leaing to precision/recall just above 1
max_val = max(np.max(precision), np.max(recall))
if max_val > 1.001:
raise ValueError('Detected value > 1.001, this should not happen.')
precision = np.clip(precision, 0, 1)
recall = np.clip(recall, 0, 1)
return precision, recall
def _prd_to_f_beta(precision, recall, beta=1, epsilon=1e-10):
"""Computes F_beta scores for the given precision/recall values.
The F_beta scores for all precision/recall pairs will be computed and
returned.
For precision p and recall r, the F_beta score is defined as:
F_beta = (1 + beta^2) * (p * r) / ((beta^2 * p) + r)
Args:
precision: 1D NumPy array of precision values in [0, 1].
recall: 1D NumPy array of precision values in [0, 1].
beta: Beta parameter. Must be positive. The default value is 1.
epsilon: Small constant to avoid numerical instability caused by division
by 0 when precision and recall are close to zero.
Returns:
NumPy array of same shape as precision and recall with the F_beta scores for
each pair of precision/recall.
Raises:
ValueError: If any value in precision or recall is outside of [0, 1].
ValueError: If beta is not positive.
"""
if not ((precision >= 0).all() and (precision <= 1).all()):
raise ValueError('All values in precision must be in [0, 1].')
if not ((recall >= 0).all() and (recall <= 1).all()):
raise ValueError('All values in recall must be in [0, 1].')
if beta <= 0:
raise ValueError('Given parameter beta %s must be positive.' % str(beta))
return (1 + beta**2) * (precision * recall) / (
(beta**2 * precision) + recall + epsilon)
def prd_to_max_f_beta_pair(precision, recall, beta=8):
"""Computes max. F_beta and max. F_{1/beta} for precision/recall pairs.
Computes the maximum F_beta and maximum F_{1/beta} score over all pairs of
precision/recall values. This is useful to compress a PRD plot into a single
pair of values which correlate with precision and recall.
For precision p and recall r, the F_beta score is defined as:
F_beta = (1 + beta^2) * (p * r) / ((beta^2 * p) + r)
Args:
precision: 1D NumPy array or list of precision values in [0, 1].
recall: 1D NumPy array or list of precision values in [0, 1].
beta: Beta parameter. Must be positive. The default value is 8.
Returns:
f_beta: Maximum F_beta score.
f_beta_inv: Maximum F_{1/beta} score.
Raises:
ValueError: If beta is not positive.
"""
if not ((precision >= 0).all() and (precision <= 1).all()):
raise ValueError('All values in precision must be in [0, 1].')
if not ((recall >= 0).all() and (recall <= 1).all()):
raise ValueError('All values in recall must be in [0, 1].')
if beta <= 0:
raise ValueError('Given parameter beta %s must be positive.' % str(beta))
f_beta = np.max(_prd_to_f_beta(precision, recall, beta))
f_beta_inv = np.max(_prd_to_f_beta(precision, recall, 1/beta))
return f_beta, f_beta_inv
def _cluster_into_bins(eval_data, ref_data, num_clusters):
"""Clusters the union of the data points and returns the cluster distribution.
Clusters the union of eval_data and ref_data into num_clusters using minibatch
k-means. Then, for each cluster, it computes the number of points from
eval_data and ref_data.
Args:
eval_data: NumPy array of data points from the distribution to be evaluated.
ref_data: NumPy array of data points from the reference distribution.
num_clusters: Number of cluster centers to fit.
Returns:
Two NumPy arrays, each of size num_clusters, where i-th entry represents the
number of points assigned to the i-th cluster.
"""
cluster_data = np.vstack([eval_data, ref_data])
kmeans = sklearn.cluster.MiniBatchKMeans(n_clusters=num_clusters, n_init=10)
labels = kmeans.fit(cluster_data).labels_
eval_labels = labels[:len(eval_data)]
ref_labels = labels[len(eval_data):]
eval_bins = np.histogram(eval_labels, bins=num_clusters,
range=[0, num_clusters], density=True)[0]
ref_bins = np.histogram(ref_labels, bins=num_clusters,
range=[0, num_clusters], density=True)[0]
return eval_bins, ref_bins
def compute_prd_from_embedding(
eval_data,
ref_data,
num_clusters=20,
num_angles=1001,
num_runs=10,
enforce_balance=True
):
"""Computes PRD data from sample embeddings.
The points from both distributions are mixed and then clustered. This leads
to a pair of histograms of discrete distributions over the cluster centers
on which the PRD algorithm is executed.
The number of points in eval_data and ref_data must be equal since
unbalanced distributions bias the clustering towards the larger dataset. The
check can be disabled by setting the enforce_balance flag to False (not
recommended).
Args:
eval_data: NumPy array of data points from the distribution to be evaluated.
ref_data: NumPy array of data points from the reference distribution.
num_clusters: Number of cluster centers to fit. The default value is 20.
num_angles: Number of angles for which to compute PRD. Must be in [3, 1e6].
The default value is 1001.
num_runs: Number of independent runs over which to average the PRD data.
enforce_balance: If enabled, throws exception if eval_data and ref_data do
not have the same length. The default value is True.
Returns:
precision: NumPy array of shape [num_angles] with the precision for the
different ratios.
recall: NumPy array of shape [num_angles] with the recall for the different
ratios.
Raises:
ValueError: If len(eval_data) != len(ref_data) and enforce_balance is set to
True.
"""
if enforce_balance and len(eval_data) != len(ref_data):
raise ValueError(
'The number of points in eval_data %d is not equal to the number of '
'points in ref_data %d. To disable this exception, set enforce_balance '
'to False (not recommended).' % (len(eval_data), len(ref_data)))
eval_data = np.array(eval_data, dtype=np.float64)
ref_data = np.array(ref_data, dtype=np.float64)
precisions = []
recalls = []
for _ in range(num_runs):
eval_dist, ref_dist = _cluster_into_bins(eval_data, ref_data, num_clusters)
precision, recall = compute_prd(eval_dist, ref_dist, num_angles)
precisions.append(precision)
recalls.append(recall)
precision = np.mean(precisions, axis=0)
recall = np.mean(recalls, axis=0)
return precision, recall
# ----- details on data transformation ----- #
def read_vocab(file):
vocab = []
with open(file, 'r', encoding='utf-8') as f:
for line in f:
line = line.split()
vocab.append(line[0])
return vocab
def read_dict(file):
with open(file, 'r', encoding='utf-8') as f:
res = [line.strip() for line in f]
return res
def read_dialogue(path):
querys = []
refs = []
hyps = []
human_scores = []
with open(path, 'r', encoding='utf-8') as f:
for line in f:
line = json.loads(line)
querys.append(line['src'])
refs.append(line['refs'][0])
for i, hyp in enumerate(line['hyps']):
if len(hyps) < i + 1:
hyps.append([])
hyps[i].append(hyp)
for i, scores in enumerate(line['human_scores']):
if len(human_scores) < i + 1:
human_scores.append([])
for j, score in enumerate(scores):
if len(human_scores[i]) < j + 1:
human_scores[i].append([])
human_scores[i][j].append(score)
return querys, refs, hyps, human_scores
def transform_qa_pairs(querys, answers, transform, ratio, noise_dict, repeat_dict):
trans_answers = list(answers)
trans_indexs = random.sample(list(range(len(querys))), int(len(querys) * ratio))
if transform == 'noise':
assert noise_dict != None
vocab = read_vocab(noise_dict)
for trans_index in trans_indexs:
trans_answer = answers[trans_index].split()
num_lower = len(trans_answer) // 4
num_upper = max(len(trans_answer) // 2 + 1, len(trans_answer) // 4 + 2)
num_list = list(range(num_lower, num_upper))
num = random.choice(num_list)
for _ in range(num):
loc = random.randint(0, len(trans_answer))
word = random.choice(vocab)
trans_answer.insert(loc, word)
trans_answers[trans_index] = ' '.join(trans_answer)
elif transform == 'mismatch':
indexs = sorted(trans_indexs)
for index, trans_index in zip(indexs, trans_indexs):
trans_answers[index] = answers[trans_index]
elif transform == 'permutate':
for trans_index in trans_indexs:
trans_answer = answers[trans_index].split()
random.shuffle(trans_answer)
trans_answers[trans_index] = ' '.join(trans_answer)
elif transform == 'repeat':
assert repeat_dict != None
repeat_dict = read_dict(repeat_dict)
for trans_index in trans_indexs:
trans_answers[trans_index] = random.choice(repeat_dict)
else:
raise RuntimeError('Unknown transformation: {}'.format(transform))
return querys, trans_answers