-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathget_cv_error.py
executable file
·406 lines (384 loc) · 19.2 KB
/
get_cv_error.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
#!/usr/bin/env python
"""Compute the cross-validation error of a given model on a dataset
Copyright © 2020 Max Veit.
This code is licensed under the GPL, Version 3; see the LICENSE file for
more details.
"""
import argparse
import logging
import os
import sys
import ase.io
import numpy as np
from scipy import optimize
from velociraptor import fitutils
from velociraptor.fitutils import get_charges, get_dipoles
from velociraptor.kerneltools import make_kernel_params
from velociraptor.kerneltools import compute_residual as kt_residual
LOGGER = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
parser = argparse.ArgumentParser(
description="Compute the cross-validation (CV) error of the model "
"with specified kernel parameters on a dataset",
epilog="See 'do_fit.py' for more explanation of the fitting options.")
parser.add_argument(
'geometries', help="Geometries of the molecules in the fit; should be the "
"name of a file readable by ASE.")
parser.add_argument(
'dipoles', help="Dipoles, in Cartesian coordinates, per geometry. "
"Entries must be in the same order as the geometry file. "
"Alternatively, with -dg, read them from the geometry file itself,"
" in which case this argument is the name of the key in the "
"atoms.info dict where the dipole is stored.")
parser.add_argument(
'-dg', '--dipoles-in-geomfile', action='store_true', help="Read the "
"dipoles from the geometry file instead, from the atoms.info key "
"given by the 'dipoles' argument.")
# Kernel parameters
#TODO package these into their own sub-parser?
# Really, reorganizing this whole script into subcommands would be
# much cleaner (e.g. compute_kernels, optimize, ...)
parser.add_argument(
'-n', '--max-radial', type=int, metavar='N', help="Number of radial "
"channels for the SOAP kernel", default=8)
parser.add_argument(
'-l', '--max-angular', type=int, metavar='L', help="Maximum angular "
"momentum number for the SOAP kernel", default=6)
parser.add_argument(
'-nsf', '--num-sparse-features', type=int, metavar='N_F', help="Sparsify "
"to this many feature vector components", default=-1)
parser.add_argument(
'-nse', '--num-sparse-environments', type=int, metavar='N_E',
help="Sparsify to this many unique environments", default=-1)
parser.add_argument(
'-aw', '--atom-sigma', type=float, metavar='sigma', help="Width of the "
"atomic Gaussian smearing", default=0.3)
parser.add_argument(
'-c', '--cutoff', type=float, metavar='r_c', help="Spherical cutoff of "
"the representation", default=5.0)
parser.add_argument(
'-r0', '--radial-scaling-scale', type=float, metavar='r_0', help="Scale "
"parameter for the radial scaling function")
parser.add_argument(
'-rm', '--radial-scaling-power', type=float, metavar='m', help="Scaling "
"exponent for the radial scaling function")
parser.add_argument(
'-ws', '--scalar-weight', type=float, metavar='weight',
help="Weight of the scalar component (charges) in the model",
required=True)
parser.add_argument(
'-wt', '--vector-weight', type=float, metavar='weight',
help="Weight of the vector component (point dipoles) in the model",
required=True)
parser.add_argument(
'-rc', '--charge-regularization', type=float, default=1.0,
metavar='sigma_q', help="Regularization coefficient (sigma) "
"for total charges")
parser.add_argument(
'-rd', '--dipole-regularization', type=float, required=True,
metavar='sigma_mu', help="Regularization coefficient (sigma) "
"for dipole components")
parser.add_argument(
'-k', '--cv-num-partitions', type=int, metavar='k', help="Do k-fold cross "
"validation, i.e. split the dataset into k randomly-chosen and "
"more-or-less equal partitions and do cross validation with them",
default=4)
parser.add_argument(
'-cvf', '--cv-file', type=str, help="Use CV-indices from the given file, "
"which must be a NumPy array with 'k' sets of indices over the "
"first axis, denoting the points to be withheld as the test set. "
"If the file does not exist, it will be created as described "
"for 'k'.")
parser.add_argument(
'-orc', '--optimize-charge-reg', action='store_true', help="Optimize the "
"CV-error w.r.t. the charge regularization?")
parser.add_argument(
'-ord', '--optimize-dipole-reg', action='store_true', help="Optimize the "
"CV-error w.r.t. the dipole regularization?")
parser.add_argument(
'-owt', '--optimize-weights', action='store_true', help="Optimize the "
"CV-error w.r.t. the scalar and vector weights? (Recommended "
"use with '-orc', but _not_ '-ord' or '-opt')")
parser.add_argument(
'-opt', '--optimize-kparams', action='store_true', help="Optimize the "
"CV-error w.r.t. all of the numerical, non-integer kernel "
"parameters?")
parser.add_argument(
'-kis', '--kparams-init-simplex', help="File containing the initial "
"simplex for the Nelder-Mead optimization of kernel parameters "
"(should be a 4x3 matrix, in NumPy or text format).")
parser.add_argument(
'-kr', '--recompute-kernels', action='store_true', help="Recompute the "
"kernel, even if it is not required by an optimization (i.e. if "
"kernel hypers are not being optimized)")
parser.add_argument(
'-nt', '--num-training-geometries', type=int, nargs='*', metavar='<n>',
help="Keep only the first <n> geometries for training. "
"Specify multiple values to do a learning curve.")
parser.add_argument(
'-pr', '--print-residuals', action='store_true', help="Print the RMSE "
"residuals of the model evaluated on its own training data")
parser.add_argument(
'-wr', '--write-residuals', action='store_true', help="Write the "
"individual (non-RMSed) residuals to a file called "
"'cv_<n>_residuals.npz' in the working directory.")
parser.add_argument(
'-wd', '--working-directory', metavar='DIR', help="Working directory for "
"power spectrum and kernel computations (geometry and dipole files"
" are interpreted relative to this directory if paths are not "
"otherwise specified", default='.')
parser.add_argument(
'-p', '--n-processes', metavar='N', type=int, help="Number of parallel "
"processes to use for computing the power spectrum (default is to "
"let slurm decide)")
def make_cv_sets(n_geoms, cv_num_partitions):
idces_perm = np.random.permutation(n_geoms)
idces_split = np.array_split(idces_perm, cv_num_partitions)
return [np.sort(idces_set) for idces_set in idces_split]
def load_detect_matrix(filename):
fext = os.path.splitext(filename)[1]
if (fext == '.npy') or (fext == '.npz'):
matrix = np.load(filename)
elif (fext == '.txt') or (fext == '.dat'):
matrix = np.loadtxt(filename)
else:
logger.warning("Matrix file {:s} has unrecognized or missing filename "
"extension; assuming plain text.".format(filename))
matrix = np.loadtxt(filename)
return matrix
def optimize_hypers(kparams_initial, dipole_reg_initial, charge_reg_initial,
scalar_weight, vector_weight,
optimize_kparams, optimize_dreg, optimize_creg,
optimize_weights,
geometries, dipoles, workdir, cv_idces_sets,
dipole_normalize=True, init_simplex_file=None,
penalize_small_atomwidth=True):
if cv_idces_sets is None:
LOGGER.error("Requested optimization with no cross-validation. "
"This is almost certainly a bad idea.")
if optimize_kparams:
# For vector-only kernels, the charge regularization is ignored --
# so don't try to optimize it!
if scalar_weight == 0:
optimize_dreg = True
optimize_creg = False
else:
optimize_dreg = True
optimize_creg = True
if optimize_weights:
if optimize_kparams:
LOGGER.error("Optimizing scalar/vector weights together with the "
"kernel hypers is not implemented and certainly not "
"recommended. ")
sys.exit(1)
if optimize_dreg:
LOGGER.warning("Dipole regularizer should not be optimized "
"together with scalar and vector weights, as there "
"are only two degrees of freedom between the three "
"parameters. Continuing with only the scalar/"
"vector weights to be optimized.")
optimize_dreg = False
# Note: Optimizing charge reg also is recommended but not enforced
# Optimizing kernel params implies optimizing regularizers
#TODO redo this without weird functional closures and shadowing
def optimize_regularizers(dipole_reg_initial, charge_reg_initial):
def objective_no_recompute(dipole_reg_log, charge_reg_log):
return kt_residual(10**dipole_reg_log, 10**charge_reg_log,
scalar_weight, vector_weight,
workdir, geometries, dipoles,
dipole_normalize, True, False, None,
cv_idces_sets, write_results=False,
print_results=False)
final_reg = np.array([dipole_reg_initial, charge_reg_initial])
if not optimize_creg:
charge_reg_log = np.log10(charge_reg_initial)
result_1d = lambda x: objective_no_recompute(x, charge_reg_log)
opt_result = optimize.minimize_scalar(result_1d)
final_reg[0] = 10**opt_result.x
elif not optimize_dreg:
dipole_reg_log = np.log10(dipole_reg_initial)
result_1d = lambda x: objective_no_recompute(dipole_reg_log, x)
opt_result = optimize.minimize_scalar(result_1d)
final_reg[1] = 10**opt_result.x
else:
result_2d = lambda x: objective_no_recompute(*x)
opt_result = optimize.minimize(
result_2d, (np.log10(dipole_reg_initial),
np.log10(charge_reg_initial)),
method='Nelder-Mead',
options=dict(maxiter=100, xatol=1e-2))
final_reg = 10**opt_result.x
print(opt_result)
print("Final regularizer: " + np.array_str(final_reg, precision=6))
return final_reg, opt_result.fun
def optimize_scalar_vector_weights(scalar_weight_initial,
vector_weight_initial):
def objective_no_recompute(weights_log):
scalar_weight, vector_weight, charge_reg = 10**weights_log
return kt_residual(dipole_reg_initial, charge_reg,
scalar_weight, vector_weight,
workdir, geometries, dipoles,
dipole_normalize, True, False, None,
cv_idces_sets, write_results=False,
print_results=False)
if not optimize_creg:
objective = lambda x: objective_no_recompute(
np.concatenate((x, np.log10(charge_reg_initial))))
initial_weights = np.array([scalar_weight_initial,
vector_weight_initial])
else:
objective = objective_no_recompute
initial_weights = np.array([scalar_weight_initial,
vector_weight_initial,
charge_reg_initial])
opt_result = optimize.minimize(
objective_no_recompute, np.log10(initial_weights),
method='Nelder-Mead', options=dict(maxiter=100, xatol=1e-2))
final_weights = 10**opt_result.x
print(opt_result)
if optimize_creg:
message = "Final {scalar, vector} weights and charge reg: "
else:
message = "Final {scalar, vector} weights: "
print(message + np.array_str(final_weights, precision=6))
return final_weights, opt_result.fun
final_rel_weights = np.array([scalar_weight, vector_weight])
if optimize_kparams:
dipole_reg_last = dipole_reg_initial
charge_reg_last = charge_reg_initial
def objective(test_params):
LOGGER.info("Trying params: " + str(test_params))
(atom_width, rad_r0, rad_m) = test_params
if penalize_small_atomwidth and (atom_width < 0.2):
return 10000 # hackish, but it should work
kparams = dict(kparams_initial)
kparams['atom_width'] = atom_width
kparams['rad_r0'] = rad_r0
kparams['rad_m'] = rad_m
nonlocal dipole_reg_last
nonlocal charge_reg_last
resid_first = kt_residual(
dipole_reg_last, charge_reg_last, scalar_weight,
vector_weight, workdir, geometries, dipoles,
dipole_normalize, True, True, kparams, cv_idces_sets,
write_results=False, print_results=False)
final_reg, result = optimize_regularizers(
dipole_reg_last, charge_reg_last)
dipole_reg_last, charge_reg_last = final_reg
return result
final_reg = np.array([dipole_reg_initial, charge_reg_initial])
initial_params = np.array((kparams_initial['atom_width'],
kparams_initial['rad_r0'],
kparams_initial['rad_m']))
optimizer_options = dict(maxiter=100, xatol=1e-2)
if init_simplex_file is not None:
optimizer_options['initial_simplex'] = load_detect_matrix(
init_simplex_file)
opt_result = optimize.minimize(
objective, initial_params, method='Nelder-Mead',
options=optimizer_options)
print(opt_result)
final_params = opt_result.x
final_reg = np.array([dipole_reg_last, charge_reg_last])
print("Final kernel parameters: " + np.array_str(final_params,
precision=6))
print("Final (final) regularizer: " + np.array_str(final_reg,
precision=6))
kparams_initial['atom_width'] = final_params[0]
kparams_initial['rad_r0'] = final_params[1]
kparams_initial['rad_m'] = final_params[2]
elif optimize_weights:
final_weights, fval = optimize_scalar_vector_weights(scalar_weight,
vector_weight)
final_rel_weights = final_weights[:2]
final_reg = np.array([dipole_reg_initial, charge_reg_initial])
if optimize_creg:
final_reg[1] = final_weights[2]
elif optimize_dreg or optimize_creg:
final_reg, fval = optimize_regularizers(
dipole_reg_initial, charge_reg_initial)
else:
LOGGER.error("No optimization requested.")
return None
return kparams_initial, final_reg, final_rel_weights
if __name__ == "__main__":
args = parser.parse_args()
if args.n_processes is not None:
kerneltools.NCPUS = args.n_processes
if not os.path.dirname(args.geometries):
geom_filename = os.path.join(args.working_directory, args.geometries)
else:
geom_filename = args.geometries
del args.geometries
if args.num_training_geometries:
if len(args.num_training_geometries) > 1:
raise NotImplementedError("Learning curves not yet implemented")
else:
n_train = args.num_training_geometries[0]
geometries = ase.io.read(geom_filename, slice(0, n_train))
fname_split = os.path.splitext(geom_filename)
geom_filename = "{:s}_nt{:d}{:s}".format(
fname_split[0], n_train, fname_split[1])
ase.io.write(geom_filename, geometries)
else:
geometries = ase.io.read(geom_filename, slice(None))
n_train = len(geometries)
charges = get_charges(geometries)
if args.dipoles_in_geomfile:
dipoles = get_dipoles(geometries, args.dipoles)
else:
if not os.path.dirname(args.dipoles):
dipole_filename = os.path.join(
args.working_directory, args.dipoles)
else:
dipole_filename = args.dipoles
dipoles = load_detect_matrix(dipole_filename)[:n_train]
natoms_list = [len(geom) for geom in geometries]
dipole_normalize = True # seems to be the best option
if args.cv_file is not None:
try:
cv_idces_sets = np.load(args.cv_file)
if len(cv_idces_sets) != args.cv_num_partitions:
LOGGER.warning(
"Different number of CV-partitions specified in the file "
"({:d}) and on the command line ({:d}); using the file "
"setting", len(cv_idces_sets), args.cv_num_partitions)
except FileNotFoundError:
LOGGER.info("CV-file not found, creating new CV-partition...")
cv_idces_sets = make_cv_sets(n_train, args.cv_num_partitions)
np.save(args.cv_file, cv_idces_sets)
else:
cv_idces_sets = make_cv_sets(n_train, args.cv_num_partitions)
kparams = make_kernel_params(
args.max_radial, args.max_angular, args.atom_sigma,
args.radial_scaling_scale, args.radial_scaling_power, geom_filename,
None, args.num_sparse_environments, args.num_sparse_features)
result = kt_residual(
args.dipole_regularization, args.charge_regularization,
args.scalar_weight, args.vector_weight, args.working_directory,
geometries, dipoles,
dipole_normalize, True, args.recompute_kernels, kparams, cv_idces_sets,
args.write_residuals) # whew!
if args.print_residuals:
print("CV-error: {:.6f} a.u. per atom".format(result))
if (args.optimize_charge_reg or args.optimize_dipole_reg
or args.optimize_kparams or args.optimize_weights):
opt_results = optimize_hypers(
kparams, args.dipole_regularization,
args.charge_regularization,
args.scalar_weight, args.vector_weight,
args.optimize_kparams, args.optimize_dipole_reg,
args.optimize_charge_reg, args.optimize_weights,
geometries, dipoles, args.working_directory, cv_idces_sets,
dipole_normalize, args.kparams_init_simplex)
kparams_final, final_reg, final_rel_weights = opt_results
# probably don't need to recompute kernel, since it'll be the last one
# computed
result = kt_residual(
final_reg[0], final_reg[1],
final_rel_weights[0], final_rel_weights[1], args.working_directory,
geometries, dipoles,
dipole_normalize, True, False, None, cv_idces_sets,
args.write_residuals)
if args.print_residuals:
print("Final CV-error: {:.6f} a.u. per atom".format(result))