-
Notifications
You must be signed in to change notification settings - Fork 1.1k
/
sgd.py
1139 lines (1004 loc) · 45.7 KB
/
sgd.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
"""
Copy of pylearn2's sgd.py, hacked to support doing steps on
discriminator separately from the generator. Ideally this would
be accomplished using pylearn2's FixedVarDescr implementation,
but it is currently not very well supported.
"""
from __future__ import division
__authors__ = "Ian Goodfellow"
__copyright__ = "Copyright 2010-2012, Universite de Montreal"
__credits__ = ["Ian Goodfellow, David Warde-Farley"]
__license__ = "3-clause BSD"
__maintainer__ = "David Warde-Farley"
__email__ = "pylearn-dev@googlegroups"
import logging
import warnings
import numpy as np
from theano import config
from theano import function
from theano.compat.python2x import OrderedDict
from theano.gof.op import get_debug_values
from pylearn2.monitor import Monitor
from pylearn2.space import CompositeSpace, NullSpace
from pylearn2.train_extensions import TrainExtension
from pylearn2.training_algorithms.training_algorithm import TrainingAlgorithm
from pylearn2.training_algorithms.learning_rule import Momentum
from pylearn2.training_algorithms.learning_rule import MomentumAdjustor \
as LRMomentumAdjustor
from pylearn2.utils.iteration import is_stochastic, has_uniform_batch_size
from pylearn2.utils import py_integer_types, py_float_types
from pylearn2.utils import safe_zip
from pylearn2.utils import serial
from pylearn2.utils import sharedX
from pylearn2.utils.data_specs import DataSpecsMapping
from pylearn2.utils.timing import log_timing
from pylearn2.utils.rng import make_np_rng
log = logging.getLogger(__name__)
class SGD(TrainingAlgorithm):
"""
SGD = (Minibatch) Stochastic Gradient Descent.
A TrainingAlgorithm that does stochastic gradient descent on minibatches
of training examples.
For theoretical background on this algorithm, see Yoshua Bengio's machine
learning course notes on the subject:
http://www.iro.umontreal.ca/~pift6266/H10/notes/gradient.html
Parameters
----------
learning_rate : float
The learning rate to use. Train object callbacks can change the
learning rate after each epoch. SGD update_callbacks can change
it after each minibatch.
cost : pylearn2.costs.cost.Cost, optional
Cost object specifying the objective function to be minimized.
Optionally, may be None. In this case, SGD will call the model's
get_default_cost method to obtain the objective function.
batch_size : int, optional
The size of the batch to be used.
If not specified, the model will be asked for the batch size, so
you must have specified the batch size there.
(Some models are rigidly defined to only work with one batch size)
monitoring_batch_size : int, optional
The size of the monitoring batches.
monitoring_batches : int, optional
At the start of each epoch, we run "monitoring", to evaluate
quantities such as the validation set error.
monitoring_batches, if specified, determines the number of batches
to draw from the iterator for each monitoring dataset.
Unnecessary if not using monitoring or if `monitor_iteration_mode`
is 'sequential' and `batch_size` is specified (number of
batches will be calculated based on full dataset size).
TODO: make it possible to specify different monitoring_batches
for each monitoring dataset. The Monitor itself already supports
this.
monitoring_dataset : Dataset or dictionary, optional
If not specified, no monitoring is used.
If specified to be a Dataset, monitor on that Dataset.
If specified to be dictionary, the keys should be string names
of datasets, and the values should be Datasets. All monitoring
channels will be computed for all monitoring Datasets and will
have the dataset name and an underscore prepended to them.
monitor_iteration_mode : str, optional
The iteration mode used to iterate over the examples in all
monitoring datasets. If not specified, defaults to 'sequential'.
TODO: make it possible to specify different modes for different
datasets.
termination_criterion : instance of \
pylearn2.termination_criteria.TerminationCriterion, optional
Used to determine when the algorithm should stop running.
If not specified, runs forever--or more realistically, until
external factors halt the python process (Kansas 1977).
update_callbacks : list, optional
If specified, each member of the list should be a callable that
accepts an SGD instance as its only argument.
All callbacks will be called with this SGD instance after each
SGD step.
learning_rule : training_algorithms.learning_rule.LearningRule, optional
A learning rule computes the new parameter values given old
parameters and first-order gradients. If learning_rule is None,
sgd.SGD will update parameters according to the standard SGD
learning rule:
.. code-block:: none
param := param - learning_rate * d cost / d param
This argument allows more sophisticated learning rules, such
as SGD with momentum.
init_momentum : float, **DEPRECATED** option
Use learning_rule instead.
If None, does not use momentum otherwise, use momentum and
initialize the momentum coefficient to init_momentum. Callbacks
can change this over time just like the learning rate. If the
gradient is the same on every step, then the update taken by the
SGD algorithm is scaled by a factor of 1/(1-momentum). See
section 9 of Geoffrey Hinton's "A Practical Guide to Training
Restricted Boltzmann Machines" for details.
set_batch_size : bool, optional
Defaults to False.
If True, and batch_size conflicts with model.force_batch_size,
will call model.set_batch_size(batch_size) in an attempt to
change model.force_batch_size
train_iteration_mode : str, optional
Defaults to 'shuffled_sequential'.
The iteration mode to use for iterating through training examples.
batches_per_iter : int, optional
The number of batches to draw from the iterator over training
examples.
If iteration mode is 'sequential' or 'shuffled_sequential', this
is unnecessary; when unspecified we will iterate over all examples.
theano_function_mode : a valid argument to theano.function's \
'mode' parameter, optional
The theano mode to compile the updates function with. Note that
pylearn2 includes some wraplinker modes that are not bundled with
theano. See pylearn2.devtools. These extra modes let you do
things like check for NaNs at every step, or record md5 digests
of all computations performed by the update function to help
isolate problems with nondeterminism.
monitoring_costs : list, optional
a list of Cost instances. The Monitor will also include all
channels defined by these Costs, even though we don't train
using them.
seed : valid argument to np.random.RandomState, optional
The seed used for the random number generate to be passed to the
training dataset iterator (if any)
"""
def __init__(self, learning_rate, cost=None, batch_size=None,
monitoring_batch_size=None, monitoring_batches=None,
monitoring_dataset=None, monitor_iteration_mode='sequential',
termination_criterion=None, update_callbacks=None,
learning_rule = None, init_momentum = None,
set_batch_size = False,
train_iteration_mode = None, batches_per_iter=None,
theano_function_mode = None, monitoring_costs=None,
seed=[2012, 10, 5], discriminator_steps=1):
self.discriminator_steps = discriminator_steps
if isinstance(cost, (list, tuple, set)):
raise TypeError("SGD no longer supports using collections of " +
"Costs to represent a sum of Costs. Use " +
"pylearn2.costs.cost.SumOfCosts instead.")
if init_momentum:
warnings.warn("init_momentum interface is deprecated and will "
"become officially unsuported as of May 9, 2014. Please use the "
"`learning_rule` parameter instead, providing an object of type "
"`pylearn2.training_algorithms.learning_rule.Momentum` instead")
# Convert to new interface under the hood.
self.learning_rule = Momentum(init_momentum)
else:
self.learning_rule = learning_rule
self.learning_rate = sharedX(learning_rate, 'learning_rate')
self.cost = cost
self.batch_size = batch_size
self.set_batch_size = set_batch_size
self.batches_per_iter = batches_per_iter
self._set_monitoring_dataset(monitoring_dataset)
self.monitoring_batch_size = monitoring_batch_size
self.monitoring_batches = monitoring_batches
self.monitor_iteration_mode = monitor_iteration_mode
if monitoring_dataset is None:
if monitoring_batch_size is not None:
raise ValueError("Specified a monitoring batch size " +
"but not a monitoring dataset.")
if monitoring_batches is not None:
raise ValueError("Specified an amount of monitoring batches " +
"but not a monitoring dataset.")
self.termination_criterion = termination_criterion
self._register_update_callbacks(update_callbacks)
if train_iteration_mode is None:
train_iteration_mode = 'shuffled_sequential'
self.train_iteration_mode = train_iteration_mode
self.first = True
self.rng = make_np_rng(seed, which_method=["randn","randint"])
self.theano_function_mode = theano_function_mode
self.monitoring_costs = monitoring_costs
def setup(self, model, dataset):
"""
Compiles the theano functions needed for the train method.
Parameters
----------
model : a Model instance
dataset : Dataset
"""
self.i = 0
if self.cost is None:
self.cost = model.get_default_cost()
inf_params = [param for param in model.get_params()
if np.any(np.isinf(param.get_value()))]
if len(inf_params) > 0:
raise ValueError("These params are Inf: "+str(inf_params))
if any([np.any(np.isnan(param.get_value()))
for param in model.get_params()]):
nan_params = [param for param in model.get_params()
if np.any(np.isnan(param.get_value()))]
raise ValueError("These params are NaN: "+str(nan_params))
self.model = model
self._synchronize_batch_size(model)
model._test_batch_size = self.batch_size
self.monitor = Monitor.get_monitor(model)
self.monitor._sanity_check()
# test if force batch size and batch size
if getattr(model, "force_batch_size", False) and \
any(dataset.get_design_matrix().shape[0] % self.batch_size != 0 for
dataset in self.monitoring_dataset.values()) and \
not has_uniform_batch_size(self.monitor_iteration_mode):
raise ValueError("Dataset size is not a multiple of batch size."
"You should set monitor_iteration_mode to "
"even_sequential, even_shuffled_sequential or "
"even_batchwise_shuffled_sequential")
data_specs = self.cost.get_data_specs(self.model)
mapping = DataSpecsMapping(data_specs)
space_tuple = mapping.flatten(data_specs[0], return_tuple=True)
source_tuple = mapping.flatten(data_specs[1], return_tuple=True)
# Build a flat tuple of Theano Variables, one for each space.
# We want that so that if the same space/source is specified
# more than once in data_specs, only one Theano Variable
# is generated for it, and the corresponding value is passed
# only once to the compiled Theano function.
theano_args = []
for space, source in safe_zip(space_tuple, source_tuple):
name = '%s[%s]' % (self.__class__.__name__, source)
arg = space.make_theano_batch(name=name,
batch_size=self.batch_size)
theano_args.append(arg)
theano_args = tuple(theano_args)
# Methods of `self.cost` need args to be passed in a format compatible
# with data_specs
nested_args = mapping.nest(theano_args)
fixed_var_descr = self.cost.get_fixed_var_descr(model, nested_args)
self.on_load_batch = fixed_var_descr.on_load_batch
cost_value = self.cost.expr(model, nested_args,
** fixed_var_descr.fixed_vars)
if cost_value is not None and cost_value.name is None:
# Concatenate the name of all tensors in theano_args !?
cost_value.name = 'objective'
# Set up monitor to model the objective value, learning rate,
# momentum (if applicable), and extra channels defined by
# the cost
learning_rate = self.learning_rate
if self.monitoring_dataset is not None:
if (self.monitoring_batch_size is None and
self.monitoring_batches is None):
self.monitoring_batch_size = self.batch_size
self.monitoring_batches = self.batches_per_iter
self.monitor.setup(dataset=self.monitoring_dataset,
cost=self.cost,
batch_size=self.monitoring_batch_size,
num_batches=self.monitoring_batches,
extra_costs=self.monitoring_costs,
mode=self.monitor_iteration_mode)
dataset_name = self.monitoring_dataset.keys()[0]
monitoring_dataset = self.monitoring_dataset[dataset_name]
#TODO: have Monitor support non-data-dependent channels
self.monitor.add_channel(name='learning_rate',
ipt=None,
val=learning_rate,
data_specs=(NullSpace(), ''),
dataset=monitoring_dataset)
if self.learning_rule:
self.learning_rule.add_channels_to_monitor(
self.monitor,
monitoring_dataset)
params = list(model.get_params())
assert len(params) > 0
for i, param in enumerate(params):
if param.name is None:
param.name = 'sgd_params[%d]' % i
self.params = params
grads, updates = self.cost.get_gradients(model, nested_args,
** fixed_var_descr.fixed_vars)
if not isinstance(grads, OrderedDict):
raise TypeError(str(type(self.cost)) + ".get_gradients returned " +
"something with" + str(type(grads)) + "as its " +
"first member. Expected OrderedDict.")
for param in grads:
assert param in params
for param in params:
assert param in grads
lr_scalers = model.get_lr_scalers()
for key in lr_scalers:
if key not in params:
raise ValueError("Tried to scale the learning rate on " +\
str(key)+" which is not an optimization parameter.")
assert len(updates.keys()) == 0
def get_func(learn_discriminator, learn_generator):
updates = OrderedDict()
assert (learn_discriminator or learn_generator) and not (learn_discriminator and learn_generator)
if learn_discriminator:
cur_params = model.discriminator.get_params()
else:
cur_params = model.generator.get_params()
cur_grads = OrderedDict()
for param in cur_params:
cur_grads[param] = grads[param]
for param in grads:
if grads[param].name is None and cost_value is not None:
grads[param].name = ('grad(%(costname)s, %(paramname)s)' %
{'costname': cost_value.name,
'paramname': param.name})
assert grads[param].dtype == param.dtype
cur_lr_scalers = OrderedDict()
for param in cur_params:
if param in lr_scalers:
lr_scaler = lr_scalers[param]
cur_lr_scalers[param] = lr_scaler
log.info('Parameter and initial learning rate summary:')
for param in cur_params:
param_name = param.name
if param_name is None:
param_name = 'anon_param'
lr = learning_rate.get_value() * cur_lr_scalers.get(param,1.)
log.info('\t' + param_name + ': ' + str(lr))
if self.learning_rule:
updates.update(self.learning_rule.get_updates(
learning_rate, cur_grads, cur_lr_scalers))
else:
# Use standard SGD updates with fixed learning rate.
updates.update( dict(safe_zip(params, [param - learning_rate * \
lr_scalers.get(param, 1.) * grads[param]
for param in params])))
for param in cur_params:
if updates[param].name is None:
updates[param].name = 'sgd_update(' + param.name + ')'
model.modify_updates(updates)
for param in cur_params:
update = updates[param]
if update.name is None:
update.name = 'censor(sgd_update(' + param.name + '))'
for update_val in get_debug_values(update):
if np.any(np.isinf(update_val)):
raise ValueError("debug value of %s contains infs" %
update.name)
if np.any(np.isnan(update_val)):
raise ValueError("debug value of %s contains nans" %
update.name)
with log_timing(log, 'Compiling sgd_update'):
return function(theano_args,
updates=updates,
name='sgd_update',
on_unused_input='ignore',
mode=self.theano_function_mode)
self.d_func = get_func(1, 0)
self.g_func = get_func(0, 1)
def train(self, dataset):
"""
Runs one epoch of SGD training on the specified dataset.
Parameters
----------
dataset : Dataset
"""
if not hasattr(self, 'd_func'):
raise Exception("train called without first calling setup")
# Make sure none of the parameters have bad values
for param in self.params:
value = param.get_value(borrow=True)
if np.any(np.isnan(value)) or np.any(np.isinf(value)):
raise Exception("NaN in " + param.name)
self.first = False
rng = self.rng
if not is_stochastic(self.train_iteration_mode):
rng = None
data_specs = self.cost.get_data_specs(self.model)
# The iterator should be built from flat data specs, so it returns
# flat, non-redundent tuples of data.
mapping = DataSpecsMapping(data_specs)
space_tuple = mapping.flatten(data_specs[0], return_tuple=True)
source_tuple = mapping.flatten(data_specs[1], return_tuple=True)
if len(space_tuple) == 0:
# No data will be returned by the iterator, and it is impossible
# to know the size of the actual batch.
# It is not decided yet what the right thing to do should be.
raise NotImplementedError("Unable to train with SGD, because "
"the cost does not actually use data from the data set. "
"data_specs: %s" % str(data_specs))
flat_data_specs = (CompositeSpace(space_tuple), source_tuple)
iterator = dataset.iterator(mode=self.train_iteration_mode,
batch_size=self.batch_size,
data_specs=flat_data_specs, return_tuple=True,
rng = rng, num_batches = self.batches_per_iter)
on_load_batch = self.on_load_batch
i = self.i
for batch in iterator:
for callback in on_load_batch:
callback(*batch)
self.d_func(*batch)
i += 1
if i == self.discriminator_steps:
# Generator doesn't actually use the data so we want to
# re-use this batch. Could save memory by making the
# code not expect data in the interface.
self.g_func(*batch)
i = 0
# iterator might return a smaller batch if dataset size
# isn't divisible by batch_size
# Note: if data_specs[0] is a NullSpace, there is no way to know
# how many examples would actually have been in the batch,
# since it was empty, so actual_batch_size would be reported as 0.
actual_batch_size = flat_data_specs[0].np_batch_size(batch)
self.monitor.report_batch(actual_batch_size)
for callback in self.update_callbacks:
callback(self)
# Make sure none of the parameters have bad values
for param in self.params:
value = param.get_value(borrow=True)
if np.any(np.isnan(value)) or np.any(np.isinf(value)):
raise Exception("NaN in " + param.name)
self.i = i
def continue_learning(self, model):
"""
Returns True if the algorithm should continue running, or False
if it has reached convergence / started overfitting and should
stop.
Parameters
----------
model : a Model instance
"""
if self.termination_criterion is None:
return True
else:
return self.termination_criterion.continue_learning(self.model)
class MonitorBasedLRAdjuster(TrainExtension):
"""
A TrainExtension that uses the on_monitor callback to adjust
the learning rate on each epoch. It pulls out a channel
from the model's monitor and adjusts the learning rate
based on what happened to the monitoring channel on the last
epoch. If the channel is greater than high_trigger times
its previous value, the learning rate will be scaled by
shrink_amt (which should be < 1 for this scheme to make
sense). The idea is that in this case the learning algorithm
is overshooting the bottom of the objective function.
If the objective is less than high_trigger but
greater than low_trigger times its previous value, the
learning rate will be scaled by grow_amt (which should be > 1
for this scheme to make sense). The idea is that the learning
algorithm is making progress but at too slow of a rate.
Parameters
----------
high_trigger : float, optional
See class-level docstring
low_trigger : float, optional
See class-level docstring
grow_amt : float, optional
See class-level docstring
min_lr : float, optional
All updates to the learning rate are clipped to be at least
this value.
max_lr : float, optional
All updates to the learning rate are clipped to be at most
this value.
dataset_name : str, optional
If specified, use dataset_name + "_objective" as the channel
to guide the learning rate adaptation.
channel_name : str, optional
If specified, use channel_name as the channel to guide the
learning rate adaptation. Conflicts with dataset_name.
If neither dataset_name nor channel_name is specified, uses
"objective"
"""
def __init__(self, high_trigger=1., shrink_amt=.99,
low_trigger=.99, grow_amt=1.01,
min_lr = 1e-7, max_lr = 1.,
dataset_name=None, channel_name=None):
self.high_trigger = high_trigger
self.shrink_amt = shrink_amt
self.low_trigger = low_trigger
self.grow_amt = grow_amt
self.min_lr = min_lr
self.max_lr = max_lr
self.dataset_name = None
if channel_name is not None:
self.channel_name = channel_name
else:
if dataset_name is not None:
self.channel_name = dataset_name + '_objective'
self.dataset_name = dataset_name
else:
self.channel_name = None
def on_monitor(self, model, dataset, algorithm):
"""
Adjusts the learning rate based on the contents of model.monitor
Parameters
----------
model : a Model instance
dataset : Dataset
algorithm : WRITEME
"""
model = algorithm.model
lr = algorithm.learning_rate
current_learning_rate = lr.get_value()
assert hasattr(model, 'monitor'), ("no monitor associated with "
+ str(model))
monitor = model.monitor
monitor_channel_specified = True
if self.channel_name is None:
monitor_channel_specified = False
channels = [elem for elem in monitor.channels
if elem.endswith("objective")]
if len(channels) < 1:
raise ValueError("There are no monitoring channels that end "
"with \"objective\". Please specify either "
"channel_name or dataset_name.")
elif len(channels) > 1:
datasets = algorithm.monitoring_dataset.keys()
raise ValueError("There are multiple monitoring channels that"
"end with \"_objective\". The list of available "
"datasets are: " +
str(datasets) + " . Please specify either "
"channel_name or dataset_name in the "
"MonitorBasedLRAdjuster constructor to "
'disambiguate.')
else:
self.channel_name = channels[0]
warnings.warn('The channel that has been chosen for '
'monitoring is: ' +
str(self.channel_name) + '.')
try:
v = monitor.channels[self.channel_name].val_record
except KeyError:
err_input = ''
if monitor_channel_specified:
if self.dataset_name:
err_input = 'The dataset_name \'' + str(
self.dataset_name) + '\' is not valid.'
else:
err_input = 'The channel_name \'' + str(
self.channel_name) + '\' is not valid.'
err_message = 'There is no monitoring channel named \'' + \
str(self.channel_name) + '\'. You probably need to ' + \
'specify a valid monitoring channel by using either ' + \
'dataset_name or channel_name in the ' + \
'MonitorBasedLRAdjuster constructor. ' + err_input
raise ValueError(err_message)
if len(v) < 1:
if monitor.dataset is None:
assert len(v) == 0
raise ValueError("You're trying to use a monitor-based "
"learning rate adjustor but the monitor has no "
"entries because you didn't specify a "
"monitoring dataset.")
raise ValueError("For some reason there are no monitor entries"
"yet the MonitorBasedLRAdjuster has been "
"called. This should never happen. The Train"
" object should call the monitor once on "
"initialization, then call the callbacks. "
"It seems you are either calling the "
"callback manually rather than as part of a "
"training algorithm, or there is a problem "
"with the Train object.")
if len(v) == 1:
#only the initial monitoring has happened
#no learning has happened, so we can't adjust the learning rate yet
#just do nothing
return
rval = current_learning_rate
log.info("monitoring channel is {0}".format(self.channel_name))
if v[-1] > self.high_trigger * v[-2]:
rval *= self.shrink_amt
log.info("shrinking learning rate to %f" % rval)
elif v[-1] > self.low_trigger * v[-2]:
rval *= self.grow_amt
log.info("growing learning rate to %f" % rval)
rval = max(self.min_lr, rval)
rval = min(self.max_lr, rval)
lr.set_value(np.cast[lr.dtype](rval))
class PatienceBasedTermCrit(object):
"""
A monitor-based termination criterion using a geometrically increasing
amount of patience. If the selected channel has decreased by a certain
proportion when comparing to the lowest value seen yet, the patience is
set to a factor of the number of examples seen, which by default
(patience_increase=2.) ensures the model has seen as many examples as the
number of examples that lead to the lowest value before concluding a local
optima has been reached.
Note: Technically, the patience corresponds to a number of epochs to be
independent of the size of the dataset, so be aware of that when choosing
initial_patience.
Parameters
----------
prop_decrease : float
The factor X in the (1 - X) * best_value threshold
initial_patience : int
Minimal number of epochs the model has to run before it can stop
patience_increase : float, optional
The factor X in the patience = X * n_iter update.
channel_name : string, optional
Name of the channel to examine. If None and the monitor
has only one channel, this channel will be used; otherwise, an
error will be raised.
"""
def __init__(self, prop_decrease, initial_patience,
patience_increase=2., channel_name=None):
self._channel_name = channel_name
self.prop_decrease = prop_decrease
self.patience = initial_patience
self.best_value = np.inf
self.patience_increase = patience_increase
def __call__(self, model):
"""
Returns True or False depending on whether the optimization should
stop or not. The optimization should stop if it has run for a number
of epochs superior to the patience without any improvement.
Parameters
----------
model : Model
The model used in the experiment and from which the monitor used
in the termination criterion will be extracted.
Returns
-------
bool
True or False, indicating if the optimization should stop or not.
"""
monitor = model.monitor
# In the case the monitor has only one channel, the channel_name can
# be omitted and the criterion will examine the only channel
# available. However, if the monitor has multiple channels, leaving
# the channel_name unspecified will raise an error.
if self._channel_name is None:
if len(monitor.channels) != 1:
raise ValueError("Only single-channel monitors are supported "
"for channel_name == None")
v = monitor.channels.values()[0].val_record
else:
v = monitor.channels[self._channel_name].val_record
# If the channel value decrease is higher than the threshold, we
# update the best value to this value and we update the patience.
if v[-1] < self.best_value * (1. - self.prop_decrease):
# Using the max between actual patience and updated patience
# ensures that the model will run for at least the initial
# patience and that it would behave correctly if the user
# chooses a dumb value (i.e. less than 1)
self.patience = max(self.patience, len(v) * self.patience_increase)
self.best_value = v[-1]
return len(v) < self.patience
class AnnealedLearningRate(object):
"""
This is a callback for the SGD algorithm rather than the Train object.
This anneals the learning rate to decrease as 1/t where t is the number
of gradient descent updates done so far. Use OneOverEpoch as Train object
callback if you would prefer 1/t where t is epochs.
Parameters
----------
anneal_start : int
The epoch on which to begin annealing
"""
def __init__(self, anneal_start):
self._initialized = False
self._count = 0
self._anneal_start = anneal_start
def __call__(self, algorithm):
"""
Updates the learning rate according to the annealing schedule.
Parameters
----------
algorithm : WRITEME
"""
if not self._initialized:
self._base = algorithm.learning_rate.get_value()
self._count += 1
algorithm.learning_rate.set_value(self.current_learning_rate())
def current_learning_rate(self):
"""
Returns the current desired learning rate according to the
annealing schedule.
"""
return self._base * min(1, self._anneal_start / self._count)
class ExponentialDecay(object):
"""
This is a callback for the `SGD` algorithm rather than the `Train` object.
This anneals the learning rate by dividing by decay_factor after each
gradient descent step. It will not shrink the learning rate beyond
`min_lr`.
Parameters
----------
decay_factor : float
The learning rate at step t is given by
`init_learning_rate / (decay_factor ** t)`
min_lr : float
The learning rate will be clipped to be at least this value
"""
def __init__(self, decay_factor, min_lr):
if isinstance(decay_factor, str):
decay_factor = float(decay_factor)
if isinstance(min_lr, str):
min_lr = float(min_lr)
assert isinstance(decay_factor, float)
assert isinstance(min_lr, float)
self.__dict__.update(locals())
del self.self
self._count = 0
self._min_reached = False
def __call__(self, algorithm):
"""
Updates the learning rate according to the exponential decay schedule.
Parameters
----------
algorithm : SGD
The SGD instance whose `learning_rate` field should be modified.
"""
if self._count == 0:
self._base_lr = algorithm.learning_rate.get_value()
self._count += 1
if not self._min_reached:
# If we keep on executing the exponentiation on each mini-batch,
# we will eventually get an OverflowError. So make sure we
# only do the computation until min_lr is reached.
new_lr = self._base_lr / (self.decay_factor ** self._count)
if new_lr <= self.min_lr:
self._min_reached = True
new_lr = self.min_lr
else:
new_lr = self.min_lr
new_lr = np.cast[config.floatX](new_lr)
algorithm.learning_rate.set_value(new_lr)
class LinearDecay(object):
"""
This is a callback for the SGD algorithm rather than the Train object.
This anneals the learning rate to decay_factor times of the initial value
during time start till saturate.
Parameters
----------
start : int
The step at which to start decreasing the learning rate
saturate : int
The step at which to stop decreating the learning rate
decay_factor : float
`final learning rate = decay_factor * initial learning rate`
"""
def __init__(self, start, saturate, decay_factor):
if isinstance(decay_factor, str):
decay_factor = float(decay_factor)
if isinstance(start, str):
start = float(start)
if isinstance(saturate, str):
saturate = float(saturate)
assert isinstance(decay_factor, float)
assert isinstance(start, (py_integer_types, py_float_types))
assert isinstance(saturate, (py_integer_types, py_float_types))
assert saturate > start
assert start > 0
self.__dict__.update(locals())
del self.self
self._count = 0
def __call__(self, algorithm):
"""
Adjusts the learning rate according to the linear decay schedule
Parameters
----------
algorithm : WRITEME
"""
if self._count == 0:
self._base_lr = algorithm.learning_rate.get_value()
self._step = ((self._base_lr - self._base_lr * self.decay_factor) /
(self.saturate - self.start + 1))
self._count += 1
if self._count >= self.start:
if self._count < self.saturate:
new_lr = self._base_lr - self._step * (self._count
- self.start + 1)
else:
new_lr = self._base_lr * self.decay_factor
else:
new_lr = self._base_lr
assert new_lr > 0
new_lr = np.cast[config.floatX](new_lr)
algorithm.learning_rate.set_value(new_lr)
def MomentumAdjustor(final_momentum, start, saturate):
"""
Deprecated class used with the deprecated init_momentum argument.
Use learning_rule.MomentumAdjustor instead.
Parameters
----------
final_momentum : WRITEME
start : WRITEME
saturate : WRITEME
"""
warnings.warn("sgd.MomentumAdjustor interface is deprecated and will "
"become officially unsupported as of May 9, 2014. Please use "
"`learning_rule.MomentumAdjustor` instead.")
return LRMomentumAdjustor(final_momentum, start, saturate)
class OneOverEpoch(TrainExtension):
"""
Scales the learning rate like one over # epochs
Parameters
----------
start : int
The epoch on which to start shrinking the learning rate
half_life : int, optional
How many epochs after start it will take for the learning rate to lose
half its value for the first time (to lose the next half of its value
will take twice as long)
min_lr : float, optional
The minimum value the learning rate can take on
"""
def __init__(self, start, half_life = None, min_lr = 1e-6):
self.__dict__.update(locals())
del self.self
self._initialized = False
self._count = 0
assert start >= 0
if half_life is None:
self.half_life = start + 1
else:
assert half_life > 0
def on_monitor(self, model, dataset, algorithm):
"""
Adjusts the learning rate according to the decay schedule.
Parameters
----------
model : a Model instance
dataset : Dataset
algorithm : WRITEME
"""
if not self._initialized:
self._init_lr = algorithm.learning_rate.get_value()
if self._init_lr < self.min_lr:
raise ValueError("The initial learning rate is smaller than " +
"the minimum allowed learning rate.")
self._initialized = True
self._count += 1
algorithm.learning_rate.set_value(np.cast[config.floatX](
self.current_lr()))
def current_lr(self):
"""
Returns the learning rate currently desired by the decay schedule.
"""
if self._count < self.start:
scale = 1
else:
scale = float(self.half_life) / float(self._count - self.start
+ self.half_life)
lr = self._init_lr * scale
clipped = max(self.min_lr, lr)
return clipped
class LinearDecayOverEpoch(TrainExtension):
"""
Scales the learning rate linearly on each epochs
Parameters
----------
start : int
The epoch on which to start shrinking the learning rate
saturate : int
The epoch to saturate the shrinkage
decay_factor : float
The final value would be initial learning rate times decay_factor
"""
def __init__(self, start, saturate, decay_factor):
self.__dict__.update(locals())
del self.self
self._initialized = False
self._count = 0
assert isinstance(decay_factor, float)
assert isinstance(start, (py_integer_types, py_float_types))
assert isinstance(saturate, (py_integer_types, py_float_types))
assert saturate > start
assert start >= 0
assert saturate >= start
def on_monitor(self, model, dataset, algorithm):
"""
Updates the learning rate based on the linear decay schedule.
Parameters
----------
model : a Model instance
dataset : Dataset
algorithm : WRITEME
"""
if not self._initialized:
self._init_lr = algorithm.learning_rate.get_value()