Skip to content

Commit

Permalink
fix scheduler steps on resume
Browse files: browse the repository at this point in the history
  • Loading branch information
pbenner committed Jan 18, 2025
1 parent a466d37 commit de5ef09
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 5 deletions.
7 changes: 6 additions & 1 deletion equitrain/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -243,7 +243,6 @@ def _train_with_accelerator(args, accelerator: Accelerator):
""" Optimizer and LR Scheduler """
optimizer = create_optimizer(args, model)
lr_scheduler = create_scheduler(args, optimizer)
- last_lr = None

criterion = GenericLossFn(**vars(args))

Expand Down Expand Up @@ -309,6 +308,12 @@ def _train_with_accelerator(args, accelerator: Accelerator):
if accelerator.is_main_process:
val_loss.log(logger, 'val', epoch=args.epochs_start - 1)

+ # Scheduler step before the first epoch for schedulers depending on the epoch
+ if lr_scheduler is not None:
+ lr_scheduler.step(metric=None, epoch=args.epochs_start - 1)
+
+ last_lr = lr_scheduler.get_last_lr()[0]

for epoch in range(args.epochs_start, args.epochs_start + args.epochs):
epoch_start_time = time.perf_counter()

Expand Down
10 changes: 6 additions & 4 deletions equitrain/train_scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,13 @@ def step(self, metric=None, epoch=None):
metric: The monitored metric (required for ReduceLROnPlateau).
"""
if self.mode == 'epoch':
- self.scheduler.step(epoch=epoch)
+ if epoch is not None:
+ self.scheduler.step(epoch=epoch)

elif self.mode == 'metric':
- if metric is None:
- raise ValueError('Metric is required for ReduceLROnPlateau')
- self.scheduler.step(metric)
+ if metric is not None:
+ self.scheduler.step(metric)

else:
raise ValueError(f'Unsupported mode: {self.mode}')

Expand Down

0 comments on commit de5ef09

Please sign in to comment.