fix for best epoch training time
ksadowski13 committed Nov 23, 2021
commit a2a3bd8 (parent: 330aad6)
Showing 6 changed files with 95 additions and 15 deletions.
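
The fix itself is a one-character off-by-one: best_epoch_training_time summed self._train_times[:self._best_epoch], and Python slices exclude their end index, so the best epoch's own duration was dropped from the total. A minimal sketch with hypothetical values:

    # Hypothetical per-epoch training times (seconds); the best epoch is index 2.
    train_times = [10.0, 12.0, 11.0]
    best_epoch = 2

    sum(train_times[:best_epoch])      # 22.0 -- end-exclusive slice misses epoch 2
    sum(train_times[:best_epoch + 1])  # 33.0 -- includes the best epoch's own time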
.gitignore: 3 changes (2 additions, 1 deletion)
@@ -2,4 +2,5 @@ __pycache__
 .vscode
 dataset
 dgl
-scratch.py
+scratch*.py
+scratch*.ipynb
experiments/gat/utils.py: 88 changes (82 additions, 6 deletions)
@@ -8,6 +8,7 @@
 import dgl.function as fn
 import matplotlib.pyplot as plt
 import numpy as np
+import pandas as pd
 import psutil
 import sigopt
 import torch
@@ -20,9 +21,13 @@ def __init__(
         self,
         patience: int,
         monitor: str,
+        timeout: float = None,
+        log_checkpoint_every: int = 5,
     ) -> None:
         self._patience = patience
         self._monitor = monitor
+        self._timeout = timeout
+        self._log_checkpoint_every = log_checkpoint_every
         self._lookback = 0
         self._best_epoch = None
         self._train_times = []
@@ -63,7 +68,7 @@ def valid_accuracies(self) -> list[float]:
 
     @property
     def best_epoch_training_time(self) -> float:
-        return sum(self._train_times[:self._best_epoch])
+        return sum(self._train_times[:self._best_epoch + 1])
 
     @property
     def best_epoch_train_loss(self) -> float:
@@ -87,10 +92,26 @@ def best_epoch_model_parameters(
     ) -> Union[dict[str, torch.Tensor], dict[str, dict[str, torch.Tensor]]]:
         return self._model_parameters
 
+    @property
+    def avg_train_time(self) -> float:
+        return np.mean(self._train_times)
+
+    @property
+    def avg_valid_time(self) -> float:
+        return np.mean(self._valid_times)
+
+    @property
+    def experiment_time(self) -> float:
+        return sum(self._train_times + self._valid_times)
+
     @property
     def should_stop(self) -> bool:
         return self._lookback >= self._patience
 
+    @property
+    def timeout(self) -> bool:
+        return self.experiment_time >= self._timeout
+
     def create(
         self,
         epoch: int,
@@ -110,7 +131,7 @@ def create(
         self._train_accuracies.append(train_accuracy)
         self._valid_accuracies.append(valid_accuracy)
 
-        if sigopt_context is not None:
+        if sigopt_context is not None and epoch % self._log_checkpoint_every == 0:
             sigopt_context.log_checkpoint({
                 'train loss': train_loss,
                 'valid loss': valid_loss,
@@ -181,11 +202,21 @@ def get_metrics_plot(
 
 def log_metrics_to_sigopt(
     sigopt_context: sigopt.run_context,
+    checkpoint: Callback,
     **metrics,
 ) -> None:
     for name, value in metrics.items():
         sigopt_context.log_metric(name=name, value=value)
 
+    metrics_plot = get_metrics_plot(
+        checkpoint.train_accuracies,
+        checkpoint.valid_accuracies,
+        checkpoint.train_losses,
+        checkpoint.valid_losses,
+    )
+
+    sigopt_context.log_image(metrics_plot, name='convergence plot')
+
 
 def download_dataset(dataset: str) -> None:
     if dataset == 'ogbn-products':
@@ -399,8 +430,53 @@ def get_evaluation_score(
     return score
 
 
-def is_experiment_finished(experiment) -> bool:
-    observation_count = experiment.fetch().progress.observation_count
-    observation_budget = experiment.fetch().observation_budget
+def set_fanouts(
+    num_layers: int,
+    batch_size: int,
+    max_num_batch_nodes: int,
+    fanout_slope: float,
+    max_fanout: int = 40,
+) -> list[int]:
+    result_fanouts = None
+
+    for base_fanout in range(max_fanout + 1):
+        fanouts = []
+
+        for n in range(num_layers):
+            fanout = int((fanout_slope ** n) * base_fanout)
+
+            if fanout < 1:
+                fanout = 1
+
+            if fanout > max_fanout:
+                fanout = max_fanout
+
+            fanouts.append(fanout)
+
+        if len(fanouts) == num_layers:
+            result = batch_size
+
+            for fanout in reversed(fanouts):
+                result += result * fanout
+
+            if result <= max_num_batch_nodes:
+                result_fanouts = fanouts
+
+    if result_fanouts is None:
+        result_fanouts = [1 for _ in range(num_layers)]
+
+    return result_fanouts
+
+
+def save_checkpoints_to_csv(checkpoint: Callback, path: str) -> None:
+    data = {
+        'train_losses': checkpoint.train_losses,
+        'train_accuracies': checkpoint.train_accuracies,
+        'train_times': checkpoint.train_times,
+        'valid_losses': checkpoint.valid_losses,
+        'valid_accuracies': checkpoint.valid_accuracies,
+        'valid_times': checkpoint.valid_times,
+    }
 
-    return observation_count <= observation_budget
+    df = pd.DataFrame(data)
+    df.to_csv(path)
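
The new set_fanouts searches base fanouts from 0 to max_fanout and keeps the largest one whose per-layer fanouts fit the node budget: since result += result * fanout is result *= (1 + fanout), the estimate is batch_size multiplied by (1 + fanout) across layers. A usage sketch, assuming the set_fanouts added above and hypothetical inputs:

    # Hypothetical budget: at most 500,000 nodes touched per batch.
    fanouts = set_fanouts(
        num_layers=3,
        batch_size=1024,
        max_num_batch_nodes=500_000,
        fanout_slope=1.5,
    )
    # The search settles on base_fanout=4, i.e. fanouts [4, 6, 9]:
    # 1024 * (1 + 4) * (1 + 6) * (1 + 9) = 358,400 <= 500,000,
    # while base_fanout=5 ([5, 7, 11]) would need 589,824 nodes.

The new pandas import exists for save_checkpoints_to_csv, which collects the Callback's per-epoch series into a DataFrame and writes them to the given path.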
experiments/gatv2/utils.py: 6 changes (3 additions, 3 deletions)
@@ -3,7 +3,6 @@
 import sys
 from copy import deepcopy
 from typing import Union
-from time import sleep
 
 import dgl
 import dgl.function as fn
@@ -69,7 +68,7 @@ def valid_accuracies(self) -> list[float]:
 
     @property
     def best_epoch_training_time(self) -> float:
-        return sum(self._train_times[:self._best_epoch])
+        return sum(self._train_times[:self.best_epoch])
 
     @property
     def best_epoch_train_loss(self) -> float:
@@ -208,7 +207,6 @@ def log_metrics_to_sigopt(
 ) -> None:
     for name, value in metrics.items():
         sigopt_context.log_metric(name=name, value=value)
-        sleep(0.3)
 
     metrics_plot = get_metrics_plot(
         checkpoint.train_accuracies,
@@ -404,10 +402,12 @@ def log_system_info() -> None:
     sigopt.log_metadata("Python version", sys.version.split()[0])
     sigopt.log_metadata("Operating System", sys.platform)
     sigopt.log_metadata("psutil.Process().num_threads", process.num_threads())
+    # run.log_metadata("Process CPU Percent", process.cpu_percent())
     sigopt.log_metadata("psutil.virtual_memory().total",
                         psutil._common.bytes2human(virtual_memory.total))
     sigopt.log_metadata("psutil.virtual_memory().available",
                         psutil._common.bytes2human(virtual_memory.available))
+    # run.log_metadata("Virtual Memory Percent", virtual_memory.percent)
 
 
 def get_evaluation_score(
experiments/graphsage/utils.py: 6 changes (3 additions, 3 deletions)
@@ -3,7 +3,6 @@
 import sys
 from copy import deepcopy
 from typing import Union
-from time import sleep
 
 import dgl
 import dgl.function as fn
@@ -69,7 +68,7 @@ def valid_accuracies(self) -> list[float]:
 
     @property
     def best_epoch_training_time(self) -> float:
-        return sum(self._train_times[:self._best_epoch])
+        return sum(self._train_times[:self._best_epoch + 1])
 
     @property
     def best_epoch_train_loss(self) -> float:
@@ -208,7 +207,6 @@ def log_metrics_to_sigopt(
 ) -> None:
     for name, value in metrics.items():
         sigopt_context.log_metric(name=name, value=value)
-        sleep(0.3)
 
     metrics_plot = get_metrics_plot(
         checkpoint.train_accuracies,
@@ -404,10 +402,12 @@ def log_system_info() -> None:
     sigopt.log_metadata("Python version", sys.version.split()[0])
     sigopt.log_metadata("Operating System", sys.platform)
     sigopt.log_metadata("psutil.Process().num_threads", process.num_threads())
+    # run.log_metadata("Process CPU Percent", process.cpu_percent())
     sigopt.log_metadata("psutil.virtual_memory().total",
                         psutil._common.bytes2human(virtual_memory.total))
     sigopt.log_metadata("psutil.virtual_memory().available",
                         psutil._common.bytes2human(virtual_memory.available))
+    # run.log_metadata("Virtual Memory Percent", virtual_memory.percent)
 
 
 def get_evaluation_score(
experiments/rgcn/utils.py: 2 changes (1 addition, 1 deletion)
@@ -68,7 +68,7 @@ def valid_accuracies(self) -> list[float]:
 
     @property
     def best_epoch_training_time(self) -> float:
-        return sum(self._train_times[:self._best_epoch])
+        return sum(self._train_times[:self._best_epoch + 1])
 
     @property
     def best_epoch_train_loss(self) -> float:
experiments/run_experiment.py: 5 changes (4 additions, 1 deletion)
@@ -19,7 +19,10 @@ def run_experiment(args: argparse.ArgumentParser):
     if args.dataset_root is not None:
         arguments.append(f'--dataset-root {args.dataset_root}')
 
-    arguments.append('--test-validation')
+    if args.model != 'rgcn':
+        arguments.append('--test-validation')
+    else:
+        arguments.append('--no-test-validation')
 
     if args.checkpoints_path is not None:
         arguments.append('--save-checkpoints-to-csv')
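
This change keeps --test-validation for every model except rgcn, which now gets --no-test-validation. A minimal sketch of the branch, with a hypothetical helper name not in the repo:

    # Hypothetical helper mirroring the branch added above.
    def validation_flag(model: str) -> str:
        return '--test-validation' if model != 'rgcn' else '--no-test-validation'

    assert validation_flag('gat') == '--test-validation'
    assert validation_flag('rgcn') == '--no-test-validation'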
