experiment.py
###########################################
## THIS IS THE GENERIC EXPERIMENT SCRIPT ##
###########################################
# Check folder 'experiments/' to find the actual exps
if __name__ == '__main__':
    print("You should not call this directly. Check folder `experiments`.")
    import sys
    sys.exit()
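
# Minimal usage sketch (illustrative only; the real entry points live in
# `experiments/`, and the exact cfg keys used there may differ). An experiment
# file is expected to build a cfg dict matching the keys read below and run:
#
#     from experiment import Experiment
#     cfg = {
#         'dsets': ['mnist', 'usps'],   # hypothetical dataset names
#         'trg_dset': 'mnist',          # must be first in 'dsets' (see main_run)
#         'transfer': 'parallel',       # or 'sequential'
#         'resume': False, 'timestamp': True,
#         # ... plus paths, epochs, lrates, dropouts, optimizers, etc.
#     }
#     Experiment(cfg).main_run()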
import os
import datetime
import time
from pathlib import Path
import shutil
import torch
from brutelogger import BruteLogger
import typhon
import utils
class Experiment:
    def __init__(self, cfg):
        self.cfg = cfg
        assert (not self.cfg['resume']) or (not self.cfg['timestamp']), "Cannot resume an experiment with timestamp activated"
        assert (not self.cfg['transfer'] == 'sequential') or (not self.cfg['resume']), "Cannot resume training on sequential"
        self.make_paths()
        # Set up the logger
        BruteLogger.save_stdout_to_file(path=self.paths['logs'])
        # Resolve CPU threads and cuda device
        torch.set_num_threads(self.cfg['trg_n_cpu'])
        if torch.cuda.is_available():
            # The GPU index must be specified at the end of the experiment file name,
            # or in the terminal after the file name.
            # The int() cast fails if the last part of the experiment file name is not an int
            assert isinstance(int(self.cfg['trg_gpu']), int), "Please specify your GPU at the end of the experiment file name"
            # Execution will stop anyway if the index is unavailable or wrong
            self.cuda_device = f"cuda:{int(self.cfg['trg_gpu'])}"
        # Otherwise just run on CPU
        else:
            self.cuda_device = 'cpu'
            torch.set_num_threads(self.cfg['trg_n_cpu'])
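        # Example (hypothetical naming): running `experiments/exp_foo_2.py` would
        # yield cfg['trg_gpu'] == '2' and therefore cuda_device == 'cuda:2'.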
        # Give each DM its own dropout, learning rate, optimizer and loss function
        self.dropouts = {}
        self.lrates = {}
        for type in self.cfg['dropouts'].keys():
            self.dropouts[type] = [self.cfg['dropouts'][type][0], {name: dropout for name, dropout in zip(self.cfg['dsets'], self.cfg['dropouts'][type][1:])}]
        for type in self.cfg['lrates'].keys():
            self.lrates[type] = {name: lrate for name, lrate in zip(self.cfg['dsets'], self.cfg['lrates'][type])}
        self.optimizers = {name: optim for name, optim in zip(self.cfg['dsets'], self.cfg['optimizers'])}
        self.loss_functions = {name: fct for name, fct in zip(self.cfg['dsets'], self.cfg['loss_functions'])}
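        # Resulting structures (derived from the assumed cfg layout above):
        #   self.dropouts[kind] = [shared_value, {dset_name: per_dset_value, ...}]
        #   self.lrates[kind]   = {dset_name: learning_rate, ...}
        #   self.optimizers / self.loss_functions = {dset_name: class_or_fn, ...}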
        self.train_args = {
            'paths' : self.paths,
            'dsets_names' : self.cfg['dsets'],
            'architecture' : self.cfg['architecture'],
            'bootstrap_size' : self.cfg['bootstrap_size'],
            'nb_batches_per_epoch' : self.cfg['nb_batches_per_epoch'],
            'nb_epochs' : self.cfg['epochs'],
            'lrates' : self.lrates,
            'dropouts' : self.dropouts,
            'batch_size' : self.cfg['batch_size'],
            'loss_functions' : self.loss_functions,
            'optim_class' : self.optimizers,
            'opt_metrics' : self.cfg['opt_metrics'],
            'cuda_device' : self.cuda_device,
            'resume' : self.cfg['resume'],
        }
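        # These keys match the keyword arguments expected by typhon.Typhon
        # (the dict is unpacked with ** in main_run below).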
        print(f"> Config loaded successfully for {self.cfg['transfer']} training:")
        # Print the config so it is written in the log file as well
        for key, value in self.train_args.items():
            if key == 'paths': continue
            print(f">> {key}: {value}")

    def make_paths(self):
        # Local level/debug config: shorter runs.
        # Simply add your `os.uname().nodename` to the list.
        is_local_run = os.uname().nodename in ['example_os_name']
        if is_local_run:
            self.cfg.update({
                'nb_batches_per_epoch' : 1,
                'epochs' : {
                    'train' : 10,
                    'spec' : 10,
                },
                # Paths and filenames
                'dsets_path' : 'datasets/tiny',
                'bootstrap_size' : 10,
            })
        # Make Path objects
        self.cfg.update({
            'dsets_path' : Path(self.cfg['dsets_path']),
            'ramdir' : Path(self.cfg['ramdir']),
            'out_path' : Path(self.cfg['out_path']),
            'exp_file' : Path(self.cfg['exp_file']),
        })
        # Copy the dataset to RAM for faster access
        # The slash operator '/' in the pathlib module is similar to os.path.join()
        dsets_path_ram = self.cfg['ramdir'] / self.cfg['dsets_path']
        if not is_local_run and not dsets_path_ram.is_dir():
            import shutil
            shutil.copytree(self.cfg['dsets_path'], dsets_path_ram)
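        # Note: 'ramdir' is assumed to point at a RAM-backed filesystem
        # (e.g. a tmpfs mount such as /dev/shm); the exact mount is cluster-specific.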
        # All paths in one place
        if self.cfg['timestamp']:
            # Add a timestamp to the folder name to avoid duplicates
            experiment_path = self.cfg['out_path'] / f"{self.cfg['exp_file'].stem}_{datetime.datetime.now().strftime('%Y-%m-%d_%H:%M:%S')}"
        else:
            experiment_path = self.cfg['out_path'] / f"{self.cfg['exp_file'].stem}"
        assert (not self.cfg['resume']) or experiment_path.is_dir(), ("Experiment folder does not exist: "
            "either run the experiment from the beginning or remove the timestamp from the folder name")
        models_path = experiment_path / 'models'
        self.paths = {
            'experiment' : experiment_path,
            # Brutelogger logs
            'logs' : experiment_path / 'run_logs',
            'dsets' : {d: self.cfg['dsets_path'] / f"{d}" for d in self.cfg['dsets']}
                      if is_local_run else {d: dsets_path_ram / f"{d}" for d in self.cfg['dsets']},
            # Trained model (no specialization)
            # p for parallel and s for sequential
            'train_model_p' : models_path / 'train_model_p.pth',
            'train_model_s' : models_path / 'train_model_s.pth',
            # Model saved after the "normal training" in hydra
            'gen_model_s' : models_path / 'gen_model_s.pth',
            # Specialized models
            'spec_models_p' : {d: models_path / f"spec_model_{d}_p.pth" for d in self.cfg['dsets']},
            'spec_models_s' : {d: models_path / f"spec_model_{d}_s.pth" for d in self.cfg['dsets']},
            # Bootstrap model
            'bootstrap_model' : models_path / 'bootstrap_model.pth',
            # Plots
            'metrics' : experiment_path / 'run_plot',
        }
        # Create directories
        self.paths['metrics'].mkdir(parents=True, exist_ok=True)
        self.paths['logs'].mkdir(parents=True, exist_ok=True)
        models_path.mkdir(parents=True, exist_ok=True)
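        # Resulting on-disk layout (illustrative), e.g. for a hypothetical experiments/exp_foo.py:
        #   <out_path>/exp_foo[_<timestamp>]/
        #       run_logs/   BruteLogger stdout capture
        #       run_plot/   metric plots
        #       models/     .pth checkpoints (bootstrap, train, spec, gen)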

    def main_run(self):
        start = time.perf_counter()
        # Needed for sequential learning
        assert self.cfg['trg_dset'] == self.cfg['dsets'][0], "Target dataset must be in first position"
        assert self.cfg['transfer'] in ['sequential', 'parallel'], "The transfer argument must be 'sequential' or 'parallel'"
        # Copy experiment.py and the experiment cfg file into the experiment dir
        shutil.copy2(self.cfg['exp_file'], self.paths['experiment'])
        shutil.copy2('experiment.py', self.paths['experiment'])
        self.typhon = typhon.Typhon(**self.train_args)
        # Ensure the bootstrap model is initialized
        if not self.paths['bootstrap_model'].is_file():
            print("> Bootstrap initialization missing:", self.paths['bootstrap_model'])
            self.typhon.bootstrap()
        if self.cfg['transfer'] == 'sequential':
            self.typhon.s_train(self.paths['bootstrap_model'])
            self.typhon.s_specialization(self.paths['train_model_s'])
        if self.cfg['transfer'] == 'parallel':
            if self.cfg['resume']:
                self.typhon.p_train(self.paths['train_model_p'])
            else:
                self.typhon.p_train(self.paths['bootstrap_model'])
            self.typhon.p_specialization(self.paths['train_model_p'])
        stop = time.perf_counter()
        total_time = stop - start
        print(f"Experiment ended in {int(total_time / 3600)} hours {int((total_time % 3600) / 60)} minutes {total_time % 60:.1f} seconds")
        utils.print_time('END EXPERIMENT')