-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
248 lines (209 loc) · 9.29 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
# imports
import os
import random
import warnings
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import util
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
"""
This script loads classification results and trains various regression
estimators to evaluate effect of corrupted (false) labels on classification
model performance.
Various data properties can be set to define level of aggregation and filter
classification results before regression. This includes:
- regressor: Classification metric to run regression against
corrupted labels ratio
- filter: Filter classification results by classification
report files, e.g., to only use classification
report from epoch 19 of classification model training
- multi-regression: Run multiple regression - Note: currently not available
- cnn: Classification model results to include in regression
- classes: Classification task results to include in regression
- use_delta: Whether to use delta values for regression, i.e.,
difference between classification performance metric
and performance metric of same classification setup
but with 0 corrupted labels ratio
- group_by: Group classification results by classification setup
properties
- zscore_threshold: Threshold for z-score to filter classification results
TODO: update regression data properties and filter
TODO: review and update paths of classification results and regression logging
"""
# remove deprecation warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
#------------------------------------------------------------------------------#
print('Loading data...')
# define path and classification report file filter
regr_results_log_path = './logs/regr_results/' # where to log regression results
class_report_log_path = './logs/class_report'
explore_vis_log_path = './assets/'
aggr_class_report_log_path = './data/'
filter = 'class_report_19epoch.json'
# aggregate class reports
# TODO: uncomment on first run to aggregate classification reports
# util.aggr_class_reports(
# class_report_log_path,
# filter,
# aggr_class_report_log_path
# )
# load classification results table
df = util.load_aggr_class_reports(aggr_class_report_log_path)
#------------------------------------------------------------------------------#
print('Setting regression data properties...')
# --> TODO: set regression properties below <--
regressor = 'accuracy' # regressor to use
multi_regression = False # TODO: implement multi-regression
cnn = 'basic' # model to filter for ('resnet', 'basic'), False for all
classes = 4 # classification task to filter for (4 or 14), False for all
use_delta = False # use regressor delta to 0 ratio model for regression
group_by = {
'model': False,
'classes': False,
'ratio': False
} # classification setup to group data by
zscore_threshold = False # threshold for removing outlier, False for no filtering
use_grouping = any(group_by.values())
# check properties
if cnn not in ['resnet', 'basic', False] or classes not in [4, 14, False]:
raise ValueError('Invalid model or classes filter')
#------------------------------------------------------------------------------#
print('Initializing logging...')
# define regression properties for logging
regr_results = util.regr_results_logger(
group_by, regressor, cnn, classes, use_delta, zscore_threshold
)
#------------------------------------------------------------------------------#
print('Preprocessing data...')
# preprocess dataframe
df_regression = df.query(f'metric == "{regressor}"')
df_regression = df_regression.drop(['run'], axis=1)
df_regression = df_regression.rename(columns={'value': regressor})
df_regression = util.calc_delta(df, df_regression, regressor)
# filter data for set properties
df_regression = util.filter_data(
use_grouping, group_by, df_regression, cnn, classes
)
# filter outliers if property is set
df_regression = util.filter_outliers(
df_regression, regressor, use_grouping, zscore_threshold
)
df_regression.reset_index(inplace=True, drop=True)
print(f'\tRegression df.shape: {df_regression.shape}')
regr_results['properties']['n'] = df_regression.shape[0]
# view pair and boxplot of regression data
sns.pairplot(
df_regression[[regressor, f'{regressor}_delta', 'ratio']],
diag_kind='kde'
).fig.savefig(explore_vis_log_path + 'regr_pairplot.png')
plt.clf()
sns.boxplot(
x='ratio', y=regressor, data=df_regression
).get_figure().savefig(explore_vis_log_path + 'regr_boxplot.png')
print('\tPlots saved to ./assets/')
#------------------------------------------------------------------------------#
print('Setting up training...')
# get independent and dependent variables
if use_delta:
X_df = df_regression[[f'{regressor}_delta']]
print(f'\tUsing {regressor} delta')
else:
X_df = df_regression[[regressor]]
y_df = df_regression['ratio']
# calculate correlation (pearsonr and p-values)
for column in X_df.columns:
r, p = stats.pearsonr(X_df[column], y_df)
print(f'\t{round(r, 4)} pearsonr, {round(p, 10)} p-value, Variable: {column}')
# log correlation
regr_results['properties']['pearsonr'] = r
regr_results['properties']['p_value'] = p
# log random state for train/test split reproducibility
random_state = random.randint(0, 1000)
regr_results['properties']['split_random_state'] = random_state
# split data - random state to reproduce split in initial model selection
test_split = 0.2 if len(df_regression) > 100 else 0.1
# stratify only if test data can include all 10 ratios
if len(df_regression) * test_split > 9:
X_train, X_test, y_train, y_test = train_test_split(
X_df
, y_df
, test_size=test_split
, random_state=random_state
, stratify=y_df
)
print('\tStratified train/test split')
else:
X_train, X_test, y_train, y_test = train_test_split(
X_df
, y_df
, test_size=test_split
, random_state=random_state
)
# scale X data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
print(
f'\tX_train.shape {X_train.shape} - y_train.shape {y_train.shape}\
\n\tX_test.shape {X_test.shape} - y_test.shape {y_test.shape}'
)
#------------------------------------------------------------------------------#
print('Training estimators...')
# train regression estimators - see util.py for estimators
regr_results = util.train_estimators(regr_results, X_train, y_train)
# get estimator with best score
best_score = -999
for k, v in regr_results['regr_results'].items():
if v['neg_rmse'] > best_score:
best_score = v['neg_rmse']
best_estimator = k
print(f'\t--> Best estimator: {best_estimator} <--')
#------------------------------------------------------------------------------#
print('Searching for best hyperparameters...')
regr_results = util.hyperparameter_tuning(
best_estimator, X_train, y_train, X_test, y_test, regr_results, 5
)
#------------------------------------------------------------------------------#
print('Logging regression results...')
# setup regression logger properties
row = [[
regr_results['properties']['regressor'],
regr_results['properties']['regressand'],
regr_results['properties']['model'],
regr_results['properties']['classification_task'],
regr_results['properties']['n'],
regr_results['properties']['split_random_state'],
regr_results['properties']['group_by'],
regr_results['properties']['delta'],
regr_results['properties']['zscore_threshold'],
regr_results['properties']['pearsonr'],
regr_results['properties']['p_value'],
regr_results['best_estimator']['estimator'],
regr_results['best_estimator']['neg_rmse_train'],
regr_results['best_estimator']['neg_rmse_test'],
regr_results['best_estimator']['r2_train'],
regr_results['best_estimator']['r2_test'],
regr_results['best_estimator']['best_params'],
]]
# create result df
df_temp = pd.DataFrame(
data = row,
columns = [
'regressor', 'regressand', 'model', 'classification_task',
'n', 'split_random_state', 'group_by', 'delta', 'zscore_threshold',
'pearsonr', 'p_value', 'estimator', 'neg_rmse_train', 'neg_rmse_test',
'r2_train', 'r2_test', 'best_params'
]
)
# add results to regression results dataframe and save as csv
result_file = regr_results_log_path + "regr_results.csv"
if os.path.exists(result_file):
df_results = pd.read_csv(result_file)
df_results = pd.concat([df_results, df_temp])
df_results.to_csv(result_file, index=False)
else:
df_temp.to_csv(result_file, index=False)
print('\tResults saved to ' + result_file)