From 14b4ab728ab57f2ac5d283554710288efc20a440 Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Sat, 3 Jun 2023 17:55:00 +0200 Subject: [PATCH 1/8] Start multi-objective optimization and explicit about opt vs eval Optimization metrics are those the AutoML framework should optimize for and evaluation metrics are metrics the final model is evaluated on. Optimization metrics will automatically also be used as evaluation metrics, but evaluation metrics may be defined to have additional metrics. --- amlb/benchmark.py | 63 +++++++++++++++++++++++++++++++++++++---------- 1 file changed, 50 insertions(+), 13 deletions(-) diff --git a/amlb/benchmark.py b/amlb/benchmark.py index 7c54a344c..27a1da78a 100644 --- a/amlb/benchmark.py +++ b/amlb/benchmark.py @@ -24,9 +24,10 @@ from .datautils import read_csv from .resources import get as rget, config as rconfig, output_dirs as routput_dirs from .results import ErrorResult, Scoreboard, TaskResult -from .utils import Namespace as ns, OSMonitoring, as_list, datetime_iso, flatten, json_dump, lazy_property, profile, repr_def, \ - run_cmd, run_script, signal_handler, str2bool, str_sanitize, system_cores, system_memory_mb, system_volume_mb, touch - +from .utils import Namespace as ns, OSMonitoring, as_list, datetime_iso, flatten, \ + json_dump, lazy_property, profile, repr_def, \ + run_cmd, run_script, signal_handler, str2bool, str_sanitize, system_cores, \ + system_memory_mb, system_volume_mb, touch, Namespace log = logging.getLogger(__name__) @@ -371,9 +372,33 @@ def _is_task_enabled(task_def): class TaskConfig: - def __init__(self, name, fold, metrics, seed, + def __init__(self, *, name, fold, seed, max_runtime_seconds, cores, max_mem_size_mb, min_vol_size_mb, - input_dir, output_dir): + input_dir, output_dir, + metrics: Union[list[str], str, None] = None, + optimization_metrics: Union[list[str], str, None] = None, + evaluation_metrics: Union[list[str], str, None] = None, + ): + + if metrics: + log.warning( + "WARNING: The `metric` 
field of the task definition is deprecated" + " and will not work in the future. Please specify the metric(s) to " + "optimize for with `optimization_metrics` and any additional metric(s) " + "used only for evaluation in `evaluation_metrics`." + ) + if optimization_metrics: + raise ValueError( + "Detected both `metric` and `optimization_metrics` for task " + f"'{name}'. Aborting because desired setup is unclear." + "Please only use `optimization_metrics`." + ) + optimization_metrics = as_list(metrics)[:1] + evaluation_metrics = as_list(metrics)[1:] + + self.optimization_metrics = optimization_metrics or [] + self._evaluation_metrics = evaluation_metrics or [] + self.framework = None self.framework_params = None self.framework_version = None @@ -391,12 +416,22 @@ def __init__(self, name, fold, metrics, seed, self.output_predictions_file = os.path.join(output_dir, "predictions.csv") self.ext = ns() # used if frameworks require extra config points + @property + def evaluation_metrics(self) -> list[str]: + return self.optimization_metrics + self._evaluation_metrics + + def load_default_metrics(self, *, dataset_type: str): + """ Sets `optimization/evaluation_metrics` based on defaults from config.yaml""" + metrics = as_list(rconfig().benchmarks.metrics[dataset_type]) + self.optimization_metrics = metrics[:1] + self._evaluation_metrics = metrics[1:] + def __setattr__(self, name, value): - if name == 'metrics': - self.metric = value[0] if isinstance(value, list) else value - elif name == 'max_runtime_seconds': - self.job_timeout_seconds = min(value * 2, - value + rconfig().benchmarks.overhead_time_seconds) + if name == 'max_runtime_seconds': + self.job_timeout_seconds = min( + value * 2, + value + rconfig().benchmarks.overhead_time_seconds + ) super().__setattr__(name, value) def __json__(self): @@ -458,10 +493,13 @@ def __init__(self, benchmark: Benchmark, task_def, fold): self.benchmark = benchmark self._task_def = task_def self.fold = fold + self.task_config = 
TaskConfig( name=task_def.name, fold=fold, metrics=task_def.metric, + optimization_metrics=Namespace.get(task_def, "optimization_metrics"), + evaluation_metrics=Namespace.get(task_def, "evaluation_metrics"), seed=rget().seed(fold), max_runtime_seconds=task_def.max_runtime_seconds, cores=task_def.cores, @@ -542,9 +580,8 @@ def run(self): task_config.output_predictions_file = results._predictions_file task_config.output_metadata_file = results._metadata_file touch(os.path.dirname(task_config.output_predictions_file), as_dir=True) - if task_config.metrics is None: - task_config.metrics = as_list(rconfig().benchmarks.metrics[self._dataset.type.name]) - task_config.metric = task_config.metrics[0] + if not task_config.optimization_metrics: + task_config.load_default_metrics(dataset_type=self._dataset.type.name) result = meta_result = None try: From aec3764ce866dbd82948cbca77967cd3d67e6345 Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Sat, 3 Jun 2023 21:53:16 +0200 Subject: [PATCH 2/8] Add support for multiple metrics in evaluation phase --- amlb/benchmark.py | 4 +++- amlb/results.py | 28 ++++++++++++++++------------ 2 files changed, 19 insertions(+), 13 deletions(-) diff --git a/amlb/benchmark.py b/amlb/benchmark.py index 27a1da78a..0f3ce8115 100644 --- a/amlb/benchmark.py +++ b/amlb/benchmark.py @@ -435,7 +435,9 @@ def __setattr__(self, name, value): super().__setattr__(name, value) def __json__(self): - return self.__dict__ + d = self.__dict__ + d["evaluation_metrics"] = self.evaluation_metrics + return d def __repr__(self): return repr_def(self) diff --git a/amlb/results.py b/amlb/results.py index aaceb9fc8..7504db7af 100644 --- a/amlb/results.py +++ b/amlb/results.py @@ -426,8 +426,11 @@ def compute_score(self, result=None, meta_result=None): seed=metadata.seed, app_version=rget().app_version, utc=datetime_iso(), - metric=metadata.metric, - duration=nan + evaluation_metrics=metadata.evaluation_metrics, + optimization_metrics=metadata.optimization_metrics, + 
duration=nan, + result_metrics=[], + result=[], ) required_meta_res = ['training_duration', 'predict_duration', 'models_count'] for m in required_meta_res: @@ -443,21 +446,22 @@ def do_score(m): return score def set_score(score): - entry.metric = score.metric - entry.result = score.value - if score.higher_is_better is False: # if unknown metric, and higher_is_better is None, then no change - entry.metric = f"neg_{entry.metric}" - entry.result = - entry.result + metric = score.metric if score.higher_is_better else f"neg_{score.metric}" + result = score.value if score.higher_is_better else -score.value + entry.result_metrics.append(metric) + entry.result.append(result) - for metric in metadata.metrics or []: + for metric in metadata.evaluation_metrics: sc = do_score(metric) entry[metric] = sc.value - if metric == entry.metric: + if metric in entry.optimization_metrics: set_score(sc) - if 'result' not in entry: - set_score(do_score(entry.metric)) - + entry.result = tuple(entry.result) + entry.result_metrics = tuple(entry.result_metrics) + entry.evaluation_metrics = tuple(entry.evaluation_metrics) + entry.optimization_metrics = tuple(entry.optimization_metrics) + entry.metric = entry.optimization_metrics entry.info = result.info if scoring_errors: entry.info = "; ".join(filter(lambda it: it, [entry.info, *scoring_errors])) From 82e96db16d4f0d4a2f25fb02ef1d8c8fee930d0e Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Wed, 7 Jun 2023 11:49:35 +0200 Subject: [PATCH 3/8] Update scripts to use optimization metrics Though every script only optimizes towards one metric, will ask the framework authors to update accordingly. 
--- examples/custom/extensions/Stacking/exec.py | 4 ++-- frameworks/AutoGluon/exec.py | 4 ++-- frameworks/AutoGluon/exec_ts.py | 4 ++-- frameworks/AutoWEKA/exec.py | 5 +++-- frameworks/GAMA/exec.py | 5 +++-- frameworks/H2OAutoML/exec.py | 4 ++-- frameworks/MLPlan/exec.py | 6 +++--- frameworks/RandomForest/exec.py | 2 +- frameworks/TPOT/exec.py | 5 +++-- frameworks/TunedRandomForest/exec.py | 6 +++++- frameworks/autosklearn/exec.py | 4 ++-- frameworks/flaml/exec.py | 10 ++++++---- frameworks/hyperoptsklearn/exec.py | 6 +++++- frameworks/mljarsupervised/exec.py | 2 +- frameworks/oboe/exec.py | 2 +- 15 files changed, 41 insertions(+), 28 deletions(-) diff --git a/examples/custom/extensions/Stacking/exec.py b/examples/custom/extensions/Stacking/exec.py index d8c80879d..84449550b 100644 --- a/examples/custom/extensions/Stacking/exec.py +++ b/examples/custom/extensions/Stacking/exec.py @@ -33,8 +33,8 @@ def run(dataset, config): estimators_params = {e: config.framework_params.get(f'_{e}_params', {}) for e in ['rf', 'gbm', 'linear', 'svc', 'final']} log.info("Running Sklearn Stacking Ensemble with a maximum time of {}s on {} cores.".format(config.max_runtime_seconds, n_jobs)) - log.warning("We completely ignore the requirement to stay within the time limit.") - log.warning("We completely ignore the advice to optimize towards metric: {}.".format(config.metric)) + log.warning("We ignore the requirement to stay within the time limit.") + log.warning(f"We ignore the advice to optimize for: {config.optimization_metrics}.") if is_classification: diff --git a/frameworks/AutoGluon/exec.py b/frameworks/AutoGluon/exec.py index 6fe76769b..eab3db1e1 100644 --- a/frameworks/AutoGluon/exec.py +++ b/frameworks/AutoGluon/exec.py @@ -38,10 +38,10 @@ def run(dataset, config): rmse=metrics.root_mean_squared_error, ) - perf_metric = metrics_mapping[config.metric] if config.metric in metrics_mapping else None + perf_metric = metrics_mapping.get(config.optimization_metrics[0]) if perf_metric 
is None: # TODO: figure out if we are going to blindly pass metrics through, or if we use a strict mapping - log.warning("Performance metric %s not supported.", config.metric) + log.warning(f"Performance metric {config.optimization_metrics[0]} not supported.") is_classification = config.type == 'classification' training_params = {k: v for k, v in config.framework_params.items() if not k.startswith('_')} diff --git a/frameworks/AutoGluon/exec_ts.py b/frameworks/AutoGluon/exec_ts.py index ab7c4110f..eafdb4652 100644 --- a/frameworks/AutoGluon/exec_ts.py +++ b/frameworks/AutoGluon/exec_ts.py @@ -142,9 +142,9 @@ def get_eval_metric(config): rmse="RMSE", ) - eval_metric = metrics_mapping[config.metric] if config.metric in metrics_mapping else None + eval_metric = metrics_mapping.get(config.optimization_metrics[0]) if eval_metric is None: - log.warning("Performance metric %s not supported.", config.metric) + log.warning(f"Performance metric {config.optimization_metrics[0]} not supported.") return eval_metric diff --git a/frameworks/AutoWEKA/exec.py b/frameworks/AutoWEKA/exec.py index 7b7534e0a..4727e40ae 100644 --- a/frameworks/AutoWEKA/exec.py +++ b/frameworks/AutoWEKA/exec.py @@ -24,9 +24,10 @@ def run(dataset: Dataset, config: TaskConfig): auc='areaUnderROC', logloss='kBInformation' ) - metric = metrics_mapping[config.metric] if config.metric in metrics_mapping else None + metric = metrics_mapping.get(config.optimization_metrics[0]) if metric is None: - raise ValueError("Performance metric {} not supported.".format(config.metric)) + msg = f"Performance metric {config.optimization_metrics[0]} not supported." 
+ raise ValueError(msg) train_file = dataset.train.path test_file = dataset.test.path diff --git a/frameworks/GAMA/exec.py b/frameworks/GAMA/exec.py index e0880bf34..8d5860e03 100644 --- a/frameworks/GAMA/exec.py +++ b/frameworks/GAMA/exec.py @@ -43,9 +43,10 @@ def run(dataset, config): r2='r2', rmse='neg_mean_squared_error', ) - scoring_metric = metrics_mapping[config.metric] if config.metric in metrics_mapping else None + scoring_metric = metrics_mapping.get(config.optimization_metrics[0]) if scoring_metric is None: - raise ValueError("Performance metric {} not supported.".format(config.metric)) + msg = f"Performance metric '{config.optimization_metrics[0]}' not supported." + raise ValueError(msg) training_params = {k: v for k, v in config.framework_params.items() if not k.startswith('_')} n_jobs = config.framework_params.get('_n_jobs', config.cores) # useful to disable multicore, regardless of the dataset config diff --git a/frameworks/H2OAutoML/exec.py b/frameworks/H2OAutoML/exec.py index 1d70b9bb2..9492d9f3e 100644 --- a/frameworks/H2OAutoML/exec.py +++ b/frameworks/H2OAutoML/exec.py @@ -43,10 +43,10 @@ def run(dataset, config): rmse='rmse', rmsle='rmsle' ) - sort_metric = metrics_mapping[config.metric] if config.metric in metrics_mapping else None + sort_metric = metrics_mapping.get(config.optimization_metrics[0]) if sort_metric is None: # TODO: Figure out if we are going to blindly pass metrics through, or if we use a strict mapping - log.warning("Performance metric %s not supported, defaulting to AUTO.", config.metric) + log.warning(f"Performance metric {config.optimization_metrics[0]} not supported, defaulting to AUTO.") try: training_params = {k: v for k, v in config.framework_params.items() if not k.startswith('_')} diff --git a/frameworks/MLPlan/exec.py b/frameworks/MLPlan/exec.py index 839d94d23..2aea045c6 100644 --- a/frameworks/MLPlan/exec.py +++ b/frameworks/MLPlan/exec.py @@ -30,10 +30,10 @@ def run(dataset, config): 
rmsle='ROOT_MEAN_SQUARED_LOGARITHM_ERROR', mae='MEAN_ABSOLUTE_ERROR' ) - - metric = metrics_mapping[config.metric] if config.metric in metrics_mapping else None + metric = metrics_mapping.get(config.optimization_metrics[0]) if metric is None: - raise ValueError('Performance metric {} is not supported.'.format(config.metric)) + msg = f'Performance metric {config.optimization_metrics[0]} is not supported.' + raise ValueError(msg) train_file = dataset.train.path test_file = dataset.test.path diff --git a/frameworks/RandomForest/exec.py b/frameworks/RandomForest/exec.py index 77bdc99ef..193667fb8 100644 --- a/frameworks/RandomForest/exec.py +++ b/frameworks/RandomForest/exec.py @@ -47,7 +47,7 @@ def run(dataset, config): memory_margin = config.framework_params.get('_memory_margin', 0.9) log.info("Running RandomForest with a maximum time of {}s on {} cores.".format(config.max_runtime_seconds, n_jobs)) - log.warning("We completely ignore the advice to optimize towards metric: {}.".format(config.metric)) + log.warning(f"We ignore the advice to optimize for: {config.optimization_metrics}.") estimator = RandomForestClassifier if is_classification else RandomForestRegressor rf = estimator(n_jobs=n_jobs, diff --git a/frameworks/TPOT/exec.py b/frameworks/TPOT/exec.py index ce70cb7f9..5766f76c6 100644 --- a/frameworks/TPOT/exec.py +++ b/frameworks/TPOT/exec.py @@ -36,9 +36,10 @@ def run(dataset, config): r2='r2', rmse='neg_mean_squared_error', # TPOT can score on mse, as app computes rmse independently on predictions ) - scoring_metric = metrics_mapping[config.metric] if config.metric in metrics_mapping else None + scoring_metric = metrics_mapping.get(config.optimization_metrics[0]) if scoring_metric is None: - raise ValueError("Performance metric {} not supported.".format(config.metric)) + msg = f"Performance metric {config.optimization_metrics[0]} not supported." 
+ raise ValueError(msg) X_train = dataset.train.X y_train = dataset.train.y diff --git a/frameworks/TunedRandomForest/exec.py b/frameworks/TunedRandomForest/exec.py index 7c7a7dc15..897636dd5 100644 --- a/frameworks/TunedRandomForest/exec.py +++ b/frameworks/TunedRandomForest/exec.py @@ -76,7 +76,11 @@ def run(dataset, config): mse='neg_mean_squared_error', r2='r2', rmse='neg_root_mean_squared_error', - )[config.metric] + ).get(config.optimization_metrics[0]) + + if not metric: + msg = f"TunedRandomForest doesn't support {config.optimization_metrics[0]}" + raise ValueError(msg) n_features = X_train.shape[1] default_value = max(1, int(math.sqrt(n_features))) diff --git a/frameworks/autosklearn/exec.py b/frameworks/autosklearn/exec.py index 11690e09c..b612c8f6f 100644 --- a/frameworks/autosklearn/exec.py +++ b/frameworks/autosklearn/exec.py @@ -45,10 +45,10 @@ def run(dataset, config): rmse=metrics.mean_squared_error if askl_version < version.parse("0.10") else metrics.root_mean_squared_error, r2=metrics.r2 ) - perf_metric = metrics_mapping[config.metric] if config.metric in metrics_mapping else None + perf_metric = metrics_mapping.get(config.optimization_metrics[0]) if perf_metric is None: # TODO: figure out if we are going to blindly pass metrics through, or if we use a strict mapping - log.warning("Performance metric %s not supported.", config.metric) + log.warning(f"Performance metric {config.optimization_metrics[0]} not supported.") # Set resources based on datasize log.info( diff --git a/frameworks/flaml/exec.py b/frameworks/flaml/exec.py index a8a5131af..e458c3faa 100644 --- a/frameworks/flaml/exec.py +++ b/frameworks/flaml/exec.py @@ -32,10 +32,12 @@ def run(dataset, config): rmse='rmse', r2='r2', ) - perf_metric = metrics_mapping[ - config.metric] if config.metric in metrics_mapping else 'auto' - if perf_metric is None: - log.warning("Performance metric %s not supported.", config.metric) + perf_metric = metrics_mapping.get(config.optimization_metrics[0], 
'auto') + if perf_metric == 'auto' and config.optimization_metrics[0] != 'auto': + log.warning( + f"Performance metric '{config.optimization_metrics[0]}' not supported, " + f"using metric='auto' instead.", + ) training_params = {k: v for k, v in config.framework_params.items() if not k.startswith('_')} diff --git a/frameworks/hyperoptsklearn/exec.py b/frameworks/hyperoptsklearn/exec.py index c4179cae1..44857707b 100644 --- a/frameworks/hyperoptsklearn/exec.py +++ b/frameworks/hyperoptsklearn/exec.py @@ -36,7 +36,11 @@ def run(dataset, config): r2=(default, False), # lambda y, pred: 1.0 - r2_score(y, pred) rmse=(mean_squared_error, False), ) - loss_fn, continuous_loss_fn = metrics_to_loss_mapping[config.metric] if config.metric in metrics_to_loss_mapping else (None, False) + + loss_fn, continuous_loss_fn = metrics_to_loss_mapping.get( + config.optimization_metrics[0], + (None, False) + ) if loss_fn is None: log.warning("Performance metric %s not supported: defaulting to %s.", config.metric, 'accuracy' if is_classification else 'r2') diff --git a/frameworks/mljarsupervised/exec.py b/frameworks/mljarsupervised/exec.py index 653d9cfd6..8ad65815a 100644 --- a/frameworks/mljarsupervised/exec.py +++ b/frameworks/mljarsupervised/exec.py @@ -24,7 +24,7 @@ def run(dataset, config): logloss='logloss', rmse='rmse' ) - eval_metric = metrics_mapping[config.metric] if config.metric in metrics_mapping else "auto" + eval_metric = metrics_mapping.get(config.optimization_metrics[0], 'auto') # Mapping of benchmark task to MLJAR ML task problem_mapping = dict( diff --git a/frameworks/oboe/exec.py b/frameworks/oboe/exec.py index bed388dc7..f47e43941 100644 --- a/frameworks/oboe/exec.py +++ b/frameworks/oboe/exec.py @@ -66,7 +66,7 @@ def run(dataset, config): n_cores = config.framework_params.get('_n_cores', config.cores) log.info('Running oboe with a maximum time of {}s on {} cores.'.format(config.max_runtime_seconds, n_cores)) - log.warning('We completely ignore the advice to optimize 
towards metric: {}.'.format(config.metric)) + log.warning(f'We ignore the advice to optimize for: {config.optimization_metrics[0]}.') aml = AutoLearner(p_type='classification' if is_classification else 'regression', n_cores=n_cores, From e71bfab637ff912867d7affd54269b3f0ccaa89f Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Wed, 7 Jun 2023 12:47:08 +0200 Subject: [PATCH 4/8] Change the default config to use optimization and evaluation metrics --- amlb/benchmark.py | 7 +++---- resources/config.yaml | 18 +++++++++++------- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/amlb/benchmark.py b/amlb/benchmark.py index 0f3ce8115..a6bdc51bf 100644 --- a/amlb/benchmark.py +++ b/amlb/benchmark.py @@ -418,13 +418,12 @@ def __init__(self, *, name, fold, seed, @property def evaluation_metrics(self) -> list[str]: - return self.optimization_metrics + self._evaluation_metrics + return list(set(self.optimization_metrics) | set(self._evaluation_metrics)) def load_default_metrics(self, *, dataset_type: str): """ Sets `optimization/evaluation_metrics` based on defaults from config.yaml""" - metrics = as_list(rconfig().benchmarks.metrics[dataset_type]) - self.optimization_metrics = metrics[:1] - self._evaluation_metrics = metrics[1:] + self.optimization_metrics = as_list(rconfig().benchmarks.optimization_metrics[dataset_type]) + self._evaluation_metrics = as_list(rconfig().benchmarks.evaluation_metrics[dataset_type]) def __setattr__(self, name, value): if name == 'max_runtime_seconds': diff --git a/resources/config.yaml b/resources/config.yaml index 4daf8087b..ca9ec6503 100644 --- a/resources/config.yaml +++ b/resources/config.yaml @@ -48,13 +48,17 @@ benchmarks: # configuration namespace for the benchmarks def os_mem_size_mb: 2048 # the default amount of memory left to the OS when task assigned memory is computed automatically. os_vol_size_mb: 2048 # the default amount of volume left to the OS when task volume memory is verified. 
overhead_time_seconds: 3600 # amount of additional time allowed for the job to complete before sending an interruption signal - metrics: # default metrics by dataset type (as listed by amlb.data.DatasetType), - # only the first metric is optimized by the frameworks, - # the others are computed only for information purpose. - binary: ['auc', 'logloss', 'acc', 'balacc'] # available metrics: auc (AUC), acc (Accuracy), balacc (Balanced Accuracy), pr_auc (Precision Recall AUC), logloss (Log Loss), f1, f2, f05 (F-beta scores with beta=1, 2, or 0.5), max_pce, mean_pce (Max/Mean Per-Class Error). - multiclass: ['logloss', 'acc', 'balacc'] # available metrics: same as for binary, except auc, replaced by auc_ovo (AUC One-vs-One), auc_ovr (AUC One-vs-Rest). AUC metrics and F-beta metrics are computed with weighted average. - regression: ['rmse', 'r2', 'mae'] # available metrics: mae (Mean Absolute Error), mse (Mean Squared Error), msle (Mean Squared Logarithmic Error), rmse (Root Mean Square Error), rmsle (Root Mean Square Logarithmic Error), r2 (R^2). - timeseries: ['mase', 'mape', 'smape', 'rmse', 'mse', 'nrmse', 'wape', 'ncrps'] + optimization_metrics: # default metrics to optimize for by dataset type (as listed by amlb.data.DatasetType) + binary: ['auc'] # available metrics: auc (AUC), acc (Accuracy), balacc (Balanced Accuracy), pr_auc (Precision Recall AUC), logloss (Log Loss), f1, f2, f05 (F-beta scores with beta=1, 2, or 0.5), max_pce, mean_pce (Max/Mean Per-Class Error). + multiclass: ['logloss'] # available metrics: same as for binary, except auc, replaced by auc_ovo (AUC One-vs-One), auc_ovr (AUC One-vs-Rest). AUC metrics and F-beta metrics are computed with weighted average. + regression: ['rmse'] # available metrics: mae (Mean Absolute Error), mse (Mean Squared Error), msle (Mean Squared Logarithmic Error), rmse (Root Mean Square Error), rmsle (Root Mean Square Logarithmic Error), r2 (R^2). 
+ timeseries: ['mase'] + evaluation_metrics: # default metrics to report by dataset type (as listed by amlb.data.DatasetType) + binary: ['logloss', 'acc', 'balacc'] # available metrics are identical to those described for `optimization_metrics`. + multiclass: ['acc', 'balacc'] # any `optimization_metric` is also reported on, duplicating it here is not necessary. + regression: ['r2', 'mae'] + timeseries: ['mape', 'smape', 'rmse', 'mse', 'nrmse', 'wape', 'ncrps'] + defaults: # the default constraints, usually overridden by a constraint. folds: 10 # the amount of fold-runs executed for each dataset. max_runtime_seconds: 3600 # default time allocated to the framework to train a model. From d07c7571f349e328c80492e56ac4d41b14f74923 Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Wed, 7 Jun 2023 12:53:18 +0200 Subject: [PATCH 5/8] Return copy of dict instead of modifying self.__dict__ --- amlb/benchmark.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/amlb/benchmark.py b/amlb/benchmark.py index a6bdc51bf..c504928b7 100644 --- a/amlb/benchmark.py +++ b/amlb/benchmark.py @@ -434,9 +434,7 @@ def __setattr__(self, name, value): super().__setattr__(name, value) def __json__(self): - d = self.__dict__ - d["evaluation_metrics"] = self.evaluation_metrics - return d + return self.__dict__ | {"evaluation_metrics": self.evaluation_metrics} def __repr__(self): return repr_def(self) From 53472a43f8437c2abeded7d5d3da3b64c8f3b5cb Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Wed, 7 Jun 2023 15:50:34 +0200 Subject: [PATCH 6/8] Remove index_cols as it was not used --- amlb/results.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/amlb/results.py b/amlb/results.py index 7504db7af..22bc6b0b7 100644 --- a/amlb/results.py +++ b/amlb/results.py @@ -130,8 +130,6 @@ def __init__(self, scores=None, framework_name=None, benchmark_name=None, task_n @cached def as_data_frame(self): - # index = ['task', 'framework', 'fold'] - index = [] df = (self.scores 
if is_data_frame(self.scores) else to_data_frame([dict(sc) for sc in self.scores])) if df.empty: @@ -139,14 +137,12 @@ def as_data_frame(self): return df fixed_cols = ['id', 'task', 'framework', 'constraint', 'fold', 'type', 'result', 'metric', 'mode', 'version', 'params', 'app_version', 'utc', 'duration', 'training_duration', 'predict_duration', 'models_count', 'seed', 'info'] - fixed_cols = [col for col in fixed_cols if col not in index] metrics_cols = [col for col in df.columns if (col in dir(ClassificationResult) or col in dir(RegressionResult)) and not col.startswith('_')] metrics_cols.sort() dynamic_cols = [col for col in df.columns - if col not in index - and col not in fixed_cols + if col not in fixed_cols and col not in metrics_cols] dynamic_cols.sort() df = df.reindex(columns=[]+fixed_cols+metrics_cols+dynamic_cols) From f7989407c8e7a0cc566c38cf9e192cf597381ead Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Wed, 7 Jun 2023 16:28:24 +0200 Subject: [PATCH 7/8] Use metadata to identify metrics in centralized way --- amlb/results.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/amlb/results.py b/amlb/results.py index 22bc6b0b7..46faeabdc 100644 --- a/amlb/results.py +++ b/amlb/results.py @@ -2,6 +2,7 @@ **results** module provides the logic to format, save and read predictions generated by the *automl frameworks* (cf. ``TaskResult``), as well as logic to compute, format, save, read and merge scores obtained from those predictions (cf. ``Result`` and ``Scoreboard``). 
""" +import inspect from functools import partial import collections import io @@ -10,7 +11,7 @@ import os import re import statistics -from typing import Union +from typing import Union, Callable import numpy as np from numpy import nan, sort @@ -137,13 +138,13 @@ def as_data_frame(self): return df fixed_cols = ['id', 'task', 'framework', 'constraint', 'fold', 'type', 'result', 'metric', 'mode', 'version', 'params', 'app_version', 'utc', 'duration', 'training_duration', 'predict_duration', 'models_count', 'seed', 'info'] - metrics_cols = [col for col in df.columns - if (col in dir(ClassificationResult) or col in dir(RegressionResult)) - and not col.startswith('_')] + metrics_cols = [ + col for col in df.columns + if col in ClassificationResult.metrics() + RegressionResult.metrics() + ] metrics_cols.sort() dynamic_cols = [col for col in df.columns - if col not in fixed_cols - and col not in metrics_cols] + if col not in fixed_cols + metrics_cols] dynamic_cols.sort() df = df.reindex(columns=[]+fixed_cols+metrics_cols+dynamic_cols) log.debug("Scores columns: %s.", df.columns) @@ -501,6 +502,14 @@ def evaluate(self, metric): eval_res += Namespace(value=nan, higher_is_better=None, message=f"Unsupported metric `{metric}` for {pb_type} problems") return eval_res + @classmethod + def metrics(cls) -> list[str]: + def has_metric_metadata(fn: Callable) -> bool: + return get_metadata(fn, "higher_is_better") is not None + return [ + name for name, _ in inspect.getmembers(cls, predicate=has_metric_metadata) + ] + class NoResult(Result): From f793472021ac683ef10cb44621afce671799ef49 Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Fri, 9 Jun 2023 11:37:50 +0200 Subject: [PATCH 8/8] Rewrite to avoid tuples in dataframe/csv The `metric` column is replaced by `optimization_metrics`, a comma-separated field listing each metric. 
We removed the `result` column, since it would otherwise contain tuples and its data duplicates the individual metric columns (except for the convenience of having a 'higher is better' column, which we now lose). --- amlb/results.py | 40 +++++++++++----------------------------- 1 file changed, 11 insertions(+), 29 deletions(-) diff --git a/amlb/results.py b/amlb/results.py index 46faeabdc..4ed79972b 100644 --- a/amlb/results.py +++ b/amlb/results.py @@ -3,7 +3,6 @@ as well as logic to compute, format, save, read and merge scores obtained from those predictions (cf. ``Result`` and ``Scoreboard``). """ import inspect -from functools import partial import collections import io import logging @@ -136,7 +135,7 @@ def as_data_frame(self): if df.empty: # avoid dtype conversions during reindexing on empty frame return df - fixed_cols = ['id', 'task', 'framework', 'constraint', 'fold', 'type', 'result', 'metric', 'mode', 'version', + fixed_cols = ['id', 'task', 'framework', 'constraint', 'fold', 'type', 'optimization_metrics', 'mode', 'version', 'params', 'app_version', 'utc', 'duration', 'training_duration', 'predict_duration', 'models_count', 'seed', 'info'] metrics_cols = [ col for col in df.columns @@ -171,9 +170,12 @@ def as_printable_data_frame(self, verbosity=3): for col in high_precision_float_cols: df[col] = df[col].map("{:.6g}".format).astype(float) + unique_metrics = (set(metrics.split(",")) for metrics in df['optimization_metrics'].unique()) + optimized_metrics = set.union(*unique_metrics) + cols = ([] if verbosity == 0 - else ['task', 'fold', 'framework', 'constraint', 'result', 'metric', 'info'] if verbosity == 1 - else ['id', 'task', 'fold', 'framework', 'constraint', 'result', 'metric', + else ['task', 'fold', 'framework', 'constraint', *optimized_metrics, 'optimization_metrics', 'info'] if verbosity == 1 + else ['id', 'task', 'fold', 'framework', 'constraint', *optimized_metrics, 'optimization_metrics', 'duration', 'seed', 'info'] if verbosity == 2 
else slice(None)) return df.loc[:, cols] @@ -423,11 +425,8 @@ def compute_score(self, result=None, meta_result=None): seed=metadata.seed, app_version=rget().app_version, utc=datetime_iso(), - evaluation_metrics=metadata.evaluation_metrics, optimization_metrics=metadata.optimization_metrics, duration=nan, - result_metrics=[], - result=[], ) required_meta_res = ['training_duration', 'predict_duration', 'models_count'] for m in required_meta_res: @@ -435,30 +434,13 @@ def compute_score(self, result=None, meta_result=None): result = self.get_result() if result is None else result scoring_errors = [] - - def do_score(m): - score = result.evaluate(m) + for metric_ in metadata.evaluation_metrics: + score = result.evaluate(metric_) if 'message' in score: scoring_errors.append(score.message) - return score - - def set_score(score): - metric = score.metric if score.higher_is_better else f"neg_{score.metric}" - result = score.value if score.higher_is_better else -score.value - entry.result_metrics.append(metric) - entry.result.append(result) - - for metric in metadata.evaluation_metrics: - sc = do_score(metric) - entry[metric] = sc.value - if metric in entry.optimization_metrics: - set_score(sc) - - entry.result = tuple(entry.result) - entry.result_metrics = tuple(entry.result_metrics) - entry.evaluation_metrics = tuple(entry.evaluation_metrics) - entry.optimization_metrics = tuple(entry.optimization_metrics) - entry.metric = entry.optimization_metrics + entry[metric_] = score.value + + entry.optimization_metrics = ','.join(entry.optimization_metrics) entry.info = result.info if scoring_errors: entry.info = "; ".join(filter(lambda it: it, [entry.info, *scoring_errors]))