From 14b4ab728ab57f2ac5d283554710288efc20a440 Mon Sep 17 00:00:00 2001
From: PGijsbers <p.gijsbers@tue.nl>
Date: Sat, 3 Jun 2023 17:55:00 +0200
Subject: [PATCH 1/8] Start multi-objective optimization and be explicit about
 opt vs eval

Optimization metrics are those the AutoML framework should optimize for,
and evaluation metrics are those the final model is evaluated on.
Optimization metrics are automatically also used as evaluation metrics;
`evaluation_metrics` only needs to list any additional metrics to report.
---
 amlb/benchmark.py | 63 +++++++++++++++++++++++++++++++++++++----------
 1 file changed, 50 insertions(+), 13 deletions(-)

diff --git a/amlb/benchmark.py b/amlb/benchmark.py
index 7c54a344c..27a1da78a 100644
--- a/amlb/benchmark.py
+++ b/amlb/benchmark.py
@@ -24,9 +24,10 @@
 from .datautils import read_csv
 from .resources import get as rget, config as rconfig, output_dirs as routput_dirs
 from .results import ErrorResult, Scoreboard, TaskResult
-from .utils import Namespace as ns, OSMonitoring, as_list, datetime_iso, flatten, json_dump, lazy_property, profile, repr_def, \
-    run_cmd, run_script, signal_handler, str2bool, str_sanitize, system_cores, system_memory_mb, system_volume_mb, touch
-
+from .utils import Namespace as ns, OSMonitoring, as_list, datetime_iso, flatten, \
+    json_dump, lazy_property, profile, repr_def, \
+    run_cmd, run_script, signal_handler, str2bool, str_sanitize, system_cores, \
+    system_memory_mb, system_volume_mb, touch, Namespace
 
 log = logging.getLogger(__name__)
 
@@ -371,9 +372,33 @@ def _is_task_enabled(task_def):
 
 class TaskConfig:
 
-    def __init__(self, name, fold, metrics, seed,
+    def __init__(self, *, name, fold, seed,
                  max_runtime_seconds, cores, max_mem_size_mb, min_vol_size_mb,
-                 input_dir, output_dir):
+                 input_dir, output_dir,
+                 metrics: Union[list[str], str, None] = None,
+                 optimization_metrics: Union[list[str], str, None] = None,
+                 evaluation_metrics: Union[list[str], str, None] = None,
+                 ):
+
+        if metrics:
+            log.warning(
+                "WARNING: The `metric` field of the task definition is deprecated"
+                " and will not work in the future. Please specify the metric(s) to "
+                "optimize for with `optimization_metrics` and any additional metric(s) "
+                "used only for evaluation in `evaluation_metrics`."
+            )
+            if optimization_metrics:
+                raise ValueError(
+                    "Detected both `metric` and `optimization_metrics` for task "
+                    f"'{name}'. Aborting because desired setup is unclear."
+                    "Please only use `optimization_metrics`."
+                )
+            optimization_metrics = as_list(metrics)[:1]
+            evaluation_metrics = as_list(metrics)[1:]
+
+        self.optimization_metrics = optimization_metrics or []
+        self._evaluation_metrics = evaluation_metrics or []
+
         self.framework = None
         self.framework_params = None
         self.framework_version = None
@@ -391,12 +416,22 @@ def __init__(self, name, fold, metrics, seed,
         self.output_predictions_file = os.path.join(output_dir, "predictions.csv")
         self.ext = ns()  # used if frameworks require extra config points
 
+    @property
+    def evaluation_metrics(self) -> list[str]:
+        return self.optimization_metrics + self._evaluation_metrics
+
+    def load_default_metrics(self, *, dataset_type: str):
+        """ Sets `optimization/evaluation_metrics` based on defaults from config.yaml"""
+        metrics = as_list(rconfig().benchmarks.metrics[dataset_type])
+        self.optimization_metrics = metrics[:1]
+        self._evaluation_metrics = metrics[1:]
+
     def __setattr__(self, name, value):
-        if name == 'metrics':
-            self.metric = value[0] if isinstance(value, list) else value
-        elif name == 'max_runtime_seconds':
-            self.job_timeout_seconds = min(value * 2,
-                                           value + rconfig().benchmarks.overhead_time_seconds)
+        if name == 'max_runtime_seconds':
+            self.job_timeout_seconds = min(
+                value * 2,
+                value + rconfig().benchmarks.overhead_time_seconds
+            )
         super().__setattr__(name, value)
 
     def __json__(self):
@@ -458,10 +493,13 @@ def __init__(self, benchmark: Benchmark, task_def, fold):
         self.benchmark = benchmark
         self._task_def = task_def
         self.fold = fold
+
         self.task_config = TaskConfig(
             name=task_def.name,
             fold=fold,
             metrics=task_def.metric,
+            optimization_metrics=Namespace.get(task_def, "optimization_metrics"),
+            evaluation_metrics=Namespace.get(task_def, "evaluation_metrics"),
             seed=rget().seed(fold),
             max_runtime_seconds=task_def.max_runtime_seconds,
             cores=task_def.cores,
@@ -542,9 +580,8 @@ def run(self):
         task_config.output_predictions_file = results._predictions_file
         task_config.output_metadata_file = results._metadata_file
         touch(os.path.dirname(task_config.output_predictions_file), as_dir=True)
-        if task_config.metrics is None:
-            task_config.metrics = as_list(rconfig().benchmarks.metrics[self._dataset.type.name])
-            task_config.metric = task_config.metrics[0]
+        if not task_config.optimization_metrics:
+            task_config.load_default_metrics(dataset_type=self._dataset.type.name)
 
         result = meta_result = None
         try:

From aec3764ce866dbd82948cbca77967cd3d67e6345 Mon Sep 17 00:00:00 2001
From: PGijsbers <p.gijsbers@tue.nl>
Date: Sat, 3 Jun 2023 21:53:16 +0200
Subject: [PATCH 2/8] Add support for multiple metrics in evaluation phase

---
 amlb/benchmark.py |  4 +++-
 amlb/results.py   | 28 ++++++++++++++++------------
 2 files changed, 19 insertions(+), 13 deletions(-)

diff --git a/amlb/benchmark.py b/amlb/benchmark.py
index 27a1da78a..0f3ce8115 100644
--- a/amlb/benchmark.py
+++ b/amlb/benchmark.py
@@ -435,7 +435,9 @@ def __setattr__(self, name, value):
         super().__setattr__(name, value)
 
     def __json__(self):
-        return self.__dict__
+        d = self.__dict__
+        d["evaluation_metrics"] = self.evaluation_metrics
+        return d
 
     def __repr__(self):
         return repr_def(self)
diff --git a/amlb/results.py b/amlb/results.py
index aaceb9fc8..7504db7af 100644
--- a/amlb/results.py
+++ b/amlb/results.py
@@ -426,8 +426,11 @@ def compute_score(self, result=None, meta_result=None):
             seed=metadata.seed,
             app_version=rget().app_version,
             utc=datetime_iso(),
-            metric=metadata.metric,
-            duration=nan
+            evaluation_metrics=metadata.evaluation_metrics,
+            optimization_metrics=metadata.optimization_metrics,
+            duration=nan,
+            result_metrics=[],
+            result=[],
         )
         required_meta_res = ['training_duration', 'predict_duration', 'models_count']
         for m in required_meta_res:
@@ -443,21 +446,22 @@ def do_score(m):
             return score
 
         def set_score(score):
-            entry.metric = score.metric
-            entry.result = score.value
-            if score.higher_is_better is False:  # if unknown metric, and higher_is_better is None, then no change
-                entry.metric = f"neg_{entry.metric}"
-                entry.result = - entry.result
+            metric = score.metric if score.higher_is_better else f"neg_{score.metric}"
+            result = score.value if score.higher_is_better else -score.value
+            entry.result_metrics.append(metric)
+            entry.result.append(result)
 
-        for metric in metadata.metrics or []:
+        for metric in metadata.evaluation_metrics:
             sc = do_score(metric)
             entry[metric] = sc.value
-            if metric == entry.metric:
+            if metric in entry.optimization_metrics:
                 set_score(sc)
 
-        if 'result' not in entry:
-            set_score(do_score(entry.metric))
-
+        entry.result = tuple(entry.result)
+        entry.result_metrics = tuple(entry.result_metrics)
+        entry.evaluation_metrics = tuple(entry.evaluation_metrics)
+        entry.optimization_metrics = tuple(entry.optimization_metrics)
+        entry.metric = entry.optimization_metrics
         entry.info = result.info
         if scoring_errors:
             entry.info = "; ".join(filter(lambda it: it, [entry.info, *scoring_errors]))

From 82e96db16d4f0d4a2f25fb02ef1d8c8fee930d0e Mon Sep 17 00:00:00 2001
From: PGijsbers <p.gijsbers@tue.nl>
Date: Wed, 7 Jun 2023 11:49:35 +0200
Subject: [PATCH 3/8] Update scripts to use optimization metrics

For now every script only optimizes towards the first optimization metric;
we will ask the framework authors to update accordingly.
---
 examples/custom/extensions/Stacking/exec.py |  4 ++--
 frameworks/AutoGluon/exec.py                |  4 ++--
 frameworks/AutoGluon/exec_ts.py             |  4 ++--
 frameworks/AutoWEKA/exec.py                 |  5 +++--
 frameworks/GAMA/exec.py                     |  5 +++--
 frameworks/H2OAutoML/exec.py                |  4 ++--
 frameworks/MLPlan/exec.py                   |  6 +++---
 frameworks/RandomForest/exec.py             |  2 +-
 frameworks/TPOT/exec.py                     |  5 +++--
 frameworks/TunedRandomForest/exec.py        |  6 +++++-
 frameworks/autosklearn/exec.py              |  4 ++--
 frameworks/flaml/exec.py                    | 10 ++++++----
 frameworks/hyperoptsklearn/exec.py          |  6 +++++-
 frameworks/mljarsupervised/exec.py          |  2 +-
 frameworks/oboe/exec.py                     |  2 +-
 15 files changed, 41 insertions(+), 28 deletions(-)

diff --git a/examples/custom/extensions/Stacking/exec.py b/examples/custom/extensions/Stacking/exec.py
index d8c80879d..84449550b 100644
--- a/examples/custom/extensions/Stacking/exec.py
+++ b/examples/custom/extensions/Stacking/exec.py
@@ -33,8 +33,8 @@ def run(dataset, config):
     estimators_params = {e: config.framework_params.get(f'_{e}_params', {}) for e in ['rf', 'gbm', 'linear', 'svc', 'final']}
 
     log.info("Running Sklearn Stacking Ensemble with a maximum time of {}s on {} cores.".format(config.max_runtime_seconds, n_jobs))
-    log.warning("We completely ignore the requirement to stay within the time limit.")
-    log.warning("We completely ignore the advice to optimize towards metric: {}.".format(config.metric))
+    log.warning("We ignore the requirement to stay within the time limit.")
+    log.warning(f"We ignore the advice to optimize for: {config.optimization_metrics}.")
 
 
     if is_classification:
diff --git a/frameworks/AutoGluon/exec.py b/frameworks/AutoGluon/exec.py
index 6fe76769b..eab3db1e1 100644
--- a/frameworks/AutoGluon/exec.py
+++ b/frameworks/AutoGluon/exec.py
@@ -38,10 +38,10 @@ def run(dataset, config):
         rmse=metrics.root_mean_squared_error,
     )
 
-    perf_metric = metrics_mapping[config.metric] if config.metric in metrics_mapping else None
+    perf_metric = metrics_mapping.get(config.optimization_metrics[0])
     if perf_metric is None:
         # TODO: figure out if we are going to blindly pass metrics through, or if we use a strict mapping
-        log.warning("Performance metric %s not supported.", config.metric)
+        log.warning(f"Performance metric {config.optimization_metrics[0]} not supported.")
 
     is_classification = config.type == 'classification'
     training_params = {k: v for k, v in config.framework_params.items() if not k.startswith('_')}
diff --git a/frameworks/AutoGluon/exec_ts.py b/frameworks/AutoGluon/exec_ts.py
index ab7c4110f..eafdb4652 100644
--- a/frameworks/AutoGluon/exec_ts.py
+++ b/frameworks/AutoGluon/exec_ts.py
@@ -142,9 +142,9 @@ def get_eval_metric(config):
         rmse="RMSE",
     )
 
-    eval_metric = metrics_mapping[config.metric] if config.metric in metrics_mapping else None
+    eval_metric = metrics_mapping.get(config.optimization_metrics[0])
     if eval_metric is None:
-        log.warning("Performance metric %s not supported.", config.metric)
+        log.warning(f"Performance metric {config.optimization_metrics[0]} not supported.")
     return eval_metric
 
 
diff --git a/frameworks/AutoWEKA/exec.py b/frameworks/AutoWEKA/exec.py
index 7b7534e0a..4727e40ae 100644
--- a/frameworks/AutoWEKA/exec.py
+++ b/frameworks/AutoWEKA/exec.py
@@ -24,9 +24,10 @@ def run(dataset: Dataset, config: TaskConfig):
         auc='areaUnderROC',
         logloss='kBInformation'
     )
-    metric = metrics_mapping[config.metric] if config.metric in metrics_mapping else None
+    metric = metrics_mapping.get(config.optimization_metrics[0])
     if metric is None:
-        raise ValueError("Performance metric {} not supported.".format(config.metric))
+        msg = f"Performance metric {config.optimization_metrics[0]} not supported."
+        raise ValueError(msg)
 
     train_file = dataset.train.path
     test_file = dataset.test.path
diff --git a/frameworks/GAMA/exec.py b/frameworks/GAMA/exec.py
index e0880bf34..8d5860e03 100644
--- a/frameworks/GAMA/exec.py
+++ b/frameworks/GAMA/exec.py
@@ -43,9 +43,10 @@ def run(dataset, config):
         r2='r2',
         rmse='neg_mean_squared_error',
     )
-    scoring_metric = metrics_mapping[config.metric] if config.metric in metrics_mapping else None
+    scoring_metric = metrics_mapping.get(config.optimization_metrics[0])
     if scoring_metric is None:
-        raise ValueError("Performance metric {} not supported.".format(config.metric))
+        msg = f"Performance metric '{config.optimization_metrics[0]}' not supported."
+        raise ValueError(msg)
 
     training_params = {k: v for k, v in config.framework_params.items() if not k.startswith('_')}
     n_jobs = config.framework_params.get('_n_jobs', config.cores)  # useful to disable multicore, regardless of the dataset config
diff --git a/frameworks/H2OAutoML/exec.py b/frameworks/H2OAutoML/exec.py
index 1d70b9bb2..9492d9f3e 100644
--- a/frameworks/H2OAutoML/exec.py
+++ b/frameworks/H2OAutoML/exec.py
@@ -43,10 +43,10 @@ def run(dataset, config):
         rmse='rmse',
         rmsle='rmsle'
     )
-    sort_metric = metrics_mapping[config.metric] if config.metric in metrics_mapping else None
+    sort_metric = metrics_mapping.get(config.optimization_metrics[0])
     if sort_metric is None:
         # TODO: Figure out if we are going to blindly pass metrics through, or if we use a strict mapping
-        log.warning("Performance metric %s not supported, defaulting to AUTO.", config.metric)
+        log.warning(f"Performance metric {config.optimization_metrics[0]} not supported, defaulting to AUTO.")
 
     try:
         training_params = {k: v for k, v in config.framework_params.items() if not k.startswith('_')}
diff --git a/frameworks/MLPlan/exec.py b/frameworks/MLPlan/exec.py
index 839d94d23..2aea045c6 100644
--- a/frameworks/MLPlan/exec.py
+++ b/frameworks/MLPlan/exec.py
@@ -30,10 +30,10 @@ def run(dataset, config):
         rmsle='ROOT_MEAN_SQUARED_LOGARITHM_ERROR',
         mae='MEAN_ABSOLUTE_ERROR'
     )
-
-    metric = metrics_mapping[config.metric] if config.metric in metrics_mapping else None
+    metric = metrics_mapping.get(config.optimization_metrics[0])
     if metric is None:
-        raise ValueError('Performance metric {} is not supported.'.format(config.metric))
+        msg = f'Performance metric {config.optimization_metrics[0]} is not supported.'
+        raise ValueError(msg)
     
     train_file = dataset.train.path
     test_file = dataset.test.path
diff --git a/frameworks/RandomForest/exec.py b/frameworks/RandomForest/exec.py
index 77bdc99ef..193667fb8 100644
--- a/frameworks/RandomForest/exec.py
+++ b/frameworks/RandomForest/exec.py
@@ -47,7 +47,7 @@ def run(dataset, config):
     memory_margin = config.framework_params.get('_memory_margin', 0.9)
 
     log.info("Running RandomForest with a maximum time of {}s on {} cores.".format(config.max_runtime_seconds, n_jobs))
-    log.warning("We completely ignore the advice to optimize towards metric: {}.".format(config.metric))
+    log.warning(f"We ignore the advice to optimize for: {config.optimization_metrics}.")
 
     estimator = RandomForestClassifier if is_classification else RandomForestRegressor
     rf = estimator(n_jobs=n_jobs,
diff --git a/frameworks/TPOT/exec.py b/frameworks/TPOT/exec.py
index ce70cb7f9..5766f76c6 100644
--- a/frameworks/TPOT/exec.py
+++ b/frameworks/TPOT/exec.py
@@ -36,9 +36,10 @@ def run(dataset, config):
         r2='r2',
         rmse='neg_mean_squared_error',  # TPOT can score on mse, as app computes rmse independently on predictions
     )
-    scoring_metric = metrics_mapping[config.metric] if config.metric in metrics_mapping else None
+    scoring_metric = metrics_mapping.get(config.optimization_metrics[0])
     if scoring_metric is None:
-        raise ValueError("Performance metric {} not supported.".format(config.metric))
+        msg = f"Performance metric {config.optimization_metrics[0]} not supported."
+        raise ValueError(msg)
 
     X_train = dataset.train.X
     y_train = dataset.train.y
diff --git a/frameworks/TunedRandomForest/exec.py b/frameworks/TunedRandomForest/exec.py
index 7c7a7dc15..897636dd5 100644
--- a/frameworks/TunedRandomForest/exec.py
+++ b/frameworks/TunedRandomForest/exec.py
@@ -76,7 +76,11 @@ def run(dataset, config):
         mse='neg_mean_squared_error',
         r2='r2',
         rmse='neg_root_mean_squared_error',
-    )[config.metric]
+    ).get(config.optimization_metrics[0])
+
+    if not metric:
+        msg = f"TunedRandomForest doesn't support {config.optimization_metrics[0]}"
+        raise ValueError(msg)
 
     n_features = X_train.shape[1]
     default_value = max(1, int(math.sqrt(n_features)))
diff --git a/frameworks/autosklearn/exec.py b/frameworks/autosklearn/exec.py
index 11690e09c..b612c8f6f 100644
--- a/frameworks/autosklearn/exec.py
+++ b/frameworks/autosklearn/exec.py
@@ -45,10 +45,10 @@ def run(dataset, config):
         rmse=metrics.mean_squared_error if askl_version < version.parse("0.10") else metrics.root_mean_squared_error,
         r2=metrics.r2
     )
-    perf_metric = metrics_mapping[config.metric] if config.metric in metrics_mapping else None
+    perf_metric = metrics_mapping.get(config.optimization_metrics[0])
     if perf_metric is None:
         # TODO: figure out if we are going to blindly pass metrics through, or if we use a strict mapping
-        log.warning("Performance metric %s not supported.", config.metric)
+        log.warning(f"Performance metric {config.optimization_metrics[0]} not supported.")
 
     # Set resources based on datasize
     log.info(
diff --git a/frameworks/flaml/exec.py b/frameworks/flaml/exec.py
index a8a5131af..e458c3faa 100644
--- a/frameworks/flaml/exec.py
+++ b/frameworks/flaml/exec.py
@@ -32,10 +32,12 @@ def run(dataset, config):
         rmse='rmse',
         r2='r2',
     )
-    perf_metric = metrics_mapping[
-        config.metric] if config.metric in metrics_mapping else 'auto'
-    if perf_metric is None:
-        log.warning("Performance metric %s not supported.", config.metric)
+    perf_metric = metrics_mapping.get(config.optimization_metrics[0], 'auto')
+    if perf_metric == 'auto' and config.optimization_metrics[0] != 'auto':
+        log.warning(
+            f"Performance metric '{config.optimization_metrics[0]}' not supported, "
+            f"using metric='auto' instead.",
+        )
 
     training_params = {k: v for k, v in config.framework_params.items()
                        if not k.startswith('_')}
diff --git a/frameworks/hyperoptsklearn/exec.py b/frameworks/hyperoptsklearn/exec.py
index c4179cae1..44857707b 100644
--- a/frameworks/hyperoptsklearn/exec.py
+++ b/frameworks/hyperoptsklearn/exec.py
@@ -36,7 +36,11 @@ def run(dataset, config):
         r2=(default, False), # lambda y, pred: 1.0 - r2_score(y, pred)
         rmse=(mean_squared_error, False),
     )
-    loss_fn, continuous_loss_fn = metrics_to_loss_mapping[config.metric] if config.metric in metrics_to_loss_mapping else (None, False)
+
+    loss_fn, continuous_loss_fn = metrics_to_loss_mapping.get(
+        config.optimization_metrics[0],
+        (None, False)
+    )
     if loss_fn is None:
         log.warning("Performance metric %s not supported: defaulting to %s.",
                     config.metric, 'accuracy' if is_classification else 'r2')
diff --git a/frameworks/mljarsupervised/exec.py b/frameworks/mljarsupervised/exec.py
index 653d9cfd6..8ad65815a 100644
--- a/frameworks/mljarsupervised/exec.py
+++ b/frameworks/mljarsupervised/exec.py
@@ -24,7 +24,7 @@ def run(dataset, config):
         logloss='logloss',
         rmse='rmse'
     )
-    eval_metric = metrics_mapping[config.metric] if config.metric in metrics_mapping else "auto"
+    eval_metric = metrics_mapping.get(config.optimization_metrics[0], 'auto')
     
     # Mapping of benchmark task to MLJAR ML task
     problem_mapping = dict(
diff --git a/frameworks/oboe/exec.py b/frameworks/oboe/exec.py
index bed388dc7..f47e43941 100644
--- a/frameworks/oboe/exec.py
+++ b/frameworks/oboe/exec.py
@@ -66,7 +66,7 @@ def run(dataset, config):
     n_cores = config.framework_params.get('_n_cores', config.cores)
 
     log.info('Running oboe with a maximum time of {}s on {} cores.'.format(config.max_runtime_seconds, n_cores))
-    log.warning('We completely ignore the advice to optimize towards metric: {}.'.format(config.metric))
+    log.warning(f'We ignore the advice to optimize for: {config.optimization_metrics[0]}.')
 
     aml = AutoLearner(p_type='classification' if is_classification else 'regression',
                       n_cores=n_cores,

From e71bfab637ff912867d7affd54269b3f0ccaa89f Mon Sep 17 00:00:00 2001
From: PGijsbers <p.gijsbers@tue.nl>
Date: Wed, 7 Jun 2023 12:47:08 +0200
Subject: [PATCH 4/8] Change the default config to use optimization and
 evaluation metrics

---
 amlb/benchmark.py     |  7 +++----
 resources/config.yaml | 18 +++++++++++-------
 2 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/amlb/benchmark.py b/amlb/benchmark.py
index 0f3ce8115..a6bdc51bf 100644
--- a/amlb/benchmark.py
+++ b/amlb/benchmark.py
@@ -418,13 +418,12 @@ def __init__(self, *, name, fold, seed,
 
     @property
     def evaluation_metrics(self) -> list[str]:
-        return self.optimization_metrics + self._evaluation_metrics
+        return list(set(self.optimization_metrics) | set(self._evaluation_metrics))
 
     def load_default_metrics(self, *, dataset_type: str):
         """ Sets `optimization/evaluation_metrics` based on defaults from config.yaml"""
-        metrics = as_list(rconfig().benchmarks.metrics[dataset_type])
-        self.optimization_metrics = metrics[:1]
-        self._evaluation_metrics = metrics[1:]
+        self.optimization_metrics = as_list(rconfig().benchmarks.optimization_metrics[dataset_type])
+        self._evaluation_metrics = as_list(rconfig().benchmarks.evaluation_metrics[dataset_type])
 
     def __setattr__(self, name, value):
         if name == 'max_runtime_seconds':
diff --git a/resources/config.yaml b/resources/config.yaml
index 4daf8087b..ca9ec6503 100644
--- a/resources/config.yaml
+++ b/resources/config.yaml
@@ -48,13 +48,17 @@ benchmarks:                     # configuration namespace for the benchmarks def
   os_mem_size_mb: 2048          # the default amount of memory left to the OS when task assigned memory is computed automatically.
   os_vol_size_mb: 2048          # the default amount of volume left to the OS when task volume memory is verified.
   overhead_time_seconds: 3600   # amount of additional time allowed for the job to complete before sending an interruption signal
-  metrics:                      # default metrics by dataset type (as listed by amlb.data.DatasetType),
-                                # only the first metric is optimized by the frameworks,
-                                # the others are computed only for information purpose.
-    binary: ['auc', 'logloss', 'acc', 'balacc']     # available metrics: auc (AUC), acc (Accuracy), balacc (Balanced Accuracy), pr_auc (Precision Recall AUC), logloss (Log Loss), f1, f2, f05 (F-beta scores with beta=1, 2, or 0.5), max_pce, mean_pce (Max/Mean Per-Class Error).
-    multiclass: ['logloss', 'acc', 'balacc']        # available metrics: same as for binary, except auc, replaced by auc_ovo (AUC One-vs-One), auc_ovr (AUC One-vs-Rest). AUC metrics and F-beta metrics are computed with weighted average.
-    regression: ['rmse', 'r2', 'mae']               # available metrics: mae (Mean Absolute Error), mse (Mean Squared Error), msle (Mean Squared Logarithmic Error), rmse (Root Mean Square Error), rmsle (Root Mean Square Logarithmic Error), r2 (R^2).
-    timeseries: ['mase', 'mape', 'smape', 'rmse', 'mse', 'nrmse', 'wape', 'ncrps']
+  optimization_metrics:  # default metrics to optimize for by dataset type (as listed by amlb.data.DatasetType)
+    binary: ['auc']     # available metrics: auc (AUC), acc (Accuracy), balacc (Balanced Accuracy), pr_auc (Precision Recall AUC), logloss (Log Loss), f1, f2, f05 (F-beta scores with beta=1, 2, or 0.5), max_pce, mean_pce (Max/Mean Per-Class Error).
+    multiclass: ['logloss']  # available metrics: same as for binary, except auc, replaced by auc_ovo (AUC One-vs-One), auc_ovr (AUC One-vs-Rest). AUC metrics and F-beta metrics are computed with weighted average.
+    regression: ['rmse']     # available metrics: mae (Mean Absolute Error), mse (Mean Squared Error), msle (Mean Squared Logarithmic Error), rmse (Root Mean Square Error), rmsle (Root Mean Square Logarithmic Error), r2 (R^2).
+    timeseries: ['mase']
+  evaluation_metrics:  # default metrics to report by dataset type (as listed by amlb.data.DatasetType)
+    binary: ['logloss', 'acc', 'balacc']  # available metrics are identical to those described for `optimization_metrics`.
+    multiclass: ['acc', 'balacc']     # every optimization metric is also reported on, so duplicating it here is not necessary.
+    regression: ['r2', 'mae']
+    timeseries: ['mape', 'smape', 'rmse', 'mse', 'nrmse', 'wape', 'ncrps']
+
   defaults:            # the default constraints, usually overridden by a constraint.
     folds: 10          # the amount of fold-runs executed for each dataset.
     max_runtime_seconds: 3600   # default time allocated to the framework to train a model.

From d07c7571f349e328c80492e56ac4d41b14f74923 Mon Sep 17 00:00:00 2001
From: PGijsbers <p.gijsbers@tue.nl>
Date: Wed, 7 Jun 2023 12:53:18 +0200
Subject: [PATCH 5/8] Return copy of dict instead of modifying self.__dict__

---
 amlb/benchmark.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/amlb/benchmark.py b/amlb/benchmark.py
index a6bdc51bf..c504928b7 100644
--- a/amlb/benchmark.py
+++ b/amlb/benchmark.py
@@ -434,9 +434,7 @@ def __setattr__(self, name, value):
         super().__setattr__(name, value)
 
     def __json__(self):
-        d = self.__dict__
-        d["evaluation_metrics"] = self.evaluation_metrics
-        return d
+        return self.__dict__ | {"evaluation_metrics": self.evaluation_metrics}
 
     def __repr__(self):
         return repr_def(self)

From 53472a43f8437c2abeded7d5d3da3b64c8f3b5cb Mon Sep 17 00:00:00 2001
From: PGijsbers <p.gijsbers@tue.nl>
Date: Wed, 7 Jun 2023 15:50:34 +0200
Subject: [PATCH 6/8] Remove index_cols as it was not used

---
 amlb/results.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/amlb/results.py b/amlb/results.py
index 7504db7af..22bc6b0b7 100644
--- a/amlb/results.py
+++ b/amlb/results.py
@@ -130,8 +130,6 @@ def __init__(self, scores=None, framework_name=None, benchmark_name=None, task_n
 
     @cached
     def as_data_frame(self):
-        # index = ['task', 'framework', 'fold']
-        index = []
         df = (self.scores if is_data_frame(self.scores)
               else to_data_frame([dict(sc) for sc in self.scores]))
         if df.empty:
@@ -139,14 +137,12 @@ def as_data_frame(self):
             return df
         fixed_cols = ['id', 'task', 'framework', 'constraint', 'fold', 'type', 'result', 'metric', 'mode', 'version',
                       'params', 'app_version', 'utc', 'duration', 'training_duration', 'predict_duration', 'models_count', 'seed', 'info']
-        fixed_cols = [col for col in fixed_cols if col not in index]
         metrics_cols = [col for col in df.columns
                         if (col in dir(ClassificationResult) or col in dir(RegressionResult))
                         and not col.startswith('_')]
         metrics_cols.sort()
         dynamic_cols = [col for col in df.columns
-                        if col not in index
-                        and col not in fixed_cols
+                        if  col not in fixed_cols
                         and col not in metrics_cols]
         dynamic_cols.sort()
         df = df.reindex(columns=[]+fixed_cols+metrics_cols+dynamic_cols)

From f7989407c8e7a0cc566c38cf9e192cf597381ead Mon Sep 17 00:00:00 2001
From: PGijsbers <p.gijsbers@tue.nl>
Date: Wed, 7 Jun 2023 16:28:24 +0200
Subject: [PATCH 7/8] Use metadata to identify metrics in centralized way

---
 amlb/results.py | 21 +++++++++++++++------
 1 file changed, 15 insertions(+), 6 deletions(-)

diff --git a/amlb/results.py b/amlb/results.py
index 22bc6b0b7..46faeabdc 100644
--- a/amlb/results.py
+++ b/amlb/results.py
@@ -2,6 +2,7 @@
 **results** module provides the logic to format, save and read predictions generated by the *automl frameworks* (cf. ``TaskResult``),
 as well as logic to compute, format, save, read and merge scores obtained from those predictions (cf. ``Result`` and ``Scoreboard``).
 """
+import inspect
 from functools import partial
 import collections
 import io
@@ -10,7 +11,7 @@
 import os
 import re
 import statistics
-from typing import Union
+from typing import Union, Callable
 
 import numpy as np
 from numpy import nan, sort
@@ -137,13 +138,13 @@ def as_data_frame(self):
             return df
         fixed_cols = ['id', 'task', 'framework', 'constraint', 'fold', 'type', 'result', 'metric', 'mode', 'version',
                       'params', 'app_version', 'utc', 'duration', 'training_duration', 'predict_duration', 'models_count', 'seed', 'info']
-        metrics_cols = [col for col in df.columns
-                        if (col in dir(ClassificationResult) or col in dir(RegressionResult))
-                        and not col.startswith('_')]
+        metrics_cols = [
+            col for col in df.columns
+            if col in ClassificationResult.metrics() + RegressionResult.metrics()
+        ]
         metrics_cols.sort()
         dynamic_cols = [col for col in df.columns
-                        if  col not in fixed_cols
-                        and col not in metrics_cols]
+                        if col not in fixed_cols + metrics_cols]
         dynamic_cols.sort()
         df = df.reindex(columns=[]+fixed_cols+metrics_cols+dynamic_cols)
         log.debug("Scores columns: %s.", df.columns)
@@ -501,6 +502,14 @@ def evaluate(self, metric):
             eval_res += Namespace(value=nan, higher_is_better=None, message=f"Unsupported metric `{metric}` for {pb_type} problems")
         return eval_res
 
+    @classmethod
+    def metrics(cls) -> list[str]:
+        def has_metric_metadata(fn: Callable) -> bool:
+            return get_metadata(fn, "higher_is_better") is not None
+        return [
+            name for name, _ in inspect.getmembers(cls, predicate=has_metric_metadata)
+        ]
+
 
 class NoResult(Result):
 

From f793472021ac683ef10cb44621afce671799ef49 Mon Sep 17 00:00:00 2001
From: PGijsbers <p.gijsbers@tue.nl>
Date: Fri, 9 Jun 2023 11:37:50 +0200
Subject: [PATCH 8/8] Rewrite to avoid tuples in dataframe/csv

The `metric` column is replaced by `optimization_metrics`, which is a
comma-separated field listing each metric.
The `result` column is removed, since it would otherwise contain tuples
and its data duplicates the individual metric columns
(except for the convenience of having a 'higher is better' column,
which we now lose).
---
 amlb/results.py | 40 +++++++++++-----------------------------
 1 file changed, 11 insertions(+), 29 deletions(-)

diff --git a/amlb/results.py b/amlb/results.py
index 46faeabdc..4ed79972b 100644
--- a/amlb/results.py
+++ b/amlb/results.py
@@ -3,7 +3,6 @@
 as well as logic to compute, format, save, read and merge scores obtained from those predictions (cf. ``Result`` and ``Scoreboard``).
 """
 import inspect
-from functools import partial
 import collections
 import io
 import logging
@@ -136,7 +135,7 @@ def as_data_frame(self):
         if df.empty:
             # avoid dtype conversions during reindexing on empty frame
             return df
-        fixed_cols = ['id', 'task', 'framework', 'constraint', 'fold', 'type', 'result', 'metric', 'mode', 'version',
+        fixed_cols = ['id', 'task', 'framework', 'constraint', 'fold', 'type', 'optimization_metrics', 'mode', 'version',
                       'params', 'app_version', 'utc', 'duration', 'training_duration', 'predict_duration', 'models_count', 'seed', 'info']
         metrics_cols = [
             col for col in df.columns
@@ -171,9 +170,12 @@ def as_printable_data_frame(self, verbosity=3):
         for col in high_precision_float_cols:
             df[col] = df[col].map("{:.6g}".format).astype(float)
 
+        unique_metrics = (set(metrics.split(",")) for metrics in df['optimization_metrics'].unique())
+        optimized_metrics = set.union(*unique_metrics)
+
         cols = ([] if verbosity == 0
-                else ['task', 'fold', 'framework', 'constraint', 'result', 'metric', 'info'] if verbosity == 1
-                else ['id', 'task', 'fold', 'framework', 'constraint', 'result', 'metric',
+                else ['task', 'fold', 'framework', 'constraint', *optimized_metrics, 'optimization_metrics', 'info'] if verbosity == 1
+                else ['id', 'task', 'fold', 'framework', 'constraint', *optimized_metrics, 'optimization_metrics',
                       'duration', 'seed', 'info'] if verbosity == 2
                 else slice(None))
         return df.loc[:, cols]
@@ -423,11 +425,8 @@ def compute_score(self, result=None, meta_result=None):
             seed=metadata.seed,
             app_version=rget().app_version,
             utc=datetime_iso(),
-            evaluation_metrics=metadata.evaluation_metrics,
             optimization_metrics=metadata.optimization_metrics,
             duration=nan,
-            result_metrics=[],
-            result=[],
         )
         required_meta_res = ['training_duration', 'predict_duration', 'models_count']
         for m in required_meta_res:
@@ -435,30 +434,13 @@ def compute_score(self, result=None, meta_result=None):
         result = self.get_result() if result is None else result
 
         scoring_errors = []
-
-        def do_score(m):
-            score = result.evaluate(m)
+        for metric_ in metadata.evaluation_metrics:
+            score = result.evaluate(metric_)
             if 'message' in score:
                 scoring_errors.append(score.message)
-            return score
-
-        def set_score(score):
-            metric = score.metric if score.higher_is_better else f"neg_{score.metric}"
-            result = score.value if score.higher_is_better else -score.value
-            entry.result_metrics.append(metric)
-            entry.result.append(result)
-
-        for metric in metadata.evaluation_metrics:
-            sc = do_score(metric)
-            entry[metric] = sc.value
-            if metric in entry.optimization_metrics:
-                set_score(sc)
-
-        entry.result = tuple(entry.result)
-        entry.result_metrics = tuple(entry.result_metrics)
-        entry.evaluation_metrics = tuple(entry.evaluation_metrics)
-        entry.optimization_metrics = tuple(entry.optimization_metrics)
-        entry.metric = entry.optimization_metrics
+            entry[metric_] = score.value
+
+        entry.optimization_metrics = ','.join(entry.optimization_metrics)
         entry.info = result.info
         if scoring_errors:
             entry.info = "; ".join(filter(lambda it: it, [entry.info, *scoring_errors]))