From 9fa1b275a77d52b6abefc7755d19890b4c769c95 Mon Sep 17 00:00:00 2001
From: Felix Kluge
Date: Thu, 6 Jun 2024 14:13:49 +0200
Subject: [PATCH 1/5] initial calculation of performance index

---
 examples/gsd/_04_gsd_performance_index.py | 168 ++++++++++++++++++++++
 1 file changed, 168 insertions(+)
 create mode 100644 examples/gsd/_04_gsd_performance_index.py

diff --git a/examples/gsd/_04_gsd_performance_index.py b/examples/gsd/_04_gsd_performance_index.py
new file mode 100644
index 000000000..fec13993a
--- /dev/null
+++ b/examples/gsd/_04_gsd_performance_index.py
@@ -0,0 +1,168 @@
+r"""
+.. _gsd_performance_index:
+
+GSD Evaluation - Performance index
+==================================
+
+This example shows how to calculate the performance index based on running gait detection on multiple data points.
+"""
+
+# %%
+import pandas as pd
+from mobgap.data import LabExampleDataset
+from mobgap.gsd import GsdIluz
+
+
+# %%
+# Running a full evaluation pipeline
+# ----------------------------------
+# Instead of manually evaluating and investigating the performance of a GSD algorithm on a single piece of data, we
+# often want to run a full evaluation on an entire dataset.
+# This can be done using the :class:`~mobgap.gsd.pipeline.GsdEmulationPipeline` class and some ``tpcp`` functions.
+#
+# But let's start with selecting some data.
+# We want to use all the simulated real-world walking data from the INDIP reference system (Test11).
+simulated_real_world_walking = LabExampleDataset(reference_system="INDIP").get_subset(
+    test="Test11"
+)
+
+simulated_real_world_walking
+# %%
+# Now we can use the :class:`~mobgap.gsd.pipeline.GsdEmulationPipeline` class to directly run a GSD algorithm on
+# a datapoint.
+# The pipeline takes care of extracting the required data.
+from mobgap.gsd.pipeline import GsdEmulationPipeline
+
+pipeline = GsdEmulationPipeline(GsdIluz())
+
+pipeline.safe_run(simulated_real_world_walking[0]).gs_list_
+
+# %%
+# Note that this just "ran" the pipeline on a single datapoint.
+# If we want to run it on all datapoints and evaluate the performance of the algorithm, we can use the
+# :func:`~tpcp.validate.validate` function.
+#
+# It uses the built-in ``score`` method of the pipeline to calculate the performance of the algorithm on each datapoint
+# and then takes the mean of the results.
+# All mean and individual results are returned in one large dictionary that can easily be converted to a pandas DataFrame.
+from tpcp.validate import validate
+
+evaluation_results = pd.DataFrame(validate(pipeline, simulated_real_world_walking))
+
+evaluation_results_dict = evaluation_results.drop(
+    ["single_reference", "single_detected"], axis=1
+).T[0]
+
+
+# %%
+# Print the list of available scoring metrics
+evaluation_results_dict
+
+# %%
+# Calculate performance index
+# ------------------------------
+# Bonci et al. (2020) (https://www.mdpi.com/1424-8220/20/22/6509) suggest a methodology to determine a performance
+# index that combines multiple metrics into a single value.
+
+import numpy as np
+
+
+def normalize(x, norm_type=None):
+    """
+    Normalize a given array of values based on Bonci et al.
+
+    Parameters:
+    x (array-like): The input array to be normalized.
+    norm_type (str, optional): The type of normalization to be applied. Valid options are "cost" and "benefit".
+        If not provided, no normalization is applied.
+
+    Returns:
+    array-like: The normalized array.
+ + """ + x = np.array(x) + if norm_type == "cost": + return (max(x) - x) / (max(x) - min(x)) + elif norm_type == "benefit": + return (x - min(x)) / (max(x) - min(x)) + else: + return x + + +# E.g.: +normalize(evaluation_results_dict["single_num_gs_absolute_relative_error_log"], "cost") + +# %% +# Define metrics that are used to calculate the performance index +# For each metric, the underlying score, normalization (cost/benefit/none), aggregation (e.g., mean, std, ...), and weight needs to be defined +weighting_factor_micoamigo = { + "recall_mean": { + "metric": "single_recall", + "normalization": None, + "aggregation": lambda x: np.mean(x), + "weight": 0.117, + }, + "specificity_mean": { + "metric": "single_specificity", + "normalization": None, + "aggregation": lambda x: np.mean(x), + "weight": 0.178, + }, + "precision_mean": { + "metric": "single_precision", + "normalization": None, + "aggregation": lambda x: np.mean(x), + "weight": 0.105, + }, + "accuracy_mean": { + "metric": "single_accuracy", + "normalization": None, + "aggregation": lambda x: np.mean(x), + "weight": 0.160, + }, + "gs_absolute_relative_duration_error_mean": { + "metric": "single_gs_absolute_relative_duration_error_log", + "normalization": "cost", + "aggregation": lambda x: np.mean(x), + "weight": 0.122, + }, + "gs_absolute_relative_duration_error_std": { + "metric": "single_gs_absolute_relative_duration_error_log", + "normalization": "cost", + "aggregation": lambda x: np.std(x), + "weight": 0.122, + }, + # "icc_mean": { + # "metric": ["single_detected_gs_duration_s", "single_reference_gs_duration_s"], + # "normalization": None, + # "aggregation": lambda x: np.intraclass_corr(x), + # "weight": 0.196, + # }, +} + +# Calculate performance index +performance_index = sum( + weighting_factor_micoamigo[key]["aggregation"]( + normalize( + evaluation_results_dict[weighting_factor_micoamigo[key]["metric"]], + norm_type=weighting_factor_micoamigo[key]["normalization"], + ) + ) + * weighting_factor_micoamigo[key]["weight"] + for key in weighting_factor_micoamigo +) + +performance_index + + +# These are the weights used in Kluge et al. +# weighting_factor_kluge = { +# "recall": 0.117, +# "specificity": 0.151, +# "precision": 0.089, +# "accuracy": 0.135, +# "gs_relative_duration_error": 0.104, +# # "Bias_rel_std": 0.104, +# # "ICC_mean": 0.167, +# # "Bias_nr_gs_mean": 0.15, +# } From 68858d16c3e97bd30d4cc2813376ca931adf52ad Mon Sep 17 00:00:00 2001 From: Felix Kluge Date: Fri, 7 Jun 2024 10:16:42 +0200 Subject: [PATCH 2/5] separate criterion and normalization --- examples/gsd/_04_gsd_performance_index.py | 60 ++++++++++++++++------- 1 file changed, 43 insertions(+), 17 deletions(-) diff --git a/examples/gsd/_04_gsd_performance_index.py b/examples/gsd/_04_gsd_performance_index.py index fec13993a..ce7db3d56 100644 --- a/examples/gsd/_04_gsd_performance_index.py +++ b/examples/gsd/_04_gsd_performance_index.py @@ -65,70 +65,95 @@ # index that combines multiple metrics into a single value. import numpy as np +from typing import Literal -def normalize(x, norm_type=None): +def normalize( + x, criterion: Literal["benefit", "cost"] = "benefit", normalization: bool = False +): """ Normalize a given array of values based on Bonci et al. Parameters: - x (array-like): The input array to be normalized. - type (str, optional): The type of normalization to be applied. Valid options are "cost" and "benefit". - If not provided, no normalization is applied. + - x (array-like): The input array to be normalized. 
From 68858d16c3e97bd30d4cc2813376ca931adf52ad Mon Sep 17 00:00:00 2001
From: Felix Kluge
Date: Fri, 7 Jun 2024 10:16:42 +0200
Subject: [PATCH 2/5] separate criterion and normalization

---
 examples/gsd/_04_gsd_performance_index.py | 60 ++++++++++++++++-------
 1 file changed, 43 insertions(+), 17 deletions(-)

diff --git a/examples/gsd/_04_gsd_performance_index.py b/examples/gsd/_04_gsd_performance_index.py
index fec13993a..ce7db3d56 100644
--- a/examples/gsd/_04_gsd_performance_index.py
+++ b/examples/gsd/_04_gsd_performance_index.py
@@ -65,70 +65,95 @@
 # index that combines multiple metrics into a single value.
 
 import numpy as np
+from typing import Literal
 
 
-def normalize(x, norm_type=None):
+def normalize(
+    x, criterion: Literal["benefit", "cost"] = "benefit", normalization: bool = False
+):
     """
     Normalize a given array of values based on Bonci et al.
 
     Parameters:
-    x (array-like): The input array to be normalized.
-    norm_type (str, optional): The type of normalization to be applied. Valid options are "cost" and "benefit".
-        If not provided, no normalization is applied.
+    - x (array-like): The input array to be normalized.
+    - criterion (str, optional): Whether the metric is a "cost" or a "benefit". Valid options are "cost" and "benefit" (default).
+    - normalization (bool, optional): Whether to perform normalization. If True, the values will be normalized between
+        the minimum and maximum values in the array. If False, the values will be
+        normalized between 0 and 1 (e.g., relevant for metrics such as precision, recall, ...). Default is False.
 
     Returns:
     array-like: The normalized array.
     """
     x = np.array(x)
+
+    if normalization:
+        max_val = max(x)
+        min_val = min(x)
+    else:
+        max_val = 1
+        min_val = 0
+
-    if norm_type == "cost":
-        return (max(x) - x) / (max(x) - min(x))
-    elif norm_type == "benefit":
-        return (x - min(x)) / (max(x) - min(x))
+    if criterion == "cost":
+        return (max_val - x) / (max_val - min_val)
+    elif criterion == "benefit":
+        return (x - min_val) / (max_val - min_val)
     else:
         return x
 
 
 # E.g.:
-normalize(evaluation_results_dict["single_num_gs_absolute_relative_error_log"], "cost")
+normalize(
+    evaluation_results_dict["single_num_gs_absolute_relative_error_log"],
+    criterion="cost",
+    normalization=True,
+)
+# %%
+evaluation_results_dict["single_num_gs_absolute_relative_error_log"]
 
 # %%
 # Define metrics that are used to calculate the performance index
-# For each metric, the underlying score, normalization (cost/benefit/none), aggregation (e.g., mean, std, ...), and weight need to be defined
+# For each metric, the underlying score, criterion (cost/benefit), aggregation (e.g., mean, std, ...), and weight need to be defined
 weighting_factor_micoamigo = {
     "recall_mean": {
         "metric": "single_recall",
-        "normalization": None,
+        "criterion": "benefit",
+        "normalization": False,
         "aggregation": lambda x: np.mean(x),
         "weight": 0.117,
     },
     "specificity_mean": {
         "metric": "single_specificity",
-        "normalization": None,
+        "criterion": "benefit",
+        "normalization": False,
         "aggregation": lambda x: np.mean(x),
         "weight": 0.178,
     },
     "precision_mean": {
         "metric": "single_precision",
-        "normalization": None,
+        "criterion": "benefit",
+        "normalization": False,
        "aggregation": lambda x: np.mean(x),
         "weight": 0.105,
     },
     "accuracy_mean": {
         "metric": "single_accuracy",
-        "normalization": None,
+        "criterion": "benefit",
+        "normalization": False,
         "aggregation": lambda x: np.mean(x),
         "weight": 0.160,
     },
     "gs_absolute_relative_duration_error_mean": {
         "metric": "single_gs_absolute_relative_duration_error_log",
-        "normalization": "cost",
+        "criterion": "cost",
+        "normalization": True,
         "aggregation": lambda x: np.mean(x),
         "weight": 0.122,
     },
     "gs_absolute_relative_duration_error_std": {
         "metric": "single_gs_absolute_relative_duration_error_log",
-        "normalization": "cost",
+        "criterion": "cost",
+        "normalization": True,
         "aggregation": lambda x: np.std(x),
         "weight": 0.122,
     },
@@ -145,7 +170,8 @@ def normalize(x, norm_type=None):
     weighting_factor_micoamigo[key]["aggregation"](
         normalize(
             evaluation_results_dict[weighting_factor_micoamigo[key]["metric"]],
-            norm_type=weighting_factor_micoamigo[key]["normalization"],
+            criterion=weighting_factor_micoamigo[key]["criterion"],
+            normalization=weighting_factor_micoamigo[key]["normalization"],
         )
     )
     * weighting_factor_micoamigo[key]["weight"]
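What this patch separates: ``criterion`` states whether a metric is a benefit (higher is better) or a cost (lower is better), while ``normalization`` states whether the observed min/max should be used as bounds. A hedged re-statement of the patch-2 logic on toy values (not part of the commit):

    import numpy as np

    def normalize_v2(x, criterion="benefit", normalization=False):
        # Same behaviour as the patched normalize() above, condensed for illustration.
        x = np.array(x)
        max_val, min_val = (x.max(), x.min()) if normalization else (1, 0)
        if criterion == "cost":
            return (max_val - x) / (max_val - min_val)
        return (x - min_val) / (max_val - min_val)

    normalize_v2([0.9, 0.8], criterion="benefit")                   # array([0.9, 0.8]): 0-1-bounded benefits pass through
    normalize_v2([0.1, 0.3], criterion="cost")                      # array([0.9, 0.7]): costs are flipped to 1 - x
    normalize_v2([0.1, 0.3], criterion="cost", normalization=True)  # array([1., 0.]): min-max rescaled, then flipped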
From 1a5d055320572fa5498b70938f8fb63bc4e3e7b3 Mon Sep 17 00:00:00 2001
From: Felix Kluge
Date: Fri, 7 Jun 2024 14:23:42 +0200
Subject: [PATCH 3/5] decouple normalization and cost/benefit; add other
 normalization methods

---
 examples/gsd/_04_gsd_performance_index.py | 77 ++++++++++++++---------
 1 file changed, 47 insertions(+), 30 deletions(-)

diff --git a/examples/gsd/_04_gsd_performance_index.py b/examples/gsd/_04_gsd_performance_index.py
index ce7db3d56..e59edd630 100644
--- a/examples/gsd/_04_gsd_performance_index.py
+++ b/examples/gsd/_04_gsd_performance_index.py
@@ -68,8 +68,14 @@ from typing import Literal
+import numpy as np
+from typing import Literal
+
+
 def normalize(
-    x, criterion: Literal["benefit", "cost"] = "benefit", normalization: bool = False
+    x,
+    criterion: Literal["benefit", "cost"] = "benefit",
+    normalization: Literal["minmax", "sigmoid", "exponential", None] = None,
 ):
     """
     Normalize a given array of values based on Bonci et al.
@@ -77,40 +83,51 @@
     Parameters:
     - x (array-like): The input array to be normalized.
     - criterion (str, optional): Whether the metric is a "cost" or a "benefit". Valid options are "cost" and "benefit" (default).
-    - normalization (bool, optional): Whether to perform normalization. If True, the values will be normalized between
-        the minimum and maximum values in the array. If False, the values will be
-        normalized between 0 and 1 (e.g., relevant for metrics such as precision, recall, ...). Default is False.
+    - normalization (str, optional): Which normalization to perform. Valid options are "minmax", "sigmoid", "exponential", or None (default).
 
     Returns:
-    array-like: The normalized array.
+    - array-like: The normalized array.
+
+    Raises:
+    - ValueError: If the criterion is not specified as either 'benefit' or 'cost'.
+
+    Examples:
+    >>> x = [1, 2, 3, 4, 5]
+    >>> normalize(x, normalization="minmax")
+    array([0.  , 0.25, 0.5 , 0.75, 1.  ])
+
+    >>> normalize(x, criterion="benefit", normalization="sigmoid")
+    array([0.73105858, 0.88079708, 0.95257413, 0.98201379, 0.99330715])
+
+    >>> normalize(x, criterion="cost", normalization="sigmoid")
+    array([0.26894142, 0.11920292, 0.04742587, 0.01798621, 0.00669285])
+
+    >>> normalize(x, criterion="benefit", normalization="exponential")
+    array([0.63212056, 0.86466472, 0.95021293, 0.98168436, 0.99326205])
     """
     x = np.array(x)
 
-    if normalization:
-        max_val = max(x)
-        min_val = min(x)
+    if normalization == "minmax":
+        x_norm = (x - min(x)) / (max(x) - min(x))
+    elif normalization == "sigmoid":
+        x_norm = 1 / (1 + np.exp(-x))
+    elif normalization == "exponential":
+        x_norm = 1 - np.exp(-x)
     else:
-        max_val = 1
-        min_val = 0
+        x_norm = x
 
-    if criterion == "cost":
-        return (max_val - x) / (max_val - min_val)
-    elif criterion == "benefit":
-        return (x - min_val) / (max_val - min_val)
+    if criterion == "benefit":
+        x_criterion = x_norm
+    elif criterion == "cost":
+        x_criterion = 1 - x_norm
     else:
-        return x
+        raise ValueError(
+            "criterion needs to be specified as either 'benefit' or 'cost'."
+        )
+
+    return x_criterion
-
-
-# E.g.:
-normalize(
-    evaluation_results_dict["single_num_gs_absolute_relative_error_log"],
-    criterion="cost",
-    normalization=True,
-)
-# %%
-evaluation_results_dict["single_num_gs_absolute_relative_error_log"]
 
 
 # %%
 # Define metrics that are used to calculate the performance index
 # For each metric, the underlying score, criterion (cost/benefit), aggregation (e.g., mean, std, ...), and weight need to be defined
@@ -118,42 +135,42 @@
     "recall_mean": {
         "metric": "single_recall",
         "criterion": "benefit",
-        "normalization": False,
+        "normalization": None,
         "aggregation": lambda x: np.mean(x),
         "weight": 0.117,
     },
     "specificity_mean": {
         "metric": "single_specificity",
         "criterion": "benefit",
-        "normalization": False,
+        "normalization": None,
         "aggregation": lambda x: np.mean(x),
         "weight": 0.178,
     },
     "precision_mean": {
         "metric": "single_precision",
         "criterion": "benefit",
-        "normalization": False,
+        "normalization": None,
         "aggregation": lambda x: np.mean(x),
         "weight": 0.105,
     },
     "accuracy_mean": {
         "metric": "single_accuracy",
         "criterion": "benefit",
-        "normalization": False,
+        "normalization": None,
         "aggregation": lambda x: np.mean(x),
         "weight": 0.160,
     },
     "gs_absolute_relative_duration_error_mean": {
         "metric": "single_gs_absolute_relative_duration_error_log",
         "criterion": "cost",
-        "normalization": True,
+        "normalization": "exponential",
         "aggregation": lambda x: np.mean(x),
         "weight": 0.122,
     },
     "gs_absolute_relative_duration_error_std": {
         "metric": "single_gs_absolute_relative_duration_error_log",
         "criterion": "cost",
-        "normalization": True,
+        "normalization": "exponential",
         "aggregation": lambda x: np.std(x),
         "weight": 0.122,
     },
@@ -180,7 +197,7 @@ def normalize(
 
 performance_index
 
-
+# TODO:
 # These are the weights used in Kluge et al.
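A note on the ``normalization="exponential"`` choice this patch introduces for the log-scaled error metrics: the forward map is 1 - exp(-x), so after the ``criterion="cost"`` flip the final score collapses to exp(-x). An error of 0 therefore scores 1.0 and the score decays smoothly towards 0 for large errors, without any data-dependent min/max bounds. A small numeric sketch (error values are made up):

    import numpy as np

    errors = np.array([0.0, 0.1, 0.5, 1.0, 3.0])  # hypothetical log-scaled absolute relative errors

    x_norm = 1 - np.exp(-errors)  # normalization="exponential"
    score = 1 - x_norm            # criterion="cost" flip; identical to np.exp(-errors)
    # score -> array([1.        , 0.90483742, 0.60653066, 0.36787944, 0.04978707])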
From 172fb71b40bc12d6e790154e550d9add39219f1c Mon Sep 17 00:00:00 2001
From: Felix Kluge
Date: Mon, 10 Jun 2024 11:12:13 +0200
Subject: [PATCH 4/5] add full performance index calculation

---
 examples/gsd/_03_gsd_evaluation.py        | 158 ++++++++++++++++
 examples/gsd/_04_gsd_performance_index.py | 211 ----------------------
 mobgap/gsd/evaluation.py                  | 167 ++++++++++++++++-
 3 files changed, 317 insertions(+), 219 deletions(-)
 delete mode 100644 examples/gsd/_04_gsd_performance_index.py

diff --git a/examples/gsd/_03_gsd_evaluation.py b/examples/gsd/_03_gsd_evaluation.py
index f0fccc187..2b05f6ba4 100644
--- a/examples/gsd/_03_gsd_evaluation.py
+++ b/examples/gsd/_03_gsd_evaluation.py
@@ -7,6 +7,7 @@
 This example shows how to apply evaluation algorithms to GSD and thus how to rate the performance of a GSD algorithm.
 """
 
+# %%
 import pandas as pd
 from mobgap.data import LabExampleDataset
 from mobgap.gsd import GsdIluz
@@ -272,3 +273,160 @@ def load_reference(single_test_data):
 # In general, it is a good idea to use ``cross_validation`` also for algorithms that do not have tunable parameters.
 # This way you can ensure that the performance of the algorithm is stable across different splits of the data, and it
 # allows the direct comparison between tunable and non-tunable algorithms.
+
+
+# %%
+# Calculate performance index after "Running a full evaluation pipeline"
+# ----------------------------------------------------------------------
+# Bonci et al. (2020) (https://www.mdpi.com/1424-8220/20/22/6509) suggest a methodology to determine a performance
+# index that combines multiple metrics into a single value.
+import numpy as np
+from mobgap.gsd.evaluation import calc_gs_duration_icc, calc_performance_index
+
+# Get a dictionary of available scoring metrics
+evaluation_results_dict = evaluation_results.drop(
+    ["single_reference", "single_detected"], axis=1
+).T[0]
+evaluation_results_dict
+
+
+# %%
+# Define metrics that are used to calculate the performance index
+# For each metric, the underlying score, criterion (cost/benefit), aggregation (e.g., mean, std, ...), and weight need to be defined
+# ``weighting_factor_micoamigo`` defines the weights of the metrics as suggested by Micó-Amigo et al. (https://pubmed.ncbi.nlm.nih.gov/37316858/)
+weighting_factor_micoamigo = {
+    "recall_mean": {
+        "metric": "single_recall",
+        "criterion": "benefit",
+        "normalization": None,
+        "aggregation": np.mean,
+        "weight": 0.117,
+    },
+    "specificity_mean": {
+        "metric": "single_specificity",
+        "criterion": "benefit",
+        "normalization": None,
+        "aggregation": np.mean,
+        "weight": 0.178,
+    },
+    "precision_mean": {
+        "metric": "single_precision",
+        "criterion": "benefit",
+        "normalization": None,
+        "aggregation": np.mean,
+        "weight": 0.105,
+    },
+    "accuracy_mean": {
+        "metric": "single_accuracy",
+        "criterion": "benefit",
+        "normalization": None,
+        "aggregation": np.mean,
+        "weight": 0.160,
+    },
+    "gs_absolute_relative_duration_error_mean": {
+        "metric": "single_gs_absolute_relative_duration_error_log",
+        "criterion": "cost",
+        "normalization": "exponential",
+        "aggregation": np.mean,
+        "weight": 0.122,
+    },
+    "gs_absolute_relative_duration_error_std": {
+        "metric": "single_gs_absolute_relative_duration_error_log",
+        "criterion": "cost",
+        "normalization": "exponential",
+        "aggregation": np.std,
+        "weight": 0.122,
+    },
+    "icc_mean": {
+        "metric": [
+            "single_detected_gs_duration_s",
+            "single_reference_gs_duration_s",
+        ],
+        "criterion": "benefit",
+        "normalization": None,
+        "aggregation": calc_gs_duration_icc,
+        "weight": 0.196,
+    },
+}
+
+# %%
+# Calculate performance index
+
+performance_index = calc_performance_index(
+    evaluation_results=evaluation_results_dict,
+    weighting_factor=weighting_factor_micoamigo,
+)
+performance_index
+
+
+# %%
+# Define metrics that are used to calculate the performance index
+# For each metric, the underlying score, criterion (cost/benefit), aggregation (e.g., mean, std, ...), and weight need to be defined
+# ``weighting_factor_kluge`` defines the weights of the metrics used by Kluge et al. (https://doi.org/10.2196/50035) (see also Multimedia Appendix 1)
+weighting_factor_kluge = {
+    "recall_mean": {
+        "metric": "single_recall",
+        "criterion": "benefit",
+        "normalization": None,
+        "aggregation": np.mean,
+        "weight": 0.100,
+    },
+    "specificity_mean": {
+        "metric": "single_specificity",
+        "criterion": "benefit",
+        "normalization": None,
+        "aggregation": np.mean,
+        "weight": 0.151,
+    },
+    "precision_mean": {
+        "metric": "single_precision",
+        "criterion": "benefit",
+        "normalization": None,
+        "aggregation": np.mean,
+        "weight": 0.089,
+    },
+    "accuracy_mean": {
+        "metric": "single_accuracy",
+        "criterion": "benefit",
+        "normalization": None,
+        "aggregation": np.mean,
+        "weight": 0.135,
+    },
+    "gs_absolute_relative_duration_error_mean": {
+        "metric": "single_gs_absolute_relative_duration_error_log",
+        "criterion": "cost",
+        "normalization": "exponential",
+        "aggregation": np.mean,
+        "weight": 0.104,
+    },
+    "gs_absolute_relative_duration_error_std": {
+        "metric": "single_gs_absolute_relative_duration_error_log",
+        "criterion": "cost",
+        "normalization": "exponential",
+        "aggregation": np.std,
+        "weight": 0.104,
+    },
+    "icc_mean": {
+        "metric": [
+            "single_detected_gs_duration_s",
+            "single_reference_gs_duration_s",
+        ],
+        "criterion": "benefit",
+        "normalization": None,
+        "aggregation": calc_gs_duration_icc,
+        "weight": 0.167,
+    },
+    "gs_nr_mean": {
+        "metric": "single_num_gs_absolute_relative_error_log",
+        "criterion": "cost",
+        "normalization": "exponential",
+        "aggregation": np.mean,
+        "weight": 0.150,
+    },
+}
+
+performance_index = calc_performance_index(
+    evaluation_results=evaluation_results_dict,
+    weighting_factor=weighting_factor_kluge,
+)
+performance_index
diff --git a/examples/gsd/_04_gsd_performance_index.py b/examples/gsd/_04_gsd_performance_index.py
deleted file mode 100644
index e59edd630..000000000
--- a/examples/gsd/_04_gsd_performance_index.py
+++ /dev/null
@@ -1,211 +0,0 @@
-r"""
-.. _gsd_performance_index:
-
-GSD Evaluation - Performance index
-==================================
-
-This example shows how to calculate the performance index based on running gait detection on multiple data points.
-"""
-
-# %%
-import pandas as pd
-from mobgap.data import LabExampleDataset
-from mobgap.gsd import GsdIluz
-
-
-# %%
-# Running a full evaluation pipeline
-# ----------------------------------
-# Instead of manually evaluating and investigating the performance of a GSD algorithm on a single piece of data, we
-# often want to run a full evaluation on an entire dataset.
-# This can be done using the :class:`~mobgap.gsd.pipeline.GsdEmulationPipeline` class and some ``tpcp`` functions.
-#
-# But let's start with selecting some data.
-# We want to use all the simulated real-world walking data from the INDIP reference system (Test11).
-simulated_real_world_walking = LabExampleDataset(reference_system="INDIP").get_subset(
-    test="Test11"
-)
-
-simulated_real_world_walking
-# %%
-# Now we can use the :class:`~mobgap.gsd.pipeline.GsdEmulationPipeline` class to directly run a GSD algorithm on
-# a datapoint.
-# The pipeline takes care of extracting the required data.
-from mobgap.gsd.pipeline import GsdEmulationPipeline
-
-pipeline = GsdEmulationPipeline(GsdIluz())
-
-pipeline.safe_run(simulated_real_world_walking[0]).gs_list_
-
-# %%
-# Note that this just "ran" the pipeline on a single datapoint.
-# If we want to run it on all datapoints and evaluate the performance of the algorithm, we can use the
-# :func:`~tpcp.validate.validate` function.
-#
-# It uses the built-in ``score`` method of the pipeline to calculate the performance of the algorithm on each datapoint
-# and then takes the mean of the results.
-# All mean and individual results are returned in one large dictionary that can easily be converted to a pandas DataFrame.
-from tpcp.validate import validate
-
-evaluation_results = pd.DataFrame(validate(pipeline, simulated_real_world_walking))
-
-evaluation_results_dict = evaluation_results.drop(
-    ["single_reference", "single_detected"], axis=1
-).T[0]
-
-
-# %%
-# Print the list of available scoring metrics
-evaluation_results_dict
-
-# %%
-# Calculate performance index
-# ------------------------------
-# Bonci et al. (2020) (https://www.mdpi.com/1424-8220/20/22/6509) suggest a methodology to determine a performance
-# index that combines multiple metrics into a single value.
-
-import numpy as np
-from typing import Literal
-
-
-import numpy as np
-from typing import Literal
-
-
-def normalize(
-    x,
-    criterion: Literal["benefit", "cost"] = "benefit",
-    normalization: Literal["minmax", "sigmoid", "exponential", None] = None,
-):
-    """
-    Normalize a given array of values based on Bonci et al.
-
-    Parameters:
-    - x (array-like): The input array to be normalized.
-    - criterion (str, optional): Whether the metric is a "cost" or a "benefit". Valid options are "cost" and "benefit" (default).
-    - normalization (str, optional): Which normalization to perform. Valid options are "minmax", "sigmoid", "exponential", or None (default).
-
-    Returns:
-    - array-like: The normalized array.
-
-    Raises:
-    - ValueError: If the criterion is not specified as either 'benefit' or 'cost'.
-
-    Examples:
-    >>> x = [1, 2, 3, 4, 5]
-    >>> normalize(x, normalization="minmax")
-    array([0.  , 0.25, 0.5 , 0.75, 1.  ])
-
-    >>> normalize(x, criterion="benefit", normalization="sigmoid")
-    array([0.73105858, 0.88079708, 0.95257413, 0.98201379, 0.99330715])
-
-    >>> normalize(x, criterion="cost", normalization="sigmoid")
-    array([0.26894142, 0.11920292, 0.04742587, 0.01798621, 0.00669285])
-
-    >>> normalize(x, criterion="benefit", normalization="exponential")
-    array([0.63212056, 0.86466472, 0.95021293, 0.98168436, 0.99326205])
-    """
-    x = np.array(x)
-
-    if normalization == "minmax":
-        x_norm = (x - min(x)) / (max(x) - min(x))
-    elif normalization == "sigmoid":
-        x_norm = 1 / (1 + np.exp(-x))
-    elif normalization == "exponential":
-        x_norm = 1 - np.exp(-x)
-    else:
-        x_norm = x
-
-    if criterion == "benefit":
-        x_criterion = x_norm
-    elif criterion == "cost":
-        x_criterion = 1 - x_norm
-    else:
-        raise ValueError(
-            "criterion needs to be specified as either 'benefit' or 'cost'."
-        )
-
-    return x_criterion
-
-
-# %%
-# Define metrics that are used to calculate the performance index
-# For each metric, the underlying score, criterion (cost/benefit), aggregation (e.g., mean, std, ...), and weight need to be defined
-weighting_factor_micoamigo = {
-    "recall_mean": {
-        "metric": "single_recall",
-        "criterion": "benefit",
-        "normalization": None,
-        "aggregation": lambda x: np.mean(x),
-        "weight": 0.117,
-    },
-    "specificity_mean": {
-        "metric": "single_specificity",
-        "criterion": "benefit",
-        "normalization": None,
-        "aggregation": lambda x: np.mean(x),
-        "weight": 0.178,
-    },
-    "precision_mean": {
-        "metric": "single_precision",
-        "criterion": "benefit",
-        "normalization": None,
-        "aggregation": lambda x: np.mean(x),
-        "weight": 0.105,
-    },
-    "accuracy_mean": {
-        "metric": "single_accuracy",
-        "criterion": "benefit",
-        "normalization": None,
-        "aggregation": lambda x: np.mean(x),
-        "weight": 0.160,
-    },
-    "gs_absolute_relative_duration_error_mean": {
-        "metric": "single_gs_absolute_relative_duration_error_log",
-        "criterion": "cost",
-        "normalization": "exponential",
-        "aggregation": lambda x: np.mean(x),
-        "weight": 0.122,
-    },
-    "gs_absolute_relative_duration_error_std": {
-        "metric": "single_gs_absolute_relative_duration_error_log",
-        "criterion": "cost",
-        "normalization": "exponential",
-        "aggregation": lambda x: np.std(x),
-        "weight": 0.122,
-    },
-    # "icc_mean": {
-    #     "metric": ["single_detected_gs_duration_s", "single_reference_gs_duration_s"],
-    #     "normalization": None,
-    #     "aggregation": lambda x: np.intraclass_corr(x),
-    #     "weight": 0.196,
-    # },
-}
-
-# Calculate performance index
-performance_index = sum(
-    weighting_factor_micoamigo[key]["aggregation"](
-        normalize(
-            evaluation_results_dict[weighting_factor_micoamigo[key]["metric"]],
-            criterion=weighting_factor_micoamigo[key]["criterion"],
-            normalization=weighting_factor_micoamigo[key]["normalization"],
-        )
-    )
-    * weighting_factor_micoamigo[key]["weight"]
-    for key in weighting_factor_micoamigo
-)
-
-performance_index
-
-# TODO:
-# These are the weights used in Kluge et al.
-# weighting_factor_kluge = {
-#     "recall": 0.117,
-#     "specificity": 0.151,
-#     "precision": 0.089,
-#     "accuracy": 0.135,
-#     "gs_relative_duration_error": 0.104,
-#     # "Bias_rel_std": 0.104,
-#     # "ICC_mean": 0.167,
-#     # "Bias_nr_gs_mean": 0.15,
-# }
diff --git a/mobgap/gsd/evaluation.py b/mobgap/gsd/evaluation.py
index b885d70a1..675ddc61c 100644
--- a/mobgap/gsd/evaluation.py
+++ b/mobgap/gsd/evaluation.py
@@ -10,6 +10,7 @@
 from intervaltree.interval import Interval
 from matplotlib.axes import Axes
 from matplotlib.figure import Figure
+from pingouin import intraclass_corr
 from typing_extensions import Unpack
 
 from mobgap.utils.evaluation import (
@@ -93,7 +94,12 @@ def calculate_matched_gsd_performance_metrics(
     # estimate performance metrics
     precision_recall_f1 = precision_recall_f1_score(matches, zero_division=zero_division)
 
-    gsd_metrics = {"tp_samples": tp_samples, "fp_samples": fp_samples, "fn_samples": fn_samples, **precision_recall_f1}
+    gsd_metrics = {
+        "tp_samples": tp_samples,
+        "fp_samples": fp_samples,
+        "fn_samples": fn_samples,
+        **precision_recall_f1,
+    }
 
     # tn-dependent metrics
     if tn_samples != 0:
@@ -228,7 +234,10 @@ def calculate_unmatched_gsd_performance_metrics(
 
 
 def categorize_intervals(
-    *, gsd_list_detected: pd.DataFrame, gsd_list_reference: pd.DataFrame, n_overall_samples: Optional[int] = None
+    *,
+    gsd_list_detected: pd.DataFrame,
+    gsd_list_reference: pd.DataFrame,
+    n_overall_samples: Optional[int] = None,
 ) -> pd.DataFrame:
     """
     Evaluate detected gait sequence intervals against a reference on a sample-wise level.
@@ -367,7 +376,10 @@ def _check_input_sanity(
         raise TypeError("`gsd_list_detected` and `gsd_list_reference` must be of type `pandas.DataFrame`.")
     # check if start and end columns are present
     try:
-        detected, reference = gsd_list_detected[["start", "end"]], gsd_list_reference[["start", "end"]]
+        detected, reference = (
+            gsd_list_detected[["start", "end"]],
+            gsd_list_reference[["start", "end"]],
+        )
     except KeyError as e:
         raise ValueError(
             "`gsd_list_detected` and `gsd_list_reference` must have columns named 'start' and 'end'."
@@ -403,7 +415,10 @@ def _get_false_matches_from_overlap_data(overlaps: list[Interval], interval: Int
 
 
 def find_matches_with_min_overlap(
-    *, gsd_list_detected: pd.DataFrame, gsd_list_reference: pd.DataFrame, overlap_threshold: float = 0.8
+    *,
+    gsd_list_detected: pd.DataFrame,
+    gsd_list_reference: pd.DataFrame,
+    overlap_threshold: float = 0.8,
 ) -> pd.DataFrame:
     """
     Find all matches of `gsd_list_detected` in `gsd_list_reference` with at least ``overlap_threshold`` overlap.
@@ -516,15 +531,35 @@ def _get_tn_intervals(categorized_intervals: pd.DataFrame, n_overall_samples: Un
 
 
 def plot_categorized_intervals(
-    gsd_list_detected: pd.DataFrame, gsd_list_reference: pd.DataFrame, categorized_intervals: pd.DataFrame
+    gsd_list_detected: pd.DataFrame,
+    gsd_list_reference: pd.DataFrame,
+    categorized_intervals: pd.DataFrame,
 ) -> Figure:
     """Plot the categorized intervals together with the detected and reference intervals."""
     fig, ax = plt.subplots(figsize=(10, 3))
     _plot_intervals_from_df(gsd_list_reference, 3, ax, color="orange")
     _plot_intervals_from_df(gsd_list_detected, 2, ax, color="blue")
-    _plot_intervals_from_df(categorized_intervals.query("match_type == 'tp'"), 1, ax, color="green", label="TP")
-    _plot_intervals_from_df(categorized_intervals.query("match_type == 'fp'"), 1, ax, color="red", label="FP")
-    _plot_intervals_from_df(categorized_intervals.query("match_type == 'fn'"), 1, ax, color="purple", label="FN")
+    _plot_intervals_from_df(
+        categorized_intervals.query("match_type == 'tp'"),
+        1,
+        ax,
+        color="green",
+        label="TP",
+    )
+    _plot_intervals_from_df(
+        categorized_intervals.query("match_type == 'fp'"),
+        1,
+        ax,
+        color="red",
+        label="FP",
+    )
+    _plot_intervals_from_df(
+        categorized_intervals.query("match_type == 'fn'"),
+        1,
+        ax,
+        color="purple",
+        label="FN",
+    )
     plt.yticks([1, 2, 3], ["Categorized", "Detected", "Reference"])
     plt.ylim(0, 4)
     plt.xlabel("Index")
@@ -546,10 +581,126 @@ def _plot_intervals_from_df(df: pd.DataFrame, y: int, ax: Axes, **kwargs: Unpack
         ax.hlines(y, row["start"], row["end"], lw=20, **kwargs)
 
 
+def _normalize(
+    x: np.ndarray,
+    criterion: Literal["benefit", "cost"] = "benefit",
+    normalization: Literal["minmax", "sigmoid", "exponential", None] = None,
+) -> np.ndarray:
+    """
+    Normalize a given array of values based on Bonci et al.
+
+    Parameters
+    ----------
+    - x (array-like): The input array to be normalized.
+    - criterion (str, optional): Whether the metric is a "cost" or a "benefit".
+        Valid options are "cost" and "benefit" (default).
+    - normalization (str, optional): Which normalization to perform.
+        Valid options are "minmax", "sigmoid", "exponential", or None (default).
+
+    Returns
+    -------
+    - array-like: The normalized array.
+
+    Raises
+    ------
+    - ValueError: If the criterion is not specified as either 'benefit' or 'cost'.
+
+    Examples
+    --------
+    >>> x = [1, 2, 3, 4, 5]
+    >>> _normalize(x, normalization="minmax")
+    array([0.  , 0.25, 0.5 , 0.75, 1.  ])
+
+    >>> _normalize(x, criterion="benefit", normalization="sigmoid")
+    array([0.73105858, 0.88079708, 0.95257413, 0.98201379, 0.99330715])
+
+    >>> _normalize(x, criterion="cost", normalization="sigmoid")
+    array([0.26894142, 0.11920292, 0.04742587, 0.01798621, 0.00669285])
+
+    >>> _normalize(x, criterion="benefit", normalization="exponential")
+    array([0.63212056, 0.86466472, 0.95021293, 0.98168436, 0.99326205])
+    """
+    x = np.array(x)
+
+    if normalization == "minmax":
+        x_norm = (x - min(x)) / (max(x) - min(x))
+    elif normalization == "sigmoid":
+        x_norm = 1 / (1 + np.exp(-x))
+    elif normalization == "exponential":
+        x_norm = 1 - np.exp(-x)
+    else:
+        x_norm = x
+
+    if criterion == "benefit":
+        x_criterion = x_norm
+    elif criterion == "cost":
+        x_criterion = 1 - x_norm
+    else:
+        raise ValueError("criterion needs to be specified as either 'benefit' or 'cost'.")
+
+    return x_criterion
+
+
+def calc_gs_duration_icc(x: np.ndarray) -> float:
+    """Calculate the Intraclass Correlation Coefficient (ICC) for a given dataset."""
+    # Bring the pair of duration arrays into long format (one rating per row)
+    x_df = (
+        pd.DataFrame(x)
+        .rename(columns={0: "duration_s"})
+        .assign(trial_id=lambda df_: df_.duration_s.map(lambda x: range(1, len(x) + 1)))
+        .explode(["duration_s", "trial_id"])
+        .assign(
+            duration_s=lambda df_: df_.duration_s.astype(float),
+            trial_id=lambda df_: df_.trial_id.astype(int),
+        )
+        .rename_axis("system")
+        .reset_index()
+    )
+    # Calculate ICC
+    x_icc = intraclass_corr(x_df, ratings="duration_s", raters="system", targets="trial_id")
+
+    # Return the ICC1 estimate (first row of the pingouin result table)
+    return x_icc.loc[0].ICC
+
+
+def calc_performance_index(evaluation_results: dict, weighting_factor: dict) -> float:
+    """
+    Calculate the performance index based on evaluation results and weighting factors.
+
+    Parameters
+    ----------
+    evaluation_results : dict
+        A dictionary containing the evaluation results for different metrics.
+    weighting_factor : dict
+        A dictionary containing the weighting factors for different metrics.
+
+    Returns
+    -------
+    float
+        The calculated performance index.
+
+    """
+    performance_index = sum(
+        weighting_factor[key]["aggregation"](
+            _normalize(
+                evaluation_results[weighting_factor[key]["metric"]],
+                criterion=weighting_factor[key]["criterion"],
+                normalization=weighting_factor[key]["normalization"],
+            )
+        )
+        * weighting_factor[key]["weight"]
+        for key in weighting_factor
+    )
+
+    return performance_index
+
+
 __all__ = [
     "categorize_intervals",
     "find_matches_with_min_overlap",
     "calculate_matched_gsd_performance_metrics",
     "calculate_unmatched_gsd_performance_metrics",
     "plot_categorized_intervals",
+    "calc_gs_duration_icc",
+    "calc_performance_index",
 ]

From 660fce91384c64ec19cd7e3c9f4d13c839e462b1 Mon Sep 17 00:00:00 2001
From: Felix Kluge
Date: Mon, 10 Jun 2024 11:15:22 +0200
Subject: [PATCH 5/5] update pyproject.toml

---
 pyproject.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pyproject.toml b/pyproject.toml
index 94f74c12e..d898d2f06 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -28,6 +28,7 @@ pywavelets = ">=1.5.0"
 pooch = ">=1.8.1"
 # TODO: Remove this dependency at some point
 gaitmap = ">=2.5.0"
+pingouin = "^0.5.4"
 
 [tool.poetry.group.dev.dependencies]
 poethepoet = "^0.22.0"
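A closing note on the ICC step added in patch 4: ``calc_gs_duration_icc`` reshapes the pair of duration arrays into the long format that ``pingouin.intraclass_corr`` expects — one rating per row, with the rating system (detected vs. reference) as rater and the gait sequence as target — and reads the ICC1 estimate from the first row of the result table. A minimal standalone sketch with made-up durations (the values are illustrative only):

    import pandas as pd
    from pingouin import intraclass_corr

    # Hypothetical gait sequence durations (s) for three sequences, rated by two "systems".
    durations_long = pd.DataFrame(
        {
            "system": ["detected"] * 3 + ["reference"] * 3,
            "trial_id": [1, 2, 3, 1, 2, 3],
            "duration_s": [10.2, 8.1, 12.5, 10.0, 8.4, 12.3],
        }
    )

    icc_results = intraclass_corr(
        durations_long, targets="trial_id", raters="system", ratings="duration_s"
    )
    icc_1 = icc_results.loc[0, "ICC"]  # first row is ICC1, as used by calc_gs_duration_icc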