From 9fa1b275a77d52b6abefc7755d19890b4c769c95 Mon Sep 17 00:00:00 2001
From: Felix Kluge
Date: Thu, 6 Jun 2024 14:13:49 +0200
Subject: [PATCH 1/5] initial calculation of performance index

---
 examples/gsd/_04_gsd_performance_index.py | 168 ++++++++++++++++++++++
 1 file changed, 168 insertions(+)
 create mode 100644 examples/gsd/_04_gsd_performance_index.py

diff --git a/examples/gsd/_04_gsd_performance_index.py b/examples/gsd/_04_gsd_performance_index.py
new file mode 100644
index 000000000..fec13993a
--- /dev/null
+++ b/examples/gsd/_04_gsd_performance_index.py
@@ -0,0 +1,168 @@
+r"""
+.. _gsd_performance_index:
+
+GSD Evaluation - Performance index
+==================================
+
+This example shows how to calculate the performance index based on running gait detection on multiple data points.
+"""
+
+# %%
+import pandas as pd
+from mobgap.data import LabExampleDataset
+from mobgap.gsd import GsdIluz
+
+
+# %%
+# Running a full evaluation pipeline
+# ----------------------------------
+# Instead of manually evaluating and investigating the performance of a GSD algorithm on a single piece of data, we
+# often want to run a full evaluation on an entire dataset.
+# This can be done using the :class:`~mobgap.gsd.pipeline.GsdEmulationPipeline` class and some ``tpcp`` functions.
+#
+# But let's start with selecting some data.
+# We want to use all the simulated real-world walking data from the INDIP reference system (Test11).
+simulated_real_world_walking = LabExampleDataset(reference_system="INDIP").get_subset(
+    test="Test11"
+)
+
+simulated_real_world_walking
+# %%
+# Now we can use the :class:`~mobgap.gsd.pipeline.GsdEmulationPipeline` class to directly run a GSD algorithm on
+# a datapoint.
+# The pipeline takes care of extracting the required data.
+from mobgap.gsd.pipeline import GsdEmulationPipeline
+
+pipeline = GsdEmulationPipeline(GsdIluz())
+
+pipeline.safe_run(simulated_real_world_walking[0]).gs_list_
+
+# %%
+# Note that this just "ran" the pipeline on a single datapoint.
+# If we want to run it on all datapoints and evaluate the performance of the algorithm, we can use the
+# :func:`~tpcp.validate.validate` function.
+#
+# It uses the built-in ``score`` method of the pipeline to calculate the performance of the algorithm on each datapoint
+# and then takes the mean of the results.
+# All mean and individual results are returned in one large dictionary that can easily be converted to a pandas DataFrame.
+from tpcp.validate import validate
+
+evaluation_results = pd.DataFrame(validate(pipeline, simulated_real_world_walking))
+
+evaluation_results_dict = evaluation_results.drop(
+    ["single_reference", "single_detected"], axis=1
+).T[0]
+
+
+# %%
+# Print the list of available scoring metrics
+evaluation_results_dict
+
+# %%
+# Calculate performance index
+# ------------------------------
+# Bonci et al. (2020) (https://www.mdpi.com/1424-8220/20/22/6509) suggest a methodology to determine a performance
+# index that combines multiple metrics into a single value.
+
+import numpy as np
+
+
+def normalize(x, norm_type=None):
+    """
+    Normalize a given array of values based on Bonci et al.
+
+    Parameters:
+    x (array-like): The input array to be normalized.
+    norm_type (str, optional): The type of normalization to be applied. Valid options are "cost" and "benefit".
+        If not provided, no normalization is applied.
+
+    Returns:
+    array-like: The normalized array.
+ + """ + x = np.array(x) + if norm_type == "cost": + return (max(x) - x) / (max(x) - min(x)) + elif norm_type == "benefit": + return (x - min(x)) / (max(x) - min(x)) + else: + return x + + +# E.g.: +normalize(evaluation_results_dict["single_num_gs_absolute_relative_error_log"], "cost") + +# %% +# Define metrics that are used to calculate the performance index +# For each metric, the underlying score, normalization (cost/benefit/none), aggregation (e.g., mean, std, ...), and weight needs to be defined +weighting_factor_micoamigo = { + "recall_mean": { + "metric": "single_recall", + "normalization": None, + "aggregation": lambda x: np.mean(x), + "weight": 0.117, + }, + "specificity_mean": { + "metric": "single_specificity", + "normalization": None, + "aggregation": lambda x: np.mean(x), + "weight": 0.178, + }, + "precision_mean": { + "metric": "single_precision", + "normalization": None, + "aggregation": lambda x: np.mean(x), + "weight": 0.105, + }, + "accuracy_mean": { + "metric": "single_accuracy", + "normalization": None, + "aggregation": lambda x: np.mean(x), + "weight": 0.160, + }, + "gs_absolute_relative_duration_error_mean": { + "metric": "single_gs_absolute_relative_duration_error_log", + "normalization": "cost", + "aggregation": lambda x: np.mean(x), + "weight": 0.122, + }, + "gs_absolute_relative_duration_error_std": { + "metric": "single_gs_absolute_relative_duration_error_log", + "normalization": "cost", + "aggregation": lambda x: np.std(x), + "weight": 0.122, + }, + # "icc_mean": { + # "metric": ["single_detected_gs_duration_s", "single_reference_gs_duration_s"], + # "normalization": None, + # "aggregation": lambda x: np.intraclass_corr(x), + # "weight": 0.196, + # }, +} + +# Calculate performance index +performance_index = sum( + weighting_factor_micoamigo[key]["aggregation"]( + normalize( + evaluation_results_dict[weighting_factor_micoamigo[key]["metric"]], + norm_type=weighting_factor_micoamigo[key]["normalization"], + ) + ) + * weighting_factor_micoamigo[key]["weight"] + for key in weighting_factor_micoamigo +) + +performance_index + + +# These are the weights used in Kluge et al. +# weighting_factor_kluge = { +# "recall": 0.117, +# "specificity": 0.151, +# "precision": 0.089, +# "accuracy": 0.135, +# "gs_relative_duration_error": 0.104, +# # "Bias_rel_std": 0.104, +# # "ICC_mean": 0.167, +# # "Bias_nr_gs_mean": 0.15, +# } From 68858d16c3e97bd30d4cc2813376ca931adf52ad Mon Sep 17 00:00:00 2001 From: Felix Kluge Date: Fri, 7 Jun 2024 10:16:42 +0200 Subject: [PATCH 2/5] separate criterion and normalization --- examples/gsd/_04_gsd_performance_index.py | 60 ++++++++++++++++------- 1 file changed, 43 insertions(+), 17 deletions(-) diff --git a/examples/gsd/_04_gsd_performance_index.py b/examples/gsd/_04_gsd_performance_index.py index fec13993a..ce7db3d56 100644 --- a/examples/gsd/_04_gsd_performance_index.py +++ b/examples/gsd/_04_gsd_performance_index.py @@ -65,70 +65,95 @@ # index that combines multiple metrics into a single value. import numpy as np +from typing import Literal -def normalize(x, norm_type=None): +def normalize( + x, criterion: Literal["benefit", "cost"] = "benefit", normalization: bool = False +): """ Normalize a given array of values based on Bonci et al. Parameters: - x (array-like): The input array to be normalized. - type (str, optional): The type of normalization to be applied. Valid options are "cost" and "benefit". - If not provided, no normalization is applied. + - x (array-like): The input array to be normalized. 
From 68858d16c3e97bd30d4cc2813376ca931adf52ad Mon Sep 17 00:00:00 2001
From: Felix Kluge
Date: Fri, 7 Jun 2024 10:16:42 +0200
Subject: [PATCH 2/5] separate criterion and normalization

---
 examples/gsd/_04_gsd_performance_index.py | 60 ++++++++++++++++-------
 1 file changed, 43 insertions(+), 17 deletions(-)

diff --git a/examples/gsd/_04_gsd_performance_index.py b/examples/gsd/_04_gsd_performance_index.py
index fec13993a..ce7db3d56 100644
--- a/examples/gsd/_04_gsd_performance_index.py
+++ b/examples/gsd/_04_gsd_performance_index.py
@@ -65,70 +65,95 @@
 # index that combines multiple metrics into a single value.
 
 import numpy as np
+from typing import Literal
 
 
-def normalize(x, norm_type=None):
+def normalize(
+    x, criterion: Literal["benefit", "cost"] = "benefit", normalization: bool = False
+):
     """
     Normalize a given array of values based on Bonci et al.
 
     Parameters:
-    x (array-like): The input array to be normalized.
-    norm_type (str, optional): The type of normalization to be applied. Valid options are "cost" and "benefit".
-        If not provided, no normalization is applied.
+    - x (array-like): The input array to be normalized.
+    - criterion (str, optional): Whether the metric is a "cost" or a "benefit". Valid options are "cost" and "benefit" (default).
+    - normalization (bool, optional): Whether to perform normalization. If True, the values will be normalized between
+        the minimum and maximum values in the array. If False, the values will be
+        normalized between 0 and 1 (e.g., relevant for metrics such as precision, recall, ...). Default is False.
 
     Returns:
     array-like: The normalized array.
     """
     x = np.array(x)
+
+    if normalization:
+        max_val = max(x)
+        min_val = min(x)
+    else:
+        max_val = 1
+        min_val = 0
+
-    if norm_type == "cost":
-        return (max(x) - x) / (max(x) - min(x))
-    elif norm_type == "benefit":
-        return (x - min(x)) / (max(x) - min(x))
+    if criterion == "cost":
+        return (max_val - x) / (max_val - min_val)
+    elif criterion == "benefit":
+        return (x - min_val) / (max_val - min_val)
     else:
         return x
 
 
 # E.g.:
-normalize(evaluation_results_dict["single_num_gs_absolute_relative_error_log"], "cost")
+normalize(
+    evaluation_results_dict["single_num_gs_absolute_relative_error_log"],
+    criterion="cost",
+    normalization=True,
+)
+# %%
+evaluation_results_dict["single_num_gs_absolute_relative_error_log"]
 
 # %%
 # Define metrics that are used to calculate the performance index
-# For each metric, the underlying score, normalization (cost/benefit/none), aggregation (e.g., mean, std, ...), and weight need to be defined
+# For each metric, the underlying score, criterion (cost/benefit), aggregation (e.g., mean, std, ...), and weight need to be defined
 weighting_factor_micoamigo = {
     "recall_mean": {
         "metric": "single_recall",
-        "normalization": None,
+        "criterion": "benefit",
+        "normalization": False,
         "aggregation": lambda x: np.mean(x),
         "weight": 0.117,
     },
     "specificity_mean": {
         "metric": "single_specificity",
-        "normalization": None,
+        "criterion": "benefit",
+        "normalization": False,
         "aggregation": lambda x: np.mean(x),
         "weight": 0.178,
     },
     "precision_mean": {
         "metric": "single_precision",
-        "normalization": None,
+        "criterion": "benefit",
+        "normalization": False,
        "aggregation": lambda x: np.mean(x),
         "weight": 0.105,
     },
     "accuracy_mean": {
         "metric": "single_accuracy",
-        "normalization": None,
+        "criterion": "benefit",
+        "normalization": False,
         "aggregation": lambda x: np.mean(x),
         "weight": 0.160,
     },
     "gs_absolute_relative_duration_error_mean": {
         "metric": "single_gs_absolute_relative_duration_error_log",
-        "normalization": "cost",
+        "criterion": "cost",
+        "normalization": True,
         "aggregation": lambda x: np.mean(x),
         "weight": 0.122,
     },
     "gs_absolute_relative_duration_error_std": {
         "metric": "single_gs_absolute_relative_duration_error_log",
-        "normalization": "cost",
+        "criterion": "cost",
+        "normalization": True,
         "aggregation": lambda x: np.std(x),
         "weight": 0.122,
     },
@@ -145,7 +170,8 @@ def normalize(x, norm_type=None):
     weighting_factor_micoamigo[key]["aggregation"](
         normalize(
             evaluation_results_dict[weighting_factor_micoamigo[key]["metric"]],
-            norm_type=weighting_factor_micoamigo[key]["normalization"],
+            criterion=weighting_factor_micoamigo[key]["criterion"],
+            normalization=weighting_factor_micoamigo[key]["normalization"],
         )
     )
     * weighting_factor_micoamigo[key]["weight"]
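What this patch separates: ``criterion`` states whether a metric is a benefit (higher is better) or a cost (lower is better), while ``normalization`` states whether the observed min/max should be used as bounds. A hedged re-statement of the patch-2 logic on toy values (not part of the commit):

    import numpy as np

    def normalize_v2(x, criterion="benefit", normalization=False):
        # Same behaviour as the patched normalize() above, condensed for illustration.
        x = np.array(x)
        max_val, min_val = (x.max(), x.min()) if normalization else (1, 0)
        if criterion == "cost":
            return (max_val - x) / (max_val - min_val)
        return (x - min_val) / (max_val - min_val)

    normalize_v2([0.9, 0.8], criterion="benefit")                   # array([0.9, 0.8]): 0-1-bounded benefits pass through
    normalize_v2([0.1, 0.3], criterion="cost")                      # array([0.9, 0.7]): costs are flipped to 1 - x
    normalize_v2([0.1, 0.3], criterion="cost", normalization=True)  # array([1., 0.]): min-max rescaled, then flipped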
From 1a5d055320572fa5498b70938f8fb63bc4e3e7b3 Mon Sep 17 00:00:00 2001
From: Felix Kluge
Date: Fri, 7 Jun 2024 14:23:42 +0200
Subject: [PATCH 3/5] decouple normalization and cost/benefit; add other
 normalization methods

---
 examples/gsd/_04_gsd_performance_index.py | 77 ++++++++++++++---------
 1 file changed, 47 insertions(+), 30 deletions(-)

diff --git a/examples/gsd/_04_gsd_performance_index.py b/examples/gsd/_04_gsd_performance_index.py
index ce7db3d56..e59edd630 100644
--- a/examples/gsd/_04_gsd_performance_index.py
+++ b/examples/gsd/_04_gsd_performance_index.py
@@ -68,8 +68,14 @@ from typing import Literal
+import numpy as np
+from typing import Literal
+
+
 def normalize(
-    x, criterion: Literal["benefit", "cost"] = "benefit", normalization: bool = False
+    x,
+    criterion: Literal["benefit", "cost"] = "benefit",
+    normalization: Literal["minmax", "sigmoid", "exponential", None] = None,
 ):
     """
     Normalize a given array of values based on Bonci et al.
@@ -77,40 +83,51 @@
     Parameters:
     - x (array-like): The input array to be normalized.
     - criterion (str, optional): Whether the metric is a "cost" or a "benefit". Valid options are "cost" and "benefit" (default).
-    - normalization (bool, optional): Whether to perform normalization. If True, the values will be normalized between
-        the minimum and maximum values in the array. If False, the values will be
-        normalized between 0 and 1 (e.g., relevant for metrics such as precision, recall, ...). Default is False.
+    - normalization (str, optional): Which normalization to perform. Valid options are "minmax", "sigmoid", "exponential", or None (default).
 
     Returns:
-    array-like: The normalized array.
+    - array-like: The normalized array.
+
+    Raises:
+    - ValueError: If the criterion is not specified as either 'benefit' or 'cost'.
+
+    Examples:
+    >>> x = [1, 2, 3, 4, 5]
+    >>> normalize(x, normalization="minmax")
+    array([0.  , 0.25, 0.5 , 0.75, 1.  ])
+
+    >>> normalize(x, criterion="benefit", normalization="sigmoid")
+    array([0.73105858, 0.88079708, 0.95257413, 0.98201379, 0.99330715])
+
+    >>> normalize(x, criterion="cost", normalization="sigmoid")
+    array([0.26894142, 0.11920292, 0.04742587, 0.01798621, 0.00669285])
+
+    >>> normalize(x, criterion="benefit", normalization="exponential")
+    array([0.63212056, 0.86466472, 0.95021293, 0.98168436, 0.99326205])
     """
     x = np.array(x)
 
-    if normalization:
-        max_val = max(x)
-        min_val = min(x)
+    if normalization == "minmax":
+        x_norm = (x - min(x)) / (max(x) - min(x))
+    elif normalization == "sigmoid":
+        x_norm = 1 / (1 + np.exp(-x))
+    elif normalization == "exponential":
+        x_norm = 1 - np.exp(-x)
     else:
-        max_val = 1
-        min_val = 0
+        x_norm = x
 
-    if criterion == "cost":
-        return (max_val - x) / (max_val - min_val)
-    elif criterion == "benefit":
-        return (x - min_val) / (max_val - min_val)
+    if criterion == "benefit":
+        x_criterion = x_norm
+    elif criterion == "cost":
+        x_criterion = 1 - x_norm
     else:
-        return x
+        raise ValueError(
+            "criterion needs to be specified as either 'benefit' or 'cost'."
+        )
+
+    return x_criterion
-
-
-# E.g.:
-normalize(
-    evaluation_results_dict["single_num_gs_absolute_relative_error_log"],
-    criterion="cost",
-    normalization=True,
-)
-# %%
-evaluation_results_dict["single_num_gs_absolute_relative_error_log"]
 
 
 # %%
 # Define metrics that are used to calculate the performance index
 # For each metric, the underlying score, criterion (cost/benefit), aggregation (e.g., mean, std, ...), and weight need to be defined
@@ -118,42 +135,42 @@
     "recall_mean": {
         "metric": "single_recall",
         "criterion": "benefit",
-        "normalization": False,
+        "normalization": None,
         "aggregation": lambda x: np.mean(x),
         "weight": 0.117,
     },
     "specificity_mean": {
         "metric": "single_specificity",
         "criterion": "benefit",
-        "normalization": False,
+        "normalization": None,
         "aggregation": lambda x: np.mean(x),
         "weight": 0.178,
     },
     "precision_mean": {
         "metric": "single_precision",
         "criterion": "benefit",
-        "normalization": False,
+        "normalization": None,
         "aggregation": lambda x: np.mean(x),
         "weight": 0.105,
     },
     "accuracy_mean": {
         "metric": "single_accuracy",
         "criterion": "benefit",
-        "normalization": False,
+        "normalization": None,
         "aggregation": lambda x: np.mean(x),
         "weight": 0.160,
     },
     "gs_absolute_relative_duration_error_mean": {
         "metric": "single_gs_absolute_relative_duration_error_log",
         "criterion": "cost",
-        "normalization": True,
+        "normalization": "exponential",
         "aggregation": lambda x: np.mean(x),
         "weight": 0.122,
     },
     "gs_absolute_relative_duration_error_std": {
         "metric": "single_gs_absolute_relative_duration_error_log",
         "criterion": "cost",
-        "normalization": True,
+        "normalization": "exponential",
         "aggregation": lambda x: np.std(x),
         "weight": 0.122,
     },
@@ -180,7 +197,7 @@ def normalize(
 
 performance_index
 
-
+# TODO:
 # These are the weights used in Kluge et al.
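A note on the ``normalization="exponential"`` choice this patch introduces for the log-scaled error metrics: the forward map is 1 - exp(-x), so after the ``criterion="cost"`` flip the final score collapses to exp(-x). An error of 0 therefore scores 1.0 and the score decays smoothly towards 0 for large errors, without any data-dependent min/max bounds. A small numeric sketch (error values are made up):

    import numpy as np

    errors = np.array([0.0, 0.1, 0.5, 1.0, 3.0])  # hypothetical log-scaled absolute relative errors

    x_norm = 1 - np.exp(-errors)  # normalization="exponential"
    score = 1 - x_norm            # criterion="cost" flip; identical to np.exp(-errors)
    # score -> array([1.        , 0.90483742, 0.60653066, 0.36787944, 0.04978707])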
From 172fb71b40bc12d6e790154e550d9add39219f1c Mon Sep 17 00:00:00 2001
From: Felix Kluge
Date: Mon, 10 Jun 2024 11:12:13 +0200
Subject: [PATCH 4/5] add full performance index calculation

---
 examples/gsd/_03_gsd_evaluation.py        | 158 ++++++++++++++++
 examples/gsd/_04_gsd_performance_index.py | 211 ----------------------
 mobgap/gsd/evaluation.py                  | 167 ++++++++++++++++-
 3 files changed, 317 insertions(+), 219 deletions(-)
 delete mode 100644 examples/gsd/_04_gsd_performance_index.py

diff --git a/examples/gsd/_03_gsd_evaluation.py b/examples/gsd/_03_gsd_evaluation.py
index f0fccc187..2b05f6ba4 100644
--- a/examples/gsd/_03_gsd_evaluation.py
+++ b/examples/gsd/_03_gsd_evaluation.py
@@ -7,6 +7,7 @@
 This example shows how to apply evaluation algorithms to GSD and thus how to rate the performance of a GSD algorithm.
 """
 
+# %%
 import pandas as pd
 from mobgap.data import LabExampleDataset
 from mobgap.gsd import GsdIluz
@@ -272,3 +273,160 @@ def load_reference(single_test_data):
 # In general, it is a good idea to use ``cross_validation`` also for algorithms that do not have tunable parameters.
 # This way you can ensure that the performance of the algorithm is stable across different splits of the data, and it
 # allows the direct comparison between tunable and non-tunable algorithms.
+
+
+# %%
+# Calculate performance index after "Running a full evaluation pipeline"
+# ----------------------------------------------------------------------
+# Bonci et al. (2020) (https://www.mdpi.com/1424-8220/20/22/6509) suggest a methodology to determine a performance
+# index that combines multiple metrics into a single value.
+import numpy as np
+from mobgap.gsd.evaluation import calc_gs_duration_icc, calc_performance_index
+
+# Get a dictionary of available scoring metrics
+evaluation_results_dict = evaluation_results.drop(
+    ["single_reference", "single_detected"], axis=1
+).T[0]
+evaluation_results_dict
+
+
+# %%
+# Define metrics that are used to calculate the performance index
+# For each metric, the underlying score, criterion (cost/benefit), aggregation (e.g., mean, std, ...), and weight need to be defined
+# ``weighting_factor_micoamigo`` defines the weights of the metrics as suggested by Micó-Amigo et al. (https://pubmed.ncbi.nlm.nih.gov/37316858/)
+weighting_factor_micoamigo = {
+    "recall_mean": {
+        "metric": "single_recall",
+        "criterion": "benefit",
+        "normalization": None,
+        "aggregation": np.mean,
+        "weight": 0.117,
+    },
+    "specificity_mean": {
+        "metric": "single_specificity",
+        "criterion": "benefit",
+        "normalization": None,
+        "aggregation": np.mean,
+        "weight": 0.178,
+    },
+    "precision_mean": {
+        "metric": "single_precision",
+        "criterion": "benefit",
+        "normalization": None,
+        "aggregation": np.mean,
+        "weight": 0.105,
+    },
+    "accuracy_mean": {
+        "metric": "single_accuracy",
+        "criterion": "benefit",
+        "normalization": None,
+        "aggregation": np.mean,
+        "weight": 0.160,
+    },
+    "gs_absolute_relative_duration_error_mean": {
+        "metric": "single_gs_absolute_relative_duration_error_log",
+        "criterion": "cost",
+        "normalization": "exponential",
+        "aggregation": np.mean,
+        "weight": 0.122,
+    },
+    "gs_absolute_relative_duration_error_std": {
+        "metric": "single_gs_absolute_relative_duration_error_log",
+        "criterion": "cost",
+        "normalization": "exponential",
+        "aggregation": np.std,
+        "weight": 0.122,
+    },
+    "icc_mean": {
+        "metric": [
+            "single_detected_gs_duration_s",
+            "single_reference_gs_duration_s",
+        ],
+        "criterion": "benefit",
+        "normalization": None,
+        "aggregation": calc_gs_duration_icc,
+        "weight": 0.196,
+    },
+}
+
+# %%
+# Calculate performance index
+
+performance_index = calc_performance_index(
+    evaluation_results=evaluation_results_dict,
+    weighting_factor=weighting_factor_micoamigo,
+)
+performance_index
+
+
+# %%
+# Define metrics that are used to calculate the performance index
+# For each metric, the underlying score, criterion (cost/benefit), aggregation (e.g., mean, std, ...), and weight need to be defined
+# ``weighting_factor_kluge`` defines the weights of the metrics used by Kluge et al. (https://doi.org/10.2196/50035) (see also Multimedia Appendix 1)
+weighting_factor_kluge = {
+    "recall_mean": {
+        "metric": "single_recall",
+        "criterion": "benefit",
+        "normalization": None,
+        "aggregation": np.mean,
+        "weight": 0.100,
+    },
+    "specificity_mean": {
+        "metric": "single_specificity",
+        "criterion": "benefit",
+        "normalization": None,
+        "aggregation": np.mean,
+        "weight": 0.151,
+    },
+    "precision_mean": {
+        "metric": "single_precision",
+        "criterion": "benefit",
+        "normalization": None,
+        "aggregation": np.mean,
+        "weight": 0.089,
+    },
+    "accuracy_mean": {
+        "metric": "single_accuracy",
+        "criterion": "benefit",
+        "normalization": None,
+        "aggregation": np.mean,
+        "weight": 0.135,
+    },
+    "gs_absolute_relative_duration_error_mean": {
+        "metric": "single_gs_absolute_relative_duration_error_log",
+        "criterion": "cost",
+        "normalization": "exponential",
+        "aggregation": np.mean,
+        "weight": 0.104,
+    },
+    "gs_absolute_relative_duration_error_std": {
+        "metric": "single_gs_absolute_relative_duration_error_log",
+        "criterion": "cost",
+        "normalization": "exponential",
+        "aggregation": np.std,
+        "weight": 0.104,
+    },
+    "icc_mean": {
+        "metric": [
+            "single_detected_gs_duration_s",
+            "single_reference_gs_duration_s",
+        ],
+        "criterion": "benefit",
+        "normalization": None,
+        "aggregation": calc_gs_duration_icc,
+        "weight": 0.167,
+    },
+    "gs_nr_mean": {
+        "metric": "single_num_gs_absolute_relative_error_log",
+        "criterion": "cost",
+        "normalization": "exponential",
+        "aggregation": np.mean,
+        "weight": 0.150,
+    },
+}
+
+performance_index = calc_performance_index(
+    evaluation_results=evaluation_results_dict,
+    weighting_factor=weighting_factor_kluge,
+)
+performance_index
diff --git a/examples/gsd/_04_gsd_performance_index.py b/examples/gsd/_04_gsd_performance_index.py
deleted file mode 100644
index e59edd630..000000000
--- a/examples/gsd/_04_gsd_performance_index.py
+++ /dev/null
@@ -1,211 +0,0 @@
-r"""
-.. _gsd_performance_index:
-
-GSD Evaluation - Performance index
-==================================
-
-This example shows how to calculate the performance index based on running gait detection on multiple data points.
-"""
-
-# %%
-import pandas as pd
-from mobgap.data import LabExampleDataset
-from mobgap.gsd import GsdIluz
-
-
-# %%
-# Running a full evaluation pipeline
-# ----------------------------------
-# Instead of manually evaluating and investigating the performance of a GSD algorithm on a single piece of data, we
-# often want to run a full evaluation on an entire dataset.
-# This can be done using the :class:`~mobgap.gsd.pipeline.GsdEmulationPipeline` class and some ``tpcp`` functions.
-#
-# But let's start with selecting some data.
-# We want to use all the simulated real-world walking data from the INDIP reference system (Test11).
-simulated_real_world_walking = LabExampleDataset(reference_system="INDIP").get_subset(
-    test="Test11"
-)
-
-simulated_real_world_walking
-# %%
-# Now we can use the :class:`~mobgap.gsd.pipeline.GsdEmulationPipeline` class to directly run a GSD algorithm on
-# a datapoint.
-# The pipeline takes care of extracting the required data.
-from mobgap.gsd.pipeline import GsdEmulationPipeline
-
-pipeline = GsdEmulationPipeline(GsdIluz())
-
-pipeline.safe_run(simulated_real_world_walking[0]).gs_list_
-
-# %%
-# Note that this just "ran" the pipeline on a single datapoint.
-# If we want to run it on all datapoints and evaluate the performance of the algorithm, we can use the
-# :func:`~tpcp.validate.validate` function.
-#
-# It uses the built-in ``score`` method of the pipeline to calculate the performance of the algorithm on each datapoint
-# and then takes the mean of the results.
-# All mean and individual results are returned in one large dictionary that can easily be converted to a pandas DataFrame.
-from tpcp.validate import validate
-
-evaluation_results = pd.DataFrame(validate(pipeline, simulated_real_world_walking))
-
-evaluation_results_dict = evaluation_results.drop(
-    ["single_reference", "single_detected"], axis=1
-).T[0]
-
-
-# %%
-# Print the list of available scoring metrics
-evaluation_results_dict
-
-# %%
-# Calculate performance index
-# ------------------------------
-# Bonci et al. (2020) (https://www.mdpi.com/1424-8220/20/22/6509) suggest a methodology to determine a performance
-# index that combines multiple metrics into a single value.
-
-import numpy as np
-from typing import Literal
-
-
-import numpy as np
-from typing import Literal
-
-
-def normalize(
-    x,
-    criterion: Literal["benefit", "cost"] = "benefit",
-    normalization: Literal["minmax", "sigmoid", "exponential", None] = None,
-):
-    """
-    Normalize a given array of values based on Bonci et al.
-
-    Parameters:
-    - x (array-like): The input array to be normalized.
-    - criterion (str, optional): Whether the metric is a "cost" or a "benefit". Valid options are "cost" and "benefit" (default).
-    - normalization (str, optional): Which normalization to perform. Valid options are "minmax", "sigmoid", "exponential", or None (default).
-
-    Returns:
-    - array-like: The normalized array.
-
-    Raises:
-    - ValueError: If the criterion is not specified as either 'benefit' or 'cost'.
-
-    Examples:
-    >>> x = [1, 2, 3, 4, 5]
-    >>> normalize(x, normalization="minmax")
-    array([0.  , 0.25, 0.5 , 0.75, 1.  ])
-
-    >>> normalize(x, criterion="benefit", normalization="sigmoid")
-    array([0.73105858, 0.88079708, 0.95257413, 0.98201379, 0.99330715])
-
-    >>> normalize(x, criterion="cost", normalization="sigmoid")
-    array([0.26894142, 0.11920292, 0.04742587, 0.01798621, 0.00669285])
-
-    >>> normalize(x, criterion="benefit", normalization="exponential")
-    array([0.63212056, 0.86466472, 0.95021293, 0.98168436, 0.99326205])
-    """
-    x = np.array(x)
-
-    if normalization == "minmax":
-        x_norm = (x - min(x)) / (max(x) - min(x))
-    elif normalization == "sigmoid":
-        x_norm = 1 / (1 + np.exp(-x))
-    elif normalization == "exponential":
-        x_norm = 1 - np.exp(-x)
-    else:
-        x_norm = x
-
-    if criterion == "benefit":
-        x_criterion = x_norm
-    elif criterion == "cost":
-        x_criterion = 1 - x_norm
-    else:
-        raise ValueError(
-            "criterion needs to be specified as either 'benefit' or 'cost'."
-        )
-
-    return x_criterion
-
-
-# %%
-# Define metrics that are used to calculate the performance index
-# For each metric, the underlying score, criterion (cost/benefit), aggregation (e.g., mean, std, ...), and weight need to be defined
-weighting_factor_micoamigo = {
-    "recall_mean": {
-        "metric": "single_recall",
-        "criterion": "benefit",
-        "normalization": None,
-        "aggregation": lambda x: np.mean(x),
-        "weight": 0.117,
-    },
-    "specificity_mean": {
-        "metric": "single_specificity",
-        "criterion": "benefit",
-        "normalization": None,
-        "aggregation": lambda x: np.mean(x),
-        "weight": 0.178,
-    },
-    "precision_mean": {
-        "metric": "single_precision",
-        "criterion": "benefit",
-        "normalization": None,
-        "aggregation": lambda x: np.mean(x),
-        "weight": 0.105,
-    },
-    "accuracy_mean": {
-        "metric": "single_accuracy",
-        "criterion": "benefit",
-        "normalization": None,
-        "aggregation": lambda x: np.mean(x),
-        "weight": 0.160,
-    },
-    "gs_absolute_relative_duration_error_mean": {
-        "metric": "single_gs_absolute_relative_duration_error_log",
-        "criterion": "cost",
-        "normalization": "exponential",
-        "aggregation": lambda x: np.mean(x),
-        "weight": 0.122,
-    },
-    "gs_absolute_relative_duration_error_std": {
-        "metric": "single_gs_absolute_relative_duration_error_log",
-        "criterion": "cost",
-        "normalization": "exponential",
-        "aggregation": lambda x: np.std(x),
-        "weight": 0.122,
-    },
-    # "icc_mean": {
-    #     "metric": ["single_detected_gs_duration_s", "single_reference_gs_duration_s"],
-    #     "normalization": None,
-    #     "aggregation": lambda x: np.intraclass_corr(x),
-    #     "weight": 0.196,
-    # },
-}
-
-# Calculate performance index
-performance_index = sum(
-    weighting_factor_micoamigo[key]["aggregation"](
-        normalize(
-            evaluation_results_dict[weighting_factor_micoamigo[key]["metric"]],
-            criterion=weighting_factor_micoamigo[key]["criterion"],
-            normalization=weighting_factor_micoamigo[key]["normalization"],
-        )
-    )
-    * weighting_factor_micoamigo[key]["weight"]
-    for key in weighting_factor_micoamigo
-)
-
-performance_index
-
-# TODO:
-# These are the weights used in Kluge et al.
-# weighting_factor_kluge = {
-#     "recall": 0.117,
-#     "specificity": 0.151,
-#     "precision": 0.089,
-#     "accuracy": 0.135,
-#     "gs_relative_duration_error": 0.104,
-#     # "Bias_rel_std": 0.104,
-#     # "ICC_mean": 0.167,
-#     # "Bias_nr_gs_mean": 0.15,
-# }
diff --git a/mobgap/gsd/evaluation.py b/mobgap/gsd/evaluation.py
index b885d70a1..675ddc61c 100644
--- a/mobgap/gsd/evaluation.py
+++ b/mobgap/gsd/evaluation.py
@@ -10,6 +10,7 @@
 from intervaltree.interval import Interval
 from matplotlib.axes import Axes
 from matplotlib.figure import Figure
+from pingouin import intraclass_corr
 from typing_extensions import Unpack
 
 from mobgap.utils.evaluation import (
@@ -93,7 +94,12 @@ def calculate_matched_gsd_performance_metrics(
     # estimate performance metrics
     precision_recall_f1 = precision_recall_f1_score(matches, zero_division=zero_division)
 
-    gsd_metrics = {"tp_samples": tp_samples, "fp_samples": fp_samples, "fn_samples": fn_samples, **precision_recall_f1}
+    gsd_metrics = {
+        "tp_samples": tp_samples,
+        "fp_samples": fp_samples,
+        "fn_samples": fn_samples,
+        **precision_recall_f1,
+    }
 
     # tn-dependent metrics
     if tn_samples != 0:
@@ -228,7 +234,10 @@ def calculate_unmatched_gsd_performance_metrics(
 
 
 def categorize_intervals(
-    *, gsd_list_detected: pd.DataFrame, gsd_list_reference: pd.DataFrame, n_overall_samples: Optional[int] = None
+    *,
+    gsd_list_detected: pd.DataFrame,
+    gsd_list_reference: pd.DataFrame,
+    n_overall_samples: Optional[int] = None,
 ) -> pd.DataFrame:
     """
     Evaluate detected gait sequence intervals against a reference on a sample-wise level.
@@ -367,7 +376,10 @@ def _check_input_sanity(
         raise TypeError("`gsd_list_detected` and `gsd_list_reference` must be of type `pandas.DataFrame`.")
     # check if start and end columns are present
     try:
-        detected, reference = gsd_list_detected[["start", "end"]], gsd_list_reference[["start", "end"]]
+        detected, reference = (
+            gsd_list_detected[["start", "end"]],
+            gsd_list_reference[["start", "end"]],
+        )
     except KeyError as e:
         raise ValueError(
             "`gsd_list_detected` and `gsd_list_reference` must have columns named 'start' and 'end'."
@@ -403,7 +415,10 @@ def _get_false_matches_from_overlap_data(overlaps: list[Interval], interval: Int
 
 
 def find_matches_with_min_overlap(
-    *, gsd_list_detected: pd.DataFrame, gsd_list_reference: pd.DataFrame, overlap_threshold: float = 0.8
+    *,
+    gsd_list_detected: pd.DataFrame,
+    gsd_list_reference: pd.DataFrame,
+    overlap_threshold: float = 0.8,
 ) -> pd.DataFrame:
     """
     Find all matches of `gsd_list_detected` in `gsd_list_reference` with at least ``overlap_threshold`` overlap.
@@ -516,15 +531,35 @@ def _get_tn_intervals(categorized_intervals: pd.DataFrame, n_overall_samples: Un
 
 
 def plot_categorized_intervals(
-    gsd_list_detected: pd.DataFrame, gsd_list_reference: pd.DataFrame, categorized_intervals: pd.DataFrame
+    gsd_list_detected: pd.DataFrame,
+    gsd_list_reference: pd.DataFrame,
+    categorized_intervals: pd.DataFrame,
 ) -> Figure:
     """Plot the categorized intervals together with the detected and reference intervals."""
     fig, ax = plt.subplots(figsize=(10, 3))
     _plot_intervals_from_df(gsd_list_reference, 3, ax, color="orange")
     _plot_intervals_from_df(gsd_list_detected, 2, ax, color="blue")
-    _plot_intervals_from_df(categorized_intervals.query("match_type == 'tp'"), 1, ax, color="green", label="TP")
-    _plot_intervals_from_df(categorized_intervals.query("match_type == 'fp'"), 1, ax, color="red", label="FP")
-    _plot_intervals_from_df(categorized_intervals.query("match_type == 'fn'"), 1, ax, color="purple", label="FN")
+    _plot_intervals_from_df(
+        categorized_intervals.query("match_type == 'tp'"),
+        1,
+        ax,
+        color="green",
+        label="TP",
+    )
+    _plot_intervals_from_df(
+        categorized_intervals.query("match_type == 'fp'"),
+        1,
+        ax,
+        color="red",
+        label="FP",
+    )
+    _plot_intervals_from_df(
+        categorized_intervals.query("match_type == 'fn'"),
+        1,
+        ax,
+        color="purple",
+        label="FN",
+    )
     plt.yticks([1, 2, 3], ["Categorized", "Detected", "Reference"])
     plt.ylim(0, 4)
     plt.xlabel("Index")
@@ -546,10 +581,126 @@ def _plot_intervals_from_df(df: pd.DataFrame, y: int, ax: Axes, **kwargs: Unpack
         ax.hlines(y, row["start"], row["end"], lw=20, **kwargs)
 
 
+def _normalize(
+    x: np.ndarray,
+    criterion: Literal["benefit", "cost"] = "benefit",
+    normalization: Literal["minmax", "sigmoid", "exponential", None] = None,
+) -> np.ndarray:
+    """
+    Normalize a given array of values based on Bonci et al.
+
+    Parameters
+    ----------
+    - x (array-like): The input array to be normalized.
+    - criterion (str, optional): Whether the metric is a "cost" or a "benefit".
+        Valid options are "cost" and "benefit" (default).
+    - normalization (str, optional): Which normalization to perform.
+        Valid options are "minmax", "sigmoid", "exponential", or None (default).
+
+    Returns
+    -------
+    - array-like: The normalized array.
+
+    Raises
+    ------
+    - ValueError: If the criterion is not specified as either 'benefit' or 'cost'.
+
+    Examples
+    --------
+    >>> x = [1, 2, 3, 4, 5]
+    >>> _normalize(x, normalization="minmax")
+    array([0.  , 0.25, 0.5 , 0.75, 1.  ])
+
+    >>> _normalize(x, criterion="benefit", normalization="sigmoid")
+    array([0.73105858, 0.88079708, 0.95257413, 0.98201379, 0.99330715])
+
+    >>> _normalize(x, criterion="cost", normalization="sigmoid")
+    array([0.26894142, 0.11920292, 0.04742587, 0.01798621, 0.00669285])
+
+    >>> _normalize(x, criterion="benefit", normalization="exponential")
+    array([0.63212056, 0.86466472, 0.95021293, 0.98168436, 0.99326205])
+    """
+    x = np.array(x)
+
+    if normalization == "minmax":
+        x_norm = (x - min(x)) / (max(x) - min(x))
+    elif normalization == "sigmoid":
+        x_norm = 1 / (1 + np.exp(-x))
+    elif normalization == "exponential":
+        x_norm = 1 - np.exp(-x)
+    else:
+        x_norm = x
+
+    if criterion == "benefit":
+        x_criterion = x_norm
+    elif criterion == "cost":
+        x_criterion = 1 - x_norm
+    else:
+        raise ValueError("criterion needs to be specified as either 'benefit' or 'cost'.")
+
+    return x_criterion
+
+
+def calc_gs_duration_icc(x: np.ndarray) -> float:
+    """Calculate the Intraclass Correlation Coefficient (ICC) for a given dataset."""
+    # Bring the pair of duration arrays into long format (one rating per row)
+    x_df = (
+        pd.DataFrame(x)
+        .rename(columns={0: "duration_s"})
+        .assign(trial_id=lambda df_: df_.duration_s.map(lambda x: range(1, len(x) + 1)))
+        .explode(["duration_s", "trial_id"])
+        .assign(
+            duration_s=lambda df_: df_.duration_s.astype(float),
+            trial_id=lambda df_: df_.trial_id.astype(int),
+        )
+        .rename_axis("system")
+        .reset_index()
+    )
+    # Calculate ICC
+    x_icc = intraclass_corr(x_df, ratings="duration_s", raters="system", targets="trial_id")
+
+    # Return the ICC1 estimate (first row of the pingouin result table)
+    return x_icc.loc[0].ICC
+
+
+def calc_performance_index(evaluation_results: dict, weighting_factor: dict) -> float:
+    """
+    Calculate the performance index based on evaluation results and weighting factors.
+
+    Parameters
+    ----------
+    evaluation_results : dict
+        A dictionary containing the evaluation results for different metrics.
+    weighting_factor : dict
+        A dictionary containing the weighting factors for different metrics.
+
+    Returns
+    -------
+    float
+        The calculated performance index.
+
+    """
+    performance_index = sum(
+        weighting_factor[key]["aggregation"](
+            _normalize(
+                evaluation_results[weighting_factor[key]["metric"]],
+                criterion=weighting_factor[key]["criterion"],
+                normalization=weighting_factor[key]["normalization"],
+            )
+        )
+        * weighting_factor[key]["weight"]
+        for key in weighting_factor
+    )
+
+    return performance_index
+
+
 __all__ = [
     "categorize_intervals",
     "find_matches_with_min_overlap",
     "calculate_matched_gsd_performance_metrics",
     "calculate_unmatched_gsd_performance_metrics",
     "plot_categorized_intervals",
+    "calc_gs_duration_icc",
+    "calc_performance_index",
 ]

From 660fce91384c64ec19cd7e3c9f4d13c839e462b1 Mon Sep 17 00:00:00 2001
From: Felix Kluge
Date: Mon, 10 Jun 2024 11:15:22 +0200
Subject: [PATCH 5/5] update pyproject.toml

---
 pyproject.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pyproject.toml b/pyproject.toml
index 94f74c12e..d898d2f06 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -28,6 +28,7 @@ pywavelets = ">=1.5.0"
 pooch = ">=1.8.1"
 # TODO: Remove this dependency at some point
 gaitmap = ">=2.5.0"
+pingouin = "^0.5.4"
 
 [tool.poetry.group.dev.dependencies]
 poethepoet = "^0.22.0"
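A closing note on the ICC step added in patch 4: ``calc_gs_duration_icc`` reshapes the pair of duration arrays into the long format that ``pingouin.intraclass_corr`` expects — one rating per row, with the rating system (detected vs. reference) as rater and the gait sequence as target — and reads the ICC1 estimate from the first row of the result table. A minimal standalone sketch with made-up durations (the values are illustrative only):

    import pandas as pd
    from pingouin import intraclass_corr

    # Hypothetical gait sequence durations (s) for three sequences, rated by two "systems".
    durations_long = pd.DataFrame(
        {
            "system": ["detected"] * 3 + ["reference"] * 3,
            "trial_id": [1, 2, 3, 1, 2, 3],
            "duration_s": [10.2, 8.1, 12.5, 10.0, 8.4, 12.3],
        }
    )

    icc_results = intraclass_corr(
        durations_long, targets="trial_id", raters="system", ratings="duration_s"
    )
    icc_1 = icc_results.loc[0, "ICC"]  # first row is ICC1, as used by calc_gs_duration_icc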