From 0c1615860f55c609ba665f77555e62f120ce14e2 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Thu, 26 Dec 2024 10:33:29 +0100 Subject: [PATCH 1/5] Remove unused param llm_as_prompt_judgement --- src/lighteval/logging/info_loggers.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/lighteval/logging/info_loggers.py b/src/lighteval/logging/info_loggers.py index 46d3ab5c7..d812eedfb 100644 --- a/src/lighteval/logging/info_loggers.py +++ b/src/lighteval/logging/info_loggers.py @@ -25,7 +25,7 @@ import os import time from dataclasses import asdict, dataclass, field -from typing import Optional, Union +from typing import Union import git import numpy as np @@ -319,7 +319,6 @@ def log( doc: Doc, outputs: list[ModelResponse], metrics: dict, - llm_as_prompt_judgement: Optional[tuple[str, str]] = None, ) -> None: """Stores the relevant information for one sample of one task to the total list of samples stored in the DetailsLogger. @@ -329,8 +328,6 @@ def log( doc (Doc): Current sample that we want to store. outputs (list[ModelResponse]): Model outputs for the current sample metrics (_type_): Model scores for said sample on the current task's metrics. - llm_as_prompt_judgement (tuple[str, str]): Tuple containing the - prompt passed to the judge and the judgement for the current sample when using llm-as-judge metric. """ detail = self.Detail() detail.example = doc.query From aa9b6f3403ab67bb374e8e0bd06b977f0936e1da Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Thu, 26 Dec 2024 10:36:00 +0100 Subject: [PATCH 2/5] Remove unused param bootstrap_iters --- src/lighteval/logging/info_loggers.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/lighteval/logging/info_loggers.py b/src/lighteval/logging/info_loggers.py index d812eedfb..2e0aed436 100644 --- a/src/lighteval/logging/info_loggers.py +++ b/src/lighteval/logging/info_loggers.py @@ -483,14 +483,12 @@ def log(self, task_name: str, metrics: dict) -> None: for metric_name, metric_value in metrics.items(): self.metrics_values[task_name][metric_name].append(metric_value) - def aggregate(self, task_dict: dict[str, LightevalTask], bootstrap_iters: int = 1000): # noqa: C901 + def aggregate(self, task_dict: dict[str, LightevalTask]): # noqa: C901 """ Aggregate the metrics for each task and then for all tasks. Args: task_dict (dict[str, LightevalTask]): used to determine what aggregation function to use for each metric - bootstrap_iters (int, optional): Number of runs used to run the statistical bootstrap. Defaults to 1000. - """ for task_name, metrics in self.metrics_values.items(): From 387748364a39f831bce7042b672a204b28eafa03 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Thu, 26 Dec 2024 10:40:38 +0100 Subject: [PATCH 3/5] Rename CompiledHash params and align with docstring --- src/lighteval/logging/info_loggers.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/lighteval/logging/info_loggers.py b/src/lighteval/logging/info_loggers.py index 2e0aed436..5edae2ebc 100644 --- a/src/lighteval/logging/info_loggers.py +++ b/src/lighteval/logging/info_loggers.py @@ -289,16 +289,16 @@ class CompiledHash: Hashes the aggregated hash values for all the sample ([`Doc`]) of one task ([`LightevalTask`]) Attributes: - example (str): Aggregated hash of all the [`Doc.query`] hashes for all samples of the current task. - full_prompt (str): Aggregated hash of all the [`Doc.ctx`] hashes for all samples of the current task. + examples (str): Aggregated hash of all the [`Doc.query`] hashes for all samples of the current task. + full_prompts (str): Aggregated hash of all the [`Doc.ctx`] hashes for all samples of the current task. input_tokens (str): Aggregated hash of the aggregated [`Doc.input_tokens`] hashes over all samples of the current task. cont_tokens (str): Aggregated hash of the aggregated [`Doc.generated_tokens`] hashes over all samples of the current task. """ - hash_examples: str = "" - hash_full_prompts: str = "" - hash_input_tokens: str = "" - hash_cont_tokens: str = "" + examples: str = "" + full_prompts: str = "" + input_tokens: str = "" + cont_tokens: str = "" hashes: dict[str, list[Hash]] = field(default_factory=lambda: collections.defaultdict(list)) compiled_hashes: dict[str, CompiledHash] = field( @@ -412,16 +412,16 @@ def aggregate(self): for task_name in self.hashes: compiled_hash = self.CompiledHash() - compiled_hash.hash_examples = xxhash.xxh64( + compiled_hash.examples = xxhash.xxh64( "".join(sorted(q.example for q in self.hashes[task_name])) ).hexdigest() # hash of all the hash - sorted for reproducibility - compiled_hash.hash_full_prompts = xxhash.xxh64( + compiled_hash.full_prompts = xxhash.xxh64( "".join(sorted(q.full_prompt for q in self.hashes[task_name])) ).hexdigest() # hash of all the hash - sorted for reproducibility - compiled_hash.hash_input_tokens = xxhash.xxh64( + compiled_hash.input_tokens = xxhash.xxh64( "".join(sorted(q.input_tokens for q in self.hashes[task_name])) ).hexdigest() # hash of all the hash - sorted for reproducibility - compiled_hash.hash_cont_tokens = xxhash.xxh64( + compiled_hash.cont_tokens = xxhash.xxh64( "".join(sorted(q.cont_tokens for q in self.hashes[task_name])) ).hexdigest() # hash of all the hash - sorted for reproducibility self.compiled_hashes[task_name] = compiled_hash From b9d93f398f0baff2a2e06ca0aa4c7f297877ff0b Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Thu, 26 Dec 2024 10:43:04 +0100 Subject: [PATCH 4/5] Remove effective_few_shots attribute from docstring --- src/lighteval/logging/info_loggers.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/lighteval/logging/info_loggers.py b/src/lighteval/logging/info_loggers.py index 5edae2ebc..ed6d5652c 100644 --- a/src/lighteval/logging/info_loggers.py +++ b/src/lighteval/logging/info_loggers.py @@ -251,11 +251,7 @@ class CompiledDetailOverAllTasks: non_truncated (int): Total number of samples which did not need prompt truncation to fit the model context size across all tasks padded (int): Number of samples which needed padding during the batching step across all tasks. non_padded (int): Number of samples which did not need padding during the batching step across all tasks. - effective_few_shots (float): Average effective few shots across all samples across all tasks. - effective few shot is the number of few shots actually used to fit the prompt in the model context - length while allowing model generation of the expected size. num_truncated_few_shots (int): Number of samples which required truncated prompts to fit the model size across all tasks. - """ hashes: dict = field(default_factory=dict) From 2ab4368634ef63b1f6532019fd034a21889dc410 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Thu, 26 Dec 2024 10:46:07 +0100 Subject: [PATCH 5/5] Fix docstring attribute names --- src/lighteval/logging/info_loggers.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/lighteval/logging/info_loggers.py b/src/lighteval/logging/info_loggers.py index ed6d5652c..1ba8615ac 100644 --- a/src/lighteval/logging/info_loggers.py +++ b/src/lighteval/logging/info_loggers.py @@ -462,7 +462,7 @@ class MetricsLogger: """Logs the actual scores for each metric of each task. Attributes: - metrics_value (dict[str, dict[str, list[float]]]): Maps each task to its dictionary of metrics to scores for all the example of the task. + metrics_values (dict[str, dict[str, list[float]]]): Maps each task to its dictionary of metrics to scores for all the example of the task. Example: {"winogrande|winogrande_xl": {"accuracy": [0.5, 0.5, 0.5, 0.5, 0.5, 0.5]}} metric_aggregated (dict[str, dict[str, float]]): Maps each task to its dictionary of metrics to aggregated scores over all the example of the task. Example: {"winogrande|winogrande_xl": {"accuracy": 0.5}} @@ -563,8 +563,7 @@ class VersionsLogger: Tasks can have a version number/date, which indicates what is the precise metric definition and dataset version used for an evaluation. Attributes: - version (dict[str, int]): Maps the task names with the task versions. - + versions (dict[str, int]): Maps the task names with the task versions. """ # the versions dict will be a dict of task_name: task_version @@ -580,8 +579,7 @@ class TaskConfigLogger: """Logs the different parameters of the current [`LightevalTask`] of interest. Attributes: - tasks_config (dict[str, LightevalTaskConfig]): Maps each task to its associated [`LightevalTaskConfig`] - + tasks_configs (dict[str, LightevalTaskConfig]): Maps each task to its associated [`LightevalTaskConfig`] """ tasks_configs: dict[str, LightevalTaskConfig] = field(default_factory=dict)