diff --git a/src/lighteval/logging/info_loggers.py b/src/lighteval/logging/info_loggers.py
index 46d3ab5c7..1ba8615ac 100644
--- a/src/lighteval/logging/info_loggers.py
+++ b/src/lighteval/logging/info_loggers.py
@@ -25,7 +25,7 @@
 import os
 import time
 from dataclasses import asdict, dataclass, field
-from typing import Optional, Union
+from typing import Union
 
 import git
 import numpy as np
@@ -251,11 +251,7 @@ class CompiledDetailOverAllTasks:
             non_truncated (int): Total number of samples which did not need prompt truncation to fit the model context size across all tasks
             padded (int): Number of samples which needed padding during the batching step across all tasks.
             non_padded (int): Number of samples which did not need padding during the batching step across all tasks.
-            effective_few_shots (float): Average effective few shots across all samples across all tasks.
-                effective few shot is the number of few shots actually used to fit the prompt in the model context
-                length while allowing model generation of the expected size.
             num_truncated_few_shots (int): Number of samples which required truncated prompts to fit the model size across all tasks.
-
         """
 
         hashes: dict = field(default_factory=dict)
@@ -289,16 +285,16 @@ class CompiledHash:
         Hashes the aggregated hash values for all the sample ([`Doc`]) of one task ([`LightevalTask`])
 
         Attributes:
-            example (str): Aggregated hash of all the [`Doc.query`] hashes for all samples of the current task.
-            full_prompt (str): Aggregated hash of all the [`Doc.ctx`] hashes for all samples of the current task.
+            examples (str): Aggregated hash of all the [`Doc.query`] hashes for all samples of the current task.
+            full_prompts (str): Aggregated hash of all the [`Doc.ctx`] hashes for all samples of the current task.
             input_tokens (str): Aggregated hash of the aggregated [`Doc.input_tokens`] hashes over all samples of the current task.
             cont_tokens (str): Aggregated hash of the aggregated [`Doc.generated_tokens`] hashes over all samples of the current task.
         """
 
-        hash_examples: str = ""
-        hash_full_prompts: str = ""
-        hash_input_tokens: str = ""
-        hash_cont_tokens: str = ""
+        examples: str = ""
+        full_prompts: str = ""
+        input_tokens: str = ""
+        cont_tokens: str = ""
 
     hashes: dict[str, list[Hash]] = field(default_factory=lambda: collections.defaultdict(list))
     compiled_hashes: dict[str, CompiledHash] = field(
@@ -319,7 +315,6 @@ def log(
         doc: Doc,
         outputs: list[ModelResponse],
         metrics: dict,
-        llm_as_prompt_judgement: Optional[tuple[str, str]] = None,
     ) -> None:
         """Stores the relevant information for one sample of one task to the total list of samples stored in the DetailsLogger.
 
@@ -329,8 +324,6 @@
             doc (Doc): Current sample that we want to store.
             outputs (list[ModelResponse]): Model outputs for the current sample
             metrics (_type_): Model scores for said sample on the current task's metrics.
-            llm_as_prompt_judgement (tuple[str, str]): Tuple containing the
-                prompt passed to the judge and the judgement for the current sample when using llm-as-judge metric.
         """
         detail = self.Detail()
         detail.example = doc.query
@@ -415,16 +408,16 @@
     def aggregate(self):
         for task_name in self.hashes:
            compiled_hash = self.CompiledHash()
-            compiled_hash.hash_examples = xxhash.xxh64(
+            compiled_hash.examples = xxhash.xxh64(
                 "".join(sorted(q.example for q in self.hashes[task_name]))
             ).hexdigest()  # hash of all the hash - sorted for reproducibility
-            compiled_hash.hash_full_prompts = xxhash.xxh64(
+            compiled_hash.full_prompts = xxhash.xxh64(
                 "".join(sorted(q.full_prompt for q in self.hashes[task_name]))
             ).hexdigest()  # hash of all the hash - sorted for reproducibility
-            compiled_hash.hash_input_tokens = xxhash.xxh64(
+            compiled_hash.input_tokens = xxhash.xxh64(
                 "".join(sorted(q.input_tokens for q in self.hashes[task_name]))
             ).hexdigest()  # hash of all the hash - sorted for reproducibility
-            compiled_hash.hash_cont_tokens = xxhash.xxh64(
+            compiled_hash.cont_tokens = xxhash.xxh64(
                 "".join(sorted(q.cont_tokens for q in self.hashes[task_name]))
             ).hexdigest()  # hash of all the hash - sorted for reproducibility
             self.compiled_hashes[task_name] = compiled_hash
@@ -469,7 +462,7 @@ class MetricsLogger:
     """Logs the actual scores for each metric of each task.
 
     Attributes:
-        metrics_value (dict[str, dict[str, list[float]]]): Maps each task to its dictionary of metrics to scores for all the example of the task.
+        metrics_values (dict[str, dict[str, list[float]]]): Maps each task to its dictionary of metrics to scores for all the example of the task.
             Example: {"winogrande|winogrande_xl": {"accuracy": [0.5, 0.5, 0.5, 0.5, 0.5, 0.5]}}
         metric_aggregated (dict[str, dict[str, float]]): Maps each task to its dictionary of metrics to aggregated scores over all the example of the task.
             Example: {"winogrande|winogrande_xl": {"accuracy": 0.5}}
@@ -486,14 +479,12 @@ def log(self, task_name: str, metrics: dict) -> None:
         for metric_name, metric_value in metrics.items():
             self.metrics_values[task_name][metric_name].append(metric_value)
 
-    def aggregate(self, task_dict: dict[str, LightevalTask], bootstrap_iters: int = 1000):  # noqa: C901
+    def aggregate(self, task_dict: dict[str, LightevalTask]):  # noqa: C901
         """
         Aggregate the metrics for each task and then for all tasks.
 
         Args:
            task_dict (dict[str, LightevalTask]): used to determine what aggregation function to use for each metric
-            bootstrap_iters (int, optional): Number of runs used to run the statistical bootstrap. Defaults to 1000.
-
         """
 
        for task_name, metrics in self.metrics_values.items():
@@ -572,8 +563,7 @@ class VersionsLogger:
     Tasks can have a version number/date, which indicates what is the precise metric definition and dataset version used for an evaluation.
 
     Attributes:
-        version (dict[str, int]): Maps the task names with the task versions.
-
+        versions (dict[str, int]): Maps the task names with the task versions.
     """
 
     # the versions dict will be a dict of task_name: task_version
@@ -589,8 +579,7 @@ class TaskConfigLogger:
     """Logs the different parameters of the current [`LightevalTask`] of interest.
 
    Attributes:
-        tasks_config (dict[str, LightevalTaskConfig]): Maps each task to its associated [`LightevalTaskConfig`]
-
+        tasks_configs (dict[str, LightevalTaskConfig]): Maps each task to its associated [`LightevalTaskConfig`]
    """

    tasks_configs: dict[str, LightevalTaskConfig] = field(default_factory=dict)