From 0c1615860f55c609ba665f77555e62f120ce14e2 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Thu, 26 Dec 2024 10:33:29 +0100
Subject: [PATCH 1/5] Remove unused param llm_as_prompt_judgement

---
 src/lighteval/logging/info_loggers.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/src/lighteval/logging/info_loggers.py b/src/lighteval/logging/info_loggers.py
index 46d3ab5c7..d812eedfb 100644
--- a/src/lighteval/logging/info_loggers.py
+++ b/src/lighteval/logging/info_loggers.py
@@ -25,7 +25,7 @@
 import os
 import time
 from dataclasses import asdict, dataclass, field
-from typing import Optional, Union
+from typing import Union
 
 import git
 import numpy as np
@@ -319,7 +319,6 @@ def log(
         doc: Doc,
         outputs: list[ModelResponse],
         metrics: dict,
-        llm_as_prompt_judgement: Optional[tuple[str, str]] = None,
     ) -> None:
         """Stores the relevant information for one sample of one task to the total list of samples stored in the DetailsLogger.
 
@@ -329,8 +328,6 @@ def log(
             doc (Doc): Current sample that we want to store.
             outputs (list[ModelResponse]): Model outputs for the current sample
             metrics (_type_): Model scores for said sample on the current task's metrics.
-            llm_as_prompt_judgement (tuple[str, str]): Tuple containing the
-                prompt passed to the judge and the judgement for the current sample when using llm-as-judge metric.
         """
         detail = self.Detail()
         detail.example = doc.query

From aa9b6f3403ab67bb374e8e0bd06b977f0936e1da Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Thu, 26 Dec 2024 10:36:00 +0100
Subject: [PATCH 2/5] Remove unused param bootstrap_iters

---
 src/lighteval/logging/info_loggers.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/lighteval/logging/info_loggers.py b/src/lighteval/logging/info_loggers.py
index d812eedfb..2e0aed436 100644
--- a/src/lighteval/logging/info_loggers.py
+++ b/src/lighteval/logging/info_loggers.py
@@ -483,14 +483,12 @@ def log(self, task_name: str, metrics: dict) -> None:
         for metric_name, metric_value in metrics.items():
             self.metrics_values[task_name][metric_name].append(metric_value)
 
-    def aggregate(self, task_dict: dict[str, LightevalTask], bootstrap_iters: int = 1000):  # noqa: C901
+    def aggregate(self, task_dict: dict[str, LightevalTask]):  # noqa: C901
         """
         Aggregate the metrics for each task and then for all tasks.
 
         Args:
             task_dict (dict[str, LightevalTask]): used to determine what aggregation function to use for each metric
-            bootstrap_iters (int, optional): Number of runs used to run the statistical bootstrap. Defaults to 1000.
-
         """
 
         for task_name, metrics in self.metrics_values.items():

From 387748364a39f831bce7042b672a204b28eafa03 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Thu, 26 Dec 2024 10:40:38 +0100
Subject: [PATCH 3/5] Rename CompiledHash params and align with docstring

---
 src/lighteval/logging/info_loggers.py | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/src/lighteval/logging/info_loggers.py b/src/lighteval/logging/info_loggers.py
index 2e0aed436..5edae2ebc 100644
--- a/src/lighteval/logging/info_loggers.py
+++ b/src/lighteval/logging/info_loggers.py
@@ -289,16 +289,16 @@ class CompiledHash:
         Hashes the aggregated hash values for all the sample ([`Doc`]) of one task ([`LightevalTask`])
 
         Attributes:
-            example (str): Aggregated hash of all the [`Doc.query`] hashes for all samples of the current task.
-            full_prompt (str): Aggregated hash of all the [`Doc.ctx`] hashes for all samples of the current task.
+            examples (str): Aggregated hash of all the [`Doc.query`] hashes for all samples of the current task.
+            full_prompts (str): Aggregated hash of all the [`Doc.ctx`] hashes for all samples of the current task.
             input_tokens (str): Aggregated hash of the aggregated [`Doc.input_tokens`] hashes over all samples of the current task.
             cont_tokens (str): Aggregated hash of the aggregated [`Doc.generated_tokens`] hashes over all samples of the current task.
         """
 
-        hash_examples: str = ""
-        hash_full_prompts: str = ""
-        hash_input_tokens: str = ""
-        hash_cont_tokens: str = ""
+        examples: str = ""
+        full_prompts: str = ""
+        input_tokens: str = ""
+        cont_tokens: str = ""
 
     hashes: dict[str, list[Hash]] = field(default_factory=lambda: collections.defaultdict(list))
     compiled_hashes: dict[str, CompiledHash] = field(
@@ -412,16 +412,16 @@ def aggregate(self):
 
         for task_name in self.hashes:
             compiled_hash = self.CompiledHash()
-            compiled_hash.hash_examples = xxhash.xxh64(
+            compiled_hash.examples = xxhash.xxh64(
                 "".join(sorted(q.example for q in self.hashes[task_name]))
             ).hexdigest()  # hash of all the hash - sorted for reproducibility
-            compiled_hash.hash_full_prompts = xxhash.xxh64(
+            compiled_hash.full_prompts = xxhash.xxh64(
                 "".join(sorted(q.full_prompt for q in self.hashes[task_name]))
             ).hexdigest()  # hash of all the hash - sorted for reproducibility
-            compiled_hash.hash_input_tokens = xxhash.xxh64(
+            compiled_hash.input_tokens = xxhash.xxh64(
                 "".join(sorted(q.input_tokens for q in self.hashes[task_name]))
             ).hexdigest()  # hash of all the hash - sorted for reproducibility
-            compiled_hash.hash_cont_tokens = xxhash.xxh64(
+            compiled_hash.cont_tokens = xxhash.xxh64(
                 "".join(sorted(q.cont_tokens for q in self.hashes[task_name]))
             ).hexdigest()  # hash of all the hash - sorted for reproducibility
             self.compiled_hashes[task_name] = compiled_hash

From b9d93f398f0baff2a2e06ca0aa4c7f297877ff0b Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Thu, 26 Dec 2024 10:43:04 +0100
Subject: [PATCH 4/5] Remove effective_few_shots attribute from docstring

---
 src/lighteval/logging/info_loggers.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/src/lighteval/logging/info_loggers.py b/src/lighteval/logging/info_loggers.py
index 5edae2ebc..ed6d5652c 100644
--- a/src/lighteval/logging/info_loggers.py
+++ b/src/lighteval/logging/info_loggers.py
@@ -251,11 +251,7 @@ class CompiledDetailOverAllTasks:
             non_truncated (int): Total number of samples which did not need prompt truncation to fit the model context size across all tasks
             padded (int): Number of samples which needed padding during the batching step across all tasks.
             non_padded (int): Number of samples which did not need padding during the batching step across all tasks.
-            effective_few_shots (float): Average effective few shots across all samples across all tasks.
-                effective few shot is the number of few shots actually used to fit the prompt in the model context
-                length while allowing model generation of the expected size.
             num_truncated_few_shots (int): Number of samples which required truncated prompts to fit the model size across all tasks.
-
         """
 
         hashes: dict = field(default_factory=dict)

From 2ab4368634ef63b1f6532019fd034a21889dc410 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Thu, 26 Dec 2024 10:46:07 +0100
Subject: [PATCH 5/5] Fix docstring attribute names

---
 src/lighteval/logging/info_loggers.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/src/lighteval/logging/info_loggers.py b/src/lighteval/logging/info_loggers.py
index ed6d5652c..1ba8615ac 100644
--- a/src/lighteval/logging/info_loggers.py
+++ b/src/lighteval/logging/info_loggers.py
@@ -462,7 +462,7 @@ class MetricsLogger:
     """Logs the actual scores for each metric of each task.
 
     Attributes:
-        metrics_value (dict[str, dict[str, list[float]]]): Maps each task to its dictionary of metrics to scores for all the example of the task.
+        metrics_values (dict[str, dict[str, list[float]]]): Maps each task to its dictionary of metrics to scores for all the examples of the task.
             Example: {"winogrande|winogrande_xl": {"accuracy": [0.5, 0.5, 0.5, 0.5, 0.5, 0.5]}}
         metric_aggregated (dict[str, dict[str, float]]): Maps each task to its dictionary of metrics to aggregated scores over all the example of the task.
             Example: {"winogrande|winogrande_xl": {"accuracy": 0.5}}
@@ -563,8 +563,7 @@ class VersionsLogger:
     Tasks can have a version number/date, which indicates what is the precise metric definition and dataset version used for an evaluation.
 
     Attributes:
-        version (dict[str, int]): Maps the task names with the task versions.
-
+        versions (dict[str, int]): Maps each task name to its task version.
     """
 
     # the versions dict will be a dict of task_name: task_version
@@ -580,8 +579,7 @@ class TaskConfigLogger:
     """Logs the different parameters of the current [`LightevalTask`] of interest.
 
     Attributes:
-        tasks_config (dict[str, LightevalTaskConfig]): Maps each task to its associated [`LightevalTaskConfig`]
-
+        tasks_configs (dict[str, LightevalTaskConfig]): Maps each task to its associated [`LightevalTaskConfig`]
     """
 
     tasks_configs: dict[str, LightevalTaskConfig] = field(default_factory=dict)