Fixes typing issues using mypy 1/N #1

Closed · wants to merge 11 commits
6 changes: 3 additions & 3 deletions src/lighteval/evaluator.py
@@ -3,7 +3,7 @@

import collections
import copy
from typing import Dict, Union
from typing import Dict, Optional, Union

from lighteval.logging.evaluation_tracker import EvaluationTracker
from lighteval.logging.hierarchical_logger import hlog
@@ -18,8 +18,8 @@ def evaluate( # noqa: C901
requests_dict: Dict[RequestType, list[Request]],
docs: Dict[TaskExampleId, Doc],
task_dict: Dict[str, LightevalTask],
override_bs: int = None,
evaluation_tracker: EvaluationTracker = None,
evaluation_tracker: EvaluationTracker,
override_bs: Optional[int] = None,
) -> EvaluationTracker:
"""Instantiate and evaluate a model on a list of tasks.

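The hunk above puts `evaluation_tracker` (which has no default) ahead of `override_bs` and replaces the implicit-Optional `int = None` with an explicit `Optional[int]`, which recent mypy versions flag by default. A minimal sketch of the pattern, using illustrative names rather than lighteval's real signature:

```python
from typing import Optional


class Tracker:
    """Stand-in for lighteval's EvaluationTracker in this sketch."""


# Before (rejected by mypy with implicit Optional disabled, and a required
# parameter may not follow one that has a default):
#   def evaluate(override_bs: int = None, tracker: Tracker = None): ...

# After: required argument first, optional argument annotated explicitly.
def evaluate(tracker: Tracker, override_bs: Optional[int] = None) -> Tracker:
    if override_bs is not None:  # narrow Optional[int] to int before using it
        print(f"overriding batch size to {override_bs}")
    return tracker


evaluate(Tracker())                 # override_bs defaults to None
evaluate(Tracker(), override_bs=8)  # explicit override
```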
35 changes: 19 additions & 16 deletions src/lighteval/logging/evaluation_tracker.py
@@ -3,8 +3,9 @@
import re
import time
from dataclasses import asdict, is_dataclass
from datetime import datetime
from datetime import date, datetime
from pathlib import Path
from typing import Optional

from datasets import Dataset, load_dataset
from datasets.utils.metadata import MetadataConfigs
@@ -249,7 +250,7 @@ def details_to_hub(

self.recreate_metadata_card(repo_id, model_name)

def recreate_metadata_card(self, repo_id: str, model_name: str = None) -> None: # noqa: C901
def recreate_metadata_card(self, repo_id: str, model_name: Optional[str] = None) -> None: # noqa: C901
"""Fully updates the details repository metadata card for the currently evaluated model

Args:
@@ -264,7 +265,7 @@ def recreate_metadata_card(self, repo_id: str, model_name: str = None) -> None:
multiple_results = len(results_files) > 1

# Get last eval results date for each task (evals might be non overlapping)
last_eval_date_results = {}
last_eval_date_results: dict[str, date] = {}
for sub_file in parquet_files:
# subfiles have this general format:
# `2023-09-03T10-57-04.203304/details_harness|hendrycksTest-us_foreign_policy|5_2023-09-03T10-57-04.203304.parquet`
@@ -278,27 +279,29 @@ def recreate_metadata_card(self, repo_id: str, model_name: str = None) -> None:
# iso_date[13] = iso_date[16] = ':'
iso_date = iso_date[:13] + ":" + iso_date[14:16] + ":" + iso_date[17:]

eval_date = datetime.fromisoformat(iso_date)
eval_date: date = datetime.fromisoformat(iso_date)

last_eval_date_results[task_name] = (
max(last_eval_date_results[task_name], eval_date) if task_name in last_eval_date_results else eval_date
)
max_last_eval_date_results = list(last_eval_date_results.values())[0]
last_eval_date_results_iso: dict[str, str] = {}
# Now we convert them in iso-format
for task in last_eval_date_results:
if max_last_eval_date_results < last_eval_date_results[task]:
max_last_eval_date_results = last_eval_date_results[task]
last_eval_date_results[task] = last_eval_date_results[task].isoformat()
max_last_eval_date_results = max_last_eval_date_results.isoformat()
last_eval_date_results_iso[task] = last_eval_date_results[task].isoformat()

max_last_eval_date_results_iso = max_last_eval_date_results.isoformat()

# Add the YAML for the configs
card_metadata = MetadataConfigs()

# Add the results config and add the result file as a parquet file
for sub_file in parquet_results_files:
eval_date = os.path.basename(sub_file).replace("results_", "").replace(".parquet", "")
sanitized_eval_date = re.sub(r"[^\w\.]", "_", eval_date)
sanitized_last_eval_date_results = re.sub(r"[^\w\.]", "_", max_last_eval_date_results)
eval_date_for_file = os.path.basename(sub_file).replace("results_", "").replace(".parquet", "")
sanitized_eval_date = re.sub(r"[^\w\.]", "_", eval_date_for_file)
sanitized_last_eval_date_results = re.sub(r"[^\w\.]", "_", max_last_eval_date_results_iso)

repo_file_name = os.path.basename(sub_file)

@@ -328,10 +331,10 @@ def recreate_metadata_card(self, repo_id: str, model_name: str = None) -> None:
for sub_file in parquet_files:
task_name = os.path.basename(sub_file).replace("details_", "").split("_2023")[0].split("_2024")[0]
sanitized_task = re.sub(r"\W", "_", task_name)
eval_date = os.path.dirname(sub_file)
sanitized_eval_date = re.sub(r"[^\w\.]", "_", eval_date)
eval_date_for_file = os.path.dirname(sub_file)
sanitized_eval_date = re.sub(r"[^\w\.]", "_", eval_date_for_file)
repo_file_name = os.path.join("**", os.path.basename(sub_file))
sanitized_last_eval_date_results = re.sub(r"[^\w\.]", "_", last_eval_date_results[task_name])
sanitized_last_eval_date_results = re.sub(r"[^\w\.]", "_", last_eval_date_results_iso[task_name])

if multiple_results:
if sanitized_task not in card_metadata:
@@ -417,7 +420,7 @@ def recreate_metadata_card(self, repo_id: str, model_name: str = None) -> None:

# Cleanup a little the dataset card
# Get the top results
last_results_file = [f for f in results_files if max_last_eval_date_results.replace(":", "-") in f][0]
last_results_file = [f for f in results_files if max_last_eval_date_results_iso.replace(":", "-") in f][0]
last_results_file_path = hf_hub_url(repo_id=repo_id, filename=last_results_file, repo_type="dataset")
f = load_dataset("json", data_files=last_results_file_path, split="train")
results_dict = f["results"][0]
@@ -450,7 +453,7 @@ def recreate_metadata_card(self, repo_id: str, model_name: str = None) -> None:
f"To load the details from a run, you can for instance do the following:\n"
f'```python\nfrom datasets import load_dataset\ndata = load_dataset("{repo_id}",\n\t"{sanitized_task}",\n\tsplit="train")\n```\n\n'
f"## Latest results\n\n"
f'These are the [latest results from run {max_last_eval_date_results}]({last_results_file_path.replace("/resolve/", "/blob/")})'
f'These are the [latest results from run {max_last_eval_date_results_iso}]({last_results_file_path.replace("/resolve/", "/blob/")})'
f"(note that their might be results for other tasks in the repos if successive evals didn't cover the same tasks. "
f'You find each in the results and the "latest" split for each eval):\n\n'
f"```python\n{results_string}\n```",
@@ -468,7 +471,7 @@ def recreate_metadata_card(self, repo_id: str, model_name: str = None) -> None:
card.push_to_hub(repo_id, repo_type="dataset")

def push_results_to_tensorboard( # noqa: C901
self, results: dict[str, dict[str, float]], details: dict[str, DetailsLogger.CompiledDetail]
self, results: dict[str, dict[str, float]], details: dict[str, list[DetailsLogger.Detail]]
):
if not is_nanotron_available():
hlog_warn("You cannot push results to tensorboard with having nanotron installed. Skipping")
@@ -492,7 +495,7 @@ def push_results_to_tensorboard( # noqa: C901
path_in_repo="tb",
commit_every=6000, # Very long time so that we can change our files names and trigger push ourselves (see below)
)
bench_averages = {}
bench_averages: dict[str, dict[str, list[float]]] = {}
for name, values in results.items():
splited_name = name.split("|")
if len(splited_name) == 3:
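The date-handling hunks above stop reusing one dict for both `date` values and their ISO strings: the `date` objects stay in `last_eval_date_results`, the strings go into a new `last_eval_date_results_iso`, and the maximum is converted once at the end. A condensed sketch of that pattern, with invented task names and timestamps:

```python
from datetime import date, datetime

# Invented input: one ISO timestamp per (task, run) pair.
runs = [
    ("harness|hendrycksTest|5", "2023-09-03T10:57:04.203304"),
    ("harness|hendrycksTest|5", "2023-10-01T08:00:00.000000"),
    ("harness|arc:challenge|25", "2023-09-15T12:30:00.000000"),
]

# Keep the date values in one consistently typed dict...
last_eval_date_results: dict[str, date] = {}
for task_name, iso in runs:
    eval_date: date = datetime.fromisoformat(iso)
    last_eval_date_results[task_name] = (
        max(last_eval_date_results[task_name], eval_date)
        if task_name in last_eval_date_results
        else eval_date
    )

# ...and derive the string forms separately, so no dict mixes str and date.
last_eval_date_results_iso: dict[str, str] = {
    task: d.isoformat() for task, d in last_eval_date_results.items()
}
max_last_eval_date_results_iso = max(last_eval_date_results.values()).isoformat()
print(last_eval_date_results_iso, max_last_eval_date_results_iso)
```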
43 changes: 22 additions & 21 deletions src/lighteval/logging/info_loggers.py
@@ -1,7 +1,8 @@
import collections
import os
import time
from collections import defaultdict
from dataclasses import asdict, dataclass, field
from typing import Optional

import git
import numpy as np
@@ -48,20 +49,20 @@ class GeneralConfigLogger:
"""

# general
lighteval_sha: str = None
num_fewshot_seeds: int = None
override_batch_size: int = None
max_samples: int = None
job_id: int = None
start_time: float = None
end_time: float = None
total_evaluation_time_secondes: str = None
lighteval_sha: str = ""
num_fewshot_seeds: int = 0
override_batch_size: Optional[int] = None
max_samples: Optional[int] = None
job_id: Optional[int] = None
start_time: float = 0
end_time: float = 0
total_evaluation_time_secondes: str = ""

# model info
model_name: str = None
model_sha: str = None
model_dtype: str = None
model_size: str = None
model_name: str = ""
model_sha: str = ""
model_dtype: str = ""
model_size: str = ""

# Nanotron/Brrr config
config: "BrrrConfig" = None
@@ -132,8 +133,8 @@ class Detail:
"""

example: str = ""
instruction: str = ""
full_prompt: str = ""
instruction: Optional[str] = None
full_prompt: Optional[str] = None
num_effective_few_shots: int = 0
num_asked_few_shots: int = 0
predictions: list = field(default_factory=list)
@@ -233,12 +234,12 @@ class CompiledHash:
hash_input_tokens: str = ""
hash_cont_tokens: str = ""

hashes: dict[str, list[Hash]] = collections.defaultdict(list)
compiled_hashes: dict[str, CompiledHash] = collections.defaultdict(CompiledHash)
hashes: dict[str, list[Hash]] = defaultdict(list)
compiled_hashes: dict[str, CompiledHash] = defaultdict(CompiledHash)

# dict of details for each task, i.e. winogrande: [example1_details, example2_details, ...]
details: dict[str, list[Detail]] = collections.defaultdict(list)
compiled_details: dict[str, CompiledDetail] = collections.defaultdict(CompiledDetail)
details: dict[str, list[Detail]] = defaultdict(list)
compiled_details: dict[str, CompiledDetail] = defaultdict(CompiledDetail)
compiled_details_over_all_tasks: CompiledDetailOverAllTasks = CompiledDetailOverAllTasks()

def log(self, task_name: str, task: LightevalTask, doc: Doc, outputs: list[ModelReturn], metrics: dict) -> None:
@@ -375,8 +376,8 @@ class MetricsLogger:
Example: {"winogrande|winogrande_xl": {"accuracy": 0.5}}
"""

metrics_values: dict[str, dict[str, list[float]]] = collections.defaultdict(lambda: collections.defaultdict(list))
metric_aggregated: dict[str, dict[str, float]] = collections.defaultdict(lambda: collections.defaultdict(dict))
metrics_values: defaultdict[str, defaultdict[str, list[float]]] = defaultdict(lambda: defaultdict(list))
metric_aggregated: defaultdict[str, defaultdict[str, dict]] = defaultdict(lambda: defaultdict(dict))

def log(self, task_name: str, metrics: dict) -> None:
for metric_name, metric_value in metrics.items():
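These `info_loggers.py` hunks follow two recurring moves: dataclass fields that are always populated get typed sentinel defaults (`""`, `0`), genuinely optional ones keep `None` but say so with `Optional[...]`, and the defaultdict attributes gain annotations so mypy can check what is appended to them. A small sketch with stand-in classes (`GeneralConfig` and `DetailsLog` are illustrative names, not the lighteval ones):

```python
from collections import defaultdict
from dataclasses import dataclass, field
from typing import Optional


@dataclass
class GeneralConfig:
    # Always filled in later: a typed sentinel avoids Optional plumbing.
    model_name: str = ""
    num_fewshot_seeds: int = 0
    # Genuinely optional: keep None, but state it in the annotation.
    override_batch_size: Optional[int] = None
    job_id: Optional[int] = None


@dataclass
class DetailsLog:
    # Annotated defaultdicts let mypy check the value types being appended.
    details: dict[str, list[str]] = field(default_factory=lambda: defaultdict(list))
    metrics: defaultdict[str, list[float]] = field(default_factory=lambda: defaultdict(list))


log = DetailsLog()
log.details["winogrande"].append("example 1 details")
log.metrics["winogrande"].append(0.5)
print(GeneralConfig(), dict(log.metrics))
```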
24 changes: 13 additions & 11 deletions src/lighteval/metrics/metrics_sample.py
@@ -1,6 +1,8 @@
"""This module manages all the metrics occurring at the sample level. The results of said metrics are then aggregated
using simple function (min, mean, max, ...) at the corpus level. Most metrics fall under this category.
"""
from typing import Optional

import nltk
import numpy as np
from nltk.metrics.distance import edit_distance
@@ -22,9 +24,9 @@
class ExactMatches:
def __init__(
self,
aggregation_function: callable = None,
normalize_gold: callable = None,
normalize_pred: callable = None,
aggregation_function: Optional[callable] = None,
normalize_gold: Optional[callable] = None,
normalize_pred: Optional[callable] = None,
strip_strings: bool = False,
type_exact_match: str = "full",
):
@@ -111,9 +113,9 @@ def compute_one_item(
class F1_score:
def __init__(
self,
aggregation_function: callable = None,
normalize_gold: callable = None,
normalize_pred: callable = None,
aggregation_function: Optional[callable] = None,
normalize_gold: Optional[callable] = None,
normalize_pred: Optional[callable] = None,
strip_strings: bool = False,
):
"""An F1 score class. F1 is computed over the bag of words of the golds and predictions.
@@ -296,9 +298,9 @@ def __init__(
methods: str | list[str],
multiple_golds: bool = False,
bootstrap: bool = False,
normalize_gold: callable = None,
normalize_pred: callable = None,
aggregation_function: callable = None,
normalize_gold: Optional[callable] = None,
normalize_pred: Optional[callable] = None,
aggregation_function: Optional[callable] = None,
):
"""A ROUGE wrapper method. Relies on `rouge_scorer`.

@@ -388,8 +390,8 @@ def _rouge_score_with_bootsrap(self, golds: list[str], preds: list[str]):
class BertScore:
def __init__(
self,
normalize_gold: callable = None,
normalize_pred: callable = None,
normalize_gold: Optional[callable] = None,
normalize_pred: Optional[callable] = None,
):
"""A BERT scorer class. Relies on some called extracted from `bert-score`. By default, will use the
`microsoft/deberta-large-mnli` as scorer
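The metric constructors above wrap their `callable` defaults in `Optional[...]` so that `None` is a valid default. A stricter variant, which is an assumption on my part and not what this PR does, would also swap the builtin `callable` for `typing.Callable` with an explicit signature so mypy can check the call sites; sketched here on a hypothetical `exact_match` helper:

```python
from typing import Callable, Optional


def exact_match(
    gold: str,
    pred: str,
    normalize: Optional[Callable[[str], str]] = None,  # explicit callable signature
    strip_strings: bool = False,
) -> int:
    if strip_strings:
        gold, pred = gold.strip(), pred.strip()
    if normalize is not None:  # narrow the Optional before calling it
        gold, pred = normalize(gold), normalize(pred)
    return int(gold == pred)


print(exact_match("Paris", "  paris ", normalize=str.lower, strip_strings=True))  # 1
```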
2 changes: 1 addition & 1 deletion src/lighteval/models/brrr_models.py
@@ -656,7 +656,7 @@ def prepare_batch(
input_ids=input_ids, input_mask=input_mask, input_lengths=input_lengths, truncated=truncated, padded=padded
)

def gather(self, output_tensor: torch.Tensor, process_group: dist.ProcessGroup = None) -> torch.Tensor:
def gather(self, output_tensor: torch.Tensor, process_group: Optional[dist.ProcessGroup] = None) -> torch.Tensor:
"""Gather together tensors of (possibly) various size spread on separate GPUs (first exchange the lengths and then pad and gather)"""
if process_group is None:
process_group = self.parallel_context.dp_pg
41 changes: 32 additions & 9 deletions src/lighteval/models/inference_client.py
@@ -1,12 +1,20 @@
import asyncio
import math
from typing import Coroutine, List, Tuple, Union
from typing import Coroutine, Tuple, Union

import numpy as np
import requests
from tqdm import tqdm
from transformers import AutoTokenizer

from lighteval.models.model_output import GenerateReturn, LoglikelihoodReturn, LoglikelihoodSingleTokenReturn
from lighteval.tasks.requests import (
GreedyUntilRequest,
GreedyUntilWithLogitsRequest,
LoglikelihoodRequest,
LoglikelihoodRollingRequest,
LoglikelihoodSingleTokenRequest,
)
from lighteval.utils import NO_TGI_ERROR_MSG, as_list, is_tgi_available


@@ -40,7 +48,7 @@ def __init__(
self.model_info = requests.get(f"{address}/info").json()
self.tokenizer = AutoTokenizer.from_pretrained(self.model_info["model_id"])

def __process_request_generate(self, request: Tuple[str, Union[Tuple, List]]) -> Coroutine[None, List, str]:
def __process_request_generate(self, request: Tuple[str, Union[Tuple, list]]) -> Coroutine[None, list, str]:
context, stopping_arugments = request

if isinstance(stopping_arugments, tuple):
@@ -67,11 +75,11 @@ def __process_request_generate(self, request: Tuple[str, Union[Tuple, List]]) ->

return generated_text

async def __process_batch_generate(self, requests: List[Tuple[str, Union[Tuple, List]]]):
async def __process_batch_generate(self, requests: list[Tuple[str, Union[Tuple, list]]]):
return await asyncio.gather(*[self.__process_request_generate(request) for request in requests])

def greedy_until(self, requests: List[Tuple[str, Union[Tuple, List]]], override_bs=None) -> List[str]:
generated_texts: List[str] = []
def greedy_until(self, requests: list[GreedyUntilRequest], override_bs=None) -> list[GenerateReturn]:
generated_texts: list[str] = []

batch_size = override_bs if override_bs > 0 else BATCH_SIZE

@@ -83,16 +91,16 @@ def __process_batch_generate(self, requests: List[Tuple[str, Union[Tuple, List]]]):

return generated_texts

def __process_request_logprob(self, request: Tuple[str, str]) -> Coroutine[None, List, str]:
def __process_request_logprob(self, request: Tuple[str, str]) -> Coroutine[None, list, str]:
context, choice = request
out = self.client.generate(context + choice, max_new_tokens=1, decoder_input_details=True)
return out

async def __process_batch_logprob(self, requests: List[Tuple[str, str]]):
async def __process_batch_logprob(self, requests: list[Tuple[str, str]]):
return await asyncio.gather(*[self.__process_request_logprob(request) for request in requests])

def loglikelihood(self, requests: List[Tuple[str, str]], override_bs=None) -> List[Tuple[float, bool]]:
res: List[Tuple[float, bool]] = []
def loglikelihood(self, requests: list[LoglikelihoodRequest], override_bs=None) -> list[LoglikelihoodReturn]:
res: list[Tuple[float, bool]] = []

batch_size = override_bs if override_bs > 0 else BATCH_SIZE

@@ -117,5 +125,20 @@ def loglikelihood(self, requests: List[Tuple[str, str]], override_bs=None) -> Li

return res

def greedy_until_with_logits(
self, requests: list[GreedyUntilWithLogitsRequest], override_bs=None
) -> list[GenerateReturn]:
raise NotImplementedError("Greedy until with logits is not implemented for TGI")

def loglikelihood_rolling(
self, requests: list[LoglikelihoodRollingRequest], override_bs=None
) -> list[LoglikelihoodReturn]:
raise NotImplementedError("Loglikelihood rolling is not implemented for TGI")

def loglikelihood_single_token(
self, requests: list[LoglikelihoodSingleTokenRequest], override_bs=None
) -> list[LoglikelihoodSingleTokenReturn]:
raise NotImplementedError("Loglikelihood single token is not implemented for TGI")

def set_cache_hook(self, cache_hook):
self.cache_hook = cache_hook
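The added `greedy_until_with_logits`, `loglikelihood_rolling`, and `loglikelihood_single_token` stubs give the TGI client the full model surface and make the unsupported paths fail loudly instead of being missing attributes. A minimal sketch of that shape, built on a made-up `SupportsLoglikelihood` base rather than lighteval's actual interface:

```python
from abc import ABC, abstractmethod
from typing import Optional


class SupportsLoglikelihood(ABC):
    """Illustrative stand-in for the model interface the client must match."""

    @abstractmethod
    def loglikelihood(self, requests: list, override_bs: Optional[int] = None) -> list:
        ...

    @abstractmethod
    def loglikelihood_rolling(self, requests: list, override_bs: Optional[int] = None) -> list:
        ...


class TGIClient(SupportsLoglikelihood):
    """Implements what the backend supports; raises clearly on the rest."""

    def loglikelihood(self, requests: list, override_bs: Optional[int] = None) -> list:
        return []  # the real client batches requests against the TGI server

    def loglikelihood_rolling(self, requests: list, override_bs: Optional[int] = None) -> list:
        # Keeping the method (instead of omitting it) satisfies the interface
        # and gives callers an explicit, typed error path.
        raise NotImplementedError("Loglikelihood rolling is not implemented for TGI")


client = TGIClient()
print(client.loglikelihood([]))
```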
8 changes: 4 additions & 4 deletions src/lighteval/models/model_loader.py
@@ -1,5 +1,5 @@
from dataclasses import dataclass
from typing import Optional, Tuple, Union
from typing import Tuple, Union

from lighteval.logging.hierarchical_logger import hlog
from lighteval.models.adapter_model import AdapterModel
@@ -23,9 +23,9 @@
@dataclass
class ModelInfo:
model_name: str
model_sha: Optional[str] = None
model_dtype: Optional[str] = None
model_size: Optional[str] = None
model_sha: str = ""
model_dtype: str = ""
model_size: str = ""


def load_model( # noqa: C901
Expand Down