diff --git a/examples/model_configs/serverless_model_with_openai.yaml b/examples/model_configs/serverless_model_with_openai.yaml
new file mode 100644
index 000000000..0e6947dff
--- /dev/null
+++ b/examples/model_configs/serverless_model_with_openai.yaml
@@ -0,0 +1,5 @@
+model:
+  model_name: "deepseek-ai/DeepSeek-R1" #meta-llama/Llama-3.1-8B-Instruct" #Qwen/Qwen2.5-14B" #Qwen/Qwen2.5-7B"
+api:
+  base_url: "https://huggingface.co/api/inference-proxy/together"
+  api_key: "hf_"
diff --git a/src/lighteval/metrics/llm_as_judge.py b/src/lighteval/metrics/llm_as_judge.py
index 23beda76f..1d1c6a20c 100644
--- a/src/lighteval/metrics/llm_as_judge.py
+++ b/src/lighteval/metrics/llm_as_judge.py
@@ -26,9 +26,11 @@
 from concurrent.futures import ThreadPoolExecutor
 from typing import Callable, Literal
 
+from pydantic import BaseModel
 from tqdm import tqdm
 
 from lighteval.utils.imports import is_litellm_available, is_openai_available, is_vllm_available
+from lighteval.utils.utils import as_list
 
 
 logging.getLogger("openai").setLevel(logging.ERROR)
@@ -36,6 +38,9 @@
 logger = logging.getLogger(__name__)
 
 
+DEFAULT_FORMAT = {"type": "text"}
+
+
 class JudgeLM:
     """
     A class representing a judge for evaluating answers using either the OpenAI or Transformers library.
@@ -76,6 +81,7 @@ def __init__(
         judge_backend: Literal["litellm", "openai", "transformers", "tgi", "vllm"],
         url: str | None = None,
         api_key: str | None = None,
+        response_format: BaseModel = None,
     ):
         self.model = model
         self.template = templates
@@ -91,6 +97,8 @@ def __init__(
         self.api_key = api_key
         self.backend = judge_backend
 
+        self.response_format = response_format if response_format is not None else DEFAULT_FORMAT
+
     def __lazy_load_client(self):
         match self.backend:
             # Wether we use openai or TGI models, we go through the openai API
@@ -244,16 +252,34 @@ def __call_api_parallel(self, prompts):
 
     def __call_api(self, prompt):
         for _ in range(self.API_MAX_RETRY):
             try:
-                response = self.client.chat.completions.create(
+                # Base model
+                response = self.client.beta.chat.completions.parse(
                     model=self.model,
-                    messages=prompt,
-                    response_format={"type": "text"},
-                    max_tokens=512,
+                    messages=as_list(prompt),
+                    response_format=self.response_format,
+                    max_tokens=4096,
+                    temperature=0.0,
                     n=1,
                 )
-                text = response.choices[0].message.content
-                return text
+                answer = response.choices[0].message.parsed
+                return answer
+            except TypeError:
+                try:
+                    # Finetune
+                    response = self.client.chat.completions.create(
+                        model=self.model,
+                        messages=as_list(prompt),
+                        response_format=self.response_format,
+                        max_tokens=512,
+                        n=1,
+                    )
+                    text = response.choices[0].message.content
+                    return text
+                except Exception as e:
+                    logger.warning(f"{type(e), e}")
+                    time.sleep(self.API_RETRY_SLEEP)
             except Exception as e:
                 logger.warning(f"{type(e), e}")
                 time.sleep(self.API_RETRY_SLEEP)
+        raise Exception("Failed to get response from the API")
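Note on the judge change above: __call_api now tries the structured-output endpoint first and only falls back to a plain chat completion when the backend rejects a pydantic response_format. Below is a minimal, illustrative sketch of that first path, assuming the OpenAI Python SDK's beta structured-output helper; the Verdict model and the placeholder key are not part of lighteval.

    from openai import OpenAI
    from pydantic import BaseModel

    class Verdict(BaseModel):
        correct: bool
        reasoning: str

    client = OpenAI(api_key="<your-key>")  # placeholder key
    completion = client.beta.chat.completions.parse(
        model="gpt-4o-2024-08-06",
        messages=[{"role": "user", "content": "Is 2 + 2 equal to 4? Judge and explain briefly."}],
        response_format=Verdict,  # a pydantic class, as JudgeLM now passes via self.response_format
        max_tokens=256,
    )
    verdict = completion.choices[0].message.parsed  # a Verdict instance, mirroring message.parsed above

Backends that do not support structured outputs generally fail on the pydantic response_format, which is what the TypeError fallback to client.chat.completions.create is meant to catch.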
"gpt-4-turbo", "gpt-4"] + available_models_openai = ["gpt-3.5-turbo", "gpt-4o", "gpt-4-turbo", "gpt-4", "gpt-4o-2024-08-06"] def __init__( self, @@ -861,6 +862,7 @@ def __init__( process_judge_response: Callable, judge_backend: Literal["litellm", "openai", "transformers", "vllm", "tgi"], short_judge_name: str | None = None, + response_format: BaseModel = None, ) -> None: match judge_backend: case "openai": @@ -893,6 +895,7 @@ def __init__( api_key=api_key, url=url, judge_backend=judge_backend, + response_format=response_format, ) def compute(self, predictions: list[str], formatted_doc: Doc, **kwargs) -> dict[str, float]: diff --git a/src/lighteval/models/endpoints/openai_model.py b/src/lighteval/models/endpoints/openai_model.py index 37b8ca347..9dd497e43 100644 --- a/src/lighteval/models/endpoints/openai_model.py +++ b/src/lighteval/models/endpoints/openai_model.py @@ -28,6 +28,7 @@ from typing import Optional from tqdm import tqdm +from transformers import AutoTokenizer from lighteval.data import GenerativeTaskDataset, LoglikelihoodDataset from lighteval.models.abstract_model import LightevalModel @@ -64,6 +65,8 @@ class OpenAIModelConfig: model: str generation_parameters: GenerationParameters = None + base_url: str = "https://api.openai.com/v1" + api_key: str = os.environ.get("OPENAI_API_KEY", None) def __post_init__(self): if not self.generation_parameters: @@ -74,17 +77,19 @@ def from_path(cls, path: str) -> "OpenAIModelConfig": import yaml with open(path, "r") as f: - config = yaml.safe_load(f)["model"] + loaded_file = yaml.safe_load(f) + config = loaded_file["model"] + api = loaded_file.get("api", {}) generation_parameters = GenerationParameters.from_dict(config) - return cls(model=config["model_name"], generation_parameters=generation_parameters) + return cls(model=config["model_name"], generation_parameters=generation_parameters, **api) class OpenAIClient(LightevalModel): _DEFAULT_MAX_LENGTH: int = 4096 def __init__(self, config: OpenAIModelConfig, env_config) -> None: - api_key = os.environ["OPENAI_API_KEY"] - self.client = OpenAI(api_key=api_key) + self.client = OpenAI(api_key=config.api_key, base_url=config.base_url) + self.config = config self.generation_parameters = config.generation_parameters self.sampling_params = self.generation_parameters.to_vllm_openai_dict() @@ -99,22 +104,27 @@ def __init__(self, config: OpenAIModelConfig, env_config) -> None: self.API_RETRY_MULTIPLIER = 2 self.CONCURENT_CALLS = 100 self.model = config.model - self._tokenizer = tiktoken.encoding_for_model(self.model) + try: + self._tokenizer = tiktoken.encoding_for_model(self.model) + except KeyError: + self._tokenizer = AutoTokenizer.from_pretrained(self.model) self.pairwise_tokenization = False def __call_api(self, prompt, return_logits, max_new_tokens, num_samples, logit_bias): for _ in range(self.API_MAX_RETRY): try: + response_format = {"response_format": {"type": "text"}} if "openai" in self.config.base_url else {} response = self.client.chat.completions.create( model=self.model, messages=[{"role": "user", "content": prompt}], - response_format={"type": "text"}, max_tokens=max_new_tokens if max_new_tokens > 0 else None, logprobs=return_logits, logit_bias=logit_bias, n=num_samples, **self.sampling_params, + **response_format, ) + self.API_RETRY_SLEEP = 3 return response except Exception as e: logger.warning(f"{type(e), e}") diff --git a/src/lighteval/tasks/extended/__init__.py b/src/lighteval/tasks/extended/__init__.py index 97d99188a..f55854651 100644 --- 
diff --git a/src/lighteval/tasks/extended/__init__.py b/src/lighteval/tasks/extended/__init__.py
index 97d99188a..f55854651 100644
--- a/src/lighteval/tasks/extended/__init__.py
+++ b/src/lighteval/tasks/extended/__init__.py
@@ -24,12 +24,13 @@
 
 
 if can_load_extended_tasks():
+    import lighteval.tasks.extended.hle.main as hle
     import lighteval.tasks.extended.ifeval.main as ifeval
     import lighteval.tasks.extended.mix_eval.main as mix_eval
     import lighteval.tasks.extended.mt_bench.main as mt_bench
     import lighteval.tasks.extended.tiny_benchmarks.main as tiny_benchmarks
 
-    AVAILABLE_EXTENDED_TASKS_MODULES = [ifeval, tiny_benchmarks, mt_bench, mix_eval]
+    AVAILABLE_EXTENDED_TASKS_MODULES = [ifeval, tiny_benchmarks, mt_bench, mix_eval, hle]
 else:
     AVAILABLE_EXTENDED_TASKS_MODULES = []
 
diff --git a/src/lighteval/tasks/extended/hle/main.py b/src/lighteval/tasks/extended/hle/main.py
new file mode 100644
index 000000000..b0b06315c
--- /dev/null
+++ b/src/lighteval/tasks/extended/hle/main.py
@@ -0,0 +1,227 @@
+# MIT License
+
+# Copyright (c) 2024 The HuggingFace Team
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+
+import logging
+import math
+from typing import List, Literal
+
+import numpy as np
+from aenum import extend_enum
+from pydantic import BaseModel
+
+from lighteval.metrics.metrics import Metrics
+from lighteval.metrics.metrics_sample import JudgeLLM
+from lighteval.metrics.utils.metric_utils import (
+    CorpusLevelMetricGrouping,
+    MetricCategory,
+    MetricUseCase,
+)
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.requests import Doc
+
+
+logger = logging.getLogger(__name__)
+
+
+class ExtractedAnswer(BaseModel):
+    extracted_final_answer: str
+    reasoning: str
+    correct: Literal["yes", "no"]
+    confidence: int
+    strict: Literal[True]  # 100% reliability
+
+
+"""Adaptation from https://github.com/centerforaisafety/hle/blob/main/hle_eval/run_judge_results.py
+"""
+
+
+def get_judge_prompt(question: str, answer: str, gold: str, **kwargs):
+    return [
+        {
+            "role": "user",
+            "content": rf"""Judge whether the following [response] to [question] is correct or not based on the precise and unambiguous [correct_answer] below.
+
+[question]: {question}
+
+[response]: {answer}
+
+Your judgement must be in the format and criteria specified below:
+
+extracted_final_answer: The final exact answer extracted from the [response]. Put the extracted answer as 'None' if there is no exact, final answer to extract from the response.
+
+[correct_answer]: {gold}
+
+reasoning: Explain why the extracted_final_answer is correct or incorrect based on [correct_answer], focusing only on if there are meaningful differences between [correct_answer] and the extracted_final_answer. Do not comment on any background to the problem, do not attempt to solve the problem, do not argue for any answer different than [correct_answer], focus only on whether the answers match.
+
+correct: Answer 'yes' if extracted_final_answer matches the [correct_answer] given above, or is within a small margin of error for numerical problems. Answer 'no' otherwise, i.e. if there is any inconsistency, ambiguity, non-equivalency, or if the extracted answer is incorrect.
+
+
+confidence: The extracted confidence score between 0|\%| and 100|\%| from [response]. Put 100 if there is no confidence score available.""",
+        },
+    ]
+
+
+def process_judge_response_hle(response: ExtractedAnswer | List[ExtractedAnswer]):
+    # todo: add support for batched responses
+    if isinstance(response, list):
+        response = response[0]
+    return {
+        # "correct_answer": correct_answer,
+        "model_answer": response.extracted_final_answer,
+        "reasoning": response.reasoning,
+        "correct": response.correct,
+        "confidence": response.confidence,
+    }
+
+
+class JudgeLLMHLE(JudgeLLM):
+    def __init__(self):
+        super().__init__(
+            judge_model_name="gpt-4o-2024-08-06",
+            template=get_judge_prompt,
+            process_judge_response=process_judge_response_hle,
+            judge_backend="openai",
+            short_judge_name="hle_judge",
+            response_format=ExtractedAnswer,
+        )
+
+    def compute(self, predictions, formatted_doc: Doc, **kwargs):
+        # If we are evaluating a multiturn task, we need to have specific fields in the formatted doc
+        gold = formatted_doc.get_golds()[0]
+
+        score, _, _ = self.judge.evaluate_answer(question=formatted_doc.query, answer=predictions[0], gold=gold)
+
+        score["correct_answer"] = gold
+        return {
+            "accuracy": score,
+            "confidence_half_width": score,
+            "calibration_error": score,
+        }
+
+    def compute_corpus(self, scores: List[dict]):
+        n = len(scores)
+
+        correct = []
+        confidence = []
+        for score in scores:
+            correct.append("yes" in score["correct"])
+            confidence.append(score["confidence"])
+
+        correct = np.array(correct)
+        confidence = np.array(confidence)
+
+        # sometimes the model collapses on the same questions
+        if len(correct) != n:
+            print(f"Available predictions: {len(correct)} | Total questions: {n}")
+
+        accuracy = round(100 * sum(correct) / n, 2)
+        # Wald estimator, 95% confidence interval
+        confidence_half_width = round(1.96 * math.sqrt(accuracy * (100 - accuracy) / n), 2)
+        calibration_error = round(calib_err(confidence, correct, p="2", beta=100), 2)
+
+        return {
+            "accuracy": accuracy,
+            "confidence_half_width": confidence_half_width,
+            "calibration_error": calibration_error,
+        }
+
+
+# source: https://github.com/hendrycks/outlier-exposure/blob/master/utils/calibration_tools.py
+def calib_err(confidence, correct, p="2", beta=100):
+    # beta is target bin size
+    idxs = np.argsort(confidence)
+    confidence = confidence[idxs]
+    correct = correct[idxs]
+    bins = [[i * beta, (i + 1) * beta] for i in range(len(confidence) // beta)]
+    if len(bins) == 0:
+        logger.warning("Error when computing the bins for calibration error")
+        return -1
+
+    bins[-1] = [bins[-1][0], len(confidence)]
+
+    cerr = 0
+    total_examples = len(confidence)
+    for i in range(len(bins) - 1):
+        bin_confidence = confidence[bins[i][0] : bins[i][1]]
+        bin_correct = correct[bins[i][0] : bins[i][1]]
+        num_examples_in_bin = len(bin_confidence)
+
+        if num_examples_in_bin > 0:
+            difference = np.abs(np.nanmean(bin_confidence) - np.nanmean(bin_correct))
+
+            if p == "2":
+                cerr += num_examples_in_bin / total_examples * np.square(difference)
+            elif p == "1":
+                cerr += num_examples_in_bin / total_examples * difference
+            elif p == "infty" or p == "infinity" or p == "max":
+                cerr = np.maximum(cerr, difference)
+            else:
+                assert False, "p must be '1', '2', or 'infty'"
+
+    if p == "2":
+        cerr = np.sqrt(cerr)
+
+    return cerr
+
+
+def hle_text_only(line, task_name: str = None):
+    if line["image"] not in [None, ""]:
+        return
+
+    return Doc(
+        task_name=task_name,
+        query=f"Question: {line['question']}\nAnswer:",
+        choices=[line["answer"]],
+        gold_index=0,
+    )
+
+
+hle_metrics = CorpusLevelMetricGrouping(
+    metric_name=["accuracy", "confidence_half_width", "calibration_error"],
+    higher_is_better={n: True for n in ["accuracy", "confidence_half_width", "calibration_error"]},
+    category=MetricCategory.GENERATIVE,
+    use_case=MetricUseCase.ACCURACY,
+    sample_level_fn=JudgeLLMHLE().compute,
+    corpus_level_fn=JudgeLLMHLE().compute_corpus,
+)
+extend_enum(Metrics, "hle_metrics", hle_metrics)
+
+hle = LightevalTaskConfig(
+    name="hle",
+    suite=["lighteval"],
+    prompt_function=hle_text_only,
+    hf_repo="cais/hle",
+    hf_subset="default",
+    hf_avail_splits=["test"],
+    evaluation_splits=["test"],
+    few_shots_split=None,
+    few_shots_select=None,
+    generation_size=8192,
+    metric=[Metrics.exact_match, Metrics.hle_metrics],
+    stop_sequence=[],
+    trust_dataset=True,
+    version=0,
+)
+
+
+TASKS_TABLE = [hle]
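To make the new metrics concrete, here is a small illustrative sketch (hand-picked values, with the names from hle/main.py assumed to be in scope) of the sample-level and corpus-level pieces.

    import numpy as np

    # A judge verdict, as the structured output is parsed into ExtractedAnswer.
    judged = ExtractedAnswer(
        extracted_final_answer="42",
        reasoning="Matches the reference answer exactly.",
        correct="yes",
        confidence=95,
        strict=True,
    )
    score = process_judge_response_hle(judged)
    # -> {"model_answer": "42", "reasoning": "...", "correct": "yes", "confidence": 95}
    # compute() then attaches "correct_answer" and the per-sample dicts reach compute_corpus().

    # calib_err sorts samples by confidence, slices them into len(confidence) // beta bins,
    # and aggregates the per-bin gap between mean confidence and mean correctness
    # (root-mean-square for p="2"); as in the upstream script, the loop skips the last, merged bin.
    confidence = np.array([0.9, 0.8, 0.95, 0.6, 0.7, 0.85, 0.99, 0.5, 0.75, 0.88])
    correct = np.array([1, 1, 1, 0, 0, 1, 1, 0, 1, 1], dtype=float)
    print(calib_err(confidence, correct, p="2", beta=5))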