diff --git a/docs/source/package_reference/models.mdx b/docs/source/package_reference/models.mdx index dcf5bc8dc..9feed4652 100644 --- a/docs/source/package_reference/models.mdx +++ b/docs/source/package_reference/models.mdx @@ -6,9 +6,9 @@ ## Accelerate and Transformers Models -### BaseModel -[[autodoc]] models.transformers.base_model.BaseModelConfig -[[autodoc]] models.transformers.base_model.BaseModel +### TransformersModel +[[autodoc]] models.transformers.transformers_model.TransformersModelConfig +[[autodoc]] models.transformers.transformers_model.TransformersModel ### AdapterModel [[autodoc]] models.transformers.adapter_model.AdapterModelConfig diff --git a/examples/model_configs/base_model.yaml b/examples/model_configs/transformers_model.yaml similarity index 76% rename from examples/model_configs/base_model.yaml rename to examples/model_configs/transformers_model.yaml index d6563e616..44e095dd3 100644 --- a/examples/model_configs/base_model.yaml +++ b/examples/model_configs/transformers_model.yaml @@ -1,6 +1,6 @@ model: base_params: - model_args: "pretrained=HuggingFaceH4/zephyr-7b-beta,revision=main" # pretrained=model_name,trust_remote_code=boolean,revision=revision_to_use,model_parallel=True ... + model_args: "pretrained=HuggingFaceTB/SmolLM-1.7B,revision=main" # pretrained=model_name,trust_remote_code=boolean,revision=revision_to_use,model_parallel=True ... dtype: "bfloat16" compile: true merged_weights: # Ignore this section if you are not using PEFT models @@ -9,3 +9,4 @@ model: base_model: null # path to the base_model generation: multichoice_continuations_start_space: null # If true/false, will force multiple choice continuations to start/not start with a space. 
If none, will do nothing + temperature: 0.5 diff --git a/src/lighteval/__main__.py b/src/lighteval/__main__.py index 4484f7812..e4053813e 100644 --- a/src/lighteval/__main__.py +++ b/src/lighteval/__main__.py @@ -20,7 +20,7 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. import logging -from logging.config import dictConfig +import logging.config import colorlog import typer @@ -57,7 +57,8 @@ }, ) -dictConfig(logging_config) +logging.config.dictConfig(logging_config) +logging.captureWarnings(capture=True) app.command(rich_help_panel="Evaluation Backends")(lighteval.main_accelerate.accelerate) app.command(rich_help_panel="Evaluation Utils")(lighteval.main_baseline.baseline) diff --git a/src/lighteval/main_accelerate.py b/src/lighteval/main_accelerate.py index 2dd78f445..fe7f98d6f 100644 --- a/src/lighteval/main_accelerate.py +++ b/src/lighteval/main_accelerate.py @@ -44,7 +44,7 @@ def accelerate( # noqa C901 model_args: Annotated[ str, Argument( - help="Model arguments in the form key1=value1,key2=value2,... or path to yaml config file (see examples/model_configs/base_model.yaml)" + help="Model arguments in the form key1=value1,key2=value2,... 
or path to yaml config file (see examples/model_configs/transformers_model.yaml)" ), ], tasks: Annotated[str, Argument(help="Comma-separated list of tasks to evaluate on.")], @@ -107,9 +107,10 @@ def accelerate( # noqa C901 from accelerate import Accelerator, InitProcessGroupKwargs from lighteval.logging.evaluation_tracker import EvaluationTracker + from lighteval.models.model_input import GenerationParameters from lighteval.models.transformers.adapter_model import AdapterModelConfig - from lighteval.models.transformers.base_model import BaseModelConfig, BitsAndBytesConfig from lighteval.models.transformers.delta_model import DeltaModelConfig + from lighteval.models.transformers.transformers_model import BitsAndBytesConfig, TransformersModelConfig from lighteval.pipeline import EnvConfig, ParallelismManager, Pipeline, PipelineParameters accelerator = Accelerator(kwargs_handlers=[InitProcessGroupKwargs(timeout=timedelta(seconds=3000))]) @@ -154,6 +155,8 @@ def accelerate( # noqa C901 # We extract the model args args_dict = {k.split("=")[0]: k.split("=")[1] for k in config["base_params"]["model_args"].split(",")} + args_dict["generation_parameters"] = GenerationParameters.from_dict(config) + # We store the relevant other args args_dict["base_model"] = config["merged_weights"]["base_model"] args_dict["compile"] = bool(config["base_params"]["compile"]) @@ -180,13 +183,13 @@ def accelerate( # noqa C901 elif config["merged_weights"]["base_model"] not in ["", None]: raise ValueError("You can't specify a base model if you are not using delta/adapter weights") else: - model_config = BaseModelConfig(**args_dict) + model_config = TransformersModelConfig(**args_dict) else: model_args_dict: dict = {k.split("=")[0]: k.split("=")[1] if "=" in k else True for k in model_args.split(",")} model_args_dict["accelerator"] = accelerator model_args_dict["use_chat_template"] = use_chat_template model_args_dict["compile"] = bool(model_args_dict["compile"]) if "compile" in model_args_dict 
else False - model_config = BaseModelConfig(**model_args_dict) + model_config = TransformersModelConfig(**model_args_dict) pipeline = Pipeline( tasks=tasks, diff --git a/src/lighteval/main_endpoint.py b/src/lighteval/main_endpoint.py index f992d65c9..2c51fe15f 100644 --- a/src/lighteval/main_endpoint.py +++ b/src/lighteval/main_endpoint.py @@ -42,8 +42,11 @@ @app.command(rich_help_panel="Evaluation Backends") def openai( # === general === - model_name: Annotated[ - str, Argument(help="The model name to evaluate (has to be available through the openai API.") + model_args: Annotated[ + str, + Argument( + help="Model name as a string (has to be available through the openai API) or path to yaml config file (see examples/model_configs/transformers_model.yaml)" + ), ], tasks: Annotated[str, Argument(help="Comma-separated list of tasks to evaluate on.")], # === Common parameters === @@ -96,6 +99,11 @@ def openai( from lighteval.models.endpoints.openai_model import OpenAIModelConfig from lighteval.pipeline import EnvConfig, ParallelismManager, Pipeline, PipelineParameters + if model_args.endswith(".yaml"): + model_config = OpenAIModelConfig.from_path(model_args) + else: + model_config = OpenAIModelConfig(model=model_args) + env_config = EnvConfig(token=TOKEN, cache_dir=cache_dir) evaluation_tracker = EvaluationTracker( output_dir=output_dir, @@ -107,7 +115,6 @@ def openai( ) parallelism_manager = ParallelismManager.OPENAI - model_config = OpenAIModelConfig(model=model_name) pipeline_params = PipelineParameters( launcher_type=parallelism_manager, @@ -205,7 +212,6 @@ def inference_endpoint( """ Evaluate models using inference-endpoints as backend. 
""" - from lighteval.logging.evaluation_tracker import EvaluationTracker from lighteval.models.endpoints.endpoint_model import InferenceEndpointModelConfig, ServerlessEndpointModelConfig from lighteval.pipeline import EnvConfig, ParallelismManager, Pipeline, PipelineParameters @@ -319,7 +325,6 @@ def tgi( """ Evaluate models using TGI as backend. """ - from lighteval.logging.evaluation_tracker import EvaluationTracker from lighteval.models.endpoints.tgi_model import TGIModelConfig from lighteval.pipeline import EnvConfig, ParallelismManager, Pipeline, PipelineParameters diff --git a/src/lighteval/main_vllm.py b/src/lighteval/main_vllm.py index 28c4abdc0..89311b5ae 100644 --- a/src/lighteval/main_vllm.py +++ b/src/lighteval/main_vllm.py @@ -37,7 +37,12 @@ def vllm( # === general === - model_args: Annotated[str, Argument(help="Model arguments in the form key1=value1,key2=value2,...")], + model_args: Annotated[ + str, + Argument( + help="Model arguments in the form key1=value1,key2=value2,... or path to yaml config file (see examples/model_configs/transformers_model.yaml)" + ), + ], tasks: Annotated[str, Argument(help="Comma-separated list of tasks to evaluate on.")], # === Common parameters === use_chat_template: Annotated[ @@ -88,7 +93,10 @@ def vllm( """ Evaluate models using vllm as backend. 
""" + import yaml + from lighteval.logging.evaluation_tracker import EvaluationTracker + from lighteval.models.model_input import GenerationParameters from lighteval.models.vllm.vllm_model import VLLMModelConfig from lighteval.pipeline import EnvConfig, ParallelismManager, Pipeline, PipelineParameters @@ -118,8 +126,15 @@ def vllm( system_prompt=system_prompt, ) - model_args_dict: dict = {k.split("=")[0]: k.split("=")[1] if "=" in k else True for k in model_args.split(",")} - model_config = VLLMModelConfig(**model_args_dict) + if model_args.endswith(".yaml"): + with open(model_args, "r") as f: + config = yaml.safe_load(f)["model"] + generation_parameters = GenerationParameters.from_dict(config) + model_config = VLLMModelConfig(config, generation_parameters=generation_parameters) + + else: + model_args_dict: dict = {k.split("=")[0]: k.split("=")[1] if "=" in k else True for k in model_args.split(",")} + model_config = VLLMModelConfig(**model_args_dict) pipeline = Pipeline( tasks=tasks, diff --git a/src/lighteval/models/endpoints/endpoint_model.py b/src/lighteval/models/endpoints/endpoint_model.py index 5bc5da3b7..45cea7f84 100644 --- a/src/lighteval/models/endpoints/endpoint_model.py +++ b/src/lighteval/models/endpoints/endpoint_model.py @@ -24,7 +24,7 @@ import logging import re import time -from dataclasses import dataclass +from dataclasses import dataclass, replace from typing import Coroutine, Dict, List, Optional, Union import requests @@ -35,6 +35,7 @@ InferenceEndpoint, InferenceEndpointError, InferenceEndpointTimeoutError, + TextGenerationInputGenerateParameters, TextGenerationInputGrammarType, TextGenerationOutput, create_inference_endpoint, @@ -48,6 +49,7 @@ from lighteval.data import GenerativeTaskDataset, LoglikelihoodDataset from lighteval.models.abstract_model import LightevalModel, ModelInfo +from lighteval.models.model_input import GenerationParameters from lighteval.models.model_output import GenerativeResponse, LoglikelihoodResponse, 
LoglikelihoodSingleTokenResponse from lighteval.tasks.requests import ( GreedyUntilRequest, @@ -78,6 +80,11 @@ class ServerlessEndpointModelConfig: model_name: str add_special_tokens: bool = True + generation_parameters: GenerationParameters = None + + def __post_init__(self): + if not self.generation_parameters: + self.generation_parameters = GenerationParameters() @classmethod def from_path(cls, path: str) -> "ServerlessEndpointModelConfig": @@ -106,6 +113,7 @@ class InferenceEndpointModelConfig: namespace: str = None # The namespace under which to launch the endpoint. Defaults to the current user's namespace image_url: str = None env_vars: dict = None + generation_parameters: GenerationParameters = None def __post_init__(self): # xor operator, one is None but not the other @@ -117,6 +125,9 @@ def __post_init__(self): if not (self.endpoint_name is None) ^ int(self.model_name is None): raise ValueError("You need to set either endpoint_name or model_name (but not both).") + if not self.generation_parameters: + self.generation_parameters = GenerationParameters() + @classmethod def from_path(cls, path: str) -> "InferenceEndpointModelConfig": """Load configuration for inference endpoint model from YAML file path. 
@@ -305,6 +316,8 @@ def __init__( # noqa: C901 model_dtype=getattr(config, "model_dtype", "default"), model_size=-1, ) + self.generation_parameters = config.generation_parameters + self.generation_config = TextGenerationInputGenerateParameters(**self.generation_parameters.to_tgi_ie_dict()) @staticmethod def get_larger_hardware_suggestion(cur_instance_type: str = None, cur_instance_size: str = None): @@ -388,16 +401,17 @@ def _async_process_request( ) -> Coroutine[None, list[TextGenerationOutput], str]: # Todo: add an option to launch with conversational instead for chat prompts # https://huggingface.co/docs/huggingface_hub/v0.20.3/en/package_reference/inference_client#huggingface_hub.AsyncInferenceClient.conversational - generated_text = self.async_client.text_generation( - prompt=context, + generation_config: TextGenerationInputGenerateParameters = replace( + self.generation_config, + stop=stop_tokens, + max_new_tokens=max_tokens, details=True, decoder_input_details=True, grammar=grammar, - max_new_tokens=max_tokens, - stop_sequences=stop_tokens, - # truncate=, ) + generated_text = self.async_client.text_generation(prompt=context, generation_config=generation_config) + return generated_text def _process_request( @@ -409,14 +423,18 @@ def _process_request( ) -> TextGenerationOutput: # Todo: add an option to launch with conversational instead for chat prompts # https://huggingface.co/docs/huggingface_hub/v0.20.3/en/package_reference/inference_client#huggingface_hub.AsyncInferenceClient.conversational - generated_text = self.client.text_generation( - prompt=context, + generation_config: TextGenerationInputGenerateParameters = replace( + self.generation_config, + stop=stop_tokens, + max_new_tokens=max_tokens, details=True, decoder_input_details=True, grammar=grammar, - max_new_tokens=max_tokens, - stop_sequences=stop_tokens, - # truncate=, + ) + + generated_text = self.client.text_generation( + prompt=context, + generation_config=generation_config, ) return 
generated_text diff --git a/src/lighteval/models/endpoints/openai_model.py b/src/lighteval/models/endpoints/openai_model.py index 8733474d0..37b8ca347 100644 --- a/src/lighteval/models/endpoints/openai_model.py +++ b/src/lighteval/models/endpoints/openai_model.py @@ -32,6 +32,7 @@ from lighteval.data import GenerativeTaskDataset, LoglikelihoodDataset from lighteval.models.abstract_model import LightevalModel from lighteval.models.endpoints.endpoint_model import ModelInfo +from lighteval.models.model_input import GenerationParameters from lighteval.models.model_output import ( GenerativeResponse, LoglikelihoodResponse, @@ -62,14 +63,30 @@ @dataclass class OpenAIModelConfig: model: str + generation_parameters: GenerationParameters = None + + def __post_init__(self): + if not self.generation_parameters: + self.generation_parameters = GenerationParameters() + + @classmethod + def from_path(cls, path: str) -> "OpenAIModelConfig": + import yaml + + with open(path, "r") as f: + config = yaml.safe_load(f)["model"] + generation_parameters = GenerationParameters.from_dict(config) + return cls(model=config["model_name"], generation_parameters=generation_parameters) class OpenAIClient(LightevalModel): _DEFAULT_MAX_LENGTH: int = 4096 - def __init__(self, config, env_config) -> None: + def __init__(self, config: OpenAIModelConfig, env_config) -> None: api_key = os.environ["OPENAI_API_KEY"] self.client = OpenAI(api_key=api_key) + self.generation_parameters = config.generation_parameters + self.sampling_params = self.generation_parameters.to_vllm_openai_dict() self.model_info = ModelInfo( model_name=config.model, @@ -96,6 +113,7 @@ def __call_api(self, prompt, return_logits, max_new_tokens, num_samples, logit_b logprobs=return_logits, logit_bias=logit_bias, n=num_samples, + **self.sampling_params, ) return response except Exception as e: diff --git a/src/lighteval/models/endpoints/tgi_model.py b/src/lighteval/models/endpoints/tgi_model.py index 3f20e4a57..f0bb712b6 100644 --- 
a/src/lighteval/models/endpoints/tgi_model.py +++ b/src/lighteval/models/endpoints/tgi_model.py @@ -21,14 +21,15 @@ # SOFTWARE. import asyncio -from dataclasses import dataclass +from dataclasses import dataclass, replace from typing import Coroutine, Optional import requests -from huggingface_hub import TextGenerationInputGrammarType, TextGenerationOutput +from huggingface_hub import TextGenerationInputGenerateParameters, TextGenerationInputGrammarType, TextGenerationOutput from transformers import AutoTokenizer from lighteval.models.endpoints.endpoint_model import InferenceEndpointModel, ModelInfo +from lighteval.models.model_input import GenerationParameters from lighteval.utils.imports import NO_TGI_ERROR_MSG, is_tgi_available @@ -50,6 +51,11 @@ class TGIModelConfig: inference_server_address: str inference_server_auth: str model_id: str + generation_parameters: GenerationParameters = None + + def __post_init__(self): + if not self.generation_parameters: + self.generation_parameters = GenerationParameters() @classmethod def from_path(cls, path: str) -> "TGIModelConfig": @@ -65,7 +71,7 @@ def from_path(cls, path: str) -> "TGIModelConfig": with open(path, "r") as f: config = yaml.safe_load(f)["model"] - return cls(**config["instance"]) + return cls(**config["instance"], generation_parameters=GenerationParameters.from_dict(config)) # inherit from InferenceEndpointModel instead of LightevalModel since they both use the same interface, and only overwrite @@ -73,18 +79,22 @@ def from_path(cls, path: str) -> "TGIModelConfig": class ModelClient(InferenceEndpointModel): _DEFAULT_MAX_LENGTH: int = 4096 - def __init__(self, address, auth_token=None, model_id=None) -> None: + def __init__(self, config: TGIModelConfig) -> None: if not is_tgi_available(): raise ImportError(NO_TGI_ERROR_MSG) - headers = {} if auth_token is None else {"Authorization": f"Bearer {auth_token}"} + headers = ( + {} if config.inference_server_auth is None else {"Authorization": f"Bearer 
{config.inference_server_auth}"} + ) - self.client = AsyncClient(address, headers=headers, timeout=240) + self.client = AsyncClient(config.inference_server_address, headers=headers, timeout=240) + self.generation_parameters = config.generation_parameters + self.generation_config = TextGenerationInputGenerateParameters(**self.generation_parameters.to_tgi_ie_dict()) self._max_gen_toks = 256 - self.model_info = requests.get(f"{address}/info", headers=headers).json() + self.model_info = requests.get(f"{config.inference_server_address}/info", headers=headers).json() if "model_id" not in self.model_info: raise ValueError("Error occured when fetching info: " + str(self.model_info)) - if model_id: - self.model_info["model_id"] = model_id + if config.model_id: + self.model_info["model_id"] = config.model_id self._tokenizer = AutoTokenizer.from_pretrained(self.model_info["model_id"]) self._add_special_tokens = True self.use_async = True @@ -107,14 +117,18 @@ def _async_process_request( grammar: Optional[TextGenerationInputGrammarType] = None, ) -> Coroutine[None, list[TextGenerationOutput], str]: # Todo: add an option to launch with conversational instead for chat prompts - generated_text = self.client.generate( - prompt=context, + # We create a copy of the current text generation params + generation_config: TextGenerationInputGenerateParameters = replace( + self.generation_config, + stop=stop_tokens, + max_new_tokens=max_tokens, + details=True, decoder_input_details=True, grammar=grammar, - max_new_tokens=max_tokens, - stop_sequences=stop_tokens, ) + generated_text = self.client.generate(prompt=context, generation_config=generation_config) + return generated_text def _process_request(self, *args, **kwargs) -> TextGenerationOutput: diff --git a/src/lighteval/models/model_input.py b/src/lighteval/models/model_input.py new file mode 100644 index 000000000..04e35be17 --- /dev/null +++ b/src/lighteval/models/model_input.py @@ -0,0 +1,121 @@ +# MIT License + +# Copyright (c) 2024 
The HuggingFace Team + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+ +from dataclasses import asdict, dataclass +from typing import Optional + + +@dataclass +class GenerationParameters: + early_stopping: Optional[bool] = None # vllm, transformers + repetition_penalty: Optional[float] = None # vllm, transformers, tgi + frequency_penalty: Optional[float] = None # vllm, tgi + length_penalty: Optional[float] = None # vllm, transformers + presence_penalty: Optional[float] = None # vllm + + max_new_tokens: Optional[int] = None # vllm, transformers, tgi + min_new_tokens: Optional[int] = None # vllm, transformers + + seed: Optional[int] = None # vllm, tgi + stop_tokens: Optional[list[str]] = None # vllm, transformers, tgi + temperature: Optional[float] = None # vllm, transformers, tgi + top_k: Optional[int] = None # vllm, transformers, tgi + min_p: Optional[float] = None # vllm, transformers + top_p: Optional[int] = None # vllm, transformers, tgi + truncate_prompt: Optional[bool] = None # vllm, tgi + + @classmethod + def from_dict(cls, config_dict: dict): + """Creates a GenerationParameters object from a config dictionary + + Args: + config_dict (dict): Config dictionary. Must obey the following shape: + {"generation": + { + "early_stopping": value, + ... + "truncate_prompt": value + } + } + """ + return GenerationParameters(**config_dict.get("generation", {})) + + def to_vllm_openai_dict(self) -> dict: + """Selects relevant generation and sampling parameters for vllm and openai models. + Doc: https://docs.vllm.ai/en/v0.5.5/dev/sampling_params.html + + Returns: + dict: The parameters to create a vllm.SamplingParams or just provide OpenAI params as such in the model config. + """ + # Task specific sampling params to set in model: n, best_of, use_beam_search + # Generation specific params to set in model: logprobs, prompt_logprobs + return {k: v for k, v in asdict(self).items() if v is not None} + + def to_transformers_dict(self) -> dict: + """Selects relevant generation and sampling parameters for transformers models. 
+ Doc: https://huggingface.co/docs/transformers/v4.46.3/en/main_classes/text_generation#transformers.GenerationConfig + + Note: We actually don't use the GenerationConfig object itself because it automatically initializes a huge number of + parameters, which slows down evals considerably. + + Returns: + dict: The parameters to create a transformers.GenerationConfig in the model config. + """ + # Task specific sampling params to set in model: do_sample, num_return_sequences, num_beams + args = { + "max_new_tokens": self.max_new_tokens, + "min_new_tokens": self.min_new_tokens, + "early_stopping": self.early_stopping, + "stop_strings": self.stop_tokens, + "temperature": self.temperature, + "top_k": self.top_k, + "top_p": self.top_p, + "min_p": self.min_p, + "repetition_penalty": self.repetition_penalty, + "length_penalty": self.length_penalty, + "output_scores": True, + "return_dict_in_generate": True, + } + return {k: v for k, v in args.items() if v is not None} + + def to_tgi_ie_dict(self) -> dict: + """Selects relevant generation and sampling parameters for tgi or inference endpoints models. 
+ """ + # Task specific sampling params to set in model: best_of, do_sample + args = { + "decoder_input_details": True, + "details": True, + "frequency_penalty": self.frequency_penalty, + "max_new_tokens": self.max_new_tokens, + "repetition_penalty": self.repetition_penalty, + "seed": self.seed, + "stop": self.stop_tokens, + "temperature": self.temperature, + "top_k": self.top_k, + "top_p": self.top_p, + "truncate": self.truncate_prompt, + } + return {k: v for k, v in args.items() if v is not None} diff --git a/src/lighteval/models/model_loader.py b/src/lighteval/models/model_loader.py index dff3b9b4a..7219b9b95 100644 --- a/src/lighteval/models/model_loader.py +++ b/src/lighteval/models/model_loader.py @@ -33,8 +33,8 @@ from lighteval.models.endpoints.tgi_model import ModelClient, TGIModelConfig from lighteval.models.litellm_model import LiteLLMClient, LiteLLMModelConfig from lighteval.models.transformers.adapter_model import AdapterModel, AdapterModelConfig -from lighteval.models.transformers.base_model import BaseModel, BaseModelConfig from lighteval.models.transformers.delta_model import DeltaModel, DeltaModelConfig +from lighteval.models.transformers.transformers_model import TransformersModel, TransformersModelConfig from lighteval.models.vllm.vllm_model import VLLMModel, VLLMModelConfig from lighteval.utils.imports import ( NO_LITELLM_ERROR_MSG, @@ -53,7 +53,7 @@ def load_model( # noqa: C901 config: Union[ - BaseModelConfig, + TransformersModelConfig, AdapterModelConfig, DeltaModelConfig, TGIModelConfig, @@ -64,7 +64,7 @@ def load_model( # noqa: C901 LiteLLMModelConfig, ], env_config: EnvConfig, -) -> Union[BaseModel, AdapterModel, DeltaModel, ModelClient, DummyModel]: +) -> Union[TransformersModel, AdapterModel, DeltaModel, ModelClient, DummyModel]: """Will load either a model from an inference server or a model from a checkpoint, depending on the config type. 
@@ -78,7 +78,7 @@ def load_model( # noqa: C901 ValueError: If you did not specify a base model when using delta weights or adapter weights Returns: - Union[BaseModel, AdapterModel, DeltaModel, ModelClient]: The model that will be evaluated + Union[TransformersModel, AdapterModel, DeltaModel, ModelClient]: The model that will be evaluated """ # Inference server loading if isinstance(config, TGIModelConfig): @@ -87,7 +87,7 @@ def load_model( # noqa: C901 if isinstance(config, InferenceEndpointModelConfig) or isinstance(config, ServerlessEndpointModelConfig): return load_model_with_inference_endpoints(config, env_config=env_config) - if isinstance(config, BaseModelConfig): + if isinstance(config, TransformersModelConfig): return load_model_with_accelerate_or_default(config=config, env_config=env_config) if isinstance(config, DummyModelConfig): @@ -138,7 +138,7 @@ def load_model_with_inference_endpoints(config: InferenceEndpointModelConfig, en def load_model_with_accelerate_or_default( - config: Union[AdapterModelConfig, BaseModelConfig, DeltaModelConfig], env_config: EnvConfig + config: Union[AdapterModelConfig, TransformersModelConfig, DeltaModelConfig], env_config: EnvConfig ): if isinstance(config, AdapterModelConfig): model = AdapterModel(config=config, env_config=env_config) @@ -150,7 +150,7 @@ def load_model_with_accelerate_or_default( model = VLLMModel(config=config, env_config=env_config) return model else: - model = BaseModel(config=config, env_config=env_config) + model = TransformersModel(config=config, env_config=env_config) return model diff --git a/src/lighteval/models/nanotron/nanotron_model.py b/src/lighteval/models/nanotron/nanotron_model.py index b7e9b1a5d..5f139174c 100644 --- a/src/lighteval/models/nanotron/nanotron_model.py +++ b/src/lighteval/models/nanotron/nanotron_model.py @@ -48,7 +48,7 @@ LoglikelihoodResponse, LoglikelihoodSingleTokenResponse, ) -from lighteval.models.transformers.base_model import LightevalModel, ModelInfo +from 
lighteval.models.transformers.transformers_model import LightevalModel, ModelInfo from lighteval.tasks.requests import ( GreedyUntilRequest, LoglikelihoodRequest, diff --git a/src/lighteval/models/transformers/adapter_model.py b/src/lighteval/models/transformers/adapter_model.py index e66a1aa1d..4ce3c7f20 100644 --- a/src/lighteval/models/transformers/adapter_model.py +++ b/src/lighteval/models/transformers/adapter_model.py @@ -27,7 +27,7 @@ import torch from transformers import AutoModelForCausalLM, PreTrainedTokenizer -from lighteval.models.transformers.base_model import BaseModel, BaseModelConfig +from lighteval.models.transformers.transformers_model import TransformersModel, TransformersModelConfig from lighteval.models.utils import _get_dtype from lighteval.utils.imports import NO_PEFT_ERROR_MSG, is_peft_available from lighteval.utils.utils import EnvConfig @@ -40,7 +40,7 @@ @dataclass -class AdapterModelConfig(BaseModelConfig): +class AdapterModelConfig(TransformersModelConfig): # Adapter models have the specificity that they look at the base model (= the parent) for the tokenizer and config base_model: str = None @@ -57,7 +57,7 @@ def init_configs(self, env_config: EnvConfig): return self._init_configs(self.base_model, env_config) -class AdapterModel(BaseModel): +class AdapterModel(TransformersModel): def _create_auto_tokenizer(self, config: AdapterModelConfig, env_config: EnvConfig) -> PreTrainedTokenizer: # By default, we look at the model config for the model stored in `base_model` # (= the parent model, not the model of interest) diff --git a/src/lighteval/models/transformers/delta_model.py b/src/lighteval/models/transformers/delta_model.py index 20780f1e7..40a91992a 100644 --- a/src/lighteval/models/transformers/delta_model.py +++ b/src/lighteval/models/transformers/delta_model.py @@ -28,7 +28,7 @@ from tqdm import tqdm from transformers import AutoModelForCausalLM -from lighteval.models.transformers.base_model import BaseModel, BaseModelConfig +from 
lighteval.models.transformers.transformers_model import TransformersModel, TransformersModelConfig from lighteval.models.utils import _get_dtype, _get_model_sha from lighteval.utils.utils import EnvConfig @@ -37,7 +37,7 @@ @dataclass -class DeltaModelConfig(BaseModelConfig): +class DeltaModelConfig(TransformersModelConfig): # Delta models look at the pretrained (= the delta weights) for the tokenizer and model config base_model: str = None @@ -53,7 +53,7 @@ def get_model_sha(self): return _get_model_sha(repo_id=self.pretrained, revision="main") -class DeltaModel(BaseModel): +class DeltaModel(TransformersModel): def _create_auto_model( self, config: DeltaModelConfig, diff --git a/src/lighteval/models/transformers/base_model.py b/src/lighteval/models/transformers/transformers_model.py similarity index 93% rename from src/lighteval/models/transformers/base_model.py rename to src/lighteval/models/transformers/transformers_model.py index b9a958ae1..bed174b4a 100644 --- a/src/lighteval/models/transformers/base_model.py +++ b/src/lighteval/models/transformers/transformers_model.py @@ -22,6 +22,7 @@ import logging import os +import warnings from dataclasses import dataclass from typing import Optional, Tuple, Union @@ -39,10 +40,12 @@ GPTQConfig, PretrainedConfig, ) +from transformers.generation.utils import GenerateOutput, GenerationConfig from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES from lighteval.data import GenerativeTaskDataset, LoglikelihoodDataset, LoglikelihoodSingleTokenDataset from lighteval.models.abstract_model import LightevalModel, ModelInfo +from lighteval.models.model_input import GenerationParameters from lighteval.models.model_output import ( Batch, GenerativeMultiturnResponse, @@ -83,7 +86,7 @@ @dataclass -class BaseModelConfig: +class TransformersModelConfig: """ Base configuration class for models. @@ -124,6 +127,8 @@ class BaseModelConfig: model at a quantized precision. Needed for 4-bit and 8-bit precision. 
trust_remote_code (bool): Whether to trust remote code during model loading. + generation_parameters (GenerationParameters): Range of parameters which will affect the generation. + generation_config (GenerationConfig): GenerationConfig object (only passed during manual creation) Methods: __post_init__(): Performs post-initialization checks on the configuration. @@ -151,6 +156,8 @@ class BaseModelConfig: trust_remote_code: bool = False use_chat_template: bool = False compile: bool = False + generation_parameters: GenerationParameters = None + generation_config: GenerationConfig = None def __post_init__(self): # Making sure this parameter is a boolean @@ -177,6 +184,14 @@ def __post_init__(self): if not isinstance(self.device, str): raise ValueError("Current device must be passed as string.") + if self.generation_config and self.generation_parameters: + raise ValueError( + "Can't use both generation_config and generation_parameters argument. Pass the generation parameters to your generation config object" + ) + + if not self.generation_parameters and not self.generation_config: + self.generation_parameters = GenerationParameters() + def _init_configs(self, model_name: str, env_config: EnvConfig) -> PretrainedConfig: revision = self.revision if self.subfolder: @@ -221,11 +236,22 @@ def get_model_sha(self): return _get_model_sha(repo_id=self.pretrained, revision=self.revision) -class BaseModel(LightevalModel): +@dataclass +class BaseModelConfig(TransformersModelConfig): + def __post_init__(self): + super().__post_init__() + + warnings.warn( + "BaseModelConfig is deprecated and will be removed. 
Use TransformersModelConfig instead", + FutureWarning, + ) + + +class TransformersModel(LightevalModel): def __init__( self, env_config: EnvConfig, - config: BaseModelConfig, + config: TransformersModelConfig, ): """Initializes a HuggingFace `AutoModel` and `AutoTokenizer` for evaluation.""" self._config = config.init_configs(env_config) @@ -249,13 +275,21 @@ def __init__( logger.info(f"Using Data Parallelism, putting model on device {self._device}") self.model = self.model.to(self._device) if config.compile: - logger.info("Compiling the model") - self.model.model.compile() + try: + logger.info("Compiling the model") + self.model.model.compile() + except AttributeError as e: + logger.warning("Could not compile the model because: ", e) self.model_name = _simplify_name(config.pretrained) self.model_sha = config.get_model_sha() self.precision = _get_dtype(config.dtype, config=self._config) + if config.generation_config is None: + self.generation_parameters = config.generation_parameters + self.generation_config_dict = self.generation_parameters.to_transformers_dict() + else: + self.generation_config_dict = config.generation_config.to_dict() if is_accelerate_available(): model_size, _ = calculate_maximum_sizes(self.model) @@ -391,7 +425,9 @@ def init_model_parallel(self, model_parallel: bool | None = None) -> Tuple[bool, ) return model_parallel, max_mem_this_process, device_map - def _create_auto_model(self, config: BaseModelConfig, env_config: EnvConfig) -> transformers.PreTrainedModel: + def _create_auto_model( + self, config: TransformersModelConfig, env_config: EnvConfig + ) -> transformers.PreTrainedModel: """ Creates an instance of the pretrained HF model. 
@@ -428,7 +464,7 @@ def _create_auto_model(self, config: BaseModelConfig, env_config: EnvConfig) -> return model def _create_auto_tokenizer( - self, config: BaseModelConfig, env_config: EnvConfig + self, config: TransformersModelConfig, env_config: EnvConfig ) -> transformers.PreTrainedTokenizer: return self._create_auto_tokenizer_with_name( model_name=config.pretrained, @@ -631,20 +667,30 @@ def greedy_until_multi_turn( # noqa: C901 ], ] ) - model_outputs = self.model.generate( - **model_inputs, - max_new_tokens=max_generated_tokens, - stopping_criteria=stopping_criteria, - do_sample=False, - pad_token_id=self.tokenizer.pad_token_id - if self.tokenizer.pad_token_id - else self.tokenizer.eos_token_id, + + generation_config = self.generation_config_dict.copy() + generation_config.update( + { + "max_new_tokens": max_generated_tokens, + "pad_token_id": self.tokenizer.pad_token_id + if self.tokenizer.pad_token_id + else self.tokenizer.eos_token_id, + "eos_token_id": self.tokenizer.eos_token_id, + "do_sample": False, + } ) - model_outputs = model_outputs[0, model_inputs["input_ids"].size(1) :] - model_generations = [model_outputs] - decoded_generation = self.tokenizer.decode(model_outputs) + + model_outputs: GenerateOutput = self.model.generate( + **model_inputs, stopping_criteria=stopping_criteria, **generation_config + ) + model_outputs = model_outputs.sequences[0, model_inputs["input_ids"].size(1) :] + + # We manage stop tokens in an extra step in case they were incorrectly detected earlier + # (which can happen for multitoken stop sequences) + decoded_generation = self.tokenizer.decode(model_outputs) # should we skip_special_tokens=True here? 
for term in stop_tokens: decoded_generation = decoded_generation.split(term)[0] + model_generations = [model_outputs] input_tokens = [model_inputs["input_ids"]] @@ -671,21 +717,29 @@ def greedy_until_multi_turn( # noqa: C901 ] ) - model_outputs = self.model.generate( + generation_config = self.generation_config_dict.copy() + generation_config.update( + { + "max_new_tokens": max_generated_tokens, + "pad_token_id": self.tokenizer.pad_token_id + if self.tokenizer.pad_token_id + else self.tokenizer.eos_token_id, + "eos_token_id": self.tokenizer.eos_token_id, + "do_sample": False, + } + ) + + model_outputs: GenerateOutput = self.model.generate( input_ids=model_inputs["input_ids"], attention_mask=model_inputs["attention_mask"], - max_new_tokens=max_generated_tokens, stopping_criteria=stopping_criteria, - do_sample=False, - pad_token_id=self.tokenizer.pad_token_id - if self.tokenizer.pad_token_id - else self.tokenizer.eos_token_id, + **generation_config, ) - model_outputs = model_outputs[0, model_inputs["input_ids"].size(1) :] + model_outputs = model_outputs.sequences[0, model_inputs["input_ids"].size(1) :] model_generations.append(model_outputs) - decoded_generation = self.tokenizer.decode(model_outputs, skip_special_tokens=True) input_tokens.append(model_inputs["input_ids"]) + decoded_generation = self.tokenizer.decode(model_outputs, skip_special_tokens=True) for term in stop_tokens: decoded_generation = decoded_generation.split(term)[0] @@ -708,7 +762,7 @@ def greedy_until_multi_turn( # noqa: C901 results.append( GenerativeMultiturnResponse( result=answers, - input_tokens=[], + input_tokens=input_tokens, generated_tokens=[], truncated_tokens_count=0, padded_tokens_count=0, @@ -860,21 +914,24 @@ def _generate( stopping_criteria = stop_sequences_criteria(self.tokenizer, stop_sequences=stop_tokens, batch=batch) batch_size, _ = batch.input_ids.shape - # Compute model generation - outputs = self.model.generate( - input_ids=batch.input_ids, - attention_mask=batch.input_mask, 
+ generation_config = self.generation_config_dict.copy() + generation_config.update( max_new_tokens=max_new_tokens, - stopping_criteria=stopping_criteria, pad_token_id=self.tokenizer.pad_token_id if self.tokenizer.pad_token_id else self.tokenizer.eos_token_id, - return_dict_in_generate=True, - output_scores=True, eos_token_id=self.tokenizer.eos_token_id, do_sample=do_sample, num_return_sequences=num_samples, + output_logits=returns_logits, + renormalize_logits=True, + ) + + # Compute model generation + outputs: GenerateOutput = self.model.generate( + input_ids=batch.input_ids, + attention_mask=batch.input_mask, + stopping_criteria=stopping_criteria, + **generation_config, ) - if returns_logits: - logits = self.model.compute_transition_scores(outputs.sequences, outputs.scores, normalize_logits=True) generations = outputs.sequences[:, batch.input_ids.size(1) :] generations = torch.reshape(generations, (batch_size, num_samples, -1)) generations, len_gens = self.pad_and_gather(generations, num_samples=num_samples) @@ -882,7 +939,7 @@ def _generate( logits, len_logits = None, None if returns_logits: - logits, len_logits = self.pad_and_gather(logits) + logits, len_logits = self.pad_and_gather(outputs.logits) logits = logits.cpu().numpy() # We gather remaining info @@ -1297,6 +1354,16 @@ def _loglikelihood_single_token( return dataset.get_original_order(res) +class BaseModel(TransformersModel): + def __post_init__(self): + super().__post_init__() + + warnings.warn( + "Careful, the BaseModel name is deprecated and will be removed, you should use TransformersModel instead!", + FutureWarning, + ) + + class MultiTokenEOSCriteria(transformers.StoppingCriteria): """Criteria to stop on the specified multi-token sequence.""" diff --git a/src/lighteval/models/vllm/vllm_model.py b/src/lighteval/models/vllm/vllm_model.py index 206fd3a55..3398f7218 100644 --- a/src/lighteval/models/vllm/vllm_model.py +++ b/src/lighteval/models/vllm/vllm_model.py @@ -32,6 +32,7 @@ from lighteval.data 
import GenerativeTaskDataset, LoglikelihoodDataset from lighteval.models.abstract_model import LightevalModel, ModelInfo +from lighteval.models.model_input import GenerationParameters from lighteval.models.model_output import ( GenerativeResponse, LoglikelihoodResponse, @@ -91,9 +92,13 @@ class VLLMModelConfig: True # whether to add a space at the start of each continuation in multichoice generation ) pairwise_tokenization: bool = False # whether to tokenize the context and continuation separately or together. + generation_parameters: GenerationParameters = None # sampling parameters to use for generation subfolder: Optional[str] = None - temperature: float = 0.6 # will be used for multi sampling tasks, for tasks requiring no sampling, this will be ignored and set to 0. + + def __post_init__(self): + if not self.generation_parameters: + self.generation_parameters = GenerationParameters() class VLLMModel(LightevalModel): @@ -123,6 +128,7 @@ def __init__( self.precision = _get_dtype(config.dtype, config=self._config) self.model_info = ModelInfo(model_name=self.model_name, model_sha=self.model_sha) + self.sampling_params = SamplingParams(**config.generation_parameters.to_vllm_openai_dict()) self.pairwise_tokenization = config.pairwise_tokenization @property @@ -306,16 +312,18 @@ def _generate( generate: bool = True, ) -> list[GenerativeResponse]: """Contains the actual logic of the generation.""" + sampling_params = self.sampling_params.clone() or SamplingParams() if generate: - sampling_params = SamplingParams( - temperature=float(self._config.temperature) if num_samples > 1 else 0.0, - n=num_samples, - max_tokens=max_new_tokens, - stop=stop_tokens, - logprobs=1 if returns_logits else 0, - ) + sampling_params.n = num_samples + sampling_params.max_tokens = max_new_tokens + sampling_params.stop = stop_tokens + sampling_params.logprobs = 1 if returns_logits else 0 + else: - sampling_params = SamplingParams(temperature=0, prompt_logprobs=1, max_tokens=1, 
detokenize=False) + sampling_params.temperature = 0 + sampling_params.prompt_logprobs = 1 + sampling_params.max_tokens = 1 + sampling_params.detokenize = False if self.data_parallel_size > 1: # vLLM hangs if tensor_parallel > 1 and resources are set in ray.remote diff --git a/src/lighteval/pipeline.py b/src/lighteval/pipeline.py index facecd8ec..6a40d2801 100644 --- a/src/lighteval/pipeline.py +++ b/src/lighteval/pipeline.py @@ -33,7 +33,7 @@ from lighteval.logging.evaluation_tracker import EvaluationTracker from lighteval.metrics.utils.metric_utils import MetricCategory -from lighteval.models.model_loader import BaseModel, load_model +from lighteval.models.model_loader import TransformersModel, load_model from lighteval.models.model_output import ModelResponse from lighteval.tasks.lighteval_task import LightevalTask, create_requests_from_tasks from lighteval.tasks.registry import Registry, taskinfo_selector @@ -180,10 +180,10 @@ def _init_model(self, model_config, model): ) else: return load_model(config=model_config, env_config=self.pipeline_parameters.env_config) - if isinstance(model, BaseModel): + if isinstance(model, TransformersModel): return model else: - return BaseModel.from_model( + return TransformersModel.from_model( model=model, use_chat_template=self.pipeline_parameters.use_chat_template, env_config=self.pipeline_parameters.env_config, diff --git a/src/lighteval/tasks/lighteval_task.py b/src/lighteval/tasks/lighteval_task.py index b2f26850d..09886e4db 100644 --- a/src/lighteval/tasks/lighteval_task.py +++ b/src/lighteval/tasks/lighteval_task.py @@ -41,7 +41,7 @@ apply_target_perplexity_metric, ) from lighteval.metrics.metrics import Metric, MetricCategory, Metrics -from lighteval.models.transformers.base_model import BaseModel +from lighteval.models.transformers.transformers_model import TransformersModel from lighteval.tasks.prompt_manager import PromptManager from lighteval.tasks.requests import ( Doc, @@ -578,7 +578,7 @@ def 
create_requests_from_tasks( # noqa: C901 task_dict: dict[str, LightevalTask], fewshot_dict: dict[str, list[Tuple[int, bool]]], num_fewshot_seeds: int, - lm: BaseModel, + lm: TransformersModel, max_samples: int | None, evaluation_tracker: "EvaluationTracker", use_chat_template: bool, @@ -594,7 +594,7 @@ def create_requests_from_tasks( # noqa: C901 fewshot_dict (dict[str, list[Tuple[int, bool]]]): A dictionary of few shot examples. num_fewshot_seeds (int): number of few shot seeds. - lm (BaseModel): language model class that will be used to eventually + lm (TransformersModel): language model class that will be used to eventually truncate the few shot examples (we need the maximum input size of the model) max_samples (int): maximum number of samples. diff --git a/tests/models/endpoints/test_tgi_model.py b/tests/models/endpoints/test_tgi_model.py index 305034278..40bae2ceb 100644 --- a/tests/models/endpoints/test_tgi_model.py +++ b/tests/models/endpoints/test_tgi_model.py @@ -33,7 +33,27 @@ class TestTGIModelConfig: [ ( "examples/model_configs/tgi_model.yaml", - {"inference_server_address": "", "inference_server_auth": None, "model_id": None}, + { + "inference_server_address": "", + "inference_server_auth": None, + "model_id": None, + "generation_parameters": { + "early_stopping": None, + "frequency_penalty": None, + "length_penalty": None, + "max_new_tokens": None, + "min_new_tokens": None, + "min_p": None, + "presence_penalty": None, + "repetition_penalty": None, + "seed": None, + "stop_tokens": None, + "temperature": None, + "top_k": None, + "top_p": None, + "truncate_prompt": None, + }, + }, ), ], ) diff --git a/tests/models/test_base_model.py b/tests/models/test_base_model.py index 4f26d2924..faf9c5755 100644 --- a/tests/models/test_base_model.py +++ b/tests/models/test_base_model.py @@ -21,13 +21,13 @@ # SOFTWARE. 
from lighteval.models.model_loader import load_model -from lighteval.models.transformers.base_model import BaseModel, BaseModelConfig +from lighteval.models.transformers.transformers_model import TransformersModel, TransformersModelConfig from lighteval.utils.utils import EnvConfig def test_empty_requests(): - model_config = BaseModelConfig("hf-internal-testing/tiny-random-LlamaForCausalLM") - model: BaseModel = load_model(config=model_config, env_config=EnvConfig(cache_dir=".")) + model_config = TransformersModelConfig("hf-internal-testing/tiny-random-LlamaForCausalLM") + model: TransformersModel = load_model(config=model_config, env_config=EnvConfig(cache_dir=".")) assert model.loglikelihood([]) == [] assert model.loglikelihood_single_token([]) == []