From e770039d2fe0b992a18078b1b99eba067ba5d0dc Mon Sep 17 00:00:00 2001
From: Joel Niklaus
Date: Sat, 4 Jan 2025 11:25:13 +0100
Subject: [PATCH 1/4] Made litellm judge backend more robust.

---
 src/lighteval/metrics/llm_as_judge.py | 27 ++++++++++++++++++---------
 1 file changed, 18 insertions(+), 9 deletions(-)

diff --git a/src/lighteval/metrics/llm_as_judge.py b/src/lighteval/metrics/llm_as_judge.py
index e98a64aa4..bdfac5267 100644
--- a/src/lighteval/metrics/llm_as_judge.py
+++ b/src/lighteval/metrics/llm_as_judge.py
@@ -28,6 +28,7 @@
 
 from tqdm import tqdm
 
+from lighteval.models.model_output import ModelResponse
 from lighteval.utils.imports import is_litellm_available, is_openai_available, is_vllm_available
 
 
@@ -195,20 +196,28 @@ def __call_litellm(self, prompts):
         def __call_api(prompt):
             for _ in range(self.API_MAX_RETRY):
                 try:
-                    response = litellm.completion(
-                        model=self.model,
-                        messages=prompt,
-                        response_format={"type": "text"},
-                        max_tokens=512,
-                        n=1,
-                        caching=True,
-                    )
+                    kwargs = {
+                        "model": self.model,
+                        "messages": prompt,
+                        "response_format": {"type": "text"},
+                        "max_tokens": 512,
+                        "n": 1,
+                        "caching": True,
+                    }
+                    response = litellm.completion(**kwargs)
                     text = response.choices[0].message.content
+                    if text is None:
+                        kwargs["caching"] = False
+                        response = litellm.completion(**kwargs)
+                        text = response.choices[0].message.content
+                        if text is None:
+                            # Just return an error response if the second attempt fails too
+                            return ModelResponse(text="Failed to get response from the API.", model=self.model)
                     return text
                 except Exception as e:
                     logger.warning(f"{type(e), e}")
                     time.sleep(self.API_RETRY_SLEEP)
-            raise Exception("Failed to get response from the API")
+            return ModelResponse(text="Failed to get response from the API.", model=self.model)
 
         results = []
         with ThreadPoolExecutor(100) as executor:

From 01803d30b35ad3245e11d1c06a9efa0451f2444b Mon Sep 17 00:00:00 2001
From: Joel Niklaus
Date: Mon, 6 Jan 2025 17:48:52 -0800
Subject: [PATCH 2/4] Added failed flag to ModelResponse.

---
 src/lighteval/metrics/llm_as_judge.py | 10 ++++++----
 src/lighteval/models/model_output.py  |  1 +
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/src/lighteval/metrics/llm_as_judge.py b/src/lighteval/metrics/llm_as_judge.py
index bdfac5267..81e1d7d30 100644
--- a/src/lighteval/metrics/llm_as_judge.py
+++ b/src/lighteval/metrics/llm_as_judge.py
@@ -206,18 +206,20 @@ def __call_api(prompt):
                     }
                     response = litellm.completion(**kwargs)
                     text = response.choices[0].message.content
-                    if text is None:
+                    if not text or response.failed:
                         kwargs["caching"] = False
                         response = litellm.completion(**kwargs)
                         text = response.choices[0].message.content
-                        if text is None:
+                        if not text or response.failed:
                             # Just return an error response if the second attempt fails too
-                            return ModelResponse(text="Failed to get response from the API.", model=self.model)
+                            return ModelResponse(
+                                text="Failed to get response from the API.", model=self.model, failed=True
+                            )
                     return text
                 except Exception as e:
                     logger.warning(f"{type(e), e}")
                     time.sleep(self.API_RETRY_SLEEP)
-            return ModelResponse(text="Failed to get response from the API.", model=self.model)
+            return ModelResponse(text="Failed to get response from the API.", model=self.model, failed=True)
 
         results = []
         with ThreadPoolExecutor(100) as executor:

diff --git a/src/lighteval/models/model_output.py b/src/lighteval/models/model_output.py
index 7d0ba4818..b485371ca 100644
--- a/src/lighteval/models/model_output.py
+++ b/src/lighteval/models/model_output.py
@@ -33,6 +33,7 @@ class ModelResponse:
     generated_tokens: list[int] = field(default_factory=list)  # model generations
     truncated_tokens_count: Optional[int] = 0  # How many tokens truncated
     padded_tokens_count: Optional[int] = 0  # How many tokens of padding
+    failed: bool = False
 
     def get_result_for_eval(self):
         raise NotImplementedError()

From 889a9cdc2fdce81b2b564795bb0bca13a834c82d Mon Sep 17 00:00:00 2001
From: Joel Niklaus
Date: Tue, 7 Jan 2025 06:37:40 -0800
Subject: [PATCH 3/4] Fixed wrong model response.

---
 src/lighteval/metrics/llm_as_judge.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/src/lighteval/metrics/llm_as_judge.py b/src/lighteval/metrics/llm_as_judge.py
index 81e1d7d30..897d9afec 100644
--- a/src/lighteval/metrics/llm_as_judge.py
+++ b/src/lighteval/metrics/llm_as_judge.py
@@ -212,14 +212,12 @@ def __call_api(prompt):
                         text = response.choices[0].message.content
                         if not text or response.failed:
                             # Just return an error response if the second attempt fails too
-                            return ModelResponse(
-                                text="Failed to get response from the API.", model=self.model, failed=True
-                            )
+                            return ModelResponse(result="Failed to get response from the API.", failed=True)
                     return text
                 except Exception as e:
                     logger.warning(f"{type(e), e}")
                     time.sleep(self.API_RETRY_SLEEP)
-            return ModelResponse(text="Failed to get response from the API.", model=self.model, failed=True)
+            return ModelResponse(result="Failed to get response from the API.", failed=True)
 
         results = []
         with ThreadPoolExecutor(100) as executor:

From 43c51317f0f5a106f1d78e01ee9550f55d089633 Mon Sep 17 00:00:00 2001
From: Joel Niklaus
Date: Tue, 7 Jan 2025 07:17:28 -0800
Subject: [PATCH 4/4] Removed model response and replaced with string.

---
 src/lighteval/metrics/llm_as_judge.py | 11 ++++++-----
 src/lighteval/models/model_output.py  |  1 -
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/lighteval/metrics/llm_as_judge.py b/src/lighteval/metrics/llm_as_judge.py
index 897d9afec..23beda76f 100644
--- a/src/lighteval/metrics/llm_as_judge.py
+++ b/src/lighteval/metrics/llm_as_judge.py
@@ -28,7 +28,6 @@
 
 from tqdm import tqdm
 
-from lighteval.models.model_output import ModelResponse
 from lighteval.utils.imports import is_litellm_available, is_openai_available, is_vllm_available
 
 
@@ -194,6 +193,7 @@ def __call_litellm(self, prompts):
         import litellm
 
         def __call_api(prompt):
+            error_message = "ERROR: Failed to get response from the API."
             for _ in range(self.API_MAX_RETRY):
                 try:
                     kwargs = {
@@ -206,18 +206,19 @@ def __call_api(prompt):
                     }
                     response = litellm.completion(**kwargs)
                     text = response.choices[0].message.content
-                    if not text or response.failed:
+                    if not text or text == error_message:
                         kwargs["caching"] = False
                         response = litellm.completion(**kwargs)
                         text = response.choices[0].message.content
-                        if not text or response.failed:
+                        if not text or text == error_message:
                             # Just return an error response if the second attempt fails too
-                            return ModelResponse(result="Failed to get response from the API.", failed=True)
+                            logger.error(f"Failed to get response from the API for prompt: {prompt}")
+                            return error_message
                     return text
                 except Exception as e:
                     logger.warning(f"{type(e), e}")
                     time.sleep(self.API_RETRY_SLEEP)
-            return ModelResponse(result="Failed to get response from the API.", failed=True)
+            return error_message
 
         results = []
         with ThreadPoolExecutor(100) as executor:

diff --git a/src/lighteval/models/model_output.py b/src/lighteval/models/model_output.py
index b485371ca..7d0ba4818 100644
--- a/src/lighteval/models/model_output.py
+++ b/src/lighteval/models/model_output.py
@@ -33,7 +33,6 @@ class ModelResponse:
     generated_tokens: list[int] = field(default_factory=list)  # model generations
     truncated_tokens_count: Optional[int] = 0  # How many tokens truncated
     padded_tokens_count: Optional[int] = 0  # How many tokens of padding
-    failed: bool = False
 
     def get_result_for_eval(self):
         raise NotImplementedError()
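
For reference, here is the retry behavior the series converges on, consolidated in one place. This is a minimal standalone sketch, not lighteval code: the function name call_judge, the module-level constants, and the logger setup stand in for the instance attributes (self.model, self.API_MAX_RETRY, self.API_RETRY_SLEEP) used inside __call_litellm / __call_api in the patches above.

import logging
import time

import litellm  # same client the patches target

logger = logging.getLogger(__name__)

API_MAX_RETRY = 3  # illustrative values; lighteval keeps these on the judge instance
API_RETRY_SLEEP = 10
ERROR_MESSAGE = "ERROR: Failed to get response from the API."


def call_judge(model: str, messages: list[dict]) -> str:
    """Sketch of the final __call_api logic: call with caching enabled, retry once
    with caching disabled if the (possibly cached) answer is empty or the error
    sentinel, and return the sentinel string instead of raising."""
    kwargs = {
        "model": model,
        "messages": messages,
        "response_format": {"type": "text"},
        "max_tokens": 512,
        "n": 1,
        "caching": True,
    }
    for _ in range(API_MAX_RETRY):
        try:
            text = litellm.completion(**kwargs).choices[0].message.content
            if not text or text == ERROR_MESSAGE:
                # A cached empty or error answer should not be reused: bypass the cache once.
                kwargs["caching"] = False
                text = litellm.completion(**kwargs).choices[0].message.content
                if not text or text == ERROR_MESSAGE:
                    logger.error(f"Failed to get response from the API for prompt: {messages}")
                    return ERROR_MESSAGE
            return text
        except Exception as e:
            logger.warning(f"{type(e), e}")
            time.sleep(API_RETRY_SLEEP)
    return ERROR_MESSAGE

Returning a plain string (rather than a ModelResponse or an exception) keeps the judge pipeline running on transient API failures, at the cost of the caller having to treat the sentinel text as a failed judgment.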