From c05569579a10d41e5e97e7a381107df19a37fd7c Mon Sep 17 00:00:00 2001
From: Mauricio DIAZ
Date: Tue, 21 Jan 2025 22:40:07 +0100
Subject: [PATCH] Add tasks for benchmarking French models

---
 community_tasks/french_evals.py | 255 ++++++++++++++++++++++++++++++++
 1 file changed, 255 insertions(+)
 create mode 100644 community_tasks/french_evals.py

diff --git a/community_tasks/french_evals.py b/community_tasks/french_evals.py
new file mode 100644
index 000000000..5bd76a245
--- /dev/null
+++ b/community_tasks/french_evals.py
@@ -0,0 +1,255 @@
+# MIT License
+
+# Copyright (c) 2024 The HuggingFace Team
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+# ruff: noqa: F405, F403, F401
+"""
+Custom evaluation tasks for lighteval.
+
+This file creates a TASKS_TABLE which is then imported by LightEval.
+
+This module implements tasks for French-specific datasets.
+See: https://huggingface.co/fr-gouv-coordination-ia
+"""
+
+import random
+
+import numpy as np
+from aenum import extend_enum
+
+import lighteval.tasks.extended.ifeval.instructions_registry as instructions_registry
+from lighteval.metrics.metrics import Metrics, SampleLevelMetric
+from lighteval.metrics.utils.metric_utils import (
+    MetricCategory,
+    MetricUseCase,
+    SampleLevelMetricGrouping,
+)
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.requests import Doc
+
+
+# DEFINE YOUR PROMPT FUNCTIONS
+# Define as many as you need for your different tasks.
+# IFEval-fr prompt function
+def prompt_ifeval_fr(line, task_name: str = None):
+    """Defines how to go from a dataset line to a Doc object.
+
+    Follow the examples in src/lighteval/tasks/tasks_prompt_formatting.py, or see
+    the README for more information about what this function should do.
+    """
+    return Doc(
+        task_name=task_name,
+        query=line["prompt"],
+        choices=[""],
+        gold_index=0,
+        instruction="",
+        specific={"instructions_id_list": line["instruction_id_list"], "kwargs": line["kwargs"]},
+    )
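+
+
+# A minimal usage sketch for prompt_ifeval_fr (never called during evaluation):
+# it builds the Doc produced for one hand-written row. The keys "prompt",
+# "instruction_id_list" and "kwargs" are the fields the function reads; the
+# French prompt text and the instruction id (borrowed from the English IFEval
+# registry) are invented for illustration.
+def _example_ifeval_fr_doc() -> Doc:
+    row = {
+        "prompt": "Écrivez une réponse d'exactement trois paragraphes.",
+        "instruction_id_list": ["length_constraints:number_paragraphs"],
+        "kwargs": [{"num_paragraphs": 3}],
+    }
+    doc = prompt_ifeval_fr(row, task_name="community|ifeval-fr")
+    assert doc.specific["instructions_id_list"] == row["instruction_id_list"]
+    return doc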
+
+
+# gpqa-fr prompt function
+LETTER_INDICES = [
+    "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M",
+    "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z",
+]
+
+
+def prompt_gpqa_fr(line, task_name: str = None):
+    """Defines how to go from a dataset line to a Doc object.
+
+    Follow the examples in src/lighteval/tasks/tasks_prompt_formatting.py, or see
+    the README for more information about what this function should do.
+    """
+    gold_index = random.randint(0, 3)
+    choices = [line["Réponse incorrecte 1"], line["Réponse incorrecte 2"], line["Réponse incorrecte 3"]]
+    choices.insert(gold_index, line["Réponse correcte"])
+
+    instruction = "Choisissez la réponse correcte aux questions suivantes.\n\n"
+
+    query = f"Question: {line['Question']}\n"
+    query += "".join([f"{key}. {choice}\n" for key, choice in zip(LETTER_INDICES, choices)])
+    query += "Réponse: "
+    return Doc(
+        task_name=task_name,
+        query=f"{instruction}{query}",
+        choices=LETTER_INDICES[: len(choices)],
+        gold_index=gold_index,
+        instruction=instruction,
+    )
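+
+
+# Another illustrative sketch, this time for prompt_gpqa_fr: the gold answer is
+# inserted at a random position among the four lettered choices, and gold_index
+# records where it landed. The column names are the ones the function reads from
+# the gpqa-fr dataset; the row contents are invented.
+def _example_gpqa_fr_doc() -> Doc:
+    row = {
+        "Question": "Quelle est la charge électrique du positron ?",
+        "Réponse correcte": "+1 e",
+        "Réponse incorrecte 1": "-1 e",
+        "Réponse incorrecte 2": "0",
+        "Réponse incorrecte 3": "+2 e",
+    }
+    doc = prompt_gpqa_fr(row, task_name="community|gpqa-fr")
+    assert doc.choices == ["A", "B", "C", "D"]
+    assert doc.query.startswith("Choisissez la réponse correcte")
+    return doc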
+
+
+submetric_names = [
+    "prompt_level_strict_acc",
+    "inst_level_strict_acc",
+    "prompt_level_loose_acc",
+    "inst_level_loose_acc",
+]
+
+
+def ifeval_metric(predictions: list[str], formatted_doc: Doc, **kwargs) -> dict:
+    response = predictions[0]
+
+    # Strict instructions
+    instruction_list = formatted_doc.specific["instructions_id_list"]
+    all_kwargs = formatted_doc.specific["kwargs"]
+    prompt = formatted_doc.query
+
+    # Loose instructions
+    r = response.split("\n")
+    response_remove_first = "\n".join(r[1:]).strip()
+    response_remove_last = "\n".join(r[:-1]).strip()
+    response_remove_both = "\n".join(r[1:-1]).strip()
+    revised_response = response.replace("*", "")
+    revised_response_remove_first = response_remove_first.replace("*", "")
+    revised_response_remove_last = response_remove_last.replace("*", "")
+    revised_response_remove_both = response_remove_both.replace("*", "")
+    all_responses = [
+        response,
+        revised_response,
+        response_remove_first,
+        response_remove_last,
+        response_remove_both,
+        revised_response_remove_first,
+        revised_response_remove_last,
+        revised_response_remove_both,
+    ]
+
+    is_following_list_strict = []
+    is_following_list_loose = []
+
+    for index, instruction_id in enumerate(instruction_list):
+        instruction_cls = instructions_registry.INSTRUCTION_DICT[instruction_id]
+        instruction = instruction_cls(instruction_id)
+
+        # Remove None values from kwargs to avoid unexpected keyword argument
+        # errors in the build_description method.
+        task_kwargs = {k: v for k, v in all_kwargs[index].items() if v}
+        instruction.build_description(**task_kwargs)
+        args = instruction.get_instruction_args()
+        if args and "prompt" in args:
+            instruction.build_description(prompt=prompt)
+
+        # Strict
+        if response.strip() and instruction.check_following(response):
+            is_following_list_strict.append(True)
+        else:
+            is_following_list_strict.append(False)
+
+        # Loose
+        is_following = False
+        for r in all_responses:
+            if r.strip() and instruction.check_following(r):
+                is_following = True
+                break
+
+        is_following_list_loose.append(is_following)
+
+    return {
+        "prompt_level_strict_acc": int(all(is_following_list_strict)),
+        "inst_level_strict_acc": is_following_list_strict,
+        "prompt_level_loose_acc": int(all(is_following_list_loose)),
+        "inst_level_loose_acc": is_following_list_loose,
+    }
+
+
+def agg_inst_level_acc(items):
+    flat_items = [item for sublist in items for item in sublist]
+    inst_level_acc = sum(flat_items) / len(flat_items)
+    return inst_level_acc
+
+
+ifeval_metrics = SampleLevelMetricGrouping(
+    metric_name=submetric_names,
+    higher_is_better={n: True for n in submetric_names},
+    category=MetricCategory.GENERATIVE,
+    use_case=MetricUseCase.ACCURACY,
+    sample_level_fn=ifeval_metric,
+    corpus_level_fn={
+        "prompt_level_strict_acc": np.mean,
+        "inst_level_strict_acc": agg_inst_level_acc,
+        "prompt_level_loose_acc": np.mean,
+        "inst_level_loose_acc": agg_inst_level_acc,
+    },
+)
+
+## EVAL WITH NO SUBSET ##
+# This is how you create a simple task (like HellaSwag) that has a single subset
+# attached to it and a single possible evaluation.
+# IFEval-fr task
+ifeval_fr_task = LightevalTaskConfig(
+    name="ifeval-fr",
+    prompt_function=prompt_ifeval_fr,  # must be defined in the file or imported from src/lighteval/tasks/tasks_prompt_formatting.py
+    suite=["community"],
+    hf_repo="fr-gouv-coordination-ia/IFEval-fr",
+    hf_subset="default",
+    metric=[ifeval_metrics],
+    hf_avail_splits=["train"],
+    evaluation_splits=["train"],
+    few_shots_split="train",
+    few_shots_select="random_sampling",
+    generation_size=1280,
+    stop_sequence=[],  # no stop sequence; the EOT token is used instead
+    version="0.1",
+)
+
+# GPQA-fr task
+gpqa_fr_task = LightevalTaskConfig(
+    name="gpqa-fr",
+    suite=["community"],
+    prompt_function=prompt_gpqa_fr,
+    hf_repo="fr-gouv-coordination-ia/gpqa-fr",
+    hf_subset="gpqa_main",
+    hf_avail_splits=["train"],
+    evaluation_splits=["train"],
+    few_shots_split=None,
+    few_shots_select="random_sampling",
+    generation_size=1,
+    metric=[Metrics.loglikelihood_acc_single_token],
+    stop_sequence=["\n"],
+    trust_dataset=True,
+    version=0,
+)
+
+# STORE YOUR EVALS
+TASKS_TABLE = [ifeval_fr_task, gpqa_fr_task]
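+
+
+# A small smoke test, following the convention used by lighteval community task
+# templates: running this module directly lists the registered tasks without
+# launching an evaluation.
+if __name__ == "__main__":
+    print([t.name for t in TASKS_TABLE])
+    print(len(TASKS_TABLE))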