diff --git a/utilization/dataset/agieval_cot.py b/utilization/dataset/agieval_cot.py
index 50d31f71..3bdea308 100644
--- a/utilization/dataset/agieval_cot.py
+++ b/utilization/dataset/agieval_cot.py
@@ -65,7 +65,10 @@ class Agieval_cot(GenerationDataset):
     supported_cot = ["base"]
 
     def init_arguments(self):
-        self.extra_model_args = dict(stop=["\n"]) if self.cot is None else dict()
+        if self.cot is None:
+            # when using chain-of-thought, responses may span multiple lines
+            self.extra_model_args["stop"] = ["\n"]
+
         text = ""
         text += "gen" if self.subset_name in AGIEVAL_NO_LETTER_CHOICE_TASKS else "mcq"
         text += "_zh" if self.subset_name in AGIEVAL_ZH_PROMPT_TASKS else "_en"
diff --git a/utilization/dataset/bbh.py b/utilization/dataset/bbh.py
index 7ed7228a..df42bdd1 100644
--- a/utilization/dataset/bbh.py
+++ b/utilization/dataset/bbh.py
@@ -69,7 +69,9 @@ class Bbh(GenerationDataset):
 
     def init_arguments(self):
         self.bbh_instruction = BBH_PROMPTS[self.subset_name]
-        self.extra_model_args = dict(stop=["\n"]) if self.cot is None else dict()
+        if self.cot is None:
+            # when using chain-of-thought, responses may span multiple lines
+            self.extra_model_args["stop"] = ["\n"]
 
     def format_instance(self, instance):
         target = instance["answer"]
diff --git a/utilization/dataset/gaokao.py b/utilization/dataset/gaokao.py
index a1cc17a3..315fd09b 100644
--- a/utilization/dataset/gaokao.py
+++ b/utilization/dataset/gaokao.py
@@ -64,7 +64,8 @@ class Gaokao(GenerationDataset):
 
     def init_arguments(self):
         self.gaokao_instruction = GAOKAO_PROMPTS[self.subset_name]
-        self.extra_model_args = dict(temperature=0.3, max_tokens=4096)
+        self.extra_model_args["temperature"] = 0.3
+        self.extra_model_args["max_tokens"] = 4096
 
         # According to https://github.com/OpenLMLab/GAOKAO-Bench/blob/main/Models/openai_gpt4.py
         # We use temperature=0.3 and max_tokens=4096
diff --git a/utilization/dataset/gsm8k.py b/utilization/dataset/gsm8k.py
index c4267b4c..2856d652 100644
--- a/utilization/dataset/gsm8k.py
+++ b/utilization/dataset/gsm8k.py
@@ -28,8 +28,9 @@ class Gsm8k(GenerationDataset):
     _extract_numbers = re.compile(r"[-+]?\d*\.\d+|\d+")
 
     def init_arguments(self):
-        if self.model_type == 'base':
-            self.extra_model_args['stop'] = ['\n']
+        if self.cot is None:
+            # when using chain-of-thought, responses may span multiple lines
+            self.extra_model_args["stop"] = ["\n"]
 
     def load_raw_dataset(self, dataset_path, subset_name, evaluation_set, example_set):
         super().load_raw_dataset(dataset_path, subset_name, evaluation_set, example_set)
@@ -39,7 +40,7 @@ def load_raw_dataset(self, dataset_path, subset_name, evaluation_set, example_se
             self.example_data = LEAST_TO_MOST_EXAMPLARS
         elif self.cot == 'pal':
             self.example_data = PAL_EXAMPLARS
-            self.instruction = "Let's use python to solve math problems. Here are some examples how to do it."
+            self.instruction = "Let's use python to solve math problems. Here are some examples how to do it.\n\nQuestion: {{question.replace('\n', ' ')}}\nAnswer:"
 
     def post_processing(self, predictions):
         new_predictions = []
@@ -74,7 +75,7 @@ def post_processing(self, predictions):
 
     def format_instance(self, instance):
         # remove decimal seperators
-        instance["answer"] = ' ' + self._decimal_separator.sub(r"\1\2", instance["answer"])
+        instance["answer"] = ' ' + self._decimal_separator.sub("", instance["answer"])
 
         # few-shot examples might not contain "####"
         if "####" in instance["answer"]:
diff --git a/utilization/dataset/icl_strategies.py b/utilization/dataset/icl_strategies.py
index 03d06f46..f9e673fc 100644
--- a/utilization/dataset/icl_strategies.py
+++ b/utilization/dataset/icl_strategies.py
@@ -75,7 +75,7 @@ def global_entropy_ordering_strategy(indices, labels, example_dataset, call_mode
     return list(best_perm)
 
 
-def ape(example_dataset, eval_dataset, call_model, api_key):
+def ape(example_dataset, eval_dataset, call_model):
     """
     generate instructions using APE
 
@@ -87,6 +87,8 @@ def ape(example_dataset, eval_dataset, call_model, api_key):
         List[str]: results of likelihood evaluation
         List[float]: scores based on log probability
     """
+    import openai
+    api_key = openai.api_key
 
     class ModelArguments:
diff --git a/utilization/dataset/math.py b/utilization/dataset/math.py
index 6cd48d30..192cfe0c 100644
--- a/utilization/dataset/math.py
+++ b/utilization/dataset/math.py
@@ -38,7 +38,8 @@ class Math(GenerationDataset):
 
     def init_arguments(self):
         if self.model_type == 'base':
-            self.extra_model_args['stop'] = ['\n\n']
+            # when evaluating a base model, responses may span multiple lines
+            self.extra_model_args.setdefault("stop", []).append("\n\n")
 
     @staticmethod
     def normalize_final_answer(final_answer: str) -> str:
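
A note on the recurring stop-sequence hunks (agieval_cot.py, bbh.py, gsm8k.py): a stop sequence of "\n" ends generation at the first newline, which suits single-line answers but would truncate a chain-of-thought after its first reasoning step, hence the "if self.cot is None" guard. The sketch below illustrates the truncation semantics in plain Python; truncate_at_stop is a hypothetical helper for illustration, not a function in this codebase.

    def truncate_at_stop(completion: str, stop: list) -> str:
        """Keep only the text before the earliest stop sequence, if any."""
        cut = len(completion)
        for s in stop:
            idx = completion.find(s)
            if idx != -1:
                cut = min(cut, idx)
        return completion[:cut]

    cot = "Step 1: 2 + 2 = 4\nStep 2: 4 * 3 = 12\nThe answer is 12"
    # with stop=["\n"] a chain-of-thought is cut after one step
    assert truncate_at_stop(cot, ["\n"]) == "Step 1: 2 + 2 = 4"
    # with no stop sequence the full multi-line reasoning survives
    assert truncate_at_stop(cot, []) == cot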
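
The new PAL instruction in gsm8k.py embeds a {{question.replace('\n', ' ')}} placeholder. Assuming the framework renders instruction strings as Jinja2 templates (an assumption suggested by the double-brace syntax, not confirmed by this diff), the effect is to flatten a multi-line question onto a single prompt line:

    from jinja2 import Template  # assumption: instructions are rendered with Jinja2

    instruction = ("Let's use python to solve math problems. Here are some examples "
                   "how to do it.\n\nQuestion: {{question.replace('\n', ' ')}}\nAnswer:")
    prompt = Template(instruction).render(question="Janet has 3 apples.\nShe buys 2 more.")
    # the newline inside the question is replaced by a space in the prompt
    assert prompt.endswith("Question: Janet has 3 apples. She buys 2 more.\nAnswer:")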
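
In the gsm8k.py format_instance hunk, the replacement passed to _decimal_separator.sub changes from r"\1\2" to the empty string. That is only digit-preserving if the compiled pattern matches the comma alone, for instance with zero-width lookarounds; the pattern itself sits outside the hunk, so treat this as an assumption to verify. A comparison of the two styles:

    import re

    # group-based pattern: the replacement must re-emit the captured digits
    with_groups = re.compile(r"(\d),(\d)")
    assert with_groups.sub(r"\1\2", "1,234,567") == "1234567"

    # lookaround pattern: only the comma itself is consumed, so an empty
    # replacement suffices, which is what the new sub("") call relies on
    with_lookarounds = re.compile(r"(?<=\d),(?=\d)")
    assert with_lookarounds.sub("", "1,234,567") == "1234567"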
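
On the math.py hunk: setdefault is the right accessor for appending to a possibly missing "stop" list. dict.get with a default returns a detached object when the key is absent, so an append on it is silently lost, while dict.setdefault stores the default in the dict before returning it. In plain dict terms:

    extra_model_args = {}

    # dict.get hands back a throwaway default when the key is absent,
    # so this append mutates a list that is immediately discarded
    extra_model_args.get("stop", []).append("\n\n")
    assert "stop" not in extra_model_args

    # dict.setdefault stores the default first and returns the stored
    # list, so the appended stop sequence actually persists
    extra_model_args.setdefault("stop", []).append("\n\n")
    assert extra_model_args["stop"] == ["\n\n"]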