diff --git a/utilization/dataset/agieval_cot.py b/utilization/dataset/agieval_cot.py
index 50d31f71..3bdea308 100644
--- a/utilization/dataset/agieval_cot.py
+++ b/utilization/dataset/agieval_cot.py
@@ -65,7 +65,10 @@ class Agieval_cot(GenerationDataset):
     supported_cot = ["base"]
 
     def init_arguments(self):
-        self.extra_model_args = dict(stop=["\n"]) if self.cot is None else dict()
+        if self.cot is None:
+            # when using chain-of-thought, responses may span multiple lines
+            self.extra_model_args["stop"] = ["\n"]
+
         text = ""
         text += "gen" if self.subset_name in AGIEVAL_NO_LETTER_CHOICE_TASKS else "mcq"
         text += "_zh" if self.subset_name in AGIEVAL_ZH_PROMPT_TASKS else "_en"
diff --git a/utilization/dataset/bbh.py b/utilization/dataset/bbh.py
index 7ed7228a..df42bdd1 100644
--- a/utilization/dataset/bbh.py
+++ b/utilization/dataset/bbh.py
@@ -69,7 +69,9 @@ class Bbh(GenerationDataset):
 
     def init_arguments(self):
         self.bbh_instruction = BBH_PROMPTS[self.subset_name]
-        self.extra_model_args = dict(stop=["\n"]) if self.cot is None else dict()
+        if self.cot is None:
+            # when using chain-of-thought, responses may span multiple lines
+            self.extra_model_args["stop"] = ["\n"]
 
     def format_instance(self, instance):
         target = instance["answer"]
diff --git a/utilization/dataset/gaokao.py b/utilization/dataset/gaokao.py
index a1cc17a3..315fd09b 100644
--- a/utilization/dataset/gaokao.py
+++ b/utilization/dataset/gaokao.py
@@ -64,7 +64,8 @@ class Gaokao(GenerationDataset):
 
     def init_arguments(self):
         self.gaokao_instruction = GAOKAO_PROMPTS[self.subset_name]
-        self.extra_model_args = dict(temperature=0.3, max_tokens=4096)
+        self.extra_model_args["temperature"] = 0.3
+        self.extra_model_args["max_tokens"] = 4096
 
         # According to https://github.com/OpenLMLab/GAOKAO-Bench/blob/main/Models/openai_gpt4.py
         # We use temperature=0.3 and max_tokens=4096
diff --git a/utilization/dataset/gsm8k.py b/utilization/dataset/gsm8k.py
index c4267b4c..2856d652 100644
--- a/utilization/dataset/gsm8k.py
+++ b/utilization/dataset/gsm8k.py
@@ -28,8 +28,9 @@ class Gsm8k(GenerationDataset):
     _extract_numbers = re.compile(r"[-+]?\d*\.\d+|\d+")
 
     def init_arguments(self):
-        if self.model_type == 'base':
-            self.extra_model_args['stop'] = ['\n']
+        if self.cot is None:
+            # when using chain-of-thought, responses may span multiple lines
+            self.extra_model_args["stop"] = ["\n"]
 
     def load_raw_dataset(self, dataset_path, subset_name, evaluation_set, example_set):
         super().load_raw_dataset(dataset_path, subset_name, evaluation_set, example_set)
@@ -39,7 +40,7 @@ def load_raw_dataset(self, dataset_path, subset_name, evaluation_set, example_se
             self.example_data = LEAST_TO_MOST_EXAMPLARS
         elif self.cot == 'pal':
             self.example_data = PAL_EXAMPLARS
-            self.instruction = "Let's use python to solve math problems. Here are some examples how to do it."
+            self.instruction = "Let's use python to solve math problems. Here are some examples how to do it.\n\nQuestion: {{question.replace('\n', ' ')}}\nAnswer:"
 
     def post_processing(self, predictions):
         new_predictions = []
@@ -74,7 +75,7 @@ def post_processing(self, predictions):
 
     def format_instance(self, instance):
         # remove decimal seperators
-        instance["answer"] = ' ' + self._decimal_separator.sub(r"\1\2", instance["answer"])
+        instance["answer"] = ' ' + self._decimal_separator.sub("", instance["answer"])
 
         # few-shot examples might not contain "####"
         if "####" in instance["answer"]:
diff --git a/utilization/dataset/icl_strategies.py b/utilization/dataset/icl_strategies.py
index 03d06f46..f9e673fc 100644
--- a/utilization/dataset/icl_strategies.py
+++ b/utilization/dataset/icl_strategies.py
@@ -75,7 +75,7 @@ def global_entropy_ordering_strategy(indices, labels, example_dataset, call_mode
     return list(best_perm)
 
 
-def ape(example_dataset, eval_dataset, call_model, api_key):
+def ape(example_dataset, eval_dataset, call_model):
     """
     generate instructions using APE
 
@@ -87,6 +87,8 @@ def ape(example_dataset, eval_dataset, call_model, api_key):
         List[str]: results of likelihood evaluation
         List[float]: scores based on log probability
     """
+    import openai
+    api_key = openai.api_key
 
     class ModelArguments:
diff --git a/utilization/dataset/math.py b/utilization/dataset/math.py
index 6cd48d30..192cfe0c 100644
--- a/utilization/dataset/math.py
+++ b/utilization/dataset/math.py
@@ -38,7 +38,8 @@ class Math(GenerationDataset):
 
     def init_arguments(self):
         if self.model_type == 'base':
-            self.extra_model_args['stop'] = ['\n\n']
+            # when evaluating a base model, responses may span multiple lines
+            self.extra_model_args.setdefault("stop", []).append("\n\n")
 
     @staticmethod
     def normalize_final_answer(final_answer: str) -> str:
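
A note on the recurring stop-sequence hunks (agieval_cot.py, bbh.py, gsm8k.py): a stop sequence of "\n" ends generation at the first newline, which suits single-line answers but would truncate a chain-of-thought after its first reasoning step, hence the "if self.cot is None" guard. The sketch below illustrates the truncation semantics in plain Python; truncate_at_stop is a hypothetical helper for illustration, not a function in this codebase.

    def truncate_at_stop(completion: str, stop: list) -> str:
        """Keep only the text before the earliest stop sequence, if any."""
        cut = len(completion)
        for s in stop:
            idx = completion.find(s)
            if idx != -1:
                cut = min(cut, idx)
        return completion[:cut]

    cot = "Step 1: 2 + 2 = 4\nStep 2: 4 * 3 = 12\nThe answer is 12"
    # with stop=["\n"] a chain-of-thought is cut after one step
    assert truncate_at_stop(cot, ["\n"]) == "Step 1: 2 + 2 = 4"
    # with no stop sequence the full multi-line reasoning survives
    assert truncate_at_stop(cot, []) == cot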
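
The new PAL instruction in gsm8k.py embeds a {{question.replace('\n', ' ')}} placeholder. Assuming the framework renders instruction strings as Jinja2 templates (an assumption suggested by the double-brace syntax, not confirmed by this diff), the effect is to flatten a multi-line question onto a single prompt line:

    from jinja2 import Template  # assumption: instructions are rendered with Jinja2

    instruction = ("Let's use python to solve math problems. Here are some examples "
                   "how to do it.\n\nQuestion: {{question.replace('\n', ' ')}}\nAnswer:")
    prompt = Template(instruction).render(question="Janet has 3 apples.\nShe buys 2 more.")
    # the newline inside the question is replaced by a space in the prompt
    assert prompt.endswith("Question: Janet has 3 apples. She buys 2 more.\nAnswer:")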
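
In the gsm8k.py format_instance hunk, the replacement passed to _decimal_separator.sub changes from r"\1\2" to the empty string. That is only digit-preserving if the compiled pattern matches the comma alone, for instance with zero-width lookarounds; the pattern itself sits outside the hunk, so treat this as an assumption to verify. A comparison of the two styles:

    import re

    # group-based pattern: the replacement must re-emit the captured digits
    with_groups = re.compile(r"(\d),(\d)")
    assert with_groups.sub(r"\1\2", "1,234,567") == "1234567"

    # lookaround pattern: only the comma itself is consumed, so an empty
    # replacement suffices, which is what the new sub("") call relies on
    with_lookarounds = re.compile(r"(?<=\d),(?=\d)")
    assert with_lookarounds.sub("", "1,234,567") == "1234567"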
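
On the math.py hunk: setdefault is the right accessor for appending to a possibly missing "stop" list. dict.get with a default returns a detached object when the key is absent, so an append on it is silently lost, while dict.setdefault stores the default in the dict before returning it. In plain dict terms:

    extra_model_args = {}

    # dict.get hands back a throwaway default when the key is absent,
    # so this append mutates a list that is immediately discarded
    extra_model_args.get("stop", []).append("\n\n")
    assert "stop" not in extra_model_args

    # dict.setdefault stores the default first and returns the stored
    # list, so the appended stop sequence actually persists
    extra_model_args.setdefault("stop", []).append("\n\n")
    assert extra_model_args["stop"] == ["\n\n"]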