Top answer processor (#378)

NVIDIA · Feb 12, 2025 · e2cc47a · e2cc47a
1 parent 3dc7c08
commit e2cc47a
Show file tree

Hide file tree

Showing 7 changed files with 412 additions and 200 deletions.
diff --git a/docs/openmathinstruct2/dataset.md b/docs/openmathinstruct2/dataset.md
@@ -168,34 +168,42 @@ from nemo_skills.pipeline import wrap_arguments
 from nemo_skills.pipeline.cli import run_cmd
 
 # for MATH
-data_folder = "/workspace/new-problems-solution-augmentation/math"
+input_folder = "/workspace/new-problems-solution-augmentation/math"
+output_folder = "/workspace/new-problems-solution-augmentation/math-fill-majority"
 # if you want to avoid scheduling many jobs, you can instead
 # create one big cmd and run it directly to handle all files
 # or you can create a new script and reference it with
 # /nemo_run/code/<path to your script inside this repo>
 for i in range(80):
     cmd = (
-        f'python -m nemo_skills.evaluation.fill_majority_answer '
-        f'    ++input_files="{data_folder}/problem-set{i}/generation/output-rs*.jsonl" '
+        f'python -m nemo_skills.evaluation.aggregate_answers '
+        f'    ++input_dir="{input_folder}" '
+        f'    ++input_files="problem-set{i}/generation/output-rs*.jsonl" '
+        f'    ++output_dir="{output_folder}" '
+        f'    ++mode=fill '
     )
     run_cmd(
         cluster="slurm",
         ctx=wrap_arguments(cmd),
-        log_dir=f'{data_folder}/problem-set{i}/fill-majority-logs'
+        log_dir=f'{output_folder}/problem-set{i}/aggregate-answer-logs'
         # if cluster has a cpu partition you can specify it with a `partition` parameter
     )
 
 # for GSM8K
-data_folder = "/workspace/new-problems-solution-augmentation/gsm8k"
+input_folder = "/workspace/new-problems-solution-augmentation/gsm8k"
+output_folder = "/workspace/new-problems-solution-augmentation/gsm8k-fill-majority"
 for i in range(10):
     cmd = (
-        f'python -m nemo_skills.evaluation.fill_majority_answer '
-        f'    ++input_files="{data_folder}/problem-set{i}/generation/output-rs*.jsonl" '
+        f'python -m nemo_skills.evaluation.aggregate_answers '
+        f'    ++input_dir="{input_folder}" '
+        f'    ++input_files="problem-set{i}/generation/output-rs*.jsonl" '
+        f'    ++output_dir="{output_folder}" '
+        f'    ++mode=fill '
     )
     run_cmd(
         cluster="slurm",
         ctx=wrap_arguments(cmd),
-        log_dir=f'{data_folder}/problem-set{i}/fill-majority-logs'
+        log_dir=f'{output_folder}/problem-set{i}/aggregate-answer-logs'
         # if cluster has a cpu partition you can specify it with a `partition` parameter
     )
 ```

diff --git a/docs/openmathinstruct2/evaluation.md b/docs/openmathinstruct2/evaluation.md
@@ -137,20 +137,15 @@ to see symbolic scores right away. You can evaluate with the judge by first crea
 answers. E.g. for "math" benchmark run
 
 ```bash
-python -m nemo_skills.evaluation.fill_majority_answer \
-    ++input_files="./openmath2-llama3.1-8b-eval/eval-results/math/output-rs*.jsonl" \
-    ++fill_key=predicted_answer
+python -m nemo_skills.evaluation.aggregate_answers \
+    ++input_dir="./openmath2-llama3.1-8b-eval/eval-results/math" \
+    ++input_files="output-rs*.jsonl" \
+    ++mode=extract \
+    ++output_dir="./openmath2-llama3.1-8b-eval/eval-results-majority/math"
 ```
 
-This will replace `predicted_answer` in all files with majority answer.
+This will write the majority answers to the "./openmath2-llama3.1-8b-eval/eval-results-majority/math/output-agg.jsonl" file, and we can then run the llm-judge pipeline on that file.
 
-After that, let's copy just a single of those files into a new folder so that we can run the llm-judge pipeline
-on them.
-
-```bash
-mkdir -p ./openmath2-llama3.1-8b-eval/eval-results-majority/math
-cp ./openmath2-llama3.1-8b-eval/eval-results/math/output-rs0.jsonl ./openmath2-llama3.1-8b-eval/eval-results-majority/math/
-```
 
 Repeat the above steps for all benchmarks. Now we are ready to run the judge pipeline and summarize results
 after it is finished. You need to define `OPENAI_API_KEY` for the command below to work.