More docs + manifest.in fix (#145)

Signed-off-by: Igor Gitman <[email protected]>
NVIDIA · Oct 3, 2024 · 67f50c5 · 67f50c5
1 parent 266b0cb
commit 67f50c5
Show file tree

Hide file tree

Showing 9 changed files with 43 additions and 10 deletions.
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -1,2 +1,2 @@
-include nemo_skills/prompt/template/**/*.yaml
-include nemo_skills/prompt/config/**/*.yaml
+recursive-include nemo_skills/prompt/template *.yaml
+recursive-include nemo_skills/prompt/config *.yaml
diff --git a/dataset_explorer_demo/visualize_similar.py b/dataset_explorer_demo/visualize_similar.py
@@ -176,7 +176,15 @@ def load_test_sets(test_set):
 with gr.Blocks() as demo:
     gr.Markdown("# OpenMathInstruct-2 test set contamination explorer")
     gr.Markdown(
-        "See our full dataset at HuggingFace: [OpenMathInstruct-2](https://huggingface.co/datasets/nvidia/OpenMathInstruct-2)"
+        "During construction of OpenMathInstruct-2 we generated many synthetic problems. "
+        "We did a very thorough decontamination to remove exact duplicates (including rephrases) with popular benchmarks.<br>"
+        "Still our dataset contains many questions that are very similar to test sets. "
+        "To make things more transparent we created this demo, that you can use to explore "
+        "most similar questions from our data for each of the test set problems.<br>"
+        "We also provide closest examples from MATH training set, since it was used as seed data "
+        "to create our dataset and in most cases that training set already contains very similar questions to the test sets!<br>"
+        "See our full dataset at HuggingFace: [OpenMathInstruct-2](https://huggingface.co/datasets/nvidia/OpenMathInstruct-2)<br>"
+        "And read our [paper](https://arxiv.org/abs/2410.01560) to learn more about the decontamination process and how we retrieve similar questions."
     )
 
     warning_box = gr.Markdown(visible=False)

diff --git a/docs/reproducing-results.md b/docs/reproducing-results.md
@@ -179,4 +179,9 @@ Coming in a few days!
 
 ## Model training
 
-Coming in a few days!
+Coming in a few days!
+
+
+## Dataset contamination explorer
+
+To reproduce our dataset contamination explorer demo, refer to [dataset_explorer_demo/README.md](/dataset_explorer_demo/README.md).
diff --git a/nemo_skills/pipeline/check_contamination.py b/nemo_skills/pipeline/check_contamination.py
@@ -74,7 +74,11 @@ def check_contamination(
         "If not specified, will be inside `ssh_tunnel.job_dir` part of your cluster config.",
     ),
 ):
-    """Check contamination between train/test via an LLM call."""
+    """Check contamination between train/test via an LLM call.
+
+    Run `python -m nemo_skills.inference.check_contamination --help` for other supported arguments
+    (need to be prefixed with ++, since we use Hydra for that script).
+    """
     setup_logging(disable_hydra_logs=False)
     extra_arguments = f'{" ".join(ctx.args)}'
 

diff --git a/nemo_skills/pipeline/convert.py b/nemo_skills/pipeline/convert.py
@@ -147,7 +147,10 @@ def convert(
     config_dir: str = typer.Option(None, help="Can customize where we search for cluster configs"),
     log_dir: str = typer.Option(None, help="Can specify a custom location for slurm logs. "),
 ):
-    """Convert a checkpoint from one format to another."""
+    """Convert a checkpoint from one format to another.
+
+    All extra arguments are passed directly to the underlying conversion script (see their docs).
+    """
     setup_logging(disable_hydra_logs=False)
     extra_arguments = f'{" ".join(ctx.args)}'
     LOG.info("Starting conversion job")

diff --git a/nemo_skills/pipeline/eval.py b/nemo_skills/pipeline/eval.py
@@ -99,7 +99,8 @@ def eval(
 ):
     """Evaluate a model on specified benchmarks.
 
-    Any extra arguments will be directly passed to nemo_skills.inference.generate
+    Run `python -m nemo_skills.inference.generate --help` for other supported arguments
+    (need to be prefixed with ++, since we use Hydra for that script).
     """
     setup_logging(disable_hydra_logs=False)
     extra_arguments = f'{" ".join(ctx.args)}'

diff --git a/nemo_skills/pipeline/generate.py b/nemo_skills/pipeline/generate.py
@@ -91,7 +91,11 @@ def generate(
     config_dir: str = typer.Option(None, help="Can customize where we search for cluster configs"),
     log_dir: str = typer.Option(None, help="Can specify a custom location for slurm logs. "),
 ):
-    """Generate LLM completions for a given input file."""
+    """Generate LLM completions for a given input file.
+
+    Run `python -m nemo_skills.inference.generate --help` for other supported arguments
+    (need to be prefixed with ++, since we use Hydra for that script).
+    """
     setup_logging(disable_hydra_logs=False)
     extra_arguments = f'{" ".join(ctx.args)}'
 

diff --git a/nemo_skills/pipeline/llm_math_judge.py b/nemo_skills/pipeline/llm_math_judge.py
@@ -69,7 +69,11 @@ def llm_math_judge(
     config_dir: str = typer.Option(None, help="Can customize where we search for cluster configs"),
     log_dir: str = typer.Option(None, help="Can specify a custom location for slurm logs. "),
 ):
-    """Judge LLM math outputs using another LLM."""
+    """Judge LLM math outputs using another LLM.
+
+    Run `python -m nemo_skills.inference.llm_math_judge --help` for other supported arguments
+    (need to be prefixed with ++, since we use Hydra for that script).
+    """
     setup_logging(disable_hydra_logs=False)
     extra_arguments = f'{" ".join(ctx.args)}'
     LOG.info("Starting LLM math judge job")

diff --git a/nemo_skills/pipeline/train.py b/nemo_skills/pipeline/train.py
@@ -165,7 +165,11 @@ def train(
     config_dir: str = typer.Option(None, help="Can customize where we search for cluster configs"),
     log_dir: str = typer.Option(None, help="Can specify a custom location for slurm logs. "),
 ):
-    """Train (SFT or DPO) an LLM model."""
+    """Train (SFT or DPO) an LLM model.
+
+    All extra arguments are passed directly to the training script
+    (need to be prefixed with ++, since NeMo uses Hydra).
+    """
     setup_logging(disable_hydra_logs=False)
     extra_arguments = f'{" ".join(ctx.args)}'
     LOG.info("Starting training job")