Skip to content

Commit

Permalink
More docs + manifest.in fix (#145)
Browse files Browse the repository at this point in the history
Signed-off-by: Igor Gitman <[email protected]>
  • Loading branch information
Kipok authored Oct 3, 2024
1 parent 266b0cb commit 67f50c5
Show file tree
Hide file tree
Showing 9 changed files with 43 additions and 10 deletions.
4 changes: 2 additions & 2 deletions MANIFEST.in
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
include nemo_skills/prompt/template/**/*.yaml
include nemo_skills/prompt/config/**/*.yaml
recursive-include nemo_skills/prompt/template *.yaml
recursive-include nemo_skills/prompt/config *.yaml
10 changes: 9 additions & 1 deletion dataset_explorer_demo/visualize_similar.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,15 @@ def load_test_sets(test_set):
with gr.Blocks() as demo:
gr.Markdown("# OpenMathInstruct-2 test set contamination explorer")
gr.Markdown(
"See our full dataset at HuggingFace: [OpenMathInstruct-2](https://huggingface.co/datasets/nvidia/OpenMathInstruct-2)"
"During construction of OpenMathInstruct-2 we generated many synthetic problems. "
"We did a very thorough decontamination to remove exact duplicates (including rephrases) with popular benchmarks.<br>"
"Still our dataset contains many questions that are very similar to test sets. "
"To make things more transparent we created this demo, that you can use to explore "
"most similar questions from our data for each of the test set problems.<br>"
"We also provide closest examples from MATH training set, since it was used as seed data "
"to create our dataset and in most cases that training set already contains very similar questions to the test sets!<br>"
"See our full dataset at HuggingFace: [OpenMathInstruct-2](https://huggingface.co/datasets/nvidia/OpenMathInstruct-2)<br>"
"And read our [paper](https://arxiv.org/abs/2410.01560) to learn more about the decontamination process and how we retrieve similar questions."
)

warning_box = gr.Markdown(visible=False)
Expand Down
7 changes: 6 additions & 1 deletion docs/reproducing-results.md
Original file line number Diff line number Diff line change
Expand Up @@ -179,4 +179,9 @@ Coming in a few days!

## Model training

Coming in a few days!
Coming in a few days!


## Dataset contamination explorer

To reproduce our dataset contamination explorer demo, refer to [dataset_explorer_demo/README.md](/dataset_explorer_demo/README.md).
6 changes: 5 additions & 1 deletion nemo_skills/pipeline/check_contamination.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,11 @@ def check_contamination(
"If not specified, will be inside `ssh_tunnel.job_dir` part of your cluster config.",
),
):
"""Check contamination between train/test via an LLM call."""
"""Check contamination between train/test via an LLM call.
Run `python -m nemo_skills.inference.check_contamination --help` for other supported arguments
(need to be prefixed with ++, since we use Hydra for that script).
"""
setup_logging(disable_hydra_logs=False)
extra_arguments = f'{" ".join(ctx.args)}'

Expand Down
5 changes: 4 additions & 1 deletion nemo_skills/pipeline/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,10 @@ def convert(
config_dir: str = typer.Option(None, help="Can customize where we search for cluster configs"),
log_dir: str = typer.Option(None, help="Can specify a custom location for slurm logs. "),
):
"""Convert a checkpoint from one format to another."""
"""Convert a checkpoint from one format to another.
All extra arguments are passed directly to the underlying conversion script (see their docs).
"""
setup_logging(disable_hydra_logs=False)
extra_arguments = f'{" ".join(ctx.args)}'
LOG.info("Starting conversion job")
Expand Down
3 changes: 2 additions & 1 deletion nemo_skills/pipeline/eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,8 @@ def eval(
):
"""Evaluate a model on specified benchmarks.
Any extra arguments will be directly passed to nemo_skills.inference.generate
Run `python -m nemo_skills.inference.generate --help` for other supported arguments
(need to be prefixed with ++, since we use Hydra for that script).
"""
setup_logging(disable_hydra_logs=False)
extra_arguments = f'{" ".join(ctx.args)}'
Expand Down
6 changes: 5 additions & 1 deletion nemo_skills/pipeline/generate.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,11 @@ def generate(
config_dir: str = typer.Option(None, help="Can customize where we search for cluster configs"),
log_dir: str = typer.Option(None, help="Can specify a custom location for slurm logs. "),
):
"""Generate LLM completions for a given input file."""
"""Generate LLM completions for a given input file.
Run `python -m nemo_skills.inference.generate --help` for other supported arguments
(need to be prefixed with ++, since we use Hydra for that script).
"""
setup_logging(disable_hydra_logs=False)
extra_arguments = f'{" ".join(ctx.args)}'

Expand Down
6 changes: 5 additions & 1 deletion nemo_skills/pipeline/llm_math_judge.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,11 @@ def llm_math_judge(
config_dir: str = typer.Option(None, help="Can customize where we search for cluster configs"),
log_dir: str = typer.Option(None, help="Can specify a custom location for slurm logs. "),
):
"""Judge LLM math outputs using another LLM."""
"""Judge LLM math outputs using another LLM.
Run `python -m nemo_skills.inference.llm_math_judge --help` for other supported arguments
(need to be prefixed with ++, since we use Hydra for that script).
"""
setup_logging(disable_hydra_logs=False)
extra_arguments = f'{" ".join(ctx.args)}'
LOG.info("Starting LLM math judge job")
Expand Down
6 changes: 5 additions & 1 deletion nemo_skills/pipeline/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,11 @@ def train(
config_dir: str = typer.Option(None, help="Can customize where we search for cluster configs"),
log_dir: str = typer.Option(None, help="Can specify a custom location for slurm logs. "),
):
"""Train (SFT or DPO) an LLM model."""
"""Train (SFT or DPO) an LLM model.
All extra arguments are passed directly to the training script
(need to be prefixed with ++, since NeMo uses Hydra).
"""
setup_logging(disable_hydra_logs=False)
extra_arguments = f'{" ".join(ctx.args)}'
LOG.info("Starting training job")
Expand Down

0 comments on commit 67f50c5

Please sign in to comment.