GPU tests in CI (#70)
Signed-off-by: Igor Gitman <[email protected]>
Kipok authored Jul 25, 2024
1 parent ae0a592 commit 396b838
Showing 10 changed files with 144 additions and 77 deletions.
39 changes: 37 additions & 2 deletions .github/workflows/tests.yml
@@ -3,13 +3,18 @@ name: Unit tests
on:
pull_request:
branches: [ "main" ]
types: [ "labeled" ]

# Allows you to run this workflow manually from the Actions tab
workflow_dispatch:

permissions:
contents: read

concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true

jobs:
unit-tests:
runs-on: ubuntu-latest
@@ -34,7 +39,7 @@ jobs:
export NEMO_SKILLS_SANDBOX_HOST="$(docker inspect -f '{{range.NetworkSettings.Networks}}{{.IPAddress}}{{end}}' `docker ps -a | grep local-sandbox | awk '{print $1}'`)"
echo $NEMO_SKILLS_SANDBOX_HOST
set -o pipefail # this will make sure next line returns non-0 exit code if tests fail
python -m pytest tests/ --junitxml=pytest.xml --cov-report=term-missing:skip-covered --cov=nemo_skills --cov=pipeline --durations=30 -rs -vvv | tee pytest-coverage.txt
python -m pytest tests/ -m "not gpu" --junitxml=pytest.xml --cov-report=term-missing:skip-covered --cov=nemo_skills --cov=pipeline --durations=30 -rs -vvv | tee pytest-coverage.txt
- name: Check help message
run: |
NEMO_SKILLS_CONFIG=cluster_configs/slurm.yaml python tests/check_help.py --all
@@ -60,4 +65,34 @@ jobs:
NEMO_SKILLS_CONFIG=cluster_configs/slurm.yaml python tests/check_help.py
- name: Check datasets preparation
run: |
NEMO_SKILLS_CONFIG=cluster_configs/slurm.yaml pytest tests/test_datasets.py
NEMO_SKILLS_CONFIG=cluster_configs/slurm.yaml pytest tests/test_datasets.py
gpu-tests:
runs-on: self-hosted-nemo-skills
if: ${{ github.event.label.name == 'run GPU tests' }}
steps:
- uses: actions/checkout@v3
with:
path: ${{ github.run_id }}
- name: Set up Python 3.10
uses: actions/setup-python@v3
with:
python-version: "3.10"
- name: Install dependencies
run: |
cd ${{ github.run_id }}
python -m pip install --upgrade pip
pip install .
pip install -r datasets/requirements.txt
pip install pytest pytest-cov pytest-timeout
python datasets/prepare.py
- name: Run GPU tests
timeout-minutes: 120
env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
run: |
cd ${{ github.run_id }}
nvidia-smi
export NEMO_SKILLS_CONFIG=cluster_configs/local.yaml
set -o pipefail # this will make sure next line returns non-0 exit code if tests fail
./tests/gpu-tests/run.sh /mnt/datadrive/nemo-skills-data/Meta-Llama-3-8B /mnt/datadrive/nemo-skills-data/Meta-Llama-3-8B-Instruct
2 changes: 1 addition & 1 deletion cluster_configs/local.yaml
@@ -16,7 +16,7 @@ cluster: local

containers:
tensorrt_llm: igitman/nemo-skills-trtllm:0.3.0
vllm: igitman/nemo-skills-vllm:0.3.0
vllm: igitman/nemo-skills-vllm:0.3.1
nemo: igitman/nemo-skills-sft:0.3.0
# sandbox is always re-built locally

18 changes: 3 additions & 15 deletions dockerfiles/Dockerfile.vllm
@@ -12,36 +12,24 @@
# See the License for the specific language governing permissions and
# limitations under the License.

ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:23.10-py3
ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:24.02-py3

# build an image that includes only the nemo dependencies, ensures that dependencies
# are included first for optimal caching, and useful for building a development
# image (by specifying build target as `nemo-deps`)
FROM ${BASE_IMAGE} as vllm-deps

# Ensure apt-get won't prompt for selecting options
ENV DEBIAN_FRONTEND=noninteractive
# libavdevice-dev required for latest torchaudio
RUN apt-get update && \
apt-get upgrade -y && \
rm -rf /var/lib/apt/lists/*

# install vllm dependencies
WORKDIR /workspace/

# Hack to force vllm to build even though logic for the check is flawed in their setup.py
ENV TORCH_CUDA_ARCH_LIST="8.0 8.6 9.0+PTX"

# v0.4.2
ARG VLLM_HASH=c7f2cf2b7f67bce5842fedfdba508440fe257375
ARG VLLM_HASH=5448f67635570cee6fc23c7cd166e9d8f7595009

RUN git clone https://github.com/vllm-project/vllm.git && \
cd vllm && \
git checkout ${VLLM_HASH} && \
pip3 install -e . && \
pip install -e . && \
cd ..


# Install requirements
RUN pip install openai>=1.3 ray pydantic flufl.lock fire text_dedup wandb hydra-core geventhttpclient && \
pip install rouge_score outlines sentencepiece transformers>=4.40 sentence-transformers>=2.2.2
1 change: 0 additions & 1 deletion nemo_skills/inference/server/serve_vllm.sh
@@ -94,7 +94,6 @@ python -m vllm.entrypoints.openai.api_server \
--port=${SERVER_PORT} \
--served-model-name "${MODEL_NAME}" \
--tensor-parallel-size=${NUM_GPUS} \
--max-num-seqs=1024 \
--enforce-eager \
--disable-log-requests \
$VLLM_GPU_MEMORY $QUANTIZATION $MAX_SEQUENCE_LENGTH
3 changes: 1 addition & 2 deletions pipeline/summarize_results.py
@@ -93,8 +93,7 @@
evaluator=evaluator,
aggregation_mode="best",
)

except subprocess.CalledProcessError as e:
except:
print(f"Error running compute_metrics.py for {benchmark}: {e}")

lines_to_write = []
7 changes: 6 additions & 1 deletion pyproject.toml
@@ -20,4 +20,9 @@ py_version = 310 # python 3.10 as a target version

[tool.black]
line_length = 119
skip_string_normalization = true
skip_string_normalization = true

[tool.pytest.ini_options]
markers = [
"gpu: tests that require a GPU to run",
]
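For reference, the `gpu` marker registered above is what the CPU-only workflow now deselects with `pytest tests/ -m "not gpu"`, while the self-hosted job runs the marked tests directly. A minimal sketch of the pattern (the test name below is illustrative; the environment-variable skip mirrors the existing conversion tests):

import os

import pytest


@pytest.mark.gpu  # deselected in the CPU-only job via `pytest tests/ -m "not gpu"`
def test_generation_on_gpu():
    # GPU tests skip themselves gracefully when the required model path is not provided
    model_path = os.getenv('NEMO_SKILLS_TEST_HF_MODEL')
    if not model_path:
        pytest.skip("Define NEMO_SKILLS_TEST_HF_MODEL to run this test")
    # ... GPU-dependent work would go here ...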
23 changes: 14 additions & 9 deletions tests/gpu-tests/run.sh
@@ -1,6 +1,6 @@
# will run all tests starting from only a HF checkpoint. Assumes 2 gpus on the machine
# will run all tests starting from only a HF checkpoint. Only requires 1 GPU.
# also need to define HF_TOKEN for some of the tests
# example: HF_TOKEN=<> ./tests/gpu-tests/run.sh /mnt/datadrive/models/Meta-Llama-3-8B
# example: HF_TOKEN=<> ./tests/gpu-tests/run.sh /mnt/datadrive/models/Meta-Llama-3-8B /mnt/datadrive/models/Meta-Llama-3-8B-Instruct
set -e

if [ $# -eq 0 ] ; then
@@ -13,18 +13,23 @@ export NEMO_SKILLS_TEST_OUTPUT=/tmp/nemo_skills_test_output
mkdir -p $NEMO_SKILLS_TEST_OUTPUT

# first running the conversion tests
pytest tests/gpu-tests/test_conversion.py -k test_hf_trtllm_conversion -s -x
pytest tests/gpu-tests/test_conversion.py -k test_hf_trtllm_conversion -s
export NEMO_SKILLS_TEST_TRTLLM_MODEL=$NEMO_SKILLS_TEST_OUTPUT/trtllm-model
pytest tests/gpu-tests/test_conversion.py -k test_hf_nemo_conversion -s -x
pytest tests/gpu-tests/test_conversion.py -k test_hf_nemo_conversion -s
export NEMO_SKILLS_TEST_NEMO_MODEL=$NEMO_SKILLS_TEST_OUTPUT/model.nemo
pytest tests/gpu-tests/test_conversion.py -k test_nemo_hf_conversion -s -x
pytest tests/gpu-tests/test_conversion.py -k test_nemo_hf_conversion -s
# using the back-converted model to check that it's reasonable
export NEMO_SKILLS_TEST_HF_MODEL=$NEMO_SKILLS_TEST_OUTPUT/hf-model

export LLAMA3_8B_BASE_TRTLLM=$NEMO_SKILLS_TEST_TRTLLM_MODEL
export LLAMA3_8B_BASE_NEMO=$NEMO_SKILLS_TEST_NEMO_MODEL
export LLAMA3_8B_BASE_HF=$NEMO_SKILLS_TEST_HF_MODEL
export LLAMA3_8B_INSTRUCT_HF=$2

# then running the rest of the tests
pytest tests/gpu-tests/test_generation.py -s -x
pytest tests/gpu-tests/test_generation.py -s

# # for sft we are using the tiny random llama model to run much faster
# for sft we are using the tiny random llama model to run much faster
python pipeline/launcher.py \
--cmd "HF_TOKEN=$HF_TOKEN python /code/tests/gpu-tests/make_tiny_llama.py" \
--tasks_per_node 1 \
@@ -34,10 +39,10 @@ python pipeline/launcher.py \

# converting the model through test
export NEMO_SKILLS_TEST_HF_MODEL=$NEMO_SKILLS_TEST_OUTPUT/tiny-llama
pytest tests/gpu-tests/test_conversion.py -k test_hf_nemo_conversion -s -x
pytest tests/gpu-tests/test_conversion.py -k test_hf_nemo_conversion -s
# untarring model which is required for checkpoint averaging
mkdir -p $NEMO_SKILLS_TEST_OUTPUT/untarred_nemo
tar xvf $NEMO_SKILLS_TEST_OUTPUT/model.nemo -C $NEMO_SKILLS_TEST_OUTPUT/untarred_nemo
export NEMO_SKILLS_TEST_NEMO_MODEL=$NEMO_SKILLS_TEST_OUTPUT/untarred_nemo
# running finetuning
pytest tests/gpu-tests/test_finetuning.py -s -x
pytest tests/gpu-tests/test_finetuning.py -s
9 changes: 7 additions & 2 deletions tests/gpu-tests/test_conversion.py
@@ -25,6 +25,7 @@
from launcher import CLUSTER_CONFIG, NEMO_SKILLS_CODE, launch_job


@pytest.mark.gpu
def test_hf_trtllm_conversion():
model_path = os.getenv('NEMO_SKILLS_TEST_HF_MODEL')
if not model_path:
@@ -36,7 +37,7 @@ def test_hf_trtllm_conversion():
--model_dir /model \
--output_dir /tmp/trtllm \
--dtype float16 \
--tp_size 2 \
--tp_size 1 \
&& trtllm-build \
--checkpoint_dir /tmp/trtllm \
--output_dir /output/trtllm-model \
@@ -62,6 +63,7 @@
)


@pytest.mark.gpu
def test_hf_nemo_conversion():
model_path = os.getenv('NEMO_SKILLS_TEST_HF_MODEL')
if not model_path:
@@ -87,13 +89,16 @@
)


@pytest.mark.gpu
def test_nemo_hf_conversion():
model_path = os.getenv('NEMO_SKILLS_TEST_NEMO_MODEL')
if not model_path:
pytest.skip("Define NEMO_SKILLS_TEST_NEMO_MODEL to run this test")
output_path = os.getenv('NEMO_SKILLS_TEST_OUTPUT', '/tmp')

cmd = f"""cd /code && \
# there is a bug in transformers related to slurm, so unsetting the vars
# TODO: remove this once the bug is fixed
cmd = f"""cd /code && unset SLURM_PROCID && unset SLURM_LOCALID && \
HF_TOKEN={os.environ['HF_TOKEN']} python nemo_skills/conversion/nemo_to_hf.py \
--in-path /model \
--out-path /output/hf-model \
5 changes: 3 additions & 2 deletions tests/gpu-tests/test_finetuning.py
@@ -28,6 +28,7 @@
from nemo_skills.evaluation.metrics import MathEval, compute_metrics


@pytest.mark.gpu
def test_sft_pipeline():
model_path = os.getenv('NEMO_SKILLS_TEST_NEMO_MODEL')
if not model_path:
@@ -42,15 +43,15 @@ def test_sft_pipeline():
--expname test \
--nemo_model {model_path} \
--num_nodes 1 \
--num_gpus 2 \
--num_gpus 1 \
--disable_wandb \
--extra_eval_args "+prompt=openmathinstruct/sft ++max_samples=4 --benchmarks gsm8k:1 math:0 --num_jobs 1 --num_gpus 1" \
++model.data.train_ds.file_path=/data/gsm8k/validation-sft.jsonl \
++trainer.sft.max_steps=15 \
++trainer.sft.val_check_interval=10 \
++trainer.sft.limit_val_batches=2 \
++model.data.train_ds.global_batch_size=4 \
++model.tensor_model_parallel_size=2 \
++model.tensor_model_parallel_size=1 \
++model.pipeline_model_parallel_size=1 \
++model.optim.lr=1e-6 \
"""