Allow arbitrary sets of files to be used in python and cuda runner code #132

Merged · 4 commits · Jan 15, 2025
13 changes: 7 additions & 6 deletions .github/workflows/runner.py
@alexzhang13 (Collaborator) commented on Jan 15, 2025:

I just had to update the runner to handle the new type of file payload.
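For readers skimming the diff: the new payload is simply a dict mapping destination file names to file contents. A minimal sketch of the new calling convention, mirroring the hunks below (the `cu_eval`/`py_eval` constants and the `config` dict come from `runner.py`; this is illustrative, not additional code in the PR):

```python
# Sketch of the new file-payload convention (mirrors the diff below).
# Each runner now receives dicts that map destination file names to file contents.

# CUDA: one dict of sources to compile, one dict of headers written alongside them.
comp, run = run_cuda_script(
    {"eval.cu": cu_eval},
    {key: config[key] for key in ["reference.cuh", "submission.cuh"] if key in config},
    arch=None,
)

# Python: a single dict of files, plus `main` naming which of them to execute.
run = run_pytorch_script(
    {
        "eval.py": py_eval,
        **{key: config[key] for key in ["reference.py", "submission.py"] if key in config},
    },
    main="eval.py",
    arch=None,
)
```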

@@ -13,17 +13,18 @@

if config["lang"] == "cu":
comp, run = run_cuda_script(
config.get("eval.cu", cu_eval),
config.get("reference.cuh", None),
config.get("submission.cuh", None),
{"eval.cu": cu_eval},
{key: config[key] for key in ["reference.cuh", "submission.cuh"] if key in config},
arch=None,
)
result = {"compile": asdict(comp), "run": asdict(run)}
else:
run = run_pytorch_script(
config.get("eval.py", py_eval),
config.get("reference.py", None),
config.get("submission.py", None),
{
"eval.py": py_eval,
**{key: config[key] for key in ["reference.py", "submission.py"] if key in config},
},
main="eval.py",
arch=None,
)
result = {"run": asdict(run)}
4 changes: 2 additions & 2 deletions docs/docs/creating-a-leaderboard/cuda-creations.md
@@ -56,7 +56,7 @@ Let's break down what's going on in this relatively short file:
#include <iostream>

#include "reference.cuh"
#include "train.cuh"
#include "submission.cuh"

#define WARMUP_RUNS 10
#define TIMED_RUNS 100
@@ -118,7 +118,7 @@ int main() {
return 0;
}
```
You'll notice that we include from headers named `reference.cuh` and `train.cuh`. These are the reference
You'll notice that we include from headers named `reference.cuh` and `submission.cuh`. These are the reference
code and submission code respectively, just renamed to a fix module so we can include them. The
general idea is that the evaluation code can treat the leaderboard as a basic abstraction, and only
concern itself with three things:
2 changes: 1 addition & 1 deletion docs/docs/creating-a-leaderboard/python-creations.md
@@ -53,7 +53,7 @@ Let's break down what's going on in this relatively short file:
import torch
import time
from reference import ref_kernel, generate_input, check_implementation
from train import custom_kernel
from submission import custom_kernel


def correctness() -> bool:
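As an aside, the `submission` module imported above only needs to expose `custom_kernel`; a minimal sketch in the spirit of the identity example (the exact signature is an assumption inferred from the tests in this PR):

```python
# submission.py — hypothetical minimal submission; the leaderboard's eval.py only
# needs to be able to `from submission import custom_kernel`.
def custom_kernel(data):
    # Identity "kernel": return the input unchanged so the reference check passes.
    return data
```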
18 changes: 13 additions & 5 deletions scripts/ci_test_cuda.py
@@ -20,13 +20,15 @@ def test_does_not_compile():
output_t custom_kernel(input_tt data) { }
"""

comp, run = run_cuda_script(cu_eval, ref.read_text(), sub, arch=None)
comp, run = run_cuda_script(
{"eval.cu": cu_eval}, {"reference.cuh": ref.read_text(), "submission.cuh": sub}, arch=None
)
assert comp.success is False
assert run.success is False
assert comp.nvcc_found is True
assert comp.exit_code != ExitCode.SUCCESS
assert comp.stdout == ""
assert 'train.cuh(2): error: identifier "input_tt" is undefined' in comp.stderr
assert 'submission.cuh(2): error: identifier "input_tt" is undefined' in comp.stderr
assert '1 error detected in the compilation of "eval.cu".' in comp.stderr
assert comp.command.startswith("/usr/local/cuda/bin/nvcc")
assert "nvcc: NVIDIA (R) Cuda compiler driver" in comp.nvcc_version
@@ -52,7 +54,9 @@ def test_cuda_runtime_error():
}

"""
comp, run = run_cuda_script(cu_eval, ref.read_text(), sub, arch=None)
comp, run = run_cuda_script(
{"eval.cu": cu_eval}, {"reference.cuh": ref.read_text(), "submission.cuh": sub}, arch=None
)
assert comp.success is True
assert run.success is False
assert run.command == "./eval.out"
@@ -80,7 +84,9 @@ def test_cuda_validation_fail():
}

"""
comp, run = run_cuda_script(cu_eval, ref.read_text(), sub, arch=None)
comp, run = run_cuda_script(
{"eval.cu": cu_eval}, {"reference.cuh": ref.read_text(), "submission.cuh": sub}, arch=None
)
assert comp.success is True
assert run.success is True
assert run.passed is False
@@ -95,7 +101,9 @@ def test_cuda_validation_fail():
def test_cuda_correct():
sub = Path("examples/identity_cuda/submission.cuh").read_text()

comp, run = run_cuda_script(cu_eval, ref.read_text(), sub, arch=None)
comp, run = run_cuda_script(
{"eval.cu": cu_eval}, {"reference.cuh": ref.read_text(), "submission.cuh": sub}, arch=None
)
assert comp.success is True
assert run.success is True
assert "warming up..." in run.stdout
15 changes: 12 additions & 3 deletions scripts/ci_test_python.py
@@ -20,7 +20,9 @@ def test_does_not_import():
this is a syntax error
"""

run = run_pytorch_script(py_eval, ref.read_text(), sub, arch=None)
run = run_pytorch_script(
{"eval.py": py_eval, "reference.py": ref.read_text(), "submission.py": sub}, "eval.py"
)
assert run.success is False
assert run.exit_code != ExitCode.SUCCESS
assert "IndentationError: unexpected indent\n" in run.stderr
@@ -33,7 +35,12 @@ def test_error():
def custom_kernel(input):
return [torch.zeros_like(i) for i in input]
"""
run = run_pytorch_script(py_eval, ref.read_text(), sub, arch=None)

run = run_pytorch_script(
{"eval.py": py_eval, "reference.py": ref.read_text(), "submission.py": sub},
"eval.py",
arch=None,
)
assert run.success is True
assert run.passed is False
assert run.command == "python eval.py"
@@ -47,7 +54,9 @@ def custom_kernel(input):
def test_correct():
sub = Path("examples/identity_py/submission.py").read_text()

run = run_pytorch_script(py_eval, ref.read_text(), sub, arch=None)
run = run_pytorch_script(
{"eval.py": py_eval, "reference.py": ref.read_text(), "submission.py": sub}, "eval.py"
)
assert run.success is True
assert "warming up..." in run.stdout
assert run.exit_code == ExitCode.SUCCESS
6 changes: 5 additions & 1 deletion scripts/local-test.py
@@ -9,7 +9,11 @@
ref = Path("examples/identity_cuda/reference.cuh")
sub = Path("examples/identity_cuda/submission.cuh")

cout, score = run_cuda_script(cu_eval, ref.read_text(), sub.read_text(), arch=None)
cout, score = run_cuda_script(
{"eval.cu": cu_eval},
{"reference.cuh": ref.read_text(), "submission.cuh": sub.read_text()},
arch=None,
)
print(cout)
print(score)
exit(0 if score > 0 else 1)
2 changes: 1 addition & 1 deletion src/discord-cluster-manager/cogs/modal_cog.py
@@ -59,7 +59,7 @@ async def run_modal(
"**Running on Modal...**\n> ⏳ Waiting for available GPU..."
)

filename = "train.py" if script.filename.endswith(".py") else "train.cu"
filename = "submission.py" if script.filename.endswith(".py") else "train.cu"
reference_content = None
if reference_script is not None or reference_code is not None:
reference_content = (
2 changes: 1 addition & 1 deletion src/discord-cluster-manager/consts.py
@@ -67,7 +67,7 @@ def combine_enums(enums: list[Type[Enum]], combined_name: str) -> Enum:
MODAL_PATH = "/tmp/dcs/"
MODAL_EVAL_CODE_PATH = "/tmp/dcs/eval.py"
MODAL_REFERENCE_CODE_PATH = "/tmp/dcs/reference.py"
MODAL_SUBMISSION_CODE_PATH = "/tmp/dcs/train.py"
MODAL_SUBMISSION_CODE_PATH = "/tmp/dcs/submission.py"


# Compilation flags for Modal
2 changes: 1 addition & 1 deletion src/discord-cluster-manager/eval.cu
@@ -7,7 +7,7 @@
#include <memory>

#include "reference.cuh"
#include "train.cuh"
#include "submission.cuh"

#define WARMUP_RUNS 10
#define TIMED_RUNS 100
2 changes: 1 addition & 1 deletion src/discord-cluster-manager/eval.py
@@ -5,7 +5,7 @@

import torch
from reference import check_implementation, generate_input, ref_kernel
from train import custom_kernel
from submission import custom_kernel


class PopcornLogger:
22 changes: 14 additions & 8 deletions src/discord-cluster-manager/modal_runner.py
@@ -20,7 +20,12 @@

# Move this to another file later:
python_image = Image.debian_slim(python_version="3.10").pip_install(
["torch", "triton", "jax[cuda12]", "jax2torch"]
[
"torch",
"triton",
"jax[cuda12]",
"jax2torch",
]
)

cuda_image = (
@@ -80,10 +85,12 @@ def modal_run_pytorch_script(  # noqa: C901
try:
with timeout(timeout_seconds):
run_result = run_pytorch_script(
script_content=script_content,
reference_content=reference_content,
submission_content=submission_content,
arch=arch,
{
"eval.py": script_content,
"reference.py": reference_content,
"submission.py": submission_content,
},
"eval.py",
)
return FullResult(success=True, error="", compile=None, run=run_result)
# TODO fixup error handling!
@@ -106,9 +113,8 @@ def modal_run_cuda_script(  # # noqa: C901
try:
with timeout(timeout_seconds):
comp, run = run_cuda_script(
script_content,
reference_content=reference_content,
submission_content=submission_content,
{"eval.cu": script_content},
{"reference.cuh": reference_content, "submission.cuh": submission_content},
arch=arch,
include_dirs=MODAL_CUDA_INCLUDE_DIRS,
)
77 changes: 39 additions & 38 deletions src/discord-cluster-manager/run_eval.py
@@ -3,7 +3,7 @@
import shlex
import subprocess
import time
from typing import Optional
from pathlib import Path

from consts import CUDA_FLAGS, ExitCode

@@ -169,19 +169,19 @@ def run_program(args: list[str]) -> RunResult:


def run_cuda_script( # # noqa: C901
script_content: str,
reference_content: str = None,
submission_content: str = None,
sources: dict[str, str],
headers: dict[str, str] = None,
arch: int = None,
include_dirs: list[str] = None,
) -> tuple[CompileResult, RunResult]:
"""
Executes the provided CUDA kernel in an isolated environment

Args:
script_content: The CUDA script containing the GPU kernel
reference_content: The (optional) reference code, used for leaderboards.
submission_content: The (optional) submission code, used for leaderboards.
sources: The source files to compile. Mapping file name to content.
headers: Additional header files to create for the compile run.
Mapping of file name to file contents. These files will _not_ be added to the
compile command.
arch: The arch code for the compute/sm versions. If None, native arch is used.
include_dirs: Additional include directories, e.g., for thunderkittens/cutlass etc

@@ -193,19 +193,14 @@ def run_cuda_script(  # # noqa: C901

try:
# Write submission files to directory
if reference_content is not None:
with open("reference.cuh", "w") as f:
f.write(reference_content)
for source, content in sources.items():
Path(source).write_text(content)

if submission_content is not None:
with open("train.cuh", "w") as f:
f.write(submission_content)

with open("eval.cu", "w") as f:
f.write(script_content)
for header, content in headers.items():
Path(header).write_text(content)

compile_result = compile_cuda_script(
files=["eval.cu"],
files=list(sources.keys()),
arch=arch,
include_dirs=include_dirs,
verbose=True,
@@ -226,48 +221,54 @@
run_result = run_program(["./eval.out"])
return compile_result, run_result

# cleaning up all source files _before_ we let the user code run, just in
# case there's something in there that the user isn't supposed to snoop
finally:
tmp_files = ["reference.cuh", "train.cuh", "eval.cu", "eval.out"]
tmp_files = list(sources.keys()) + list(headers.keys())
for f in tmp_files:
if os.path.exists(f):
os.remove(f)

if not compile_result.success:
return compile_result, RunResult(
success=False,
command="",
stdout="",
stderr="",
exit_code=-1,
duration=0.0,
result={},
)

run_result = run_program(["./eval.out"])
return compile_result, run_result


def run_pytorch_script( # noqa: C901
script_content: str,
reference_content: Optional[str] = None,
submission_content: Optional[str] = None,
sources: dict[str, str],
main: str,
arch: int = None,
) -> RunResult:
"""
Executes the provided PyTorch GPU kernel in an isolated environment

Args:
script_content: The PyTorch script containing the GPU kernel to benchmark
reference_content: The (optional) reference code, used for leaderboards.
submission_content: The (optional) submission code, used for leaderboards.
sources: Files to generate
main: Which file to run. Must be one of the keys in sources.
arch: The arch code for the compute/sm versions.

Returns:
RunResult
"""
try:
# Write submission files to directory
if reference_content is not None:
with open("reference.py", "w") as f:
f.write(reference_content)

if submission_content is not None:
with open("train.py", "w") as f:
f.write(submission_content)

with open("eval.py", "w") as f:
f.write(script_content)
assert main in sources.keys()

return run_program(["python", "eval.py"])
# Write submission files to directory
for source, content in sources.items():
Path(source).write_text(content)
return run_program(["python", main])

finally:
tmp_files = ["eval.py", "reference.py", "train.py"]
for f in tmp_files:
for f in sources.keys():
if os.path.exists(f):
os.remove(f)
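Taken together, a local end-to-end run with the new signatures looks roughly like `scripts/local-test.py` above. A sketch under stated assumptions (the import path, the `examples/identity_py/reference.py` location, and using the repo's `eval.cu`/`eval.py` as the eval sources are assumptions; the other example paths appear in this PR):

```python
from pathlib import Path

from run_eval import run_cuda_script, run_pytorch_script  # assumed import path

# CUDA: `sources` are handed to nvcc; `headers` are only written to disk so that
# `#include "reference.cuh"` / `#include "submission.cuh"` resolve at compile time.
cu_eval = Path("src/discord-cluster-manager/eval.cu").read_text()
comp, run = run_cuda_script(
    {"eval.cu": cu_eval},
    {
        "reference.cuh": Path("examples/identity_cuda/reference.cuh").read_text(),
        "submission.cuh": Path("examples/identity_cuda/submission.cuh").read_text(),
    },
    arch=None,
)
print(comp.success, run.passed)

# Python: every file goes into `sources`, and `main` must be one of its keys.
py_eval = Path("src/discord-cluster-manager/eval.py").read_text()
result = run_pytorch_script(
    {
        "eval.py": py_eval,
        "reference.py": Path("examples/identity_py/reference.py").read_text(),  # assumed path
        "submission.py": Path("examples/identity_py/submission.py").read_text(),
    },
    main="eval.py",
    arch=None,
)
print(result.success)
```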