extended capabilities for customization of compile_cuda_script
ngc92 committed Jan 20, 2025
1 parent 63453bb commit aa39ff1
Showing 5 changed files with 185 additions and 18 deletions.
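At a glance, the heart of the commit is that compile_cuda_script in run_eval.py now accepts preprocessor defines and extra compiler flags alongside include_dirs. A minimal sketch of how a caller might exercise the extended signature (the argument values are illustrative, not taken from this commit):

from run_eval import compile_cuda_script  # assuming run_eval.py is importable as run_eval

# Hypothetical invocation showing the new customization points.
result = compile_cuda_script(
    files=["eval.cu"],
    arch=90,                                       # -> -gencode=arch=compute_90,code=sm_90
    include_dirs=["/ThunderKittens/include"],      # -> -I/ThunderKittens/include
    defines={"TIMED_RUNS": "50", "NDEBUG": None},  # -> -DTIMED_RUNS=50 -DNDEBUG
    flags=["-lineinfo"],                           # appended after the default CUDA_FLAGS
    verbose=True,
)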
149 changes: 149 additions & 0 deletions examples/identity_cuda/eval.cu
@@ -0,0 +1,149 @@
#include <chrono>
#include <iostream>
#include <cstdint>
#include <vector>
#include <numeric>
#include <algorithm>
#include <memory>
#include <random>   // std::mt19937
#include <cmath>    // std::pow, std::sqrt
#include <cstdio>   // std::FILE, std::fclose, ::fdopen
#include <cstdlib>  // std::getenv, std::exit
#include <string>   // std::to_string, std::stoi

#include "reference.cuh"
#include "submission.cuh"

#define WARMUP_RUNS 10
#define TIMED_RUNS 100

struct Closer {
    void operator()(std::FILE* file) {
        std::fclose(file);
    }
};

struct PopcornOutput {
    template<class... Args>
    void printf(Args&&... args) {
        ::fprintf(File.get(), std::forward<Args>(args)...);
    }

    void log(const char* key, const char* value) {
        printf("%s: %s\n", key, value);
    }

    template<class T>
    void log(const char* key, T&& value) {
        log(key, std::to_string(value).c_str());
    }

    std::unique_ptr<std::FILE, Closer> File;
};

// checks that a CUDA API call returned successfully, otherwise prints an error message and exits.
static void cuda_check(cudaError_t status, const char* expr, const char* file, int line, const char* function)
{
    if (status != cudaSuccess) {
        std::cerr << "CUDA error (" << (int)status << ") while evaluating expression "
                  << expr << " at "
                  << file << '('
                  << line << ") in `"
                  << function << "`: "
                  << cudaGetErrorString(status) << std::endl;
        std::exit(110);
    }
}

#define cuda_check(expr) cuda_check(expr, #expr, __FILE__, __LINE__, __FUNCTION__)

void measure_runtime(PopcornOutput& logger, std::mt19937& rng) {
    std::cout << "warming up..." << std::endl;

    {
        auto warmup_data = generate_input(rng());
        for (int i = 0; i < WARMUP_RUNS; i++) {
            // discard result; this is just warmup, we don't care what it returns
            (void)custom_kernel(warmup_data);
            cuda_check(cudaDeviceSynchronize());
        }
    }

    std::vector<std::int64_t> durations;
    durations.reserve(TIMED_RUNS);

    for (int i = 0; i < TIMED_RUNS; i++) {
        auto data = generate_input(rng());

        // make a copy of the input data to be used by the reference implementation
        auto copy = data;

        auto start = std::chrono::high_resolution_clock::now();
        // move data into custom_kernel, so that if custom_kernel takes large std::vectors or similar by value,
        // we're not measuring the copy overhead.
        auto submission_output = custom_kernel(std::move(data));
        cuda_check(cudaDeviceSynchronize());
        auto end = std::chrono::high_resolution_clock::now();

        durations.push_back(std::chrono::duration_cast<std::chrono::nanoseconds>(end - start).count());

        auto reference_output = ref_kernel(copy);
        if (!check_implementation(submission_output, reference_output)) {
            logger.log("check", "fail");
            std::exit(112);
        }
    }

    // calculate duration statistics
    std::int64_t total_duration = std::accumulate(durations.begin(), durations.end(), (std::int64_t)0);
    std::int64_t best = *std::min_element(durations.begin(), durations.end());
    std::int64_t worst = *std::max_element(durations.begin(), durations.end());
    double average_duration = (double)total_duration / TIMED_RUNS;

    double variance = 0.0;
    for (auto d : durations) {
        variance += std::pow((double)d - average_duration, 2);
    }

    // sample standard deviation with Bessel's correction
    double standard_deviation = std::sqrt(variance / (TIMED_RUNS - 1));
    // standard error of the mean
    double standard_error = standard_deviation / std::sqrt(TIMED_RUNS);

    logger.log("check", "pass");
    logger.log("duration.mean", average_duration);
    logger.log("duration.std", standard_deviation);
    logger.log("duration.err", standard_error);
    logger.log("duration.best", best);
    logger.log("duration.worst", worst);


std::cout << "average kernel runtime: " << average_duration / 1e6 << " ± " << standard_error / 1e6 << " µs" << std::endl;
}

int main() {
    const char *output_fd = std::getenv("POPCORN_FD");
    PopcornOutput logger;
    if (output_fd) {
        int fd = std::stoi(output_fd);
        logger.File.reset(::fdopen(fd, "w"));
    } else {
        return 111;
    }

    // get the seed
    const char *seed_str = std::getenv("POPCORN_SEED");
    int seed = 42;
    if (seed_str) {
        seed = std::stoi(seed_str);
    }

    std::mt19937 rng(seed);
    auto data = generate_input(rng());
    auto reference_output = ref_kernel(data);
    auto submission_output = custom_kernel(data);

    if (!check_implementation(submission_output, reference_output)) {
        logger.log("check", "fail");
        return 112;
    }

    measure_runtime(logger, rng);
    return 0;
}
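For orientation: eval.cu talks to the harness through two environment variables. POPCORN_FD is parsed as a numeric file descriptor to which the `key: value` result lines are written (exit code 111 if it is missing), POPCORN_SEED seeds the RNG, and exit codes 110/112 signal a CUDA error or a failed correctness check. A rough sketch of how a launcher could wire this up, assuming the binary was compiled to ./eval.out (the launch mechanics here are an illustration, not code from this commit):

import os
import subprocess

# Pipe for the result channel; the child writes "key: value" lines to the write end.
read_fd, write_fd = os.pipe()

proc = subprocess.Popen(
    ["./eval.out"],
    env={**os.environ, "POPCORN_FD": str(write_fd), "POPCORN_SEED": "42"},
    pass_fds=(write_fd,),  # keep the descriptor open in the child
)
os.close(write_fd)  # the parent only reads

with os.fdopen(read_fd) as results:
    for line in results:
        key, _, value = line.partition(":")
        print(key.strip(), "=", value.strip())

print("exit code:", proc.wait())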
2 changes: 1 addition & 1 deletion examples/identity_cuda/task.yml
@@ -3,7 +3,7 @@

# these files will be baked into the json object, so that they are available during testing
files:
- {"name": "eval.cu", "source": "@EVAL_CU@"}
- {"name": "eval.cu", "source": "eval.cu"}
- {"name": "reference.cuh", "source": "reference.cuh"}
- {"name": "submission.cuh", "source": "@SUBMISSION@"}

2 changes: 1 addition & 1 deletion src/discord-cluster-manager/consts.py
@@ -92,4 +92,4 @@ def combine_enums(enums: list[Type[Enum]], combined_name: str) -> Enum:
"-Xptxas=--verbose",
"-Xptxas=--warn-on-spills",
]
MODAL_CUDA_INCLUDE_DIRS = ["-I/ThunderKittens/include"]
MODAL_CUDA_INCLUDE_DIRS = ["/ThunderKittens/include"]
46 changes: 34 additions & 12 deletions src/discord-cluster-manager/run_eval.py
@@ -53,6 +53,8 @@ def compile_cuda_script( # # noqa: C901
    files: list[str],
    arch: int = None,
    include_dirs: list[str] = None,
    defines: dict[str, str] = None,
    flags: list[str] = None,
    verbose: bool = False,
) -> CompileResult:
    """
@@ -62,14 +64,34 @@
        files: List of files to compile.
        arch: Architecture to compile for. If None, uses `native`
        include_dirs: additional include directories to supply to nvcc
        defines: Additional defines for the preprocessor
        flags: Other compiler flags
        verbose: whether to print progress or be silent
        seed: Seed value to use for generating test cases
    Returns:
        A `CompileResult` that summarizes the compilation process.
    """
    if include_dirs is None:
        include_dirs = []
    if flags is None:
        flags = CUDA_FLAGS
    else:
        for flag in flags:
            if not flag.startswith("-"):
                raise ValueError(f"Flag `{flag}` should start with a dash.")
        flags = CUDA_FLAGS + flags

    if include_dirs is not None:
        flags += [f"-I{d}" for d in include_dirs]

    if defines is not None:
        for name, value in defines.items():
            # restrict macro names to valid identifiers
            if not name.isidentifier():
                raise ValueError(f"Define key `{name}` contains invalid character")

            if value is not None:
                flags.append(f"-D{name}={value}")
            else:
                flags.append(f"-D{name}")

    if verbose:
        print_ = print
@@ -98,7 +120,7 @@ def compile_cuda_script( # # noqa: C901
    else:
        ARCH = f"-gencode=arch=compute_{arch},code=sm_{arch}"

    command = [nvcc] + CUDA_FLAGS + include_dirs + files + [ARCH, "-o", "eval.out"]
    command = [nvcc] + flags + files + [ARCH, "-o", "eval.out"]

    print_("[Compiling]")
    try:
@@ -174,6 +196,8 @@ def run_cuda_script( # # noqa: C901
    headers: dict[str, str] = None,
    arch: int = None,
    include_dirs: list[str] = None,
    defines: dict[str, str] = None,
    flags: list[str] = None,
    seed: int = 42,
) -> tuple[CompileResult, RunResult]:
    """
@@ -186,14 +210,13 @@
            compile command.
        arch: The arch code for the compute/sm versions. If None, native arch is used.
        include_dirs: Additional include directories, e.g., for thunderkittens/cutlass etc
        defines: Preprocessor defines
        flags: Additional flags to give to the compiler
        seed: Random seed to initialize the RNG for testing
    Returns:
        tuple[CompileResult, RunResult]: CUDA compile/eval result information
    """
    if include_dirs is None:
        include_dirs = []

    try:
        # Write submission files to directory
        for source, content in sources.items():
@@ -206,6 +229,8 @@
            files=list(sources.keys()),
            arch=arch,
            include_dirs=include_dirs,
            defines=defines,
            flags=flags,
            verbose=True,
        )

@@ -236,7 +261,6 @@
def run_pytorch_script( # noqa: C901
sources: dict[str, str],
main: str,
arch: int = None,
seed: int = 42,
) -> RunResult:
"""
@@ -245,7 +269,6 @@
Args:
sources: Files to generate
main: Which file to run. Must be one of the keys in sources.
arch: The arch code for the compute/sm versions.
seed: Random seed to initialize the RNG for testing
Returns:
@@ -267,15 +290,14 @@

def run_config(config: dict):
if config["lang"] == "py":
run_result = run_pytorch_script(
sources=config["sources"], main=config["main"], arch=config.get("arch", None)
)
run_result = run_pytorch_script(sources=config["sources"], main=config["main"])
return FullResult(success=True, error="", compile=None, run=run_result)
elif config["lang"] == "cu":
comp, run = run_cuda_script(
sources=config["sources"],
headers=config.get("headers", {}),
arch=config.get("arch", None),
defines=config.get("defines", {}),
include_dirs=config.get("include_dirs", []),
)
return FullResult(success=True, error="", compile=comp, run=run)
4 changes: 0 additions & 4 deletions src/discord-cluster-manager/task.py
@@ -86,10 +86,6 @@ def make_task(yaml_file: str) -> LeaderboardTask:
if source == "@SUBMISSION@":
assert user_file_name is None
file_dict[name] = "@SUBMISSION@"
elif source == "@EVAL_CU@":
file_dict[name] = leaderboard_eval.cu_eval
elif source == "@EVAL_PY@":
file_dict[name] = leaderboard_eval.py_eval
else:
file_dict[name] = Path(source).read_text()
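With the @EVAL_CU@ and @EVAL_PY@ placeholders gone, only @SUBMISSION@ remains special; every other entry in task.yml is read straight from disk, which is why task.yml above now lists eval.cu as a plain file. A condensed sketch of the remaining resolution logic, using an illustrative spec:

from pathlib import Path

spec = {"name": "eval.cu", "source": "eval.cu"}  # illustrative task.yml entry
file_dict = {}
if spec["source"] == "@SUBMISSION@":
    file_dict[spec["name"]] = "@SUBMISSION@"  # filled in later with the user's submission
else:
    file_dict[spec["name"]] = Path(spec["source"]).read_text()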

