Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: Task definitions as json strings #145

Open
wants to merge 20 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
138 changes: 138 additions & 0 deletions examples/eval.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
#include <algorithm>
#include <chrono>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <memory>
#include <numeric>
#include <random>
#include <string>
#include <utility>
#include <vector>

#include "utils.h"
#include "reference.cuh"

// forward declaration for user submission (the definition is compiled in from submission.cu)
output_t custom_kernel(input_t data);

// number of untimed iterations run before measurement begins
#define WARMUP_RUNS 10
// number of timed iterations used for the reported statistics
#define TIMED_RUNS 100

namespace {
// Deleter for std::unique_ptr so it can own a C stdio stream (std::FILE*).
// unique_ptr never invokes its deleter on a null pointer, so no null check
// is needed here.
struct Closer {
    void operator()(std::FILE* file) {
        std::fclose(file);
    }
};

// Thin logging wrapper around an owned FILE*. Emits "key: value" lines,
// which the harness on the other end of the descriptor parses.
struct PopcornOutput {
    // printf-style forwarding to the owned stream.
    // NOTE(review): File must be non-null before any logging call — the
    // caller is expected to bail out early if the stream could not be opened.
    template<class... Args>
    void printf(Args&&... args) {
        ::fprintf(File.get(), std::forward<Args>(args)...);
    }

    // Log a string value as a "key: value" line.
    void log(const char* key, const char* value) {
        printf("%s: %s\n", key, value);
    }

    // Log any value convertible via std::to_string (integers, doubles, ...).
    template<class T>
    void log(const char* key, T&& value) {
        log(key, std::to_string(value).c_str());
    }

    std::unique_ptr<std::FILE, Closer> File;
};
}

// Times the user kernel over TIMED_RUNS iterations (after WARMUP_RUNS untimed
// warmup iterations), validates every result against the reference
// implementation, and writes summary statistics (in nanoseconds) to `logger`.
// Exits the process with code 112 on a correctness mismatch.
static void measure_runtime(PopcornOutput& logger, std::mt19937& rng) {
    std::cout << "warming up..." << std::endl;

    {
        auto warmup_data = generate_input(rng());
        for (int i = 0; i < WARMUP_RUNS; i++) {
            // discard result; this is just warmup, we don't care what it returns
            (void)custom_kernel(warmup_data);
            CUDA_CHECK(cudaDeviceSynchronize());
        }
    }

    std::vector<std::int64_t> durations;  // per-iteration runtime in nanoseconds
    durations.reserve(TIMED_RUNS);

    for (int i = 0; i < TIMED_RUNS; i++) {
        auto data = generate_input(rng());

        // make a copy of the input data to be used by the reference implementation
        auto copy = data;

        auto start = std::chrono::high_resolution_clock::now();
        // move data into custom_kernel, so that if custom_kernel takes large std::vectors or similar by value,
        // we're not measuring the copy overhead.
        auto submission_output = custom_kernel(std::move(data));
        CUDA_CHECK(cudaDeviceSynchronize());
        auto end = std::chrono::high_resolution_clock::now();

        durations.push_back(std::chrono::duration_cast<std::chrono::nanoseconds>(end - start).count());

        auto reference_output = ref_kernel(copy);
        if (!check_implementation(submission_output, reference_output)) {
            logger.log("check", "fail");
            std::exit(112);
        }
    }

    // calculate duration statistics (all in nanoseconds)
    std::int64_t total_duration = std::accumulate(durations.begin(), durations.end(), (std::int64_t)0);
    std::int64_t best = *std::min_element(durations.begin(), durations.end());
    std::int64_t worst = *std::max_element(durations.begin(), durations.end());
    double average_duration = (double)total_duration / TIMED_RUNS;

    double variance = 0.0;
    for (auto d : durations) {
        // plain multiplication instead of std::pow(x, 2): pow is a general
        // transcendental and needlessly slow for a simple square
        double diff = (double)d - average_duration;
        variance += diff * diff;
    }

    // sample standard deviation with Bessel's correction
    double standard_deviation = std::sqrt(variance / (TIMED_RUNS - 1));
    // standard error of the mean
    double standard_error = standard_deviation / std::sqrt(TIMED_RUNS);

    logger.log("check", "pass");
    logger.log("duration.mean", average_duration);
    logger.log("duration.std", standard_deviation);
    logger.log("duration.err", standard_error);
    logger.log("duration.best", best);
    logger.log("duration.worst", worst);

    // BUG FIX: durations are nanoseconds, so microseconds = ns / 1e3.
    // The previous divisor of 1e6 actually printed milliseconds while
    // labelling them "µs".
    std::cout << "average kernel runtime: " << average_duration / 1e3 << " ± " << standard_error / 1e3 << " µs" << std::endl;
}

// Entry point. Status codes: 0 = pass, 111 = missing/unusable POPCORN_FD,
// 112 = correctness check failed.
int main() {
    // The harness passes a writable file descriptor through POPCORN_FD;
    // without it we have nowhere to report results.
    const char *output_fd = std::getenv("POPCORN_FD");
    PopcornOutput logger;
    if (output_fd) {
        int fd = std::stoi(output_fd);
        logger.File.reset(::fdopen(fd, "w"));
        if (!logger.File) {
            // fdopen failed (e.g. bad descriptor); bail out before any
            // logging call dereferences a null FILE*.
            return 111;
        }
    } else {
        return 111;
    }

    // get the seed (fixed default when POPCORN_SEED is unset)
    const char *seed_str = std::getenv("POPCORN_SEED");
    int seed = 42;
    if (seed_str) {
        // BUG FIX: this previously parsed output_fd instead of seed_str,
        // so the requested seed was silently ignored and the fd number was
        // used as the seed.
        seed = std::stoi(seed_str);
    }

    std::mt19937 rng(seed);

    // One functional check before timing: compare the submission against the
    // reference implementation on a fresh input.
    auto data = generate_input(rng());
    auto reference_output = ref_kernel(data);
    auto submission_output = custom_kernel(data);

    if (!check_implementation(submission_output, reference_output)) {
        logger.log("check", "fail");
        return 112;
    }

    measure_runtime(logger, rng);
    return 0;
}
94 changes: 94 additions & 0 deletions examples/eval.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
import time
import os
import sys
import math

from utils import set_seed
from submission import custom_kernel
from reference import ref_kernel, check_implementation, generate_input

# Untimed iterations run first so caches/allocators settle before measuring.
WARMUP_RUNS = 10
# Timed iterations used for the reported statistics.
TIMED_RUNS = 100


class PopcornOutput:
    """Line-oriented logger that writes to an inherited file descriptor.

    Intended for use as a context manager so the descriptor is closed when
    the block exits.
    """

    def __init__(self, fd: int):
        # Take ownership of the raw descriptor as a text-mode stream.
        self.file = os.fdopen(fd, 'w')

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.file.close()

    def print(self, *args, **kwargs):
        # Flush on every call so the consumer sees each line immediately.
        print(*args, file=self.file, flush=True, **kwargs)

    def log(self, key, value):
        # "key: value" protocol lines consumed by the harness.
        self.print("{}: {}".format(key, value))


def measure_runtime(logger: PopcornOutput):
    """Benchmark the submission over TIMED_RUNS iterations (after WARMUP_RUNS
    untimed warmup iterations), validate each result against the reference
    implementation, and log summary statistics in nanoseconds.

    Exits the process with status 112 on a correctness mismatch.
    """
    print("warming up...")

    warmup_data = generate_input()
    for _ in range(WARMUP_RUNS):
        custom_kernel(warmup_data)

    durations = []  # per-iteration runtime in nanoseconds

    for _ in range(TIMED_RUNS):
        data = generate_input()
        # BUG FIX: use perf_counter instead of time.time — perf_counter is
        # monotonic and high-resolution, while time.time is wall-clock and
        # can jump or have coarse granularity.
        start = time.perf_counter()
        submission_output = custom_kernel(data)
        end = time.perf_counter()
        durations.append((end - start) * 1e9)

        # NOTE(review): unlike the C++ harness, `data` is not copied before
        # the submission runs — if custom_kernel mutates its input in place,
        # the reference sees the mutated data. Confirm submissions must treat
        # the input as read-only.
        reference_output = ref_kernel(data)
        if not check_implementation(submission_output, reference_output):
            logger.log("check", "fail")
            sys.exit(112)

    # duration statistics, all in nanoseconds
    total_duration = sum(durations)
    best = min(durations)
    worst = max(durations)
    average_duration = total_duration / TIMED_RUNS

    # sum of squared deviations; Bessel's correction applied below
    variance = sum((d - average_duration) ** 2 for d in durations)
    standard_deviation = math.sqrt(variance / (TIMED_RUNS - 1))
    # standard error of the mean
    standard_error = standard_deviation / math.sqrt(TIMED_RUNS)

    logger.log("check", "pass")
    logger.log("duration.mean", average_duration)
    logger.log("duration.std", standard_deviation)
    logger.log("duration.err", standard_error)
    logger.log("duration.best", best)
    logger.log("duration.worst", worst)

    # BUG FIX: durations are nanoseconds, so microseconds = ns / 1e3; the
    # previous 1e6 divisor actually printed milliseconds labelled "µs".
    print(f"average kernel runtime: {average_duration / 1e3} ± {standard_error / 1e3} µs")


def main():
    """Entry point.

    Returns a process status code: 0 on success, 111 when the POPCORN_FD
    environment variable is missing, 112 when the correctness check fails.
    """
    fd_str = os.getenv("POPCORN_FD")
    if not fd_str:
        return 111

    with PopcornOutput(int(fd_str)) as logger:
        # Seed from POPCORN_SEED when present; fall back to a fixed default.
        seed_str = os.getenv("POPCORN_SEED")
        set_seed(int(seed_str) if seed_str else 42)

        # One functional check before timing anything.
        data = generate_input()
        reference_output = ref_kernel(data)
        submission_output = custom_kernel(data)

        if not check_implementation(submission_output, reference_output):
            logger.log("check", "fail")
            return 112

        measure_runtime(logger)
        return 0


# Propagate main()'s integer status code to the shell.
if __name__ == "__main__":
    sys.exit(main())
1 change: 1 addition & 0 deletions examples/identity_cuda/eval.cu
13 changes: 4 additions & 9 deletions examples/identity_cuda/reference.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,9 @@
#include <random>
#include <iostream>

#define N_SIZES 10
const int Ns[N_SIZES] = {128, 256, 512, 1024, 2048,
4096, 8192, 16384, 32768, 65536};
#include "task.h"

using input_t = std::array<std::vector<float>, N_SIZES>;
using output_t = input_t;

input_t generate_input(int seed) {
static input_t generate_input(int seed) {
std::mt19937 rng(seed);
input_t data;

Expand All @@ -33,11 +28,11 @@ input_t generate_input(int seed) {
}

// The identity kernel
output_t ref_kernel(input_t data) {
static output_t ref_kernel(input_t data) {
return (output_t) data;
}

bool check_implementation(output_t out, output_t ref, float epsilon = 1e-5) {
static bool check_implementation(output_t out, output_t ref, float epsilon = 1e-5) {
// input_t data = generate_input();
// output_t reference_out = reference(data);

Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#include <array>
#include <vector>
#include "reference.cuh"
#include "task.h"
#include "utils.h"

__global__ void copy_kernel(float *input, float *output, int N)
{
Expand All @@ -22,23 +23,23 @@ output_t custom_kernel(input_t data)

// Allocate device memory
float *d_input, *d_output;
cudaMalloc(&d_input, N * sizeof(float));
cudaMalloc(&d_output, N * sizeof(float));
CUDA_CHECK(cudaMalloc(&d_input, N * sizeof(float)));
CUDA_CHECK(cudaMalloc(&d_output, N * sizeof(float)));

// Copy input to device
cudaMemcpy(d_input, data[i].data(), N * sizeof(float), cudaMemcpyHostToDevice);
CUDA_CHECK(cudaMemcpy(d_input, data[i].data(), N * sizeof(float), cudaMemcpyHostToDevice));

// Launch kernel
int blockSize = 256;
int numBlocks = (N + blockSize - 1) / blockSize;
copy_kernel<<<numBlocks, blockSize>>>(d_input, d_output, N);

// Copy result back to host
cudaMemcpy(result[i].data(), d_output, N * sizeof(float), cudaMemcpyDeviceToHost);
CUDA_CHECK(cudaMemcpy(result[i].data(), d_output, N * sizeof(float), cudaMemcpyDeviceToHost));

// Free device memory
cudaFree(d_input);
cudaFree(d_output);
CUDA_CHECK(cudaFree(d_input));
CUDA_CHECK(cudaFree(d_output));
}

return result;
Expand Down
14 changes: 14 additions & 0 deletions examples/identity_cuda/task.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
// Task-specific type definitions shared by the eval harness, the reference
// implementation, and user submissions.
//
// BUG FIX: the previous guard macro __TASK_H__ used a double leading
// underscore, which is reserved for the implementation in C++.
#ifndef TASK_H
#define TASK_H

#include <vector>
#include <array>

// Number of distinct problem sizes exercised by the benchmark.
#define N_SIZES 10
// Problem sizes: powers of two from 2^7 to 2^16. constexpr (rather than
// const) so the values are usable in constant expressions.
constexpr int Ns[N_SIZES] = {128, 256, 512, 1024, 2048,
                             4096, 8192, 16384, 32768, 65536};

// One float vector per problem size; the identity task's output has the
// same shape as its input.
using input_t = std::array<std::vector<float>, N_SIZES>;
using output_t = input_t;

#endif // TASK_H
24 changes: 24 additions & 0 deletions examples/identity_cuda/task.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# name of the task
# name: identity-cuda

# these files will be baked into the json object, so that they are available during testing
files:
- {"name": "eval.cu", "source": "eval.cu"}
- {"name": "task.h", "source": "task.h"}
- {"name": "utils.h", "source": "utils.h"}
- {"name": "reference.cuh", "source": "reference.cuh"}
- {"name": "submission.cu", "source": "@SUBMISSION@"}

# task language, depending on this we do get different keys in runner
lang: "cu"

description:
A simple test task

# Config object
config:
# task provided source files to compile
sources: ["eval.cu", "submission.cu"]

# additional include directories
include_dirs: []
1 change: 1 addition & 0 deletions examples/identity_cuda/utils.h
1 change: 1 addition & 0 deletions examples/identity_py/eval.py
Loading