extended capabilities for customization of compile_cuda_script
ngc92 committed Jan 20, 2025
1 parent 63453bb commit aa39ff1
Showing 5 changed files with 185 additions and 18 deletions.
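At a glance, the heart of the commit is that compile_cuda_script in run_eval.py now accepts preprocessor defines and extra compiler flags alongside include_dirs. A minimal sketch of how a caller might exercise the extended signature (the argument values are illustrative, not taken from this commit):

from run_eval import compile_cuda_script  # assuming run_eval.py is importable as run_eval

# Hypothetical invocation showing the new customization points.
result = compile_cuda_script(
    files=["eval.cu"],
    arch=90,                                       # -> -gencode=arch=compute_90,code=sm_90
    include_dirs=["/ThunderKittens/include"],      # -> -I/ThunderKittens/include
    defines={"TIMED_RUNS": "50", "NDEBUG": None},  # -> -DTIMED_RUNS=50 -DNDEBUG
    flags=["-lineinfo"],                           # appended after the default CUDA_FLAGS
    verbose=True,
)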
149 changes: 149 additions & 0 deletions examples/identity_cuda/eval.cu
@@ -0,0 +1,149 @@
#include <chrono>
#include <iostream>
#include <cstdint>
#include <vector>
#include <numeric>
#include <algorithm>
#include <memory>
#include <random>   // std::mt19937
#include <cmath>    // std::pow, std::sqrt
#include <cstdio>   // std::FILE, std::fclose, ::fdopen
#include <cstdlib>  // std::getenv, std::exit
#include <string>   // std::to_string, std::stoi

#include "reference.cuh"
#include "submission.cuh"

#define WARMUP_RUNS 10
#define TIMED_RUNS 100

struct Closer {
    void operator()(std::FILE* file) {
        std::fclose(file);
    }
};

struct PopcornOutput {
    template<class... Args>
    void printf(Args&&... args) {
        ::fprintf(File.get(), std::forward<Args>(args)...);
    }

    void log(const char* key, const char* value) {
        printf("%s: %s\n", key, value);
    }

    template<class T>
    void log(const char* key, T&& value) {
        log(key, std::to_string(value).c_str());
    }

    std::unique_ptr<std::FILE, Closer> File;
};

// checks that a CUDA API call returned successfully, otherwise prints an error message and exits.
static void cuda_check(cudaError_t status, const char* expr, const char* file, int line, const char* function)
{
    if (status != cudaSuccess) {
        std::cerr << "CUDA error (" << (int)status << ") while evaluating expression "
                  << expr << " at "
                  << file << '('
                  << line << ") in `"
                  << function << "`: "
                  << cudaGetErrorString(status) << std::endl;
        std::exit(110);
    }
}

#define cuda_check(expr) cuda_check(expr, #expr, __FILE__, __LINE__, __FUNCTION__)

void measure_runtime(PopcornOutput& logger, std::mt19937& rng) {
    std::cout << "warming up..." << std::endl;

    {
        auto warmup_data = generate_input(rng());
        for (int i = 0; i < WARMUP_RUNS; i++) {
            // discard result; this is just warmup, we don't care what it returns
            (void)custom_kernel(warmup_data);
            cuda_check(cudaDeviceSynchronize());
        }
    }

    std::vector<std::int64_t> durations;
    durations.reserve(TIMED_RUNS);

    for (int i = 0; i < TIMED_RUNS; i++) {
        auto data = generate_input(rng());

        // make a copy of the input data to be used by the reference implementation
        auto copy = data;

        auto start = std::chrono::high_resolution_clock::now();
        // move data into custom_kernel, so that if custom_kernel takes large std::vectors or similar by value,
        // we're not measuring the copy overhead.
        auto submission_output = custom_kernel(std::move(data));
        cuda_check(cudaDeviceSynchronize());
        auto end = std::chrono::high_resolution_clock::now();

        durations.push_back(std::chrono::duration_cast<std::chrono::nanoseconds>(end - start).count());

        auto reference_output = ref_kernel(copy);
        if (!check_implementation(submission_output, reference_output)) {
            logger.log("check", "fail");
            std::exit(112);
        }
    }

    // calculate duration statistics
    std::int64_t total_duration = std::accumulate(durations.begin(), durations.end(), (std::int64_t)0);
    std::int64_t best = *std::min_element(durations.begin(), durations.end());
    std::int64_t worst = *std::max_element(durations.begin(), durations.end());
    double average_duration = (double)total_duration / TIMED_RUNS;

    double variance = 0.0;
    for (auto d : durations) {
        variance += std::pow((double)d - average_duration, 2);
    }

    // sample standard deviation with Bessel's correction
    double standard_deviation = std::sqrt(variance / (TIMED_RUNS - 1));
    // standard error of the mean
    double standard_error = standard_deviation / std::sqrt(TIMED_RUNS);

    logger.log("check", "pass");
    logger.log("duration.mean", average_duration);
    logger.log("duration.std", standard_deviation);
    logger.log("duration.err", standard_error);
    logger.log("duration.best", best);
    logger.log("duration.worst", worst);


std::cout << "average kernel runtime: " << average_duration / 1e6 << " ± " << standard_error / 1e6 << " µs" << std::endl;
}

int main() {
    const char *output_fd = std::getenv("POPCORN_FD");
    PopcornOutput logger;
    if (output_fd) {
        int fd = std::stoi(output_fd);
        logger.File.reset(::fdopen(fd, "w"));
    } else {
        return 111;
    }

    // get the seed
    const char *seed_str = std::getenv("POPCORN_SEED");
    int seed = 42;
    if (seed_str) {
        seed = std::stoi(seed_str);
    }

    std::mt19937 rng(seed);
    auto data = generate_input(rng());
    auto reference_output = ref_kernel(data);
    auto submission_output = custom_kernel(data);

    if (!check_implementation(submission_output, reference_output)) {
        logger.log("check", "fail");
        return 112;
    }

    measure_runtime(logger, rng);
    return 0;
}
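For orientation: eval.cu talks to the harness through two environment variables. POPCORN_FD is parsed as a numeric file descriptor to which the `key: value` result lines are written (exit code 111 if it is missing), POPCORN_SEED seeds the RNG, and exit codes 110/112 signal a CUDA error or a failed correctness check. A rough sketch of how a launcher could wire this up, assuming the binary was compiled to ./eval.out (the launch mechanics here are an illustration, not code from this commit):

import os
import subprocess

# Pipe for the result channel; the child writes "key: value" lines to the write end.
read_fd, write_fd = os.pipe()

proc = subprocess.Popen(
    ["./eval.out"],
    env={**os.environ, "POPCORN_FD": str(write_fd), "POPCORN_SEED": "42"},
    pass_fds=(write_fd,),  # keep the descriptor open in the child
)
os.close(write_fd)  # the parent only reads

with os.fdopen(read_fd) as results:
    for line in results:
        key, _, value = line.partition(":")
        print(key.strip(), "=", value.strip())

print("exit code:", proc.wait())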
2 changes: 1 addition & 1 deletion examples/identity_cuda/task.yml
@@ -3,7 +3,7 @@

# these files will be baked into the json object, so that they are available during testing
files:
- {"name": "eval.cu", "source": "@EVAL_CU@"}
- {"name": "eval.cu", "source": "eval.cu"}
- {"name": "reference.cuh", "source": "reference.cuh"}
- {"name": "submission.cuh", "source": "@SUBMISSION@"}

2 changes: 1 addition & 1 deletion src/discord-cluster-manager/consts.py
@@ -92,4 +92,4 @@ def combine_enums(enums: list[Type[Enum]], combined_name: str) -> Enum:
"-Xptxas=--verbose",
"-Xptxas=--warn-on-spills",
]
MODAL_CUDA_INCLUDE_DIRS = ["-I/ThunderKittens/include"]
MODAL_CUDA_INCLUDE_DIRS = ["/ThunderKittens/include"]
46 changes: 34 additions & 12 deletions src/discord-cluster-manager/run_eval.py
@@ -53,6 +53,8 @@ def compile_cuda_script( # # noqa: C901
    files: list[str],
    arch: int = None,
    include_dirs: list[str] = None,
    defines: dict[str, str] = None,
    flags: list[str] = None,
    verbose: bool = False,
) -> CompileResult:
    """
@@ -62,14 +64,34 @@
        files: List of files to compile.
        arch: Architecture to compile for. If None, uses `native`
        include_dirs: additional include directories to supply to nvcc
        defines: Additional defines for the preprocessor
        flags: Other compiler flags
        verbose: whether to print progress or be silent
        seed: Seed value to use for generating test cases
    Returns:
        A `CompileResult` that summarizes the compilation process.
    """
    if include_dirs is None:
        include_dirs = []
    if flags is None:
        flags = CUDA_FLAGS
    else:
        for flag in flags:
            if not flag.startswith("-"):
                raise ValueError(f"Flag `{flag}` should start with a dash.")
        flags = CUDA_FLAGS + flags

    if include_dirs is not None:
        flags += [f"-I{d}" for d in include_dirs]

    if defines is not None:
        for name, value in defines.items():
            # restrict macro names to valid identifiers
            if not name.isidentifier():
                raise ValueError(f"Define key `{name}` contains invalid character")

            if value is not None:
                flags.append(f"-D{name}={value}")
            else:
                flags.append(f"-D{name}")

    if verbose:
        print_ = print
@@ -98,7 +120,7 @@ def compile_cuda_script( # # noqa: C901
    else:
        ARCH = f"-gencode=arch=compute_{arch},code=sm_{arch}"

    command = [nvcc] + CUDA_FLAGS + include_dirs + files + [ARCH, "-o", "eval.out"]
    command = [nvcc] + flags + files + [ARCH, "-o", "eval.out"]

    print_("[Compiling]")
    try:
@@ -174,6 +196,8 @@ def run_cuda_script( # # noqa: C901
    headers: dict[str, str] = None,
    arch: int = None,
    include_dirs: list[str] = None,
    defines: dict[str, str] = None,
    flags: list[str] = None,
    seed: int = 42,
) -> tuple[CompileResult, RunResult]:
    """
@@ -186,14 +210,13 @@
            compile command.
        arch: The arch code for the compute/sm versions. If None, native arch is used.
        include_dirs: Additional include directories, e.g., for thunderkittens/cutlass etc
        defines: Preprocessor defines
        flags: Additional flags to give to the compiler
        seed: Random seed to initialize the RNG for testing
    Returns:
        tuple[CompileResult, RunResult]: CUDA compile/eval result information
    """
    if include_dirs is None:
        include_dirs = []

    try:
        # Write submission files to directory
        for source, content in sources.items():
@@ -206,6 +229,8 @@
            files=list(sources.keys()),
            arch=arch,
            include_dirs=include_dirs,
            defines=defines,
            flags=flags,
            verbose=True,
        )

@@ -236,7 +261,6 @@
def run_pytorch_script( # noqa: C901
sources: dict[str, str],
main: str,
arch: int = None,
seed: int = 42,
) -> RunResult:
"""
@@ -245,7 +269,6 @@
Args:
sources: Files to generate
main: Which file to run. Must be one of the keys in sources.
arch: The arch code for the compute/sm versions.
seed: Random seed to initialize the RNG for testing
Returns:
@@ -267,15 +290,14 @@

def run_config(config: dict):
if config["lang"] == "py":
run_result = run_pytorch_script(
sources=config["sources"], main=config["main"], arch=config.get("arch", None)
)
run_result = run_pytorch_script(sources=config["sources"], main=config["main"])
return FullResult(success=True, error="", compile=None, run=run_result)
elif config["lang"] == "cu":
comp, run = run_cuda_script(
sources=config["sources"],
headers=config.get("headers", {}),
arch=config.get("arch", None),
defines=config.get("defines", {}),
include_dirs=config.get("include_dirs", []),
)
return FullResult(success=True, error="", compile=comp, run=run)
4 changes: 0 additions & 4 deletions src/discord-cluster-manager/task.py
@@ -86,10 +86,6 @@ def make_task(yaml_file: str) -> LeaderboardTask:
if source == "@SUBMISSION@":
assert user_file_name is None
file_dict[name] = "@SUBMISSION@"
elif source == "@EVAL_CU@":
file_dict[name] = leaderboard_eval.cu_eval
elif source == "@EVAL_PY@":
file_dict[name] = leaderboard_eval.py_eval
else:
file_dict[name] = Path(source).read_text()
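With the @EVAL_CU@ and @EVAL_PY@ placeholders gone, only @SUBMISSION@ remains special; every other entry in task.yml is read straight from disk, which is why task.yml above now lists eval.cu as a plain file. A condensed sketch of the remaining resolution logic, using an illustrative spec:

from pathlib import Path

spec = {"name": "eval.cu", "source": "eval.cu"}  # illustrative task.yml entry
file_dict = {}
if spec["source"] == "@SUBMISSION@":
    file_dict[spec["name"]] = "@SUBMISSION@"  # filled in later with the user's submission
else:
    file_dict[spec["name"]] = Path(spec["source"]).read_text()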

