Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: Task definitions as json strings #145

Open
wants to merge 20 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
138 changes: 138 additions & 0 deletions examples/eval.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
#include <algorithm>
#include <chrono>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <memory>
#include <numeric>
#include <random>
#include <string>
#include <utility>
#include <vector>

#include "utils.h"
#include "reference.cuh"

// forward declaration for user submission (the definition is compiled in from submission.cu)
output_t custom_kernel(input_t data);

// number of untimed iterations run before measurement begins
#define WARMUP_RUNS 10
// number of timed iterations used for the reported statistics
#define TIMED_RUNS 100

namespace {
// Deleter for std::unique_ptr so it can own a C stdio stream (std::FILE*).
// unique_ptr never invokes its deleter on a null pointer, so no null check
// is needed here.
struct Closer {
    void operator()(std::FILE* file) {
        std::fclose(file);
    }
};

// Thin logging wrapper around an owned FILE*. Emits "key: value" lines,
// which the harness on the other end of the descriptor parses.
struct PopcornOutput {
    // printf-style forwarding to the owned stream.
    // NOTE(review): File must be non-null before any logging call — the
    // caller is expected to bail out early if the stream could not be opened.
    template<class... Args>
    void printf(Args&&... args) {
        ::fprintf(File.get(), std::forward<Args>(args)...);
    }

    // Log a string value as a "key: value" line.
    void log(const char* key, const char* value) {
        printf("%s: %s\n", key, value);
    }

    // Log any value convertible via std::to_string (integers, doubles, ...).
    template<class T>
    void log(const char* key, T&& value) {
        log(key, std::to_string(value).c_str());
    }

    std::unique_ptr<std::FILE, Closer> File;
};
}

// Times the user kernel over TIMED_RUNS iterations (after WARMUP_RUNS untimed
// warmup iterations), validates every result against the reference
// implementation, and writes summary statistics (in nanoseconds) to `logger`.
// Exits the process with code 112 on a correctness mismatch.
static void measure_runtime(PopcornOutput& logger, std::mt19937& rng) {
    std::cout << "warming up..." << std::endl;

    {
        auto warmup_data = generate_input(rng());
        for (int i = 0; i < WARMUP_RUNS; i++) {
            // discard result; this is just warmup, we don't care what it returns
            (void)custom_kernel(warmup_data);
            CUDA_CHECK(cudaDeviceSynchronize());
        }
    }

    std::vector<std::int64_t> durations;  // per-iteration runtime in nanoseconds
    durations.reserve(TIMED_RUNS);

    for (int i = 0; i < TIMED_RUNS; i++) {
        auto data = generate_input(rng());

        // make a copy of the input data to be used by the reference implementation
        auto copy = data;

        auto start = std::chrono::high_resolution_clock::now();
        // move data into custom_kernel, so that if custom_kernel takes large std::vectors or similar by value,
        // we're not measuring the copy overhead.
        auto submission_output = custom_kernel(std::move(data));
        CUDA_CHECK(cudaDeviceSynchronize());
        auto end = std::chrono::high_resolution_clock::now();

        durations.push_back(std::chrono::duration_cast<std::chrono::nanoseconds>(end - start).count());

        auto reference_output = ref_kernel(copy);
        if (!check_implementation(submission_output, reference_output)) {
            logger.log("check", "fail");
            std::exit(112);
        }
    }

    // calculate duration statistics (all in nanoseconds)
    std::int64_t total_duration = std::accumulate(durations.begin(), durations.end(), (std::int64_t)0);
    std::int64_t best = *std::min_element(durations.begin(), durations.end());
    std::int64_t worst = *std::max_element(durations.begin(), durations.end());
    double average_duration = (double)total_duration / TIMED_RUNS;

    double variance = 0.0;
    for (auto d : durations) {
        // plain multiplication instead of std::pow(x, 2): pow is a general
        // transcendental and needlessly slow for a simple square
        double diff = (double)d - average_duration;
        variance += diff * diff;
    }

    // sample standard deviation with Bessel's correction
    double standard_deviation = std::sqrt(variance / (TIMED_RUNS - 1));
    // standard error of the mean
    double standard_error = standard_deviation / std::sqrt(TIMED_RUNS);

    logger.log("check", "pass");
    logger.log("duration.mean", average_duration);
    logger.log("duration.std", standard_deviation);
    logger.log("duration.err", standard_error);
    logger.log("duration.best", best);
    logger.log("duration.worst", worst);

    // BUG FIX: durations are nanoseconds, so microseconds = ns / 1e3.
    // The previous divisor of 1e6 actually printed milliseconds while
    // labelling them "µs".
    std::cout << "average kernel runtime: " << average_duration / 1e3 << " ± " << standard_error / 1e3 << " µs" << std::endl;
}

// Entry point. Status codes: 0 = pass, 111 = missing/unusable POPCORN_FD,
// 112 = correctness check failed.
int main() {
    // The harness passes a writable file descriptor through POPCORN_FD;
    // without it we have nowhere to report results.
    const char *output_fd = std::getenv("POPCORN_FD");
    PopcornOutput logger;
    if (output_fd) {
        int fd = std::stoi(output_fd);
        logger.File.reset(::fdopen(fd, "w"));
        if (!logger.File) {
            // fdopen failed (e.g. bad descriptor); bail out before any
            // logging call dereferences a null FILE*.
            return 111;
        }
    } else {
        return 111;
    }

    // get the seed (fixed default when POPCORN_SEED is unset)
    const char *seed_str = std::getenv("POPCORN_SEED");
    int seed = 42;
    if (seed_str) {
        // BUG FIX: this previously parsed output_fd instead of seed_str,
        // so the requested seed was silently ignored and the fd number was
        // used as the seed.
        seed = std::stoi(seed_str);
    }

    std::mt19937 rng(seed);

    // One functional check before timing: compare the submission against the
    // reference implementation on a fresh input.
    auto data = generate_input(rng());
    auto reference_output = ref_kernel(data);
    auto submission_output = custom_kernel(data);

    if (!check_implementation(submission_output, reference_output)) {
        logger.log("check", "fail");
        return 112;
    }

    measure_runtime(logger, rng);
    return 0;
}
94 changes: 94 additions & 0 deletions examples/eval.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
import time
import os
import sys
import math

from utils import set_seed
from submission import custom_kernel
from reference import ref_kernel, check_implementation, generate_input

# Untimed iterations run first so caches/allocators settle before measuring.
WARMUP_RUNS = 10
# Timed iterations used for the reported statistics.
TIMED_RUNS = 100


class PopcornOutput:
    """Line-oriented logger that writes to an inherited file descriptor.

    Intended for use as a context manager so the descriptor is closed when
    the block exits.
    """

    def __init__(self, fd: int):
        # Take ownership of the raw descriptor as a text-mode stream.
        self.file = os.fdopen(fd, 'w')

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.file.close()

    def print(self, *args, **kwargs):
        # Flush on every call so the consumer sees each line immediately.
        print(*args, file=self.file, flush=True, **kwargs)

    def log(self, key, value):
        # "key: value" protocol lines consumed by the harness.
        self.print("{}: {}".format(key, value))


def measure_runtime(logger: PopcornOutput):
    """Benchmark the submission over TIMED_RUNS iterations (after WARMUP_RUNS
    untimed warmup iterations), validate each result against the reference
    implementation, and log summary statistics in nanoseconds.

    Exits the process with status 112 on a correctness mismatch.
    """
    print("warming up...")

    warmup_data = generate_input()
    for _ in range(WARMUP_RUNS):
        custom_kernel(warmup_data)

    durations = []  # per-iteration runtime in nanoseconds

    for _ in range(TIMED_RUNS):
        data = generate_input()
        # BUG FIX: use perf_counter instead of time.time — perf_counter is
        # monotonic and high-resolution, while time.time is wall-clock and
        # can jump or have coarse granularity.
        start = time.perf_counter()
        submission_output = custom_kernel(data)
        end = time.perf_counter()
        durations.append((end - start) * 1e9)

        # NOTE(review): unlike the C++ harness, `data` is not copied before
        # the submission runs — if custom_kernel mutates its input in place,
        # the reference sees the mutated data. Confirm submissions must treat
        # the input as read-only.
        reference_output = ref_kernel(data)
        if not check_implementation(submission_output, reference_output):
            logger.log("check", "fail")
            sys.exit(112)

    # duration statistics, all in nanoseconds
    total_duration = sum(durations)
    best = min(durations)
    worst = max(durations)
    average_duration = total_duration / TIMED_RUNS

    # sum of squared deviations; Bessel's correction applied below
    variance = sum((d - average_duration) ** 2 for d in durations)
    standard_deviation = math.sqrt(variance / (TIMED_RUNS - 1))
    # standard error of the mean
    standard_error = standard_deviation / math.sqrt(TIMED_RUNS)

    logger.log("check", "pass")
    logger.log("duration.mean", average_duration)
    logger.log("duration.std", standard_deviation)
    logger.log("duration.err", standard_error)
    logger.log("duration.best", best)
    logger.log("duration.worst", worst)

    # BUG FIX: durations are nanoseconds, so microseconds = ns / 1e3; the
    # previous 1e6 divisor actually printed milliseconds labelled "µs".
    print(f"average kernel runtime: {average_duration / 1e3} ± {standard_error / 1e3} µs")


def main():
    """Entry point.

    Returns a process status code: 0 on success, 111 when the POPCORN_FD
    environment variable is missing, 112 when the correctness check fails.
    """
    fd_str = os.getenv("POPCORN_FD")
    if not fd_str:
        return 111

    with PopcornOutput(int(fd_str)) as logger:
        # Seed from POPCORN_SEED when present; fall back to a fixed default.
        seed_str = os.getenv("POPCORN_SEED")
        set_seed(int(seed_str) if seed_str else 42)

        # One functional check before timing anything.
        data = generate_input()
        reference_output = ref_kernel(data)
        submission_output = custom_kernel(data)

        if not check_implementation(submission_output, reference_output):
            logger.log("check", "fail")
            return 112

        measure_runtime(logger)
        return 0


# Propagate main()'s integer status code to the shell.
if __name__ == "__main__":
    sys.exit(main())
1 change: 1 addition & 0 deletions examples/identity_cuda/eval.cu
13 changes: 4 additions & 9 deletions examples/identity_cuda/reference.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,9 @@
#include <random>
#include <iostream>

#define N_SIZES 10
const int Ns[N_SIZES] = {128, 256, 512, 1024, 2048,
4096, 8192, 16384, 32768, 65536};
#include "task.h"

using input_t = std::array<std::vector<float>, N_SIZES>;
using output_t = input_t;

input_t generate_input(int seed) {
static input_t generate_input(int seed) {
std::mt19937 rng(seed);
input_t data;

Expand All @@ -33,11 +28,11 @@ input_t generate_input(int seed) {
}

// The identity kernel
output_t ref_kernel(input_t data) {
static output_t ref_kernel(input_t data) {
return (output_t) data;
}

bool check_implementation(output_t out, output_t ref, float epsilon = 1e-5) {
static bool check_implementation(output_t out, output_t ref, float epsilon = 1e-5) {
// input_t data = generate_input();
// output_t reference_out = reference(data);

Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#include <array>
#include <vector>
#include "reference.cuh"
#include "task.h"
#include "utils.h"

__global__ void copy_kernel(float *input, float *output, int N)
{
Expand All @@ -22,23 +23,23 @@ output_t custom_kernel(input_t data)

// Allocate device memory
float *d_input, *d_output;
cudaMalloc(&d_input, N * sizeof(float));
cudaMalloc(&d_output, N * sizeof(float));
CUDA_CHECK(cudaMalloc(&d_input, N * sizeof(float)));
CUDA_CHECK(cudaMalloc(&d_output, N * sizeof(float)));

// Copy input to device
cudaMemcpy(d_input, data[i].data(), N * sizeof(float), cudaMemcpyHostToDevice);
CUDA_CHECK(cudaMemcpy(d_input, data[i].data(), N * sizeof(float), cudaMemcpyHostToDevice));

// Launch kernel
int blockSize = 256;
int numBlocks = (N + blockSize - 1) / blockSize;
copy_kernel<<<numBlocks, blockSize>>>(d_input, d_output, N);

// Copy result back to host
cudaMemcpy(result[i].data(), d_output, N * sizeof(float), cudaMemcpyDeviceToHost);
CUDA_CHECK(cudaMemcpy(result[i].data(), d_output, N * sizeof(float), cudaMemcpyDeviceToHost));

// Free device memory
cudaFree(d_input);
cudaFree(d_output);
CUDA_CHECK(cudaFree(d_input));
CUDA_CHECK(cudaFree(d_output));
}

return result;
Expand Down
14 changes: 14 additions & 0 deletions examples/identity_cuda/task.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
// Task-specific type definitions shared by the eval harness, the reference
// implementation, and user submissions.
//
// BUG FIX: the previous guard macro __TASK_H__ used a double leading
// underscore, which is reserved for the implementation in C++.
#ifndef TASK_H
#define TASK_H

#include <vector>
#include <array>

// Number of distinct problem sizes exercised by the benchmark.
#define N_SIZES 10
// Problem sizes: powers of two from 2^7 to 2^16. constexpr (rather than
// const) so the values are usable in constant expressions.
constexpr int Ns[N_SIZES] = {128, 256, 512, 1024, 2048,
                             4096, 8192, 16384, 32768, 65536};

// One float vector per problem size; the identity task's output has the
// same shape as its input.
using input_t = std::array<std::vector<float>, N_SIZES>;
using output_t = input_t;

#endif // TASK_H
24 changes: 24 additions & 0 deletions examples/identity_cuda/task.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# name of the task
# name: identity-cuda

# these files will be baked into the json object, so that they are available during testing
files:
- {"name": "eval.cu", "source": "eval.cu"}
- {"name": "task.h", "source": "task.h"}
- {"name": "utils.h", "source": "utils.h"}
- {"name": "reference.cuh", "source": "reference.cuh"}
- {"name": "submission.cu", "source": "@SUBMISSION@"}

# task language, depending on this we do get different keys in runner
lang: "cu"

description:
A simple test task

# Config object
config:
# task provided source files to compile
sources: ["eval.cu", "submission.cu"]

# additional include directories
include_dirs: []
1 change: 1 addition & 0 deletions examples/identity_cuda/utils.h
1 change: 1 addition & 0 deletions examples/identity_py/eval.py
Loading