Commit

Merge remote-tracking branch 'refs/remotes/origin/main'
alexzhang13 committed Jan 9, 2025
2 parents 0160468 + 7afd0bc commit 365d4ec
Showing 9 changed files with 467 additions and 19 deletions.
66 changes: 66 additions & 0 deletions examples/simple_cuda/reference.cuh
@@ -0,0 +1,66 @@
#ifndef __REFERENCE_CUH__
#define __REFERENCE_CUH__

#include <tuple>
#include <vector>
#include <cstdlib>
#include <cmath>
#include <array>

#define N_SIZES 10
const int Ns[N_SIZES] = {128,  256,  512,   1024,  2048,
                         4096, 8192, 16384, 32768, 65536};

using input_t = std::array<std::vector<float>, N_SIZES>;
using output_t = input_t;

input_t generate_input() {
    input_t data;

    for (int i = 0; i < N_SIZES; ++i) {
        data[i].resize(Ns[i]);
        for (int j = 0; j < Ns[i]; ++j) {
            data[i][j] = static_cast<float>(rand()) / RAND_MAX;
        }
    }

    return data;
}

// Identity reference: returns a copy of the input, computed on the host.
output_t reference(input_t data) {
    output_t out;

    for (int i = 0; i < N_SIZES; ++i) {
        out[i].resize(Ns[i]);
        for (int j = 0; j < Ns[i]; ++j) {
            out[i][j] = data[i][j];
        }
    }

    return out;
}

bool check_implementation(output_t out, output_t ref, float epsilon = 1e-5) {
    // input_t data = generate_input();
    // output_t reference_out = reference(data);
    bool same = true;

    for (int i = 0; i < N_SIZES; ++i) {
        auto ref_ptr = ref[i];
        auto out_ptr = out[i];

        for (int j = 0; j < Ns[i]; ++j) {
            if (std::fabs(ref_ptr[j] - out_ptr[j]) > epsilon) {
                same = false;
                break;
            }
        }
        if (!same)
            break;
    }

    return same;
}

#endif
45 changes: 45 additions & 0 deletions examples/simple_cuda/submission.cuh
@@ -0,0 +1,45 @@
#include <array>
#include <vector>
#include "reference.cuh"

__global__ void copy_kernel(float *input, float *output, int N)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < N)
    {
        output[idx] = input[idx];
    }
}

output_t submission(input_t data)
{
    output_t result;

    for (int i = 0; i < N_SIZES; ++i)
    {
        int N = Ns[i];
        result[i].resize(N);

        // Allocate device memory
        float *d_input, *d_output;
        cudaMalloc(&d_input, N * sizeof(float));
        cudaMalloc(&d_output, N * sizeof(float));

        // Copy input to device
        cudaMemcpy(d_input, data[i].data(), N * sizeof(float), cudaMemcpyHostToDevice);

        // Launch kernel
        int blockSize = 256;
        int numBlocks = (N + blockSize - 1) / blockSize;
        copy_kernel<<<numBlocks, blockSize>>>(d_input, d_output, N);

        // Copy result back to host
        cudaMemcpy(result[i].data(), d_output, N * sizeof(float), cudaMemcpyDeviceToHost);

        // Free device memory
        cudaFree(d_input);
        cudaFree(d_output);
    }

    return result;
}
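
Taken together, reference.cuh and submission.cuh are meant to be driven by an external harness that generates inputs, runs both implementations, and compares the results. A minimal sketch of such a driver follows; the main function and its messages are illustrative assumptions and are not part of this commit, only generate_input, submission, reference, and check_implementation come from the headers above.

// harness_sketch.cu -- hypothetical driver, not part of this commit.
// Build with nvcc alongside the two headers above.
#include <cstdio>
#include "reference.cuh"
#include "submission.cuh"

int main() {
    // Ten random float vectors with the sizes listed in Ns.
    input_t data = generate_input();

    // Run the CUDA copy submission and the host reference on the same data.
    output_t out = submission(data);
    output_t ref = reference(data);

    // Element-wise comparison within the default epsilon of 1e-5.
    if (check_implementation(out, ref)) {
        std::printf("submission matches reference\n");
        return 0;
    }
    std::printf("submission does not match reference\n");
    return 1;
}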
64 changes: 64 additions & 0 deletions examples/softmax_py/reference.py
@@ -0,0 +1,64 @@
from typing import List

import torch


def check_implementation(custom_output, ref_output) -> bool:
    for c, r in zip(custom_output, ref_output, strict=False):
        if not torch.allclose(c, r, atol=1e-5):
            print("mismatch found! custom implementation doesn't match reference.")
            return False

    return True


def ref_kernel(xs: List[torch.Tensor], dim: int = -1) -> List[torch.Tensor]:
    """
    Reference implementation of the Softmax function using PyTorch's predefined functions.
    Args:
        xs (List[torch.Tensor]): Input tensors.
        dim (int): Dimension along which to apply softmax.
    Returns:
        List[torch.Tensor]: Tensors after applying Softmax to each input.
    """

    return [torch.nn.functional.softmax(x, dim=dim) for x in xs]


def generate_input(seed: int = None, to_cuda: bool = True) -> List[torch.Tensor]:
    """
    Generates random input tensors of the specified shapes.
    Args:
        seed (int): Random seed for reproducibility.
        to_cuda (bool): Whether to use GPU (CUDA or ROCm) or CPU.
    Returns:
        List[torch.Tensor]: List of randomly generated tensors.
    """
    shapes = [(128, 64), (256, 64), (512, 64)]

    # Determine the device
    if to_cuda:
        if torch.cuda.is_available():  # Covers NVIDIA (CUDA) and AMD (ROCm) GPUs
            device = torch.device("cuda")
        elif torch.backends.mps.is_available():  # Apple GPUs via the MPS backend
            device = torch.device("mps")
        else:
            print("No compatible GPU found. Falling back to CPU.")
            device = torch.device("cpu")
    else:
        device = torch.device("cpu")

    if seed is not None:
        torch.manual_seed(seed)

    tensors = []
    for shape in shapes:
        tensors.append(torch.randn(shape, device=device))

    return tensors


if __name__ == "__main__":
    inputs = generate_input(seed=42)
    for idx, tensor in enumerate(inputs):
        print(f"Input Tensor {idx + 1} (Shape: {tensor.shape}):\n{tensor}")
23 changes: 23 additions & 0 deletions examples/softmax_py/submission.py
@@ -0,0 +1,23 @@
from typing import List

import torch


def custom_kernel(xs: List[torch.Tensor], dim: int = -1) -> List[torch.Tensor]:
    """
    Custom implementation of the Softmax function.
    Args:
        xs (List[torch.Tensor]): Input tensors.
        dim (int): Dimension along which to apply softmax.
    Returns:
        List[torch.Tensor]: Tensors after applying Softmax to each input.
    """
    res = []
    for x in xs:
        # Shift for numerical stability
        x_shifted = x - torch.max(x, dim=dim, keepdim=True).values
        exp_x = torch.exp(x_shifted)
        softmax = exp_x / torch.sum(exp_x, dim=dim, keepdim=True)
        res.append(softmax)

    return res
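
As in the CUDA example, reference.py and submission.py are meant to be exercised by an external runner. A minimal sketch of such a check is shown below; the driver script is an illustrative assumption and not part of this commit, only generate_input, ref_kernel, custom_kernel, and check_implementation come from the two files above.

# run_softmax_check.py -- hypothetical driver, not part of this commit.
# Assumes reference.py and submission.py are importable from the working directory.
from reference import check_implementation, generate_input, ref_kernel
from submission import custom_kernel

def main() -> None:
    xs = generate_input(seed=42)    # three random (N, 64) tensors
    custom_out = custom_kernel(xs)  # manual shift / exp / normalize softmax
    ref_out = ref_kernel(xs)        # torch.nn.functional.softmax baseline
    ok = check_implementation(custom_out, ref_out)
    print("match" if ok else "mismatch")

if __name__ == "__main__":
    main()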
63 changes: 63 additions & 0 deletions examples/thunderkittens_example/reference.cuh
@@ -0,0 +1,63 @@
#ifndef __REFERENCE_CUH__
#define __REFERENCE_CUH__

#include <tuple>
#include <vector>
#include <cstdlib>
#include <cmath>
#include <array>

#define N_SIZES 5
const int Ns[N_SIZES] = {
    32, 32, 32, 32, 32,
};

using input_t = std::array<std::vector<float>, N_SIZES>;
using output_t = input_t;

input_t generate_input() {
    input_t data;

    for (int i = 0; i < N_SIZES; ++i) {
        data[i].resize(Ns[i]);
        for (int j = 0; j < Ns[i]; ++j) {
            data[i][j] = static_cast<float>(rand()) / RAND_MAX;
        }
    }

    return data;
}

// Reference: element-wise doubling of each input vector, computed on the host.
output_t reference(input_t data) {
    output_t out;

    for (int i = 0; i < N_SIZES; ++i) {
        out[i].resize(Ns[i]);
        for (int j = 0; j < Ns[i]; ++j) {
            out[i][j] = data[i][j] + data[i][j];
        }
    }

    return out;
}

bool check_implementation(output_t out, output_t ref, float epsilon = 1e-5) {
    // input_t data = generate_input();
    // output_t reference_out = reference(data);
    bool same = true;

    for (int i = 0; i < N_SIZES; ++i) {
        auto ref_ptr = ref[i];
        auto out_ptr = out[i];

        for (int j = 0; j < Ns[i]; ++j) {
            if (std::fabs(ref_ptr[j] - out_ptr[j]) > epsilon) {
                same = false;
                break;
            }
        }
        if (!same)
            break;
    }

    return same;
}

#endif
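
The matching ThunderKittens submission is among the files that are not expanded in this view, so it is not reproduced here. Purely for illustration, a plain-CUDA sketch that would satisfy this doubling reference, using no ThunderKittens primitives at all, could look like the following; the kernel name and structure are assumptions rather than code from this commit.

// Hypothetical plain-CUDA stand-in for the ThunderKittens submission (not in this commit).
#include "reference.cuh"

__global__ void double_kernel(const float *input, float *output, int N)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < N)
    {
        output[idx] = input[idx] + input[idx];  // matches the reference's x + x
    }
}

output_t submission(input_t data)
{
    output_t result;

    for (int i = 0; i < N_SIZES; ++i)
    {
        int N = Ns[i];
        result[i].resize(N);

        float *d_input, *d_output;
        cudaMalloc(&d_input, N * sizeof(float));
        cudaMalloc(&d_output, N * sizeof(float));
        cudaMemcpy(d_input, data[i].data(), N * sizeof(float), cudaMemcpyHostToDevice);

        int blockSize = 256;
        int numBlocks = (N + blockSize - 1) / blockSize;
        double_kernel<<<numBlocks, blockSize>>>(d_input, d_output, N);

        cudaMemcpy(result[i].data(), d_output, N * sizeof(float), cudaMemcpyDeviceToHost);
        cudaFree(d_input);
        cudaFree(d_output);
    }

    return result;
}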
Loading
