Commit: Merge remote-tracking branch 'refs/remotes/origin/main'
Showing 9 changed files with 467 additions and 19 deletions.
@@ -0,0 +1,66 @@

```cpp
#ifndef __REFERENCE_CUH__
#define __REFERENCE_CUH__

#include <tuple>
#include <vector>
#include <cstdlib>
#include <cmath>
#include <array>

#define N_SIZES 10
const int Ns[N_SIZES] = {128,  256,  512,   1024,  2048,
                         4096, 8192, 16384, 32768, 65536};

using input_t = std::array<std::vector<float>, N_SIZES>;
using output_t = input_t;

input_t generate_input() {
    input_t data;

    for (int i = 0; i < N_SIZES; ++i) {
        data[i].resize(Ns[i]);
        for (int j = 0; j < Ns[i]; ++j) {
            data[i][j] = static_cast<float>(rand()) / RAND_MAX;
        }
    }

    return data;
}

// The identity kernel
output_t reference(input_t data) {
    output_t out;

    for (int i = 0; i < N_SIZES; ++i) {
        out[i].resize(Ns[i]);
        for (int j = 0; j < Ns[i]; ++j) {
            out[i][j] = data[i][j];
        }
    }

    return out;
}

bool check_implementation(output_t out, output_t ref, float epsilon = 1e-5) {
    // input_t data = generate_input();
    // output_t reference_out = reference(data);
    bool same = true;

    for (int i = 0; i < N_SIZES; ++i) {
        auto ref_ptr = ref[i];
        auto out_ptr = out[i];

        for (int j = 0; j < Ns[i]; ++j) {
            if (std::fabs(ref_ptr[j] - out_ptr[j]) > epsilon) {
                same = false;
                break;
            }
        }
        if (!same)
            break;
    }

    return same;
}

#endif
```
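For orientation, here is a minimal sketch of a host-side harness that ties this header together, assuming a `submission()` with the signature used later in this commit. The file name and structure are illustrative, not part of the commit.

```cpp
// Hypothetical harness (not part of this commit): generate inputs, run a
// submission, and validate it against the identity reference.
#include <cstdio>
#include "reference.cuh"

output_t submission(input_t data);  // assumed to be provided by a submission file

int main() {
    input_t data = generate_input();
    output_t ref = reference(data);
    output_t out = submission(data);

    if (check_implementation(out, ref)) {
        std::printf("submission matches reference\n");
        return 0;
    }
    std::printf("submission does NOT match reference\n");
    return 1;
}
```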
@@ -0,0 +1,45 @@

```cuda
#include <array>
#include <vector>
#include "reference.cuh"

__global__ void copy_kernel(float *input, float *output, int N)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < N)
    {
        output[idx] = input[idx];
    }
}

output_t submission(input_t data)
{
    output_t result;

    for (int i = 0; i < N_SIZES; ++i)
    {
        int N = Ns[i];
        result[i].resize(N);

        // Allocate device memory
        float *d_input, *d_output;
        cudaMalloc(&d_input, N * sizeof(float));
        cudaMalloc(&d_output, N * sizeof(float));

        // Copy input to device
        cudaMemcpy(d_input, data[i].data(), N * sizeof(float), cudaMemcpyHostToDevice);

        // Launch kernel
        int blockSize = 256;
        int numBlocks = (N + blockSize - 1) / blockSize;
        copy_kernel<<<numBlocks, blockSize>>>(d_input, d_output, N);

        // Copy result back to host; this blocking cudaMemcpy on the default
        // stream also synchronizes with the kernel launch above
        cudaMemcpy(result[i].data(), d_output, N * sizeof(float), cudaMemcpyDeviceToHost);

        // Free device memory
        cudaFree(d_input);
        cudaFree(d_output);
    }

    return result;
}
```
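The submission above does not check the return codes of the CUDA runtime calls. A common pattern, shown here only as a sketch and not as part of the commit, is a small checking macro wrapped around each call:

```cuda
#include <cstdio>
#include <cstdlib>

// Illustrative only: abort with a readable message if a CUDA call fails.
#define CUDA_CHECK(call)                                                     \
    do {                                                                     \
        cudaError_t err__ = (call);                                          \
        if (err__ != cudaSuccess) {                                          \
            std::fprintf(stderr, "CUDA error %s at %s:%d\n",                 \
                         cudaGetErrorString(err__), __FILE__, __LINE__);     \
            std::exit(EXIT_FAILURE);                                         \
        }                                                                    \
    } while (0)

// Usage inside submission(), for example:
//   CUDA_CHECK(cudaMalloc(&d_input, N * sizeof(float)));
//   CUDA_CHECK(cudaMemcpy(d_input, data[i].data(), N * sizeof(float),
//                         cudaMemcpyHostToDevice));
```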
@@ -0,0 +1,64 @@

```python
from typing import List, Optional

import torch


def check_implementation(custom_output, ref_output) -> bool:
    for c, r in zip(custom_output, ref_output, strict=False):
        if not torch.allclose(c, r, atol=1e-5):
            print("mismatch found! custom implementation doesn't match reference.")
            return False

    return True


def ref_kernel(xs: List[torch.Tensor], dim: int = -1) -> List[torch.Tensor]:
    """
    Reference implementation of the Softmax function using PyTorch's predefined functions.

    Args:
        xs (List[torch.Tensor]): Input tensors.
        dim (int): Dimension along which to apply softmax.

    Returns:
        List[torch.Tensor]: Tensors after applying Softmax.
    """
    return [torch.nn.functional.softmax(x, dim=dim) for x in xs]


def generate_input(seed: Optional[int] = None, to_cuda: bool = True) -> List[torch.Tensor]:
    """
    Generates random input tensors of the specified shapes.

    Args:
        seed (int): Random seed for reproducibility.
        to_cuda (bool): Whether to use a GPU (CUDA or ROCm) or the CPU.

    Returns:
        List[torch.Tensor]: List of randomly generated tensors.
    """
    shapes = [(128, 64), (256, 64), (512, 64)]

    # Determine the device
    if to_cuda:
        if torch.cuda.is_available():  # Covers NVIDIA (CUDA) and AMD (ROCm) GPUs
            device = torch.device("cuda")
        elif torch.backends.mps.is_available():  # Apple Silicon via the MPS backend
            device = torch.device("mps")
        else:
            print("No compatible GPU found. Falling back to CPU.")
            device = torch.device("cpu")
    else:
        device = torch.device("cpu")

    if seed is not None:
        torch.manual_seed(seed)

    tensors = []
    for shape in shapes:
        tensors.append(torch.randn(shape, device=device))

    return tensors


if __name__ == "__main__":
    inputs = generate_input(seed=42)
    for idx, tensor in enumerate(inputs):
        print(f"Input Tensor {idx + 1} (Shape: {tensor.shape}):\n{tensor}")
```
@@ -0,0 +1,23 @@

```python
from typing import List

import torch


def custom_kernel(xs: List[torch.Tensor], dim: int = -1) -> List[torch.Tensor]:
    """
    Custom implementation of the Softmax function.

    Args:
        xs (List[torch.Tensor]): Input tensors.
        dim (int): Dimension along which to apply softmax.

    Returns:
        List[torch.Tensor]: Tensors after applying Softmax.
    """
    res = []
    for x in xs:
        # Shift by the per-row maximum for numerical stability
        x_shifted = x - torch.max(x, dim=dim, keepdim=True).values
        exp_x = torch.exp(x_shifted)
        softmax = exp_x / torch.sum(exp_x, dim=dim, keepdim=True)
        res.append(softmax)

    return res
```
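The max-subtraction in `custom_kernel` works because softmax is invariant to subtracting a constant along the reduction dimension. With $m = \max_j x_j$:

$$\frac{e^{x_i - m}}{\sum_j e^{x_j - m}} = \frac{e^{-m}\,e^{x_i}}{e^{-m}\sum_j e^{x_j}} = \frac{e^{x_i}}{\sum_j e^{x_j}}$$

Since every shifted exponent $x_i - m$ is at most zero, the exponentials stay in $(0, 1]$ and cannot overflow in float32.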
@@ -0,0 +1,63 @@

```cpp
#ifndef __REFERENCE_CUH__
#define __REFERENCE_CUH__

#include <tuple>
#include <vector>
#include <array>
#include <cstdlib>
#include <cmath>

#define N_SIZES 5
const int Ns[N_SIZES] = {
    32, 32, 32, 32, 32,
};

using input_t = std::array<std::vector<float>, N_SIZES>;
using output_t = input_t;

input_t generate_input() {
    input_t data;

    for (int i = 0; i < N_SIZES; ++i) {
        data[i].resize(Ns[i]);
        for (int j = 0; j < Ns[i]; ++j) {
            data[i][j] = static_cast<float>(rand()) / RAND_MAX;
        }
    }

    return data;
}

// The reference doubles each element (out = data + data)
output_t reference(input_t data) {
    output_t out;

    for (int i = 0; i < N_SIZES; ++i) {
        out[i].resize(Ns[i]);
        for (int j = 0; j < Ns[i]; ++j) {
            out[i][j] = data[i][j] + data[i][j];
        }
    }

    return out;
}

bool check_implementation(output_t out, output_t ref, float epsilon = 1e-5) {
    // input_t data = generate_input();
    // output_t reference_out = reference(data);
    bool same = true;

    for (int i = 0; i < N_SIZES; ++i) {
        auto ref_ptr = ref[i];
        auto out_ptr = out[i];

        for (int j = 0; j < Ns[i]; ++j) {
            if (std::fabs(ref_ptr[j] - out_ptr[j]) > epsilon) {
                same = false;
                break;
            }
        }
        if (!same)
            break;
    }

    return same;
}

#endif
```
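A submission for this doubling reference would mirror the copy-kernel file earlier in the commit, with only the kernel body changing. The kernel below is an illustration of that change, not part of the commit; the host-side allocation, copies, and launch would stay the same as in `submission()` above.

```cuda
// Hypothetical kernel for the doubling reference: out[idx] = in[idx] + in[idx].
__global__ void double_kernel(const float *input, float *output, int N)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < N)
    {
        output[idx] = input[idx] + input[idx];
    }
}
```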