Commit

Merge remote-tracking branch 'refs/remotes/origin/main'
alexzhang13 committed Jan 9, 2025
2 parents 0160468 + 7afd0bc commit 365d4ec
Showing 9 changed files with 467 additions and 19 deletions.
66 changes: 66 additions & 0 deletions examples/simple_cuda/reference.cuh
@@ -0,0 +1,66 @@
#ifndef __REFERENCE_CUH__
#define __REFERENCE_CUH__

#include <tuple>
#include <vector>
#include <cstdlib>
#include <cmath>
#include <array>

#define N_SIZES 10
const int Ns[N_SIZES] = {128,  256,  512,   1024,  2048,
                         4096, 8192, 16384, 32768, 65536};

using input_t = std::array<std::vector<float>, N_SIZES>;
using output_t = input_t;

input_t generate_input() {
    input_t data;

    for (int i = 0; i < N_SIZES; ++i) {
        data[i].resize(Ns[i]);
        for (int j = 0; j < Ns[i]; ++j) {
            data[i][j] = static_cast<float>(rand()) / RAND_MAX;
        }
    }

    return data;
}

// Identity reference: returns a copy of the input, computed on the host.
output_t reference(input_t data) {
    output_t out;

    for (int i = 0; i < N_SIZES; ++i) {
        out[i].resize(Ns[i]);
        for (int j = 0; j < Ns[i]; ++j) {
            out[i][j] = data[i][j];
        }
    }

    return out;
}

bool check_implementation(output_t out, output_t ref, float epsilon = 1e-5) {
    // input_t data = generate_input();
    // output_t reference_out = reference(data);
    bool same = true;

    for (int i = 0; i < N_SIZES; ++i) {
        auto ref_ptr = ref[i];
        auto out_ptr = out[i];

        for (int j = 0; j < Ns[i]; ++j) {
            if (std::fabs(ref_ptr[j] - out_ptr[j]) > epsilon) {
                same = false;
                break;
            }
        }
        if (!same)
            break;
    }

    return same;
}

#endif
45 changes: 45 additions & 0 deletions examples/simple_cuda/submission.cuh
@@ -0,0 +1,45 @@
#include <array>
#include <vector>
#include "reference.cuh"

__global__ void copy_kernel(float *input, float *output, int N)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < N)
    {
        output[idx] = input[idx];
    }
}

output_t submission(input_t data)
{
    output_t result;

    for (int i = 0; i < N_SIZES; ++i)
    {
        int N = Ns[i];
        result[i].resize(N);

        // Allocate device memory
        float *d_input, *d_output;
        cudaMalloc(&d_input, N * sizeof(float));
        cudaMalloc(&d_output, N * sizeof(float));

        // Copy input to device
        cudaMemcpy(d_input, data[i].data(), N * sizeof(float), cudaMemcpyHostToDevice);

        // Launch kernel
        int blockSize = 256;
        int numBlocks = (N + blockSize - 1) / blockSize;
        copy_kernel<<<numBlocks, blockSize>>>(d_input, d_output, N);

        // Copy result back to host
        cudaMemcpy(result[i].data(), d_output, N * sizeof(float), cudaMemcpyDeviceToHost);

        // Free device memory
        cudaFree(d_input);
        cudaFree(d_output);
    }

    return result;
}
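
Taken together, reference.cuh and submission.cuh are meant to be driven by an external harness that generates inputs, runs both implementations, and compares the results. A minimal sketch of such a driver follows; the main function and its messages are illustrative assumptions and are not part of this commit, only generate_input, submission, reference, and check_implementation come from the headers above.

// harness_sketch.cu -- hypothetical driver, not part of this commit.
// Build with nvcc alongside the two headers above.
#include <cstdio>
#include "reference.cuh"
#include "submission.cuh"

int main() {
    // Ten random float vectors with the sizes listed in Ns.
    input_t data = generate_input();

    // Run the CUDA copy submission and the host reference on the same data.
    output_t out = submission(data);
    output_t ref = reference(data);

    // Element-wise comparison within the default epsilon of 1e-5.
    if (check_implementation(out, ref)) {
        std::printf("submission matches reference\n");
        return 0;
    }
    std::printf("submission does not match reference\n");
    return 1;
}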
64 changes: 64 additions & 0 deletions examples/softmax_py/reference.py
@@ -0,0 +1,64 @@
from typing import List

import torch


def check_implementation(custom_output, ref_output) -> bool:
    for c, r in zip(custom_output, ref_output, strict=False):
        if not torch.allclose(c, r, atol=1e-5):
            print("mismatch found! custom implementation doesn't match reference.")
            return False

    return True


def ref_kernel(xs: List[torch.Tensor], dim: int = -1) -> List[torch.Tensor]:
    """
    Reference implementation of the Softmax function using PyTorch's predefined functions.
    Args:
        xs (List[torch.Tensor]): Input tensors.
        dim (int): Dimension along which to apply softmax.
    Returns:
        List[torch.Tensor]: Tensors after applying Softmax to each input.
    """

    return [torch.nn.functional.softmax(x, dim=dim) for x in xs]


def generate_input(seed: int = None, to_cuda: bool = True) -> List[torch.Tensor]:
    """
    Generates random input tensors of the specified shapes.
    Args:
        seed (int): Random seed for reproducibility.
        to_cuda (bool): Whether to use GPU (CUDA or ROCm) or CPU.
    Returns:
        List[torch.Tensor]: List of randomly generated tensors.
    """
    shapes = [(128, 64), (256, 64), (512, 64)]

    # Determine the device
    if to_cuda:
        if torch.cuda.is_available():  # Covers NVIDIA (CUDA) and AMD (ROCm) GPUs
            device = torch.device("cuda")
        elif torch.backends.mps.is_available():  # Apple GPUs via the MPS backend
            device = torch.device("mps")
        else:
            print("No compatible GPU found. Falling back to CPU.")
            device = torch.device("cpu")
    else:
        device = torch.device("cpu")

    if seed is not None:
        torch.manual_seed(seed)

    tensors = []
    for shape in shapes:
        tensors.append(torch.randn(shape, device=device))

    return tensors


if __name__ == "__main__":
    inputs = generate_input(seed=42)
    for idx, tensor in enumerate(inputs):
        print(f"Input Tensor {idx + 1} (Shape: {tensor.shape}):\n{tensor}")
23 changes: 23 additions & 0 deletions examples/softmax_py/submission.py
@@ -0,0 +1,23 @@
from typing import List

import torch


def custom_kernel(xs: List[torch.Tensor], dim: int = -1) -> List[torch.Tensor]:
    """
    Custom implementation of the Softmax function.
    Args:
        xs (List[torch.Tensor]): Input tensors.
        dim (int): Dimension along which to apply softmax.
    Returns:
        List[torch.Tensor]: Tensors after applying Softmax to each input.
    """
    res = []
    for x in xs:
        # Shift for numerical stability
        x_shifted = x - torch.max(x, dim=dim, keepdim=True).values
        exp_x = torch.exp(x_shifted)
        softmax = exp_x / torch.sum(exp_x, dim=dim, keepdim=True)
        res.append(softmax)

    return res
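
As in the CUDA example, reference.py and submission.py are meant to be exercised by an external runner. A minimal sketch of such a check is shown below; the driver script is an illustrative assumption and not part of this commit, only generate_input, ref_kernel, custom_kernel, and check_implementation come from the two files above.

# run_softmax_check.py -- hypothetical driver, not part of this commit.
# Assumes reference.py and submission.py are importable from the working directory.
from reference import check_implementation, generate_input, ref_kernel
from submission import custom_kernel

def main() -> None:
    xs = generate_input(seed=42)    # three random (N, 64) tensors
    custom_out = custom_kernel(xs)  # manual shift / exp / normalize softmax
    ref_out = ref_kernel(xs)        # torch.nn.functional.softmax baseline
    ok = check_implementation(custom_out, ref_out)
    print("match" if ok else "mismatch")

if __name__ == "__main__":
    main()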
63 changes: 63 additions & 0 deletions examples/thunderkittens_example/reference.cuh
@@ -0,0 +1,63 @@
#ifndef __REFERENCE_CUH__
#define __REFERENCE_CUH__

#include <tuple>
#include <vector>
#include <cstdlib>
#include <cmath>
#include <array>

#define N_SIZES 5
const int Ns[N_SIZES] = {
    32, 32, 32, 32, 32,
};

using input_t = std::array<std::vector<float>, N_SIZES>;
using output_t = input_t;

input_t generate_input() {
    input_t data;

    for (int i = 0; i < N_SIZES; ++i) {
        data[i].resize(Ns[i]);
        for (int j = 0; j < Ns[i]; ++j) {
            data[i][j] = static_cast<float>(rand()) / RAND_MAX;
        }
    }

    return data;
}

// Reference: element-wise doubling of each input vector, computed on the host.
output_t reference(input_t data) {
    output_t out;

    for (int i = 0; i < N_SIZES; ++i) {
        out[i].resize(Ns[i]);
        for (int j = 0; j < Ns[i]; ++j) {
            out[i][j] = data[i][j] + data[i][j];
        }
    }

    return out;
}

bool check_implementation(output_t out, output_t ref, float epsilon = 1e-5) {
    // input_t data = generate_input();
    // output_t reference_out = reference(data);
    bool same = true;

    for (int i = 0; i < N_SIZES; ++i) {
        auto ref_ptr = ref[i];
        auto out_ptr = out[i];

        for (int j = 0; j < Ns[i]; ++j) {
            if (std::fabs(ref_ptr[j] - out_ptr[j]) > epsilon) {
                same = false;
                break;
            }
        }
        if (!same)
            break;
    }

    return same;
}

#endif
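
The matching ThunderKittens submission is among the files that are not expanded in this view, so it is not reproduced here. Purely for illustration, a plain-CUDA sketch that would satisfy this doubling reference, using no ThunderKittens primitives at all, could look like the following; the kernel name and structure are assumptions rather than code from this commit.

// Hypothetical plain-CUDA stand-in for the ThunderKittens submission (not in this commit).
#include "reference.cuh"

__global__ void double_kernel(const float *input, float *output, int N)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < N)
    {
        output[idx] = input[idx] + input[idx];  // matches the reference's x + x
    }
}

output_t submission(input_t data)
{
    output_t result;

    for (int i = 0; i < N_SIZES; ++i)
    {
        int N = Ns[i];
        result[i].resize(N);

        float *d_input, *d_output;
        cudaMalloc(&d_input, N * sizeof(float));
        cudaMalloc(&d_output, N * sizeof(float));
        cudaMemcpy(d_input, data[i].data(), N * sizeof(float), cudaMemcpyHostToDevice);

        int blockSize = 256;
        int numBlocks = (N + blockSize - 1) / blockSize;
        double_kernel<<<numBlocks, blockSize>>>(d_input, d_output, N);

        cudaMemcpy(result[i].data(), d_output, N * sizeof(float), cudaMemcpyDeviceToHost);
        cudaFree(d_input);
        cudaFree(d_output);
    }

    return result;
}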
Loading
