Commit

few more <1024 impls
drisspg committed May 19, 2024
1 parent fc4ded5 commit bf84b72
Showing 2 changed files with 67 additions and 15 deletions.
16 changes: 9 additions & 7 deletions CMakeLists.txt
@@ -27,8 +27,15 @@ target_include_directories(SimpleCudaLib PUBLIC src/include)
# Add fmtlib
add_subdirectory(third_party/fmt)

# CUDA Flags
set(EXTRA_CUDA_FLAGS "${EXTRA_CUDA_FLAGS} -lineinfo")
# Check if building in Debug mode
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
  # Debug build: -G embeds device debug info and disables most device-side optimizations; -g adds host debug symbols
set(EXTRA_CUDA_FLAGS "${EXTRA_CUDA_FLAGS} -G -g")
else()
  # Outside Debug builds, -lineinfo keeps optimizations but adds source-line correlation for profilers such as ncu
set(EXTRA_CUDA_FLAGS "${EXTRA_CUDA_FLAGS} -lineinfo")
endif()

option (SHOW_PTXAS_INFO "Show ptxas info" OFF)
if(SHOW_PTXAS_INFO)
set(EXTRA_CUDA_FLAGS "${EXTRA_CUDA_FLAGS} -Xptxas -v")
@@ -46,11 +53,6 @@ foreach(EXAMPLE_SOURCE ${EXAMPLE_SOURCES})
set_target_properties(${EXAMPLE_NAME} PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
set_target_properties(${EXAMPLE_NAME} PROPERTIES CUDA_ARCHITECTURES 90a)

# Check if building in Debug mode and add -G flag
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
set(EXTRA_CUDA_FLAGS "${EXTRA_CUDA_FLAGS} -G -g")
endif()

# Convert the flags string into a list of flags
separate_arguments(EXTRA_CUDA_FLAGS_LIST UNIX_COMMAND "${EXTRA_CUDA_FLAGS}")

66 changes: 58 additions & 8 deletions examples/chapter10/reduce1d.cu
@@ -5,8 +5,9 @@
#include <fmt/core.h>
#include <fmt/ranges.h>

#include <numeric>
#include <cmath>
#include <numeric>
#include <optional>
#include <thrust/device_vector.h>
#include <thrust/fill.h>
#include <thrust/host_vector.h>
@@ -29,6 +30,37 @@ __global__ void Reduce1dInplace(float *input, float *output, const int numel) {
}
}

__global__ void Reduce1dInplaceBetterOrdering(float *input, float *output,
const int numel) {
const int i = threadIdx.x;
for (unsigned stride{blockDim.x}; stride >= 1; stride /= 2) {
if (i < stride) {
input[i] += input[i + stride];
}
__syncthreads();
}

if (threadIdx.x == 0) {
*output = input[0];
}
}
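
For contrast, the body of the original Reduce1dInplace is collapsed in the hunk above. Assuming it follows the usual chapter-10 baseline (each thread owns element 2 * threadIdx.x and the stride grows from 1), a minimal sketch would look like the kernel below; the name Reduce1dInplaceNaiveSketch and its body are illustrative, not part of this commit. The advantage of Reduce1dInplaceBetterOrdering is that its shrinking stride keeps the active threads packed at the low thread indices, so whole warps go idle together instead of every warp staying partially divergent.

__global__ void Reduce1dInplaceNaiveSketch(float *input, float *output,
                                           const int numel) {
  // Each thread owns input[2 * threadIdx.x]; assumes numel == 2 * blockDim.x.
  const unsigned i = 2 * threadIdx.x;
  for (unsigned stride{1}; stride <= blockDim.x; stride *= 2) {
    // The set of active threads thins out every iteration (every 2nd thread,
    // then every 4th, ...), so most warps mix active and inactive lanes.
    if (threadIdx.x % stride == 0) {
      input[i] += input[i + stride];
    }
    __syncthreads();
  }
  if (threadIdx.x == 0) {
    *output = input[0];
  }
}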

__global__ void Reduce1dShared(float *input, float *output, const int numel) {
const int i = threadIdx.x;
extern __shared__ float shmem[];
  for (unsigned stride{blockDim.x}; stride >= 1; stride /= 2) {
    if (i < stride) {
      // First pass folds global memory into shared memory; later passes
      // reduce within shared memory only.
      shmem[i] = stride == blockDim.x ? input[i] + input[i + stride]
                                      : shmem[i] + shmem[i + stride];
    }
    // Synchronize outside the branch: __syncthreads() must be reached by
    // every thread in the block, not just the still-active ones.
    __syncthreads();
  }

if (threadIdx.x == 0) {
*output = shmem[0];
}
}
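
// Note on sizing the extern __shared__ array above: it is allocated at launch
// time via the third <<<grid, block, nbytes>>> argument; main() below passes
// block.x * sizeof(float), i.e. one float per thread. Illustrative launch,
// assuming numel == 2 * blockDim.x and hypothetical device pointers
// d_input / d_output:
//   Reduce1dShared<<<dim3(1), dim3(numel / 2), (numel / 2) * sizeof(float)>>>(
//       d_input, d_output, numel);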

float cpp_kernel(std::vector<float> &input) {
const auto n_elements = input.size();
std::vector<float> input_copy(input.size());
@@ -37,7 +69,8 @@ float cpp_kernel(std::vector<float> &input) {
return out;
}

void Test(KernelFunc func, const size_t numel, dim3 grid, dim3 block) {
void Test(KernelFunc func, const size_t numel, dim3 grid, dim3 block,
std::optional<size_t> shmem) {
one_d tensor_extents({numel});

HostTensor<float, one_d> input_vec(tensor_extents);
@@ -49,8 +82,14 @@ void Test(KernelFunc func, const size_t numel, dim3 grid, dim3 block) {
auto input_vec_d = input_vec.to_device();
auto out_sum_d = out_sum.to_device();

func<<<grid, block>>>(input_vec_d.data_ptr(), out_sum_d.data_ptr(),
tensor_extents.numel());
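  // Kernels that declare an extern __shared__ array need the optional third
  // launch-configuration argument to size that allocation, so the dynamic
  // shared-memory size is forwarded only when one was provided.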
if (shmem.has_value()) {
func<<<grid, block, shmem.value()>>>(
input_vec_d.data_ptr(), out_sum_d.data_ptr(), tensor_extents.numel());

} else {
func<<<grid, block>>>(input_vec_d.data_ptr(), out_sum_d.data_ptr(),
tensor_extents.numel());
}
cudaCheckErrors("kernel launch failure");
cudaDeviceSynchronize();

@@ -62,7 +101,7 @@ void Test(KernelFunc func, const size_t numel, dim3 grid, dim3 block) {
const auto cpp_anwser = cpp_kernel(input_vector);

float diff = fabs(cpp_anwser - host_output_ptr[0]);
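  // The GPU reduces in tree order while the CPU reference sums sequentially,
  // so float rounding differs slightly; allow a looser absolute tolerance.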
if (diff > 1e-3) {
if (diff > 5e-3) {
std::string error_string = "Houston we have a problem!\n";
error_string += fmt::format("Found a deviation of {}\n", diff);
error_string += fmt::format("Cpp anwser: {}, GPU anwser: {}\n", cpp_anwser,
@@ -75,13 +114,24 @@

int main() {
  // 1D reduction problem size and launch configuration
constexpr int max_length = 1024;
constexpr int block_size = max_length/2;
constexpr int max_length = 2048;
constexpr int block_size = max_length / 2;

dim3 grid(1);
dim3 block(block_size);

Test(Reduce1dInplace, max_length, grid, block);
  // Base case: in-place writes with the divergent (bad) stride ordering
  fmt::print("• Reduce1dInplace Test: ");
Test(Reduce1dInplace, max_length, grid, block, std::nullopt);

  // In-place writes with the improved, convergent stride ordering
  fmt::print("• Reduce1dInplaceBetterOrdering Test: ");
Test(Reduce1dInplaceBetterOrdering, max_length, grid, block, std::nullopt);

  // Dynamic shared-memory version: one float of shmem per thread
fmt::print("• Reduce1dShared Test: ");
size_t shmem{block.x * sizeof(float)};
Test(Reduce1dShared, max_length, grid, block, shmem);

// profile the relevant kernels:
  // ncu -k "regex:Reduce" ./bin/reduce1d
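
As a quick check of the launch arithmetic in main(): each variant runs as a single block of block_size threads, every thread folds two input elements on its first pass (so max_length = 2 * block_size), and the shared-memory variant requests one float of dynamic shmem per thread. A compile-time restatement of those numbers (a sketch; the k-prefixed names are illustrative):

constexpr int kMaxLength = 2048;            // max_length above
constexpr int kBlockSize = kMaxLength / 2;  // 1024 threads in one block
static_assert(kBlockSize <= 1024,
              "a single CUDA block cannot exceed 1024 threads");
constexpr auto kShmemBytes = kBlockSize * sizeof(float);  // one float per thread
static_assert(kShmemBytes == 1024 * sizeof(float));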
