Commit

few more <1024 impls
drisspg committed May 19, 2024
1 parent fc4ded5 commit bf84b72
Showing 2 changed files with 67 additions and 15 deletions.
16 changes: 9 additions & 7 deletions CMakeLists.txt
@@ -27,8 +27,15 @@ target_include_directories(SimpleCudaLib PUBLIC src/include)
# Add fmtlib
add_subdirectory(third_party/fmt)

# CUDA Flags
set(EXTRA_CUDA_FLAGS "${EXTRA_CUDA_FLAGS} -lineinfo")
# Check if building in Debug mode
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
  # Debug build: -G embeds device debug info and disables most device-side optimizations; -g adds host debug symbols
set(EXTRA_CUDA_FLAGS "${EXTRA_CUDA_FLAGS} -G -g")
else()
  # Outside Debug builds, -lineinfo keeps optimizations but adds source-line correlation for profilers such as ncu
set(EXTRA_CUDA_FLAGS "${EXTRA_CUDA_FLAGS} -lineinfo")
endif()

option (SHOW_PTXAS_INFO "Show ptxas info" OFF)
if(SHOW_PTXAS_INFO)
set(EXTRA_CUDA_FLAGS "${EXTRA_CUDA_FLAGS} -Xptxas -v")
@@ -46,11 +53,6 @@ foreach(EXAMPLE_SOURCE ${EXAMPLE_SOURCES})
set_target_properties(${EXAMPLE_NAME} PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
set_target_properties(${EXAMPLE_NAME} PROPERTIES CUDA_ARCHITECTURES 90a)

# Check if building in Debug mode and add -G flag
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
set(EXTRA_CUDA_FLAGS "${EXTRA_CUDA_FLAGS} -G -g")
endif()

# Convert the flags string into a list of flags
separate_arguments(EXTRA_CUDA_FLAGS_LIST UNIX_COMMAND "${EXTRA_CUDA_FLAGS}")

66 changes: 58 additions & 8 deletions examples/chapter10/reduce1d.cu
@@ -5,8 +5,9 @@
#include <fmt/core.h>
#include <fmt/ranges.h>

#include <numeric>
#include <cmath>
#include <numeric>
#include <optional>
#include <thrust/device_vector.h>
#include <thrust/fill.h>
#include <thrust/host_vector.h>
@@ -29,6 +30,37 @@ __global__ void Reduce1dInplace(float *input, float *output, const int numel) {
}
}

__global__ void Reduce1dInplaceBetterOrdering(float *input, float *output,
const int numel) {
const int i = threadIdx.x;
for (unsigned stride{blockDim.x}; stride >= 1; stride /= 2) {
if (i < stride) {
input[i] += input[i + stride];
}
__syncthreads();
}

if (threadIdx.x == 0) {
*output = input[0];
}
}
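
For contrast, the body of the original Reduce1dInplace is collapsed in the hunk above. Assuming it follows the usual chapter-10 baseline (each thread owns element 2 * threadIdx.x and the stride grows from 1), a minimal sketch would look like the kernel below; the name Reduce1dInplaceNaiveSketch and its body are illustrative, not part of this commit. The advantage of Reduce1dInplaceBetterOrdering is that its shrinking stride keeps the active threads packed at the low thread indices, so whole warps go idle together instead of every warp staying partially divergent.

__global__ void Reduce1dInplaceNaiveSketch(float *input, float *output,
                                           const int numel) {
  // Each thread owns input[2 * threadIdx.x]; assumes numel == 2 * blockDim.x.
  const unsigned i = 2 * threadIdx.x;
  for (unsigned stride{1}; stride <= blockDim.x; stride *= 2) {
    // The set of active threads thins out every iteration (every 2nd thread,
    // then every 4th, ...), so most warps mix active and inactive lanes.
    if (threadIdx.x % stride == 0) {
      input[i] += input[i + stride];
    }
    __syncthreads();
  }
  if (threadIdx.x == 0) {
    *output = input[0];
  }
}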

__global__ void Reduce1dShared(float *input, float *output, const int numel) {
const int i = threadIdx.x;
extern __shared__ float shmem[];
  for (unsigned stride{blockDim.x}; stride >= 1; stride /= 2) {
    if (i < stride) {
      // First pass folds global memory into shared memory; later passes
      // reduce within shared memory only.
      shmem[i] = stride == blockDim.x ? input[i] + input[i + stride]
                                      : shmem[i] + shmem[i + stride];
    }
    // Synchronize outside the branch: __syncthreads() must be reached by
    // every thread in the block, not just the still-active ones.
    __syncthreads();
  }

if (threadIdx.x == 0) {
*output = shmem[0];
}
}
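
// Note on sizing the extern __shared__ array above: it is allocated at launch
// time via the third <<<grid, block, nbytes>>> argument; main() below passes
// block.x * sizeof(float), i.e. one float per thread. Illustrative launch,
// assuming numel == 2 * blockDim.x and hypothetical device pointers
// d_input / d_output:
//   Reduce1dShared<<<dim3(1), dim3(numel / 2), (numel / 2) * sizeof(float)>>>(
//       d_input, d_output, numel);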

float cpp_kernel(std::vector<float> &input) {
const auto n_elements = input.size();
std::vector<float> input_copy(input.size());
@@ -37,7 +69,8 @@ float cpp_kernel(std::vector<float> &input) {
return out;
}

void Test(KernelFunc func, const size_t numel, dim3 grid, dim3 block) {
void Test(KernelFunc func, const size_t numel, dim3 grid, dim3 block,
std::optional<size_t> shmem) {
one_d tensor_extents({numel});

HostTensor<float, one_d> input_vec(tensor_extents);
@@ -49,8 +82,14 @@ void Test(KernelFunc func, const size_t numel, dim3 grid, dim3 block) {
auto input_vec_d = input_vec.to_device();
auto out_sum_d = out_sum.to_device();

func<<<grid, block>>>(input_vec_d.data_ptr(), out_sum_d.data_ptr(),
tensor_extents.numel());
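  // Kernels that declare an extern __shared__ array need the optional third
  // launch-configuration argument to size that allocation, so the dynamic
  // shared-memory size is forwarded only when one was provided.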
if (shmem.has_value()) {
func<<<grid, block, shmem.value()>>>(
input_vec_d.data_ptr(), out_sum_d.data_ptr(), tensor_extents.numel());

} else {
func<<<grid, block>>>(input_vec_d.data_ptr(), out_sum_d.data_ptr(),
tensor_extents.numel());
}
cudaCheckErrors("kernel launch failure");
cudaDeviceSynchronize();

@@ -62,7 +101,7 @@ void Test(KernelFunc func, const size_t numel, dim3 grid, dim3 block) {
const auto cpp_anwser = cpp_kernel(input_vector);

float diff = fabs(cpp_anwser - host_output_ptr[0]);
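  // The GPU reduces in tree order while the CPU reference sums sequentially,
  // so float rounding differs slightly; allow a looser absolute tolerance.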
if (diff > 1e-3) {
if (diff > 5e-3) {
std::string error_string = "Houston we have a problem!\n";
error_string += fmt::format("Found a deviation of {}\n", diff);
error_string += fmt::format("Cpp anwser: {}, GPU anwser: {}\n", cpp_anwser,
@@ -75,13 +114,24 @@

int main() {
  // 1D reduction problem size and launch configuration
constexpr int max_length = 1024;
constexpr int block_size = max_length/2;
constexpr int max_length = 2048;
constexpr int block_size = max_length / 2;

dim3 grid(1);
dim3 block(block_size);

Test(Reduce1dInplace, max_length, grid, block);
  // Base case: in-place writes with the divergent (bad) stride ordering
  fmt::print("• Reduce1dInplace Test: ");
Test(Reduce1dInplace, max_length, grid, block, std::nullopt);

  // In-place writes with the improved, convergent stride ordering
  fmt::print("• Reduce1dInplaceBetterOrdering Test: ");
Test(Reduce1dInplaceBetterOrdering, max_length, grid, block, std::nullopt);

  // Dynamic shared-memory version: one float of shmem per thread
fmt::print("• Reduce1dShared Test: ");
size_t shmem{block.x * sizeof(float)};
Test(Reduce1dShared, max_length, grid, block, shmem);

// profile the relevant kernels:
  // ncu -k "regex:Reduce" ./bin/reduce1d
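
As a quick check of the launch arithmetic in main(): each variant runs as a single block of block_size threads, every thread folds two input elements on its first pass (so max_length = 2 * block_size), and the shared-memory variant requests one float of dynamic shmem per thread. A compile-time restatement of those numbers (a sketch; the k-prefixed names are illustrative):

constexpr int kMaxLength = 2048;            // max_length above
constexpr int kBlockSize = kMaxLength / 2;  // 1024 threads in one block
static_assert(kBlockSize <= 1024,
              "a single CUDA block cannot exceed 1024 threads");
constexpr auto kShmemBytes = kBlockSize * sizeof(float);  // one float per thread
static_assert(kShmemBytes == 1024 * sizeof(float));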
