Skip to content

Commit

Permalink
[NVPTX] Add intrinsics for st.bulk instruction
Browse files Browse the repository at this point in the history
Adds NVVM intrinsics and NVPTX codegen for the `st.bulk` instruction,
introduced in PTX ISA 8.6 for sm_100. Tests are added in
`CodeGen/NVPTX/st_bulk.ll` and verified with ptxas 12.8.0.

PTX Spec Reference: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-st-bulk
  • Loading branch information
Wolfram70 committed Feb 27, 2025
1 parent e3f5269 commit dd96135
Show file tree
Hide file tree
Showing 3 changed files with 61 additions and 0 deletions.
14 changes: 14 additions & 0 deletions llvm/include/llvm/IR/IntrinsicsNVVM.td
Original file line number Diff line number Diff line change
Expand Up @@ -5261,4 +5261,18 @@ foreach shape = ["16x64b", "16x128b", "16x256b", "32x32b", "16x32bx2"] in {
}
}

//
// Bulk store intrinsics
//

// llvm.nvvm.st.bulk(dest, size, initval)
//
// Operands:
//   0: destination pointer (llvm_global_ptr_ty). Written to, never read
//      (WriteOnly) and never captured (NoCapture).
//   1: i64 size operand.
//   2: i64 initval; must be a compile-time immediate (ImmArg). The only
//      instruction pattern for this intrinsic matches an initval of 0.
//
// The intrinsic touches memory only through its pointer argument
// (IntrArgMemOnly) and only writes it (IntrWriteMem).
//
// Declared with DefaultAttrsIntrinsic rather than plain Intrinsic so that the
// default intrinsic attributes (nocallback, nosync, nofree, willreturn) are
// applied; a bulk store has no need to opt out of any of them.
//
// NOTE(review): the instruction selected for this intrinsic is named
// INT_NVVM_ST_BULK_GENERIC and prints `st.bulk` without a state-space
// qualifier, yet the pointer here is typed as a global pointer. Confirm
// whether a generic pointer (llvm_ptr_ty) was intended; changing it also
// requires updating the declaration in CodeGen/NVPTX/st_bulk.ll.
def int_nvvm_st_bulk : DefaultAttrsIntrinsic<[],
    [llvm_global_ptr_ty, llvm_i64_ty, llvm_i64_ty],
    [IntrArgMemOnly, IntrWriteMem,
     WriteOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>, ImmArg<ArgIndex<2>>]>;

// llvm.nvvm.st.bulk.shared.cta(dest, size, initval)
//
// Operands:
//   0: destination pointer in the shared address space (llvm_shared_ptr_ty).
//      Written to, never read (WriteOnly) and never captured (NoCapture).
//   1: i64 size operand.
//   2: i64 initval; must be a compile-time immediate (ImmArg). The only
//      instruction pattern for this intrinsic matches an initval of 0.
//
// The intrinsic touches memory only through its pointer argument
// (IntrArgMemOnly) and only writes it (IntrWriteMem).
//
// Declared with DefaultAttrsIntrinsic rather than plain Intrinsic so that the
// default intrinsic attributes (nocallback, nosync, nofree, willreturn) are
// applied; a bulk store has no need to opt out of any of them.
def int_nvvm_st_bulk_shared_cta : DefaultAttrsIntrinsic<[],
    [llvm_shared_ptr_ty, llvm_i64_ty, llvm_i64_ty],
    [IntrArgMemOnly, IntrWriteMem,
     WriteOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>, ImmArg<ArgIndex<2>>]>;

} // let TargetPrefix = "nvvm"
14 changes: 14 additions & 0 deletions llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
Original file line number Diff line number Diff line change
Expand Up @@ -7816,3 +7816,17 @@ foreach shape = ["16x64b", "16x128b", "16x256b", "32x32b", "16x32bx2"] in {
}

} // isConvergent

// Bulk store instructions

// Selects a call to int_nvvm_st_bulk whose immediate third operand is the
// constant 0 and prints it as `st.bulk [$dest_addr], $size, 0;` (no
// state-space qualifier on the mnemonic).
//   $dest_addr - destination address, taken from a 64-bit register.
//   $size      - size operand, taken from a 64-bit register
//                (presumably a byte count -- confirm against the PTX spec).
// Only available when the subtarget has SM 100 and PTX 8.6.
// NOTE(review): this pattern matches only an initval of 0; a call with any
// other immediate would not be selected by this definition.
def INT_NVVM_ST_BULK_GENERIC :
NVPTXInst<(outs), (ins Int64Regs:$dest_addr, Int64Regs:$size),
"st.bulk [$dest_addr], $size, 0;",
[(int_nvvm_st_bulk i64:$dest_addr, i64:$size, (i64 0))]>,
Requires<[hasSM<100>, hasPTX<86>]>;

// Selects a call to int_nvvm_st_bulk_shared_cta whose immediate third operand
// is the constant 0 and prints it as
// `st.bulk.shared::cta [$dest_addr], $size, 0;`.
//   $dest_addr - destination address, taken from a 64-bit register.
//   $size      - size operand, taken from a 64-bit register
//                (presumably a byte count -- confirm against the PTX spec).
// Only available when the subtarget has SM 100 and PTX 8.6.
// NOTE(review): the shared-space pointer is matched as i64 here; confirm that
// selection still works in configurations where shared pointers are 32 bits
// wide. As above, only an initval of 0 is matched.
def INT_NVVM_ST_BULK_SHARED_CTA:
NVPTXInst<(outs), (ins Int64Regs:$dest_addr, Int64Regs:$size),
"st.bulk.shared::cta [$dest_addr], $size, 0;",
[(int_nvvm_st_bulk_shared_cta i64:$dest_addr, i64:$size, (i64 0))]>,
Requires<[hasSM<100>, hasPTX<86>]>;
33 changes: 33 additions & 0 deletions llvm/test/CodeGen/NVPTX/st_bulk.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86 | FileCheck %s
; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86 | %ptxas-verify -arch=sm_100 %}

; Lowering of llvm.nvvm.st.bulk: with a zero immediate as the third operand,
; the call is expected to become one `st.bulk [addr], size, 0;` whose address
; and size come straight from the two loaded kernel parameters.
declare void @llvm.nvvm.st.bulk(ptr addrspace(1), i64, i64)
define void @st_bulk(ptr addrspace(1) %dest_addr, i64 %size) {
; CHECK-LABEL: st_bulk(
; CHECK: {
; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u64 %rd1, [st_bulk_param_0];
; CHECK-NEXT: ld.param.u64 %rd2, [st_bulk_param_1];
; CHECK-NEXT: st.bulk [%rd1], %rd2, 0;
; CHECK-NEXT: ret;
call void @llvm.nvvm.st.bulk(ptr addrspace(1) %dest_addr, i64 %size, i64 0)
ret void
}

; Lowering of llvm.nvvm.st.bulk.shared.cta: with a zero immediate as the third
; operand, the call is expected to become one
; `st.bulk.shared::cta [addr], size, 0;` whose address and size come straight
; from the two loaded kernel parameters.
declare void @llvm.nvvm.st.bulk.shared.cta(ptr addrspace(3), i64, i64)
define void @st_bulk_shared_cta(ptr addrspace(3) %dest_addr, i64 %size) {
; CHECK-LABEL: st_bulk_shared_cta(
; CHECK: {
; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u64 %rd1, [st_bulk_shared_cta_param_0];
; CHECK-NEXT: ld.param.u64 %rd2, [st_bulk_shared_cta_param_1];
; CHECK-NEXT: st.bulk.shared::cta [%rd1], %rd2, 0;
; CHECK-NEXT: ret;
call void @llvm.nvvm.st.bulk.shared.cta(ptr addrspace(3) %dest_addr, i64 %size, i64 0)
ret void
}

0 comments on commit dd96135

Please sign in to comment.