Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[NVPTX] Add intrinsics for st.bulk instruction #128856

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions llvm/include/llvm/IR/IntrinsicsNVVM.td
Original file line number Diff line number Diff line change
Expand Up @@ -5261,4 +5261,18 @@ foreach shape = ["16x64b", "16x128b", "16x256b", "32x32b", "16x32bx2"] in {
}
}

//
// Bulk store intrinsics
//

// st.bulk without a state-space qualifier.
// Operands, in order: destination pointer, size (i64), and an i64 initial
// value that must be an immediate (ImmArg). The NVPTX selection pattern for
// this intrinsic only matches the constant 0 for that last operand.
// The call is modelled as writing only through its (non-captured) pointer
// argument. DefaultAttrsIntrinsic is used instead of plain Intrinsic so the
// call also carries LLVM's default intrinsic attributes; the IR signature is
// unchanged.
// NOTE(review): the instruction emitted for this intrinsic has no state-space
// qualifier, yet the pointer is typed as global (addrspace 1) -- confirm
// whether a generic pointer (llvm_ptr_ty) was intended.
def int_nvvm_st_bulk : DefaultAttrsIntrinsic<[],
  [llvm_global_ptr_ty, llvm_i64_ty, llvm_i64_ty],
  [IntrArgMemOnly, IntrWriteMem,
   WriteOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>, ImmArg<ArgIndex<2>>]>;

// st.bulk on a pointer in the shared address space (llvm_shared_ptr_ty).
// Operands, in order: destination pointer, size (i64), and an i64 initial
// value that must be an immediate (ImmArg). The NVPTX selection patterns for
// this intrinsic only match the constant 0 for that last operand.
// The call is modelled as writing only through its (non-captured) pointer
// argument. DefaultAttrsIntrinsic is used instead of plain Intrinsic so the
// call also carries LLVM's default intrinsic attributes; the IR signature is
// unchanged.
def int_nvvm_st_bulk_shared_cta : DefaultAttrsIntrinsic<[],
  [llvm_shared_ptr_ty, llvm_i64_ty, llvm_i64_ty],
  [IntrArgMemOnly, IntrWriteMem,
   WriteOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>, ImmArg<ArgIndex<2>>]>;

} // let TargetPrefix = "nvvm"
20 changes: 20 additions & 0 deletions llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
Original file line number Diff line number Diff line change
Expand Up @@ -7816,3 +7816,23 @@ foreach shape = ["16x64b", "16x128b", "16x256b", "32x32b", "16x32bx2"] in {
}

} // isConvergent

// Bulk store instructions

// Selects int_nvvm_st_bulk when its third operand is the constant 0 and
// prints the unqualified "st.bulk" form with a literal 0 initial value.
// Both the address and the size are taken from 64-bit registers.
// Only available when the hasSM<100> and hasPTX<86> predicates hold.
def INT_NVVM_ST_BULK_GENERIC :
  NVPTXInst<(outs),
            (ins Int64Regs:$dest_addr, Int64Regs:$size),
            "st.bulk [$dest_addr], $size, 0;",
            [(int_nvvm_st_bulk i64:$dest_addr, i64:$size, (i64 0))]>,
  Requires<[hasSM<100>, hasPTX<86>]>;

// Selects int_nvvm_st_bulk_shared_cta when the address is an i64 value and the
// third operand is the constant 0; prints "st.bulk.shared::cta" with a literal
// 0 initial value. A sibling record, INT_NVVM_ST_BULK_SHARED_CTA_SHARED32,
// matches the same intrinsic when the address is an i32 value.
def INT_NVVM_ST_BULK_SHARED_CTA:
NVPTXInst<(outs), (ins Int64Regs:$dest_addr, Int64Regs:$size),
"st.bulk.shared::cta [$dest_addr], $size, 0;",
[(int_nvvm_st_bulk_shared_cta i64:$dest_addr, i64:$size, (i64 0))]>,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This may run into problems if we enable "short" pointers to shared memory.

Search for `-nvptx-short-ptr` in the LLVM tests.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Perhaps this would be a good use case for the ADDR operand and addr pattern added in #129102.

Requires<[hasSM<100>, hasPTX<86>]>;

// Variant of the st.bulk.shared::cta selection for an i32 address: the
// destination address comes from a 32-bit register while the size stays in a
// 64-bit register. Matches int_nvvm_st_bulk_shared_cta only when the third
// operand is the constant 0, and prints that 0 literally.
// Only available when the hasSM<100> and hasPTX<86> predicates hold.
def INT_NVVM_ST_BULK_SHARED_CTA_SHARED32 :
  NVPTXInst<(outs),
            (ins Int32Regs:$dest_addr, Int64Regs:$size),
            "st.bulk.shared::cta [$dest_addr], $size, 0;",
            [(int_nvvm_st_bulk_shared_cta i32:$dest_addr, i64:$size, (i64 0))]>,
  Requires<[hasSM<100>, hasPTX<86>]>;
46 changes: 46 additions & 0 deletions llvm/test/CodeGen/NVPTX/st_bulk.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86 | FileCheck --check-prefixes=CHECK,CHECK-PTX64 %s
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86 --nvptx-short-ptr | FileCheck --check-prefixes=CHECK,CHECK-PTX-SHARED32 %s
; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86 | %ptxas-verify -arch=sm_100 %}
; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86 --nvptx-short-ptr | %ptxas-verify -arch=sm_100 %}

; Lowering of llvm.nvvm.st.bulk with a run-time size and an immediate 0 as the
; third operand. The expected PTX is shared by both llc configurations above:
; two 64-bit parameter loads followed by a single unqualified st.bulk.
declare void @llvm.nvvm.st.bulk(ptr addrspace(1), i64, i64)
define void @st_bulk(ptr addrspace(1) %dest_addr, i64 %size) {
; CHECK-LABEL: st_bulk(
; CHECK: {
; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u64 %rd1, [st_bulk_param_0];
; CHECK-NEXT: ld.param.u64 %rd2, [st_bulk_param_1];
; CHECK-NEXT: st.bulk [%rd1], %rd2, 0;
; CHECK-NEXT: ret;
call void @llvm.nvvm.st.bulk(ptr addrspace(1) %dest_addr, i64 %size, i64 0)
ret void
}

; Lowering of llvm.nvvm.st.bulk.shared.cta with a run-time size and an
; immediate 0 as the third operand. The expected PTX differs between the two
; llc configurations: by default the shared address is loaded into a 64-bit
; register, while with --nvptx-short-ptr it is loaded into a 32-bit register.
declare void @llvm.nvvm.st.bulk.shared.cta(ptr addrspace(3), i64, i64)
define void @st_bulk_shared_cta(ptr addrspace(3) %dest_addr, i64 %size) {
; CHECK-PTX64-LABEL: st_bulk_shared_cta(
; CHECK-PTX64: {
; CHECK-PTX64-NEXT: .reg .b64 %rd<3>;
; CHECK-PTX64-EMPTY:
; CHECK-PTX64-NEXT: // %bb.0:
; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [st_bulk_shared_cta_param_0];
; CHECK-PTX64-NEXT: ld.param.u64 %rd2, [st_bulk_shared_cta_param_1];
; CHECK-PTX64-NEXT: st.bulk.shared::cta [%rd1], %rd2, 0;
; CHECK-PTX64-NEXT: ret;
;
; CHECK-PTX-SHARED32-LABEL: st_bulk_shared_cta(
; CHECK-PTX-SHARED32: {
; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<2>;
; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<2>;
; CHECK-PTX-SHARED32-EMPTY:
; CHECK-PTX-SHARED32-NEXT: // %bb.0:
; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r1, [st_bulk_shared_cta_param_0];
; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd1, [st_bulk_shared_cta_param_1];
; CHECK-PTX-SHARED32-NEXT: st.bulk.shared::cta [%r1], %rd1, 0;
; CHECK-PTX-SHARED32-NEXT: ret;
call void @llvm.nvvm.st.bulk.shared.cta(ptr addrspace(3) %dest_addr, i64 %size, i64 0)
ret void
}