-
Notifications
You must be signed in to change notification settings - Fork 12.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[NVPTX] Add intrinsics for st.bulk instruction #128856
base: main
Are you sure you want to change the base?
[NVPTX] Add intrinsics for st.bulk instruction #128856
Conversation
@llvm/pr-subscribers-llvm-ir @llvm/pr-subscribers-backend-nvptx Author: Srinivasa Ravi (Wolfram70) ChangesAdds NVVM intrinsics and NVPTX codegen for the PTX Spec Reference: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-st-bulk Full diff: https://github.com/llvm/llvm-project/pull/128856.diff 3 Files Affected:
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index c32bf0318b5d6..bd3b4cdba8f35 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -5186,4 +5186,12 @@ foreach cta_group = ["cg1", "cg2"] in {
}
}
+//
+// Bulk store intrinsics
+//
+
+def int_nvvm_st_bulk: Intrinsic<[], [llvm_global_ptr_ty, llvm_i64_ty, llvm_i64_ty], [IntrArgMemOnly, WriteOnly<ArgIndex<0>>, ImmArg<ArgIndex<2>>]>;
+
+def int_nvvm_st_bulk_shared_cta : Intrinsic<[], [llvm_shared_ptr_ty, llvm_i64_ty, llvm_i64_ty], [IntrArgMemOnly, WriteOnly<ArgIndex<0>>, ImmArg<ArgIndex<2>>]>;
+
} // let TargetPrefix = "nvvm"
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 0640d25031c6a..09534f1fc0f58 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -7731,3 +7731,17 @@ def tcgen05_fence_after_thread_sync: NVPTXInst<(outs), (ins),
Requires<[hasTcgen05Instructions]>;
} // hasSideEffects
+
+// Bulk store instructions
+
+def INT_NVVM_ST_BULK_GENERIC :
+ NVPTXInst<(outs), (ins Int64Regs:$dest_addr, Int64Regs:$size),
+ "st.bulk [$dest_addr], $size, 0;",
+ [(int_nvvm_st_bulk i64:$dest_addr, i64:$size, (i64 0))]>,
+ Requires<[hasSM<100>, hasPTX<86>]>;
+
+def INT_NVVM_ST_BULK_SHARED_CTA:
+ NVPTXInst<(outs), (ins Int64Regs:$dest_addr, Int64Regs:$size),
+ "st.bulk.shared::cta [$dest_addr], $size, 0;",
+ [(int_nvvm_st_bulk_shared_cta i64:$dest_addr, i64:$size, (i64 0))]>,
+ Requires<[hasSM<100>, hasPTX<86>]>;
diff --git a/llvm/test/CodeGen/NVPTX/st_bulk.ll b/llvm/test/CodeGen/NVPTX/st_bulk.ll
new file mode 100644
index 0000000000000..9d4a425d155e7
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/st_bulk.ll
@@ -0,0 +1,33 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86 | FileCheck %s
+; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86 | %ptxas-verify -arch=sm_100 %}
+
+declare void @llvm.nvvm.st.bulk(ptr addrspace(1), i64, i64)
+define void @st_bulk(ptr addrspace(1) %dest_addr, i64 %size) {
+; CHECK-LABEL: st_bulk(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [st_bulk_param_0];
+; CHECK-NEXT: ld.param.u64 %rd2, [st_bulk_param_1];
+; CHECK-NEXT: st.bulk [%rd1], %rd2, 0;
+; CHECK-NEXT: ret;
+ call void @llvm.nvvm.st.bulk(ptr addrspace(1) %dest_addr, i64 %size, i64 0)
+ ret void
+}
+
+declare void @llvm.nvvm.st.bulk.shared.cta(ptr addrspace(3), i64, i64)
+define void @st_bulk_shared_cta(ptr addrspace(3) %dest_addr, i64 %size) {
+; CHECK-LABEL: st_bulk_shared_cta(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [st_bulk_shared_cta_param_0];
+; CHECK-NEXT: ld.param.u64 %rd2, [st_bulk_shared_cta_param_1];
+; CHECK-NEXT: st.bulk.shared::cta [%rd1], %rd2, 0;
+; CHECK-NEXT: ret;
+ call void @llvm.nvvm.st.bulk.shared.cta(ptr addrspace(3) %dest_addr, i64 %size, i64 0)
+ ret void
+}
|
b4696d4
to
7022b16
Compare
Changes LGTM. Please resolve the conflicts so we can get an updated result from the builders. |
7022b16
to
dd96135
Compare
def INT_NVVM_ST_BULK_SHARED_CTA: | ||
NVPTXInst<(outs), (ins Int64Regs:$dest_addr, Int64Regs:$size), | ||
"st.bulk.shared::cta [$dest_addr], $size, 0;", | ||
[(int_nvvm_st_bulk_shared_cta i64:$dest_addr, i64:$size, (i64 0))]>, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This may run into problems if we enable "short" pointers to shared memory.
Search for -nvptx-short-ptr
in LLVM tests
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Perhaps this would be a good use case for the ADDR
operand and addr
pattern added in #129102
Adds NVVM intrinsics and NVPTX codegen for the `st.bulk` instruction introduced in ptx8.6 for sm_100. Tests added in `CodeGen/NVPTX/st_bulk.ll` and verified through ptxas 12.8.0. PTX Spec Reference: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-st-bulk
dd96135
to
18452f0
Compare
Adds NVVM intrinsics and NVPTX codegen for the
st.bulk
instruction introduced in ptx8.6 for sm_100. Tests added inCodeGen/NVPTX/st_bulk.ll
and verified through ptxas 12.8.0.PTX Spec Reference: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-st-bulk