-
Notifications
You must be signed in to change notification settings - Fork 12.6k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[NVPTX] Add intrinsics for redux.sync f32 instructions #126664
base: main
Are you sure you want to change the base?
[NVPTX] Add intrinsics for redux.sync f32 instructions #126664
Conversation
@llvm/pr-subscribers-llvm-ir @llvm/pr-subscribers-clang Author: Srinivasa Ravi (Wolfram70) Changes: Adds NVVM intrinsics, NVPTX codegen and Clang builtins for redux.sync f32 instructions. PTX Spec Reference: (see below). Full diff: https://github.com/llvm/llvm-project/pull/126664.diff 5 Files Affected:
diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.td b/clang/include/clang/Basic/BuiltinsNVPTX.td
index 9d24a992563a450..327dc88cffdb4e6 100644
--- a/clang/include/clang/Basic/BuiltinsNVPTX.td
+++ b/clang/include/clang/Basic/BuiltinsNVPTX.td
@@ -669,6 +669,14 @@ def __nvvm_redux_sync_umax : NVPTXBuiltinSMAndPTX<"unsigned int(unsigned int, in
def __nvvm_redux_sync_and : NVPTXBuiltinSMAndPTX<"int(int, int)", SM_80, PTX70>;
def __nvvm_redux_sync_xor : NVPTXBuiltinSMAndPTX<"int(int, int)", SM_80, PTX70>;
def __nvvm_redux_sync_or : NVPTXBuiltinSMAndPTX<"int(int, int)", SM_80, PTX70>;
+def __nvvm_redux_sync_fmin : NVPTXBuiltinSMAndPTX<"float(float, int)", SM_100a, PTX86>;
+def __nvvm_redux_sync_fmin_abs : NVPTXBuiltinSMAndPTX<"float(float, int)", SM_100a, PTX86>;
+def __nvvm_redux_sync_fmin_NaN : NVPTXBuiltinSMAndPTX<"float(float, int)", SM_100a, PTX86>;
+def __nvvm_redux_sync_fmin_abs_NaN : NVPTXBuiltinSMAndPTX<"float(float, int)", SM_100a, PTX86>;
+def __nvvm_redux_sync_fmax : NVPTXBuiltinSMAndPTX<"float(float, int)", SM_100a, PTX86>;
+def __nvvm_redux_sync_fmax_abs : NVPTXBuiltinSMAndPTX<"float(float, int)", SM_100a, PTX86>;
+def __nvvm_redux_sync_fmax_NaN : NVPTXBuiltinSMAndPTX<"float(float, int)", SM_100a, PTX86>;
+def __nvvm_redux_sync_fmax_abs_NaN : NVPTXBuiltinSMAndPTX<"float(float, int)", SM_100a, PTX86>;
// Membar
diff --git a/clang/test/CodeGenCUDA/redux-builtins.cu b/clang/test/CodeGenCUDA/redux-builtins.cu
index a6c83945ab156e9..049f935e3787b79 100644
--- a/clang/test/CodeGenCUDA/redux-builtins.cu
+++ b/clang/test/CodeGenCUDA/redux-builtins.cu
@@ -1,11 +1,13 @@
-// RUN: %clang_cc1 "-triple" "nvptx-nvidia-cuda" "-target-feature" "+ptx70" "-target-cpu" "sm_80" -emit-llvm -fcuda-is-device -o - %s | FileCheck %s
-// RUN: %clang_cc1 "-triple" "nvptx64-nvidia-cuda" "-target-feature" "+ptx70" "-target-cpu" "sm_80" -emit-llvm -fcuda-is-device -o - %s | FileCheck %s
+// RUN: %clang_cc1 "-triple" "nvptx-nvidia-cuda" "-target-feature" "+ptx86" "-target-cpu" "sm_100a" -emit-llvm -fcuda-is-device -o - %s | FileCheck %s
+// RUN: %clang_cc1 "-triple" "nvptx64-nvidia-cuda" "-target-feature" "+ptx86" "-target-cpu" "sm_100a" -emit-llvm -fcuda-is-device -o - %s | FileCheck %s
-// CHECK: define{{.*}} void @_Z6kernelPi(ptr noundef %out)
-__attribute__((global)) void kernel(int *out) {
+// CHECK: define{{.*}} void @_Z6kernelPiPf(ptr noundef %out, ptr noundef %out_f)
+__attribute__((global)) void kernel(int *out, float* out_f) {
int a = 1;
unsigned int b = 5;
+ float c = 3.0;
int i = 0;
+ int j = 0;
out[i++] = __nvvm_redux_sync_add(a, 0xFF);
// CHECK: call i32 @llvm.nvvm.redux.sync.add
@@ -42,6 +44,30 @@ __attribute__((global)) void kernel(int *out) {
out[i++] = __nvvm_redux_sync_or(b, 0xFF);
// CHECK: call i32 @llvm.nvvm.redux.sync.or
+
+ out_f[j++] = __nvvm_redux_sync_fmin(c, 0xFF);
+ // CHECK: call contract float @llvm.nvvm.redux.sync.fmin
+
+ out_f[j++] = __nvvm_redux_sync_fmin_abs(c, 0xFF);
+ // CHECK: call contract float @llvm.nvvm.redux.sync.fmin.abs
+
+ out_f[j++] = __nvvm_redux_sync_fmin_NaN(c, 0xF0);
+ // CHECK: call contract float @llvm.nvvm.redux.sync.fmin.NaN
+
+ out_f[j++] = __nvvm_redux_sync_fmin_abs_NaN(c, 0x0F);
+ // CHECK: call contract float @llvm.nvvm.redux.sync.fmin.abs.NaN
+
+ out_f[j++] = __nvvm_redux_sync_fmax(c, 0xFF);
+ // CHECK: call contract float @llvm.nvvm.redux.sync.fmax
+
+ out_f[j++] = __nvvm_redux_sync_fmax_abs(c, 0x01);
+ // CHECK: call contract float @llvm.nvvm.redux.sync.fmax.abs
+
+ out_f[j++] = __nvvm_redux_sync_fmax_NaN(c, 0xF1);
+ // CHECK: call contract float @llvm.nvvm.redux.sync.fmax.NaN
+
+ out_f[j++] = __nvvm_redux_sync_fmax_abs_NaN(c, 0x10);
+ // CHECK: call contract float @llvm.nvvm.redux.sync.fmax.abs.NaN
// CHECK: ret void
}
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index f299a145ac73b12..0ceb64d506243c5 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -4823,6 +4823,18 @@ def int_nvvm_redux_sync_xor : ClangBuiltin<"__nvvm_redux_sync_xor">,
def int_nvvm_redux_sync_or : ClangBuiltin<"__nvvm_redux_sync_or">,
Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
[IntrConvergent, IntrInaccessibleMemOnly, IntrNoCallback]>;
+
+// redux.sync.{min/max}.{abs}.{nan}.f32 dst, src, membermask;
+foreach binOp = ["min", "max"] in {
+ foreach abs = ["", "_abs"] in {
+ foreach nan = ["", "_NaN"] in {
+ def int_nvvm_redux_sync_f # binOp # abs # nan :
+ ClangBuiltin<!strconcat("__nvvm_redux_sync_f", binOp, abs, nan)>,
+ Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_i32_ty],
+ [IntrConvergent, IntrInaccessibleMemOnly, IntrNoCallback]>;
+ }
+ }
+}
//
// WGMMA fence instructions
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 5331f36ad09997f..45603c929ea92e1 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -328,6 +328,24 @@ defm REDUX_SYNC_AND : REDUX_SYNC<"and", "b32", int_nvvm_redux_sync_and>;
defm REDUX_SYNC_XOR : REDUX_SYNC<"xor", "b32", int_nvvm_redux_sync_xor>;
defm REDUX_SYNC_OR : REDUX_SYNC<"or", "b32", int_nvvm_redux_sync_or>;
+multiclass REDUX_SYNC_F<string BinOp, string ABS, string NAN, Intrinsic Intrin> {
+ def : NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$src, Int32Regs:$mask),
+ "redux.sync." # !tolower(BinOp) # !subst("_", ".", ABS) # !subst("_", ".", NAN) # ".f32 $dst, $src, $mask;",
+ [(set f32:$dst, (Intrin f32:$src, Int32Regs:$mask))]>,
+ Requires<[hasPTX<86>, hasSM100a]>;
+
+}
+
+defm REDUX_SYNC_FMIN : REDUX_SYNC_F<"min", "", "", int_nvvm_redux_sync_fmin>;
+defm REDUX_SYNC_FMIN_ABS : REDUX_SYNC_F<"min", "_abs", "", int_nvvm_redux_sync_fmin_abs>;
+defm REDUX_SYNC_FMIN_NAN: REDUX_SYNC_F<"min", "", "_NaN", int_nvvm_redux_sync_fmin_NaN>;
+defm REDUX_SYNC_FMIN_ABS_NAN: REDUX_SYNC_F<"min", "_abs", "_NaN", int_nvvm_redux_sync_fmin_abs_NaN>;
+defm REDUX_SYNC_FMAX : REDUX_SYNC_F<"max", "", "", int_nvvm_redux_sync_fmax>;
+defm REDUX_SYNC_FMAX_ABS : REDUX_SYNC_F<"max", "_abs", "", int_nvvm_redux_sync_fmax_abs>;
+defm REDUX_SYNC_FMAX_NAN: REDUX_SYNC_F<"max", "", "_NaN", int_nvvm_redux_sync_fmax_NaN>;
+defm REDUX_SYNC_FMAX_ABS_NAN: REDUX_SYNC_F<"max", "_abs", "_NaN", int_nvvm_redux_sync_fmax_abs_NaN>;
+
} // isConvergent = true
//-----------------------------------
diff --git a/llvm/test/CodeGen/NVPTX/redux-sync.ll b/llvm/test/CodeGen/NVPTX/redux-sync.ll
index bd1c7f5c12e9464..e37e2fe08535e7f 100644
--- a/llvm/test/CodeGen/NVPTX/redux-sync.ll
+++ b/llvm/test/CodeGen/NVPTX/redux-sync.ll
@@ -1,66 +1,268 @@
-; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck %s
-; RUN: %if ptxas-11.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %}
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | FileCheck %s
+; RUN: %if ptxas-11.0 && ! ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %}
+; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %}
declare i32 @llvm.nvvm.redux.sync.umin(i32, i32)
-; CHECK-LABEL: .func{{.*}}redux_sync_min_u32
define i32 @redux_sync_min_u32(i32 %src, i32 %mask) {
- ; CHECK: redux.sync.min.u32
+; CHECK-LABEL: redux_sync_min_u32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [redux_sync_min_u32_param_0];
+; CHECK-NEXT: ld.param.u32 %r2, [redux_sync_min_u32_param_1];
+; CHECK-NEXT: redux.sync.min.u32 %r3, %r1, %r2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
%val = call i32 @llvm.nvvm.redux.sync.umin(i32 %src, i32 %mask)
ret i32 %val
}
declare i32 @llvm.nvvm.redux.sync.umax(i32, i32)
-; CHECK-LABEL: .func{{.*}}redux_sync_max_u32
define i32 @redux_sync_max_u32(i32 %src, i32 %mask) {
- ; CHECK: redux.sync.max.u32
+; CHECK-LABEL: redux_sync_max_u32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [redux_sync_max_u32_param_0];
+; CHECK-NEXT: ld.param.u32 %r2, [redux_sync_max_u32_param_1];
+; CHECK-NEXT: redux.sync.max.u32 %r3, %r1, %r2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
%val = call i32 @llvm.nvvm.redux.sync.umax(i32 %src, i32 %mask)
ret i32 %val
}
declare i32 @llvm.nvvm.redux.sync.add(i32, i32)
-; CHECK-LABEL: .func{{.*}}redux_sync_add_s32
define i32 @redux_sync_add_s32(i32 %src, i32 %mask) {
- ; CHECK: redux.sync.add.s32
+; CHECK-LABEL: redux_sync_add_s32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [redux_sync_add_s32_param_0];
+; CHECK-NEXT: ld.param.u32 %r2, [redux_sync_add_s32_param_1];
+; CHECK-NEXT: redux.sync.add.s32 %r3, %r1, %r2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
%val = call i32 @llvm.nvvm.redux.sync.add(i32 %src, i32 %mask)
ret i32 %val
}
declare i32 @llvm.nvvm.redux.sync.min(i32, i32)
-; CHECK-LABEL: .func{{.*}}redux_sync_min_s32
define i32 @redux_sync_min_s32(i32 %src, i32 %mask) {
- ; CHECK: redux.sync.min.s32
+; CHECK-LABEL: redux_sync_min_s32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [redux_sync_min_s32_param_0];
+; CHECK-NEXT: ld.param.u32 %r2, [redux_sync_min_s32_param_1];
+; CHECK-NEXT: redux.sync.min.s32 %r3, %r1, %r2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
%val = call i32 @llvm.nvvm.redux.sync.min(i32 %src, i32 %mask)
ret i32 %val
}
declare i32 @llvm.nvvm.redux.sync.max(i32, i32)
-; CHECK-LABEL: .func{{.*}}redux_sync_max_s32
define i32 @redux_sync_max_s32(i32 %src, i32 %mask) {
- ; CHECK: redux.sync.max.s32
+; CHECK-LABEL: redux_sync_max_s32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [redux_sync_max_s32_param_0];
+; CHECK-NEXT: ld.param.u32 %r2, [redux_sync_max_s32_param_1];
+; CHECK-NEXT: redux.sync.max.s32 %r3, %r1, %r2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
%val = call i32 @llvm.nvvm.redux.sync.max(i32 %src, i32 %mask)
ret i32 %val
}
declare i32 @llvm.nvvm.redux.sync.and(i32, i32)
-; CHECK-LABEL: .func{{.*}}redux_sync_and_b32
define i32 @redux_sync_and_b32(i32 %src, i32 %mask) {
- ; CHECK: redux.sync.and.b32
+; CHECK-LABEL: redux_sync_and_b32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [redux_sync_and_b32_param_0];
+; CHECK-NEXT: ld.param.u32 %r2, [redux_sync_and_b32_param_1];
+; CHECK-NEXT: redux.sync.and.b32 %r3, %r1, %r2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
%val = call i32 @llvm.nvvm.redux.sync.and(i32 %src, i32 %mask)
ret i32 %val
}
declare i32 @llvm.nvvm.redux.sync.xor(i32, i32)
-; CHECK-LABEL: .func{{.*}}redux_sync_xor_b32
define i32 @redux_sync_xor_b32(i32 %src, i32 %mask) {
- ; CHECK: redux.sync.xor.b32
+; CHECK-LABEL: redux_sync_xor_b32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [redux_sync_xor_b32_param_0];
+; CHECK-NEXT: ld.param.u32 %r2, [redux_sync_xor_b32_param_1];
+; CHECK-NEXT: redux.sync.xor.b32 %r3, %r1, %r2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
%val = call i32 @llvm.nvvm.redux.sync.xor(i32 %src, i32 %mask)
ret i32 %val
}
declare i32 @llvm.nvvm.redux.sync.or(i32, i32)
-; CHECK-LABEL: .func{{.*}}redux_sync_or_b32
define i32 @redux_sync_or_b32(i32 %src, i32 %mask) {
- ; CHECK: redux.sync.or.b32
+; CHECK-LABEL: redux_sync_or_b32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [redux_sync_or_b32_param_0];
+; CHECK-NEXT: ld.param.u32 %r2, [redux_sync_or_b32_param_1];
+; CHECK-NEXT: redux.sync.or.b32 %r3, %r1, %r2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
%val = call i32 @llvm.nvvm.redux.sync.or(i32 %src, i32 %mask)
ret i32 %val
}
+
+declare float @llvm.nvvm.redux.sync.fmin(float, i32)
+define float @redux_sync_fmin(float %src, i32 %mask) {
+; CHECK-LABEL: redux_sync_fmin(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.f32 %f1, [redux_sync_fmin_param_0];
+; CHECK-NEXT: ld.param.u32 %r1, [redux_sync_fmin_param_1];
+; CHECK-NEXT: redux.sync.min.f32 %f2, %f1, %r1;
+; CHECK-NEXT: st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT: ret;
+ %val = call float @llvm.nvvm.redux.sync.fmin(float %src, i32 %mask)
+ ret float %val
+}
+
+declare float @llvm.nvvm.redux.sync.fmin.abs(float, i32)
+define float @redux_sync_fmin_abs(float %src, i32 %mask) {
+; CHECK-LABEL: redux_sync_fmin_abs(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.f32 %f1, [redux_sync_fmin_abs_param_0];
+; CHECK-NEXT: ld.param.u32 %r1, [redux_sync_fmin_abs_param_1];
+; CHECK-NEXT: redux.sync.min.abs.f32 %f2, %f1, %r1;
+; CHECK-NEXT: st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT: ret;
+ %val = call float @llvm.nvvm.redux.sync.fmin.abs(float %src, i32 %mask)
+ ret float %val
+}
+
+declare float @llvm.nvvm.redux.sync.fmin.NaN(float, i32)
+define float @redux_sync_fmin_NaN(float %src, i32 %mask) {
+; CHECK-LABEL: redux_sync_fmin_NaN(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.f32 %f1, [redux_sync_fmin_NaN_param_0];
+; CHECK-NEXT: ld.param.u32 %r1, [redux_sync_fmin_NaN_param_1];
+; CHECK-NEXT: redux.sync.min.NaN.f32 %f2, %f1, %r1;
+; CHECK-NEXT: st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT: ret;
+ %val = call float @llvm.nvvm.redux.sync.fmin.NaN(float %src, i32 %mask)
+ ret float %val
+}
+
+declare float @llvm.nvvm.redux.sync.fmin.abs.NaN(float, i32)
+define float @redux_sync_fmin_abs_NaN(float %src, i32 %mask) {
+; CHECK-LABEL: redux_sync_fmin_abs_NaN(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.f32 %f1, [redux_sync_fmin_abs_NaN_param_0];
+; CHECK-NEXT: ld.param.u32 %r1, [redux_sync_fmin_abs_NaN_param_1];
+; CHECK-NEXT: redux.sync.min.abs.NaN.f32 %f2, %f1, %r1;
+; CHECK-NEXT: st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT: ret;
+ %val = call float @llvm.nvvm.redux.sync.fmin.abs.NaN(float %src, i32 %mask)
+ ret float %val
+}
+
+declare float @llvm.nvvm.redux.sync.fmax(float, i32)
+define float @redux_sync_fmax(float %src, i32 %mask) {
+; CHECK-LABEL: redux_sync_fmax(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.f32 %f1, [redux_sync_fmax_param_0];
+; CHECK-NEXT: ld.param.u32 %r1, [redux_sync_fmax_param_1];
+; CHECK-NEXT: redux.sync.max.f32 %f2, %f1, %r1;
+; CHECK-NEXT: st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT: ret;
+ %val = call float @llvm.nvvm.redux.sync.fmax(float %src, i32 %mask)
+ ret float %val
+}
+
+declare float @llvm.nvvm.redux.sync.fmax.abs(float, i32)
+define float @redux_sync_fmax_abs(float %src, i32 %mask) {
+; CHECK-LABEL: redux_sync_fmax_abs(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.f32 %f1, [redux_sync_fmax_abs_param_0];
+; CHECK-NEXT: ld.param.u32 %r1, [redux_sync_fmax_abs_param_1];
+; CHECK-NEXT: redux.sync.max.abs.f32 %f2, %f1, %r1;
+; CHECK-NEXT: st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT: ret;
+ %val = call float @llvm.nvvm.redux.sync.fmax.abs(float %src, i32 %mask)
+ ret float %val
+}
+
+declare float @llvm.nvvm.redux.sync.fmax.NaN(float, i32)
+define float @redux_sync_fmax_NaN(float %src, i32 %mask) {
+; CHECK-LABEL: redux_sync_fmax_NaN(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.f32 %f1, [redux_sync_fmax_NaN_param_0];
+; CHECK-NEXT: ld.param.u32 %r1, [redux_sync_fmax_NaN_param_1];
+; CHECK-NEXT: redux.sync.max.NaN.f32 %f2, %f1, %r1;
+; CHECK-NEXT: st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT: ret;
+ %val = call float @llvm.nvvm.redux.sync.fmax.NaN(float %src, i32 %mask)
+ ret float %val
+}
+
+declare float @llvm.nvvm.redux.sync.fmax.abs.NaN(float, i32)
+define float @redux_sync_fmax_abs_NaN(float %src, i32 %mask) {
+; CHECK-LABEL: redux_sync_fmax_abs_NaN(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.f32 %f1, [redux_sync_fmax_abs_NaN_param_0];
+; CHECK-NEXT: ld.param.u32 %r1, [redux_sync_fmax_abs_NaN_param_1];
+; CHECK-NEXT: redux.sync.max.abs.NaN.f32 %f2, %f1, %r1;
+; CHECK-NEXT: st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT: ret;
+ %val = call float @llvm.nvvm.redux.sync.fmax.abs.NaN(float %src, i32 %mask)
+ ret float %val
+}
|
multiclass REDUX_SYNC_F<string BinOp, string ABS, string NAN, Intrinsic Intrin> { | ||
def : NVPTXInst<(outs Float32Regs:$dst), | ||
(ins Float32Regs:$src, Int32Regs:$mask), | ||
"redux.sync." # !tolower(BinOp) # !subst("_", ".", ABS) # !subst("_", ".", NAN) # ".f32 $dst, $src, $mask;", |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
we do not need tolower
defm REDUX_SYNC_FMAX : REDUX_SYNC_F<"max", "", "", int_nvvm_redux_sync_fmax>; | ||
defm REDUX_SYNC_FMAX_ABS : REDUX_SYNC_F<"max", "_abs", "", int_nvvm_redux_sync_fmax_abs>; | ||
defm REDUX_SYNC_FMAX_NAN: REDUX_SYNC_F<"max", "", "_NaN", int_nvvm_redux_sync_fmax_NaN>; | ||
defm REDUX_SYNC_FMAX_ABS_NAN: REDUX_SYNC_F<"max", "_abs", "_NaN", int_nvvm_redux_sync_fmax_abs_NaN>; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I believe, we could easily construct the intrinsic from a cast in the multiclass itself.
// RUN: %clang_cc1 "-triple" "nvptx-nvidia-cuda" "-target-feature" "+ptx70" "-target-cpu" "sm_80" -emit-llvm -fcuda-is-device -o - %s | FileCheck %s | ||
// RUN: %clang_cc1 "-triple" "nvptx64-nvidia-cuda" "-target-feature" "+ptx70" "-target-cpu" "sm_80" -emit-llvm -fcuda-is-device -o - %s | FileCheck %s | ||
// RUN: %clang_cc1 "-triple" "nvptx-nvidia-cuda" "-target-feature" "+ptx86" "-target-cpu" "sm_100a" -emit-llvm -fcuda-is-device -o - %s | FileCheck %s | ||
// RUN: %clang_cc1 "-triple" "nvptx64-nvidia-cuda" "-target-feature" "+ptx86" "-target-cpu" "sm_100a" -emit-llvm -fcuda-is-device -o - %s | FileCheck %s |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Let us keep the existing file and the tests intact. We need them for ptx70/sm_80.
Can we add another redux-f32-builtins.cu file with only the new additions from this change?
d11cdd7
to
062a48e
Compare
Adds NVVM intrinsics and NVPTX codegen for redux.sync f32 instructions introduced in ptx8.6 for sm_100a. Tests added in CodeGen/NVPTX/redux-sync.ll and verified through ptxas 12.8.0. PTX Spec Reference: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-redux-sync
062a48e
to
88e076b
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The latest revision looks good to me.
Adds NVVM intrinsics, NVPTX codegen and Clang builtins for
redux.sync
f32 instructions introduced in ptx8.6 for sm_100a. Tests added in
CodeGen/NVPTX/redux-sync.ll
and CodeGenCUDA/redux-builtins.cu
and verified through ptxas 12.8.0. PTX Spec Reference:
https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-redux-sync