-
Notifications
You must be signed in to change notification settings - Fork 13.5k
[NVPTX] Fix lowering of i1 SETCC #115035
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[NVPTX] Fix lowering of i1 SETCC #115035
Conversation
@llvm/pr-subscribers-llvm-selectiondag Author: Alex MacLean (AlexMaclean) Changesfixes #58428 Full diff: https://github.com/llvm/llvm-project/pull/115035.diff 4 Files Affected:
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 42232bd195a651..a599328a0e5be2 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -18723,6 +18723,9 @@ SDValue DAGCombiner::rebuildSetCC(SDValue N) {
SDValue Op0 = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
+ if (!TLI.isOperationLegal(ISD::SETCC, Op0.getValueType()))
+ return SDValue();
+
if (Op0.getOpcode() != ISD::SETCC && Op1.getOpcode() != ISD::SETCC) {
bool Equal = false;
// (brcond (xor (xor x, y), -1)) -> (brcond (setcc x, y, eq))
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index d3bf0ecfe2cc92..a74e9dcccdd770 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -667,6 +667,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
setTruncStoreAction(VT, MVT::i1, Expand);
}
+ setOperationAction(ISD::SETCC, MVT::i1, Custom);
// expand extload of vector of integers.
setLoadExtAction({ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, MVT::v2i16,
@@ -2666,6 +2667,46 @@ SDValue NVPTXTargetLowering::LowerShiftLeftParts(SDValue Op,
}
}
+// Lowers SETCC nodes that aren't directly supported by our arch.
+SDValue NVPTXTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
+ SDValue L = Op->getOperand(0);
+ SDValue R = Op->getOperand(1);
+
+ if (L.getValueType() != MVT::i1)
+ return SDValue();
+
+ SDLoc DL(Op);
+ SDValue Ret;
+ switch (cast<CondCodeSDNode>(Op->getOperand(2))->get()) {
+ default:
+ llvm_unreachable("Unknown integer setcc!");
+ case ISD::SETEQ: // X == Y -> ~(X^Y)
+ Ret = DAG.getNOT(DL, DAG.getNode(ISD::XOR, DL, MVT::i1, L, R), MVT::i1);
+ break;
+ case ISD::SETNE: // X != Y --> (X^Y)
+ Ret = DAG.getNode(ISD::XOR, DL, MVT::i1, L, R);
+ break;
+ case ISD::SETGT: // X >s Y --> X == 0 & Y == 1 --> ~X & Y
+ case ISD::SETULT: // X <u Y --> X == 0 & Y == 1 --> ~X & Y
+ Ret = DAG.getNode(ISD::AND, DL, MVT::i1, R, DAG.getNOT(DL, L, MVT::i1));
+ break;
+ case ISD::SETLT: // X <s Y --> X == 1 & Y == 0 --> ~Y & X
+ case ISD::SETUGT: // X >u Y --> X == 1 & Y == 0 --> ~Y & X
+ Ret = DAG.getNode(ISD::AND, DL, MVT::i1, L, DAG.getNOT(DL, R, MVT::i1));
+ break;
+ case ISD::SETULE: // X <=u Y --> X == 0 | Y == 1 --> ~X | Y
+ case ISD::SETGE: // X >=s Y --> X == 0 | Y == 1 --> ~X | Y
+ Ret = DAG.getNode(ISD::OR, DL, MVT::i1, R, DAG.getNOT(DL, L, MVT::i1));
+ break;
+ case ISD::SETUGE: // X >=u Y --> X == 1 | Y == 0 --> ~Y | X
+ case ISD::SETLE: // X <=s Y --> X == 1 | Y == 0 --> ~Y | X
+ Ret = DAG.getNode(ISD::OR, DL, MVT::i1, L, DAG.getNOT(DL, R, MVT::i1));
+ break;
+ }
+
+ return DAG.getZExtOrTrunc(Ret, DL, Op.getValueType());
+}
+
/// If the types match, convert the generic copysign to the NVPTXISD version,
/// otherwise bail ensuring that mismatched cases are properly expaned.
SDValue NVPTXTargetLowering::LowerFCOPYSIGN(SDValue Op,
@@ -2919,6 +2960,8 @@ NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
return LowerSTORE(Op, DAG);
case ISD::LOAD:
return LowerLOAD(Op, DAG);
+ case ISD::SETCC:
+ return LowerSETCC(Op, DAG);
case ISD::SHL_PARTS:
return LowerShiftLeftParts(Op, DAG);
case ISD::SRA_PARTS:
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index c8b589ae39413e..b1bb9090464ac4 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -628,6 +628,8 @@ class NVPTXTargetLowering : public TargetLowering {
SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
+
SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/test/CodeGen/NVPTX/i1-icmp.ll b/llvm/test/CodeGen/NVPTX/i1-icmp.ll
new file mode 100644
index 00000000000000..db9ae6541b87ae
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/i1-icmp.ll
@@ -0,0 +1,193 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
+
+target triple = "nvptx-nvidia-cuda"
+
+define i32 @icmp_i1_eq(i32 %a, i32 %b) {
+; CHECK-LABEL: icmp_i1_eq(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<4>;
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [icmp_i1_eq_param_0];
+; CHECK-NEXT: setp.gt.s32 %p1, %r1, 1;
+; CHECK-NEXT: ld.param.u32 %r2, [icmp_i1_eq_param_1];
+; CHECK-NEXT: setp.gt.s32 %p2, %r2, 1;
+; CHECK-NEXT: xor.pred %p3, %p1, %p2;
+; CHECK-NEXT: @%p3 bra $L__BB0_2;
+; CHECK-NEXT: // %bb.1: // %bb1
+; CHECK-NEXT: mov.b32 %r4, 1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r4;
+; CHECK-NEXT: ret;
+; CHECK-NEXT: $L__BB0_2: // %bb2
+; CHECK-NEXT: mov.b32 %r3, 127;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
+ %p1 = icmp sgt i32 %a, 1
+ %p2 = icmp sgt i32 %b, 1
+ %c = icmp eq i1 %p1, %p2
+ br i1 %c, label %bb1, label %bb2
+bb1:
+ ret i32 1
+bb2:
+ ret i32 127
+}
+
+define i32 @icmp_i1_ne(i32 %a, i32 %b) {
+; CHECK-LABEL: icmp_i1_ne(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<5>;
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [icmp_i1_ne_param_0];
+; CHECK-NEXT: setp.gt.s32 %p1, %r1, 1;
+; CHECK-NEXT: ld.param.u32 %r2, [icmp_i1_ne_param_1];
+; CHECK-NEXT: setp.gt.s32 %p2, %r2, 1;
+; CHECK-NEXT: xor.pred %p3, %p1, %p2;
+; CHECK-NEXT: not.pred %p4, %p3;
+; CHECK-NEXT: @%p4 bra $L__BB1_2;
+; CHECK-NEXT: // %bb.1: // %bb1
+; CHECK-NEXT: mov.b32 %r4, 1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r4;
+; CHECK-NEXT: ret;
+; CHECK-NEXT: $L__BB1_2: // %bb2
+; CHECK-NEXT: mov.b32 %r3, 127;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
+ %p1 = icmp sgt i32 %a, 1
+ %p2 = icmp sgt i32 %b, 1
+ %c = icmp ne i1 %p1, %p2
+ br i1 %c, label %bb1, label %bb2
+bb1:
+ ret i32 1
+bb2:
+ ret i32 127
+}
+
+define i32 @icmp_i1_sgt(i32 %a, i32 %b) {
+; CHECK-LABEL: icmp_i1_sgt(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<4>;
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [icmp_i1_sgt_param_0];
+; CHECK-NEXT: setp.gt.s32 %p1, %r1, 1;
+; CHECK-NEXT: ld.param.u32 %r2, [icmp_i1_sgt_param_1];
+; CHECK-NEXT: setp.lt.s32 %p2, %r2, 2;
+; CHECK-NEXT: or.pred %p3, %p1, %p2;
+; CHECK-NEXT: @%p3 bra $L__BB2_2;
+; CHECK-NEXT: // %bb.1: // %bb1
+; CHECK-NEXT: mov.b32 %r4, 1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r4;
+; CHECK-NEXT: ret;
+; CHECK-NEXT: $L__BB2_2: // %bb2
+; CHECK-NEXT: mov.b32 %r3, 127;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
+ %p1 = icmp sgt i32 %a, 1
+ %p2 = icmp sgt i32 %b, 1
+ %c = icmp sgt i1 %p1, %p2
+ br i1 %c, label %bb1, label %bb2
+bb1:
+ ret i32 1
+bb2:
+ ret i32 127
+}
+
+define i32 @icmp_i1_slt(i32 %a, i32 %b) {
+; CHECK-LABEL: icmp_i1_slt(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<4>;
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [icmp_i1_slt_param_0];
+; CHECK-NEXT: setp.lt.s32 %p1, %r1, 2;
+; CHECK-NEXT: ld.param.u32 %r2, [icmp_i1_slt_param_1];
+; CHECK-NEXT: setp.gt.s32 %p2, %r2, 1;
+; CHECK-NEXT: or.pred %p3, %p2, %p1;
+; CHECK-NEXT: @%p3 bra $L__BB3_2;
+; CHECK-NEXT: // %bb.1: // %bb1
+; CHECK-NEXT: mov.b32 %r4, 1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r4;
+; CHECK-NEXT: ret;
+; CHECK-NEXT: $L__BB3_2: // %bb2
+; CHECK-NEXT: mov.b32 %r3, 127;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
+ %p1 = icmp sgt i32 %a, 1
+ %p2 = icmp sgt i32 %b, 1
+ %c = icmp slt i1 %p1, %p2
+ br i1 %c, label %bb1, label %bb2
+bb1:
+ ret i32 1
+bb2:
+ ret i32 127
+}
+
+define i32 @icmp_i1_sge(i32 %a, i32 %b) {
+; CHECK-LABEL: icmp_i1_sge(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<4>;
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [icmp_i1_sge_param_0];
+; CHECK-NEXT: setp.gt.s32 %p1, %r1, 1;
+; CHECK-NEXT: ld.param.u32 %r2, [icmp_i1_sge_param_1];
+; CHECK-NEXT: setp.lt.s32 %p2, %r2, 2;
+; CHECK-NEXT: and.pred %p3, %p1, %p2;
+; CHECK-NEXT: @%p3 bra $L__BB4_2;
+; CHECK-NEXT: // %bb.1: // %bb1
+; CHECK-NEXT: mov.b32 %r4, 1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r4;
+; CHECK-NEXT: ret;
+; CHECK-NEXT: $L__BB4_2: // %bb2
+; CHECK-NEXT: mov.b32 %r3, 127;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
+ %p1 = icmp sgt i32 %a, 1
+ %p2 = icmp sgt i32 %b, 1
+ %c = icmp sge i1 %p1, %p2
+ br i1 %c, label %bb1, label %bb2
+bb1:
+ ret i32 1
+bb2:
+ ret i32 127
+}
+
+define i32 @icmp_i1_sle(i32 %a, i32 %b) {
+; CHECK-LABEL: icmp_i1_sle(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<4>;
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [icmp_i1_sle_param_0];
+; CHECK-NEXT: setp.lt.s32 %p1, %r1, 2;
+; CHECK-NEXT: ld.param.u32 %r2, [icmp_i1_sle_param_1];
+; CHECK-NEXT: setp.gt.s32 %p2, %r2, 1;
+; CHECK-NEXT: and.pred %p3, %p2, %p1;
+; CHECK-NEXT: @%p3 bra $L__BB5_2;
+; CHECK-NEXT: // %bb.1: // %bb1
+; CHECK-NEXT: mov.b32 %r4, 1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r4;
+; CHECK-NEXT: ret;
+; CHECK-NEXT: $L__BB5_2: // %bb2
+; CHECK-NEXT: mov.b32 %r3, 127;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
+ %p1 = icmp sgt i32 %a, 1
+ %p2 = icmp sgt i32 %b, 1
+ %c = icmp sle i1 %p1, %p2
+ br i1 %c, label %bb1, label %bb2
+bb1:
+ ret i32 1
+bb2:
+ ret i32 127
+}
+
|
@llvm/pr-subscribers-backend-nvptx Author: Alex MacLean (AlexMaclean) Changesfixes #58428 Full diff: https://github.com/llvm/llvm-project/pull/115035.diff 4 Files Affected:
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 42232bd195a651..a599328a0e5be2 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -18723,6 +18723,9 @@ SDValue DAGCombiner::rebuildSetCC(SDValue N) {
SDValue Op0 = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
+ if (!TLI.isOperationLegal(ISD::SETCC, Op0.getValueType()))
+ return SDValue();
+
if (Op0.getOpcode() != ISD::SETCC && Op1.getOpcode() != ISD::SETCC) {
bool Equal = false;
// (brcond (xor (xor x, y), -1)) -> (brcond (setcc x, y, eq))
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index d3bf0ecfe2cc92..a74e9dcccdd770 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -667,6 +667,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
setTruncStoreAction(VT, MVT::i1, Expand);
}
+ setOperationAction(ISD::SETCC, MVT::i1, Custom);
// expand extload of vector of integers.
setLoadExtAction({ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, MVT::v2i16,
@@ -2666,6 +2667,46 @@ SDValue NVPTXTargetLowering::LowerShiftLeftParts(SDValue Op,
}
}
+// Lowers SETCC nodes that aren't directly supported by our arch.
+SDValue NVPTXTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
+ SDValue L = Op->getOperand(0);
+ SDValue R = Op->getOperand(1);
+
+ if (L.getValueType() != MVT::i1)
+ return SDValue();
+
+ SDLoc DL(Op);
+ SDValue Ret;
+ switch (cast<CondCodeSDNode>(Op->getOperand(2))->get()) {
+ default:
+ llvm_unreachable("Unknown integer setcc!");
+ case ISD::SETEQ: // X == Y -> ~(X^Y)
+ Ret = DAG.getNOT(DL, DAG.getNode(ISD::XOR, DL, MVT::i1, L, R), MVT::i1);
+ break;
+ case ISD::SETNE: // X != Y --> (X^Y)
+ Ret = DAG.getNode(ISD::XOR, DL, MVT::i1, L, R);
+ break;
+ case ISD::SETGT: // X >s Y --> X == 0 & Y == 1 --> ~X & Y
+ case ISD::SETULT: // X <u Y --> X == 0 & Y == 1 --> ~X & Y
+ Ret = DAG.getNode(ISD::AND, DL, MVT::i1, R, DAG.getNOT(DL, L, MVT::i1));
+ break;
+ case ISD::SETLT: // X <s Y --> X == 1 & Y == 0 --> ~Y & X
+ case ISD::SETUGT: // X >u Y --> X == 1 & Y == 0 --> ~Y & X
+ Ret = DAG.getNode(ISD::AND, DL, MVT::i1, L, DAG.getNOT(DL, R, MVT::i1));
+ break;
+ case ISD::SETULE: // X <=u Y --> X == 0 | Y == 1 --> ~X | Y
+ case ISD::SETGE: // X >=s Y --> X == 0 | Y == 1 --> ~X | Y
+ Ret = DAG.getNode(ISD::OR, DL, MVT::i1, R, DAG.getNOT(DL, L, MVT::i1));
+ break;
+ case ISD::SETUGE: // X >=u Y --> X == 1 | Y == 0 --> ~Y | X
+ case ISD::SETLE: // X <=s Y --> X == 1 | Y == 0 --> ~Y | X
+ Ret = DAG.getNode(ISD::OR, DL, MVT::i1, L, DAG.getNOT(DL, R, MVT::i1));
+ break;
+ }
+
+ return DAG.getZExtOrTrunc(Ret, DL, Op.getValueType());
+}
+
/// If the types match, convert the generic copysign to the NVPTXISD version,
/// otherwise bail ensuring that mismatched cases are properly expaned.
SDValue NVPTXTargetLowering::LowerFCOPYSIGN(SDValue Op,
@@ -2919,6 +2960,8 @@ NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
return LowerSTORE(Op, DAG);
case ISD::LOAD:
return LowerLOAD(Op, DAG);
+ case ISD::SETCC:
+ return LowerSETCC(Op, DAG);
case ISD::SHL_PARTS:
return LowerShiftLeftParts(Op, DAG);
case ISD::SRA_PARTS:
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index c8b589ae39413e..b1bb9090464ac4 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -628,6 +628,8 @@ class NVPTXTargetLowering : public TargetLowering {
SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
+
SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/test/CodeGen/NVPTX/i1-icmp.ll b/llvm/test/CodeGen/NVPTX/i1-icmp.ll
new file mode 100644
index 00000000000000..db9ae6541b87ae
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/i1-icmp.ll
@@ -0,0 +1,193 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
+
+target triple = "nvptx-nvidia-cuda"
+
+define i32 @icmp_i1_eq(i32 %a, i32 %b) {
+; CHECK-LABEL: icmp_i1_eq(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<4>;
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [icmp_i1_eq_param_0];
+; CHECK-NEXT: setp.gt.s32 %p1, %r1, 1;
+; CHECK-NEXT: ld.param.u32 %r2, [icmp_i1_eq_param_1];
+; CHECK-NEXT: setp.gt.s32 %p2, %r2, 1;
+; CHECK-NEXT: xor.pred %p3, %p1, %p2;
+; CHECK-NEXT: @%p3 bra $L__BB0_2;
+; CHECK-NEXT: // %bb.1: // %bb1
+; CHECK-NEXT: mov.b32 %r4, 1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r4;
+; CHECK-NEXT: ret;
+; CHECK-NEXT: $L__BB0_2: // %bb2
+; CHECK-NEXT: mov.b32 %r3, 127;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
+ %p1 = icmp sgt i32 %a, 1
+ %p2 = icmp sgt i32 %b, 1
+ %c = icmp eq i1 %p1, %p2
+ br i1 %c, label %bb1, label %bb2
+bb1:
+ ret i32 1
+bb2:
+ ret i32 127
+}
+
+define i32 @icmp_i1_ne(i32 %a, i32 %b) {
+; CHECK-LABEL: icmp_i1_ne(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<5>;
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [icmp_i1_ne_param_0];
+; CHECK-NEXT: setp.gt.s32 %p1, %r1, 1;
+; CHECK-NEXT: ld.param.u32 %r2, [icmp_i1_ne_param_1];
+; CHECK-NEXT: setp.gt.s32 %p2, %r2, 1;
+; CHECK-NEXT: xor.pred %p3, %p1, %p2;
+; CHECK-NEXT: not.pred %p4, %p3;
+; CHECK-NEXT: @%p4 bra $L__BB1_2;
+; CHECK-NEXT: // %bb.1: // %bb1
+; CHECK-NEXT: mov.b32 %r4, 1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r4;
+; CHECK-NEXT: ret;
+; CHECK-NEXT: $L__BB1_2: // %bb2
+; CHECK-NEXT: mov.b32 %r3, 127;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
+ %p1 = icmp sgt i32 %a, 1
+ %p2 = icmp sgt i32 %b, 1
+ %c = icmp ne i1 %p1, %p2
+ br i1 %c, label %bb1, label %bb2
+bb1:
+ ret i32 1
+bb2:
+ ret i32 127
+}
+
+define i32 @icmp_i1_sgt(i32 %a, i32 %b) {
+; CHECK-LABEL: icmp_i1_sgt(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<4>;
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [icmp_i1_sgt_param_0];
+; CHECK-NEXT: setp.gt.s32 %p1, %r1, 1;
+; CHECK-NEXT: ld.param.u32 %r2, [icmp_i1_sgt_param_1];
+; CHECK-NEXT: setp.lt.s32 %p2, %r2, 2;
+; CHECK-NEXT: or.pred %p3, %p1, %p2;
+; CHECK-NEXT: @%p3 bra $L__BB2_2;
+; CHECK-NEXT: // %bb.1: // %bb1
+; CHECK-NEXT: mov.b32 %r4, 1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r4;
+; CHECK-NEXT: ret;
+; CHECK-NEXT: $L__BB2_2: // %bb2
+; CHECK-NEXT: mov.b32 %r3, 127;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
+ %p1 = icmp sgt i32 %a, 1
+ %p2 = icmp sgt i32 %b, 1
+ %c = icmp sgt i1 %p1, %p2
+ br i1 %c, label %bb1, label %bb2
+bb1:
+ ret i32 1
+bb2:
+ ret i32 127
+}
+
+define i32 @icmp_i1_slt(i32 %a, i32 %b) {
+; CHECK-LABEL: icmp_i1_slt(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<4>;
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [icmp_i1_slt_param_0];
+; CHECK-NEXT: setp.lt.s32 %p1, %r1, 2;
+; CHECK-NEXT: ld.param.u32 %r2, [icmp_i1_slt_param_1];
+; CHECK-NEXT: setp.gt.s32 %p2, %r2, 1;
+; CHECK-NEXT: or.pred %p3, %p2, %p1;
+; CHECK-NEXT: @%p3 bra $L__BB3_2;
+; CHECK-NEXT: // %bb.1: // %bb1
+; CHECK-NEXT: mov.b32 %r4, 1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r4;
+; CHECK-NEXT: ret;
+; CHECK-NEXT: $L__BB3_2: // %bb2
+; CHECK-NEXT: mov.b32 %r3, 127;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
+ %p1 = icmp sgt i32 %a, 1
+ %p2 = icmp sgt i32 %b, 1
+ %c = icmp slt i1 %p1, %p2
+ br i1 %c, label %bb1, label %bb2
+bb1:
+ ret i32 1
+bb2:
+ ret i32 127
+}
+
+define i32 @icmp_i1_sge(i32 %a, i32 %b) {
+; CHECK-LABEL: icmp_i1_sge(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<4>;
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [icmp_i1_sge_param_0];
+; CHECK-NEXT: setp.gt.s32 %p1, %r1, 1;
+; CHECK-NEXT: ld.param.u32 %r2, [icmp_i1_sge_param_1];
+; CHECK-NEXT: setp.lt.s32 %p2, %r2, 2;
+; CHECK-NEXT: and.pred %p3, %p1, %p2;
+; CHECK-NEXT: @%p3 bra $L__BB4_2;
+; CHECK-NEXT: // %bb.1: // %bb1
+; CHECK-NEXT: mov.b32 %r4, 1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r4;
+; CHECK-NEXT: ret;
+; CHECK-NEXT: $L__BB4_2: // %bb2
+; CHECK-NEXT: mov.b32 %r3, 127;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
+ %p1 = icmp sgt i32 %a, 1
+ %p2 = icmp sgt i32 %b, 1
+ %c = icmp sge i1 %p1, %p2
+ br i1 %c, label %bb1, label %bb2
+bb1:
+ ret i32 1
+bb2:
+ ret i32 127
+}
+
+define i32 @icmp_i1_sle(i32 %a, i32 %b) {
+; CHECK-LABEL: icmp_i1_sle(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<4>;
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [icmp_i1_sle_param_0];
+; CHECK-NEXT: setp.lt.s32 %p1, %r1, 2;
+; CHECK-NEXT: ld.param.u32 %r2, [icmp_i1_sle_param_1];
+; CHECK-NEXT: setp.gt.s32 %p2, %r2, 1;
+; CHECK-NEXT: and.pred %p3, %p2, %p1;
+; CHECK-NEXT: @%p3 bra $L__BB5_2;
+; CHECK-NEXT: // %bb.1: // %bb1
+; CHECK-NEXT: mov.b32 %r4, 1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r4;
+; CHECK-NEXT: ret;
+; CHECK-NEXT: $L__BB5_2: // %bb2
+; CHECK-NEXT: mov.b32 %r3, 127;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
+ %p1 = icmp sgt i32 %a, 1
+ %p2 = icmp sgt i32 %b, 1
+ %c = icmp sle i1 %p1, %p2
+ br i1 %c, label %bb1, label %bb2
+bb1:
+ ret i32 1
+bb2:
+ ret i32 127
+}
+
|
@@ -2666,6 +2667,46 @@ SDValue NVPTXTargetLowering::LowerShiftLeftParts(SDValue Op, | |||
} | |||
} | |||
|
|||
// Lowers SETCC nodes that aren't directly supported by our arch. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should use setCondCodeAction and not reinvent all of this
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@arsenm Can you please explain a little further how I can use setCondCodeAction to avoid all of this?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It's an additional legality table because the single type based legality rules are insufficient for compares. It's the same as setting expand / custom / promote, except for the specific condition code and type
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hmm, this table seems to be used inconsistently, does the type represent the value which is produced by the SETCC node or the types of the values which are being compared?
https://github.com/llvm/llvm-project/blob/c1cec8c/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp#L8798
vs.
https://github.com/llvm/llvm-project/blob/c1cec8c/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp#L4994
Also what is the advantage of using this table over the current approach? What will I be able to avoid reinventing?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The types being compared.
You are reinventing condition code legalization here:
bool TargetLowering::LegalizeSetCCCondCode(SelectionDAG &DAG, EVT VT, |
And generic code can no longer accurately query which conditions are legal, which is why you have the hack to check if SETCC is legal
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sorry, I'm still confused. I don't see anything like the boolean expansion I'm using for i1 in the LegalizeSetCCCondCode function.
And generic code can no longer accurately query which conditions are legal, which is why you have the hack to check if SETCC is legal
I think I'm going to need to add a check for legality to the dag combine no matter what, I suppose I can check this table instead.
Also, is existing code like this incorrect?
isCondCodeLegal(NewCC, VT.getSimpleVT())) && |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Okay, I've update the code to use the CondCode table. Not sure if this is what you had in mind though, I don't think it has made the implementation significantly simpler. I also did have to update a couple uses of the table which seemed incorrect.
@arsenm does this look better?
How much of a difference would it make on the SASS level if we were to just extend i1 arguments to i32 and then just run E.g. setp.lt(i1 a, i1 b) -> |
The sass will likely be much worse in most cases if we add the selp instructions in here. |
You're right. At least, ptxas-11.x does not optimize those selects: https://godbolt.org/z/q99cvccfq One observation is that under the hood,
|
b167dbe
to
9535ee8
Compare
@arsenm ping for your thoughts on these discussions |
Ensure that all uses of CondCodeAction table are checking the compared types, not the produced type. This is a prerequisite to landing #115035
9535ee8
to
593e953
Compare
define i32 @icmp_i1_eq(i32 %a, i32 %b) { | ||
; CHECK-LABEL: icmp_i1_eq( | ||
; CHECK: { | ||
; CHECK-NEXT: .reg .pred %p<4>; | ||
; CHECK-NEXT: .reg .b32 %r<5>; | ||
; CHECK-EMPTY: | ||
; CHECK-NEXT: // %bb.0: | ||
; CHECK-NEXT: ld.param.u32 %r1, [icmp_i1_eq_param_0]; | ||
; CHECK-NEXT: setp.gt.s32 %p1, %r1, 1; | ||
; CHECK-NEXT: ld.param.u32 %r2, [icmp_i1_eq_param_1]; | ||
; CHECK-NEXT: setp.gt.s32 %p2, %r2, 1; | ||
; CHECK-NEXT: xor.pred %p3, %p1, %p2; | ||
; CHECK-NEXT: @%p3 bra $L__BB0_2; | ||
; CHECK-NEXT: // %bb.1: // %bb1 | ||
; CHECK-NEXT: mov.b32 %r4, 1; | ||
; CHECK-NEXT: st.param.b32 [func_retval0], %r4; | ||
; CHECK-NEXT: ret; | ||
; CHECK-NEXT: $L__BB0_2: // %bb2 | ||
; CHECK-NEXT: mov.b32 %r3, 127; | ||
; CHECK-NEXT: st.param.b32 [func_retval0], %r3; | ||
; CHECK-NEXT: ret; | ||
%p1 = icmp sgt i32 %a, 1 | ||
%p2 = icmp sgt i32 %b, 1 | ||
%c = icmp eq i1 %p1, %p2 | ||
br i1 %c, label %bb1, label %bb2 | ||
bb1: | ||
ret i32 1 | ||
bb2: | ||
ret i32 127 | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can we refactor this to:
define i1 @icmp_i1_eq(i1 %a, i1 %b) {
%c = icmp eq i1 %a, %b
ret i1 %c
}
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
No, setcc used by a brcond is handled differently and that is part of the problem.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
And why not
define i32 @f(i1 %0, i1 %1) {
BB:
%C = icmp ugt i1 %1, %0
br i1 %C, label %BB1, label %BB2
BB1:
ret i32 1
BB2:
ret i32 127
}
like in the reproducer?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Overall, LGTM :)
define i32 @icmp_i1_eq(i32 %a, i32 %b) { | ||
; CHECK-LABEL: icmp_i1_eq( | ||
; CHECK: { | ||
; CHECK-NEXT: .reg .pred %p<4>; | ||
; CHECK-NEXT: .reg .b32 %r<5>; | ||
; CHECK-EMPTY: | ||
; CHECK-NEXT: // %bb.0: | ||
; CHECK-NEXT: ld.param.u32 %r1, [icmp_i1_eq_param_0]; | ||
; CHECK-NEXT: setp.gt.s32 %p1, %r1, 1; | ||
; CHECK-NEXT: ld.param.u32 %r2, [icmp_i1_eq_param_1]; | ||
; CHECK-NEXT: setp.gt.s32 %p2, %r2, 1; | ||
; CHECK-NEXT: xor.pred %p3, %p1, %p2; | ||
; CHECK-NEXT: @%p3 bra $L__BB0_2; | ||
; CHECK-NEXT: // %bb.1: // %bb1 | ||
; CHECK-NEXT: mov.b32 %r4, 1; | ||
; CHECK-NEXT: st.param.b32 [func_retval0], %r4; | ||
; CHECK-NEXT: ret; | ||
; CHECK-NEXT: $L__BB0_2: // %bb2 | ||
; CHECK-NEXT: mov.b32 %r3, 127; | ||
; CHECK-NEXT: st.param.b32 [func_retval0], %r3; | ||
; CHECK-NEXT: ret; | ||
%p1 = icmp sgt i32 %a, 1 | ||
%p2 = icmp sgt i32 %b, 1 | ||
%c = icmp eq i1 %p1, %p2 | ||
br i1 %c, label %bb1, label %bb2 | ||
bb1: | ||
ret i32 1 | ||
bb2: | ||
ret i32 127 | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
And why not
define i32 @f(i1 %0, i1 %1) {
BB:
%C = icmp ugt i1 %1, %0
br i1 %C, label %BB1, label %BB2
BB1:
ret i32 1
BB2:
ret i32 127
}
like in the reproducer?
default: | ||
llvm_unreachable("Unknown integer setcc!"); | ||
case ISD::SETEQ: // X == Y --> ~(X ^ Y) | ||
Ret = DAG.getNOT(dl, DAG.getNode(ISD::XOR, dl, MVT::i1, LHS, RHS), |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Does this need to be guarded by TLI.isLegal(ISD::XOR)
?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'd prefer not to. NVPTX is the only target using these expansions and we don't have any way to expand if not. I don't see what the point of adding them would be.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
My question boils down to: will xor
be legalized if it's generated for a target that doesn't support it? In other words, can Expand
generate illegal code?
My assumption was no, but based on your answer, I'm gathering that Expand
can generate illegal code and rely on the legalizer to legalize it. Is that correct?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think the issue here is that there's more than one possible interpretation of "lower SetCC(i1) differently". NVPTX can do it via logical ops on predicates, other targets may not. On other targets it may be something else. Whoever would use "setCondCodeAction(...,Expand)" would have no idea what it would expand into. This should at least be documented. If/when we would encounter a target which would have a different way of expanding setcc, we would probably need a more nuanced hint to tell LLVM which flavor we actually want.
I think at the moment only Hexagon uses setCondCodeAction(...,i1, Expand)
. It would be good to check what effect this change has on them.
return DAG.getSetCC(SDLoc(N), SetCCVT, Op0, Op1, | ||
Equal ? ISD::SETEQ : ISD::SETNE); | ||
const ISD::CondCode CC = Equal ? ISD::SETEQ : ISD::SETNE; | ||
if (!LegalOperations || TLI.isCondCodeLegal(CC, Op0.getSimpleValueType())) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This could definitely use a comment. AFAICT what we're doing here is "combine into setcc(NE/EQ) only if we know they are legal, which leaves xor(i1) untouched and allows us to use eventually use logic ops on predicates" .
Do we actually need to add this special case here?
It appears that we do manage to lower setcc x, y, eq
as xor.pred
already : https://godbolt.org/z/cMx9d8P33
What happens if we remove the change here?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Presumably it will infinite loop the combiner without this
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
OK. That should also be mentioned in the comments.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yep, we get stuck in an infinite loop: legalization converts the setcc into an xor and then dag combiner converts it back to a setcc over and over again. I've added a comment explaining.
It appears that we do manage to lower setcc x, y, eq as xor.pred already : https://godbolt.org/z/cMx9d8P33
Good catch, this is the result of some old table-gen patterns which handled eq
and ne
. I've removed these as the legalization approach handles these cases and is more powerful since it allows DAGCombiner to manipulate the logical ops created.
default: | ||
llvm_unreachable("Unknown integer setcc!"); | ||
case ISD::SETEQ: // X == Y --> ~(X ^ Y) | ||
Ret = DAG.getNOT(dl, DAG.getNode(ISD::XOR, dl, MVT::i1, LHS, RHS), |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think the issue here is that there's more than one possible interpretation of "lower SetCC(i1) differently". NVPTX can do it via logical ops on predicates, other targets may not. On other targets it may be something else. Whoever would use "setCondCodeAction(...,Expand)" would have no idea what it would expand into. This should at least be documented. If/when we would encounter a target which would have a different way of expanding setcc, we would probably need a more nuanced hint to tell LLVM which flavor we actually want.
I think at the moment only Hexagon uses setCondCodeAction(...,i1, Expand)
. It would be good to check what effect this change has on them.
return DAG.getSetCC(SDLoc(N), SetCCVT, Op0, Op1, | ||
Equal ? ISD::SETEQ : ISD::SETNE); | ||
const ISD::CondCode CC = Equal ? ISD::SETEQ : ISD::SETNE; | ||
if (!LegalOperations || TLI.isCondCodeLegal(CC, Op0.getSimpleValueType())) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Presumably it will infinite loop the combiner without this
@@ -11874,6 +11874,47 @@ bool TargetLowering::LegalizeSetCCCondCode(SelectionDAG &DAG, EVT VT, | |||
return true; | |||
} | |||
|
|||
// Special case: expand i1 comparisons using logical operations. | |||
if (OpVT == MVT::i1) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why does this need the type special case? LegalizeSetCCCondCode should do as asked regardless of the type
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why does this need the type special case?
The logical simplifications we're using to expand setcc for i1 will not be correct for larger types.
LegalizeSetCCCondCode should do as asked regardless of the type
In theory yes, but this isn't the case today, and creating a valid expansion for all types x cond-codes would be a time consuming way to add a lot of in practice dead code.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do you have a more specific example for "not correct for larger types"? It's not an arbitrary wide typed value. The result conforms to getTargetBooleanContents, which in turn should respect all of the logical expansions
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Most of these expansions take advantage of the unique nature of the i1 type. For example consider signed greater than, this will only be true for i1s if and only if the first operand is 0 and the second is -1, hence it can be expressed as ~X & Y. This sort of bit manipulation is of course totally invalid for any larger type.
To see that this transformation is okay for i1, and not for a larger type here is an alive2 where the transform has been expressed on llvm IR: https://alive2.llvm.org/ce/z/6jS_rA
5fda546
to
125a552
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM with a nit.
125a552
to
c8da53d
Compare
Add DAG legalization support for expanding i1 SETCC nodes using appropriate logical operations to simulate integer comparisons. Use these expansions to handle i1 SETCC in NVPTX.
fixes #58428 and #57405