Skip to content

[DAG] Fold (setcc ((x | x >> c0 | ...) & mask)) sequences #146054

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 3 commits into
base: users/pierre-vh/tests-for-id-intrinsic-opts
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
89 changes: 88 additions & 1 deletion llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28912,13 +28912,100 @@ SDValue DAGCombiner::SimplifySelectCC(const SDLoc &DL, SDValue N0, SDValue N1,
return SDValue();
}

/// Try to turn a "merged bitfield extract" into a single mask of its source.
///
/// Matches \p Root of the form
///   (X | (X >> C0) | (X >> C1) | ...) & Mask
/// where every leaf of the OR tree is either X itself or a logical right shift
/// of X by a constant. This extracts parts of X and ORs them together; the
/// result is zero exactly when
///   X & (Mask | (Mask << C0) | (Mask << C1) | ...)
/// is zero, so the shifts and ORs can be eliminated when the value is only
/// compared against zero.
///
/// \param Root The candidate AND node.
/// \param DAG  DAG used to create the replacement nodes.
/// \param TLI  Currently unused.
/// \returns (and X, MergedMask) on a match, or an empty SDValue otherwise.
///          The returned node is NOT value-equivalent to \p Root in general;
///          it is only interchangeable with it in an eq/ne comparison with 0.
static SDValue matchMergedBFX(SDValue Root, SelectionDAG &DAG,
                              const TargetLowering &TLI) {
  EVT VT = Root.getValueType();

  // TODO: Support vectors?
  if (!VT.isScalarInteger() || Root.getOpcode() != ISD::AND)
    return SDValue();

  SDValue N0 = Root.getOperand(0);
  SDValue N1 = Root.getOperand(1);

  if (N0.getOpcode() != ISD::OR || !isa<ConstantSDNode>(N1))
    return SDValue();

  APInt RootMask = cast<ConstantSDNode>(N1)->getAsAPIntVal();
  const unsigned BitWidth = RootMask.getBitWidth();

  // The common source X. It is latched by the first leaf (or shift operand)
  // encountered; every later leaf must agree with it.
  SDValue Src;
  const auto IsSrc = [&](SDValue V) {
    if (!Src) {
      Src = V;
      return true;
    }

    return Src == V;
  };

  // Walk the OR tree iteratively, accumulating in PartsMask the bits of X that
  // each leaf makes visible through RootMask.
  SmallVector<SDValue> Worklist = {N0};
  APInt PartsMask(VT.getSizeInBits(), 0);
  while (!Worklist.empty()) {
    SDValue V = Worklist.pop_back_val();

    // Once the source is known, any other node in the tree must have a single
    // use. The source itself is exempt from this check.
    if (!V.hasOneUse() && (Src && Src != V))
      return SDValue();

    if (V.getOpcode() == ISD::OR) {
      Worklist.push_back(V.getOperand(0));
      Worklist.push_back(V.getOperand(1));
      continue;
    }

    if (V.getOpcode() == ISD::SRL) {
      SDValue ShiftSrc = V.getOperand(0);
      SDValue ShiftAmt = V.getOperand(1);

      if (!IsSrc(ShiftSrc) || !isa<ConstantSDNode>(ShiftAmt))
        return SDValue();

      // Range-check on the APInt before narrowing: the shift-amount constant
      // may be wider than 64 bits, in which case extracting it as a uint64_t
      // first would assert. Out-of-range shifts are not worth folding, so
      // reject anything that is not strictly less than the bit width.
      const APInt &ShiftAmtVal =
          cast<ConstantSDNode>(ShiftAmt)->getAPIntValue();
      if (ShiftAmtVal.uge(BitWidth))
        return SDValue();

      // (X >> C) & Mask looks at the bits of X selected by (Mask << C); mask
      // bits shifted out of range correspond to the zeros SRL shifts in.
      const unsigned ShAmt = static_cast<unsigned>(ShiftAmtVal.getZExtValue());
      PartsMask |= (RootMask << ShAmt);
      continue;
    }

    // An unshifted occurrence of the source contributes RootMask as-is.
    if (IsSrc(V)) {
      PartsMask |= RootMask;
      continue;
    }

    return SDValue();
  }

  if (!Src)
    return SDValue();

  SDLoc DL(Root);
  return DAG.getNode(ISD::AND, DL, VT,
                     {Src, DAG.getConstant(PartsMask, DL, VT)});
}

/// Simplify a setcc of \p N0 and \p N1 under condition \p Cond.
///
/// First defers to TargetLowering::SimplifySetCC. If that produces nothing and
/// the comparison is an integer eq/ne of an AND against zero, tries to fold a
/// merged bitfield-extract pattern on the AND (see matchMergedBFX) and
/// re-emits the comparison on the simplified operand.
///
/// \returns the simplified node, or an empty SDValue if nothing applied.
SDValue DAGCombiner::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
                                   ISD::CondCode Cond, const SDLoc &DL,
                                   bool foldBooleans) {
  TargetLowering::DAGCombinerInfo
    DagCombineInfo(DAG, Level, false, this);
  // Do not return the TLI result unconditionally: an empty result must fall
  // through to the DAGCombiner-local fold below.
  if (SDValue C =
          TLI.SimplifySetCC(VT, N0, N1, Cond, foldBooleans, DagCombineInfo, DL))
    return C;

  if (ISD::isIntEqualitySetCC(Cond) && N0.getOpcode() == ISD::AND &&
      isNullConstant(N1)) {
    // The matchMergedBFX result is only compared against zero here, reusing
    // the original zero operand and condition code.
    if (SDValue Res = matchMergedBFX(N0, DAG, TLI))
      return DAG.getSetCC(DL, VT, Res, N1, Cond);
  }

  return SDValue();
}

/// Given an ISD::SDIV node expressing a divide by constant, return
Expand Down
123 changes: 123 additions & 0 deletions llvm/test/CodeGen/AMDGPU/merged-bfx-opt.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -O3 -mtriple=amdgcn -mcpu=fiji %s -o - | FileCheck %s

; ORs together three 5-bit fields of an i16 (%arg >> 0, >> 5 and >> 10, each
; masked with 31, i.e. bits 0-4, 5-9 and 10-14) and tests the result for
; equality with zero. The checks expect no shifts or ORs: a single AND of the
; argument with the merged mask 0x7fff, followed by the compare.
define i1 @basic_eq_i16_3x5(i16 %arg) {
; CHECK-LABEL: basic_eq_i16_3x5:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; CHECK-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
%a = and i16 %arg, 31
%sh5 = lshr i16 %arg, 5
%b = and i16 %sh5, 31
%or = or i16 %a, %b
%sh10 = lshr i16 %arg, 10
%c = and i16 %sh10, 31
%or1 = or i16 %or, %c
%cmp = icmp eq i16 %or1, 0
ret i1 %cmp
}

; i32 variant: ORs together three 5-bit fields of %arg (bits 0-4, 5-9 and
; 10-14) and tests the result for equality with zero. The checks expect a
; single AND with the merged mask 0x7fff followed by a 32-bit compare.
define i1 @basic_eq_i32_3x5(i32 %arg) {
; CHECK-LABEL: basic_eq_i32_3x5:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
%a = and i32 %arg, 31
%sh5 = lshr i32 %arg, 5
%b = and i32 %sh5, 31
%or = or i32 %a, %b
%sh10 = lshr i32 %arg, 10
%c = and i32 %sh10, 31
%or1 = or i32 %or, %c
%cmp = icmp eq i32 %or1, 0
ret i1 %cmp
}

; i64 variant: ORs together three 5-bit fields of %arg (bits 0-4, 5-9 and
; 10-14) and tests the result for equality with zero. The checks expect a
; single 32-bit AND of v0 with 0x7fff and a 32-bit compare; v0 is presumably
; the low half of the i64 argument, the only half the merged mask touches.
define i1 @basic_eq_i64_3x5(i64 %arg) {
; CHECK-LABEL: basic_eq_i64_3x5:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
%a = and i64 %arg, 31
%sh5 = lshr i64 %arg, 5
%b = and i64 %sh5, 31
%or = or i64 %a, %b
%sh10 = lshr i64 %arg, 10
%c = and i64 %sh10, 31
%or1 = or i64 %or, %c
%cmp = icmp eq i64 %or1, 0
ret i1 %cmp
}

; Same three 5-bit fields as the i32 equality test (bits 0-4, 5-9 and 10-14),
; but compared with `icmp ne`. The checks expect a single AND with 0x7fff
; followed by a not-equal compare.
define i1 @basic_ne_i32_3x5(i32 %arg) {
; CHECK-LABEL: basic_ne_i32_3x5:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
%a = and i32 %arg, 31
%sh5 = lshr i32 %arg, 5
%b = and i32 %sh5, 31
%or = or i32 %a, %b
%sh10 = lshr i32 %arg, 10
%c = and i32 %sh10, 31
%or1 = or i32 %or, %c
%cmp = icmp ne i32 %or1, 0
ret i1 %cmp
}

; Three 5-bit fields taken at shifts 0, 7 and 10 (bits 0-4, 7-11 and 10-14).
; The fields overlap at bits 10-11 and leave bits 5-6 untested, so the merged
; mask has a hole: the checks expect an AND with 0x7f9f.
; NOTE(review): despite the "eq" in the name, the compare is `icmp ne`, and
; %sh5 actually holds a shift by 7.
define i1 @eq_i32_3x5_holes_in_mask(i32 %arg) {
; CHECK-LABEL: eq_i32_3x5_holes_in_mask:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_and_b32_e32 v0, 0x7f9f, v0
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
%a = and i32 %arg, 31
%sh5 = lshr i32 %arg, 7
%b = and i32 %sh5, 31
%or = or i32 %a, %b
%sh10 = lshr i32 %arg, 10
%c = and i32 %sh10, 31
%or1 = or i32 %or, %c
%cmp = icmp ne i32 %or1, 0
ret i1 %cmp
}

; Every field is shifted: 5-bit fields taken at shifts 2, 7 and 10 (bits 2-6,
; 7-11 and 10-14), with no unshifted use of %arg. Bits 0-1 are never tested,
; so the checks expect an AND with the merged mask 0x7ffc.
; NOTE(review): despite the "eq" in the name, the compare is `icmp ne`, and
; %sh5 actually holds a shift by 7.
define i1 @eq_i32_3x5_all_shifted(i32 %arg) {
; CHECK-LABEL: eq_i32_3x5_all_shifted:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_and_b32_e32 v0, 0x7ffc, v0
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
%sh2 = lshr i32 %arg, 2
%a = and i32 %sh2, 31
%sh5 = lshr i32 %arg, 7
%b = and i32 %sh5, 31
%or = or i32 %a, %b
%sh10 = lshr i32 %arg, 10
%c = and i32 %sh10, 31
%or1 = or i32 %or, %c
%cmp = icmp ne i32 %or1, 0
ret i1 %cmp
}
34 changes: 6 additions & 28 deletions llvm/test/CodeGen/AMDGPU/workitem-intrinsic-opts.ll
Original file line number Diff line number Diff line change
Expand Up @@ -12,22 +12,15 @@ define i1 @workitem_zero() {
; DAGISEL-GFX8-LABEL: workitem_zero:
; DAGISEL-GFX8: ; %bb.0: ; %entry
; DAGISEL-GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; DAGISEL-GFX8-NEXT: v_lshrrev_b32_e32 v1, 10, v31
; DAGISEL-GFX8-NEXT: v_lshrrev_b32_e32 v0, 20, v31
; DAGISEL-GFX8-NEXT: v_or_b32_e32 v1, v31, v1
; DAGISEL-GFX8-NEXT: v_or_b32_e32 v0, v1, v0
; DAGISEL-GFX8-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; DAGISEL-GFX8-NEXT: v_and_b32_e32 v0, 0x3fffffff, v31
; DAGISEL-GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; DAGISEL-GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; DAGISEL-GFX8-NEXT: s_setpc_b64 s[30:31]
;
; DAGISEL-GFX942-LABEL: workitem_zero:
; DAGISEL-GFX942: ; %bb.0: ; %entry
; DAGISEL-GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; DAGISEL-GFX942-NEXT: v_lshrrev_b32_e32 v0, 20, v31
; DAGISEL-GFX942-NEXT: v_lshrrev_b32_e32 v1, 10, v31
; DAGISEL-GFX942-NEXT: v_or3_b32 v0, v31, v1, v0
; DAGISEL-GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; DAGISEL-GFX942-NEXT: v_and_b32_e32 v0, 0x3fffffff, v31
; DAGISEL-GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; DAGISEL-GFX942-NEXT: s_nop 1
; DAGISEL-GFX942-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
Expand All @@ -40,11 +33,7 @@ define i1 @workitem_zero() {
; DAGISEL-GFX12-NEXT: s_wait_samplecnt 0x0
; DAGISEL-GFX12-NEXT: s_wait_bvhcnt 0x0
; DAGISEL-GFX12-NEXT: s_wait_kmcnt 0x0
; DAGISEL-GFX12-NEXT: v_lshrrev_b32_e32 v0, 20, v31
; DAGISEL-GFX12-NEXT: v_lshrrev_b32_e32 v1, 10, v31
; DAGISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; DAGISEL-GFX12-NEXT: v_or3_b32 v0, v31, v1, v0
; DAGISEL-GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; DAGISEL-GFX12-NEXT: v_and_b32_e32 v0, 0x3fffffff, v31
; DAGISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; DAGISEL-GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; DAGISEL-GFX12-NEXT: s_wait_alu 0xfffd
Expand Down Expand Up @@ -106,22 +95,15 @@ define i1 @workitem_nonzero() {
; DAGISEL-GFX8-LABEL: workitem_nonzero:
; DAGISEL-GFX8: ; %bb.0: ; %entry
; DAGISEL-GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; DAGISEL-GFX8-NEXT: v_lshrrev_b32_e32 v1, 10, v31
; DAGISEL-GFX8-NEXT: v_lshrrev_b32_e32 v0, 20, v31
; DAGISEL-GFX8-NEXT: v_or_b32_e32 v1, v31, v1
; DAGISEL-GFX8-NEXT: v_or_b32_e32 v0, v1, v0
; DAGISEL-GFX8-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; DAGISEL-GFX8-NEXT: v_and_b32_e32 v0, 0x3fffffff, v31
; DAGISEL-GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; DAGISEL-GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; DAGISEL-GFX8-NEXT: s_setpc_b64 s[30:31]
;
; DAGISEL-GFX942-LABEL: workitem_nonzero:
; DAGISEL-GFX942: ; %bb.0: ; %entry
; DAGISEL-GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; DAGISEL-GFX942-NEXT: v_lshrrev_b32_e32 v0, 20, v31
; DAGISEL-GFX942-NEXT: v_lshrrev_b32_e32 v1, 10, v31
; DAGISEL-GFX942-NEXT: v_or3_b32 v0, v31, v1, v0
; DAGISEL-GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; DAGISEL-GFX942-NEXT: v_and_b32_e32 v0, 0x3fffffff, v31
; DAGISEL-GFX942-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; DAGISEL-GFX942-NEXT: s_nop 1
; DAGISEL-GFX942-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
Expand All @@ -134,11 +116,7 @@ define i1 @workitem_nonzero() {
; DAGISEL-GFX12-NEXT: s_wait_samplecnt 0x0
; DAGISEL-GFX12-NEXT: s_wait_bvhcnt 0x0
; DAGISEL-GFX12-NEXT: s_wait_kmcnt 0x0
; DAGISEL-GFX12-NEXT: v_lshrrev_b32_e32 v0, 20, v31
; DAGISEL-GFX12-NEXT: v_lshrrev_b32_e32 v1, 10, v31
; DAGISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; DAGISEL-GFX12-NEXT: v_or3_b32 v0, v31, v1, v0
; DAGISEL-GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; DAGISEL-GFX12-NEXT: v_and_b32_e32 v0, 0x3fffffff, v31
; DAGISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; DAGISEL-GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; DAGISEL-GFX12-NEXT: s_wait_alu 0xfffd
Expand Down