[AMDGPU] Swap select operands to allow later v_cndmask shrinking into vop2 #142140

Open · wants to merge 5 commits into main
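In short: when a compare feeds selects whose divergent value sits in the false slot, the pass inverts the predicate and swaps the select operands, so instruction selection can place the divergent value in src1 of v_cndmask_b32 and use the 32-bit VOP2 encoding instead of VOP3. A minimal IR sketch of the idea (the functions below and the assumption that %a is divergent are illustrative, not taken from the patch):

; Before: the divergent %a is the false operand, so ISel must keep
; the VOP3 form: v_cndmask_b32_e64 v0, v0, 0, vcc
define float @before(float %a) {
entry:
  %cmp = fcmp olt float %a, 0.0
  %sel = select i1 %cmp, float 0.0, float %a
  ret float %sel
}

; After: predicate inverted (olt -> uge) and select operands swapped; the
; divergent value is now the true operand, so the select shrinks to VOP2:
; v_cndmask_b32_e32 v0, 0, v0, vcc
define float @after(float %a) {
entry:
  %cmp = fcmp uge float %a, 0.0
  %sel = select i1 %cmp, float %a, float 0.0
  ret float %sel
}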
69 changes: 69 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -313,6 +313,8 @@ class AMDGPUCodeGenPrepareImpl
                      FastMathFlags FMF) const;
  Value *emitSqrtIEEE2ULP(IRBuilder<> &Builder, Value *Src,
                          FastMathFlags FMF) const;
  bool swapSelectOperands(CmpInst &I);
  bool isFnegOrFabs(Value &V);

public:
  bool visitFDiv(BinaryOperator &I);
@@ -321,6 +323,7 @@ class AMDGPUCodeGenPrepareImpl
  bool visitBinaryOperator(BinaryOperator &I);
  bool visitLoadInst(LoadInst &I);
  bool visitICmpInst(ICmpInst &I);
  bool visitFCmpInst(FCmpInst &I);
  bool visitSelectInst(SelectInst &I);
  bool visitPHINode(PHINode &I);
  bool visitAddrSpaceCastInst(AddrSpaceCastInst &I);
@@ -891,6 +894,66 @@ static Value *emitRsqIEEE1ULP(IRBuilder<> &Builder, Value *Src,
  return Builder.CreateFMul(Rsq, OutputScaleFactor);
}

bool AMDGPUCodeGenPrepareImpl::isFnegOrFabs(Value &V) {
  auto *I = dyn_cast<Instruction>(&V);
  if (!I)
    return false;

  if (I->getOpcode() == Instruction::FNeg)
    return true;

  auto *CallI = dyn_cast<CallInst>(I);
  if (!CallI)
    return false;

  // Indirect calls have no called function.
  Function *CallF = CallI->getCalledFunction();
  return CallF && CallF->isIntrinsic() &&
         CallF->getIntrinsicID() == Intrinsic::fabs;
}

// Check whether the operands of the selects fed by this compare should be
// swapped so that the resulting v_cndmask can later be shrunk into VOP2.
bool AMDGPUCodeGenPrepareImpl::swapSelectOperands(CmpInst &I) {
  int ShouldSwap = 0;
  for (User *U : I.users()) {
    auto *SelectI = dyn_cast<SelectInst>(U);
    if (!SelectI)
      return false;

    Value *Op1 = SelectI->getOperand(1);
    Value *Op2 = SelectI->getOperand(2);

    // If an operand is defined by fneg or fabs, the select will carry source
    // modifiers and therefore can't be shrunk to VOP2 anyway; such users
    // don't get a vote.
    if (isFnegOrFabs(*Op1) || isFnegOrFabs(*Op2))
      continue;

    if (!UA.isDivergent(Op1) && UA.isDivergent(Op2))
      ShouldSwap++;
    else if (UA.isDivergent(Op1) && !UA.isDivergent(Op2))
      ShouldSwap--;
  }

  if (ShouldSwap <= 0)
    return false;

  // Swapping the select operands requires inverting the comparison.
  I.setPredicate(I.getInversePredicate());

  for (User *U : I.users()) {
    auto *SelectI = cast<SelectInst>(U);
    Value *Op1 = SelectI->getOperand(1);
    SelectI->setOperand(1, SelectI->getOperand(2));
    SelectI->setOperand(2, Op1);
  }
  return true;
}

bool AMDGPUCodeGenPrepareImpl::canOptimizeWithRsq(const FPMathOperator *SqrtOp,
                                                  FastMathFlags DivFMF,
                                                  FastMathFlags SqrtFMF) const {
@@ -1768,13 +1831,19 @@ bool AMDGPUCodeGenPrepareImpl::visitLoadInst(LoadInst &I) {
  return false;
}

bool AMDGPUCodeGenPrepareImpl::visitFCmpInst(FCmpInst &I) {
  return swapSelectOperands(I);
}

bool AMDGPUCodeGenPrepareImpl::visitICmpInst(ICmpInst &I) {
  bool Changed = false;

  if (ST.has16BitInsts() && needsPromotionToI32(I.getOperand(0)->getType()) &&
      UA.isUniform(&I))
    Changed |= promoteUniformOpToI32(I);

  Changed |= swapSelectOperands(I);

  return Changed;
}

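The test churn below follows a single pattern: each compare flips to its inverse predicate (gt to ngt in the float tests, gt to le in the integer case), and each v_cndmask_b32_e64 whose constant sat in src1 becomes a v_cndmask_b32_e32 with the constant in src0. The reason, as background the patch itself doesn't spell out, is that the VOP2 encoding only accepts a VGPR in src1:

; VOP3 (64-bit encoding): src1 may hold the inline constant
v_cndmask_b32_e64 v0, v0, 0, vcc
; VOP2 (32-bit encoding): src1 must be a VGPR, so the constant moves to src0
v_cndmask_b32_e32 v0, 0, v0, vcc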
10 changes: 6 additions & 4 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll
@@ -547,14 +547,16 @@ define amdgpu_ps i32 @irreducible_cfg(i32 %x, i32 %y, i32 %a0, i32 %a1, i32 %a2,
; GFX10-NEXT: s_xor_b32 s5, exec_lo, s5
; GFX10-NEXT: ; %bb.3: ; %.loopexit
; GFX10-NEXT: ; in Loop: Header=BB6_2 Depth=1
; GFX10-NEXT: v_cmp_gt_i32_e64 s0, v5, v0
; GFX10-NEXT: v_cmp_le_i32_e64 s0, v5, v0
; GFX10-NEXT: s_mov_b32 s6, exec_lo
; GFX10-NEXT: s_mov_b32 s7, exec_lo
; GFX10-NEXT: s_xor_b32 s6, vcc_lo, s6
; GFX10-NEXT: s_mov_b32 s8, exec_lo
; GFX10-NEXT: s_xor_b32 s7, s0, s7
; GFX10-NEXT: s_andn2_b32 s3, s3, exec_lo
; GFX10-NEXT: s_or_b32 s6, s0, s6
; GFX10-NEXT: s_or_b32 s6, s7, s6
; GFX10-NEXT: s_and_b32 s0, exec_lo, s0
; GFX10-NEXT: s_xor_b32 s6, s6, s7
; GFX10-NEXT: s_xor_b32 s6, s6, s8
; GFX10-NEXT: s_andn2_b32 s4, s4, exec_lo
; GFX10-NEXT: s_and_b32 s6, exec_lo, s6
; GFX10-NEXT: s_or_b32 s3, s3, s0
@@ -588,7 +590,7 @@ define amdgpu_ps i32 @irreducible_cfg(i32 %x, i32 %y, i32 %a0, i32 %a1, i32 %a2,
; GFX10-NEXT: s_branch .LBB6_1
; GFX10-NEXT: .LBB6_8: ; %.exit
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX10-NEXT: v_cndmask_b32_e64 v0, v2, v3, s2
; GFX10-NEXT: v_cndmask_b32_e64 v0, v3, v2, s2
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
.entry:
109 changes: 54 additions & 55 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/select-to-fmin-fmax.ll
@@ -5,8 +5,8 @@ define half @test_s16(half %a) #0 {
; GCN-LABEL: test_s16:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_cmp_gt_f16_e32 vcc, 0, v0
; GCN-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
; GCN-NEXT: v_cmp_ngt_f16_e32 vcc, 0, v0
; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
entry:
%fcmp = fcmp olt half %a, 0.0
@@ -18,8 +18,8 @@ define float @test_s32(float %a) #0 {
; GCN-LABEL: test_s32:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0
; GCN-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0
; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
entry:
%fcmp = fcmp olt float %a, 0.0
@@ -31,9 +31,9 @@ define double @test_s64(double %a) #0 {
; GCN-LABEL: test_s64:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[0:1]
; GCN-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
; GCN-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
; GCN-NEXT: v_cmp_ngt_f64_e32 vcc, 0, v[0:1]
; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
entry:
%fcmp = fcmp olt double %a, 0.0
@@ -45,20 +45,19 @@ define <4 x half> @test_v4s16(<4 x half> %a) #0 {
; GCN-LABEL: test_v4s16:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v4, 0
; GCN-NEXT: v_cmp_gt_f16_e32 vcc, 0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GCN-NEXT: v_cndmask_b32_e64 v5, v0, 0, vcc
; GCN-NEXT: v_cmp_lt_f16_sdwa s[4:5], v0, v4 src0_sel:WORD_1 src1_sel:DWORD
; GCN-NEXT: v_cmp_gt_f16_e32 vcc, 0, v1
; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; GCN-NEXT: v_cndmask_b32_e64 v0, v2, 0, s[4:5]
; GCN-NEXT: v_cndmask_b32_e64 v2, v1, 0, vcc
; GCN-NEXT: v_cmp_lt_f16_sdwa s[4:5], v1, v4 src0_sel:WORD_1 src1_sel:DWORD
; GCN-NEXT: v_cndmask_b32_e64 v1, v3, 0, s[4:5]
; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v5
; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GCN-NEXT: v_lshl_or_b32 v0, v0, 16, v3
; GCN-NEXT: v_mov_b32_e32 v2, 0
; GCN-NEXT: v_cmp_nlt_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:DWORD
; GCN-NEXT: v_cmp_ngt_f16_e64 s[6:7], 0, v0
; GCN-NEXT: v_cmp_nlt_f16_sdwa s[4:5], v1, v2 src0_sel:WORD_1 src1_sel:DWORD
; GCN-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[6:7]
; GCN-NEXT: v_cndmask_b32_sdwa v0, v2, v0, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GCN-NEXT: v_cmp_ngt_f16_e32 vcc, 0, v1
; GCN-NEXT: v_cndmask_b32_e32 v4, 0, v1, vcc
; GCN-NEXT: s_mov_b64 vcc, s[4:5]
; GCN-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v3
; GCN-NEXT: v_lshl_or_b32 v0, v0, 16, v2
; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v4
; GCN-NEXT: v_lshl_or_b32 v1, v1, 16, v2
; GCN-NEXT: s_setpc_b64 s[30:31]
entry:
@@ -72,29 +71,29 @@ define <8 x half> @test_v8s16(<8 x half> %a) #0 {
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v8, 0
; GCN-NEXT: v_cmp_gt_f16_e32 vcc, 0, v0
; GCN-NEXT: v_cmp_ngt_f16_e32 vcc, 0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v0
; GCN-NEXT: v_cndmask_b32_e64 v9, v0, 0, vcc
; GCN-NEXT: v_cmp_lt_f16_sdwa s[4:5], v0, v8 src0_sel:WORD_1 src1_sel:DWORD
; GCN-NEXT: v_cmp_gt_f16_e32 vcc, 0, v1
; GCN-NEXT: v_cndmask_b32_e32 v9, 0, v0, vcc
; GCN-NEXT: v_cmp_nlt_f16_sdwa vcc, v0, v8 src0_sel:WORD_1 src1_sel:DWORD
; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc
; GCN-NEXT: v_cmp_ngt_f16_e32 vcc, 0, v1
; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v1
; GCN-NEXT: v_cndmask_b32_e64 v0, v4, 0, s[4:5]
; GCN-NEXT: v_cndmask_b32_e64 v4, v1, 0, vcc
; GCN-NEXT: v_cmp_lt_f16_sdwa s[4:5], v1, v8 src0_sel:WORD_1 src1_sel:DWORD
; GCN-NEXT: v_cmp_gt_f16_e32 vcc, 0, v2
; GCN-NEXT: v_cndmask_b32_e32 v4, 0, v1, vcc
; GCN-NEXT: v_cmp_nlt_f16_sdwa vcc, v1, v8 src0_sel:WORD_1 src1_sel:DWORD
; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc
; GCN-NEXT: v_cmp_ngt_f16_e32 vcc, 0, v2
; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v2
; GCN-NEXT: v_cndmask_b32_e64 v1, v5, 0, s[4:5]
; GCN-NEXT: v_cndmask_b32_e64 v5, v2, 0, vcc
; GCN-NEXT: v_cmp_lt_f16_sdwa s[4:5], v2, v8 src0_sel:WORD_1 src1_sel:DWORD
; GCN-NEXT: v_cmp_gt_f16_e32 vcc, 0, v3
; GCN-NEXT: v_cndmask_b32_e32 v5, 0, v2, vcc
; GCN-NEXT: v_cmp_nlt_f16_sdwa vcc, v2, v8 src0_sel:WORD_1 src1_sel:DWORD
; GCN-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc
; GCN-NEXT: v_cmp_ngt_f16_e32 vcc, 0, v3
; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4
; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v3
; GCN-NEXT: v_cndmask_b32_e64 v2, v6, 0, s[4:5]
; GCN-NEXT: v_cndmask_b32_e64 v6, v3, 0, vcc
; GCN-NEXT: v_cmp_lt_f16_sdwa s[4:5], v3, v8 src0_sel:WORD_1 src1_sel:DWORD
; GCN-NEXT: v_cndmask_b32_e32 v6, 0, v3, vcc
; GCN-NEXT: v_cmp_nlt_f16_sdwa vcc, v3, v8 src0_sel:WORD_1 src1_sel:DWORD
; GCN-NEXT: v_lshl_or_b32 v1, v1, 16, v4
; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v5
; GCN-NEXT: v_cndmask_b32_e64 v3, v7, 0, s[4:5]
; GCN-NEXT: v_cndmask_b32_e32 v3, 0, v7, vcc
; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v9
; GCN-NEXT: v_lshl_or_b32 v2, v2, 16, v4
; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v6
@@ -111,10 +110,10 @@ define <2 x float> @test_v2s32(<2 x float> %a) #0 {
; GCN-LABEL: test_v2s32:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0
; GCN-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
; GCN-NEXT: v_cmp_gt_f32_e32 vcc, 0, v1
; GCN-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0
; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v1
; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
entry:
%fcmp = fcmp olt <2 x float> %a, zeroinitializer
@@ -126,14 +125,14 @@ define <4 x float> @test_v4s32(<4 x float> %a) #0 {
; GCN-LABEL: test_v4s32:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0
; GCN-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
; GCN-NEXT: v_cmp_gt_f32_e32 vcc, 0, v1
; GCN-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
; GCN-NEXT: v_cmp_gt_f32_e32 vcc, 0, v2
; GCN-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
; GCN-NEXT: v_cmp_gt_f32_e32 vcc, 0, v3
; GCN-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0
; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v1
; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v2
; GCN-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v3
; GCN-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
entry:
%fcmp = fcmp olt <4 x float> %a, zeroinitializer
@@ -145,12 +144,12 @@ define <2 x double> @test_v2s64(<2 x double> %a) #0 {
; GCN-LABEL: test_v2s64:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[0:1]
; GCN-NEXT: v_cmp_gt_f64_e64 s[4:5], 0, v[2:3]
; GCN-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
; GCN-NEXT: v_cndmask_b32_e64 v2, v2, 0, s[4:5]
; GCN-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
; GCN-NEXT: v_cndmask_b32_e64 v3, v3, 0, s[4:5]
; GCN-NEXT: v_cmp_ngt_f64_e32 vcc, 0, v[0:1]
; GCN-NEXT: v_cmp_ngt_f64_e64 s[4:5], 0, v[2:3]
; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; GCN-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[4:5]
; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GCN-NEXT: v_cndmask_b32_e64 v3, 0, v3, s[4:5]
; GCN-NEXT: s_setpc_b64 s[30:31]
entry:
%fcmp = fcmp olt <2 x double> %a, zeroinitializer