Skip to content

[WIP][AMDGPU] Enable "amdgpu-uniform-intrinsic-combine" pass in pipeline. #128687

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 18 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPU.h
Original file line number Diff line number Diff line change
Expand Up @@ -466,6 +466,17 @@ extern char &GCNRewritePartialRegUsesID;
void initializeAMDGPUWaitSGPRHazardsLegacyPass(PassRegistry &);
extern char &AMDGPUWaitSGPRHazardsLegacyID;

// Legacy pass manager hooks for the uniform intrinsic combine pass.
// Registers the legacy pass with the given PassRegistry.
void initializeAMDGPUUniformIntrinsicCombineLegacyPass(PassRegistry &);
// Reference to the legacy pass's static ID.
extern char &AMDGPUUniformIntrinsicCombineLegacyPassID;
// Returns a newly allocated instance of the legacy pass.
FunctionPass *createAMDGPUUniformIntrinsicCombineLegacyPass();

// New pass manager wrapper for the uniform intrinsic combine transformation.
// Constructed with the target machine it is registered for; run() is defined
// in AMDGPUUniformIntrinsicCombine.cpp.
struct AMDGPUUniformIntrinsicCombinePass
: public PassInfoMixin<AMDGPUUniformIntrinsicCombinePass> {
// NOTE(review): TM is stored here but the run() implementation in this change
// does not read it -- confirm whether it is still needed.
const AMDGPUTargetMachine &TM;
AMDGPUUniformIntrinsicCombinePass(const AMDGPUTargetMachine &TM) : TM(TM) {}
// Runs the combine on F; returns the set of analyses that remain valid.
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
};

namespace AMDGPU {
enum TargetIndex {
TI_CONSTDATA_START,
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ FUNCTION_PASS("amdgpu-unify-divergent-exit-nodes",
AMDGPUUnifyDivergentExitNodesPass())
FUNCTION_PASS("amdgpu-usenative", AMDGPUUseNativeCallsPass())
FUNCTION_PASS("si-annotate-control-flow", SIAnnotateControlFlowPass(*static_cast<const GCNTargetMachine *>(this)))
FUNCTION_PASS("amdgpu-uniform-intrinsic-combine", AMDGPUUniformIntrinsicCombinePass(*this))
#undef FUNCTION_PASS

#ifndef FUNCTION_ANALYSIS
Expand Down
13 changes: 12 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -477,6 +477,11 @@ static cl::opt<bool> HasClosedWorldAssumption(
cl::desc("Whether has closed-world assumption at link time"),
cl::init(false), cl::Hidden);

// Hidden command-line switch, on by default, that gates whether the uniform
// intrinsic combine pass is added by the peephole extension-point callback and
// by AMDGPUPassConfig::addIRPasses().
static cl::opt<bool> EnableUniformIntrinsicCombine(
"amdgpu-enable-uniform-intrinsic-combine",
cl::desc("Enable/Disable the Uniform Intrinsic Combine Pass"),
cl::init(true), cl::Hidden);

extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
// Register the target
RegisterTargetMachine<R600TargetMachine> X(getTheR600Target());
Expand Down Expand Up @@ -560,6 +565,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeGCNRegPressurePrinterPass(*PR);
initializeAMDGPUPreloadKernArgPrologLegacyPass(*PR);
initializeAMDGPUWaitSGPRHazardsLegacyPass(*PR);
initializeAMDGPUUniformIntrinsicCombineLegacyPass(*PR);
}

static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
Expand Down Expand Up @@ -829,13 +835,16 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
});

PB.registerPeepholeEPCallback(
[](FunctionPassManager &FPM, OptimizationLevel Level) {
[this](FunctionPassManager &FPM, OptimizationLevel Level) {
if (Level == OptimizationLevel::O0)
return;

FPM.addPass(AMDGPUUseNativeCallsPass());
if (EnableLibCallSimplify)
FPM.addPass(AMDGPUSimplifyLibCallsPass());

if (EnableUniformIntrinsicCombine)
FPM.addPass(AMDGPUUniformIntrinsicCombinePass(*this));
});

PB.registerCGSCCOptimizerLateEPCallback(
Expand Down Expand Up @@ -1213,6 +1222,8 @@ void AMDGPUPassConfig::addIRPasses() {
if (isPassEnabled(EnableImageIntrinsicOptimizer))
addPass(createAMDGPUImageIntrinsicOptimizerPass(&TM));

if (EnableUniformIntrinsicCombine)
addPass(createAMDGPUUniformIntrinsicCombineLegacyPass());
// This can be disabled by passing ::Disable here or on the command line
// with --expand-variadics-override=disable.
addPass(createExpandVariadicsPass(ExpandVariadicsMode::Lowering));
Expand Down
164 changes: 164 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
//===-- AMDGPUUniformIntrinsicCombine.cpp ---------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass simplifies certain intrinsic calls when the arguments are uniform.
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "llvm/Analysis/DomTreeUpdater.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/InitializePasses.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"

#define DEBUG_TYPE "amdgpu-uniform-intrinsic-combine"

using namespace llvm;
using namespace llvm::AMDGPU;
using namespace llvm::PatternMatch;

namespace {
/// Legacy pass manager wrapper around AMDGPUUniformIntrinsicCombineImpl.
class AMDGPUUniformIntrinsicCombineLegacy : public FunctionPass {
public:
static char ID;
/// Registers the pass with the global PassRegistry on construction.
AMDGPUUniformIntrinsicCombineLegacy() : FunctionPass(ID) {
initializeAMDGPUUniformIntrinsicCombineLegacyPass(
*PassRegistry::getPassRegistry());
}
/// Runs the combine on \p F; returns true if the IR was changed.
bool runOnFunction(Function &F) override;
/// Declares that the CFG is preserved and that uniformity information and
/// the target pass configuration must be available.
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
AU.addRequired<UniformityInfoWrapperPass>();
// NOTE(review): TargetPassConfig is required here but runOnFunction() does
// not query it -- confirm whether this dependency is needed.
AU.addRequired<TargetPassConfig>();
}
};

/// Shared implementation used by both the legacy and the new pass manager
/// wrappers.
// NOTE(review): the class derives from InstVisitor, but run() walks the
// function's instructions directly and does not call any visitor entry point.
class AMDGPUUniformIntrinsicCombineImpl
: public InstVisitor<AMDGPUUniformIntrinsicCombineImpl> {
private:
/// Uniformity results for the function being processed (not owned).
const UniformityInfo *UI;
/// Attempts to simplify one intrinsic call; returns true if IR was changed.
bool optimizeUniformIntrinsicInst(IntrinsicInst &II) const;

public:
AMDGPUUniformIntrinsicCombineImpl() = delete;
AMDGPUUniformIntrinsicCombineImpl(const UniformityInfo *UI) : UI(UI) {}
/// Processes every intrinsic call in \p F; returns true if any was changed.
bool run(Function &F);
};
} // namespace

// Storage for the legacy pass ID, plus the externally visible reference to it
// that AMDGPU.h declares.
char AMDGPUUniformIntrinsicCombineLegacy::ID = 0;
char &llvm::AMDGPUUniformIntrinsicCombineLegacyPassID =
AMDGPUUniformIntrinsicCombineLegacy::ID;

/// Legacy pass manager entry point: fetches the uniformity analysis for \p F
/// and hands the function to the shared implementation.
///
/// \returns true if the function was modified; false if the pass was skipped
///          or nothing changed.
bool AMDGPUUniformIntrinsicCombineLegacy::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  const UniformityInfo &Uniformity =
      getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
  AMDGPUUniformIntrinsicCombineImpl Combiner(&Uniformity);
  return Combiner.run(F);
}

/// New pass manager entry point: runs the shared implementation on \p F using
/// the uniformity analysis obtained from \p AM.
///
/// \returns all analyses preserved when nothing changed; otherwise the fixed
///          set of analyses listed below.
PreservedAnalyses
AMDGPUUniformIntrinsicCombinePass::run(Function &F,
                                       FunctionAnalysisManager &AM) {
  const UniformityInfo &Uniformity = AM.getResult<UniformityInfoAnalysis>(F);
  AMDGPUUniformIntrinsicCombineImpl Combiner(&Uniformity);

  if (Combiner.run(F)) {
    // The IR changed: report only the analyses this pass keeps valid.
    PreservedAnalyses Preserved;
    Preserved.preserve<DominatorTreeAnalysis>();
    Preserved.preserve<LoopAnalysis>();
    Preserved.preserve<ScalarEvolutionAnalysis>();
    Preserved.preserve<UniformityInfoAnalysis>();
    Preserved.preserve<TargetLibraryAnalysis>();
    return Preserved;
  }

  return PreservedAnalyses::all();
}

/// Walks every instruction of \p F and offers each intrinsic call to
/// optimizeUniformIntrinsicInst().
///
/// \returns true if at least one intrinsic call was rewritten.
bool AMDGPUUniformIntrinsicCombineImpl::run(Function &F) {
  bool Modified = false;

  // The early-increment range advances past the current instruction before
  // the loop body runs.
  for (Instruction &Inst : make_early_inc_range(instructions(F))) {
    auto *Call = dyn_cast<IntrinsicInst>(&Inst);
    if (!Call)
      continue;
    if (optimizeUniformIntrinsicInst(*Call))
      Modified = true;
  }

  return Modified;
}

/// Try to simplify one AMDGPU cross-lane intrinsic call whose source operand
/// is not a divergent use.
///
/// Handled cases:
///  * permlane64 / readfirstlane / readlane: every use of the call is replaced
///    by the source operand and the now-dead call is erased.
///  * ballot: `icmp eq (ballot x), 0` is replaced by `not x`, and
///    `icmp ne (ballot x), 0` is replaced by `x`. The zero may be on either
///    side of the comparison.
///
/// \param II the intrinsic call to inspect. It may be erased when this
///        function returns true, so the caller must not use it afterwards.
/// \returns true if the IR was modified.
bool AMDGPUUniformIntrinsicCombineImpl::optimizeUniformIntrinsicInst(
    IntrinsicInst &II) const {
  llvm::Intrinsic::ID IID = II.getIntrinsicID();

  switch (IID) {
  case Intrinsic::amdgcn_permlane64:
  case Intrinsic::amdgcn_readfirstlane:
  case Intrinsic::amdgcn_readlane: {
    Value *Src = II.getArgOperand(0);
    // Only fold when the use of the source operand is not divergent.
    if (UI->isDivergentUse(II.getOperandUse(0)))
      return false;
    LLVM_DEBUG(dbgs() << "Replacing " << II << " with " << *Src << "\n");
    II.replaceAllUsesWith(Src);
    // The call has no remaining users; remove it instead of leaving dead code
    // behind. run() iterates with an early-increment range, so erasing the
    // instruction currently being visited is safe.
    II.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_ballot: {
    Value *Src = II.getArgOperand(0);
    if (UI->isDivergentUse(II.getOperandUse(0)))
      return false;
    LLVM_DEBUG(dbgs() << "Found uniform ballot intrinsic: " << II << "\n");

    bool Changed = false;
    for (User *U : make_early_inc_range(II.users())) {
      auto *ICmp = dyn_cast<ICmpInst>(U);
      if (!ICmp)
        continue;

      const ICmpInst::Predicate Pred = ICmp->getPredicate();
      if (Pred != ICmpInst::ICMP_EQ && Pred != ICmpInst::ICMP_NE)
        continue;

      // ICmp is a user of II, so at least one operand is II; the other one
      // must be the zero constant for the fold to apply.
      Value *Op0 = ICmp->getOperand(0);
      Value *Op1 = ICmp->getOperand(1);
      Value *Other = (Op0 == &II) ? Op1 : Op0;
      if (!match(Other, m_Zero()))
        continue;

      // (ballot x) == 0  -->  !x
      // (ballot x) != 0  -->   x
      Value *Replacement = Src;
      if (Pred == ICmpInst::ICMP_EQ) {
        IRBuilder<> Builder(ICmp);
        Replacement = Builder.CreateNot(Src);
      }

      LLVM_DEBUG(dbgs() << "Replacing " << *ICmp << " with " << *Replacement
                        << "\n");
      ICmp->replaceAllUsesWith(Replacement);
      // The compare (and therefore the ballot it still references) is left in
      // place for later dead-code elimination: the caller's early-increment
      // iterator may already point at the compare, so erasing it here could
      // invalidate that iterator.
      Changed = true;
    }
    return Changed;
  }
  }
  return false;
}

// Legacy pass registration. The pass is registered under DEBUG_TYPE
// ("amdgpu-uniform-intrinsic-combine") and lists the same two passes that
// getAnalysisUsage() marks as required.
INITIALIZE_PASS_BEGIN(AMDGPUUniformIntrinsicCombineLegacy, DEBUG_TYPE,
"AMDGPU uniformIntrinsic Combine", false, false)
INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_END(AMDGPUUniformIntrinsicCombineLegacy, DEBUG_TYPE,
"AMDGPU uniformIntrinsic Combine", false, false)

/// Factory for the legacy pass manager: returns a newly allocated
/// AMDGPUUniformIntrinsicCombineLegacy instance.
FunctionPass *llvm::createAMDGPUUniformIntrinsicCombineLegacyPass() {
  auto *LegacyPass = new AMDGPUUniformIntrinsicCombineLegacy();
  return LegacyPass;
}
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ add_llvm_target(AMDGPUCodeGen
AMDGPUHSAMetadataStreamer.cpp
AMDGPUInsertDelayAlu.cpp
AMDGPUInstCombineIntrinsic.cpp
AMDGPUUniformIntrinsicCombine.cpp
AMDGPUInstrInfo.cpp
AMDGPUInstructionSelector.cpp
AMDGPUISelDAGToDAG.cpp
Expand Down
42 changes: 19 additions & 23 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll
Original file line number Diff line number Diff line change
Expand Up @@ -162,16 +162,17 @@ false:
define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_non_compare(i32 inreg %v) {
; CHECK-LABEL: branch_uniform_ballot_eq_zero_non_compare:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_and_b32 s0, 1, s0
; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
; CHECK-NEXT: s_xor_b32 s0, s0, 1
; CHECK-NEXT: s_xor_b32 s0, s0, 1
; CHECK-NEXT: s_and_b32 s0, s0, 1
; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cbranch_scc0 .LBB10_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
; CHECK-NEXT: s_branch .LBB10_3
; CHECK-NEXT: .LBB10_2: ; %true
; CHECK-NEXT: s_cbranch_scc1 .LBB10_2
; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB10_3
; CHECK-NEXT: .LBB10_2: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
; CHECK-NEXT: s_branch .LBB10_3
; CHECK-NEXT: .LBB10_3:
%c = trunc i32 %v to i1
%ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
Expand Down Expand Up @@ -259,17 +260,13 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_compare(i32 inreg %v) {
; CHECK-LABEL: branch_uniform_ballot_eq_zero_compare:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_cmp_lt_u32 s0, 12
; CHECK-NEXT: s_cselect_b32 s0, 1, 0
; CHECK-NEXT: s_and_b32 s0, 1, s0
; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cbranch_scc0 .LBB14_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
; CHECK-NEXT: s_branch .LBB14_3
; CHECK-NEXT: .LBB14_2: ; %true
; CHECK-NEXT: s_cbranch_scc1 .LBB14_2
; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB14_3
; CHECK-NEXT: .LBB14_2: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
; CHECK-NEXT: s_branch .LBB14_3
; CHECK-NEXT: .LBB14_3:
%c = icmp ult i32 %v, 12
%ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
Expand Down Expand Up @@ -374,16 +371,15 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_and(i32 inreg %v1, i32 inreg
; CHECK-NEXT: s_cmp_gt_u32 s1, 34
; CHECK-NEXT: s_cselect_b32 s1, 1, 0
; CHECK-NEXT: s_and_b32 s0, s0, s1
; CHECK-NEXT: s_and_b32 s0, 1, s0
; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
; CHECK-NEXT: s_and_b32 s0, s0, 1
; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cbranch_scc0 .LBB18_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
; CHECK-NEXT: s_branch .LBB18_3
; CHECK-NEXT: .LBB18_2: ; %true
; CHECK-NEXT: s_cbranch_scc1 .LBB18_2
; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB18_3
; CHECK-NEXT: .LBB18_2: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
; CHECK-NEXT: s_branch .LBB18_3
; CHECK-NEXT: .LBB18_3:
%v1c = icmp ult i32 %v1, 12
%v2c = icmp ugt i32 %v2, 34
Expand Down
46 changes: 21 additions & 25 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll
Original file line number Diff line number Diff line change
Expand Up @@ -165,16 +165,17 @@ false:
define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_non_compare(i32 inreg %v) {
; CHECK-LABEL: branch_uniform_ballot_eq_zero_non_compare:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_and_b32 s0, 1, s0
; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0
; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: s_cbranch_scc0 .LBB10_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
; CHECK-NEXT: s_branch .LBB10_3
; CHECK-NEXT: .LBB10_2: ; %true
; CHECK-NEXT: s_xor_b32 s0, s0, 1
; CHECK-NEXT: s_xor_b32 s0, s0, 1
; CHECK-NEXT: s_and_b32 s0, s0, 1
; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cbranch_scc1 .LBB10_2
; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB10_3
; CHECK-NEXT: .LBB10_2: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
; CHECK-NEXT: s_branch .LBB10_3
; CHECK-NEXT: .LBB10_3:
%c = trunc i32 %v to i1
%ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c)
Expand Down Expand Up @@ -262,17 +263,13 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_compare(i32 inreg %v) {
; CHECK-LABEL: branch_uniform_ballot_eq_zero_compare:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_cmp_lt_u32 s0, 12
; CHECK-NEXT: s_cselect_b32 s0, 1, 0
; CHECK-NEXT: s_and_b32 s0, 1, s0
; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0
; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: s_cbranch_scc0 .LBB14_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
; CHECK-NEXT: s_branch .LBB14_3
; CHECK-NEXT: .LBB14_2: ; %true
; CHECK-NEXT: s_cbranch_scc1 .LBB14_2
; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB14_3
; CHECK-NEXT: .LBB14_2: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
; CHECK-NEXT: s_branch .LBB14_3
; CHECK-NEXT: .LBB14_3:
%c = icmp ult i32 %v, 12
%ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c)
Expand Down Expand Up @@ -377,16 +374,15 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_and(i32 inreg %v1, i32 inreg
; CHECK-NEXT: s_cmp_gt_u32 s1, 34
; CHECK-NEXT: s_cselect_b32 s1, 1, 0
; CHECK-NEXT: s_and_b32 s0, s0, s1
; CHECK-NEXT: s_and_b32 s0, 1, s0
; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0
; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: s_cbranch_scc0 .LBB18_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
; CHECK-NEXT: s_branch .LBB18_3
; CHECK-NEXT: .LBB18_2: ; %true
; CHECK-NEXT: s_and_b32 s0, s0, 1
; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cbranch_scc1 .LBB18_2
; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB18_3
; CHECK-NEXT: .LBB18_2: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
; CHECK-NEXT: s_branch .LBB18_3
; CHECK-NEXT: .LBB18_3:
%v1c = icmp ult i32 %v1, 12
%v2c = icmp ugt i32 %v2, 34
Expand Down
Loading
Loading