Skip to content

Commit

Permalink
[GlobalISel][Localizer] Rewrite localizer to run in 2 phases, inter &…
Browse files Browse the repository at this point in the history
… intra block.

Inter-block localization is the same as what currently happens, except now it
only runs on the entry block because that's where the problematic constants with
long live ranges come from.

The second phase is a new intra-block localization phase which attempts to
re-sink the already localized instructions further right before one of the
multiple uses.

One additional change is to also localize G_GLOBAL_VALUE as they're constants
too. However, on some targets like arm64 it takes multiple instructions to
materialize the value, so some additional heuristics with a TTI hook have been
introduced in an attempt to prevent code size regressions when localizing these.

Overall, these changes improve CTMark code size on arm64 by 1.2%.

Full code size results:

Program                                         baseline       new       diff
------------------------------------------------------------------------------
 test-suite...-typeset/consumer-typeset.test    1249984      1217216     -2.6%
 test-suite...:: CTMark/ClamAV/clamscan.test    1264928      1232152     -2.6%
 test-suite :: CTMark/SPASS/SPASS.test          1394092      1361316     -2.4%
 test-suite...Mark/mafft/pairlocalalign.test    731320       714928      -2.2%
 test-suite :: CTMark/lencod/lencod.test        1340592      1324200     -1.2%
 test-suite :: CTMark/kimwitu++/kc.test         3853512      3820420     -0.9%
 test-suite :: CTMark/Bullet/bullet.test        3406036      3389652     -0.5%
 test-suite...ark/tramp3d-v4/tramp3d-v4.test    8017000      8016992     -0.0%
 test-suite...TMark/7zip/7zip-benchmark.test    2856588      2856588      0.0%
 test-suite...:: CTMark/sqlite3/sqlite3.test    765704       765704       0.0%
 Geomean difference                                                      -1.2%

Differential Revision: https://reviews.llvm.org/D63303

llvm-svn: 363632
  • Loading branch information
aemerson committed Jun 17, 2019
1 parent f9bff2a commit 1468822
Show file tree
Hide file tree
Showing 8 changed files with 341 additions and 62 deletions.
11 changes: 11 additions & 0 deletions llvm/include/llvm/Analysis/TargetTransformInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -1053,6 +1053,11 @@ class TargetTransformInfo {
/// \returns True if the target wants to expand the given reduction intrinsic
/// into a shuffle sequence.
bool shouldExpandReduction(const IntrinsicInst *II) const;

/// \returns the size cost of rematerializing a GlobalValue address relative
/// to a stack reload.
unsigned getGISelRematGlobalCost() const;

/// @}

private:
Expand Down Expand Up @@ -1269,6 +1274,7 @@ class TargetTransformInfo::Concept {
virtual bool useReductionIntrinsic(unsigned Opcode, Type *Ty,
ReductionFlags) const = 0;
virtual bool shouldExpandReduction(const IntrinsicInst *II) const = 0;
virtual unsigned getGISelRematGlobalCost() const = 0;
virtual int getInstructionLatency(const Instruction *I) = 0;
};

Expand Down Expand Up @@ -1701,6 +1707,11 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
// Forwards the query to the wrapped implementation object (Impl).
bool shouldExpandReduction(const IntrinsicInst *II) const override {
return Impl.shouldExpandReduction(II);
}

// Forwards the GlobalISel global-address rematerialization cost query to the
// wrapped implementation object (Impl).
unsigned getGISelRematGlobalCost() const override {
return Impl.getGISelRematGlobalCost();
}

// Forwards the latency query for \p I to the wrapped implementation object
// (Impl).
int getInstructionLatency(const Instruction *I) override {
return Impl.getInstructionLatency(I);
}
Expand Down
4 changes: 4 additions & 0 deletions llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -572,6 +572,10 @@ class TargetTransformInfoImplBase {
return true;
}

/// Default size cost of rematerializing a GlobalValue address. The GlobalISel
/// Localizer treats a cost of 1 as "basically free" and always localizes
/// G_GLOBAL_VALUE in that case.
unsigned getGISelRematGlobalCost() const {
return 1;
}

protected:
// Obtain the minimum required size to hold the value (without the sign)
// In case of a vector it returns the min required size for one element.
Expand Down
12 changes: 11 additions & 1 deletion llvm/include/llvm/CodeGen/GlobalISel/Localizer.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
namespace llvm {
// Forward declarations.
class MachineRegisterInfo;
class TargetTransformInfo;

/// This pass implements the localization mechanism described at the
/// top of this file. One specificity of the implementation is that
Expand All @@ -43,9 +44,11 @@ class Localizer : public MachineFunctionPass {
/// MRI contains all the register class/bank information that this
/// pass uses and updates.
MachineRegisterInfo *MRI;
/// TTI used for getting remat costs for instructions.
TargetTransformInfo *TTI;

/// Check whether or not \p MI needs to be moved close to its uses.
static bool shouldLocalize(const MachineInstr &MI);
bool shouldLocalize(const MachineInstr &MI);

/// Check if \p MOUse is used in the same basic block as \p Def.
/// If the use is in the same block, we say it is local.
Expand All @@ -57,6 +60,13 @@ class Localizer : public MachineFunctionPass {
/// Initialize the field members using \p MF.
void init(MachineFunction &MF);

/// Do inter-block localization from the entry block.
bool localizeInterBlock(MachineFunction &MF,
SmallPtrSetImpl<MachineInstr *> &LocalizedInstrs);

/// Do intra-block localization of already localized instructions.
bool localizeIntraBlock(SmallPtrSetImpl<MachineInstr *> &LocalizedInstrs);

public:
Localizer();

Expand Down
4 changes: 4 additions & 0 deletions llvm/lib/Analysis/TargetTransformInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -724,6 +724,10 @@ bool TargetTransformInfo::shouldExpandReduction(const IntrinsicInst *II) const {
return TTIImpl->shouldExpandReduction(II);
}

// Returns the GlobalValue rematerialization size cost by forwarding to the
// implementation held in TTIImpl.
unsigned TargetTransformInfo::getGISelRematGlobalCost() const {
return TTIImpl->getGISelRematGlobalCost();
}

// Returns the latency of \p I by forwarding to the implementation held in
// TTIImpl.
int TargetTransformInfo::getInstructionLatency(const Instruction *I) const {
return TTIImpl->getInstructionLatency(I);
}
Expand Down
219 changes: 158 additions & 61 deletions llvm/lib/CodeGen/GlobalISel/Localizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
//===----------------------------------------------------------------------===//

#include "llvm/CodeGen/GlobalISel/Localizer.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
Expand All @@ -20,17 +21,55 @@
using namespace llvm;

char Localizer::ID = 0;
INITIALIZE_PASS(Localizer, DEBUG_TYPE,
"Move/duplicate certain instructions close to their use", false,
false)
INITIALIZE_PASS_BEGIN(Localizer, DEBUG_TYPE,
"Move/duplicate certain instructions close to their use",
false, false)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(Localizer, DEBUG_TYPE,
"Move/duplicate certain instructions close to their use",
false, false)

// Constructs the pass with its static ID and runs the pass initialization
// routine against the process-wide PassRegistry.
Localizer::Localizer() : MachineFunctionPass(ID) {
initializeLocalizerPass(*PassRegistry::getPassRegistry());
}

void Localizer::init(MachineFunction &MF) { MRI = &MF.getRegInfo(); }
// Caches the per-function state used by the pass: the register info of \p MF
// and the TargetTransformInfo for the IR function that \p MF was built from.
void Localizer::init(MachineFunction &MF) {
MRI = &MF.getRegInfo();
// TTI is consulted by shouldLocalize() for the G_GLOBAL_VALUE remat cost.
TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(MF.getFunction());
}

bool Localizer::shouldLocalize(const MachineInstr &MI) {
// Maps a rematerialization cost to the largest number of users for which
// localizing is still considered worthwhile for code size. The model assumes
// a spill and a reload each cost one instruction. E.g. on arm64 a global
// address takes 2 instructions to materialize, so with 2 users remat breaks
// even against potentially spilling; with more than 2 users there is a net
// code size increase. Register pressure is not taken into account.
auto maxUses = [](unsigned RematCost) -> unsigned {
  switch (RematCost) {
  case 0:
    // A zero cost is not a meaningful answer from the target hook.
    llvm_unreachable("Unexpected remat cost");
  case 1:
    // A cost of 1 means remats are basically free: no limit on users.
    return UINT_MAX;
  case 2:
    return 2U;
  default:
    // Remat is too expensive, only sink if there's one user.
    return 1U;
  }
};

// Returns true when \p Reg has no more than \p MaxUses non-debug using
// instructions. The walk stops as soon as the limit is hit, so we never
// traverse the whole use list just to learn that the limit was exceeded.
auto isUsesAtMost = [&](unsigned Reg, unsigned MaxUses) {
  auto UseIt = MRI->use_instr_nodbg_begin(Reg);
  auto UseEnd = MRI->use_instr_nodbg_end();
  // Step over at most MaxUses users.
  for (unsigned Seen = 0; Seen != MaxUses && UseIt != UseEnd; ++Seen)
    ++UseIt;
  // Anything left over means there are more than MaxUses users.
  return UseIt == UseEnd;
};

switch (MI.getOpcode()) {
default:
return false;
Expand All @@ -40,10 +79,20 @@ bool Localizer::shouldLocalize(const MachineInstr &MI) {
case TargetOpcode::G_FCONSTANT:
case TargetOpcode::G_FRAME_INDEX:
return true;
case TargetOpcode::G_GLOBAL_VALUE: {
unsigned RematCost = TTI->getGISelRematGlobalCost();
unsigned Reg = MI.getOperand(0).getReg();
unsigned MaxUses = maxUses(RematCost);
if (MaxUses == UINT_MAX)
return true; // Remats are "free" so always localize.
bool B = isUsesAtMost(Reg, MaxUses);
return B;
}
}
}

// Declares the analyses this pass depends on.
void Localizer::getAnalysisUsage(AnalysisUsage &AU) const {
// init() fetches TargetTransformInfoWrapperPass via getAnalysis<>, so it must
// be listed as required here.
AU.addRequired<TargetTransformInfoWrapperPass>();
getSelectionDAGFallbackAnalysisUsage(AU);
MachineFunctionPass::getAnalysisUsage(AU);
}
Expand All @@ -57,6 +106,106 @@ bool Localizer::isLocalUse(MachineOperand &MOUse, const MachineInstr &Def,
return InsertMBB == Def.getParent();
}

// Inter-block phase: for every localizable instruction in the entry block of
// \p MF, give each non-local use its own clone of the instruction placed in
// the block where the use needs it. At most one clone is created per
// (block, original register) pair; later uses in the same block reuse it.
// Every clone is recorded in \p LocalizedInstrs.
// \returns true if any use was rewritten.
bool Localizer::localizeInterBlock(
MachineFunction &MF, SmallPtrSetImpl<MachineInstr *> &LocalizedInstrs) {
bool Changed = false;
// Maps (insertion block, original vreg) to the vreg defined by the clone
// already inserted in that block.
DenseMap<std::pair<MachineBasicBlock *, unsigned>, unsigned> MBBWithLocalDef;

// Since the IRTranslator only emits constants into the entry block, and the
// rest of the GISel pipeline generally emits constants close to their users,
// we only localize instructions in the entry block here. This might change if
// we start doing CSE across blocks.
auto &MBB = MF.front();
for (MachineInstr &MI : MBB) {
// Skip clones created by this pass and anything not worth localizing.
if (LocalizedInstrs.count(&MI) || !shouldLocalize(MI))
continue;
LLVM_DEBUG(dbgs() << "Should localize: " << MI);
assert(MI.getDesc().getNumDefs() == 1 &&
"More than one definition not supported yet");
unsigned Reg = MI.getOperand(0).getReg();
// Check if all the users of MI are local.
// We are going to invalidate the list of use operands (setReg below moves
// an operand off Reg's use list), so we can't use a range-based for loop;
// the iterator is advanced before the operand is modified.
for (auto MOIt = MRI->use_begin(Reg), MOItEnd = MRI->use_end();
MOIt != MOItEnd;) {
MachineOperand &MOUse = *MOIt++;
// Check if the use is already local.
// InsertMBB is filled in by isLocalUse() and names the block a clone would
// have to live in for this use.
MachineBasicBlock *InsertMBB;
LLVM_DEBUG(MachineInstr &MIUse = *MOUse.getParent();
dbgs() << "Checking use: " << MIUse
<< " #Opd: " << MIUse.getOperandNo(&MOUse) << '\n');
if (isLocalUse(MOUse, MI, InsertMBB))
continue;
LLVM_DEBUG(dbgs() << "Fixing non-local use\n");
Changed = true;
auto MBBAndReg = std::make_pair(InsertMBB, Reg);
auto NewVRegIt = MBBWithLocalDef.find(MBBAndReg);
if (NewVRegIt == MBBWithLocalDef.end()) {
// Create the localized instruction.
MachineInstr *LocalizedMI = MF.CloneMachineInstr(&MI);
LocalizedInstrs.insert(LocalizedMI);
MachineInstr &UseMI = *MOUse.getParent();
// With a single, non-PHI user the clone can go directly in front of that
// user. Otherwise it goes at the top of the block, after PHIs and labels.
if (MRI->hasOneUse(Reg) && !UseMI.isPHI())
InsertMBB->insert(InsertMBB->SkipPHIsAndLabels(UseMI), LocalizedMI);
else
InsertMBB->insert(InsertMBB->SkipPHIsAndLabels(InsertMBB->begin()),
LocalizedMI);

// Set a new register for the definition.
// The new vreg copies the type and register class/bank of the original.
unsigned NewReg = MRI->createGenericVirtualRegister(MRI->getType(Reg));
MRI->setRegClassOrRegBank(NewReg, MRI->getRegClassOrRegBank(Reg));
LocalizedMI->getOperand(0).setReg(NewReg);
NewVRegIt =
MBBWithLocalDef.insert(std::make_pair(MBBAndReg, NewReg)).first;
LLVM_DEBUG(dbgs() << "Inserted: " << *LocalizedMI);
}
LLVM_DEBUG(dbgs() << "Update use with: " << printReg(NewVRegIt->second)
<< '\n');
// Update the user reg.
MOUse.setReg(NewVRegIt->second);
}
}
return Changed;
}

/// Intra-block phase: re-sink each already-localized instruction so that it
/// sits immediately before the first of its users in its own block.
///
/// \param LocalizedInstrs the clones created by localizeInterBlock().
/// \returns true if any instruction was moved.
bool Localizer::localizeIntraBlock(
    SmallPtrSetImpl<MachineInstr *> &LocalizedInstrs) {
  bool Changed = false;

  // For each already-localized instruction which has multiple users, we scan
  // the block top down from the current position until we hit one of them.

  // FIXME: Consider doing inst duplication if live ranges are very long due to
  // many users, but this case may be better served by regalloc improvements.

  // NOTE(review): LocalizedInstrs is a pointer set, so the visiting order here
  // may depend on pointer values. Confirm this cannot change the relative
  // order of instructions that end up in front of the same user.
  for (MachineInstr *MI : LocalizedInstrs) {
    unsigned Reg = MI->getOperand(0).getReg();
    MachineBasicBlock &MBB = *MI->getParent();
    // If the instruction has a single use, we would have already moved it right
    // before its user in localizeInterBlock().
    if (MRI->hasOneUse(Reg))
      continue;

    // All of the non-PHI user MIs of this reg. PHI users are left out on
    // purpose: a def feeding a PHI is placed in the predecessor block, so the
    // PHI is never found by the forward scan below, and a def must not be
    // sunk in front of a PHI anyway.
    SmallPtrSet<MachineInstr *, 32> Users;
    for (MachineInstr &UseMI : MRI->use_nodbg_instructions(Reg)) {
      if (!UseMI.isPHI())
        Users.insert(&UseMI);
    }
    // If every user was a PHI there is nothing in this block to sink towards.
    if (Users.empty())
      continue;

    // Walk forward from just after MI until the first user is reached.
    MachineBasicBlock::iterator II(MI);
    ++II;
    while (II != MBB.end() && !Users.count(&*II))
      ++II;

    // Check the iterator before it is dereferenced for the debug output.
    assert(II != MBB.end() && "Didn't find the user in the MBB");
    LLVM_DEBUG(dbgs() << "Intra-block: moving " << *MI << " before " << *II
                      << "\n");
    MI->removeFromParent();
    MBB.insert(II, MI);
    Changed = true;
  }
  return Changed;
}

bool Localizer::runOnMachineFunction(MachineFunction &MF) {
// If the ISel pipeline failed, do not bother running that pass.
if (MF.getProperties().hasProperty(
Expand All @@ -67,62 +216,10 @@ bool Localizer::runOnMachineFunction(MachineFunction &MF) {

init(MF);

bool Changed = false;
// Keep track of the instructions we localized.
// We won't need to process them if we see them later in the CFG.
SmallPtrSet<MachineInstr *, 16> LocalizedInstrs;
DenseMap<std::pair<MachineBasicBlock *, unsigned>, unsigned> MBBWithLocalDef;
// TODO: Do bottom up traversal.
for (MachineBasicBlock &MBB : MF) {
for (MachineInstr &MI : MBB) {
if (LocalizedInstrs.count(&MI) || !shouldLocalize(MI))
continue;
LLVM_DEBUG(dbgs() << "Should localize: " << MI);
assert(MI.getDesc().getNumDefs() == 1 &&
"More than one definition not supported yet");
unsigned Reg = MI.getOperand(0).getReg();
// Check if all the users of MI are local.
// We are going to invalidation the list of use operands, so we
// can't use range iterator.
for (auto MOIt = MRI->use_begin(Reg), MOItEnd = MRI->use_end();
MOIt != MOItEnd;) {
MachineOperand &MOUse = *MOIt++;
// Check if the use is already local.
MachineBasicBlock *InsertMBB;
LLVM_DEBUG(MachineInstr &MIUse = *MOUse.getParent();
dbgs() << "Checking use: " << MIUse
<< " #Opd: " << MIUse.getOperandNo(&MOUse) << '\n');
if (isLocalUse(MOUse, MI, InsertMBB))
continue;
LLVM_DEBUG(dbgs() << "Fixing non-local use\n");
Changed = true;
auto MBBAndReg = std::make_pair(InsertMBB, Reg);
auto NewVRegIt = MBBWithLocalDef.find(MBBAndReg);
if (NewVRegIt == MBBWithLocalDef.end()) {
// Create the localized instruction.
MachineInstr *LocalizedMI = MF.CloneMachineInstr(&MI);
LocalizedInstrs.insert(LocalizedMI);
// Don't try to be smart for the insertion point.
// There is no guarantee that the first seen use is the first
// use in the block.
InsertMBB->insert(InsertMBB->SkipPHIsAndLabels(InsertMBB->begin()),
LocalizedMI);
// Keep track of the instructions we localized. We'll do a second pass of
// intra-block localization to further reduce live ranges.
SmallPtrSet<MachineInstr *, 32> LocalizedInstrs;

// Set a new register for the definition.
unsigned NewReg =
MRI->createGenericVirtualRegister(MRI->getType(Reg));
MRI->setRegClassOrRegBank(NewReg, MRI->getRegClassOrRegBank(Reg));
LocalizedMI->getOperand(0).setReg(NewReg);
NewVRegIt =
MBBWithLocalDef.insert(std::make_pair(MBBAndReg, NewReg)).first;
LLVM_DEBUG(dbgs() << "Inserted: " << *LocalizedMI);
}
LLVM_DEBUG(dbgs() << "Update use with: " << printReg(NewVRegIt->second)
<< '\n');
// Update the user reg.
MOUse.setReg(NewVRegIt->second);
}
}
}
return Changed;
bool Changed = localizeInterBlock(MF, LocalizedInstrs);
return Changed |= localizeIntraBlock(LocalizedInstrs);
}
4 changes: 4 additions & 0 deletions llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,10 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
return false;
}

/// Size cost of rematerializing a global address on this target. The
/// Localizer's heuristic notes that arm64 global addresses take 2
/// instructions to materialize, hence 2 rather than the default.
unsigned getGISelRematGlobalCost() const {
return 2;
}

bool useReductionIntrinsic(unsigned Opcode, Type *Ty,
TTI::ReductionFlags Flags) const;

Expand Down
Loading

0 comments on commit 1468822

Please sign in to comment.