[GlobalISel][Localizer] Rewrite localizer to run in 2 phases, inter & intra block.

Inter-block localization is the same as what currently happens, except now it
only runs on the entry block because that's where the problematic constants with
long live ranges come from.

The second phase is a new intra-block localization phase which attempts to
re-sink the already localized instructions further, to right before one of
their multiple uses.

One additional change is to also localize G_GLOBAL_VALUE as they're constants
too. However, on some targets like arm64 it takes multiple instructions to
materialize the value, so some additional heuristics with a TTI hook have been
introduced to attempt to prevent code size regressions when localizing these.

Overall, these changes improve CTMark code size on arm64 by 1.2%.

Full code size results:

Program                                         baseline       new       diff
------------------------------------------------------------------------------
 test-suite...-typeset/consumer-typeset.test    1249984      1217216     -2.6%
 test-suite...:: CTMark/ClamAV/clamscan.test    1264928      1232152     -2.6%
 test-suite :: CTMark/SPASS/SPASS.test          1394092      1361316     -2.4%
 test-suite...Mark/mafft/pairlocalalign.test    731320       714928      -2.2%
 test-suite :: CTMark/lencod/lencod.test        1340592      1324200     -1.2%
 test-suite :: CTMark/kimwitu++/kc.test         3853512      3820420     -0.9%
 test-suite :: CTMark/Bullet/bullet.test        3406036      3389652     -0.5%
 test-suite...ark/tramp3d-v4/tramp3d-v4.test    8017000      8016992     -0.0%
 test-suite...TMark/7zip/7zip-benchmark.test    2856588      2856588      0.0%
 test-suite...:: CTMark/sqlite3/sqlite3.test    765704       765704       0.0%
 Geomean difference                                                      -1.2%

Differential Revision: https://reviews.llvm.org/D63303

llvm-svn: 363632
This commit is contained in:
Amara Emerson 2019-06-17 23:20:29 +00:00
parent c1dc4dff1a
commit 2f870f9e0c
8 changed files with 341 additions and 62 deletions

View File

@ -1053,6 +1053,11 @@ public:
/// \returns True if the target wants to expand the given reduction intrinsic
/// into a shuffle sequence.
bool shouldExpandReduction(const IntrinsicInst *II) const;
/// \returns the size cost of rematerializing a GlobalValue address relative
/// to a stack reload.
unsigned getGISelRematGlobalCost() const;
/// @}
private:
@ -1269,6 +1274,7 @@ public:
virtual bool useReductionIntrinsic(unsigned Opcode, Type *Ty,
ReductionFlags) const = 0;
virtual bool shouldExpandReduction(const IntrinsicInst *II) const = 0;
virtual unsigned getGISelRematGlobalCost() const = 0;
virtual int getInstructionLatency(const Instruction *I) = 0;
};
@ -1701,6 +1707,11 @@ public:
bool shouldExpandReduction(const IntrinsicInst *II) const override {
return Impl.shouldExpandReduction(II);
}
/// Forwards the GlobalISel global-value rematerialization cost query to the
/// wrapped implementation object \c Impl and returns its answer unchanged.
unsigned getGISelRematGlobalCost() const override {
return Impl.getGISelRematGlobalCost();
}
int getInstructionLatency(const Instruction *I) override {
return Impl.getInstructionLatency(I);
}

View File

@ -572,6 +572,10 @@ public:
return true;
}
/// \returns the size cost of rematerializing a GlobalValue address relative
/// to a stack reload. This implementation always answers 1.
/// NOTE(review): this looks like the target-independent default that targets
/// shadow with their own value -- confirm against the enclosing class.
unsigned getGISelRematGlobalCost() const {
return 1;
}
protected:
// Obtain the minimum required size to hold the value (without the sign)
// In case of a vector it returns the min required size for one element.

View File

@ -27,6 +27,7 @@
namespace llvm {
// Forward declarations.
class MachineRegisterInfo;
class TargetTransformInfo;
/// This pass implements the localization mechanism described at the
/// top of this file. One specificity of the implementation is that
@ -43,9 +44,11 @@ private:
/// MRI contains all the register class/bank information that this
/// pass uses and updates.
MachineRegisterInfo *MRI;
/// TTI used for getting remat costs for instructions.
TargetTransformInfo *TTI;
/// Check whether or not \p MI needs to be moved close to its uses.
static bool shouldLocalize(const MachineInstr &MI);
bool shouldLocalize(const MachineInstr &MI);
/// Check if \p MOUse is used in the same basic block as \p Def.
/// If the use is in the same block, we say it is local.
@ -57,6 +60,13 @@ private:
/// Initialize the field members using \p MF.
void init(MachineFunction &MF);
/// Do inter-block localization from the entry block.
bool localizeInterBlock(MachineFunction &MF,
SmallPtrSetImpl<MachineInstr *> &LocalizedInstrs);
/// Do intra-block localization of already localized instructions.
bool localizeIntraBlock(SmallPtrSetImpl<MachineInstr *> &LocalizedInstrs);
public:
Localizer();

View File

@ -724,6 +724,10 @@ bool TargetTransformInfo::shouldExpandReduction(const IntrinsicInst *II) const {
return TTIImpl->shouldExpandReduction(II);
}
/// Public entry point for the GlobalISel global-value rematerialization cost:
/// delegates to the implementation held in \c TTIImpl.
unsigned TargetTransformInfo::getGISelRematGlobalCost() const {
return TTIImpl->getGISelRematGlobalCost();
}
int TargetTransformInfo::getInstructionLatency(const Instruction *I) const {
return TTIImpl->getInstructionLatency(I);
}

View File

@ -10,6 +10,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/CodeGen/GlobalISel/Localizer.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
@ -20,17 +21,55 @@
using namespace llvm;
char Localizer::ID = 0;
INITIALIZE_PASS(Localizer, DEBUG_TYPE,
"Move/duplicate certain instructions close to their use", false,
false)
INITIALIZE_PASS_BEGIN(Localizer, DEBUG_TYPE,
"Move/duplicate certain instructions close to their use",
false, false)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(Localizer, DEBUG_TYPE,
"Move/duplicate certain instructions close to their use",
false, false)
Localizer::Localizer() : MachineFunctionPass(ID) {
initializeLocalizerPass(*PassRegistry::getPassRegistry());
}
void Localizer::init(MachineFunction &MF) { MRI = &MF.getRegInfo(); }
/// Cache the per-function state the pass works with: the
/// TargetTransformInfo for the IR function behind \p MF and the machine
/// function's register info.
void Localizer::init(MachineFunction &MF) {
  auto &TTIWrapper = getAnalysis<TargetTransformInfoWrapperPass>();
  TTI = &TTIWrapper.getTTI(MF.getFunction());
  MRI = &MF.getRegInfo();
}
bool Localizer::shouldLocalize(const MachineInstr &MI) {
// Maps a rematerialization cost to the largest number of users for which
// localizing is still worthwhile, assuming a spill and a reload each cost one
// instruction. E.g. on arm64 a global address takes 2 instructions to
// materialize, so with 2 users we break even on code size versus potentially
// spilling; with more than 2 users there is a net code size increase. Register
// pressure is not taken into account.
auto maxUses = [](unsigned RematCost) {
  switch (RematCost) {
  case 0:
    llvm_unreachable("Unexpected remat cost");
  case 1:
    // A cost of 1 means remats are basically free.
    return UINT_MAX;
  case 2:
    return 2U;
  default:
    // Remat is too expensive, only sink if there's one user.
    return 1U;
  }
};
// Returns true when Reg has no more than MaxUses non-debug using
// instructions. The walk stops as soon as the limit is reached, so we never
// traverse the whole use list just to answer a bounded question.
auto isUsesAtMost = [&](unsigned Reg, unsigned MaxUses) {
  auto UseIt = MRI->use_instr_nodbg_begin(Reg);
  auto UseEnd = MRI->use_instr_nodbg_end();
  for (unsigned Seen = 0; Seen != MaxUses && UseIt != UseEnd; ++Seen)
    ++UseIt;
  // Anything left over means there are more than MaxUses users.
  return UseIt == UseEnd;
};
switch (MI.getOpcode()) {
default:
return false;
@ -40,10 +79,20 @@ bool Localizer::shouldLocalize(const MachineInstr &MI) {
case TargetOpcode::G_FCONSTANT:
case TargetOpcode::G_FRAME_INDEX:
return true;
case TargetOpcode::G_GLOBAL_VALUE: {
unsigned RematCost = TTI->getGISelRematGlobalCost();
unsigned Reg = MI.getOperand(0).getReg();
unsigned MaxUses = maxUses(RematCost);
if (MaxUses == UINT_MAX)
return true; // Remats are "free" so always localize.
bool B = isUsesAtMost(Reg, MaxUses);
return B;
}
}
}
/// Declare the analyses this pass depends on by recording them in \p AU.
void Localizer::getAnalysisUsage(AnalysisUsage &AU) const {
// init() fetches TargetTransformInfoWrapperPass through getAnalysis<>, so it
// has to be listed as required here.
AU.addRequired<TargetTransformInfoWrapperPass>();
getSelectionDAGFallbackAnalysisUsage(AU);
// Let the base class add its own requirements.
MachineFunctionPass::getAnalysisUsage(AU);
}
@ -57,23 +106,16 @@ bool Localizer::isLocalUse(MachineOperand &MOUse, const MachineInstr &Def,
return InsertMBB == Def.getParent();
}
bool Localizer::runOnMachineFunction(MachineFunction &MF) {
// If the ISel pipeline failed, do not bother running that pass.
if (MF.getProperties().hasProperty(
MachineFunctionProperties::Property::FailedISel))
return false;
LLVM_DEBUG(dbgs() << "Localize instructions for: " << MF.getName() << '\n');
init(MF);
bool Localizer::localizeInterBlock(
MachineFunction &MF, SmallPtrSetImpl<MachineInstr *> &LocalizedInstrs) {
bool Changed = false;
// Keep track of the instructions we localized.
// We won't need to process them if we see them later in the CFG.
SmallPtrSet<MachineInstr *, 16> LocalizedInstrs;
DenseMap<std::pair<MachineBasicBlock *, unsigned>, unsigned> MBBWithLocalDef;
// TODO: Do bottom up traversal.
for (MachineBasicBlock &MBB : MF) {
// Since the IRTranslator only emits constants into the entry block, and the
// rest of the GISel pipeline generally emits constants close to their users,
// we only localize instructions in the entry block here. This might change if
// we start doing CSE across blocks.
auto &MBB = MF.front();
for (MachineInstr &MI : MBB) {
if (LocalizedInstrs.count(&MI) || !shouldLocalize(MI))
continue;
@ -102,15 +144,15 @@ bool Localizer::runOnMachineFunction(MachineFunction &MF) {
// Create the localized instruction.
MachineInstr *LocalizedMI = MF.CloneMachineInstr(&MI);
LocalizedInstrs.insert(LocalizedMI);
// Don't try to be smart for the insertion point.
// There is no guarantee that the first seen use is the first
// use in the block.
MachineInstr &UseMI = *MOUse.getParent();
if (MRI->hasOneUse(Reg) && !UseMI.isPHI())
InsertMBB->insert(InsertMBB->SkipPHIsAndLabels(UseMI), LocalizedMI);
else
InsertMBB->insert(InsertMBB->SkipPHIsAndLabels(InsertMBB->begin()),
LocalizedMI);
// Set a new register for the definition.
unsigned NewReg =
MRI->createGenericVirtualRegister(MRI->getType(Reg));
unsigned NewReg = MRI->createGenericVirtualRegister(MRI->getType(Reg));
MRI->setRegClassOrRegBank(NewReg, MRI->getRegClassOrRegBank(Reg));
LocalizedMI->getOperand(0).setReg(NewReg);
NewVRegIt =
@ -123,6 +165,61 @@ bool Localizer::runOnMachineFunction(MachineFunction &MF) {
MOUse.setReg(NewVRegIt->second);
}
}
return Changed;
}
/// Intra-block phase: for every instruction in \p LocalizedInstrs that does
/// not have exactly one use, move it down within its basic block so that it
/// sits immediately before the first of its (non-debug) users that follows it.
///
/// \param LocalizedInstrs instructions to consider for sinking.
/// \returns true if at least one instruction was moved.
bool Localizer::localizeIntraBlock(
    SmallPtrSetImpl<MachineInstr *> &LocalizedInstrs) {
  bool Changed = false;

  // For each already-localized instruction which has multiple users, scan the
  // block top down from its current position until we hit one of them.

  // FIXME: Consider doing inst duplication if live ranges are very long due to
  // many users, but this case may be better served by regalloc improvements.

  for (MachineInstr *MI : LocalizedInstrs) {
    unsigned Reg = MI->getOperand(0).getReg();
    MachineBasicBlock &MBB = *MI->getParent();

    // If the instruction has a single use, we would have already moved it
    // right before its user in localizeInterBlock().
    if (MRI->hasOneUse(Reg))
      continue;

    // All of the user MIs of this reg.
    SmallPtrSet<MachineInstr *, 32> Users;
    for (MachineInstr &UseMI : MRI->use_nodbg_instructions(Reg))
      Users.insert(&UseMI);

    // Walk forward from the instruction following MI to the first user.
    MachineBasicBlock::iterator II(MI);
    ++II;
    while (II != MBB.end() && !Users.count(&*II))
      ++II;

    // Validate the iterator *before* the debug output dereferences it;
    // otherwise a missing user would dereference MBB.end() in the debug print
    // before the assertion gets a chance to fire.
    assert(II != MBB.end() && "Didn't find the user in the MBB");
    LLVM_DEBUG(dbgs() << "Intra-block: moving " << *MI << " before " << *II
                      << "\n");

    MI->removeFromParent();
    MBB.insert(II, MI);
    Changed = true;
  }
  return Changed;
}
/// Pass entry point: runs the inter-block phase followed by the intra-block
/// phase over \p MF.
///
/// \returns true if either phase changed the function; false when nothing
/// changed or when instruction selection had already failed.
bool Localizer::runOnMachineFunction(MachineFunction &MF) {
  // Nothing to do when the ISel pipeline has already given up on MF.
  const bool ISelFailed = MF.getProperties().hasProperty(
      MachineFunctionProperties::Property::FailedISel);
  if (ISelFailed)
    return false;

  LLVM_DEBUG(dbgs() << "Localize instructions for: " << MF.getName() << '\n');

  init(MF);

  // The inter-block phase records what it localized here; the intra-block
  // phase then revisits those instructions to shorten live ranges further.
  SmallPtrSet<MachineInstr *, 32> LocalizedInstrs;
  const bool InterChanged = localizeInterBlock(MF, LocalizedInstrs);
  const bool IntraChanged = localizeIntraBlock(LocalizedInstrs);
  return InterChanged || IntraChanged;
}

View File

@ -165,6 +165,10 @@ public:
return false;
}
/// \returns the size cost of rematerializing a GlobalValue address; this
/// implementation answers 2.
/// NOTE(review): presumably the arm64 target's value, matching the Localizer
/// comment that global addresses take 2 instructions to materialize there --
/// confirm against the enclosing class.
unsigned getGISelRematGlobalCost() const {
return 2;
}
bool useReductionIntrinsic(unsigned Opcode, Type *Ty,
TTI::ReductionFlags Flags) const;

View File

@ -0,0 +1,62 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
; RUN: llc -o - -verify-machineinstrs -O0 -global-isel -stop-after=localizer %s | FileCheck %s
target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
target triple = "arm64-apple-ios5.0.0"
@var1 = common global i32 0, align 4
@var2 = common global i32 0, align 4
@var3 = common global i32 0, align 4
@var4 = common global i32 0, align 4
; This is an ll test instead of MIR because -run-pass doesn't seem to support
; initializing the target TTI which we need for this test.
; Some of the instructions in entry block are dead after this pass so don't
; strictly need to be checked for.
define i32 @foo() {
; CHECK-LABEL: name: foo
; CHECK: bb.1.entry:
; CHECK: successors: %bb.2(0x40000000), %bb.3(0x40000000)
; CHECK: [[GV:%[0-9]+]]:gpr(p0) = G_GLOBAL_VALUE @var1
; CHECK: [[C:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 1
; CHECK: [[C1:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 2
; CHECK: [[GV1:%[0-9]+]]:gpr(p0) = G_GLOBAL_VALUE @var2
; CHECK: [[C2:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 3
; CHECK: [[GV2:%[0-9]+]]:gpr(p0) = G_GLOBAL_VALUE @var3
; CHECK: [[C3:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 0
; CHECK: [[LOAD:%[0-9]+]]:gpr(s32) = G_LOAD [[GV]](p0) :: (load 4 from @var1)
; CHECK: [[ICMP:%[0-9]+]]:gpr(s32) = G_ICMP intpred(eq), [[LOAD]](s32), [[C]]
; CHECK: [[TRUNC:%[0-9]+]]:gpr(s1) = G_TRUNC [[ICMP]](s32)
; CHECK: G_BRCOND [[TRUNC]](s1), %bb.2
; CHECK: G_BR %bb.3
; CHECK: bb.2.if.then:
; CHECK: successors: %bb.3(0x80000000)
; CHECK: [[GV3:%[0-9]+]]:gpr(p0) = G_GLOBAL_VALUE @var2
; CHECK: [[C4:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 2
; CHECK: G_STORE [[C4]](s32), [[GV3]](p0) :: (store 4 into @var2)
; CHECK: [[C5:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 3
; CHECK: G_STORE [[C5]](s32), [[GV]](p0) :: (store 4 into @var1)
; CHECK: [[GV4:%[0-9]+]]:gpr(p0) = G_GLOBAL_VALUE @var3
; CHECK: G_STORE [[C4]](s32), [[GV4]](p0) :: (store 4 into @var3)
; CHECK: G_STORE [[C5]](s32), [[GV]](p0) :: (store 4 into @var1)
; CHECK: bb.3.if.end:
; CHECK: [[C6:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 0
; CHECK: $w0 = COPY [[C6]](s32)
; CHECK: RET_ReallyLR implicit $w0
entry:
%0 = load i32, i32* @var1, align 4
%cmp = icmp eq i32 %0, 1
br i1 %cmp, label %if.then, label %if.end
if.then:
store i32 2, i32* @var2, align 4
store i32 3, i32* @var1, align 4
store i32 2, i32* @var3, align 4
store i32 3, i32* @var1, align 4
br label %if.end
if.end:
ret i32 0
}

View File

@ -15,6 +15,29 @@
define void @float_non_local_phi_use_followed_by_use_fi() { ret void }
define void @non_local_phi() { ret void }
define void @non_local_label() { ret void }
@var1 = common global i32 0, align 4
@var2 = common global i32 0, align 4
@var3 = common global i32 0, align 4
@var4 = common global i32 0, align 4
define i32 @intrablock_with_globalvalue() {
entry:
%0 = load i32, i32* @var1, align 4
%cmp = icmp eq i32 %0, 1
br i1 %cmp, label %if.then, label %if.end
if.then:
store i32 2, i32* @var2, align 4
store i32 3, i32* @var1, align 4
store i32 2, i32* @var3, align 4
store i32 3, i32* @var1, align 4
br label %if.end
if.end:
ret i32 0
}
...
---
@ -301,3 +324,67 @@ body: |
%2:fpr(s32) = G_FADD %0, %1
G_BR %bb.1
...
---
name: intrablock_with_globalvalue
legalized: true
regBankSelected: true
tracksRegLiveness: true
body: |
; CHECK-LABEL: name: intrablock_with_globalvalue
; CHECK: bb.0.entry:
; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000)
; CHECK: [[GV:%[0-9]+]]:gpr(p0) = G_GLOBAL_VALUE @var1
; CHECK: [[C:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 1
; CHECK: [[C1:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 2
; CHECK: [[GV1:%[0-9]+]]:gpr(p0) = G_GLOBAL_VALUE @var2
; CHECK: [[C2:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 3
; CHECK: [[GV2:%[0-9]+]]:gpr(p0) = G_GLOBAL_VALUE @var3
; CHECK: [[C3:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 0
; CHECK: [[LOAD:%[0-9]+]]:gpr(s32) = G_LOAD [[GV]](p0) :: (load 4 from @var1)
; CHECK: [[ICMP:%[0-9]+]]:gpr(s32) = G_ICMP intpred(eq), [[LOAD]](s32), [[C]]
; CHECK: [[TRUNC:%[0-9]+]]:gpr(s1) = G_TRUNC [[ICMP]](s32)
; CHECK: G_BRCOND [[TRUNC]](s1), %bb.1
; CHECK: G_BR %bb.2
; CHECK: bb.1.if.then:
; CHECK: successors: %bb.2(0x80000000)
; CHECK: [[GV3:%[0-9]+]]:gpr(p0) = G_GLOBAL_VALUE @var2
; CHECK: [[C4:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 2
; CHECK: G_STORE [[C4]](s32), [[GV3]](p0) :: (store 4 into @var2)
; CHECK: [[GV4:%[0-9]+]]:gpr(p0) = G_GLOBAL_VALUE @var1
; CHECK: [[C5:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 3
; CHECK: G_STORE [[C5]](s32), [[GV4]](p0) :: (store 4 into @var1)
; CHECK: [[GV5:%[0-9]+]]:gpr(p0) = G_GLOBAL_VALUE @var3
; CHECK: G_STORE [[C4]](s32), [[GV5]](p0) :: (store 4 into @var3)
; CHECK: G_STORE [[C5]](s32), [[GV4]](p0) :: (store 4 into @var1)
; CHECK: bb.2.if.end:
; CHECK: [[C6:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 0
; CHECK: $w0 = COPY [[C6]](s32)
; CHECK: RET_ReallyLR implicit $w0
; Some of these instructions are dead. We're checking that the other instructions are
; sunk immediately before their first user in the if.then block or as close as possible.
bb.1.entry:
%1:gpr(p0) = G_GLOBAL_VALUE @var1
%2:gpr(s32) = G_CONSTANT i32 1
%4:gpr(s32) = G_CONSTANT i32 2
%5:gpr(p0) = G_GLOBAL_VALUE @var2
%6:gpr(s32) = G_CONSTANT i32 3
%7:gpr(p0) = G_GLOBAL_VALUE @var3
%8:gpr(s32) = G_CONSTANT i32 0
%0:gpr(s32) = G_LOAD %1(p0) :: (load 4 from @var1)
%9:gpr(s32) = G_ICMP intpred(eq), %0(s32), %2
%3:gpr(s1) = G_TRUNC %9(s32)
G_BRCOND %3(s1), %bb.2
G_BR %bb.3
bb.2.if.then:
G_STORE %4(s32), %5(p0) :: (store 4 into @var2)
G_STORE %6(s32), %1(p0) :: (store 4 into @var1)
G_STORE %4(s32), %7(p0) :: (store 4 into @var3)
G_STORE %6(s32), %1(p0) :: (store 4 into @var1)
bb.3.if.end:
$w0 = COPY %8(s32)
RET_ReallyLR implicit $w0
...