[CodeGenPrepare] Create more extloads and fewer ands

Summary:
Add and instructions immediately after loads that only have their low
bits used, assuming that the (and (load x) c) will be matched as an
extload and the ands/truncs fed by the extload will be removed by isel.
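
For illustration (a sketch, not part of the patch; the function and
names below are made up): C++ like the following, where two loads reach
a single mask through a phi, can now be selected as two zero-extending
loads (e.g. ldrh on AArch64) with the trailing and removed:

  // Two loads joined by a phi; only the low 16 bits of the merged
  // value are demanded, so each load can be matched as a zextload.
  unsigned low16(const unsigned *p, const unsigned *q, bool c) {
    unsigned x;
    if (c)
      x = *p; // load in one block
    else
      x = *q; // load in another block
    return x & 0xffff; // phi of the two loads, masked to 16 bits
  }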

Reviewers: mcrosier, qcolombet, ab

Subscribers: llvm-commits

Differential Revision: http://reviews.llvm.org/D14584

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@253722 91177308-0d34-0410-b5e6-96231b3b80d8
Geoff Berry, 2015-11-20 22:34:39 +00:00
parent d43d18c48b
commit 2d2aadfa70
3 changed files with 315 additions and 1 deletion

lib/CodeGen/CodeGenPrepare.cpp

@@ -64,6 +64,9 @@ STATISTIC(NumMemoryInsts, "Number of memory instructions whose address "
                          "computations were sunk");
STATISTIC(NumExtsMoved, "Number of [s|z]ext instructions combined with loads");
STATISTIC(NumExtUses, "Number of uses of [s|z]ext instructions optimized");
STATISTIC(NumAndsAdded,
          "Number of and mask instructions added to form ext loads");
STATISTIC(NumAndUses, "Number of uses of and mask instructions optimized");
STATISTIC(NumRetsDup, "Number of return instructions duplicated");
STATISTIC(NumDbgValueMoved, "Number of debug value instructions moved");
STATISTIC(NumSelectsExpanded, "Number of selects turned into branches");
@@ -173,6 +176,7 @@ class TypePromotionTransaction;
    bool optimizeCallInst(CallInst *CI, bool& ModifiedDT);
    bool moveExtToFormExtLoad(Instruction *&I);
    bool optimizeExtUses(Instruction *I);
    bool optimizeLoadExt(LoadInst *I);
    bool optimizeSelectInst(SelectInst *SI);
    bool optimizeShuffleVectorInst(ShuffleVectorInst *SI);
    bool optimizeSwitchInst(SwitchInst *CI);
@@ -4256,6 +4260,189 @@ bool CodeGenPrepare::optimizeExtUses(Instruction *I) {
  return MadeChange;
}

// Find loads whose uses only use some of the loaded value's bits. Add an "and"
// just after the load if the target can fold this into one extload instruction,
// with the hope of eliminating some of the other later "and" instructions using
// the loaded value. "and"s that are made trivially redundant by the insertion
// of the new "and" are removed by this function, while others (e.g. those whose
// path from the load goes through a phi) are left for isel to potentially
// remove.
//
// For example:
//
// b0:
//   x = load i32
//   ...
// b1:
//   y = and x, 0xff
//   z = use y
//
// becomes:
//
// b0:
//   x = load i32
//   x' = and x, 0xff
//   ...
// b1:
//   z = use x'
//
// whereas:
//
// b0:
//   x1 = load i32
//   ...
// b1:
//   x2 = load i32
//   ...
// b2:
//   x = phi x1, x2
//   y = and x, 0xff
//
// becomes (after a call to optimizeLoadExt for each load):
//
// b0:
//   x1 = load i32
//   x1' = and x1, 0xff
//   ...
// b1:
//   x2 = load i32
//   x2' = and x2, 0xff
//   ...
// b2:
//   x = phi x1', x2'
//   y = and x, 0xff
//
bool CodeGenPrepare::optimizeLoadExt(LoadInst *Load) {
  if (!Load->isSimple() ||
      !(Load->getType()->isIntegerTy() || Load->getType()->isPointerTy()))
    return false;

  // Skip loads we've already transformed or have no reason to transform.
  if (Load->hasOneUse()) {
    User *LoadUser = *Load->user_begin();
    if (cast<Instruction>(LoadUser)->getParent() == Load->getParent() &&
        !dyn_cast<PHINode>(LoadUser))
      return false;
  }

  // Look at all uses of Load, looking through phis, to determine how many bits
  // of the loaded value are needed.
  SmallVector<Instruction *, 8> WorkList;
  SmallPtrSet<Instruction *, 16> Visited;
  SmallVector<Instruction *, 8> AndsToMaybeRemove;
  for (auto *U : Load->users())
    WorkList.push_back(cast<Instruction>(U));

  EVT LoadResultVT = TLI->getValueType(*DL, Load->getType());
  unsigned BitWidth = LoadResultVT.getSizeInBits();
  APInt DemandBits(BitWidth, 0);
  APInt WidestAndBits(BitWidth, 0);

  while (!WorkList.empty()) {
    Instruction *I = WorkList.back();
    WorkList.pop_back();

    // Break use-def graph loops.
    if (!Visited.insert(I).second)
      continue;

    // For a PHI node, push all of its users.
    if (auto *Phi = dyn_cast<PHINode>(I)) {
      for (auto *U : Phi->users())
        WorkList.push_back(cast<Instruction>(U));
      continue;
    }

    switch (I->getOpcode()) {
    case llvm::Instruction::And: {
      auto *AndC = dyn_cast<ConstantInt>(I->getOperand(1));
      if (!AndC)
        return false;
      APInt AndBits = AndC->getValue();
      DemandBits |= AndBits;
      // Keep track of the widest and mask we see.
      if (AndBits.ugt(WidestAndBits))
        WidestAndBits = AndBits;
      if (AndBits == WidestAndBits && I->getOperand(0) == Load)
        AndsToMaybeRemove.push_back(I);
      break;
    }

    case llvm::Instruction::Shl: {
      auto *ShlC = dyn_cast<ConstantInt>(I->getOperand(1));
      if (!ShlC)
        return false;
      uint64_t ShiftAmt = ShlC->getLimitedValue(BitWidth - 1);
      auto ShlDemandBits = APInt::getAllOnesValue(BitWidth).lshr(ShiftAmt);
      DemandBits |= ShlDemandBits;
      break;
    }

    case llvm::Instruction::Trunc: {
      EVT TruncVT = TLI->getValueType(*DL, I->getType());
      unsigned TruncBitWidth = TruncVT.getSizeInBits();
      auto TruncBits = APInt::getAllOnesValue(TruncBitWidth).zext(BitWidth);
      DemandBits |= TruncBits;
      break;
    }

    default:
      return false;
    }
  }

  uint32_t ActiveBits = DemandBits.getActiveBits();
  // Avoid hoisting (and (load x) 1) since it is unlikely to be folded by the
  // target even if isLoadExtLegal says an i1 EXTLOAD is valid. For example,
  // for the AArch64 target isLoadExtLegal(ZEXTLOAD, i32, i1) returns true, but
  // (and (load x) 1) is not matched as a single instruction, rather as a LDR
  // followed by an AND.
  // TODO: Look into removing this restriction by fixing backends to either
  // return false for isLoadExtLegal for i1 or have them select this pattern to
  // a single instruction.
  //
  // Also avoid hoisting if we didn't see any ands with the exact DemandBits
  // mask, since these are the only ands that will be removed by isel.
  if (ActiveBits <= 1 || !APIntOps::isMask(ActiveBits, DemandBits) ||
      WidestAndBits != DemandBits)
    return false;

  LLVMContext &Ctx = Load->getType()->getContext();
  Type *TruncTy = Type::getIntNTy(Ctx, ActiveBits);
  EVT TruncVT = TLI->getValueType(*DL, TruncTy);

  // Reject cases that won't be matched as extloads.
  if (!LoadResultVT.bitsGT(TruncVT) || !TruncVT.isRound() ||
      !TLI->isLoadExtLegal(ISD::ZEXTLOAD, LoadResultVT, TruncVT))
    return false;

  IRBuilder<> Builder(Load->getNextNode());
  auto *NewAnd = dyn_cast<Instruction>(
      Builder.CreateAnd(Load, ConstantInt::get(Ctx, DemandBits)));

  // Replace all uses of load with new and (except for the use of load in the
  // new and itself).
  Load->replaceAllUsesWith(NewAnd);
  NewAnd->setOperand(0, Load);

  // Remove any and instructions that are now redundant.
  for (auto *And : AndsToMaybeRemove)
    // Check that the and mask is the same as the one we decided to put on the
    // new and.
    if (cast<ConstantInt>(And->getOperand(1))->getValue() == DemandBits) {
      And->replaceAllUsesWith(NewAnd);
      if (&*CurInstIterator == And)
        CurInstIterator = std::next(And->getIterator());
      And->eraseFromParent();
      ++NumAndUses;
    }

  ++NumAndsAdded;
  return true;
}

/// Check if V (an operand of a select instruction) is an expensive instruction
/// that is only used once.
static bool sinkSelectOperand(const TargetTransformInfo *TTI, Value *V) {
@@ -4957,8 +5144,10 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, bool& ModifiedDT) {
   if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
     stripInvariantGroupMetadata(*LI);
     if (TLI) {
+      bool Modified = optimizeLoadExt(LI);
       unsigned AS = LI->getPointerAddressSpace();
-      return optimizeMemoryInst(I, I->getOperand(0), LI->getType(), AS);
+      Modified |= optimizeMemoryInst(I, I->getOperand(0), LI->getType(), AS);
+      return Modified;
     }
     return false;
   }
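
As a standalone illustration of the DemandBits bookkeeping above (a
minimal sketch in plain C++, with 32-bit masks held in uint64_t instead
of APInt; all names here are made up), each handled opcode contributes
the bits it reads from the loaded value, and the transform fires only
when the union of those bits is a low-bit mask wider than one bit (the
real code additionally requires that some and uses exactly that mask):

  #include <cassert>
  #include <cstdint>

  // What each user of a 32-bit loaded value demands of it:
  // "and x, mask" reads exactly the bits in the mask.
  uint64_t demandedFromAnd(uint64_t AndMask) { return AndMask; }
  // "shl x, n" reads only the low (32 - n) bits of x.
  uint64_t demandedFromShl(unsigned ShiftAmt) { return ~0u >> ShiftAmt; }
  // "trunc x to iN" reads the low N bits.
  uint64_t demandedFromTrunc(unsigned TruncBits) {
    return (1ull << TruncBits) - 1;
  }
  // Mirrors the "ActiveBits <= 1 || !isMask(...)" rejection: accept only
  // masks of the form 2^n - 1 with n > 1 (e.g. 0xff, 0xffff).
  bool isLowBitMask(uint64_t Bits) {
    return Bits > 1 && ((Bits + 1) & Bits) == 0;
  }

  int main() {
    // The three kinds of users exercised by test_free_zext2 below:
    uint64_t Demand = 0;
    Demand |= demandedFromAnd(0xffff); // and i32 %x, 65535
    Demand |= demandedFromTrunc(16);   // trunc i32 %x to i16
    Demand |= demandedFromShl(16);     // shl i32 %x, 16
    assert(isLowBitMask(Demand) && Demand == 0xffff);
    return 0;
  }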

test/CodeGen/AArch64/free-zext.ll

@@ -26,3 +26,46 @@ define void @test_free_zext2(i32* %ptr, i32* %dst1, i64* %dst2) {
  store i64 %load64, i64* %dst2, align 8
  ret void
}

; Test for CodeGenPrepare::optimizeLoadExt(): simple case: two loads
; feeding a phi that zext's each loaded value.
define i32 @test_free_zext3(i32* %ptr, i32* %ptr2, i32* %dst, i32 %c) {
; CHECK-LABEL: test_free_zext3:
bb1:
; CHECK: ldrh [[REG:w[0-9]+]]
; CHECK-NOT: and {{w[0-9]+}}, [[REG]], #0xffff
  %tmp1 = load i32, i32* %ptr, align 4
  %cmp = icmp ne i32 %c, 0
  br i1 %cmp, label %bb2, label %bb3
bb2:
; CHECK: ldrh [[REG2:w[0-9]+]]
; CHECK-NOT: and {{w[0-9]+}}, [[REG2]], #0xffff
  %tmp2 = load i32, i32* %ptr2, align 4
  br label %bb3
bb3:
  %tmp3 = phi i32 [ %tmp1, %bb1 ], [ %tmp2, %bb2 ]
; CHECK-NOT: and {{w[0-9]+}}, {{w[0-9]+}}, #0xffff
  %tmpand = and i32 %tmp3, 65535
  ret i32 %tmpand
}

; Test for CodeGenPrepare::optimizeLoadExt(): check case of zext-able
; load feeding a phi in the same block.
define void @test_free_zext4(i32* %ptr, i32* %ptr2, i32* %dst) {
; CHECK-LABEL: test_free_zext4:
; CHECK: ldrh [[REG:w[0-9]+]]
; TODO: fix isel to remove final and
; XCHECK-NOT: and {{w[0-9]+}}, {{w[0-9]+}}, #0xffff
; CHECK: ldrh [[REG:w[0-9]+]]
bb1:
  %load1 = load i32, i32* %ptr, align 4
  br label %loop
loop:
  %phi = phi i32 [ %load1, %bb1 ], [ %load2, %loop ]
  %and = and i32 %phi, 65535
  store i32 %and, i32* %dst, align 4
  %load2 = load i32, i32* %ptr2, align 4
  %cmp = icmp ne i32 %and, 0
  br i1 %cmp, label %loop, label %end
end:
  ret void
}

test/Transforms/CodeGenPrepare/AArch64/free-zext.ll

@@ -0,0 +1,82 @@
; RUN: opt -S -codegenprepare -mtriple=aarch64-linux %s | FileCheck %s

; Test for CodeGenPrepare::optimizeLoadExt(): simple case: two loads
; feeding a phi that zext's each loaded value.
define i32 @test_free_zext(i32* %ptr, i32* %ptr2, i32 %c) {
; CHECK-LABEL: @test_free_zext(
bb1:
; CHECK-LABEL: bb1:
; CHECK: %[[T1:.*]] = load
; CHECK: %[[A1:.*]] = and i32 %[[T1]], 65535
  %load1 = load i32, i32* %ptr, align 4
  %cmp = icmp ne i32 %c, 0
  br i1 %cmp, label %bb2, label %bb3
bb2:
; CHECK-LABEL: bb2:
; CHECK: %[[T2:.*]] = load
; CHECK: %[[A2:.*]] = and i32 %[[T2]], 65535
  %load2 = load i32, i32* %ptr2, align 4
  br label %bb3
bb3:
; CHECK-LABEL: bb3:
; CHECK: phi i32 [ %[[A1]], %bb1 ], [ %[[A2]], %bb2 ]
  %phi = phi i32 [ %load1, %bb1 ], [ %load2, %bb2 ]
  %and = and i32 %phi, 65535
  ret i32 %and
}

; Test for CodeGenPrepare::optimizeLoadExt(): exercise all opcode
; cases of active bit calculation.
define i32 @test_free_zext2(i32* %ptr, i16* %dst16, i32* %dst32, i32 %c) {
; CHECK-LABEL: @test_free_zext2(
bb1:
; CHECK-LABEL: bb1:
; CHECK: %[[T1:.*]] = load
; CHECK: %[[A1:.*]] = and i32 %[[T1]], 65535
  %load1 = load i32, i32* %ptr, align 4
  %cmp = icmp ne i32 %c, 0
  br i1 %cmp, label %bb2, label %bb4
bb2:
; CHECK-LABEL: bb2:
  %trunc = trunc i32 %load1 to i16
  store i16 %trunc, i16* %dst16, align 2
  br i1 %cmp, label %bb3, label %bb4
bb3:
; CHECK-LABEL: bb3:
  %shl = shl i32 %load1, 16
  store i32 %shl, i32* %dst32, align 4
  br label %bb4
bb4:
; CHECK-LABEL: bb4:
; CHECK-NOT: and
; CHECK: ret i32 %[[A1]]
  %and = and i32 %load1, 65535
  ret i32 %and
}

; Test for CodeGenPrepare::optimizeLoadExt(): check case of zext-able
; load feeding a phi in the same block.
define void @test_free_zext3(i32* %ptr, i32* %ptr2, i32* %dst, i64* %c) {
; CHECK-LABEL: @test_free_zext3(
bb1:
; CHECK-LABEL: bb1:
; CHECK: %[[T1:.*]] = load
; CHECK: %[[A1:.*]] = and i32 %[[T1]], 65535
  %load1 = load i32, i32* %ptr, align 4
  br label %loop
loop:
; CHECK-LABEL: loop:
; CHECK: phi i32 [ %[[A1]], %bb1 ], [ %[[A2]], %loop ]
  %phi = phi i32 [ %load1, %bb1 ], [ %load2, %loop ]
  %and = and i32 %phi, 65535
  store i32 %and, i32* %dst, align 4
  %idx = load volatile i64, i64* %c, align 4
  %addr = getelementptr inbounds i32, i32* %ptr2, i64 %idx
; CHECK: %[[T2:.*]] = load i32
; CHECK: %[[A2:.*]] = and i32 %[[T2]], 65535
  %load2 = load i32, i32* %addr, align 4
  %cmp = icmp ne i64 %idx, 0
  br i1 %cmp, label %loop, label %end
end:
  ret void
}