mirror of
https://github.com/RPCSX/llvm.git
synced 2024-11-23 19:59:57 +00:00
[CodeGenPrepare] Create more extloads and fewer ands
Summary: Add "and" instructions immediately after loads that only have their low bits used, assuming that the (and (load x) c) will be matched as an extload and the ands/truncs fed by the extload will be removed by isel.

Reviewers: mcrosier, qcolombet, ab

Subscribers: llvm-commits

Differential Revision: http://reviews.llvm.org/D14584

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@253722 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent
d43d18c48b
commit
2d2aadfa70
@ -64,6 +64,9 @@ STATISTIC(NumMemoryInsts, "Number of memory instructions whose address "
|
||||
"computations were sunk");
|
||||
// Pass-wide counters (printed with -stats); NumAndsAdded/NumAndUses are the
// counters introduced for the load-extension masking transformation.
STATISTIC(NumExtsMoved, "Number of [s|z]ext instructions combined with loads");
STATISTIC(NumExtUses, "Number of uses of [s|z]ext instructions optimized");
STATISTIC(NumAndsAdded,
          "Number of and mask instructions added to form ext loads");
STATISTIC(NumAndUses, "Number of uses of and mask instructions optimized");
STATISTIC(NumRetsDup, "Number of return instructions duplicated");
STATISTIC(NumDbgValueMoved, "Number of debug value instructions moved");
STATISTIC(NumSelectsExpanded, "Number of selects turned into branches");
|
||||
@ -173,6 +176,7 @@ class TypePromotionTransaction;
|
||||
bool optimizeCallInst(CallInst *CI, bool& ModifiedDT);
|
||||
bool moveExtToFormExtLoad(Instruction *&I);
|
||||
bool optimizeExtUses(Instruction *I);
|
||||
bool optimizeLoadExt(LoadInst *I);
|
||||
bool optimizeSelectInst(SelectInst *SI);
|
||||
bool optimizeShuffleVectorInst(ShuffleVectorInst *SI);
|
||||
bool optimizeSwitchInst(SwitchInst *CI);
|
||||
@ -4256,6 +4260,189 @@ bool CodeGenPrepare::optimizeExtUses(Instruction *I) {
|
||||
return MadeChange;
|
||||
}
|
||||
|
||||
// Find loads whose uses only use some of the loaded value's bits. Add an "and"
|
||||
// just after the load if the target can fold this into one extload instruction,
|
||||
// with the hope of eliminating some of the other later "and" instructions using
|
||||
// the loaded value. "and"s that are made trivially redundant by the insertion
|
||||
// of the new "and" are removed by this function, while others (e.g. those whose
|
||||
// path from the load goes through a phi) are left for isel to potentially
|
||||
// remove.
|
||||
//
|
||||
// For example:
|
||||
//
|
||||
// b0:
|
||||
// x = load i32
|
||||
// ...
|
||||
// b1:
|
||||
// y = and x, 0xff
|
||||
// z = use y
|
||||
//
|
||||
// becomes:
|
||||
//
|
||||
// b0:
|
||||
// x = load i32
|
||||
// x' = and x, 0xff
|
||||
// ...
|
||||
// b1:
|
||||
// z = use x'
|
||||
//
|
||||
// whereas:
|
||||
//
|
||||
// b0:
|
||||
// x1 = load i32
|
||||
// ...
|
||||
// b1:
|
||||
// x2 = load i32
|
||||
// ...
|
||||
// b2:
|
||||
// x = phi x1, x2
|
||||
// y = and x, 0xff
|
||||
//
|
||||
// becomes (after a call to optimizeLoadExt for each load):
|
||||
//
|
||||
// b0:
|
||||
// x1 = load i32
|
||||
// x1' = and x1, 0xff
|
||||
// ...
|
||||
// b1:
|
||||
// x2 = load i32
|
||||
// x2' = and x2, 0xff
|
||||
// ...
|
||||
// b2:
|
||||
// x = phi x1', x2'
|
||||
// y = and x, 0xff
|
||||
//
|
||||
|
||||
bool CodeGenPrepare::optimizeLoadExt(LoadInst *Load) {
|
||||
|
||||
if (!Load->isSimple() ||
|
||||
!(Load->getType()->isIntegerTy() || Load->getType()->isPointerTy()))
|
||||
return false;
|
||||
|
||||
// Skip loads we've already transformed or have no reason to transform.
|
||||
if (Load->hasOneUse()) {
|
||||
User *LoadUser = *Load->user_begin();
|
||||
if (cast<Instruction>(LoadUser)->getParent() == Load->getParent() &&
|
||||
!dyn_cast<PHINode>(LoadUser))
|
||||
return false;
|
||||
}
|
||||
|
||||
// Look at all uses of Load, looking through phis, to determine how many bits
|
||||
// of the loaded value are needed.
|
||||
SmallVector<Instruction *, 8> WorkList;
|
||||
SmallPtrSet<Instruction *, 16> Visited;
|
||||
SmallVector<Instruction *, 8> AndsToMaybeRemove;
|
||||
for (auto *U : Load->users())
|
||||
WorkList.push_back(cast<Instruction>(U));
|
||||
|
||||
EVT LoadResultVT = TLI->getValueType(*DL, Load->getType());
|
||||
unsigned BitWidth = LoadResultVT.getSizeInBits();
|
||||
APInt DemandBits(BitWidth, 0);
|
||||
APInt WidestAndBits(BitWidth, 0);
|
||||
|
||||
while (!WorkList.empty()) {
|
||||
Instruction *I = WorkList.back();
|
||||
WorkList.pop_back();
|
||||
|
||||
// Break use-def graph loops.
|
||||
if (!Visited.insert(I).second)
|
||||
continue;
|
||||
|
||||
// For a PHI node, push all of its users.
|
||||
if (auto *Phi = dyn_cast<PHINode>(I)) {
|
||||
for (auto *U : Phi->users())
|
||||
WorkList.push_back(cast<Instruction>(U));
|
||||
continue;
|
||||
}
|
||||
|
||||
switch (I->getOpcode()) {
|
||||
case llvm::Instruction::And: {
|
||||
auto *AndC = dyn_cast<ConstantInt>(I->getOperand(1));
|
||||
if (!AndC)
|
||||
return false;
|
||||
APInt AndBits = AndC->getValue();
|
||||
DemandBits |= AndBits;
|
||||
// Keep track of the widest and mask we see.
|
||||
if (AndBits.ugt(WidestAndBits))
|
||||
WidestAndBits = AndBits;
|
||||
if (AndBits == WidestAndBits && I->getOperand(0) == Load)
|
||||
AndsToMaybeRemove.push_back(I);
|
||||
break;
|
||||
}
|
||||
|
||||
case llvm::Instruction::Shl: {
|
||||
auto *ShlC = dyn_cast<ConstantInt>(I->getOperand(1));
|
||||
if (!ShlC)
|
||||
return false;
|
||||
uint64_t ShiftAmt = ShlC->getLimitedValue(BitWidth - 1);
|
||||
auto ShlDemandBits = APInt::getAllOnesValue(BitWidth).lshr(ShiftAmt);
|
||||
DemandBits |= ShlDemandBits;
|
||||
break;
|
||||
}
|
||||
|
||||
case llvm::Instruction::Trunc: {
|
||||
EVT TruncVT = TLI->getValueType(*DL, I->getType());
|
||||
unsigned TruncBitWidth = TruncVT.getSizeInBits();
|
||||
auto TruncBits = APInt::getAllOnesValue(TruncBitWidth).zext(BitWidth);
|
||||
DemandBits |= TruncBits;
|
||||
break;
|
||||
}
|
||||
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
uint32_t ActiveBits = DemandBits.getActiveBits();
|
||||
// Avoid hoisting (and (load x) 1) since it is unlikely to be folded by the
|
||||
// target even if isLoadExtLegal says an i1 EXTLOAD is valid. For example,
|
||||
// for the AArch64 target isLoadExtLegal(ZEXTLOAD, i32, i1) returns true, but
|
||||
// (and (load x) 1) is not matched as a single instruction, rather as a LDR
|
||||
// followed by an AND.
|
||||
// TODO: Look into removing this restriction by fixing backends to either
|
||||
// return false for isLoadExtLegal for i1 or have them select this pattern to
|
||||
// a single instruction.
|
||||
//
|
||||
// Also avoid hoisting if we didn't see any ands with the exact DemandBits
|
||||
// mask, since these are the only ands that will be removed by isel.
|
||||
if (ActiveBits <= 1 || !APIntOps::isMask(ActiveBits, DemandBits) ||
|
||||
WidestAndBits != DemandBits)
|
||||
return false;
|
||||
|
||||
LLVMContext &Ctx = Load->getType()->getContext();
|
||||
Type *TruncTy = Type::getIntNTy(Ctx, ActiveBits);
|
||||
EVT TruncVT = TLI->getValueType(*DL, TruncTy);
|
||||
|
||||
// Reject cases that won't be matched as extloads.
|
||||
if (!LoadResultVT.bitsGT(TruncVT) || !TruncVT.isRound() ||
|
||||
!TLI->isLoadExtLegal(ISD::ZEXTLOAD, LoadResultVT, TruncVT))
|
||||
return false;
|
||||
|
||||
IRBuilder<> Builder(Load->getNextNode());
|
||||
auto *NewAnd = dyn_cast<Instruction>(
|
||||
Builder.CreateAnd(Load, ConstantInt::get(Ctx, DemandBits)));
|
||||
|
||||
// Replace all uses of load with new and (except for the use of load in the
|
||||
// new and itself).
|
||||
Load->replaceAllUsesWith(NewAnd);
|
||||
NewAnd->setOperand(0, Load);
|
||||
|
||||
// Remove any and instructions that are now redundant.
|
||||
for (auto *And : AndsToMaybeRemove)
|
||||
// Check that the and mask is the same as the one we decided to put on the
|
||||
// new and.
|
||||
if (cast<ConstantInt>(And->getOperand(1))->getValue() == DemandBits) {
|
||||
And->replaceAllUsesWith(NewAnd);
|
||||
if (&*CurInstIterator == And)
|
||||
CurInstIterator = std::next(And->getIterator());
|
||||
And->eraseFromParent();
|
||||
++NumAndUses;
|
||||
}
|
||||
|
||||
++NumAndsAdded;
|
||||
return true;
|
||||
}
|
||||
|
||||
/// Check if V (an operand of a select instruction) is an expensive instruction
|
||||
/// that is only used once.
|
||||
static bool sinkSelectOperand(const TargetTransformInfo *TTI, Value *V) {
|
||||
@ -4957,8 +5144,10 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, bool& ModifiedDT) {
|
||||
// Handle loads: first try to hoist an "and" mask so isel can form an extload
// (optimizeLoadExt), then try to sink the address computation
// (optimizeMemoryInst). Report a change if either transform fired.
// NOTE: the scraped diff contained both the pre-commit early return and the
// post-commit accumulation; this renders the consistent post-commit state.
if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
  stripInvariantGroupMetadata(*LI);
  if (TLI) {
    bool Modified = optimizeLoadExt(LI);
    unsigned AS = LI->getPointerAddressSpace();
    Modified |= optimizeMemoryInst(I, I->getOperand(0), LI->getType(), AS);
    return Modified;
  }
  return false;
}
|
||||
|
@ -26,3 +26,46 @@ define void @test_free_zext2(i32* %ptr, i32* %dst1, i64* %dst2) {
|
||||
store i64 %load64, i64* %dst2, align 8
|
||||
ret void
|
||||
}
|
||||
|
||||
; Test for CodeGenPrepare::optimizeLoadExt(): simple case: two loads
; feeding a phi that zext's each loaded value.
define i32 @test_free_zext3(i32* %ptr, i32* %ptr2, i32* %dst, i32 %c) {
; CHECK-LABEL: test_free_zext3:
bb1:
; CHECK: ldrh [[REG:w[0-9]+]]
; CHECK-NOT: and {{w[0-9]+}}, [[REG]], #0xffff
  %tmp1 = load i32, i32* %ptr, align 4
  %cmp = icmp ne i32 %c, 0
  br i1 %cmp, label %bb2, label %bb3
bb2:
; CHECK: ldrh [[REG2:w[0-9]+]]
; CHECK-NOT: and {{w[0-9]+}}, [[REG2]], #0xffff
  %tmp2 = load i32, i32* %ptr2, align 4
  br label %bb3
bb3:
  %tmp3 = phi i32 [ %tmp1, %bb1 ], [ %tmp2, %bb2 ]
; CHECK-NOT: and {{w[0-9]+}}, {{w[0-9]+}}, #0xffff
  %tmpand = and i32 %tmp3, 65535
  ret i32 %tmpand
}
|
||||
|
||||
; Test for CodeGenPrepare::optimizeLoadExt(): check case of zext-able
; load feeding a phi in the same block.
define void @test_free_zext4(i32* %ptr, i32* %ptr2, i32* %dst) {
; CHECK-LABEL: test_free_zext4:
; CHECK: ldrh [[REG:w[0-9]+]]
; TODO: fix isel to remove final and
; XCHECK-NOT: and {{w[0-9]+}}, {{w[0-9]+}}, #0xffff
; CHECK: ldrh [[REG:w[0-9]+]]
bb1:
  %load1 = load i32, i32* %ptr, align 4
  br label %loop
loop:
  %phi = phi i32 [ %load1, %bb1 ], [ %load2, %loop ]
  %and = and i32 %phi, 65535
  store i32 %and, i32* %dst, align 4
  %load2 = load i32, i32* %ptr2, align 4
  %cmp = icmp ne i32 %and, 0
  br i1 %cmp, label %loop, label %end
end:
  ret void
}
|
||||
|
82
test/Transforms/CodeGenPrepare/free-zext.ll
Normal file
82
test/Transforms/CodeGenPrepare/free-zext.ll
Normal file
@ -0,0 +1,82 @@
|
||||
; RUN: opt -S -codegenprepare -mtriple=aarch64-linux %s | FileCheck %s
|
||||
|
||||
; Test for CodeGenPrepare::optimizeLoadExt(): simple case: two loads
; feeding a phi that zext's each loaded value.
define i32 @test_free_zext(i32* %ptr, i32* %ptr2, i32 %c) {
; CHECK-LABEL: @test_free_zext(
bb1:
; CHECK-LABEL: bb1:
; CHECK: %[[T1:.*]] = load
; CHECK: %[[A1:.*]] = and i32 %[[T1]], 65535
  %load1 = load i32, i32* %ptr, align 4
  %cmp = icmp ne i32 %c, 0
  br i1 %cmp, label %bb2, label %bb3
bb2:
; CHECK-LABEL: bb2:
; CHECK: %[[T2:.*]] = load
; CHECK: %[[A2:.*]] = and i32 %[[T2]], 65535
  %load2 = load i32, i32* %ptr2, align 4
  br label %bb3
bb3:
; CHECK-LABEL: bb3:
; CHECK: phi i32 [ %[[A1]], %bb1 ], [ %[[A2]], %bb2 ]
  %phi = phi i32 [ %load1, %bb1 ], [ %load2, %bb2 ]
  %and = and i32 %phi, 65535
  ret i32 %and
}
|
||||
|
||||
; Test for CodeGenPrepare::optimizeLoadExt(): exercise all opcode
; cases of active bit calculation.
define i32 @test_free_zext2(i32* %ptr, i16* %dst16, i32* %dst32, i32 %c) {
; CHECK-LABEL: @test_free_zext2(
bb1:
; CHECK-LABEL: bb1:
; CHECK: %[[T1:.*]] = load
; CHECK: %[[A1:.*]] = and i32 %[[T1]], 65535
  %load1 = load i32, i32* %ptr, align 4
  %cmp = icmp ne i32 %c, 0
  br i1 %cmp, label %bb2, label %bb4
bb2:
; CHECK-LABEL: bb2:
  %trunc = trunc i32 %load1 to i16
  store i16 %trunc, i16* %dst16, align 2
  br i1 %cmp, label %bb3, label %bb4
bb3:
; CHECK-LABEL: bb3:
  %shl = shl i32 %load1, 16
  store i32 %shl, i32* %dst32, align 4
  br label %bb4
bb4:
; CHECK-LABEL: bb4:
; CHECK-NOT: and
; CHECK: ret i32 %[[A1]]
  %and = and i32 %load1, 65535
  ret i32 %and
}
|
||||
|
||||
; Test for CodeGenPrepare::optimizeLoadExt(): check case of zext-able
; load feeding a phi in the same block.
define void @test_free_zext3(i32* %ptr, i32* %ptr2, i32* %dst, i64* %c) {
; CHECK-LABEL: @test_free_zext3(
bb1:
; CHECK-LABEL: bb1:
; CHECK: %[[T1:.*]] = load
; CHECK: %[[A1:.*]] = and i32 %[[T1]], 65535
  %load1 = load i32, i32* %ptr, align 4
  br label %loop
loop:
; CHECK-LABEL: loop:
; CHECK: phi i32 [ %[[A1]], %bb1 ], [ %[[A2]], %loop ]
  %phi = phi i32 [ %load1, %bb1 ], [ %load2, %loop ]
  %and = and i32 %phi, 65535
  store i32 %and, i32* %dst, align 4
  %idx = load volatile i64, i64* %c, align 4
  %addr = getelementptr inbounds i32, i32* %ptr2, i64 %idx
; CHECK: %[[T2:.*]] = load i32
; CHECK: %[[A2:.*]] = and i32 %[[T2]], 65535
  %load2 = load i32, i32* %addr, align 4
  %cmp = icmp ne i64 %idx, 0
  br i1 %cmp, label %loop, label %end
end:
  ret void
}
|
Loading…
Reference in New Issue
Block a user