[BypassSlowDivision] Refactor fast division insertion logic (NFC)

The main goal of this patch is to break the large insertFastDiv function into
separate pieces, so that different fast-division insertion logic can later be
implemented by reusing some of these pieces.

Differential Revision: https://reviews.llvm.org/D29896


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@296828 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Nikolai Bozhenov 2017-03-02 22:05:07 +00:00
parent 15497c13fd
commit 7c6958332a

View File

@ -36,12 +36,21 @@ namespace {
: SignedOp(InSignedOp), Dividend(InDividend), Divisor(InDivisor) {}
};
struct DivPhiNodes {
PHINode *Quotient;
PHINode *Remainder;
struct QuotRemPair {
Value *Quotient;
Value *Remainder;
DivPhiNodes(PHINode *InQuotient, PHINode *InRemainder)
: Quotient(InQuotient), Remainder(InRemainder) {}
QuotRemPair(Value *InQuotient, Value *InRemainder)
: Quotient(InQuotient), Remainder(InRemainder) {}
};
/// A quotient and remainder, plus a BB from which they logically "originate".
/// If you use Quotient or Remainder in a Phi node, you should use BB as its
/// corresponding predecessor.
struct QuotRemWithBB {
// Block to name as the incoming predecessor when Quotient/Remainder feed a Phi.
BasicBlock *BB = nullptr;
// Quotient value; stays null until a producer fills it in.
Value *Quotient = nullptr;
// Remainder value; stays null until a producer fills it in.
Value *Remainder = nullptr;
};
}
@ -69,92 +78,174 @@ namespace llvm {
}
};
typedef DenseMap<DivOpInfo, DivPhiNodes> DivCacheTy;
typedef DenseMap<DivOpInfo, QuotRemPair> DivCacheTy;
typedef DenseMap<unsigned, unsigned> BypassWidthsTy;
}
// insertFastDiv - Substitutes the div/rem instruction with code that checks the
// value of the operands and uses a shorter-faster div/rem instruction when
// possible and the longer-slower div/rem instruction otherwise.
static bool insertFastDiv(Instruction *I, IntegerType *BypassType,
bool UseDivOp, bool UseSignedOp,
DivCacheTy &PerBBDivCache) {
Function *F = I->getParent()->getParent();
// Get instruction operands
Value *Dividend = I->getOperand(0);
Value *Divisor = I->getOperand(1);
namespace {
class FastDivInsertionTask {
bool IsValidTask = false;
Instruction *SlowDivOrRem = nullptr;
IntegerType *BypassType = nullptr;
BasicBlock *MainBB = nullptr;
if (isa<ConstantInt>(Divisor)) {
// Division by a constant should have been solved and replaced earlier
// in the pipeline.
return false;
QuotRemWithBB createSlowBB(BasicBlock *Successor);
QuotRemWithBB createFastBB(BasicBlock *Successor);
QuotRemPair createDivRemPhiNodes(QuotRemWithBB &LHS, QuotRemWithBB &RHS,
BasicBlock *PhiBB);
Value *insertOperandRuntimeCheck();
Optional<QuotRemPair> insertFastDivAndRem();
bool isSignedOp() {
return SlowDivOrRem->getOpcode() == Instruction::SDiv ||
SlowDivOrRem->getOpcode() == Instruction::SRem;
}
bool isDivisionOp() {
return SlowDivOrRem->getOpcode() == Instruction::SDiv ||
SlowDivOrRem->getOpcode() == Instruction::UDiv;
}
Type *getSlowType() { return SlowDivOrRem->getType(); }
public:
FastDivInsertionTask(Instruction *I, const BypassWidthsTy &BypassWidths);
Value *getReplacement(DivCacheTy &Cache);
};
} // anonymous namespace
FastDivInsertionTask::FastDivInsertionTask(Instruction *I,
const BypassWidthsTy &BypassWidths) {
switch (I->getOpcode()) {
case Instruction::UDiv:
case Instruction::SDiv:
case Instruction::URem:
case Instruction::SRem:
SlowDivOrRem = I;
break;
default:
// I is not a div/rem operation.
return;
}
// If the numerator is a constant, bail if it doesn't fit into BypassType.
if (ConstantInt *ConstDividend = dyn_cast<ConstantInt>(Dividend))
if (ConstDividend->getValue().getActiveBits() > BypassType->getBitWidth())
return false;
// Skip division on vector types. Only optimize integer instructions.
IntegerType *SlowType = dyn_cast<IntegerType>(SlowDivOrRem->getType());
if (!SlowType)
return;
// Basic Block is split before divide
BasicBlock *MainBB = &*I->getParent();
BasicBlock *SuccessorBB = MainBB->splitBasicBlock(I);
// Skip if this bitwidth is not bypassed.
auto BI = BypassWidths.find(SlowType->getBitWidth());
if (BI == BypassWidths.end())
return;
// Add new basic block for slow divide operation
BasicBlock *SlowBB =
BasicBlock::Create(F->getContext(), "", MainBB->getParent(), SuccessorBB);
SlowBB->moveBefore(SuccessorBB);
IRBuilder<> SlowBuilder(SlowBB, SlowBB->begin());
Value *SlowQuotientV;
Value *SlowRemainderV;
if (UseSignedOp) {
SlowQuotientV = SlowBuilder.CreateSDiv(Dividend, Divisor);
SlowRemainderV = SlowBuilder.CreateSRem(Dividend, Divisor);
// Get type for div/rem instruction with bypass bitwidth.
IntegerType *BT = IntegerType::get(I->getContext(), BI->second);
BypassType = BT;
// The original basic block.
MainBB = I->getParent();
// The instruction is indeed a slow div or rem operation.
IsValidTask = true;
}
/// Returns the value that should stand in for the slow div/rem instruction.
///
/// A quotient/remainder pair already cached for the same signedness, dividend
/// and divisor is reused. Otherwise insertFastDivAndRem is called to generate
/// the pair, which is then stored in \p Cache. Returns nullptr when the task
/// is not valid or when insertFastDivAndRem produces no result.
Value *FastDivInsertionTask::getReplacement(DivCacheTy &Cache) {
  // Nothing to do for an instruction the constructor did not accept.
  if (!IsValidTask)
    return nullptr;

  // The cache is keyed on signedness plus both operands.
  DivOpInfo Key(isSignedOp(), SlowDivOrRem->getOperand(0),
                SlowDivOrRem->getOperand(1));

  auto Entry = Cache.find(Key);
  if (Entry == Cache.end()) {
    // Cache miss: try to generate the quotient/remainder pair now.
    Optional<QuotRemPair> Generated = insertFastDivAndRem();
    if (!Generated)
      return nullptr;
    Entry = Cache.insert({Key, *Generated}).first;
  }

  // Hand back whichever half of the pair matches the original opcode.
  QuotRemPair &Cached = Entry->second;
  if (isDivisionOp())
    return Cached.Quotient;
  return Cached.Remainder;
}
/// Add new basic block for slow div and rem operations and put it before
/// SuccessorBB.
QuotRemWithBB FastDivInsertionTask::createSlowBB(BasicBlock *SuccessorBB) {
QuotRemWithBB DivRemPair;
DivRemPair.BB = BasicBlock::Create(MainBB->getParent()->getContext(), "",
MainBB->getParent(), SuccessorBB);
IRBuilder<> Builder(DivRemPair.BB, DivRemPair.BB->begin());
Value *Dividend = SlowDivOrRem->getOperand(0);
Value *Divisor = SlowDivOrRem->getOperand(1);
if (isSignedOp()) {
DivRemPair.Quotient = Builder.CreateSDiv(Dividend, Divisor);
DivRemPair.Remainder = Builder.CreateSRem(Dividend, Divisor);
} else {
SlowQuotientV = SlowBuilder.CreateUDiv(Dividend, Divisor);
SlowRemainderV = SlowBuilder.CreateURem(Dividend, Divisor);
DivRemPair.Quotient = Builder.CreateUDiv(Dividend, Divisor);
DivRemPair.Remainder = Builder.CreateURem(Dividend, Divisor);
}
SlowBuilder.CreateBr(SuccessorBB);
// Add new basic block for fast divide operation
BasicBlock *FastBB =
BasicBlock::Create(F->getContext(), "", MainBB->getParent(), SuccessorBB);
FastBB->moveBefore(SlowBB);
IRBuilder<> FastBuilder(FastBB, FastBB->begin());
Value *ShortDivisorV = FastBuilder.CreateCast(Instruction::Trunc, Divisor,
BypassType);
Value *ShortDividendV = FastBuilder.CreateCast(Instruction::Trunc, Dividend,
BypassType);
Builder.CreateBr(SuccessorBB);
return DivRemPair;
}
// udiv/urem because optimization only handles positive numbers
Value *ShortQuotientV = FastBuilder.CreateUDiv(ShortDividendV, ShortDivisorV);
Value *ShortRemainderV = FastBuilder.CreateURem(ShortDividendV,
ShortDivisorV);
Value *FastQuotientV = FastBuilder.CreateCast(Instruction::ZExt,
ShortQuotientV,
Dividend->getType());
Value *FastRemainderV = FastBuilder.CreateCast(Instruction::ZExt,
ShortRemainderV,
Dividend->getType());
FastBuilder.CreateBr(SuccessorBB);
/// Add new basic block for fast div and rem operations and put it before
/// SuccessorBB.
QuotRemWithBB FastDivInsertionTask::createFastBB(BasicBlock *SuccessorBB) {
QuotRemWithBB DivRemPair;
DivRemPair.BB = BasicBlock::Create(MainBB->getParent()->getContext(), "",
MainBB->getParent(), SuccessorBB);
IRBuilder<> Builder(DivRemPair.BB, DivRemPair.BB->begin());
// Phi nodes for result of div and rem
IRBuilder<> SuccessorBuilder(SuccessorBB, SuccessorBB->begin());
PHINode *QuoPhi = SuccessorBuilder.CreatePHI(I->getType(), 2);
QuoPhi->addIncoming(SlowQuotientV, SlowBB);
QuoPhi->addIncoming(FastQuotientV, FastBB);
PHINode *RemPhi = SuccessorBuilder.CreatePHI(I->getType(), 2);
RemPhi->addIncoming(SlowRemainderV, SlowBB);
RemPhi->addIncoming(FastRemainderV, FastBB);
Value *Dividend = SlowDivOrRem->getOperand(0);
Value *Divisor = SlowDivOrRem->getOperand(1);
Value *ShortDivisorV =
Builder.CreateCast(Instruction::Trunc, Divisor, BypassType);
Value *ShortDividendV =
Builder.CreateCast(Instruction::Trunc, Dividend, BypassType);
// Replace I with appropriate phi node
if (UseDivOp)
I->replaceAllUsesWith(QuoPhi);
else
I->replaceAllUsesWith(RemPhi);
I->eraseFromParent();
// udiv/urem because this optimization only handles positive numbers.
Value *ShortQV = Builder.CreateUDiv(ShortDividendV, ShortDivisorV);
Value *ShortRV = Builder.CreateURem(ShortDividendV, ShortDivisorV);
DivRemPair.Quotient =
Builder.CreateCast(Instruction::ZExt, ShortQV, getSlowType());
DivRemPair.Remainder =
Builder.CreateCast(Instruction::ZExt, ShortRV, getSlowType());
Builder.CreateBr(SuccessorBB);
// Combine operands into a single value with OR for value testing below
MainBB->getInstList().back().eraseFromParent();
IRBuilder<> MainBuilder(MainBB, MainBB->end());
return DivRemPair;
}
/// Builds two-input Phi nodes at the start of \p PhiBB that merge the quotient
/// and the remainder coming from \p LHS and \p RHS. Each incoming value is
/// paired with the BB recorded in its QuotRemWithBB. The quotient Phi is
/// created before the remainder Phi.
QuotRemPair FastDivInsertionTask::createDivRemPhiNodes(QuotRemWithBB &LHS,
                                                       QuotRemWithBB &RHS,
                                                       BasicBlock *PhiBB) {
  IRBuilder<> PhiBuilder(PhiBB, PhiBB->begin());

  // Creates one Phi of the slow type and wires in the LHS edge, then the RHS
  // edge.
  auto MergeValues = [&](Value *FromLHS, Value *FromRHS) -> PHINode * {
    PHINode *Phi = PhiBuilder.CreatePHI(getSlowType(), 2);
    Phi->addIncoming(FromLHS, LHS.BB);
    Phi->addIncoming(FromRHS, RHS.BB);
    return Phi;
  };

  PHINode *QuotientPhi = MergeValues(LHS.Quotient, RHS.Quotient);
  PHINode *RemainderPhi = MergeValues(LHS.Remainder, RHS.Remainder);
  return QuotRemPair(QuotientPhi, RemainderPhi);
}
/// Creates a runtime check to test whether both the divisor and dividend fit
/// into BypassType. The check is inserted at the end of MainBB. True return
/// value means that the operands fit.
Value *FastDivInsertionTask::insertOperandRuntimeCheck() {
IRBuilder<> Builder(MainBB, MainBB->end());
Value *Dividend = SlowDivOrRem->getOperand(0);
Value *Divisor = SlowDivOrRem->getOperand(1);
// We should have bailed out above if the divisor is a constant, but the
// dividend may still be a constant. Set OrV to our non-constant operands
@ -163,65 +254,54 @@ static bool insertFastDiv(Instruction *I, IntegerType *BypassType,
Value *OrV;
if (!isa<ConstantInt>(Dividend))
OrV = MainBuilder.CreateOr(Dividend, Divisor);
OrV = Builder.CreateOr(Dividend, Divisor);
else
OrV = Divisor;
// BitMask is inverted to check if the operands are
// larger than the bypass type
uint64_t BitMask = ~BypassType->getBitMask();
Value *AndV = MainBuilder.CreateAnd(OrV, BitMask);
Value *AndV = Builder.CreateAnd(OrV, BitMask);
// Compare operand values and branch
Value *ZeroV = ConstantInt::getSigned(Dividend->getType(), 0);
Value *CmpV = MainBuilder.CreateICmpEQ(AndV, ZeroV);
MainBuilder.CreateCondBr(CmpV, FastBB, SlowBB);
// Cache phi nodes to be used later in place of other instances
// of div or rem with the same sign, dividend, and divisor
DivOpInfo Key(UseSignedOp, Dividend, Divisor);
DivPhiNodes Value(QuoPhi, RemPhi);
PerBBDivCache.insert(std::pair<DivOpInfo, DivPhiNodes>(Key, Value));
return true;
// Compare operand values
Value *ZeroV = ConstantInt::getSigned(getSlowType(), 0);
return Builder.CreateICmpEQ(AndV, ZeroV);
}
// reuseOrInsertFastDiv - Reuses previously computed quotient or remainder from
// the current BB if operands and operation are identical. Otherwise calls
// insertFastDiv to perform the optimization and caches the resulting quotient
// and remainder.
static bool reuseOrInsertFastDiv(Instruction *I, IntegerType *BypassType,
bool UseDivOp, bool UseSignedOp,
DivCacheTy &PerBBDivCache) {
// Get instruction operands
DivOpInfo Key(UseSignedOp, I->getOperand(0), I->getOperand(1));
DivCacheTy::iterator CacheI = PerBBDivCache.find(Key);
/// Substitutes the div/rem instruction with code that checks the value of the
/// operands and uses a shorter-faster div/rem instruction when possible.
Optional<QuotRemPair> FastDivInsertionTask::insertFastDivAndRem() {
Value *Dividend = SlowDivOrRem->getOperand(0);
Value *Divisor = SlowDivOrRem->getOperand(1);
if (CacheI == PerBBDivCache.end()) {
// If previous instance does not exist, insert fast div
return insertFastDiv(I, BypassType, UseDivOp, UseSignedOp, PerBBDivCache);
if (isa<ConstantInt>(Divisor)) {
// Keep division by a constant for DAGCombiner.
return None;
}
// Replace operation value with previously generated phi node
DivPhiNodes &Value = CacheI->second;
if (UseDivOp) {
// Replace all uses of div instruction with quotient phi node
I->replaceAllUsesWith(Value.Quotient);
} else {
// Replace all uses of rem instruction with remainder phi node
I->replaceAllUsesWith(Value.Remainder);
}
// If the numerator is a constant, bail if it doesn't fit into BypassType.
if (ConstantInt *ConstDividend = dyn_cast<ConstantInt>(Dividend))
if (ConstDividend->getValue().getActiveBits() > BypassType->getBitWidth())
return None;
// Remove redundant operation
I->eraseFromParent();
return true;
// Split the basic block before the div/rem.
BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDivOrRem);
// Remove the unconditional branch from MainBB to SuccessorBB.
MainBB->getInstList().back().eraseFromParent();
QuotRemWithBB Fast = createFastBB(SuccessorBB);
QuotRemWithBB Slow = createSlowBB(SuccessorBB);
QuotRemPair Result = createDivRemPhiNodes(Fast, Slow, SuccessorBB);
Value *CmpV = insertOperandRuntimeCheck();
IRBuilder<> Builder(MainBB, MainBB->end());
Builder.CreateCondBr(CmpV, Fast.BB, Slow.BB);
return Result;
}
// bypassSlowDivision - This optimization identifies DIV instructions in a BB
// that can be profitably bypassed and carried out with a shorter, faster
// divide.
bool llvm::bypassSlowDivision(
BasicBlock *BB, const DenseMap<unsigned int, unsigned int> &BypassWidths) {
DivCacheTy DivCache;
/// This optimization identifies DIV/REM instructions in a BB that can be
/// profitably bypassed and carried out with a shorter, faster divide.
bool llvm::bypassSlowDivision(BasicBlock *BB,
const BypassWidthsTy &BypassWidths) {
DivCacheTy PerBBDivCache;
bool MadeChange = false;
Instruction* Next = &*BB->begin();
@ -231,42 +311,20 @@ bool llvm::bypassSlowDivision(
Instruction* I = Next;
Next = Next->getNextNode();
// Get instruction details
unsigned Opcode = I->getOpcode();
bool UseDivOp = Opcode == Instruction::SDiv || Opcode == Instruction::UDiv;
bool UseRemOp = Opcode == Instruction::SRem || Opcode == Instruction::URem;
bool UseSignedOp = Opcode == Instruction::SDiv ||
Opcode == Instruction::SRem;
// Only optimize div or rem ops
if (!UseDivOp && !UseRemOp)
continue;
// Skip division on vector types, only optimize integer instructions
if (!I->getType()->isIntegerTy())
continue;
// Get bitwidth of div/rem instruction
IntegerType *T = cast<IntegerType>(I->getType());
unsigned int bitwidth = T->getBitWidth();
// Continue if bitwidth is not bypassed
DenseMap<unsigned int, unsigned int>::const_iterator BI = BypassWidths.find(bitwidth);
if (BI == BypassWidths.end())
continue;
// Get type for div/rem instruction with bypass bitwidth
IntegerType *BT = IntegerType::get(I->getContext(), BI->second);
MadeChange |= reuseOrInsertFastDiv(I, BT, UseDivOp, UseSignedOp, DivCache);
FastDivInsertionTask Task(I, BypassWidths);
if (Value *Replacement = Task.getReplacement(PerBBDivCache)) {
I->replaceAllUsesWith(Replacement);
I->eraseFromParent();
MadeChange = true;
}
}
// Above we eagerly create divs and rems, as pairs, so that we can efficiently
// create divrem machine instructions. Now erase any unused divs / rems so we
// don't leave extra instructions sitting around.
for (auto &KV : DivCache)
for (Instruction *Phi : {KV.second.Quotient, KV.second.Remainder})
RecursivelyDeleteTriviallyDeadInstructions(Phi);
for (auto &KV : PerBBDivCache)
for (Value *V : {KV.second.Quotient, KV.second.Remainder})
RecursivelyDeleteTriviallyDeadInstructions(V);
return MadeChange;
}