diff --git a/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp b/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
index d775764b43bf..30623021f86d 100644
--- a/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
@@ -129,6 +129,380 @@ INITIALIZE_PASS_END(HexagonLoopIdiomRecognize, "hexagon-loop-idiom",
     "Recognize Hexagon-specific loop idioms", false, false)
 
 
+namespace {
+  // A small rule-based rewriter for expression trees. Rules are tried in
+  // the order in which they were added. A rule is given an instruction and
+  // returns a replacement value, or nullptr when it does not apply.
+  struct Simplifier {
+    typedef std::function<Value* (Instruction*, LLVMContext&)> Rule;
+
+    void addRule(const Rule &R) { Rules.push_back(R); }
+
+  private:
+    typedef std::deque<Value*> WorkListType;
+    typedef std::set<Value*> ValueSetType;
+    std::vector<Rule> Rules;
+
+    // Upper bound on the number of worklist steps made by simplify(), so
+    // that a set of rules that keeps producing new matches cannot make it
+    // run forever.
+    enum : unsigned { StepLimit = 10000 };
+
+  public:
+    // The working copy of a single expression: clones of the non-PHI
+    // instructions of the expression that are in the same block as its
+    // root. The clones are in no block until materialize() is called.
+    struct Context {
+      typedef DenseMap<Value*,Value*> ValueMapType;
+
+      Value *Root;
+      ValueSetType Used;
+      ValueMapType Clones, Orig;
+      LLVMContext &Ctx;
+
+      Context(Instruction *Exp)
+        : Ctx(Exp->getParent()->getParent()->getContext()) {
+        initialize(Exp);
+        reset();
+      }
+      ~Context() { cleanup(); }
+      void print(raw_ostream &OS, const Value *V) const;
+
+      Value *materialize(BasicBlock *B, BasicBlock::iterator At);
+
+    private:
+      void initialize(Instruction *Exp);
+      void reset();
+      void cleanup();
+      void cleanup(Value *V);
+
+      bool equal(const Instruction *I, const Instruction *J) const;
+      Value *find(Value *Tree, Value *Sub) const;
+      Value *subst(Value *Tree, Value *OldV, Value *NewV);
+      void replace(Value *OldV, Value *NewV);
+      void link(Instruction *I, BasicBlock *B, BasicBlock::iterator At);
+
+      friend struct Simplifier;
+    };
+
+    // Apply the rules to the expression in C until none applies. Returns
+    // the new root, or nullptr if StepLimit was reached.
+    Value *simplify(Context &C);
+  };
+
+  // Helper for printing a value from a Context (the root by default).
+  struct PE {
+    PE(const Simplifier::Context &c, Value *v = nullptr) : C(c), V(v) {}
+    const Simplifier::Context &C;
+    const Value *V;
+  };
+
+  raw_ostream &operator<< (raw_ostream &OS, const PE &P) LLVM_ATTRIBUTE_USED;
+  raw_ostream &operator<< (raw_ostream &OS, const PE &P) {
+    P.C.print(OS, P.V ? P.V : P.C.Root);
+    return OS;
+  }
+}
+
+
+// Print V to OS. Instructions that are not in any block are printed as
+// "opcode operand..." trees, recursively.
+void Simplifier::Context::print(raw_ostream &OS, const Value *V) const {
+  const auto *U = dyn_cast<const Instruction>(V);
+  if (!U) {
+    OS << V << '(' << *V << ')';
+    return;
+  }
+
+  if (U->getParent()) {
+    OS << U << '(';
+    U->printAsOperand(OS, true);
+    OS << ')';
+    return;
+  }
+
+  unsigned N = U->getNumOperands();
+  if (N != 0)
+    OS << U << '(';
+  OS << U->getOpcodeName();
+  for (const Value *Op : U->operands()) {
+    OS << ' ';
+    print(OS, Op);
+  }
+  if (N != 0)
+    OS << ')';
+}
+
+
+void Simplifier::Context::initialize(Instruction *Exp) {
+  // Perform a deep clone of the expression, set Root to the root
+  // of the clone, and build a map from the cloned values to the
+  // original ones.
+  BasicBlock *Block = Exp->getParent();
+  WorkListType Q;
+  Q.push_back(Exp);
+
+  while (!Q.empty()) {
+    Value *V = Q.front();
+    Q.pop_front();
+    if (Clones.find(V) != Clones.end())
+      continue;
+    if (Instruction *U = dyn_cast<Instruction>(V)) {
+      if (isa<PHINode>(U) || U->getParent() != Block)
+        continue;
+      for (Value *Op : U->operands())
+        Q.push_back(Op);
+      Clones.insert({U, U->clone()});
+    }
+  }
+
+  for (std::pair<Value*,Value*> P : Clones) {
+    Instruction *U = cast<Instruction>(P.second);
+    for (unsigned i = 0, n = U->getNumOperands(); i != n; ++i) {
+      auto F = Clones.find(U->getOperand(i));
+      if (F != Clones.end())
+        U->setOperand(i, F->second);
+    }
+    Orig.insert({P.second, P.first});
+  }
+
+  auto R = Clones.find(Exp);
+  assert(R != Clones.end());
+  Root = R->second;
+}
+
+
+// Recompute Used as the set of unlinked instructions reachable from Root,
+// and drop the operand references of those that are no longer reachable.
+void Simplifier::Context::reset() {
+  ValueSetType NewUsed;
+  WorkListType Q;
+  Q.push_back(Root);
+
+  while (!Q.empty()) {
+    Instruction *U = dyn_cast<Instruction>(Q.front());
+    Q.pop_front();
+    if (!U || U->getParent())
+      continue;
+    NewUsed.insert(U);
+    for (Value *Op : U->operands())
+      Q.push_back(Op);
+  }
+  for (Value *V : Used)
+    if (!NewUsed.count(V))
+      cast<Instruction>(V)->dropAllReferences();
+  Used = NewUsed;
+}
+
+
+// Replace every use of OldV inside the unlinked part of Tree with NewV.
+// Returns the resulting tree, which is NewV itself when Tree == OldV.
+Value *Simplifier::Context::subst(Value *Tree, Value *OldV, Value *NewV) {
+  if (Tree == OldV) {
+    cleanup(OldV);
+    return NewV;
+  }
+
+  WorkListType Q;
+  Q.push_back(Tree);
+  while (!Q.empty()) {
+    Instruction *U = dyn_cast<Instruction>(Q.front());
+    Q.pop_front();
+    // If U is not an instruction, or it's not a clone, skip it.
+    if (!U || U->getParent())
+      continue;
+    for (unsigned i = 0, n = U->getNumOperands(); i != n; ++i) {
+      Value *Op = U->getOperand(i);
+      if (Op == OldV) {
+        cleanup(OldV);
+        U->setOperand(i, NewV);
+      } else {
+        Q.push_back(Op);
+      }
+    }
+  }
+  return Tree;
+}
+
+
+// Replace OldV with NewV in Root, reusing subtrees that Root already has.
+void Simplifier::Context::replace(Value *OldV, Value *NewV) {
+  if (Root == OldV) {
+    Root = NewV;
+    reset();
+    return;
+  }
+
+  // NewV may be a complex tree that has just been created by one of the
+  // transformation rules. We need to make sure that it is commoned with
+  // the existing Root to the maximum extent possible.
+  // Identify all subtrees of NewV (including NewV itself) that have
+  // equivalent counterparts in Root, and replace those subtrees with
+  // these counterparts.
+  WorkListType Q;
+  Q.push_back(NewV);
+  while (!Q.empty()) {
+    Value *V = Q.front();
+    Q.pop_front();
+    Instruction *U = dyn_cast<Instruction>(V);
+    if (!U || U->getParent())
+      continue;
+    if (Value *DupV = find(Root, V)) {
+      if (DupV != V)
+        NewV = subst(NewV, V, DupV);
+    } else {
+      for (Value *Op : U->operands())
+        Q.push_back(Op);
+    }
+  }
+
+  // Now, simply replace OldV with NewV in Root.
+  Root = subst(Root, OldV, NewV);
+  reset();
+}
+
+
+// Drop the operand references of all Used instructions that were never
+// linked into a block.
+// NOTE(review): the unlinked instructions themselves are not deleted
+// here; confirm whether they are released elsewhere.
+void Simplifier::Context::cleanup() {
+  for (Value *V : Used) {
+    Instruction *U = cast<Instruction>(V);
+    if (!U->getParent())
+      U->dropAllReferences();
+  }
+}
+
+
+// Drop the references held by the unlinked tree rooted at V, stopping at
+// instructions that are recorded in Used.
+void Simplifier::Context::cleanup(Value *V) {
+  if (!isa<Instruction>(V) || cast<Instruction>(V)->getParent() != nullptr)
+    return;
+  WorkListType Q;
+  Q.push_back(V);
+  while (!Q.empty()) {
+    Instruction *U = dyn_cast<Instruction>(Q.front());
+    Q.pop_front();
+    if (!U || U->getParent() || Used.count(U))
+      continue;
+    for (Value *Op : U->operands())
+      Q.push_back(Op);
+    U->dropAllReferences();
+  }
+}
+
+
+// Structural equality of two expression trees.
+bool Simplifier::Context::equal(const Instruction *I,
+                                const Instruction *J) const {
+  if (I == J)
+    return true;
+  if (!I->isSameOperationAs(J))
+    return false;
+  if (isa<PHINode>(I))
+    return I->isIdenticalTo(J);
+
+  for (unsigned i = 0, n = I->getNumOperands(); i != n; ++i) {
+    Value *OpI = I->getOperand(i), *OpJ = J->getOperand(i);
+    if (OpI == OpJ)
+      continue;
+    auto *InI = dyn_cast<const Instruction>(OpI);
+    auto *InJ = dyn_cast<const Instruction>(OpJ);
+    if (InI && InJ) {
+      if (!equal(InI, InJ))
+        return false;
+    } else if (InI != InJ || !InI)
+      return false;
+  }
+  return true;
+}
+
+
+// Breadth-first search of Tree for Sub or for a subtree equal to it.
+// Returns the match, or nullptr if there is none.
+Value *Simplifier::Context::find(Value *Tree, Value *Sub) const {
+  Instruction *SubI = dyn_cast<Instruction>(Sub);
+  WorkListType Q;
+  Q.push_back(Tree);
+
+  while (!Q.empty()) {
+    Value *V = Q.front();
+    Q.pop_front();
+    if (V == Sub)
+      return V;
+    Instruction *U = dyn_cast<Instruction>(V);
+    if (!U || U->getParent())
+      continue;
+    if (SubI && equal(SubI, U))
+      return U;
+    assert(!isa<PHINode>(U));
+    for (Value *Op : U->operands())
+      Q.push_back(Op);
+  }
+  return nullptr;
+}
+
+
+// Insert I and its unlinked operands (operands first) into B before At.
+void Simplifier::Context::link(Instruction *I, BasicBlock *B,
+      BasicBlock::iterator At) {
+  if (I->getParent())
+    return;
+
+  for (Value *Op : I->operands()) {
+    if (Instruction *OpI = dyn_cast<Instruction>(Op))
+      link(OpI, B, At);
+  }
+
+  B->getInstList().insert(At, I);
+}
+
+
+// Link the expression rooted at Root into B before At and return Root.
+Value *Simplifier::Context::materialize(BasicBlock *B,
+      BasicBlock::iterator At) {
+  if (Instruction *RootI = dyn_cast<Instruction>(Root))
+    link(RootI, B, At);
+  return Root;
+}
+
+
+// Run the rules over the expression in C, re-queueing the root after
+// every successful rewrite. Returns the final root, or nullptr if the
+// rewriting did not finish within StepLimit steps.
+Value *Simplifier::simplify(Context &C) {
+  WorkListType Q;
+  Q.push_back(C.Root);
+  unsigned Steps = 0;
+
+  while (!Q.empty()) {
+    if (Steps++ >= StepLimit)
+      return nullptr;
+    Instruction *U = dyn_cast<Instruction>(Q.front());
+    Q.pop_front();
+    if (!U || U->getParent() || !C.Used.count(U))
+      continue;
+    bool Changed = false;
+    for (Rule &R : Rules) {
+      Value *W = R(U, C.Ctx);
+      if (!W)
+        continue;
+      Changed = true;
+      C.replace(U, W);
+      Q.push_back(C.Root);
+      break;
+    }
+    if (!Changed) {
+      for (Value *Op : U->operands())
+        Q.push_back(Op);
+    }
+  }
+  return C.Root;
+}
+
+
 //===----------------------------------------------------------------------===//
 //
 //          Implementation of PolynomialMultiplyRecognize
@@ -147,6 +521,15 @@ namespace {
   private:
     typedef SetVector<Value*> ValueSeq;
 
+    // The integer type that loop values are promoted to (i32).
+    IntegerType *getPmpyType() const {
+      LLVMContext &Ctx = CurLoop->getHeader()->getParent()->getContext();
+      return IntegerType::get(Ctx, 32);
+    }
+    bool isPromotableTo(Value *V, IntegerType *Ty);
+    void promoteTo(Instruction *In, IntegerType *DestTy, BasicBlock *LoopB);
+    bool promoteTypes(BasicBlock *LoopB, BasicBlock *ExitB);
+
     Value *getCountIV(BasicBlock *BB);
     bool findCycle(Value *Out, Value *In, ValueSeq &Cycle);
     void classifyCycle(Instruction *DivI, ValueSeq &Cycle, ValueSeq &Early,
@@ -176,6 +559,9 @@ namespace {
     unsigned getInverseMxN(unsigned QP);
     Value *generate(BasicBlock::iterator At, ParsedValues &PV);
 
+    void setupSimplifier();
+
+    Simplifier Simp;
     Loop *CurLoop;
     const DataLayout &DL;
     const DominatorTree &DT;
@@ -425,7 +811,6 @@ bool PolynomialMultiplyRecognize::scanSelect(SelectInst *SelI,
       BasicBlock *LoopB, BasicBlock *PrehB, Value *CIV, ParsedValues &PV,
       bool PreScan) {
   using namespace PatternMatch;
-
   // The basic pattern for R = P.Q is:
   // for i = 0..31
   //   R = phi (0, R')
@@ -529,6 +914,159 @@ bool PolynomialMultiplyRecognize::scanSelect(SelectInst *SelI,
 }
 
 
+
+// Check whether Val can be widened to DestTy without changing the bits it
+// produces and without adding any instructions.
+bool PolynomialMultiplyRecognize::isPromotableTo(Value *Val,
+      IntegerType *DestTy) {
+  IntegerType *T = dyn_cast<IntegerType>(Val->getType());
+  if (!T || T->getBitWidth() > DestTy->getBitWidth())
+    return false;
+  if (T->getBitWidth() == DestTy->getBitWidth())
+    return true;
+  // Non-instructions are promotable. The reason why an instruction may not
+  // be promotable is that it may produce a different result if its operands
+  // and the result are promoted, for example, it may produce more non-zero
+  // bits. While it would still be possible to represent the proper result
+  // in a wider type, it may require adding additional instructions (which
+  // we don't want to do).
+  Instruction *In = dyn_cast<Instruction>(Val);
+  if (!In)
+    return true;
+  // The bitwidth of the source type is smaller than the destination.
+  // Check if the individual operation can be promoted.
+  switch (In->getOpcode()) {
+    case Instruction::PHI:
+    case Instruction::ZExt:
+    case Instruction::And:
+    case Instruction::Or:
+    case Instruction::Xor:
+    case Instruction::LShr: // Shift right is ok.
+    case Instruction::Select:
+      return true;
+    case Instruction::ICmp:
+      if (CmpInst *CI = cast<CmpInst>(In))
+        return CI->isEquality() || CI->isUnsigned();
+      llvm_unreachable("Cast failed unexpectedly");
+    case Instruction::Add:
+      return In->hasNoSignedWrap() && In->hasNoUnsignedWrap();
+  }
+  return false;
+}
+
+
+// Widen In to DestTy in place: mutate its type, zero-extend PHI inputs
+// coming from outside LoopB, fold away redundant zexts, widen constants.
+void PolynomialMultiplyRecognize::promoteTo(Instruction *In,
+      IntegerType *DestTy, BasicBlock *LoopB) {
+  // Leave boolean values alone.
+  if (!In->getType()->isIntegerTy(1))
+    In->mutateType(DestTy);
+  unsigned DestBW = DestTy->getBitWidth();
+
+  // Handle PHIs.
+  if (PHINode *P = dyn_cast<PHINode>(In)) {
+    unsigned N = P->getNumIncomingValues();
+    for (unsigned i = 0; i != N; ++i) {
+      BasicBlock *InB = P->getIncomingBlock(i);
+      if (InB == LoopB)
+        continue;
+      Value *InV = P->getIncomingValue(i);
+      IntegerType *Ty = cast<IntegerType>(InV->getType());
+      // Do not promote values in PHI nodes of type i1.
+      if (Ty != P->getType()) {
+        // If the value type does not match the PHI type, the PHI type
+        // must have been promoted.
+        assert(Ty->getBitWidth() < DestBW);
+        InV = IRBuilder<>(InB->getTerminator()).CreateZExt(InV, DestTy);
+        P->setIncomingValue(i, InV);
+      }
+    }
+  } else if (ZExtInst *Z = dyn_cast<ZExtInst>(In)) {
+    Value *Op = Z->getOperand(0);
+    if (Op->getType() == Z->getType())
+      Z->replaceAllUsesWith(Op);
+    Z->eraseFromParent();
+    return;
+  }
+
+  // Promote immediates.
+  for (unsigned i = 0, n = In->getNumOperands(); i != n; ++i) {
+    if (ConstantInt *CI = dyn_cast<ConstantInt>(In->getOperand(i)))
+      if (CI->getType()->getBitWidth() < DestBW)
+        In->setOperand(i, ConstantInt::get(DestTy, CI->getZExtValue()));
+  }
+}
+
+
+// Promote all values in the single-block loop LoopB to the pmpy type and
+// truncate the results in ExitB. Returns false (before making any changes)
+// if the promotion cannot be done.
+bool PolynomialMultiplyRecognize::promoteTypes(BasicBlock *LoopB,
+      BasicBlock *ExitB) {
+  assert(LoopB);
+  // Skip loops where the exit block has more than one predecessor. The values
+  // coming from the loop block will be promoted to another type, and so the
+  // values coming into the exit block from other predecessors would also have
+  // to be promoted.
+  if (!ExitB || (ExitB->getSinglePredecessor() != LoopB))
+    return false;
+  IntegerType *DestTy = getPmpyType();
+  // Check if the exit values have types that are no wider than the type
+  // that we want to promote to.
+  unsigned DestBW = DestTy->getBitWidth();
+  for (Instruction &In : *ExitB) {
+    PHINode *P = dyn_cast<PHINode>(&In);
+    if (!P)
+      break;
+    if (P->getNumIncomingValues() != 1)
+      return false;
+    assert(P->getIncomingBlock(0) == LoopB);
+    IntegerType *T = dyn_cast<IntegerType>(P->getType());
+    if (!T || T->getBitWidth() > DestBW)
+      return false;
+  }
+
+  // Check all instructions in the loop.
+  for (Instruction &In : *LoopB)
+    if (!In.isTerminator() && !isPromotableTo(&In, DestTy))
+      return false;
+
+  // Perform the promotion.
+  std::vector<Instruction*> LoopIns;
+  std::transform(LoopB->begin(), LoopB->end(), std::back_inserter(LoopIns),
+                 [](Instruction &In) { return &In; });
+  for (Instruction *In : LoopIns)
+    // The terminator was not checked above and has no value to widen.
+    if (!In->isTerminator())
+      promoteTo(In, DestTy, LoopB);
+
+  // Fix up the PHI nodes in the exit block.
+  Instruction *EndI = ExitB->getFirstNonPHI();
+  BasicBlock::iterator End = EndI ? EndI->getIterator() : ExitB->end();
+  for (auto I = ExitB->begin(); I != End; ++I) {
+    PHINode *P = dyn_cast<PHINode>(I);
+    if (!P)
+      break;
+    Type *Ty0 = P->getIncomingValue(0)->getType();
+    Type *PTy = P->getType();
+    if (PTy != Ty0) {
+      assert(Ty0 == DestTy);
+      // In order to create the trunc, P must have the promoted type.
+      P->mutateType(Ty0);
+      Value *T = IRBuilder<>(ExitB, End).CreateTrunc(P, PTy);
+      // In order for the RAUW to work, the types of P and T must match.
+      P->mutateType(PTy);
+      P->replaceAllUsesWith(T);
+      // Final update of the P's type.
+      P->mutateType(Ty0);
+      cast<Instruction>(T)->setOperand(0, P);
+    }
+  }
+
+  return true;
+}
+
+
 bool PolynomialMultiplyRecognize::findCycle(Value *Out, Value *In,
       ValueSeq &Cycle) {
   // Out = ..., In, ...
@@ -699,6 +1237,7 @@ bool PolynomialMultiplyRecognize::keepsHighBitsZero(Value *V,
       case Instruction::Select:
       case Instruction::ICmp:
       case Instruction::PHI:
+      case Instruction::ZExt:
         return true;
     }
   }
@@ -985,13 +1524,175 @@ Value *PolynomialMultiplyRecognize::generate(BasicBlock::iterator At,
 }
 
 
+
+// Register the rewrite rules that recognize() applies to each select
+// instruction before the pre-scan.
+void PolynomialMultiplyRecognize::setupSimplifier() {
+  Simp.addRule(
+    // Sink zext past bitwise operations.
+    [](Instruction *I, LLVMContext &Ctx) -> Value* {
+      if (I->getOpcode() != Instruction::ZExt)
+        return nullptr;
+      Instruction *T = dyn_cast<Instruction>(I->getOperand(0));
+      if (!T)
+        return nullptr;
+      switch (T->getOpcode()) {
+        case Instruction::And:
+        case Instruction::Or:
+        case Instruction::Xor:
+          break;
+        default:
+          return nullptr;
+      }
+      IRBuilder<> B(Ctx);
+      return B.CreateBinOp(cast<BinaryOperator>(T)->getOpcode(),
+                           B.CreateZExt(T->getOperand(0), I->getType()),
+                           B.CreateZExt(T->getOperand(1), I->getType()));
+    });
+  Simp.addRule(
+    // (xor (and x a) (and y a)) -> (and (xor x y) a)
+    [](Instruction *I, LLVMContext &Ctx) -> Value* {
+      if (I->getOpcode() != Instruction::Xor)
+        return nullptr;
+      Instruction *And0 = dyn_cast<Instruction>(I->getOperand(0));
+      Instruction *And1 = dyn_cast<Instruction>(I->getOperand(1));
+      if (!And0 || !And1)
+        return nullptr;
+      if (And0->getOpcode() != Instruction::And ||
+          And1->getOpcode() != Instruction::And)
+        return nullptr;
+      if (And0->getOperand(1) != And1->getOperand(1))
+        return nullptr;
+      IRBuilder<> B(Ctx);
+      return B.CreateAnd(B.CreateXor(And0->getOperand(0), And1->getOperand(0)),
+                         And0->getOperand(1));
+    });
+  Simp.addRule(
+    // (Op (select c x y) z) -> (select c (Op x z) (Op y z))
+    // (Op x (select c y z)) -> (select c (Op x y) (Op x z))
+    [](Instruction *I, LLVMContext &Ctx) -> Value* {
+      BinaryOperator *BO = dyn_cast<BinaryOperator>(I);
+      if (!BO)
+        return nullptr;
+      Instruction::BinaryOps Op = BO->getOpcode();
+      if (SelectInst *Sel = dyn_cast<SelectInst>(BO->getOperand(0))) {
+        IRBuilder<> B(Ctx);
+        Value *X = Sel->getTrueValue(), *Y = Sel->getFalseValue();
+        Value *Z = BO->getOperand(1);
+        return B.CreateSelect(Sel->getCondition(),
+                              B.CreateBinOp(Op, X, Z),
+                              B.CreateBinOp(Op, Y, Z));
+      }
+      if (SelectInst *Sel = dyn_cast<SelectInst>(BO->getOperand(1))) {
+        IRBuilder<> B(Ctx);
+        Value *X = BO->getOperand(0);
+        Value *Y = Sel->getTrueValue(), *Z = Sel->getFalseValue();
+        return B.CreateSelect(Sel->getCondition(),
+                              B.CreateBinOp(Op, X, Y),
+                              B.CreateBinOp(Op, X, Z));
+      }
+      return nullptr;
+    });
+  Simp.addRule(
+    // (select c (select c x y) z) -> (select c x z)
+    // (select c x (select c y z)) -> (select c x z)
+    [](Instruction *I, LLVMContext &Ctx) -> Value* {
+      SelectInst *Sel = dyn_cast<SelectInst>(I);
+      if (!Sel)
+        return nullptr;
+      IRBuilder<> B(Ctx);
+      Value *C = Sel->getCondition();
+      if (SelectInst *Sel0 = dyn_cast<SelectInst>(Sel->getTrueValue())) {
+        if (Sel0->getCondition() == C)
+          return B.CreateSelect(C, Sel0->getTrueValue(), Sel->getFalseValue());
+      }
+      if (SelectInst *Sel1 = dyn_cast<SelectInst>(Sel->getFalseValue())) {
+        if (Sel1->getCondition() == C)
+          return B.CreateSelect(C, Sel->getTrueValue(), Sel1->getFalseValue());
+      }
+      return nullptr;
+    });
+  Simp.addRule(
+    // (or (lshr x 1) 0x800.0) -> (xor (lshr x 1) 0x800.0)
+    [](Instruction *I, LLVMContext &Ctx) -> Value* {
+      if (I->getOpcode() != Instruction::Or)
+        return nullptr;
+      Instruction *LShr = dyn_cast<Instruction>(I->getOperand(0));
+      if (!LShr || LShr->getOpcode() != Instruction::LShr)
+        return nullptr;
+      ConstantInt *One = dyn_cast<ConstantInt>(LShr->getOperand(1));
+      if (!One || One->getZExtValue() != 1)
+        return nullptr;
+      ConstantInt *Msb = dyn_cast<ConstantInt>(I->getOperand(1));
+      if (!Msb || Msb->getZExtValue() != Msb->getType()->getSignBit())
+        return nullptr;
+      return IRBuilder<>(Ctx).CreateXor(LShr, Msb);
+    });
+  Simp.addRule(
+    // (lshr (BitOp x y) c) -> (BitOp (lshr x c) (lshr y c))
+    [](Instruction *I, LLVMContext &Ctx) -> Value* {
+      if (I->getOpcode() != Instruction::LShr)
+        return nullptr;
+      BinaryOperator *BitOp = dyn_cast<BinaryOperator>(I->getOperand(0));
+      if (!BitOp)
+        return nullptr;
+      switch (BitOp->getOpcode()) {
+        case Instruction::And:
+        case Instruction::Or:
+        case Instruction::Xor:
+          break;
+        default:
+          return nullptr;
+      }
+      IRBuilder<> B(Ctx);
+      Value *S = I->getOperand(1);
+      return B.CreateBinOp(BitOp->getOpcode(),
+                B.CreateLShr(BitOp->getOperand(0), S),
+                B.CreateLShr(BitOp->getOperand(1), S));
+    });
+  Simp.addRule(
+    // (BitOp1 (BitOp2 x a) b) -> (BitOp2 x (BitOp1 a b))
+    // NOTE(review): this is an identity only for some operator pairs (for
+    // example, it does not hold for BitOp1 = and, BitOp2 = or); confirm
+    // that the cases it is applied to here are safe.
+    [](Instruction *I, LLVMContext &Ctx) -> Value* {
+      auto IsBitOp = [](unsigned Op) -> bool {
+        switch (Op) {
+          case Instruction::And:
+          case Instruction::Or:
+          case Instruction::Xor:
+            return true;
+        }
+        return false;
+      };
+      BinaryOperator *BitOp1 = dyn_cast<BinaryOperator>(I);
+      if (!BitOp1 || !IsBitOp(BitOp1->getOpcode()))
+        return nullptr;
+      BinaryOperator *BitOp2 = dyn_cast<BinaryOperator>(BitOp1->getOperand(0));
+      if (!BitOp2 || !IsBitOp(BitOp2->getOpcode()))
+        return nullptr;
+      ConstantInt *CA = dyn_cast<ConstantInt>(BitOp2->getOperand(1));
+      ConstantInt *CB = dyn_cast<ConstantInt>(BitOp1->getOperand(1));
+      if (!CA || !CB)
+        return nullptr;
+      IRBuilder<> B(Ctx);
+      Value *X = BitOp2->getOperand(0);
+      return B.CreateBinOp(BitOp2->getOpcode(), X,
+                B.CreateBinOp(BitOp1->getOpcode(), CA, CB));
+    });
+}
+
+
 bool PolynomialMultiplyRecognize::recognize() {
+  DEBUG(dbgs() << "Starting PolynomialMultiplyRecognize on loop\n"
+               << *CurLoop << '\n');
   // Restrictions:
   // - The loop must consist of a single block.
   // - The iteration count must be known at compile-time.
   // - The loop must have an induction variable starting from 0, and
   //   incremented in each iteration of the loop.
   BasicBlock *LoopB = CurLoop->getHeader();
+  DEBUG(dbgs() << "Loop header:\n" << *LoopB);
+
   if (LoopB != CurLoop->getLoopLatch())
     return false;
   BasicBlock *ExitB = CurLoop->getExitBlock();
@@ -1011,30 +1712,72 @@ bool PolynomialMultiplyRecognize::recognize() {
   Value *CIV = getCountIV(LoopB);
   ParsedValues PV;
   PV.IterCount = IterCount;
+  // Guard against a missing induction variable before dereferencing it.
+  if (!CIV)
+    return false;
+  DEBUG(dbgs() << "Loop IV: " << *CIV << "\nIterCount: " << IterCount << '\n');
 
-  // Test function to see if a given select instruction is a part of the
-  // pmpy pattern. The argument PreScan set to "true" indicates that only
-  // a preliminary scan is needed, "false" indicated an exact match.
-  auto CouldBePmpy = [this, LoopB, EntryB, CIV, &PV] (bool PreScan)
-      -> std::function<bool (Instruction &I)> {
-    return [this, LoopB, EntryB, CIV, &PV, PreScan] (Instruction &I) -> bool {
-      if (auto *SelI = dyn_cast<SelectInst>(&I))
-        return scanSelect(SelI, LoopB, EntryB, CIV, PV, PreScan);
-      return false;
-    };
-  };
-  auto PreF = std::find_if(LoopB->begin(), LoopB->end(), CouldBePmpy(true));
-  if (PreF == LoopB->end())
+  setupSimplifier();
+
+  // Perform a preliminary scan of select instructions to see if any of them
+  // looks like a generator of the polynomial multiply steps. Assume that a
+  // loop can only contain a single transformable operation, so stop the
+  // traversal after the first reasonable candidate was found.
+  // XXX: Currently this approach can modify the loop before being 100% sure
+  // that the transformation can be carried out.
+  bool FoundPreScan = false;
+  for (Instruction &In : *LoopB) {
+    SelectInst *SI = dyn_cast<SelectInst>(&In);
+    if (!SI)
+      continue;
+
+    Simplifier::Context C(SI);
+    Value *T = Simp.simplify(C);
+    SelectInst *SelI = (T && isa<SelectInst>(T)) ? cast<SelectInst>(T) : SI;
+    DEBUG(dbgs() << "scanSelect(pre-scan): " << PE(C, SelI) << '\n');
+    if (scanSelect(SelI, LoopB, EntryB, CIV, PV, true)) {
+      FoundPreScan = true;
+      if (SelI != SI) {
+        Value *NewSel = C.materialize(LoopB, SI->getIterator());
+        SI->replaceAllUsesWith(NewSel);
+        RecursivelyDeleteTriviallyDeadInstructions(SI, &TLI);
+      }
+      break;
+    }
+  }
+
+  if (!FoundPreScan) {
+    DEBUG(dbgs() << "Have not found candidates for pmpy\n");
     return false;
+  }
 
   if (!PV.Left) {
+    // The right shift version actually only returns the higher bits of
+    // the result (each iteration discards the LSB). If we want to convert it
+    // to a left-shifting loop, the working data type must be at least as
+    // wide as the target's pmpy instruction.
+    if (!promoteTypes(LoopB, ExitB))
+      return false;
     convertShiftsToLeft(LoopB, ExitB, IterCount);
     cleanupLoopBody(LoopB);
   }
 
-  auto PostF = std::find_if(LoopB->begin(), LoopB->end(), CouldBePmpy(false));
-  if (PostF == LoopB->end())
-    return false;
+  // Scan the loop again, find the generating select instruction.
+  bool FoundScan = false;
+  for (Instruction &In : *LoopB) {
+    SelectInst *SelI = dyn_cast<SelectInst>(&In);
+    if (!SelI)
+      continue;
+    DEBUG(dbgs() << "scanSelect: " << *SelI << '\n');
+    FoundScan = scanSelect(SelI, LoopB, EntryB, CIV, PV, false);
+    if (FoundScan)
+      break;
+  }
+  assert(FoundScan);
+  // Bail out in builds without assertions instead of generating code from
+  // an incomplete match.
+  if (!FoundScan)
+    return false;
 
   DEBUG({
     StringRef PP = (PV.M ? "(P+M)" : "P");
diff --git a/llvm/test/CodeGen/Hexagon/loop-idiom/pmpy-mod.ll b/llvm/test/CodeGen/Hexagon/loop-idiom/pmpy-mod.ll
new file mode 100644
index 000000000000..9907ae71c992
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/loop-idiom/pmpy-mod.ll
@@ -0,0 +1,84 @@
+; Run -O2 to make sure that all the usual optimizations do happen before
+; the Hexagon loop idiom recognition runs. This is to check that we still
+; get this opportunity regardless of what happens before.
+
+; RUN: opt -O2 -march=hexagon -S < %s | FileCheck %s
+
+target triple = "hexagon"
+target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048"
+
+; CHECK-LABEL: define zeroext i16 @pmpy_mod_lsr
+; There need to be two pmpy instructions.
+; CHECK: call i64 @llvm.hexagon.M4.pmpyw
+; CHECK: call i64 @llvm.hexagon.M4.pmpyw
+
+define zeroext i16 @pmpy_mod_lsr(i8 zeroext %a0, i16 zeroext %a1) #0 {
+b2:
+  br label %b3
+
+b3:                                               ; preds = %b44, %b2
+  %v4 = phi i8 [ %a0, %b2 ], [ %v19, %b44 ]
+  %v5 = phi i16 [ %a1, %b2 ], [ %v43, %b44 ]
+  %v6 = phi i8 [ 0, %b2 ], [ %v45, %b44 ] ; loop counter: 0 on entry, %v45 from %b44
+  %v7 = zext i8 %v6 to i32
+  %v8 = icmp slt i32 %v7, 8 ; stay in the loop while the counter is below 8
+  br i1 %v8, label %b9, label %b46
+
+b9:                                               ; preds = %b3
+  %v10 = zext i8 %v4 to i32
+  %v11 = and i32 %v10, 1 ; low bit of %v4
+  %v12 = trunc i16 %v5 to i8
+  %v13 = zext i8 %v12 to i32
+  %v14 = and i32 %v13, 1 ; low bit of %v5
+  %v15 = xor i32 %v11, %v14
+  %v16 = trunc i32 %v15 to i8
+  %v17 = zext i8 %v4 to i32
+  %v18 = ashr i32 %v17, 1 ; %v4 shifted right by one
+  %v19 = trunc i32 %v18 to i8
+  %v20 = zext i8 %v16 to i32
+  %v21 = icmp eq i32 %v20, 1 ; true when the two low bits differ
+  br i1 %v21, label %b22, label %b26
+
+b22:                                              ; preds = %b9
+  %v23 = zext i16 %v5 to i32
+  %v24 = xor i32 %v23, 16386 ; 16386 = 0x4002
+  %v25 = trunc i32 %v24 to i16
+  br label %b27
+
+b26:                                              ; preds = %b9
+  br label %b27
+
+b27:                                              ; preds = %b26, %b22
+  %v28 = phi i16 [ %v25, %b22 ], [ %v5, %b26 ]
+  %v29 = phi i8 [ 1, %b22 ], [ 0, %b26 ] ; 1 if the xor in %b22 was applied
+  %v30 = zext i16 %v28 to i32
+  %v31 = ashr i32 %v30, 1 ; %v28 shifted right by one
+  %v32 = trunc i32 %v31 to i16
+  %v33 = icmp ne i8 %v29, 0
+  br i1 %v33, label %b34, label %b38
+
+b34:                                              ; preds = %b27
+  %v35 = zext i16 %v32 to i32
+  %v36 = or i32 %v35, 32768 ; set bit 15 (32768 = 0x8000)
+  %v37 = trunc i32 %v36 to i16
+  br label %b42
+
+b38:                                              ; preds = %b27
+  %v39 = zext i16 %v32 to i32
+  %v40 = and i32 %v39, 32767 ; clear bit 15 (32767 = 0x7fff)
+  %v41 = trunc i32 %v40 to i16
+  br label %b42
+
+b42:                                              ; preds = %b38, %b34
+  %v43 = phi i16 [ %v37, %b34 ], [ %v41, %b38 ]
+  br label %b44
+
+b44:                                              ; preds = %b42
+  %v45 = add i8 %v6, 1 ; increment the loop counter
+  br label %b3
+
+b46:                                              ; preds = %b3
+  ret i16 %v5
+}
+
+attributes #0 = { noinline nounwind "target-cpu"="hexagonv5" "target-features"="-hvx,-hvx-double,-long-calls" }