Commoning of target specific load/store intrinsics in Early CSE.

Phabricator revision: http://reviews.llvm.org/D7121
Patch by Sanjin Sijaric <ssijaric@codeaurora.org>!

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@227149 91177308-0d34-0410-b5e6-96231b3b80d8
parent f96362358f
commit 13faabb6c5
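For context, the redundancy this patch targets typically comes from ACLE/NEON structured load/store intrinsics at the source level. The sketch below is illustrative only: the function name is hypothetical, and the mapping of vst2q_s32/vld2q_s32 onto @llvm.aarch64.neon.st2/ld2 is an assumption based on the IR in the new test file rather than something stated in this commit.

```cpp
// Sketch only (AArch64 + NEON); not part of this commit.
#include <arm_neon.h>

int32x4_t store_then_reload(int32_t *a, int32x4_t v0, int32x4_t v1) {
  int32x4x2_t s = {{v0, v1}};
  vst2q_s32(a, s);               // typically lowers to @llvm.aarch64.neon.st2
  int32x4x2_t r = vld2q_s32(a);  // same pointer, same 2-element structure
  // With this patch, Early CSE can forward v0/v1 to the ld2 result and
  // delete the load intrinsic (see test_cse in the new test file below).
  return vaddq_s32(r.val[0], r.val[1]);
}
```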
include/llvm/Analysis/TargetTransformInfo.h
@@ -23,6 +23,7 @@
#define LLVM_ANALYSIS_TARGETTRANSFORMINFO_H

#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Pass.h"
#include "llvm/Support/DataTypes.h"

@@ -35,6 +36,20 @@ class Type;
class User;
class Value;

/// \brief Information about a load/store intrinsic defined by the target.
struct MemIntrinsicInfo {
  MemIntrinsicInfo()
      : ReadMem(false), WriteMem(false), Vol(false), MatchingId(0),
        NumMemRefs(0), PtrVal(nullptr) {}
  bool ReadMem;
  bool WriteMem;
  bool Vol;
  // Same Id is set by the target for corresponding load/store intrinsics.
  unsigned short MatchingId;
  int NumMemRefs;
  Value *PtrVal;
};

/// TargetTransformInfo - This pass provides access to the codegen
/// interfaces that are needed for IR-level transformations.
class TargetTransformInfo {
@@ -443,6 +458,20 @@ public:
  /// any callee-saved registers, so would require a spill and fill.
  virtual unsigned getCostOfKeepingLiveOverCall(ArrayRef<Type*> Tys) const;

  /// \returns True if the intrinsic is a supported memory intrinsic. Info
  /// will contain additional information - whether the intrinsic may write
  /// or read to memory, volatility and the pointer. Info is undefined
  /// if false is returned.
  virtual bool getTgtMemIntrinsic(IntrinsicInst *Inst,
                                  MemIntrinsicInfo &Info) const;

  /// \returns A value which is the result of the given memory intrinsic. New
  /// instructions may be created to extract the result from the given intrinsic
  /// memory operation. Returns nullptr if the target cannot create a result
  /// from the given intrinsic.
  virtual Value *getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
                                                   Type *ExpectedType) const;

  /// @}

  /// Analysis group identification.
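A note on the new MatchingId field: the target assigns the same id to a store intrinsic and to the load intrinsics it may legally satisfy (the AArch64 hook below pairs st2/ld2, st3/ld3 and st4/ld4), and EarlyCSE keys its availability check on the pointer, this id, and its existing generation counter. The following is a standalone toy model of that check, not LLVM code; every name in it is illustrative.

```cpp
// Toy model of the availability check EarlyCSE performs for target
// load/store intrinsics; illustrative only, not LLVM code.
#include <cassert>
#include <map>

struct Available {
  int Value;           // stands in for the stored SSA value
  int MatchingId;      // target-assigned id (e.g. 2-, 3- or 4-element access)
  unsigned Generation; // generation in which the value was recorded
};

using Ptr = const void *;

bool canForward(const std::map<Ptr, Available> &Avail, Ptr P, int MatchingId,
                unsigned CurrentGeneration, int &Out) {
  auto It = Avail.find(P);
  if (It == Avail.end())
    return false;
  // Stale generation (memory may have been clobbered) or mismatched
  // structure (an st2 must not satisfy an ld3): no commoning.
  if (It->second.Generation != CurrentGeneration ||
      It->second.MatchingId != MatchingId)
    return false;
  Out = It->second.Value;
  return true;
}

int main() {
  int Buf[8] = {0};
  std::map<Ptr, Available> Avail;
  unsigned Gen = 0;

  Avail[Buf] = {42, /*MatchingId=*/2, Gen};             // an "st2"-like store

  int V = 0;
  assert(canForward(Avail, Buf, 2, Gen, V) && V == 42); // "ld2": forwarded
  assert(!canForward(Avail, Buf, 3, Gen, V));           // "ld3": id mismatch
  ++Gen;                                                // an intervening write
  assert(!canForward(Avail, Buf, 2, Gen, V));           // stale generation
  return 0;
}
```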
lib/Analysis/TargetTransformInfo.cpp
@@ -254,6 +254,16 @@ unsigned TargetTransformInfo::getCostOfKeepingLiveOverCall(ArrayRef<Type*> Tys)
  return PrevTTI->getCostOfKeepingLiveOverCall(Tys);
}

Value *TargetTransformInfo::getOrCreateResultFromMemIntrinsic(
    IntrinsicInst *Inst, Type *ExpectedType) const {
  return PrevTTI->getOrCreateResultFromMemIntrinsic(Inst, ExpectedType);
}

bool TargetTransformInfo::getTgtMemIntrinsic(IntrinsicInst *Inst,
                                             MemIntrinsicInfo &Info) const {
  return PrevTTI->getTgtMemIntrinsic(Inst, Info);
}

namespace {

struct NoTTI final : ImmutablePass, TargetTransformInfo {
@@ -656,6 +666,15 @@ struct NoTTI final : ImmutablePass, TargetTransformInfo {
    return 0;
  }

  bool getTgtMemIntrinsic(IntrinsicInst *Inst,
                          MemIntrinsicInfo &Info) const override {
    return false;
  }

  Value *getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
                                           Type *ExpectedType) const override {
    return nullptr;
  }
};

} // end anonymous namespace
lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -44,6 +44,12 @@ class AArch64TTI final : public ImmutablePass, public TargetTransformInfo {
  /// are set if the result needs to be inserted and/or extracted from vectors.
  unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const;

  enum MemIntrinsicType {
    VECTOR_LDST_TWO_ELEMENTS,
    VECTOR_LDST_THREE_ELEMENTS,
    VECTOR_LDST_FOUR_ELEMENTS
  };

public:
  AArch64TTI() : ImmutablePass(ID), TM(nullptr), ST(nullptr), TLI(nullptr) {
    llvm_unreachable("This pass cannot be directly constructed");
@@ -131,6 +137,11 @@ public:
  void getUnrollingPreferences(const Function *F, Loop *L,
                               UnrollingPreferences &UP) const override;

  Value *getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
                                           Type *ExpectedType) const override;

  bool getTgtMemIntrinsic(IntrinsicInst *Inst,
                          MemIntrinsicInfo &Info) const override;

  /// @}
};
@@ -554,3 +565,83 @@ void AArch64TTI::getUnrollingPreferences(const Function *F, Loop *L,
  // Disable partial & runtime unrolling on -Os.
  UP.PartialOptSizeThreshold = 0;
}

Value *AArch64TTI::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
                                                     Type *ExpectedType) const {
  switch (Inst->getIntrinsicID()) {
  default:
    return nullptr;
  case Intrinsic::aarch64_neon_st2:
  case Intrinsic::aarch64_neon_st3:
  case Intrinsic::aarch64_neon_st4: {
    // Create a struct type
    StructType *ST = dyn_cast<StructType>(ExpectedType);
    if (!ST)
      return nullptr;
    unsigned NumElts = Inst->getNumArgOperands() - 1;
    if (ST->getNumElements() != NumElts)
      return nullptr;
    for (unsigned i = 0, e = NumElts; i != e; ++i) {
      if (Inst->getArgOperand(i)->getType() != ST->getElementType(i))
        return nullptr;
    }
    Value *Res = UndefValue::get(ExpectedType);
    IRBuilder<> Builder(Inst);
    for (unsigned i = 0, e = NumElts; i != e; ++i) {
      Value *L = Inst->getArgOperand(i);
      Res = Builder.CreateInsertValue(Res, L, i);
    }
    return Res;
  }
  case Intrinsic::aarch64_neon_ld2:
  case Intrinsic::aarch64_neon_ld3:
  case Intrinsic::aarch64_neon_ld4:
    if (Inst->getType() == ExpectedType)
      return Inst;
    return nullptr;
  }
}

bool AArch64TTI::getTgtMemIntrinsic(IntrinsicInst *Inst,
                                    MemIntrinsicInfo &Info) const {
  switch (Inst->getIntrinsicID()) {
  default:
    break;
  case Intrinsic::aarch64_neon_ld2:
  case Intrinsic::aarch64_neon_ld3:
  case Intrinsic::aarch64_neon_ld4:
    Info.ReadMem = true;
    Info.WriteMem = false;
    Info.Vol = false;
    Info.NumMemRefs = 1;
    Info.PtrVal = Inst->getArgOperand(0);
    break;
  case Intrinsic::aarch64_neon_st2:
  case Intrinsic::aarch64_neon_st3:
  case Intrinsic::aarch64_neon_st4:
    Info.ReadMem = false;
    Info.WriteMem = true;
    Info.Vol = false;
    Info.NumMemRefs = 1;
    Info.PtrVal = Inst->getArgOperand(Inst->getNumArgOperands() - 1);
    break;
  }

  switch (Inst->getIntrinsicID()) {
  default:
    return false;
  case Intrinsic::aarch64_neon_ld2:
  case Intrinsic::aarch64_neon_st2:
    Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS;
    break;
  case Intrinsic::aarch64_neon_ld3:
  case Intrinsic::aarch64_neon_st3:
    Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS;
    break;
  case Intrinsic::aarch64_neon_ld4:
  case Intrinsic::aarch64_neon_st4:
    Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS;
    break;
  }
  return true;
}
lib/Transforms/Scalar/EarlyCSE.cpp
@@ -18,6 +18,7 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Instructions.h"
@@ -273,6 +274,7 @@ class EarlyCSE : public FunctionPass {
public:
  const DataLayout *DL;
  const TargetLibraryInfo *TLI;
  const TargetTransformInfo *TTI;
  DominatorTree *DT;
  AssumptionCache *AC;
  typedef RecyclingAllocator<
@@ -383,14 +385,83 @@ private:
    bool Processed;
  };

  /// \brief Wrapper class to handle memory instructions, including loads,
  /// stores and intrinsic loads and stores defined by the target.
  class ParseMemoryInst {
  public:
    ParseMemoryInst(Instruction *Inst, const TargetTransformInfo *TTI)
        : Load(false), Store(false), Vol(false), MayReadFromMemory(false),
          MayWriteToMemory(false), MatchingId(-1), Ptr(nullptr) {
      MayReadFromMemory = Inst->mayReadFromMemory();
      MayWriteToMemory = Inst->mayWriteToMemory();
      if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
        MemIntrinsicInfo Info;
        if (!TTI->getTgtMemIntrinsic(II, Info))
          return;
        if (Info.NumMemRefs == 1) {
          Store = Info.WriteMem;
          Load = Info.ReadMem;
          MatchingId = Info.MatchingId;
          MayReadFromMemory = Info.ReadMem;
          MayWriteToMemory = Info.WriteMem;
          Vol = Info.Vol;
          Ptr = Info.PtrVal;
        }
      } else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
        Load = true;
        Vol = !LI->isSimple();
        Ptr = LI->getPointerOperand();
      } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
        Store = true;
        Vol = !SI->isSimple();
        Ptr = SI->getPointerOperand();
      }
    }
    bool isLoad() { return Load; }
    bool isStore() { return Store; }
    bool isVolatile() { return Vol; }
    bool isMatchingMemLoc(const ParseMemoryInst &Inst) {
      return Ptr == Inst.Ptr && MatchingId == Inst.MatchingId;
    }
    bool isValid() { return Ptr != nullptr; }
    int getMatchingId() { return MatchingId; }
    Value *getPtr() { return Ptr; }
    bool mayReadFromMemory() { return MayReadFromMemory; }
    bool mayWriteToMemory() { return MayWriteToMemory; }

  private:
    bool Load;
    bool Store;
    bool Vol;
    bool MayReadFromMemory;
    bool MayWriteToMemory;
    // For regular (non-intrinsic) loads/stores, this is set to -1. For
    // intrinsic loads/stores, the id is retrieved from the corresponding
    // field in the MemIntrinsicInfo structure. That field contains
    // non-negative values only.
    int MatchingId;
    Value *Ptr;
  };

  bool processNode(DomTreeNode *Node);

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addRequired<TargetLibraryInfoWrapperPass>();
    AU.addRequired<TargetTransformInfo>();
    AU.setPreservesCFG();
  }

  Value *getOrCreateResult(Value *Inst, Type *ExpectedType) const {
    if (LoadInst *LI = dyn_cast<LoadInst>(Inst))
      return LI;
    else if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
      return SI->getValueOperand();
    assert(isa<IntrinsicInst>(Inst) && "Instruction not supported");
    return TTI->getOrCreateResultFromMemIntrinsic(cast<IntrinsicInst>(Inst),
                                                  ExpectedType);
  }
};
}

@@ -420,7 +491,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
  /// as long as there in no instruction that reads memory. If we see a store
  /// to the same location, we delete the dead store. This zaps trivial dead
  /// stores which can occur in bitfield code among other things.
  StoreInst *LastStore = nullptr;
  Instruction *LastStore = nullptr;

  bool Changed = false;

@@ -475,10 +546,11 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
      continue;
    }

    ParseMemoryInst MemInst(Inst, TTI);
    // If this is a non-volatile load, process it.
    if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
    if (MemInst.isValid() && MemInst.isLoad()) {
      // Ignore volatile loads.
      if (!LI->isSimple()) {
      if (MemInst.isVolatile()) {
        LastStore = nullptr;
        continue;
      }
@@ -486,27 +558,35 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
      // If we have an available version of this load, and if it is the right
      // generation, replace this instruction.
      std::pair<Value *, unsigned> InVal =
          AvailableLoads->lookup(Inst->getOperand(0));
          AvailableLoads->lookup(MemInst.getPtr());
      if (InVal.first != nullptr && InVal.second == CurrentGeneration) {
        DEBUG(dbgs() << "EarlyCSE CSE LOAD: " << *Inst
                     << " to: " << *InVal.first << '\n');
        if (!Inst->use_empty())
          Inst->replaceAllUsesWith(InVal.first);
        Inst->eraseFromParent();
        Changed = true;
        ++NumCSELoad;
        continue;
        Value *Op = getOrCreateResult(InVal.first, Inst->getType());
        if (Op != nullptr) {
          DEBUG(dbgs() << "EarlyCSE CSE LOAD: " << *Inst
                       << " to: " << *InVal.first << '\n');
          if (!Inst->use_empty())
            Inst->replaceAllUsesWith(Op);
          Inst->eraseFromParent();
          Changed = true;
          ++NumCSELoad;
          continue;
        }
      }

      // Otherwise, remember that we have this instruction.
      AvailableLoads->insert(Inst->getOperand(0), std::pair<Value *, unsigned>(
                                 Inst, CurrentGeneration));
      AvailableLoads->insert(MemInst.getPtr(), std::pair<Value *, unsigned>(
                                 Inst, CurrentGeneration));
      LastStore = nullptr;
      continue;
    }

    // If this instruction may read from memory, forget LastStore.
    if (Inst->mayReadFromMemory())
    // Load/store intrinsics will indicate both a read and a write to
    // memory. The target may override this (e.g. so that a store intrinsic
    // does not read from memory, and thus will be treated the same as a
    // regular store for commoning purposes).
    if (Inst->mayReadFromMemory() &&
        !(MemInst.isValid() && !MemInst.mayReadFromMemory()))
      LastStore = nullptr;

    // If this is a read-only call, process it.
@@ -537,17 +617,19 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
    if (Inst->mayWriteToMemory()) {
      ++CurrentGeneration;

      if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
      if (MemInst.isValid() && MemInst.isStore()) {
        // We do a trivial form of DSE if there are two stores to the same
        // location with no intervening loads. Delete the earlier store.
        if (LastStore &&
            LastStore->getPointerOperand() == SI->getPointerOperand()) {
          DEBUG(dbgs() << "EarlyCSE DEAD STORE: " << *LastStore
                       << " due to: " << *Inst << '\n');
          LastStore->eraseFromParent();
          Changed = true;
          ++NumDSE;
          LastStore = nullptr;
        if (LastStore) {
          ParseMemoryInst LastStoreMemInst(LastStore, TTI);
          if (LastStoreMemInst.isMatchingMemLoc(MemInst)) {
            DEBUG(dbgs() << "EarlyCSE DEAD STORE: " << *LastStore
                         << " due to: " << *Inst << '\n');
            LastStore->eraseFromParent();
            Changed = true;
            ++NumDSE;
            LastStore = nullptr;
          }
        // fallthrough - we can exploit information about this store
        }

@@ -556,13 +638,12 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
        // version of the pointer. It is safe to forward from volatile stores
        // to non-volatile loads, so we don't have to check for volatility of
        // the store.
        AvailableLoads->insert(SI->getPointerOperand(),
                               std::pair<Value *, unsigned>(
                                   SI->getValueOperand(), CurrentGeneration));
        AvailableLoads->insert(MemInst.getPtr(), std::pair<Value *, unsigned>(
                                   Inst, CurrentGeneration));

        // Remember that this was the last store we saw for DSE.
        if (SI->isSimple())
          LastStore = SI;
        if (!MemInst.isVolatile())
          LastStore = Inst;
      }
    }
  }
@@ -584,6 +665,7 @@ bool EarlyCSE::runOnFunction(Function &F) {
  DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
  DL = DLP ? &DLP->getDataLayout() : nullptr;
  TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
  TTI = &getAnalysis<TargetTransformInfo>();
  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
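The EarlyCSE changes above also generalize the pass's trivial dead-store elimination: LastStore becomes an Instruction*, and candidate stores are compared with ParseMemoryInst::isMatchingMemLoc (same pointer and same MatchingId) instead of comparing StoreInst pointer operands, so an st2 that is overwritten by another st2 to the same address with no intervening read can be deleted (test_cse2 below). The sketch that follows is a simplified standalone model of that logic, not the real pass; all names are illustrative.

```cpp
// Toy model of EarlyCSE's trivial DSE over "parsed" memory operations;
// illustrative only, not the real pass.
#include <cassert>
#include <vector>

struct MemOp {
  bool IsStore;
  bool MayRead;     // the target can report store intrinsics as write-only
  const void *Ptr;
  int MatchingId;
};

// Returns the indices of stores proven dead (overwritten before being read).
std::vector<int> trivialDSE(const std::vector<MemOp> &Ops) {
  std::vector<int> Dead;
  int LastStore = -1;
  for (int I = 0, E = (int)Ops.size(); I != E; ++I) {
    const MemOp &Op = Ops[I];
    // Anything that may read memory makes the previous store observable.
    if (Op.MayRead)
      LastStore = -1;
    if (!Op.IsStore)
      continue;
    if (LastStore >= 0 && Ops[LastStore].Ptr == Op.Ptr &&
        Ops[LastStore].MatchingId == Op.MatchingId)
      Dead.push_back(LastStore); // matching location, no intervening read
    LastStore = I;
  }
  return Dead;
}

int main() {
  int A[8] = {0};
  // st2 A; st2 A  -> the first store is dead (mirrors test_cse2).
  std::vector<MemOp> TwoStores = {{true, false, A, 2}, {true, false, A, 2}};
  assert(trivialDSE(TwoStores) == std::vector<int>{0});
  // st2 A; ld2 A; st2 A -> nothing is dead, the load reads the first store.
  std::vector<MemOp> StoreLoadStore = {
      {true, false, A, 2}, {false, true, A, 2}, {true, false, A, 2}};
  assert(trivialDSE(StoreLoadStore).empty());
  return 0;
}
```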
test/Transforms/EarlyCSE/AArch64/intrinsics.ll (new file, 231 lines)
@@ -0,0 +1,231 @@
; RUN: opt < %s -S -mtriple=aarch64-none-linux-gnu -mattr=+neon -early-cse | FileCheck %s

define <4 x i32> @test_cse(i32* %a, [2 x <4 x i32>] %s.coerce, i32 %n) {
entry:
; Check that @llvm.aarch64.neon.ld2 is optimized away by Early CSE.
; CHECK-LABEL: @test_cse
; CHECK-NOT: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8
  %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
  %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
  br label %for.cond

for.cond:                                         ; preds = %for.body, %entry
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
  %cmp = icmp slt i32 %i.0, %n
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  %0 = bitcast i32* %a to i8*
  %1 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
  %2 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
  %3 = bitcast <16 x i8> %1 to <4 x i32>
  %4 = bitcast <16 x i8> %2 to <4 x i32>
  call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %4, i8* %0)
  %5 = bitcast i32* %a to i8*
  %vld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8* %5)
  %vld2.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 0
  %vld2.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 1
  %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld2.fca.0.extract, <4 x i32> %vld2.fca.0.extract)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:                                          ; preds = %for.cond
  ret <4 x i32> %res.0
}

define <4 x i32> @test_cse2(i32* %a, [2 x <4 x i32>] %s.coerce, i32 %n) {
entry:
; Check that the first @llvm.aarch64.neon.st2 is optimized away by Early CSE.
; CHECK-LABEL: @test_cse2
; CHECK-NOT: call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %3, i8* %0)
; CHECK: call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %4, i8* %0)
  %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
  %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
  br label %for.cond

for.cond:                                         ; preds = %for.body, %entry
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
  %cmp = icmp slt i32 %i.0, %n
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  %0 = bitcast i32* %a to i8*
  %1 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
  %2 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
  %3 = bitcast <16 x i8> %1 to <4 x i32>
  %4 = bitcast <16 x i8> %2 to <4 x i32>
  call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %3, i8* %0)
  call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %4, i8* %0)
  %5 = bitcast i32* %a to i8*
  %vld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8* %5)
  %vld2.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 0
  %vld2.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 1
  %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld2.fca.0.extract, <4 x i32> %vld2.fca.0.extract)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:                                          ; preds = %for.cond
  ret <4 x i32> %res.0
}

define <4 x i32> @test_cse3(i32* %a, [2 x <4 x i32>] %s.coerce, i32 %n) #0 {
entry:
; Check that the first @llvm.aarch64.neon.ld2 is optimized away by Early CSE.
; CHECK-LABEL: @test_cse3
; CHECK: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8
; CHECK-NOT: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8
  %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
  %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
  br label %for.cond

for.cond:                                         ; preds = %for.body, %entry
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
  %cmp = icmp slt i32 %i.0, %n
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  %0 = bitcast i32* %a to i8*
  %vld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8* %0)
  %vld2.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 0
  %vld2.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 1
  %1 = bitcast i32* %a to i8*
  %vld22 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8* %1)
  %vld22.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld22, 0
  %vld22.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld22, 1
  %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld2.fca.0.extract, <4 x i32> %vld22.fca.0.extract)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:                                          ; preds = %for.cond
  ret <4 x i32> %res.0
}


define <4 x i32> @test_nocse(i32* %a, i32* %b, [2 x <4 x i32>] %s.coerce, i32 %n) {
entry:
; Check that the store prevents @llvm.aarch64.neon.ld2 from being optimized
; away by Early CSE.
; CHECK-LABEL: @test_nocse
; CHECK: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8
  %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
  %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
  br label %for.cond

for.cond:                                         ; preds = %for.body, %entry
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
  %cmp = icmp slt i32 %i.0, %n
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  %0 = bitcast i32* %a to i8*
  %1 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
  %2 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
  %3 = bitcast <16 x i8> %1 to <4 x i32>
  %4 = bitcast <16 x i8> %2 to <4 x i32>
  call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %4, i8* %0)
  store i32 0, i32* %b, align 4
  %5 = bitcast i32* %a to i8*
  %vld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8* %5)
  %vld2.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 0
  %vld2.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 1
  %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld2.fca.0.extract, <4 x i32> %vld2.fca.0.extract)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:                                          ; preds = %for.cond
  ret <4 x i32> %res.0
}

define <4 x i32> @test_nocse2(i32* %a, [2 x <4 x i32>] %s.coerce, i32 %n) {
entry:
; Check that @llvm.aarch64.neon.ld3 is not optimized away by Early CSE due
; to mismatch between st2 and ld3.
; CHECK-LABEL: @test_nocse2
; CHECK: call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0i8
  %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
  %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
  br label %for.cond

for.cond:                                         ; preds = %for.body, %entry
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
  %cmp = icmp slt i32 %i.0, %n
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  %0 = bitcast i32* %a to i8*
  %1 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
  %2 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
  %3 = bitcast <16 x i8> %1 to <4 x i32>
  %4 = bitcast <16 x i8> %2 to <4 x i32>
  call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %4, i8* %0)
  %5 = bitcast i32* %a to i8*
  %vld3 = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0i8(i8* %5)
  %vld3.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 0
  %vld3.fca.2.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 2
  %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld3.fca.0.extract, <4 x i32> %vld3.fca.2.extract)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:                                          ; preds = %for.cond
  ret <4 x i32> %res.0
}

define <4 x i32> @test_nocse3(i32* %a, [2 x <4 x i32>] %s.coerce, i32 %n) {
entry:
; Check that @llvm.aarch64.neon.st3 is not optimized away by Early CSE due to
; mismatch between st2 and st3.
; CHECK-LABEL: @test_nocse3
; CHECK: call void @llvm.aarch64.neon.st3.v4i32.p0i8
; CHECK: call void @llvm.aarch64.neon.st2.v4i32.p0i8
  %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
  %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
  br label %for.cond

for.cond:                                         ; preds = %for.body, %entry
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
  %cmp = icmp slt i32 %i.0, %n
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  %0 = bitcast i32* %a to i8*
  %1 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
  %2 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
  %3 = bitcast <16 x i8> %1 to <4 x i32>
  %4 = bitcast <16 x i8> %2 to <4 x i32>
  call void @llvm.aarch64.neon.st3.v4i32.p0i8(<4 x i32> %4, <4 x i32> %3, <4 x i32> %3, i8* %0)
  call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %3, i8* %0)
  %5 = bitcast i32* %a to i8*
  %vld3 = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0i8(i8* %5)
  %vld3.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 0
  %vld3.fca.1.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 1
  %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld3.fca.0.extract, <4 x i32> %vld3.fca.0.extract)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:                                          ; preds = %for.cond
  ret <4 x i32> %res.0
}

; Function Attrs: nounwind
declare void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32>, <4 x i32>, i8* nocapture)

; Function Attrs: nounwind
declare void @llvm.aarch64.neon.st3.v4i32.p0i8(<4 x i32>, <4 x i32>, <4 x i32>, i8* nocapture)

; Function Attrs: nounwind readonly
declare { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8*)

; Function Attrs: nounwind readonly
declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0i8(i8*)

define internal fastcc <4 x i32> @vaddq_s32(<4 x i32> %__p0, <4 x i32> %__p1) {
entry:
  %add = add <4 x i32> %__p0, %__p1
  ret <4 x i32> %add
}
test/Transforms/EarlyCSE/AArch64/lit.local.cfg (new file, 5 lines)
@@ -0,0 +1,5 @@
config.suffixes = ['.ll']

targets = set(config.root.targets_to_build.split())
if not 'AArch64' in targets:
    config.unsupported = True