Commoning of target-specific load/store intrinsics in Early CSE.

Phabricator revision: http://reviews.llvm.org/D7121
Patch by Sanjin Sijaric <ssijaric@codeaurora.org>!

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@227149 91177308-0d34-0410-b5e6-96231b3b80d8
Chad Rosier 2015-01-26 22:51:15 +00:00
parent f96362358f
commit 13faabb6c5
6 changed files with 486 additions and 29 deletions
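For context: before this change, EarlyCSE only understood plain load and store instructions, so target load/store intrinsics such as the AArch64 NEON ld2/ld3/ld4 and st2/st3/st4 could never be commoned, forwarded from, or dead-store-eliminated. The patch adds two TargetTransformInfo hooks (getTgtMemIntrinsic and getOrCreateResultFromMemIntrinsic) and teaches EarlyCSE to consult them. As a rough illustration (not part of the patch; the function name is invented, and the exact lowering depends on the frontend and flags), this is the kind of source pattern the change lets EarlyCSE clean up:

```cpp
// Illustrative sketch only. With clang targeting AArch64, the <arm_neon.h>
// intrinsics below lower to llvm.aarch64.neon.st2 / llvm.aarch64.neon.ld2,
// so after this patch EarlyCSE can forward the stored vectors to the reload
// and delete the redundant ld2 call.
#include <arm_neon.h>

int32x4_t store_then_reload(int32_t *a, int32x4x2_t v) {
  vst2q_s32(a, v);               // becomes @llvm.aarch64.neon.st2
  int32x4x2_t r = vld2q_s32(a);  // reloads exactly what was just stored
  return vaddq_s32(r.val[0], r.val[1]);
}
```

The new AArch64 test at the end of this commit exercises the same patterns directly at the IR level.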


@@ -23,6 +23,7 @@
#define LLVM_ANALYSIS_TARGETTRANSFORMINFO_H
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Pass.h"
#include "llvm/Support/DataTypes.h"
@@ -35,6 +36,20 @@ class Type;
class User;
class Value;
/// \brief Information about a load/store intrinsic defined by the target.
struct MemIntrinsicInfo {
MemIntrinsicInfo()
: ReadMem(false), WriteMem(false), Vol(false), MatchingId(0),
NumMemRefs(0), PtrVal(nullptr) {}
bool ReadMem;
bool WriteMem;
bool Vol;
// The target sets the same Id for corresponding load/store intrinsics.
unsigned short MatchingId;
int NumMemRefs;
Value *PtrVal;
};
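The fields describe just enough for EarlyCSE to treat a target intrinsic like an ordinary load or store: what it reads/writes, whether it is volatile, the pointer it accesses, and a MatchingId that pairs up corresponding load and store intrinsics (EarlyCSE only uses the information when NumMemRefs is 1). The AArch64 hooks later in this patch are the real in-tree example; purely as a sketch of the contract, a hypothetical out-of-tree target could fill the struct in roughly like this (the intrinsic IDs, the MyTTI class, and MY_PAIR_LDST are invented for illustration and will not compile as written):

```cpp
// Sketch only -- hypothetical intrinsics; see AArch64TTI::getTgtMemIntrinsic
// further down in this patch for the real implementation.
bool MyTTI::getTgtMemIntrinsic(IntrinsicInst *Inst,
                               MemIntrinsicInfo &Info) const {
  switch (Inst->getIntrinsicID()) {
  default:
    return false;                          // not a memory intrinsic we handle
  case Intrinsic::mytgt_load_pair:         // hypothetical paired load
    Info.ReadMem = true;
    Info.WriteMem = false;
    Info.Vol = false;
    Info.NumMemRefs = 1;                   // touches exactly one location
    Info.PtrVal = Inst->getArgOperand(0);  // pointer being loaded from
    Info.MatchingId = MY_PAIR_LDST;        // same Id as the matching store
    return true;
  case Intrinsic::mytgt_store_pair:        // hypothetical paired store
    Info.ReadMem = false;
    Info.WriteMem = true;
    Info.Vol = false;
    Info.NumMemRefs = 1;
    Info.PtrVal = Inst->getArgOperand(Inst->getNumArgOperands() - 1);
    Info.MatchingId = MY_PAIR_LDST;        // lets EarlyCSE pair it with the load
    return true;
  }
}
```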
/// TargetTransformInfo - This pass provides access to the codegen
/// interfaces that are needed for IR-level transformations.
class TargetTransformInfo {
@@ -443,6 +458,20 @@ public:
/// any callee-saved registers, so would require a spill and fill.
virtual unsigned getCostOfKeepingLiveOverCall(ArrayRef<Type*> Tys) const;
/// \returns True if the intrinsic is a supported memory intrinsic. On
/// success, Info contains additional details: whether the intrinsic may
/// read or write memory, whether it is volatile, and the pointer it
/// accesses. Info is undefined if false is returned.
virtual bool getTgtMemIntrinsic(IntrinsicInst *Inst,
MemIntrinsicInfo &Info) const;
/// \returns A value that represents the result of the given memory intrinsic,
/// creating new instructions to extract it from the intrinsic memory
/// operation if necessary. Returns nullptr if the target cannot create a
/// result of the expected type from the given intrinsic.
virtual Value *getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
Type *ExpectedType) const;
/// @}
/// Analysis group identification.


@@ -254,6 +254,16 @@ unsigned TargetTransformInfo::getCostOfKeepingLiveOverCall(ArrayRef<Type*> Tys)
return PrevTTI->getCostOfKeepingLiveOverCall(Tys);
}
Value *TargetTransformInfo::getOrCreateResultFromMemIntrinsic(
IntrinsicInst *Inst, Type *ExpectedType) const {
return PrevTTI->getOrCreateResultFromMemIntrinsic(Inst, ExpectedType);
}
bool TargetTransformInfo::getTgtMemIntrinsic(IntrinsicInst *Inst,
MemIntrinsicInfo &Info) const {
return PrevTTI->getTgtMemIntrinsic(Inst, Info);
}
namespace {
struct NoTTI final : ImmutablePass, TargetTransformInfo {
@@ -656,6 +666,15 @@ struct NoTTI final : ImmutablePass, TargetTransformInfo {
return 0;
}
bool getTgtMemIntrinsic(IntrinsicInst *Inst,
MemIntrinsicInfo &Info) const override {
return false;
}
Value *getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
Type *ExpectedType) const override {
return nullptr;
}
};
} // end anonymous namespace


@@ -44,6 +44,12 @@ class AArch64TTI final : public ImmutablePass, public TargetTransformInfo {
/// are set if the result needs to be inserted and/or extracted from vectors.
unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const;
enum MemIntrinsicType {
VECTOR_LDST_TWO_ELEMENTS,
VECTOR_LDST_THREE_ELEMENTS,
VECTOR_LDST_FOUR_ELEMENTS
};
public:
AArch64TTI() : ImmutablePass(ID), TM(nullptr), ST(nullptr), TLI(nullptr) {
llvm_unreachable("This pass cannot be directly constructed");
@@ -131,6 +137,11 @@ public:
void getUnrollingPreferences(const Function *F, Loop *L,
UnrollingPreferences &UP) const override;
Value *getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
Type *ExpectedType) const override;
bool getTgtMemIntrinsic(IntrinsicInst *Inst,
MemIntrinsicInfo &Info) const override;
/// @}
};
@@ -554,3 +565,83 @@ void AArch64TTI::getUnrollingPreferences(const Function *F, Loop *L,
// Disable partial & runtime unrolling on -Os.
UP.PartialOptSizeThreshold = 0;
}
Value *AArch64TTI::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
Type *ExpectedType) const {
switch (Inst->getIntrinsicID()) {
default:
return nullptr;
case Intrinsic::aarch64_neon_st2:
case Intrinsic::aarch64_neon_st3:
case Intrinsic::aarch64_neon_st4: {
// The expected result must be a struct type whose element types match the
// values being stored.
StructType *ST = dyn_cast<StructType>(ExpectedType);
if (!ST)
return nullptr;
unsigned NumElts = Inst->getNumArgOperands() - 1;
if (ST->getNumElements() != NumElts)
return nullptr;
for (unsigned i = 0, e = NumElts; i != e; ++i) {
if (Inst->getArgOperand(i)->getType() != ST->getElementType(i))
return nullptr;
}
Value *Res = UndefValue::get(ExpectedType);
IRBuilder<> Builder(Inst);
for (unsigned i = 0, e = NumElts; i != e; ++i) {
Value *L = Inst->getArgOperand(i);
Res = Builder.CreateInsertValue(Res, L, i);
}
return Res;
}
case Intrinsic::aarch64_neon_ld2:
case Intrinsic::aarch64_neon_ld3:
case Intrinsic::aarch64_neon_ld4:
if (Inst->getType() == ExpectedType)
return Inst;
return nullptr;
}
}
bool AArch64TTI::getTgtMemIntrinsic(IntrinsicInst *Inst,
MemIntrinsicInfo &Info) const {
switch (Inst->getIntrinsicID()) {
default:
break;
case Intrinsic::aarch64_neon_ld2:
case Intrinsic::aarch64_neon_ld3:
case Intrinsic::aarch64_neon_ld4:
Info.ReadMem = true;
Info.WriteMem = false;
Info.Vol = false;
Info.NumMemRefs = 1;
Info.PtrVal = Inst->getArgOperand(0);
break;
case Intrinsic::aarch64_neon_st2:
case Intrinsic::aarch64_neon_st3:
case Intrinsic::aarch64_neon_st4:
Info.ReadMem = false;
Info.WriteMem = true;
Info.Vol = false;
Info.NumMemRefs = 1;
Info.PtrVal = Inst->getArgOperand(Inst->getNumArgOperands() - 1);
break;
}
switch (Inst->getIntrinsicID()) {
default:
return false;
case Intrinsic::aarch64_neon_ld2:
case Intrinsic::aarch64_neon_st2:
Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS;
break;
case Intrinsic::aarch64_neon_ld3:
case Intrinsic::aarch64_neon_st3:
Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS;
break;
case Intrinsic::aarch64_neon_ld4:
case Intrinsic::aarch64_neon_st4:
Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS;
break;
}
return true;
}


@@ -18,6 +18,7 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Instructions.h"
@@ -273,6 +274,7 @@ class EarlyCSE : public FunctionPass {
public:
const DataLayout *DL;
const TargetLibraryInfo *TLI;
const TargetTransformInfo *TTI;
DominatorTree *DT;
AssumptionCache *AC;
typedef RecyclingAllocator<
@@ -383,14 +385,83 @@ private:
bool Processed;
};
/// \brief Wrapper class to handle memory instructions, including loads,
/// stores and intrinsic loads and stores defined by the target.
class ParseMemoryInst {
public:
ParseMemoryInst(Instruction *Inst, const TargetTransformInfo *TTI)
: Load(false), Store(false), Vol(false), MayReadFromMemory(false),
MayWriteToMemory(false), MatchingId(-1), Ptr(nullptr) {
MayReadFromMemory = Inst->mayReadFromMemory();
MayWriteToMemory = Inst->mayWriteToMemory();
if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
MemIntrinsicInfo Info;
if (!TTI->getTgtMemIntrinsic(II, Info))
return;
if (Info.NumMemRefs == 1) {
Store = Info.WriteMem;
Load = Info.ReadMem;
MatchingId = Info.MatchingId;
MayReadFromMemory = Info.ReadMem;
MayWriteToMemory = Info.WriteMem;
Vol = Info.Vol;
Ptr = Info.PtrVal;
}
} else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
Load = true;
Vol = !LI->isSimple();
Ptr = LI->getPointerOperand();
} else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
Store = true;
Vol = !SI->isSimple();
Ptr = SI->getPointerOperand();
}
}
bool isLoad() { return Load; }
bool isStore() { return Store; }
bool isVolatile() { return Vol; }
bool isMatchingMemLoc(const ParseMemoryInst &Inst) {
return Ptr == Inst.Ptr && MatchingId == Inst.MatchingId;
}
bool isValid() { return Ptr != nullptr; }
int getMatchingId() { return MatchingId; }
Value *getPtr() { return Ptr; }
bool mayReadFromMemory() { return MayReadFromMemory; }
bool mayWriteToMemory() { return MayWriteToMemory; }
private:
bool Load;
bool Store;
bool Vol;
bool MayReadFromMemory;
bool MayWriteToMemory;
// For regular (non-intrinsic) loads/stores, this is set to -1. For
// intrinsic loads/stores, the id is retrieved from the corresponding
// field in the MemIntrinsicInfo structure. That field contains
// non-negative values only.
int MatchingId;
Value *Ptr;
};
bool processNode(DomTreeNode *Node);
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<AssumptionCacheTracker>();
AU.addRequired<DominatorTreeWrapperPass>();
AU.addRequired<TargetLibraryInfoWrapperPass>();
AU.addRequired<TargetTransformInfo>();
AU.setPreservesCFG();
}
Value *getOrCreateResult(Value *Inst, Type *ExpectedType) const {
if (LoadInst *LI = dyn_cast<LoadInst>(Inst))
return LI;
else if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
return SI->getValueOperand();
assert(isa<IntrinsicInst>(Inst) && "Instruction not supported");
return TTI->getOrCreateResultFromMemIntrinsic(cast<IntrinsicInst>(Inst),
ExpectedType);
}
};
}
@@ -420,7 +491,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
/// as long as there is no instruction that reads memory. If we see a store
/// to the same location, we delete the dead store. This zaps trivial dead
/// stores which can occur in bitfield code among other things.
StoreInst *LastStore = nullptr;
Instruction *LastStore = nullptr;
bool Changed = false;
@@ -475,10 +546,11 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
continue;
}
ParseMemoryInst MemInst(Inst, TTI);
// If this is a non-volatile load, process it.
if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
if (MemInst.isValid() && MemInst.isLoad()) {
// Ignore volatile loads.
if (!LI->isSimple()) {
if (MemInst.isVolatile()) {
LastStore = nullptr;
continue;
}
@@ -486,27 +558,35 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
// If we have an available version of this load, and if it is the right
// generation, replace this instruction.
std::pair<Value *, unsigned> InVal =
AvailableLoads->lookup(Inst->getOperand(0));
AvailableLoads->lookup(MemInst.getPtr());
if (InVal.first != nullptr && InVal.second == CurrentGeneration) {
DEBUG(dbgs() << "EarlyCSE CSE LOAD: " << *Inst
<< " to: " << *InVal.first << '\n');
if (!Inst->use_empty())
Inst->replaceAllUsesWith(InVal.first);
Inst->eraseFromParent();
Changed = true;
++NumCSELoad;
continue;
Value *Op = getOrCreateResult(InVal.first, Inst->getType());
if (Op != nullptr) {
DEBUG(dbgs() << "EarlyCSE CSE LOAD: " << *Inst
<< " to: " << *InVal.first << '\n');
if (!Inst->use_empty())
Inst->replaceAllUsesWith(Op);
Inst->eraseFromParent();
Changed = true;
++NumCSELoad;
continue;
}
}
// Otherwise, remember that we have this instruction.
AvailableLoads->insert(Inst->getOperand(0), std::pair<Value *, unsigned>(
Inst, CurrentGeneration));
AvailableLoads->insert(MemInst.getPtr(), std::pair<Value *, unsigned>(
Inst, CurrentGeneration));
LastStore = nullptr;
continue;
}
// If this instruction may read from memory, forget LastStore.
if (Inst->mayReadFromMemory())
// Load/store intrinsics will indicate both a read and a write to
// memory. The target may override this (e.g. so that a store intrinsic
// does not read from memory, and thus will be treated the same as a
// regular store for commoning purposes).
if (Inst->mayReadFromMemory() &&
!(MemInst.isValid() && !MemInst.mayReadFromMemory()))
LastStore = nullptr;
// If this is a read-only call, process it.
@@ -537,17 +617,19 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
if (Inst->mayWriteToMemory()) {
++CurrentGeneration;
if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
if (MemInst.isValid() && MemInst.isStore()) {
// We do a trivial form of DSE if there are two stores to the same
// location with no intervening loads. Delete the earlier store.
if (LastStore &&
LastStore->getPointerOperand() == SI->getPointerOperand()) {
DEBUG(dbgs() << "EarlyCSE DEAD STORE: " << *LastStore
<< " due to: " << *Inst << '\n');
LastStore->eraseFromParent();
Changed = true;
++NumDSE;
LastStore = nullptr;
if (LastStore) {
ParseMemoryInst LastStoreMemInst(LastStore, TTI);
if (LastStoreMemInst.isMatchingMemLoc(MemInst)) {
DEBUG(dbgs() << "EarlyCSE DEAD STORE: " << *LastStore
<< " due to: " << *Inst << '\n');
LastStore->eraseFromParent();
Changed = true;
++NumDSE;
LastStore = nullptr;
}
// fallthrough - we can exploit information about this store
}
@@ -556,13 +638,12 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
// version of the pointer. It is safe to forward from volatile stores
// to non-volatile loads, so we don't have to check for volatility of
// the store.
AvailableLoads->insert(SI->getPointerOperand(),
std::pair<Value *, unsigned>(
SI->getValueOperand(), CurrentGeneration));
AvailableLoads->insert(MemInst.getPtr(), std::pair<Value *, unsigned>(
Inst, CurrentGeneration));
// Remember that this was the last store we saw for DSE.
if (SI->isSimple())
LastStore = SI;
if (!MemInst.isVolatile())
LastStore = Inst;
}
}
}
@@ -584,6 +665,7 @@ bool EarlyCSE::runOnFunction(Function &F) {
DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
DL = DLP ? &DLP->getDataLayout() : nullptr;
TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
TTI = &getAnalysis<TargetTransformInfo>();
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);


@@ -0,0 +1,231 @@
; RUN: opt < %s -S -mtriple=aarch64-none-linux-gnu -mattr=+neon -early-cse | FileCheck %s
define <4 x i32> @test_cse(i32* %a, [2 x <4 x i32>] %s.coerce, i32 %n) {
entry:
; Check that @llvm.aarch64.neon.ld2 is optimized away by Early CSE.
; CHECK-LABEL: @test_cse
; CHECK-NOT: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8
%s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
%s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
br label %for.cond
for.cond: ; preds = %for.body, %entry
%i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
%res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
%cmp = icmp slt i32 %i.0, %n
br i1 %cmp, label %for.body, label %for.end
for.body: ; preds = %for.cond
%0 = bitcast i32* %a to i8*
%1 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
%2 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
%3 = bitcast <16 x i8> %1 to <4 x i32>
%4 = bitcast <16 x i8> %2 to <4 x i32>
call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %4, i8* %0)
%5 = bitcast i32* %a to i8*
%vld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8* %5)
%vld2.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 0
%vld2.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 1
%call = call <4 x i32> @vaddq_s32(<4 x i32> %vld2.fca.0.extract, <4 x i32> %vld2.fca.0.extract)
%inc = add nsw i32 %i.0, 1
br label %for.cond
for.end: ; preds = %for.cond
ret <4 x i32> %res.0
}
define <4 x i32> @test_cse2(i32* %a, [2 x <4 x i32>] %s.coerce, i32 %n) {
entry:
; Check that the first @llvm.aarch64.neon.st2 is optimized away by Early CSE.
; CHECK-LABEL: @test_cse2
; CHECK-NOT: call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %3, i8* %0)
; CHECK: call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %4, i8* %0)
%s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
%s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
br label %for.cond
for.cond: ; preds = %for.body, %entry
%i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
%res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
%cmp = icmp slt i32 %i.0, %n
br i1 %cmp, label %for.body, label %for.end
for.body: ; preds = %for.cond
%0 = bitcast i32* %a to i8*
%1 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
%2 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
%3 = bitcast <16 x i8> %1 to <4 x i32>
%4 = bitcast <16 x i8> %2 to <4 x i32>
call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %3, i8* %0)
call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %4, i8* %0)
%5 = bitcast i32* %a to i8*
%vld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8* %5)
%vld2.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 0
%vld2.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 1
%call = call <4 x i32> @vaddq_s32(<4 x i32> %vld2.fca.0.extract, <4 x i32> %vld2.fca.0.extract)
%inc = add nsw i32 %i.0, 1
br label %for.cond
for.end: ; preds = %for.cond
ret <4 x i32> %res.0
}
define <4 x i32> @test_cse3(i32* %a, [2 x <4 x i32>] %s.coerce, i32 %n) #0 {
entry:
; Check that the second (redundant) @llvm.aarch64.neon.ld2 is optimized away
; by Early CSE, leaving only the first.
; CHECK-LABEL: @test_cse3
; CHECK: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8
; CHECK-NOT: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8
%s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
%s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
br label %for.cond
for.cond: ; preds = %for.body, %entry
%i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
%res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
%cmp = icmp slt i32 %i.0, %n
br i1 %cmp, label %for.body, label %for.end
for.body: ; preds = %for.cond
%0 = bitcast i32* %a to i8*
%vld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8* %0)
%vld2.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 0
%vld2.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 1
%1 = bitcast i32* %a to i8*
%vld22 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8* %1)
%vld22.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld22, 0
%vld22.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld22, 1
%call = call <4 x i32> @vaddq_s32(<4 x i32> %vld2.fca.0.extract, <4 x i32> %vld22.fca.0.extract)
%inc = add nsw i32 %i.0, 1
br label %for.cond
for.end: ; preds = %for.cond
ret <4 x i32> %res.0
}
define <4 x i32> @test_nocse(i32* %a, i32* %b, [2 x <4 x i32>] %s.coerce, i32 %n) {
entry:
; Check that the store prevents @llvm.aarch64.neon.ld2 from being optimized
; away by Early CSE.
; CHECK-LABEL: @test_nocse
; CHECK: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8
%s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
%s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
br label %for.cond
for.cond: ; preds = %for.body, %entry
%i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
%res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
%cmp = icmp slt i32 %i.0, %n
br i1 %cmp, label %for.body, label %for.end
for.body: ; preds = %for.cond
%0 = bitcast i32* %a to i8*
%1 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
%2 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
%3 = bitcast <16 x i8> %1 to <4 x i32>
%4 = bitcast <16 x i8> %2 to <4 x i32>
call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %4, i8* %0)
store i32 0, i32* %b, align 4
%5 = bitcast i32* %a to i8*
%vld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8* %5)
%vld2.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 0
%vld2.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 1
%call = call <4 x i32> @vaddq_s32(<4 x i32> %vld2.fca.0.extract, <4 x i32> %vld2.fca.0.extract)
%inc = add nsw i32 %i.0, 1
br label %for.cond
for.end: ; preds = %for.cond
ret <4 x i32> %res.0
}
define <4 x i32> @test_nocse2(i32* %a, [2 x <4 x i32>] %s.coerce, i32 %n) {
entry:
; Check that @llvm.aarch64.neon.ld3 is not optimized away by Early CSE due
; to mismatch between st2 and ld3.
; CHECK-LABEL: @test_nocse2
; CHECK: call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0i8
%s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
%s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
br label %for.cond
for.cond: ; preds = %for.body, %entry
%i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
%res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
%cmp = icmp slt i32 %i.0, %n
br i1 %cmp, label %for.body, label %for.end
for.body: ; preds = %for.cond
%0 = bitcast i32* %a to i8*
%1 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
%2 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
%3 = bitcast <16 x i8> %1 to <4 x i32>
%4 = bitcast <16 x i8> %2 to <4 x i32>
call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %4, i8* %0)
%5 = bitcast i32* %a to i8*
%vld3 = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0i8(i8* %5)
%vld3.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 0
%vld3.fca.2.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 2
%call = call <4 x i32> @vaddq_s32(<4 x i32> %vld3.fca.0.extract, <4 x i32> %vld3.fca.2.extract)
%inc = add nsw i32 %i.0, 1
br label %for.cond
for.end: ; preds = %for.cond
ret <4 x i32> %res.0
}
define <4 x i32> @test_nocse3(i32* %a, [2 x <4 x i32>] %s.coerce, i32 %n) {
entry:
; Check that @llvm.aarch64.neon.st3 is not optimized away by Early CSE due to
; mismatch between st2 and st3.
; CHECK-LABEL: @test_nocse3
; CHECK: call void @llvm.aarch64.neon.st3.v4i32.p0i8
; CHECK: call void @llvm.aarch64.neon.st2.v4i32.p0i8
%s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
%s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
br label %for.cond
for.cond: ; preds = %for.body, %entry
%i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
%res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
%cmp = icmp slt i32 %i.0, %n
br i1 %cmp, label %for.body, label %for.end
for.body: ; preds = %for.cond
%0 = bitcast i32* %a to i8*
%1 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
%2 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
%3 = bitcast <16 x i8> %1 to <4 x i32>
%4 = bitcast <16 x i8> %2 to <4 x i32>
call void @llvm.aarch64.neon.st3.v4i32.p0i8(<4 x i32> %4, <4 x i32> %3, <4 x i32> %3, i8* %0)
call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %3, i8* %0)
%5 = bitcast i32* %a to i8*
%vld3 = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0i8(i8* %5)
%vld3.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 0
%vld3.fca.1.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 1
%call = call <4 x i32> @vaddq_s32(<4 x i32> %vld3.fca.0.extract, <4 x i32> %vld3.fca.0.extract)
%inc = add nsw i32 %i.0, 1
br label %for.cond
for.end: ; preds = %for.cond
ret <4 x i32> %res.0
}
; Function Attrs: nounwind
declare void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32>, <4 x i32>, i8* nocapture)
; Function Attrs: nounwind
declare void @llvm.aarch64.neon.st3.v4i32.p0i8(<4 x i32>, <4 x i32>, <4 x i32>, i8* nocapture)
; Function Attrs: nounwind readonly
declare { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8*)
; Function Attrs: nounwind readonly
declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0i8(i8*)
define internal fastcc <4 x i32> @vaddq_s32(<4 x i32> %__p0, <4 x i32> %__p1) {
entry:
%add = add <4 x i32> %__p0, %__p1
ret <4 x i32> %add
}


@@ -0,0 +1,5 @@
config.suffixes = ['.ll']
targets = set(config.root.targets_to_build.split())
if 'AArch64' not in targets:
config.unsupported = True