[ARM/AArch64] Support wide interleaved accesses

This patch teaches (ARM|AArch64)ISelLowering.cpp to match illegal vector types
to interleaved access intrinsics as long as the types are multiples of the
vector register width. A "wide" access will now be mapped to multiple
interleave intrinsics similar to the way in which non-interleaved accesses with
illegal types are legalized into multiple accesses. I'll update the associated
TTI costs (in getInterleavedMemoryOpCost) as a follow-on.

Differential Revision: https://reviews.llvm.org/D29466

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@296750 91177308-0d34-0410-b5e6-96231b3b80d8
Matthew Simpson 2017-03-02 15:11:20 +00:00
parent 0aedea1a56
commit 6523b45d2e
4 changed files with 661 additions and 98 deletions
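
As a minimal sketch of the transformation (mirroring the load_factor2_wide2 tests below): a factor-2 interleaved load of the illegal type <16 x i32>,

  %interleaved.vec = load <16 x i32>, <16 x i32>* %ptr, align 4
  %v0 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %v1 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>

now lowers to two ld2 (or vld2) calls on adjacent 128-bit chunks of the pointer, after which each pair of <4 x i32> sub-vectors is concatenated back into the <8 x i32> values that replace %v0 and %v1.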


@@ -7248,6 +7248,13 @@ bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType,
   return NumBits == 32 || NumBits == 64;
 }
 
+/// A helper function for determining the number of interleaved accesses we
+/// will generate when lowering accesses of the given type.
+static unsigned getNumInterleavedAccesses(VectorType *VecTy,
+                                          const DataLayout &DL) {
+  return (DL.getTypeSizeInBits(VecTy) + 127) / 128;
+}
+
 /// \brief Lower an interleaved load into a ldN intrinsic.
 ///
 /// E.g. Lower an interleaved load (Factor = 2):
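
For reference: a <16 x i32> vector is 512 bits wide, so the helper above returns (512 + 127) / 128 = 4 accesses, while the rounding still maps a legal 64-bit vector such as <2 x i32> to a single access.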
@@ -7273,10 +7280,14 @@ bool AArch64TargetLowering::lowerInterleavedLoad(
   VectorType *VecTy = Shuffles[0]->getType();
   unsigned VecSize = DL.getTypeSizeInBits(VecTy);
 
-  // Skip if we do not have NEON and skip illegal vector types.
-  if (!Subtarget->hasNEON() || (VecSize != 64 && VecSize != 128))
+  // Skip if we do not have NEON and skip illegal vector types. We can
+  // "legalize" wide vector types into multiple interleaved accesses as long as
+  // the vector types are divisible by 128.
+  if (!Subtarget->hasNEON() || (VecSize != 64 && VecSize % 128 != 0))
     return false;
 
+  unsigned NumLoads = getNumInterleavedAccesses(VecTy, DL);
+
   // A pointer vector can not be the return type of the ldN intrinsics. Need to
   // load integer vectors first and then convert to pointer vectors.
   Type *EltTy = VecTy->getVectorElementType();
@@ -7284,6 +7295,25 @@ bool AArch64TargetLowering::lowerInterleavedLoad(
     VecTy =
         VectorType::get(DL.getIntPtrType(EltTy), VecTy->getVectorNumElements());
 
+  IRBuilder<> Builder(LI);
+
+  // The base address of the load.
+  Value *BaseAddr = LI->getPointerOperand();
+
+  if (NumLoads > 1) {
+    // If we're going to generate more than one load, reset the sub-vector type
+    // to something legal.
+    VecTy = VectorType::get(VecTy->getVectorElementType(),
+                            VecTy->getVectorNumElements() / NumLoads);
+
+    // We will compute the pointer operand of each load from the original base
+    // address using GEPs. Cast the base address to a pointer to the scalar
+    // element type.
+    BaseAddr = Builder.CreateBitCast(
+        BaseAddr, VecTy->getVectorElementType()->getPointerTo(
+                      LI->getPointerAddressSpace()));
+  }
+
   Type *PtrTy = VecTy->getPointerTo(LI->getPointerAddressSpace());
   Type *Tys[2] = {VecTy, PtrTy};
   static const Intrinsic::ID LoadInts[3] = {Intrinsic::aarch64_neon_ld2,
@@ -7292,24 +7322,46 @@ bool AArch64TargetLowering::lowerInterleavedLoad(
   Function *LdNFunc =
       Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys);
 
-  IRBuilder<> Builder(LI);
-  Value *Ptr = Builder.CreateBitCast(LI->getPointerOperand(), PtrTy);
+  // Holds sub-vectors extracted from the load intrinsic return values. The
+  // sub-vectors are associated with the shufflevector instructions they will
+  // replace.
+  DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs;
 
-  CallInst *LdN = Builder.CreateCall(LdNFunc, Ptr, "ldN");
+  for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
 
-  // Replace uses of each shufflevector with the corresponding vector loaded
-  // by ldN.
-  for (unsigned i = 0; i < Shuffles.size(); i++) {
-    ShuffleVectorInst *SVI = Shuffles[i];
-    unsigned Index = Indices[i];
+    // If we're generating more than one load, compute the base address of
+    // subsequent loads as an offset from the previous.
+    if (LoadCount > 0)
+      BaseAddr = Builder.CreateConstGEP1_32(
+          BaseAddr, VecTy->getVectorNumElements() * Factor);
 
-    Value *SubVec = Builder.CreateExtractValue(LdN, Index);
+    CallInst *LdN = Builder.CreateCall(
+        LdNFunc, Builder.CreateBitCast(BaseAddr, PtrTy), "ldN");
 
-    // Convert the integer vector to pointer vector if the element is pointer.
-    if (EltTy->isPointerTy())
-      SubVec = Builder.CreateIntToPtr(SubVec, SVI->getType());
+    // Extract and store the sub-vectors returned by the load intrinsic.
+    for (unsigned i = 0; i < Shuffles.size(); i++) {
+      ShuffleVectorInst *SVI = Shuffles[i];
+      unsigned Index = Indices[i];
 
-    SVI->replaceAllUsesWith(SubVec);
+      Value *SubVec = Builder.CreateExtractValue(LdN, Index);
+
+      // Convert the integer vector to pointer vector if the element is pointer.
+      if (EltTy->isPointerTy())
+        SubVec = Builder.CreateIntToPtr(SubVec, SVI->getType());
+
+      SubVecs[SVI].push_back(SubVec);
+    }
+  }
+
+  // Replace uses of the shufflevector instructions with the sub-vectors
+  // returned by the load intrinsic. If a shufflevector instruction is
+  // associated with more than one sub-vector, those sub-vectors will be
+  // concatenated into a single wide vector.
+  for (ShuffleVectorInst *SVI : Shuffles) {
+    auto &SubVec = SubVecs[SVI];
+    auto *WideVec =
+        SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
+    SVI->replaceAllUsesWith(WideVec);
   }
 
   return true;
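
concatenateVectors emits a single shufflevector over each pair of sub-vectors; e.g., the two <4 x i32> values extracted from adjacent ld2 results are rejoined as

  %wide = shufflevector <4 x i32> %sub0, <4 x i32> %sub1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>

(%sub0, %sub1, and %wide are illustrative names; the tests below use numbered temporaries).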
@@ -7358,10 +7410,14 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
   const DataLayout &DL = SI->getModule()->getDataLayout();
   unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy);
 
-  // Skip if we do not have NEON and skip illegal vector types.
-  if (!Subtarget->hasNEON() || (SubVecSize != 64 && SubVecSize != 128))
+  // Skip if we do not have NEON and skip illegal vector types. We can
+  // "legalize" wide vector types into multiple interleaved accesses as long as
+  // the vector types are divisible by 128.
+  if (!Subtarget->hasNEON() || (SubVecSize != 64 && SubVecSize % 128 != 0))
     return false;
 
+  unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL);
+
   Value *Op0 = SVI->getOperand(0);
   Value *Op1 = SVI->getOperand(1);
   IRBuilder<> Builder(SI);
@@ -7381,6 +7437,25 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
     SubVecTy = VectorType::get(IntTy, LaneLen);
   }
 
+  // The base address of the store.
+  Value *BaseAddr = SI->getPointerOperand();
+
+  if (NumStores > 1) {
+    // If we're going to generate more than one store, reset the lane length
+    // and sub-vector type to something legal.
+    LaneLen /= NumStores;
+    SubVecTy = VectorType::get(SubVecTy->getVectorElementType(), LaneLen);
+
+    // We will compute the pointer operand of each store from the original base
+    // address using GEPs. Cast the base address to a pointer to the scalar
+    // element type.
+    BaseAddr = Builder.CreateBitCast(
+        BaseAddr, SubVecTy->getVectorElementType()->getPointerTo(
+                      SI->getPointerAddressSpace()));
+  }
+
+  auto Mask = SVI->getShuffleMask();
+
   Type *PtrTy = SubVecTy->getPointerTo(SI->getPointerAddressSpace());
   Type *Tys[2] = {SubVecTy, PtrTy};
   static const Intrinsic::ID StoreInts[3] = {Intrinsic::aarch64_neon_st2,
@@ -7389,34 +7464,43 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
   Function *StNFunc =
       Intrinsic::getDeclaration(SI->getModule(), StoreInts[Factor - 2], Tys);
 
-  SmallVector<Value *, 5> Ops;
+  for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
 
-  // Split the shufflevector operands into sub vectors for the new stN call.
-  auto Mask = SVI->getShuffleMask();
-  for (unsigned i = 0; i < Factor; i++) {
-    if (Mask[i] >= 0) {
-      Ops.push_back(Builder.CreateShuffleVector(
-          Op0, Op1, createSequentialMask(Builder, Mask[i], LaneLen, 0)));
-    } else {
-      unsigned StartMask = 0;
-      for (unsigned j = 1; j < LaneLen; j++) {
-        if (Mask[j * Factor + i] >= 0) {
-          StartMask = Mask[j * Factor + i] - j;
-          break;
+    SmallVector<Value *, 5> Ops;
+
+    // Split the shufflevector operands into sub vectors for the new stN call.
+    for (unsigned i = 0; i < Factor; i++) {
+      unsigned IdxI = StoreCount * LaneLen * Factor + i;
+      if (Mask[IdxI] >= 0) {
+        Ops.push_back(Builder.CreateShuffleVector(
+            Op0, Op1, createSequentialMask(Builder, Mask[IdxI], LaneLen, 0)));
+      } else {
+        unsigned StartMask = 0;
+        for (unsigned j = 1; j < LaneLen; j++) {
+          unsigned IdxJ = StoreCount * LaneLen * Factor + j;
+          if (Mask[IdxJ * Factor + IdxI] >= 0) {
+            StartMask = Mask[IdxJ * Factor + IdxI] - IdxJ;
+            break;
+          }
         }
+        // Note: Filling undef gaps with random elements is ok, since
+        // those elements were being written anyway (with undefs).
+        // In the case of all undefs we're defaulting to using elems from 0
+        // Note: StartMask cannot be negative, it's checked in
+        // isReInterleaveMask
+        Ops.push_back(Builder.CreateShuffleVector(
+            Op0, Op1, createSequentialMask(Builder, StartMask, LaneLen, 0)));
       }
-      // Note: If all elements in a chunk are undefs, StartMask=0!
-      // Note: Filling undef gaps with random elements is ok, since
-      // those elements were being written anyway (with undefs).
-      // In the case of all undefs we're defaulting to using elems from 0
-      // Note: StartMask cannot be negative, it's checked in isReInterleaveMask
-      Ops.push_back(Builder.CreateShuffleVector(
-          Op0, Op1, createSequentialMask(Builder, StartMask, LaneLen, 0)));
     }
-  }
 
-  Ops.push_back(Builder.CreateBitCast(SI->getPointerOperand(), PtrTy));
-  Builder.CreateCall(StNFunc, Ops);
+    // If we're generating more than one store, compute the base address of
+    // subsequent stores as an offset from the previous.
+    if (StoreCount > 0)
+      BaseAddr = Builder.CreateConstGEP1_32(BaseAddr, LaneLen * Factor);
+
+    Ops.push_back(Builder.CreateBitCast(BaseAddr, PtrTy));
+    Builder.CreateCall(StNFunc, Ops);
+  }
 
   return true;
 }
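
On the store side, each loop iteration extracts one legal-width chunk from the wide shufflevector operands; for the factor-2 <16 x i32> store exercised by store_factor2_wide below, the first st2 takes lanes 0-3 and 8-11 of the concatenated operands and the second takes lanes 4-7 and 12-15:

  %sub0 = shufflevector <8 x i32> %v0, <8 x i32> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %sub1 = shufflevector <8 x i32> %v0, <8 x i32> %v1, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
  call void @llvm.aarch64.neon.st2.v4i32.p0v4i32(<4 x i32> %sub0, <4 x i32> %sub1, <4 x i32>* %addr)

(%sub0, %sub1, and %addr are illustrative names; see the tests for the exact output.)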


@@ -13282,6 +13282,13 @@ Value *ARMTargetLowering::emitStoreConditional(IRBuilder<> &Builder, Value *Val,
                        Addr});
 }
 
+/// A helper function for determining the number of interleaved accesses we
+/// will generate when lowering accesses of the given type.
+static unsigned getNumInterleavedAccesses(VectorType *VecTy,
+                                          const DataLayout &DL) {
+  return (DL.getTypeSizeInBits(VecTy) + 127) / 128;
+}
+
 /// \brief Lower an interleaved load into a vldN intrinsic.
 ///
 /// E.g. Lower an interleaved load (Factor = 2):
@@ -13310,8 +13317,11 @@ bool ARMTargetLowering::lowerInterleavedLoad(
   bool EltIs64Bits = DL.getTypeSizeInBits(EltTy) == 64;
 
   // Skip if we do not have NEON and skip illegal vector types and vector types
-  // with i64/f64 elements (vldN doesn't support i64/f64 elements).
-  if (!Subtarget->hasNEON() || (VecSize != 64 && VecSize != 128) || EltIs64Bits)
+  // with i64/f64 elements (vldN doesn't support i64/f64 elements). We can
+  // "legalize" wide vector types into multiple interleaved accesses as long as
+  // the vector types are divisible by 128.
+  if (!Subtarget->hasNEON() || (VecSize != 64 && VecSize % 128 != 0) ||
+      EltIs64Bits)
     return false;
 
   // Skip if the vector has f16 elements: even though we could do an i16 vldN,
@@ -13319,43 +13329,87 @@ bool ARMTargetLowering::lowerInterleavedLoad(
   if (EltTy->isHalfTy())
     return false;
 
+  unsigned NumLoads = getNumInterleavedAccesses(VecTy, DL);
+
   // A pointer vector can not be the return type of the ldN intrinsics. Need to
   // load integer vectors first and then convert to pointer vectors.
   if (EltTy->isPointerTy())
     VecTy =
         VectorType::get(DL.getIntPtrType(EltTy), VecTy->getVectorNumElements());
 
-  static const Intrinsic::ID LoadInts[3] = {Intrinsic::arm_neon_vld2,
-                                            Intrinsic::arm_neon_vld3,
-                                            Intrinsic::arm_neon_vld4};
-
   IRBuilder<> Builder(LI);
-  SmallVector<Value *, 2> Ops;
 
-  Type *Int8Ptr = Builder.getInt8PtrTy(LI->getPointerAddressSpace());
-  Ops.push_back(Builder.CreateBitCast(LI->getPointerOperand(), Int8Ptr));
-  Ops.push_back(Builder.getInt32(LI->getAlignment()));
+  // The base address of the load.
+  Value *BaseAddr = LI->getPointerOperand();
+
+  if (NumLoads > 1) {
+    // If we're going to generate more than one load, reset the sub-vector type
+    // to something legal.
+    VecTy = VectorType::get(VecTy->getVectorElementType(),
+                            VecTy->getVectorNumElements() / NumLoads);
+
+    // We will compute the pointer operand of each load from the original base
+    // address using GEPs. Cast the base address to a pointer to the scalar
+    // element type.
+    BaseAddr = Builder.CreateBitCast(
+        BaseAddr, VecTy->getVectorElementType()->getPointerTo(
+                      LI->getPointerAddressSpace()));
+  }
 
   assert(isTypeLegal(EVT::getEVT(VecTy)) && "Illegal vldN vector type!");
 
-  Type *Tys[] = { VecTy, Int8Ptr };
+  Type *Int8Ptr = Builder.getInt8PtrTy(LI->getPointerAddressSpace());
+  Type *Tys[] = {VecTy, Int8Ptr};
+  static const Intrinsic::ID LoadInts[3] = {Intrinsic::arm_neon_vld2,
+                                            Intrinsic::arm_neon_vld3,
+                                            Intrinsic::arm_neon_vld4};
   Function *VldnFunc =
       Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys);
 
-  CallInst *VldN = Builder.CreateCall(VldnFunc, Ops, "vldN");
+  // Holds sub-vectors extracted from the load intrinsic return values. The
+  // sub-vectors are associated with the shufflevector instructions they will
+  // replace.
+  DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs;
 
-  // Replace uses of each shufflevector with the corresponding vector loaded
-  // by ldN.
-  for (unsigned i = 0; i < Shuffles.size(); i++) {
-    ShuffleVectorInst *SV = Shuffles[i];
-    unsigned Index = Indices[i];
+  for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
 
-    Value *SubVec = Builder.CreateExtractValue(VldN, Index);
+    // If we're generating more than one load, compute the base address of
+    // subsequent loads as an offset from the previous.
+    if (LoadCount > 0)
+      BaseAddr = Builder.CreateConstGEP1_32(
+          BaseAddr, VecTy->getVectorNumElements() * Factor);
 
-    // Convert the integer vector to pointer vector if the element is pointer.
-    if (EltTy->isPointerTy())
-      SubVec = Builder.CreateIntToPtr(SubVec, SV->getType());
+    SmallVector<Value *, 2> Ops;
+    Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr));
+    Ops.push_back(Builder.getInt32(LI->getAlignment()));
 
-    SV->replaceAllUsesWith(SubVec);
+    CallInst *VldN = Builder.CreateCall(VldnFunc, Ops, "vldN");
+
+    // Replace uses of each shufflevector with the corresponding vector loaded
+    // by ldN.
+    for (unsigned i = 0; i < Shuffles.size(); i++) {
+      ShuffleVectorInst *SV = Shuffles[i];
+      unsigned Index = Indices[i];
+
+      Value *SubVec = Builder.CreateExtractValue(VldN, Index);
+
+      // Convert the integer vector to pointer vector if the element is pointer.
+      if (EltTy->isPointerTy())
+        SubVec = Builder.CreateIntToPtr(SubVec, SV->getType());
+
+      SubVecs[SV].push_back(SubVec);
+    }
+  }
+
+  // Replace uses of the shufflevector instructions with the sub-vectors
+  // returned by the load intrinsic. If a shufflevector instruction is
+  // associated with more than one sub-vector, those sub-vectors will be
+  // concatenated into a single wide vector.
+  for (ShuffleVectorInst *SVI : Shuffles) {
+    auto &SubVec = SubVecs[SVI];
+    auto *WideVec =
+        SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
+    SVI->replaceAllUsesWith(WideVec);
   }
 
   return true;
@@ -13406,8 +13460,10 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
   bool EltIs64Bits = DL.getTypeSizeInBits(EltTy) == 64;
 
   // Skip if we do not have NEON and skip illegal vector types and vector types
-  // with i64/f64 elements (vstN doesn't support i64/f64 elements).
-  if (!Subtarget->hasNEON() || (SubVecSize != 64 && SubVecSize != 128) ||
+  // with i64/f64 elements (vldN doesn't support i64/f64 elements). We can
+  // "legalize" wide vector types into multiple interleaved accesses as long as
+  // the vector types are divisible by 128.
+  if (!Subtarget->hasNEON() || (SubVecSize != 64 && SubVecSize % 128 != 0) ||
       EltIs64Bits)
     return false;
@@ -13416,6 +13472,8 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
   if (EltTy->isHalfTy())
     return false;
 
+  unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL);
+
   Value *Op0 = SVI->getOperand(0);
   Value *Op1 = SVI->getOperand(1);
   IRBuilder<> Builder(SI);
@@ -13434,46 +13492,75 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
     SubVecTy = VectorType::get(IntTy, LaneLen);
   }
 
-  static const Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2,
-                                             Intrinsic::arm_neon_vst3,
-                                             Intrinsic::arm_neon_vst4};
-  SmallVector<Value *, 6> Ops;
+  // The base address of the store.
+  Value *BaseAddr = SI->getPointerOperand();
 
-  Type *Int8Ptr = Builder.getInt8PtrTy(SI->getPointerAddressSpace());
-  Ops.push_back(Builder.CreateBitCast(SI->getPointerOperand(), Int8Ptr));
+  if (NumStores > 1) {
+    // If we're going to generate more than one store, reset the lane length
+    // and sub-vector type to something legal.
+    LaneLen /= NumStores;
+    SubVecTy = VectorType::get(SubVecTy->getVectorElementType(), LaneLen);
+
+    // We will compute the pointer operand of each store from the original base
+    // address using GEPs. Cast the base address to a pointer to the scalar
+    // element type.
+    BaseAddr = Builder.CreateBitCast(
+        BaseAddr, SubVecTy->getVectorElementType()->getPointerTo(
+                      SI->getPointerAddressSpace()));
+  }
 
   assert(isTypeLegal(EVT::getEVT(SubVecTy)) && "Illegal vstN vector type!");
 
-  Type *Tys[] = { Int8Ptr, SubVecTy };
-  Function *VstNFunc = Intrinsic::getDeclaration(
-      SI->getModule(), StoreInts[Factor - 2], Tys);
-
-  // Split the shufflevector operands into sub vectors for the new vstN call.
   auto Mask = SVI->getShuffleMask();
-  for (unsigned i = 0; i < Factor; i++) {
-    if (Mask[i] >= 0) {
-      Ops.push_back(Builder.CreateShuffleVector(
-          Op0, Op1, createSequentialMask(Builder, Mask[i], LaneLen, 0)));
-    } else {
-      unsigned StartMask = 0;
-      for (unsigned j = 1; j < LaneLen; j++) {
-        if (Mask[j * Factor + i] >= 0) {
-          StartMask = Mask[j * Factor + i] - j;
-          break;
+
+  Type *Int8Ptr = Builder.getInt8PtrTy(SI->getPointerAddressSpace());
+  Type *Tys[] = {Int8Ptr, SubVecTy};
+  static const Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2,
+                                             Intrinsic::arm_neon_vst3,
+                                             Intrinsic::arm_neon_vst4};
+
+  for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
+
+    // If we're generating more than one store, compute the base address of
+    // subsequent stores as an offset from the previous.
+    if (StoreCount > 0)
+      BaseAddr = Builder.CreateConstGEP1_32(BaseAddr, LaneLen * Factor);
+
+    SmallVector<Value *, 6> Ops;
+    Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr));
+
+    Function *VstNFunc =
+        Intrinsic::getDeclaration(SI->getModule(), StoreInts[Factor - 2], Tys);
+
+    // Split the shufflevector operands into sub vectors for the new vstN call.
+    for (unsigned i = 0; i < Factor; i++) {
+      unsigned IdxI = StoreCount * LaneLen * Factor + i;
+      if (Mask[IdxI] >= 0) {
+        Ops.push_back(Builder.CreateShuffleVector(
+            Op0, Op1, createSequentialMask(Builder, Mask[IdxI], LaneLen, 0)));
+      } else {
+        unsigned StartMask = 0;
+        for (unsigned j = 1; j < LaneLen; j++) {
+          unsigned IdxJ = StoreCount * LaneLen * Factor + j;
+          if (Mask[IdxJ * Factor + IdxI] >= 0) {
+            StartMask = Mask[IdxJ * Factor + IdxI] - IdxJ;
+            break;
+          }
         }
+        // Note: If all elements in a chunk are undefs, StartMask=0!
+        // Note: Filling undef gaps with random elements is ok, since
+        // those elements were being written anyway (with undefs).
+        // In the case of all undefs we're defaulting to using elems from 0
+        // Note: StartMask cannot be negative, it's checked in
+        // isReInterleaveMask
+        Ops.push_back(Builder.CreateShuffleVector(
+            Op0, Op1, createSequentialMask(Builder, StartMask, LaneLen, 0)));
       }
-      // Note: If all elements in a chunk are undefs, StartMask=0!
-      // Note: Filling undef gaps with random elements is ok, since
-      // those elements were being written anyway (with undefs).
-      // In the case of all undefs we're defaulting to using elems from 0
-      // Note: StartMask cannot be negative, it's checked in isReInterleaveMask
-      Ops.push_back(Builder.CreateShuffleVector(
-          Op0, Op1, createSequentialMask(Builder, StartMask, LaneLen, 0)));
     }
-  }
 
-  Ops.push_back(Builder.getInt32(SI->getAlignment()));
-  Builder.CreateCall(VstNFunc, Ops);
+    Ops.push_back(Builder.getInt32(SI->getAlignment()));
+    Builder.CreateCall(VstNFunc, Ops);
+  }
 
   return true;
 }


@@ -565,3 +565,198 @@ define void @no_interleave(<4 x float> %a0) {
  store <4 x float> %v0, <4 x float>* @g, align 16
  ret void
}
define void @load_factor2_wide2(<16 x i32>* %ptr) {
; NEON-LABEL: @load_factor2_wide2(
; NEON-NEXT: [[TMP1:%.*]] = bitcast <16 x i32>* %ptr to i32*
; NEON-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP1]] to <4 x i32>*
; NEON-NEXT: [[LDN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32(<4 x i32>* [[TMP2]])
; NEON-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN]], 1
; NEON-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN]], 0
; NEON-NEXT: [[TMP5:%.*]] = getelementptr i32, i32* [[TMP1]], i32 8
; NEON-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
; NEON-NEXT: [[LDN1:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32(<4 x i32>* [[TMP6]])
; NEON-NEXT: [[TMP7:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN1]], 1
; NEON-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN1]], 0
; NEON-NEXT: [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; NEON-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; NEON-NEXT: ret void
; NO_NEON-LABEL: @load_factor2_wide2(
; NO_NEON-NOT: @llvm.aarch64.neon
; NO_NEON: ret void
;
%interleaved.vec = load <16 x i32>, <16 x i32>* %ptr, align 4
%v0 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
%v1 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
ret void
}
define void @load_factor2_wide3(<24 x i32>* %ptr) {
; NEON-LABEL: @load_factor2_wide3(
; NEON-NEXT: [[TMP1:%.*]] = bitcast <24 x i32>* [[PTR:%.*]] to i32*
; NEON-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP1]] to <4 x i32>*
; NEON-NEXT: [[LDN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32(<4 x i32>* [[TMP2]])
; NEON-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN]], 1
; NEON-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN]], 0
; NEON-NEXT: [[TMP5:%.*]] = getelementptr i32, i32* [[TMP1]], i32 8
; NEON-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
; NEON-NEXT: [[LDN1:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32(<4 x i32>* [[TMP6]])
; NEON-NEXT: [[TMP7:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN1]], 1
; NEON-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN1]], 0
; NEON-NEXT: [[TMP9:%.*]] = getelementptr i32, i32* [[TMP5]], i32 8
; NEON-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>*
; NEON-NEXT: [[LDN2:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32(<4 x i32>* [[TMP10]])
; NEON-NEXT: [[TMP11:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN2]], 1
; NEON-NEXT: [[TMP12:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN2]], 0
; NEON-NEXT: [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; NEON-NEXT: [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
; NEON-NEXT: [[TMP15:%.*]] = shufflevector <8 x i32> [[TMP13]], <8 x i32> [[TMP14]], <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
; NEON-NEXT: [[TMP16:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; NEON-NEXT: [[TMP17:%.*]] = shufflevector <4 x i32> [[TMP12]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
; NEON-NEXT: [[TMP18:%.*]] = shufflevector <8 x i32> [[TMP16]], <8 x i32> [[TMP17]], <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
; NEON-NEXT: ret void
; NO_NEON-LABEL: @load_factor2_wide3(
; NO_NEON-NOT: @llvm.aarch64.neon
; NO_NEON: ret void
;
%interleaved.vec = load <24 x i32>, <24 x i32>* %ptr, align 4
%v0 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> undef, <12 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22>
%v1 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> undef, <12 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23>
ret void
}
define void @load_factor3_wide(<24 x i32>* %ptr) {
; NEON-LABEL: @load_factor3_wide(
; NEON-NEXT: [[TMP1:%.*]] = bitcast <24 x i32>* %ptr to i32*
; NEON-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP1]] to <4 x i32>*
; NEON-NEXT: [[LDN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0v4i32(<4 x i32>* [[TMP2]])
; NEON-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 2
; NEON-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 1
; NEON-NEXT: [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 0
; NEON-NEXT: [[TMP6:%.*]] = getelementptr i32, i32* [[TMP1]], i32 12
; NEON-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
; NEON-NEXT: [[LDN1:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0v4i32(<4 x i32>* [[TMP7]])
; NEON-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[LDN1]], 2
; NEON-NEXT: [[TMP9:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[LDN1]], 1
; NEON-NEXT: [[TMP10:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[LDN1]], 0
; NEON-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; NEON-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; NEON-NEXT: [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP10]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; NEON-NEXT: ret void
; NO_NEON-LABEL: @load_factor3_wide(
; NO_NEON-NOT: @llvm.aarch64.neon
; NO_NEON: ret void
;
%interleaved.vec = load <24 x i32>, <24 x i32>* %ptr, align 4
%v0 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
%v1 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
%v2 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
ret void
}
define void @load_factor4_wide(<32 x i32>* %ptr) {
; NEON-LABEL: @load_factor4_wide(
; NEON-NEXT: [[TMP1:%.*]] = bitcast <32 x i32>* %ptr to i32*
; NEON-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP1]] to <4 x i32>*
; NEON-NEXT: [[LDN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4.v4i32.p0v4i32(<4 x i32>* [[TMP2]])
; NEON-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 3
; NEON-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 2
; NEON-NEXT: [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 1
; NEON-NEXT: [[TMP6:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 0
; NEON-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP1]], i32 16
; NEON-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <4 x i32>*
; NEON-NEXT: [[LDN1:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4.v4i32.p0v4i32(<4 x i32>* [[TMP8]])
; NEON-NEXT: [[TMP9:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN1]], 3
; NEON-NEXT: [[TMP10:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN1]], 2
; NEON-NEXT: [[TMP11:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN1]], 1
; NEON-NEXT: [[TMP12:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN1]], 0
; NEON-NEXT: [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; NEON-NEXT: [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP10]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; NEON-NEXT: [[TMP15:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP11]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; NEON-NEXT: [[TMP16:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> [[TMP12]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; NEON-NEXT: ret void
; NO_NEON-LABEL: @load_factor4_wide(
; NO_NEON-NOT: @llvm.aarch64.neon
; NO_NEON: ret void
;
%interleaved.vec = load <32 x i32>, <32 x i32>* %ptr, align 4
%v0 = shufflevector <32 x i32> %interleaved.vec, <32 x i32> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
%v1 = shufflevector <32 x i32> %interleaved.vec, <32 x i32> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
%v2 = shufflevector <32 x i32> %interleaved.vec, <32 x i32> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
%v3 = shufflevector <32 x i32> %interleaved.vec, <32 x i32> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
ret void
}
define void @store_factor2_wide(<16 x i32>* %ptr, <8 x i32> %v0, <8 x i32> %v1) {
; NEON-LABEL: @store_factor2_wide(
; NEON-NEXT: [[TMP1:%.*]] = bitcast <16 x i32>* %ptr to i32*
; NEON-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> %v0, <8 x i32> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; NEON-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> %v0, <8 x i32> %v1, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
; NEON-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP1]] to <4 x i32>*
; NEON-NEXT: call void @llvm.aarch64.neon.st2.v4i32.p0v4i32(<4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32>* [[TMP4]])
; NEON-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> %v0, <8 x i32> %v1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
; NEON-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> %v0, <8 x i32> %v1, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
; NEON-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP1]], i32 8
; NEON-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <4 x i32>*
; NEON-NEXT: call void @llvm.aarch64.neon.st2.v4i32.p0v4i32(<4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <4 x i32>* [[TMP8]])
; NEON-NEXT: ret void
; NO_NEON-LABEL: @store_factor2_wide(
; NO_NEON: ret void
;
%interleaved.vec = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
store <16 x i32> %interleaved.vec, <16 x i32>* %ptr, align 4
ret void
}
define void @store_factor3_wide(<24 x i32>* %ptr, <8 x i32> %v0, <8 x i32> %v1, <8 x i32> %v2) {
; NEON-LABEL: @store_factor3_wide(
; NEON: [[TMP1:%.*]] = bitcast <24 x i32>* %ptr to i32*
; NEON-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; NEON-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
; NEON-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
; NEON-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP1]] to <4 x i32>*
; NEON-NEXT: call void @llvm.aarch64.neon.st3.v4i32.p0v4i32(<4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]])
; NEON-NEXT: [[TMP6:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
; NEON-NEXT: [[TMP7:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
; NEON-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
; NEON-NEXT: [[TMP9:%.*]] = getelementptr i32, i32* [[TMP1]], i32 12
; NEON-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>*
; NEON-NEXT: call void @llvm.aarch64.neon.st3.v4i32.p0v4i32(<4 x i32> [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> [[TMP8]], <4 x i32>* [[TMP10]])
; NEON-NEXT: ret void
; NO_NEON-LABEL: @store_factor3_wide(
; NO_NEON: ret void
;
%s0 = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%s1 = shufflevector <8 x i32> %v2, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%interleaved.vec = shufflevector <16 x i32> %s0, <16 x i32> %s1, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
store <24 x i32> %interleaved.vec, <24 x i32>* %ptr, align 4
ret void
}
define void @store_factor4_wide(<32 x i32>* %ptr, <8 x i32> %v0, <8 x i32> %v1, <8 x i32> %v2, <8 x i32> %v3) {
; NEON-LABEL: @store_factor4_wide(
; NEON: [[TMP1:%.*]] = bitcast <32 x i32>* %ptr to i32*
; NEON-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; NEON-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
; NEON-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
; NEON-NEXT: [[TMP5:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
; NEON-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP1]] to <4 x i32>*
; NEON-NEXT: call void @llvm.aarch64.neon.st4.v4i32.p0v4i32(<4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]])
; NEON-NEXT: [[TMP7:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
; NEON-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
; NEON-NEXT: [[TMP9:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
; NEON-NEXT: [[TMP10:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
; NEON-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP1]], i32 16
; NEON-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
; NEON-NEXT: call void @llvm.aarch64.neon.st4.v4i32.p0v4i32(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]])
; NEON-NEXT: ret void
; NO_NEON-LABEL: @store_factor4_wide(
; NO_NEON-NOT: @llvm.aarch64.neon
; NO_NEON: ret void
;
%s0 = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%s1 = shufflevector <8 x i32> %v2, <8 x i32> %v3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%interleaved.vec = shufflevector <16 x i32> %s0, <16 x i32> %s1, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
store <32 x i32> %interleaved.vec, <32 x i32>* %ptr, align 4
ret void
}


@@ -646,3 +646,200 @@ define void @no_interleave(<4 x float> %a0) {
  store <4 x float> %v0, <4 x float>* @g, align 16
  ret void
}
define void @load_factor2_wide2(<16 x i32>* %ptr) {
; NEON-LABEL: @load_factor2_wide2(
; NEON-NEXT: [[TMP1:%.*]] = bitcast <16 x i32>* %ptr to i32*
; NEON-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP1]] to i8*
; NEON-NEXT: [[VLDN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8(i8* [[TMP2]], i32 4)
; NEON-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 1
; NEON-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 0
; NEON-NEXT: [[TMP5:%.*]] = getelementptr i32, i32* [[TMP1]], i32 8
; NEON-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to i8*
; NEON-NEXT: [[VLDN1:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8(i8* [[TMP6]], i32 4)
; NEON-NEXT: [[TMP7:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN1]], 1
; NEON-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN1]], 0
; NEON-NEXT: [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; NEON-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; NEON-NEXT: ret void
; NO_NEON-LABEL: @load_factor2_wide2(
; NO_NEON-NOT: @llvm.arm.neon
; NO_NEON: ret void
;
%interleaved.vec = load <16 x i32>, <16 x i32>* %ptr, align 4
%v0 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
%v1 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
ret void
}
define void @load_factor2_wide3(<24 x i32>* %ptr) {
; NEON-LABEL: @load_factor2_wide3(
; NEON-NEXT: [[TMP1:%.*]] = bitcast <24 x i32>* [[PTR:%.*]] to i32*
; NEON-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP1]] to i8*
; NEON-NEXT: [[VLDN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8(i8* [[TMP2]], i32 4)
; NEON-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 1
; NEON-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 0
; NEON-NEXT: [[TMP5:%.*]] = getelementptr i32, i32* [[TMP1]], i32 8
; NEON-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to i8*
; NEON-NEXT: [[VLDN1:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8(i8* [[TMP6]], i32 4)
; NEON-NEXT: [[TMP7:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN1]], 1
; NEON-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN1]], 0
; NEON-NEXT: [[TMP9:%.*]] = getelementptr i32, i32* [[TMP5]], i32 8
; NEON-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to i8*
; NEON-NEXT: [[VLDN2:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8(i8* [[TMP10]], i32 4)
; NEON-NEXT: [[TMP11:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN2]], 1
; NEON-NEXT: [[TMP12:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN2]], 0
; NEON-NEXT: [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; NEON-NEXT: [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
; NEON-NEXT: [[TMP15:%.*]] = shufflevector <8 x i32> [[TMP13]], <8 x i32> [[TMP14]], <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
; NEON-NEXT: [[TMP16:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; NEON-NEXT: [[TMP17:%.*]] = shufflevector <4 x i32> [[TMP12]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
; NEON-NEXT: [[TMP18:%.*]] = shufflevector <8 x i32> [[TMP16]], <8 x i32> [[TMP17]], <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
; NEON-NEXT: ret void
; NO_NEON-LABEL: @load_factor2_wide3(
; NO_NEON-NOT: @llvm.arm.neon
; NO_NEON: ret void
;
%interleaved.vec = load <24 x i32>, <24 x i32>* %ptr, align 4
%v0 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> undef, <12 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22>
%v1 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> undef, <12 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23>
ret void
}
define void @load_factor3_wide(<24 x i32>* %ptr) {
; NEON-LABEL: @load_factor3_wide(
; NEON-NEXT: [[TMP1:%.*]] = bitcast <24 x i32>* %ptr to i32*
; NEON-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP1]] to i8*
; NEON-NEXT: [[VLDN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3.v4i32.p0i8(i8* [[TMP2]], i32 4)
; NEON-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 2
; NEON-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 1
; NEON-NEXT: [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 0
; NEON-NEXT: [[TMP6:%.*]] = getelementptr i32, i32* [[TMP1]], i32 12
; NEON-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to i8*
; NEON-NEXT: [[VLDN1:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3.v4i32.p0i8(i8* [[TMP7]], i32 4)
; NEON-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN1]], 2
; NEON-NEXT: [[TMP9:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN1]], 1
; NEON-NEXT: [[TMP10:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN1]], 0
; NEON-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; NEON-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; NEON-NEXT: [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP10]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; NEON-NEXT: ret void
; NO_NEON-LABEL: @load_factor3_wide(
; NO_NEON-NOT: @llvm.arm.neon
; NO_NEON: ret void
;
%interleaved.vec = load <24 x i32>, <24 x i32>* %ptr, align 4
%v0 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
%v1 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
%v2 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
ret void
}
define void @load_factor4_wide(<32 x i32>* %ptr) {
; NEON-LABEL: @load_factor4_wide(
; NEON-NEXT: [[TMP1:%.*]] = bitcast <32 x i32>* %ptr to i32*
; NEON-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP1]] to i8*
; NEON-NEXT: [[VLDN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4.v4i32.p0i8(i8* [[TMP2]], i32 4)
; NEON-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 3
; NEON-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 2
; NEON-NEXT: [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 1
; NEON-NEXT: [[TMP6:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 0
; NEON-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP1]], i32 16
; NEON-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8*
; NEON-NEXT: [[VLDN1:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4.v4i32.p0i8(i8* [[TMP8]], i32 4)
; NEON-NEXT: [[TMP9:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN1]], 3
; NEON-NEXT: [[TMP10:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN1]], 2
; NEON-NEXT: [[TMP11:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN1]], 1
; NEON-NEXT: [[TMP12:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN1]], 0
; NEON-NEXT: [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; NEON-NEXT: [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP10]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; NEON-NEXT: [[TMP15:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP11]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; NEON-NEXT: [[TMP16:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> [[TMP12]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; NEON-NEXT: ret void
; NO_NEON-LABEL: @load_factor4_wide(
; NO_NEON-NOT: @llvm.arm.neon
; NO_NEON: ret void
;
%interleaved.vec = load <32 x i32>, <32 x i32>* %ptr, align 4
%v0 = shufflevector <32 x i32> %interleaved.vec, <32 x i32> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
%v1 = shufflevector <32 x i32> %interleaved.vec, <32 x i32> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
%v2 = shufflevector <32 x i32> %interleaved.vec, <32 x i32> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
%v3 = shufflevector <32 x i32> %interleaved.vec, <32 x i32> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
ret void
}
define void @store_factor2_wide(<16 x i32>* %ptr, <8 x i32> %v0, <8 x i32> %v1) {
; NEON-LABEL: @store_factor2_wide(
; NEON-NEXT: [[TMP1:%.*]] = bitcast <16 x i32>* %ptr to i32*
; NEON-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP1]] to i8*
; NEON-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> %v0, <8 x i32> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; NEON-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> %v0, <8 x i32> %v1, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
; NEON-NEXT: call void @llvm.arm.neon.vst2.p0i8.v4i32(i8* [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 4)
; NEON-NEXT: [[TMP5:%.*]] = getelementptr i32, i32* [[TMP1]], i32 8
; NEON-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to i8*
; NEON-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> %v0, <8 x i32> %v1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
; NEON-NEXT: [[TMP8:%.*]] = shufflevector <8 x i32> %v0, <8 x i32> %v1, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
; NEON-NEXT: call void @llvm.arm.neon.vst2.p0i8.v4i32(i8* [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> [[TMP8]], i32 4)
; NEON-NEXT: ret void
; NO_NEON-LABEL: @store_factor2_wide(
; NO_NEON-NOT: @llvm.arm.neon
; NO_NEON: ret void
;
%interleaved.vec = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
store <16 x i32> %interleaved.vec, <16 x i32>* %ptr, align 4
ret void
}
define void @store_factor3_wide(<24 x i32>* %ptr, <8 x i32> %v0, <8 x i32> %v1, <8 x i32> %v2) {
; NEON-LABEL: @store_factor3_wide(
; NEON: [[TMP1:%.*]] = bitcast <24 x i32>* %ptr to i32*
; NEON-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP1]] to i8*
; NEON-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; NEON-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
; NEON-NEXT: [[TMP5:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
; NEON-NEXT: call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], i32 4)
; NEON-NEXT: [[TMP6:%.*]] = getelementptr i32, i32* [[TMP1]], i32 12
; NEON-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to i8*
; NEON-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
; NEON-NEXT: [[TMP9:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
; NEON-NEXT: [[TMP10:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
; NEON-NEXT: call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP7]], <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> [[TMP10]], i32 4)
; NEON-NEXT: ret void
; NO_NEON-LABEL: @store_factor3_wide(
; NO_NEON-NOT: @llvm.arm.neon
; NO_NEON: ret void
;
%s0 = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%s1 = shufflevector <8 x i32> %v2, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%interleaved.vec = shufflevector <16 x i32> %s0, <16 x i32> %s1, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
store <24 x i32> %interleaved.vec, <24 x i32>* %ptr, align 4
ret void
}
define void @store_factor4_wide(<32 x i32>* %ptr, <8 x i32> %v0, <8 x i32> %v1, <8 x i32> %v2, <8 x i32> %v3) {
; NEON-LABEL: @store_factor4_wide(
; NEON: [[TMP1:%.*]] = bitcast <32 x i32>* %ptr to i32*
; NEON-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP1]] to i8*
; NEON-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; NEON-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
; NEON-NEXT: [[TMP5:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
; NEON-NEXT: [[TMP6:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
; NEON-NEXT: call void @llvm.arm.neon.vst4.p0i8.v4i32(i8* [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], i32 4)
; NEON-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP1]], i32 16
; NEON-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8*
; NEON-NEXT: [[TMP9:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
; NEON-NEXT: [[TMP10:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
; NEON-NEXT: [[TMP11:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
; NEON-NEXT: [[TMP12:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
; NEON-NEXT: call void @llvm.arm.neon.vst4.p0i8.v4i32(i8* [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], i32 4)
; NEON-NEXT: ret void
; NO_NEON-LABEL: @store_factor4_wide(
; NO_NEON-NOT: @llvm.arm.neon
; NO_NEON: ret void
;
%s0 = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%s1 = shufflevector <8 x i32> %v2, <8 x i32> %v3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%interleaved.vec = shufflevector <16 x i32> %s0, <16 x i32> %s1, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
store <32 x i32> %interleaved.vec, <32 x i32>* %ptr, align 4
ret void
}