Implemented cost model for masked gather and scatter operations

The cost is calculated for all X86 targets. When gather/scatter instruction
is not supported we calculate the cost of scalar sequence.

Differential revision: http://reviews.llvm.org/D15677



git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@256519 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Elena Demikhovsky 2015-12-28 20:10:59 +00:00
parent d0dc794073
commit 84f6badccc
8 changed files with 449 additions and 6 deletions

View File

@ -458,6 +458,16 @@ public:
int getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
unsigned AddressSpace) const;
/// \return The cost of Gather or Scatter operation
/// \p Opcode - is a type of memory access Load or Store
/// \p DataTy - a vector type of the data to be loaded or stored
/// \p Ptr - pointer [or vector of pointers] - address[es] in memory
/// \p VariableMask - true when the memory access is predicated with a mask
/// that is not a compile-time constant
/// \p Alignment - alignment of single element
int getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr,
bool VariableMask, unsigned Alignment) const;
/// \return The cost of the interleaved memory operation.
/// \p Opcode is the memory operation code
/// \p VecTy is the vector type of the interleaved access.
@ -485,10 +495,14 @@ public:
/// ((v0+v2), (v1+v3), undef, undef)
int getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm) const;
/// \returns The cost of Intrinsic instructions.
/// \returns The cost of Intrinsic instructions. Types analysis only.
int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
ArrayRef<Type *> Tys) const;
/// \returns The cost of Intrinsic instructions. Analyses the real arguments.
int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
ArrayRef<Value *> Args) const;
/// \returns The cost of Call instructions.
int getCallInstrCost(Function *F, Type *RetTy, ArrayRef<Type *> Tys) const;
@ -614,6 +628,9 @@ public:
virtual int getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
unsigned Alignment,
unsigned AddressSpace) = 0;
virtual int getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
Value *Ptr, bool VariableMask,
unsigned Alignment) = 0;
virtual int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
unsigned Factor,
ArrayRef<unsigned> Indices,
@ -623,6 +640,8 @@ public:
bool IsPairwiseForm) = 0;
virtual int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
ArrayRef<Type *> Tys) = 0;
virtual int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
ArrayRef<Value *> Args) = 0;
virtual int getCallInstrCost(Function *F, Type *RetTy,
ArrayRef<Type *> Tys) = 0;
virtual unsigned getNumberOfParts(Type *Tp) = 0;
@ -791,6 +810,12 @@ public:
unsigned AddressSpace) override {
return Impl.getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace);
}
int getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
Value *Ptr, bool VariableMask,
unsigned Alignment) override {
return Impl.getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
Alignment);
}
int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor,
ArrayRef<unsigned> Indices, unsigned Alignment,
unsigned AddressSpace) override {
@ -805,6 +830,10 @@ public:
ArrayRef<Type *> Tys) override {
return Impl.getIntrinsicInstrCost(ID, RetTy, Tys);
}
int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
ArrayRef<Value *> Args) override {
return Impl.getIntrinsicInstrCost(ID, RetTy, Args);
}
int getCallInstrCost(Function *F, Type *RetTy,
ArrayRef<Type *> Tys) override {
return Impl.getCallInstrCost(F, RetTy, Tys);

View File

@ -301,6 +301,12 @@ public:
return 1;
}
unsigned getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr,
bool VariableMask,
unsigned Alignment) {
return 1;
}
unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
unsigned Factor,
ArrayRef<unsigned> Indices,
@ -313,6 +319,10 @@ public:
ArrayRef<Type *> Tys) {
return 1;
}
unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
ArrayRef<Value *> Args) {
return 1;
}
unsigned getCallInstrCost(Function *F, Type *RetTy, ArrayRef<Type *> Tys) {
return 1;

View File

@ -580,6 +580,39 @@ public:
return Cost;
}
/// Get intrinsic cost based on arguments
unsigned getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
ArrayRef<Value *> Args) {
switch (IID) {
default: {
SmallVector<Type *, 4> Types;
for (Value *Op : Args)
Types.push_back(Op->getType());
return getIntrinsicInstrCost(IID, RetTy, Types);
}
case Intrinsic::masked_scatter: {
Value *Mask = Args[3];
bool VarMask = !isa<Constant>(Mask);
unsigned Alignment = cast<ConstantInt>(Args[2])->getZExtValue();
return
static_cast<T *>(this)->getGatherScatterOpCost(Instruction::Store,
Args[0]->getType(),
Args[1], VarMask,
Alignment);
}
case Intrinsic::masked_gather: {
Value *Mask = Args[2];
bool VarMask = !isa<Constant>(Mask);
unsigned Alignment = cast<ConstantInt>(Args[1])->getZExtValue();
return
static_cast<T *>(this)->getGatherScatterOpCost(Instruction::Load,
RetTy, Args[0], VarMask,
Alignment);
}
}
}
/// Get intrinsic cost based on argument types
unsigned getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
ArrayRef<Type *> Tys) {
unsigned ISD = 0;

View File

@ -500,12 +500,12 @@ unsigned CostModelAnalysis::getInstructionCost(const Instruction *I) const {
}
case Instruction::Call:
if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
SmallVector<Type*, 4> Tys;
SmallVector<Value *, 4> Args;
for (unsigned J = 0, JE = II->getNumArgOperands(); J != JE; ++J)
Tys.push_back(II->getArgOperand(J)->getType());
Args.push_back(II->getArgOperand(J));
return TTI->getIntrinsicInstrCost(II->getIntrinsicID(), II->getType(),
Tys);
Args);
}
return -1;
default:

View File

@ -280,6 +280,15 @@ int TargetTransformInfo::getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
return Cost;
}
int TargetTransformInfo::getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
Value *Ptr, bool VariableMask,
unsigned Alignment) const {
int Cost = TTIImpl->getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
Alignment);
assert(Cost >= 0 && "TTI should not produce negative costs!");
return Cost;
}
int TargetTransformInfo::getInterleavedMemoryOpCost(
unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
unsigned Alignment, unsigned AddressSpace) const {
@ -296,6 +305,13 @@ int TargetTransformInfo::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
return Cost;
}
int TargetTransformInfo::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
ArrayRef<Value *> Args) const {
int Cost = TTIImpl->getIntrinsicInstrCost(ID, RetTy, Args);
assert(Cost >= 0 && "TTI should not produce negative costs!");
return Cost;
}
int TargetTransformInfo::getCallInstrCost(Function *F, Type *RetTy,
ArrayRef<Type *> Tys) const {
int Cost = TTIImpl->getCallInstrCost(F, RetTy, Tys);

View File

@ -1297,6 +1297,142 @@ int X86TTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
return X86TTIImpl::getIntImmCost(Imm, Ty);
}
// Return an average cost of Gather / Scatter instruction, maybe improved later
int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, Value *Ptr,
unsigned Alignment, unsigned AddressSpace) {
assert(isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost");
unsigned VF = SrcVTy->getVectorNumElements();
// Try to reduce index size from 64 bit (default for GEP)
// to 32. It is essential for VF 16. If the index can't be reduced to 32, the
// operation will use 16 x 64 indices which do not fit in a zmm and needs
// to split. Also check that the base pointer is the same for all lanes,
// and that there's at most one variable index.
auto getIndexSizeInBits = [](Value *Ptr, const DataLayout& DL) {
unsigned IndexSize = DL.getPointerSizeInBits();
GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
if (IndexSize < 64 || !GEP)
return IndexSize;
unsigned NumOfVarIndices = 0;
Value *Ptrs = GEP->getPointerOperand();
if (Ptrs->getType()->isVectorTy() && !getSplatValue(Ptrs))
return IndexSize;
for (unsigned i = 1; i < GEP->getNumOperands(); ++i) {
if (isa<Constant>(GEP->getOperand(i)))
continue;
Type *IndxTy = GEP->getOperand(i)->getType();
if (IndxTy->isVectorTy())
IndxTy = IndxTy->getVectorElementType();
if ((IndxTy->getPrimitiveSizeInBits() == 64 &&
!isa<SExtInst>(GEP->getOperand(i))) ||
++NumOfVarIndices > 1)
return IndexSize; // 64
}
return (unsigned)32;
};
// Trying to reduce IndexSize to 32 bits for vector 16.
// By default the IndexSize is equal to pointer size.
unsigned IndexSize = (VF >= 16) ? getIndexSizeInBits(Ptr, DL) :
DL.getPointerSizeInBits();
Type *IndexVTy = VectorType::get(IntegerType::get(getGlobalContext(),
IndexSize), VF);
std::pair<int, MVT> IdxsLT = TLI->getTypeLegalizationCost(DL, IndexVTy);
std::pair<int, MVT> SrcLT = TLI->getTypeLegalizationCost(DL, SrcVTy);
int SplitFactor = std::max(IdxsLT.first, SrcLT.first);
if (SplitFactor > 1) {
// Handle splitting of vector of pointers
Type *SplitSrcTy = VectorType::get(SrcVTy->getScalarType(), VF / SplitFactor);
return SplitFactor * getGSVectorCost(Opcode, SplitSrcTy, Ptr, Alignment,
AddressSpace);
}
// The gather / scatter cost is given by Intel architects. It is a rough
// number since we are looking at one instruction in a time.
const int GSOverhead = 2;
return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
Alignment, AddressSpace);
}
/// Return the cost of full scalarization of gather / scatter operation.
///
/// Opcode - Load or Store instruction.
/// SrcVTy - The type of the data vector that should be gathered or scattered.
/// VariableMask - The mask is non-constant at compile time.
/// Alignment - Alignment for one element.
/// AddressSpace - pointer[s] address space.
///
int X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy,
bool VariableMask, unsigned Alignment,
unsigned AddressSpace) {
unsigned VF = SrcVTy->getVectorNumElements();
int MaskUnpackCost = 0;
if (VariableMask) {
VectorType *MaskTy =
VectorType::get(Type::getInt1Ty(getGlobalContext()), VF);
MaskUnpackCost = getScalarizationOverhead(MaskTy, false, true);
int ScalarCompareCost =
getCmpSelInstrCost(Instruction::ICmp, Type::getInt1Ty(getGlobalContext()),
nullptr);
int BranchCost = getCFInstrCost(Instruction::Br);
MaskUnpackCost += VF * (BranchCost + ScalarCompareCost);
}
// The cost of the scalar loads/stores.
int MemoryOpCost = VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
Alignment, AddressSpace);
int InsertExtractCost = 0;
if (Opcode == Instruction::Load)
for (unsigned i = 0; i < VF; ++i)
// Add the cost of inserting each scalar load into the vector
InsertExtractCost +=
getVectorInstrCost(Instruction::InsertElement, SrcVTy, i);
else
for (unsigned i = 0; i < VF; ++i)
// Add the cost of extracting each element out of the data vector
InsertExtractCost +=
getVectorInstrCost(Instruction::ExtractElement, SrcVTy, i);
return MemoryOpCost + MaskUnpackCost + InsertExtractCost;
}
/// Calculate the cost of Gather / Scatter operation
int X86TTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *SrcVTy,
Value *Ptr, bool VariableMask,
unsigned Alignment) {
assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter");
unsigned VF = SrcVTy->getVectorNumElements();
PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
if (!PtrTy && Ptr->getType()->isVectorTy())
PtrTy = dyn_cast<PointerType>(Ptr->getType()->getVectorElementType());
assert(PtrTy && "Unexpected type for Ptr argument");
unsigned AddressSpace = PtrTy->getAddressSpace();
bool Scalarize = false;
if ((Opcode == Instruction::Load && !isLegalMaskedGather(SrcVTy)) ||
(Opcode == Instruction::Store && !isLegalMaskedScatter(SrcVTy)))
Scalarize = true;
// Gather / Scatter for vector 2 is not profitable on KNL / SKX
// Vector-4 of gather/scatter instruction does not exist on KNL.
// We can extend it to 8 elements, but zeroing upper bits of
// the mask vector will add more instructions. Right now we give the scalar
// cost of vector-4 for KNL. TODO: Check, maybe the gather/scatter instruction is
// better in the VariableMask case.
if (VF == 2 || (VF == 4 && !ST->hasVLX()))
Scalarize = true;
if (Scalarize)
return getGSScalarCost(Opcode, SrcVTy, VariableMask, Alignment, AddressSpace);
return getGSVectorCost(Opcode, SrcVTy, Ptr, Alignment, AddressSpace);
}
bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy) {
Type *ScalarTy = DataTy->getScalarType();
int DataWidth = isa<PointerType>(ScalarTy) ?

View File

@ -76,7 +76,8 @@ public:
unsigned AddressSpace);
int getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
unsigned AddressSpace);
int getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr,
bool VariableMask, unsigned Alignment);
int getAddressComputationCost(Type *PtrTy, bool IsComplex);
int getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm);
@ -94,6 +95,11 @@ public:
bool isLegalMaskedScatter(Type *DataType);
bool areInlineCompatible(const Function *Caller,
const Function *Callee) const;
private:
int getGSScalarCost(unsigned Opcode, Type *DataTy, bool VariableMask,
unsigned Alignment, unsigned AddressSpace);
int getGSVectorCost(unsigned Opcode, Type *DataTy, Value *Ptr,
unsigned Alignment, unsigned AddressSpace);
/// @}
};

View File

@ -1,4 +1,6 @@
; RUN: opt -S -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -cost-model -analyze < %s | FileCheck %s -check-prefix=AVX2
; RUN: opt -S -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -cost-model -analyze < %s | FileCheck %s --check-prefix=AVX2
; RUN: opt -S -mtriple=x86_64-apple-darwin -mcpu=knl -cost-model -analyze < %s | FileCheck %s --check-prefix=KNL
; RUN: opt -S -mtriple=x86_64-apple-darwin -mcpu=skx -cost-model -analyze < %s | FileCheck %s --check-prefix=SKX
; AVX2-LABEL: test1
@ -65,6 +67,217 @@ define <2 x i32> @test8(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %dst) {
ret <2 x i32> %res
}
define <2 x double> @test_gather_2f64(<2 x double*> %ptrs, <2 x i1> %mask, <2 x double> %src0) {
; AVX2-LABEL: test_gather_2f64
; AVX2: Found an estimated cost of 7 {{.*}}.gather
; KNL-LABEL: test_gather_2f64
; KNL: Found an estimated cost of 7 {{.*}}.gather
; SKX-LABEL: test_gather_2f64
; SKX: Found an estimated cost of 7 {{.*}}.gather
%res = call <2 x double> @llvm.masked.gather.v2f64(<2 x double*> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
ret <2 x double> %res
}
declare <2 x double> @llvm.masked.gather.v2f64(<2 x double*> %ptrs, i32, <2 x i1> %mask, <2 x double> %src0)
define <4 x i32> @test_gather_4i32(<4 x i32*> %ptrs, <4 x i1> %mask, <4 x i32> %src0) {
; AVX2-LABEL: test_gather_4i32
; AVX2: Found an estimated cost of 16 {{.*}}.gather
; KNL-LABEL: test_gather_4i32
; KNL: Found an estimated cost of 16 {{.*}}.gather
; SKX-LABEL: test_gather_4i32
; SKX: Found an estimated cost of 6 {{.*}}.gather
%res = call <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
ret <4 x i32> %res
}
define <4 x i32> @test_gather_4i32_const_mask(<4 x i32*> %ptrs, <4 x i32> %src0) {
; AVX2-LABEL: test_gather_4i32_const_mask
; AVX2: Found an estimated cost of 8 {{.*}}.gather
; KNL-LABEL: test_gather_4i32_const_mask
; KNL: Found an estimated cost of 8 {{.*}}.gather
; SKX-LABEL: test_gather_4i32_const_mask
; SKX: Found an estimated cost of 6 {{.*}}.gather
%res = call <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
ret <4 x i32> %res
}
declare <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*> %ptrs, i32, <4 x i1> %mask, <4 x i32> %src0)
define <16 x float> @test_gather_16f32_const_mask(float* %base, <16 x i32> %ind) {
; AVX2-LABEL: test_gather_16f32_const_mask
; AVX2: Found an estimated cost of 30 {{.*}}.gather
; KNL-LABEL: test_gather_16f32_const_mask
; KNL: Found an estimated cost of 18 {{.*}}.gather
; SKX-LABEL: test_gather_16f32_const_mask
; SKX: Found an estimated cost of 18 {{.*}}.gather
%sext_ind = sext <16 x i32> %ind to <16 x i64>
%gep.v = getelementptr float, float* %base, <16 x i64> %sext_ind
%res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
ret <16 x float>%res
}
define <16 x float> @test_gather_16f32_var_mask(float* %base, <16 x i32> %ind, <16 x i1>%mask) {
; AVX2-LABEL: test_gather_16f32_var_mask
; AVX2: Found an estimated cost of 62 {{.*}}.gather
; KNL-LABEL: test_gather_16f32_var_mask
; KNL: Found an estimated cost of 18 {{.*}}.gather
; SKX-LABEL: test_gather_16f32_var_mask
; SKX: Found an estimated cost of 18 {{.*}}.gather
%sext_ind = sext <16 x i32> %ind to <16 x i64>
%gep.v = getelementptr float, float* %base, <16 x i64> %sext_ind
%res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
ret <16 x float>%res
}
define <16 x float> @test_gather_16f32_ra_var_mask(<16 x float*> %ptrs, <16 x i32> %ind, <16 x i1>%mask) {
; AVX2-LABEL: test_gather_16f32_ra_var_mask
; AVX2: Found an estimated cost of 62 {{.*}}.gather
; KNL-LABEL: test_gather_16f32_ra_var_mask
; KNL: Found an estimated cost of 20 {{.*}}.gather
; SKX-LABEL: test_gather_16f32_ra_var_mask
; SKX: Found an estimated cost of 20 {{.*}}.gather
%sext_ind = sext <16 x i32> %ind to <16 x i64>
%gep.v = getelementptr float, <16 x float*> %ptrs, <16 x i64> %sext_ind
%res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
ret <16 x float>%res
}
define <16 x float> @test_gather_16f32_const_mask2(float* %base, <16 x i32> %ind) {
; AVX2-LABEL: test_gather_16f32_const_mask2
; AVX2: Found an estimated cost of 30 {{.*}}.gather
; KNL-LABEL: test_gather_16f32_const_mask2
; KNL: Found an estimated cost of 18 {{.*}}.gather
; SKX-LABEL: test_gather_16f32_const_mask2
; SKX: Found an estimated cost of 18 {{.*}}.gather
%broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
%broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
%sext_ind = sext <16 x i32> %ind to <16 x i64>
%gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind
%res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
ret <16 x float>%res
}
define void @test_scatter_16i32(i32* %base, <16 x i32> %ind, i16 %mask, <16 x i32>%val) {
; AVX2-LABEL: test_scatter_16i32
; AVX2: Found an estimated cost of 64 {{.*}}.scatter
; KNL-LABEL: test_scatter_16i32
; KNL: Found an estimated cost of 18 {{.*}}.scatter
; SKX-LABEL: test_scatter_16i32
; SKX: Found an estimated cost of 18 {{.*}}.scatter
%broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0
%broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer
%gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i32> %ind
%imask = bitcast i16 %mask to <16 x i1>
call void @llvm.masked.scatter.v16i32(<16 x i32>%val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask)
ret void
}
define void @test_scatter_8i32(<8 x i32>%a1, <8 x i32*> %ptr, <8 x i1>%mask) {
; AVX2-LABEL: test_scatter_8i32
; AVX2: Found an estimated cost of 32 {{.*}}.scatter
; KNL-LABEL: test_scatter_8i32
; KNL: Found an estimated cost of 10 {{.*}}.scatter
; SKX-LABEL: test_scatter_8i32
; SKX: Found an estimated cost of 10 {{.*}}.scatter
call void @llvm.masked.scatter.v8i32(<8 x i32> %a1, <8 x i32*> %ptr, i32 4, <8 x i1> %mask)
ret void
}
declare void @llvm.masked.scatter.v8i32(<8 x i32> %a1, <8 x i32*> %ptr, i32, <8 x i1> %mask)
define void @test_scatter_4i32(<4 x i32>%a1, <4 x i32*> %ptr, <4 x i1>%mask) {
; AVX2-LABEL: test_scatter_4i32
; AVX2: Found an estimated cost of 16 {{.*}}.scatter
; KNL-LABEL: test_scatter_4i32
; KNL: Found an estimated cost of 16 {{.*}}.scatter
; SKX-LABEL: test_scatter_4i32
; SKX: Found an estimated cost of 6 {{.*}}.scatter
call void @llvm.masked.scatter.v4i32(<4 x i32> %a1, <4 x i32*> %ptr, i32 4, <4 x i1> %mask)
ret void
}
define <4 x float> @test_gather_4f32(float* %ptr, <4 x i32> %ind, <4 x i1>%mask) {
; AVX2-LABEL: test_gather_4f32
; AVX2: Found an estimated cost of 15 {{.*}}.gather
; KNL-LABEL: test_gather_4f32
; KNL: Found an estimated cost of 15 {{.*}}.gather
; SKX-LABEL: test_gather_4f32
; SKX: Found an estimated cost of 6 {{.*}}.gather
%sext_ind = sext <4 x i32> %ind to <4 x i64>
%gep.v = getelementptr float, float* %ptr, <4 x i64> %sext_ind
%res = call <4 x float> @llvm.masked.gather.v4f32(<4 x float*> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
ret <4 x float>%res
}
define <4 x float> @test_gather_4f32_const_mask(float* %ptr, <4 x i32> %ind) {
; AVX2-LABEL: test_gather_4f32_const_mask
; AVX2: Found an estimated cost of 7 {{.*}}.gather
; KNL-LABEL: test_gather_4f32_const_mask
; KNL: Found an estimated cost of 7 {{.*}}.gather
; SKX-LABEL: test_gather_4f32_const_mask
; SKX: Found an estimated cost of 6 {{.*}}.gather
%sext_ind = sext <4 x i32> %ind to <4 x i64>
%gep.v = getelementptr float, float* %ptr, <4 x i64> %sext_ind
%res = call <4 x float> @llvm.masked.gather.v4f32(<4 x float*> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
ret <4 x float>%res
}
declare <4 x float> @llvm.masked.gather.v4f32(<4 x float*> %gep.v, i32, <4 x i1> %mask, <4 x float> )
declare void @llvm.masked.scatter.v4i32(<4 x i32> %a1, <4 x i32*> %ptr, i32, <4 x i1> %mask)
declare void @llvm.masked.scatter.v16i32(<16 x i32>%val, <16 x i32*> %gep.random, i32, <16 x i1> %imask)
declare <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.v, i32, <16 x i1> %mask, <16 x float>)
declare <16 x i32> @llvm.masked.load.v16i32(<16 x i32>*, i32, <16 x i1>, <16 x i32>)
declare <4 x i32> @llvm.masked.load.v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)