Implemented cost model for masked gather and scatter operations
The cost is calculated for all X86 targets. When the gather/scatter
instruction is not supported, we calculate the cost of the equivalent
scalar sequence instead.

Differential revision: http://reviews.llvm.org/D15677

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@256519 91177308-0d34-0410-b5e6-96231b3b80d8
parent d0dc794073
commit 84f6badccc
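For orientation, here is a minimal sketch (not part of this commit) of how a client could query the new hook through TargetTransformInfo. The names TTI and Gather are assumptions: a valid TargetTransformInfo reference and an IntrinsicInst* for an @llvm.masked.gather call site.

    // Sketch only (not from this patch): ask the cost model about a masked
    // gather call site, mirroring the operand-collection idiom this commit
    // adds to CostModel.cpp. Assumes TargetTransformInfo &TTI and
    // IntrinsicInst *Gather are in scope.
    SmallVector<Value *, 4> Args;
    for (unsigned J = 0, JE = Gather->getNumArgOperands(); J != JE; ++J)
      Args.push_back(Gather->getArgOperand(J));
    // The argument-aware overload added by this patch; for masked_gather it
    // forwards to getGatherScatterOpCost(Instruction::Load, ...).
    int Cost = TTI.getIntrinsicInstrCost(Gather->getIntrinsicID(),
                                         Gather->getType(), Args);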
@@ -458,6 +458,16 @@ public:
   int getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
                             unsigned AddressSpace) const;
 
+  /// \return The cost of Gather or Scatter operation
+  /// \p Opcode - is a type of memory access Load or Store
+  /// \p DataTy - a vector type of the data to be loaded or stored
+  /// \p Ptr - pointer [or vector of pointers] - address[es] in memory
+  /// \p VariableMask - true when the memory access is predicated with a mask
+  ///                   that is not a compile-time constant
+  /// \p Alignment - alignment of single element
+  int getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr,
+                             bool VariableMask, unsigned Alignment) const;
+
   /// \return The cost of the interleaved memory operation.
   /// \p Opcode is the memory operation code
   /// \p VecTy is the vector type of the interleaved access.
@@ -485,10 +495,14 @@ public:
   /// ((v0+v2), (v1+v3), undef, undef)
   int getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm) const;
 
-  /// \returns The cost of Intrinsic instructions.
+  /// \returns The cost of Intrinsic instructions. Types analysis only.
   int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
                             ArrayRef<Type *> Tys) const;
 
+  /// \returns The cost of Intrinsic instructions. Analyses the real arguments.
+  int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
+                            ArrayRef<Value *> Args) const;
+
   /// \returns The cost of Call instructions.
   int getCallInstrCost(Function *F, Type *RetTy, ArrayRef<Type *> Tys) const;
 
@@ -614,6 +628,9 @@ public:
   virtual int getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
                                     unsigned Alignment,
                                     unsigned AddressSpace) = 0;
+  virtual int getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
+                                     Value *Ptr, bool VariableMask,
+                                     unsigned Alignment) = 0;
   virtual int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
                                          unsigned Factor,
                                          ArrayRef<unsigned> Indices,
@@ -623,6 +640,8 @@ public:
                                  bool IsPairwiseForm) = 0;
   virtual int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
                                     ArrayRef<Type *> Tys) = 0;
+  virtual int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
+                                    ArrayRef<Value *> Args) = 0;
   virtual int getCallInstrCost(Function *F, Type *RetTy,
                                ArrayRef<Type *> Tys) = 0;
   virtual unsigned getNumberOfParts(Type *Tp) = 0;
@@ -791,6 +810,12 @@ public:
                             unsigned AddressSpace) override {
     return Impl.getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace);
   }
+  int getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
+                             Value *Ptr, bool VariableMask,
+                             unsigned Alignment) override {
+    return Impl.getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
+                                       Alignment);
+  }
   int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor,
                                  ArrayRef<unsigned> Indices, unsigned Alignment,
                                  unsigned AddressSpace) override {
@@ -805,6 +830,10 @@ public:
                             ArrayRef<Type *> Tys) override {
     return Impl.getIntrinsicInstrCost(ID, RetTy, Tys);
   }
+  int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
+                            ArrayRef<Value *> Args) override {
+    return Impl.getIntrinsicInstrCost(ID, RetTy, Args);
+  }
   int getCallInstrCost(Function *F, Type *RetTy,
                        ArrayRef<Type *> Tys) override {
     return Impl.getCallInstrCost(F, RetTy, Tys);
@@ -301,6 +301,12 @@ public:
     return 1;
   }
 
+  unsigned getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr,
+                                  bool VariableMask,
+                                  unsigned Alignment) {
+    return 1;
+  }
+
   unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
                                       unsigned Factor,
                                       ArrayRef<unsigned> Indices,
@@ -313,6 +319,10 @@ public:
                                  ArrayRef<Type *> Tys) {
     return 1;
   }
+  unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
+                                 ArrayRef<Value *> Args) {
+    return 1;
+  }
 
   unsigned getCallInstrCost(Function *F, Type *RetTy, ArrayRef<Type *> Tys) {
     return 1;
@@ -580,6 +580,39 @@ public:
     return Cost;
   }
 
+  /// Get intrinsic cost based on arguments
+  unsigned getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
+                                 ArrayRef<Value *> Args) {
+    switch (IID) {
+    default: {
+      SmallVector<Type *, 4> Types;
+      for (Value *Op : Args)
+        Types.push_back(Op->getType());
+      return getIntrinsicInstrCost(IID, RetTy, Types);
+    }
+    case Intrinsic::masked_scatter: {
+      Value *Mask = Args[3];
+      bool VarMask = !isa<Constant>(Mask);
+      unsigned Alignment = cast<ConstantInt>(Args[2])->getZExtValue();
+      return
+        static_cast<T *>(this)->getGatherScatterOpCost(Instruction::Store,
+                                                       Args[0]->getType(),
+                                                       Args[1], VarMask,
+                                                       Alignment);
+    }
+    case Intrinsic::masked_gather: {
+      Value *Mask = Args[2];
+      bool VarMask = !isa<Constant>(Mask);
+      unsigned Alignment = cast<ConstantInt>(Args[1])->getZExtValue();
+      return
+        static_cast<T *>(this)->getGatherScatterOpCost(Instruction::Load,
+                                                       RetTy, Args[0], VarMask,
+                                                       Alignment);
+    }
+    }
+  }
+
+  /// Get intrinsic cost based on argument types
   unsigned getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
                                  ArrayRef<Type *> Tys) {
     unsigned ISD = 0;
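A note on the Args indexing above: it follows the masked intrinsic signatures as declared in the test file at the end of this diff. llvm.masked.gather takes (ptrs, alignment, mask, passthru), so the alignment is Args[1] and the mask Args[2]; llvm.masked.scatter takes (value, ptrs, alignment, mask), so the alignment is Args[2] and the mask Args[3]. A mask that is not a Constant sets VarMask, which lets the target charge extra for the predicated scalar sequence.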
@@ -500,12 +500,12 @@ unsigned CostModelAnalysis::getInstructionCost(const Instruction *I) const {
   }
   case Instruction::Call:
     if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
-      SmallVector<Type*, 4> Tys;
+      SmallVector<Value *, 4> Args;
       for (unsigned J = 0, JE = II->getNumArgOperands(); J != JE; ++J)
-        Tys.push_back(II->getArgOperand(J)->getType());
+        Args.push_back(II->getArgOperand(J));
 
       return TTI->getIntrinsicInstrCost(II->getIntrinsicID(), II->getType(),
-                                        Tys);
+                                        Args);
     }
     return -1;
   default:
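The switch from Tys to Args here is the point of the new overload: with only types, the target hook could not tell whether the mask is a compile-time constant, nor look through the pointer operand for a splat base or narrow indices — both of which the X86 implementation below relies on.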
@@ -280,6 +280,15 @@ int TargetTransformInfo::getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
   return Cost;
 }
 
+int TargetTransformInfo::getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
+                                                Value *Ptr, bool VariableMask,
+                                                unsigned Alignment) const {
+  int Cost = TTIImpl->getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
+                                             Alignment);
+  assert(Cost >= 0 && "TTI should not produce negative costs!");
+  return Cost;
+}
+
 int TargetTransformInfo::getInterleavedMemoryOpCost(
     unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
     unsigned Alignment, unsigned AddressSpace) const {
@@ -296,6 +305,13 @@ int TargetTransformInfo::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
   return Cost;
 }
 
+int TargetTransformInfo::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
+                                               ArrayRef<Value *> Args) const {
+  int Cost = TTIImpl->getIntrinsicInstrCost(ID, RetTy, Args);
+  assert(Cost >= 0 && "TTI should not produce negative costs!");
+  return Cost;
+}
+
 int TargetTransformInfo::getCallInstrCost(Function *F, Type *RetTy,
                                           ArrayRef<Type *> Tys) const {
   int Cost = TTIImpl->getCallInstrCost(F, RetTy, Tys);
@@ -1297,6 +1297,142 @@ int X86TTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
   return X86TTIImpl::getIntImmCost(Imm, Ty);
 }
 
+// Return an average cost of Gather / Scatter instruction, maybe improved later
+int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, Value *Ptr,
+                                unsigned Alignment, unsigned AddressSpace) {
+
+  assert(isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost");
+  unsigned VF = SrcVTy->getVectorNumElements();
+
+  // Try to reduce index size from 64 bit (default for GEP)
+  // to 32. It is essential for VF 16. If the index can't be reduced to 32, the
+  // operation will use 16 x 64 indices which do not fit in a zmm and needs
+  // to split. Also check that the base pointer is the same for all lanes,
+  // and that there's at most one variable index.
+  auto getIndexSizeInBits = [](Value *Ptr, const DataLayout& DL) {
+    unsigned IndexSize = DL.getPointerSizeInBits();
+    GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
+    if (IndexSize < 64 || !GEP)
+      return IndexSize;
+
+    unsigned NumOfVarIndices = 0;
+    Value *Ptrs = GEP->getPointerOperand();
+    if (Ptrs->getType()->isVectorTy() && !getSplatValue(Ptrs))
+      return IndexSize;
+    for (unsigned i = 1; i < GEP->getNumOperands(); ++i) {
+      if (isa<Constant>(GEP->getOperand(i)))
+        continue;
+      Type *IndxTy = GEP->getOperand(i)->getType();
+      if (IndxTy->isVectorTy())
+        IndxTy = IndxTy->getVectorElementType();
+      if ((IndxTy->getPrimitiveSizeInBits() == 64 &&
+           !isa<SExtInst>(GEP->getOperand(i))) ||
+          ++NumOfVarIndices > 1)
+        return IndexSize; // 64
+    }
+    return (unsigned)32;
+  };
+
+
+  // Trying to reduce IndexSize to 32 bits for vector 16.
+  // By default the IndexSize is equal to pointer size.
+  unsigned IndexSize = (VF >= 16) ? getIndexSizeInBits(Ptr, DL) :
+                                    DL.getPointerSizeInBits();
+
+  Type *IndexVTy = VectorType::get(IntegerType::get(getGlobalContext(),
+                                                    IndexSize), VF);
+  std::pair<int, MVT> IdxsLT = TLI->getTypeLegalizationCost(DL, IndexVTy);
+  std::pair<int, MVT> SrcLT = TLI->getTypeLegalizationCost(DL, SrcVTy);
+  int SplitFactor = std::max(IdxsLT.first, SrcLT.first);
+  if (SplitFactor > 1) {
+    // Handle splitting of vector of pointers
+    Type *SplitSrcTy = VectorType::get(SrcVTy->getScalarType(), VF / SplitFactor);
+    return SplitFactor * getGSVectorCost(Opcode, SplitSrcTy, Ptr, Alignment,
+                                         AddressSpace);
+  }
+
+  // The gather / scatter cost is given by Intel architects. It is a rough
+  // number since we are looking at one instruction in a time.
+  const int GSOverhead = 2;
+  return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
+                                           Alignment, AddressSpace);
+}
+
+/// Return the cost of full scalarization of gather / scatter operation.
+///
+/// Opcode - Load or Store instruction.
+/// SrcVTy - The type of the data vector that should be gathered or scattered.
+/// VariableMask - The mask is non-constant at compile time.
+/// Alignment - Alignment for one element.
+/// AddressSpace - pointer[s] address space.
+///
+int X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy,
+                                bool VariableMask, unsigned Alignment,
+                                unsigned AddressSpace) {
+  unsigned VF = SrcVTy->getVectorNumElements();
+
+  int MaskUnpackCost = 0;
+  if (VariableMask) {
+    VectorType *MaskTy =
+      VectorType::get(Type::getInt1Ty(getGlobalContext()), VF);
+    MaskUnpackCost = getScalarizationOverhead(MaskTy, false, true);
+    int ScalarCompareCost =
+      getCmpSelInstrCost(Instruction::ICmp, Type::getInt1Ty(getGlobalContext()),
+                         nullptr);
+    int BranchCost = getCFInstrCost(Instruction::Br);
+    MaskUnpackCost += VF * (BranchCost + ScalarCompareCost);
+  }
+
+  // The cost of the scalar loads/stores.
+  int MemoryOpCost = VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
+                                          Alignment, AddressSpace);
+
+  int InsertExtractCost = 0;
+  if (Opcode == Instruction::Load)
+    for (unsigned i = 0; i < VF; ++i)
+      // Add the cost of inserting each scalar load into the vector
+      InsertExtractCost +=
+        getVectorInstrCost(Instruction::InsertElement, SrcVTy, i);
+  else
+    for (unsigned i = 0; i < VF; ++i)
+      // Add the cost of extracting each element out of the data vector
+      InsertExtractCost +=
+        getVectorInstrCost(Instruction::ExtractElement, SrcVTy, i);
+
+  return MemoryOpCost + MaskUnpackCost + InsertExtractCost;
+}
+
+/// Calculate the cost of Gather / Scatter operation
+int X86TTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *SrcVTy,
+                                       Value *Ptr, bool VariableMask,
+                                       unsigned Alignment) {
+  assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter");
+  unsigned VF = SrcVTy->getVectorNumElements();
+  PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
+  if (!PtrTy && Ptr->getType()->isVectorTy())
+    PtrTy = dyn_cast<PointerType>(Ptr->getType()->getVectorElementType());
+  assert(PtrTy && "Unexpected type for Ptr argument");
+  unsigned AddressSpace = PtrTy->getAddressSpace();
+
+  bool Scalarize = false;
+  if ((Opcode == Instruction::Load && !isLegalMaskedGather(SrcVTy)) ||
+      (Opcode == Instruction::Store && !isLegalMaskedScatter(SrcVTy)))
+    Scalarize = true;
+  // Gather / Scatter for vector 2 is not profitable on KNL / SKX
+  // Vector-4 of gather/scatter instruction does not exist on KNL.
+  // We can extend it to 8 elements, but zeroing upper bits of
+  // the mask vector will add more instructions. Right now we give the scalar
+  // cost of vector-4 for KNL. TODO: Check, maybe the gather/scatter instruction is
+  // better in the VariableMask case.
+  if (VF == 2 || (VF == 4 && !ST->hasVLX()))
+    Scalarize = true;
+
+  if (Scalarize)
+    return getGSScalarCost(Opcode, SrcVTy, VariableMask, Alignment, AddressSpace);
+
+  return getGSVectorCost(Opcode, SrcVTy, Ptr, Alignment, AddressSpace);
+}
+
 bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy) {
   Type *ScalarTy = DataTy->getScalarType();
   int DataWidth = isa<PointerType>(ScalarTy) ?
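A worked check of the vector path against the tests below, assuming the scalar load/store cost returned by getMemoryOpCost here is 1 (which is what the expected values imply): for a <16 x float> gather whose address is a GEP with a scalar base and a single sign-extended index vector, getIndexSizeInBits narrows the index to 32 bits, the <16 x i32> index vector legalizes without splitting, so SplitFactor = 1 and the cost is GSOverhead + VF * 1 = 2 + 16 = 18 — the KNL/SKX number for test_gather_16f32_var_mask. With an arbitrary <16 x float*> vector of pointers (test_gather_16f32_ra_var_mask) the index stays 64-bit, <16 x i64> legalizes in two halves, so SplitFactor = 2 and the cost is 2 * (2 + 8 * 1) = 20.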
@@ -76,7 +76,8 @@ public:
                             unsigned AddressSpace);
   int getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
                             unsigned AddressSpace);
-
+  int getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr,
+                             bool VariableMask, unsigned Alignment);
   int getAddressComputationCost(Type *PtrTy, bool IsComplex);
 
   int getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm);
@@ -94,6 +95,11 @@ public:
   bool isLegalMaskedScatter(Type *DataType);
   bool areInlineCompatible(const Function *Caller,
                            const Function *Callee) const;
+private:
+  int getGSScalarCost(unsigned Opcode, Type *DataTy, bool VariableMask,
+                      unsigned Alignment, unsigned AddressSpace);
+  int getGSVectorCost(unsigned Opcode, Type *DataTy, Value *Ptr,
+                      unsigned Alignment, unsigned AddressSpace);
 
   /// @}
 };
@@ -1,4 +1,6 @@
-; RUN: opt -S -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -cost-model -analyze < %s | FileCheck %s -check-prefix=AVX2
+; RUN: opt -S -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -cost-model -analyze < %s | FileCheck %s --check-prefix=AVX2
+; RUN: opt -S -mtriple=x86_64-apple-darwin -mcpu=knl -cost-model -analyze < %s | FileCheck %s --check-prefix=KNL
+; RUN: opt -S -mtriple=x86_64-apple-darwin -mcpu=skx -cost-model -analyze < %s | FileCheck %s --check-prefix=SKX
 
 
 ; AVX2-LABEL: test1
@@ -65,6 +67,217 @@ define <2 x i32> @test8(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %dst) {
   ret <2 x i32> %res
 }
 
+define <2 x double> @test_gather_2f64(<2 x double*> %ptrs, <2 x i1> %mask, <2 x double> %src0) {
+
+; AVX2-LABEL: test_gather_2f64
+; AVX2: Found an estimated cost of 7 {{.*}}.gather
+
+; KNL-LABEL: test_gather_2f64
+; KNL: Found an estimated cost of 7 {{.*}}.gather
+
+; SKX-LABEL: test_gather_2f64
+; SKX: Found an estimated cost of 7 {{.*}}.gather
+
+  %res = call <2 x double> @llvm.masked.gather.v2f64(<2 x double*> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
+  ret <2 x double> %res
+}
+declare <2 x double> @llvm.masked.gather.v2f64(<2 x double*> %ptrs, i32, <2 x i1> %mask, <2 x double> %src0)
+
+define <4 x i32> @test_gather_4i32(<4 x i32*> %ptrs, <4 x i1> %mask, <4 x i32> %src0) {
+
+; AVX2-LABEL: test_gather_4i32
+; AVX2: Found an estimated cost of 16 {{.*}}.gather
+
+; KNL-LABEL: test_gather_4i32
+; KNL: Found an estimated cost of 16 {{.*}}.gather
+
+; SKX-LABEL: test_gather_4i32
+; SKX: Found an estimated cost of 6 {{.*}}.gather
+
+  %res = call <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+  ret <4 x i32> %res
+}
+
+define <4 x i32> @test_gather_4i32_const_mask(<4 x i32*> %ptrs, <4 x i32> %src0) {
+
+; AVX2-LABEL: test_gather_4i32_const_mask
+; AVX2: Found an estimated cost of 8 {{.*}}.gather
+
+; KNL-LABEL: test_gather_4i32_const_mask
+; KNL: Found an estimated cost of 8 {{.*}}.gather
+
+; SKX-LABEL: test_gather_4i32_const_mask
+; SKX: Found an estimated cost of 6 {{.*}}.gather
+
+  %res = call <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
+  ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*> %ptrs, i32, <4 x i1> %mask, <4 x i32> %src0)
+
+define <16 x float> @test_gather_16f32_const_mask(float* %base, <16 x i32> %ind) {
+
+; AVX2-LABEL: test_gather_16f32_const_mask
+; AVX2: Found an estimated cost of 30 {{.*}}.gather
+
+; KNL-LABEL: test_gather_16f32_const_mask
+; KNL: Found an estimated cost of 18 {{.*}}.gather
+
+; SKX-LABEL: test_gather_16f32_const_mask
+; SKX: Found an estimated cost of 18 {{.*}}.gather
+
+  %sext_ind = sext <16 x i32> %ind to <16 x i64>
+  %gep.v = getelementptr float, float* %base, <16 x i64> %sext_ind
+
+  %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+  ret <16 x float>%res
+}
+
+define <16 x float> @test_gather_16f32_var_mask(float* %base, <16 x i32> %ind, <16 x i1>%mask) {
+
+; AVX2-LABEL: test_gather_16f32_var_mask
+; AVX2: Found an estimated cost of 62 {{.*}}.gather
+
+; KNL-LABEL: test_gather_16f32_var_mask
+; KNL: Found an estimated cost of 18 {{.*}}.gather
+
+; SKX-LABEL: test_gather_16f32_var_mask
+; SKX: Found an estimated cost of 18 {{.*}}.gather
+
+  %sext_ind = sext <16 x i32> %ind to <16 x i64>
+  %gep.v = getelementptr float, float* %base, <16 x i64> %sext_ind
+
+  %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+  ret <16 x float>%res
+}
+
+define <16 x float> @test_gather_16f32_ra_var_mask(<16 x float*> %ptrs, <16 x i32> %ind, <16 x i1>%mask) {
+
+; AVX2-LABEL: test_gather_16f32_ra_var_mask
+; AVX2: Found an estimated cost of 62 {{.*}}.gather
+
+; KNL-LABEL: test_gather_16f32_ra_var_mask
+; KNL: Found an estimated cost of 20 {{.*}}.gather
+
+; SKX-LABEL: test_gather_16f32_ra_var_mask
+; SKX: Found an estimated cost of 20 {{.*}}.gather
+
+  %sext_ind = sext <16 x i32> %ind to <16 x i64>
+  %gep.v = getelementptr float, <16 x float*> %ptrs, <16 x i64> %sext_ind
+
+  %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+  ret <16 x float>%res
+}
+
+define <16 x float> @test_gather_16f32_const_mask2(float* %base, <16 x i32> %ind) {
+
+; AVX2-LABEL: test_gather_16f32_const_mask2
+; AVX2: Found an estimated cost of 30 {{.*}}.gather
+
+; KNL-LABEL: test_gather_16f32_const_mask2
+; KNL: Found an estimated cost of 18 {{.*}}.gather
+
+; SKX-LABEL: test_gather_16f32_const_mask2
+; SKX: Found an estimated cost of 18 {{.*}}.gather
+
+  %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
+  %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
+
+  %sext_ind = sext <16 x i32> %ind to <16 x i64>
+  %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind
+
+  %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+  ret <16 x float>%res
+}
+
+define void @test_scatter_16i32(i32* %base, <16 x i32> %ind, i16 %mask, <16 x i32>%val) {
+; AVX2-LABEL: test_scatter_16i32
+; AVX2: Found an estimated cost of 64 {{.*}}.scatter
+
+; KNL-LABEL: test_scatter_16i32
+; KNL: Found an estimated cost of 18 {{.*}}.scatter
+
+; SKX-LABEL: test_scatter_16i32
+; SKX: Found an estimated cost of 18 {{.*}}.scatter
+
+  %broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0
+  %broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer
+
+  %gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i32> %ind
+  %imask = bitcast i16 %mask to <16 x i1>
+  call void @llvm.masked.scatter.v16i32(<16 x i32>%val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask)
+  ret void
+}
+
+define void @test_scatter_8i32(<8 x i32>%a1, <8 x i32*> %ptr, <8 x i1>%mask) {
+; AVX2-LABEL: test_scatter_8i32
+; AVX2: Found an estimated cost of 32 {{.*}}.scatter
+
+; KNL-LABEL: test_scatter_8i32
+; KNL: Found an estimated cost of 10 {{.*}}.scatter
+
+; SKX-LABEL: test_scatter_8i32
+; SKX: Found an estimated cost of 10 {{.*}}.scatter
+
+  call void @llvm.masked.scatter.v8i32(<8 x i32> %a1, <8 x i32*> %ptr, i32 4, <8 x i1> %mask)
+  ret void
+}
+
+declare void @llvm.masked.scatter.v8i32(<8 x i32> %a1, <8 x i32*> %ptr, i32, <8 x i1> %mask)
+
+define void @test_scatter_4i32(<4 x i32>%a1, <4 x i32*> %ptr, <4 x i1>%mask) {
+; AVX2-LABEL: test_scatter_4i32
+; AVX2: Found an estimated cost of 16 {{.*}}.scatter
+
+; KNL-LABEL: test_scatter_4i32
+; KNL: Found an estimated cost of 16 {{.*}}.scatter
+
+; SKX-LABEL: test_scatter_4i32
+; SKX: Found an estimated cost of 6 {{.*}}.scatter
+
+  call void @llvm.masked.scatter.v4i32(<4 x i32> %a1, <4 x i32*> %ptr, i32 4, <4 x i1> %mask)
+  ret void
+}
+
+define <4 x float> @test_gather_4f32(float* %ptr, <4 x i32> %ind, <4 x i1>%mask) {
+
+; AVX2-LABEL: test_gather_4f32
+; AVX2: Found an estimated cost of 15 {{.*}}.gather
+
+; KNL-LABEL: test_gather_4f32
+; KNL: Found an estimated cost of 15 {{.*}}.gather
+
+; SKX-LABEL: test_gather_4f32
+; SKX: Found an estimated cost of 6 {{.*}}.gather
+
+  %sext_ind = sext <4 x i32> %ind to <4 x i64>
+  %gep.v = getelementptr float, float* %ptr, <4 x i64> %sext_ind
+
+  %res = call <4 x float> @llvm.masked.gather.v4f32(<4 x float*> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
+  ret <4 x float>%res
+}
+
+define <4 x float> @test_gather_4f32_const_mask(float* %ptr, <4 x i32> %ind) {
+
+; AVX2-LABEL: test_gather_4f32_const_mask
+; AVX2: Found an estimated cost of 7 {{.*}}.gather
+
+; KNL-LABEL: test_gather_4f32_const_mask
+; KNL: Found an estimated cost of 7 {{.*}}.gather
+
+; SKX-LABEL: test_gather_4f32_const_mask
+; SKX: Found an estimated cost of 6 {{.*}}.gather
+
+  %sext_ind = sext <4 x i32> %ind to <4 x i64>
+  %gep.v = getelementptr float, float* %ptr, <4 x i64> %sext_ind
+
+  %res = call <4 x float> @llvm.masked.gather.v4f32(<4 x float*> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
+  ret <4 x float>%res
+}
+
+declare <4 x float> @llvm.masked.gather.v4f32(<4 x float*> %gep.v, i32, <4 x i1> %mask, <4 x float> )
+declare void @llvm.masked.scatter.v4i32(<4 x i32> %a1, <4 x i32*> %ptr, i32, <4 x i1> %mask)
+declare void @llvm.masked.scatter.v16i32(<16 x i32>%val, <16 x i32*> %gep.random, i32, <16 x i1> %imask)
+declare <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.v, i32, <16 x i1> %mask, <16 x float>)
 
 declare <16 x i32> @llvm.masked.load.v16i32(<16 x i32>*, i32, <16 x i1>, <16 x i32>)
 declare <4 x i32> @llvm.masked.load.v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)