mirror of
https://github.com/RPCSX/llvm.git
synced 2025-01-27 07:12:06 +00:00
Masked Vector Load and Store Intrinsics.
Introduced new target-independent intrinsics in order to support masked vector loads and stores. The loop vectorizer optimizes loops containing conditional memory accesses by generating these intrinsics for existing targets AVX2 and AVX-512. The vectorizer asks the target about availability of masked vector loads and stores. Added SDNodes for masked operations and lowering patterns for X86 code generator. Examples: <16 x i32> @llvm.masked.load.v16i32(i8* %addr, <16 x i32> %passthru, i32 4 /* align */, <16 x i1> %mask) declare void @llvm.masked.store.v8f64(i8* %addr, <8 x double> %value, i32 4, <8 x i1> %mask) Scalarizer for other targets (not AVX2/AVX-512) will be done in a separate patch. http://reviews.llvm.org/D6191 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@222632 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent
4f5aa5994e
commit
ae1ae2c3a1
@ -270,6 +270,13 @@ public:
|
||||
int64_t BaseOffset, bool HasBaseReg,
|
||||
int64_t Scale) const;
|
||||
|
||||
/// \brief Return true if the target supports masked load/store instructions.
|
||||
/// AVX2 allows masks for consecutive load and store for i32 and i64 elements.
|
||||
/// AVX-512 architecture will also allow masks for non-consecutive memory
|
||||
/// accesses.
|
||||
virtual bool isLegalPredicatedStore(Type *DataType, int Consecutive) const;
|
||||
virtual bool isLegalPredicatedLoad (Type *DataType, int Consecutive) const;
|
||||
|
||||
/// \brief Return the cost of the scaling factor used in the addressing
|
||||
/// mode represented by AM for this target, for a load/store
|
||||
/// of the specified type.
|
||||
|
@ -675,6 +675,9 @@ namespace ISD {
|
||||
ATOMIC_LOAD_UMIN,
|
||||
ATOMIC_LOAD_UMAX,
|
||||
|
||||
// Masked load and store
|
||||
MLOAD, MSTORE,
|
||||
|
||||
/// This corresponds to the llvm.lifetime.* intrinsics. The first operand
|
||||
/// is the chain and the second operand is the alloca pointer.
|
||||
LIFETIME_START, LIFETIME_END,
|
||||
|
@ -866,6 +866,10 @@ public:
|
||||
SDValue getIndexedStore(SDValue OrigStoe, SDLoc dl, SDValue Base,
|
||||
SDValue Offset, ISD::MemIndexedMode AM);
|
||||
|
||||
SDValue getMaskedLoad(EVT VT, SDLoc dl, SDValue Chain, SDValue Ptr,
|
||||
SDValue Mask, SDValue Src0, MachineMemOperand *MMO);
|
||||
SDValue getMaskedStore(SDValue Chain, SDLoc dl, SDValue Val,
|
||||
SDValue Ptr, SDValue Mask, MachineMemOperand *MMO);
|
||||
/// getSrcValue - Construct a node to track a Value* through the backend.
|
||||
SDValue getSrcValue(const Value *v);
|
||||
|
||||
|
@ -1177,6 +1177,8 @@ public:
|
||||
N->getOpcode() == ISD::ATOMIC_LOAD_UMAX ||
|
||||
N->getOpcode() == ISD::ATOMIC_LOAD ||
|
||||
N->getOpcode() == ISD::ATOMIC_STORE ||
|
||||
N->getOpcode() == ISD::MLOAD ||
|
||||
N->getOpcode() == ISD::MSTORE ||
|
||||
N->isMemIntrinsic() ||
|
||||
N->isTargetMemoryOpcode();
|
||||
}
|
||||
@ -1926,6 +1928,72 @@ public:
|
||||
}
|
||||
};
|
||||
|
||||
/// MaskedLoadStoreSDNode - This is a base class is used to represent MLOAD and
|
||||
/// MSTORE nodes
|
||||
///
|
||||
class MaskedLoadStoreSDNode : public MemSDNode {
|
||||
// Operands
|
||||
SDUse Ops[4];
|
||||
public:
|
||||
friend class SelectionDAG;
|
||||
MaskedLoadStoreSDNode(ISD::NodeType NodeTy, unsigned Order, DebugLoc dl,
|
||||
SDValue *Operands, unsigned numOperands,
|
||||
SDVTList VTs, EVT MemVT, MachineMemOperand *MMO)
|
||||
: MemSDNode(NodeTy, Order, dl, VTs, MemVT, MMO) {
|
||||
InitOperands(Ops, Operands, numOperands);
|
||||
}
|
||||
|
||||
// In the both nodes address is Op1, mask is Op2:
|
||||
// MaskedLoadSDNode (Chain, ptr, mask, src0), src0 is a passthru value
|
||||
// MaskedStoreSDNode (Chain, ptr, mask, data)
|
||||
// Mask is a vector of i1 elements
|
||||
const SDValue &getBasePtr() const { return getOperand(1); }
|
||||
const SDValue &getMask() const { return getOperand(2); }
|
||||
|
||||
static bool classof(const SDNode *N) {
|
||||
return N->getOpcode() == ISD::MLOAD ||
|
||||
N->getOpcode() == ISD::MSTORE;
|
||||
}
|
||||
};
|
||||
|
||||
/// MaskedLoadSDNode - This class is used to represent an MLOAD node
|
||||
///
|
||||
class MaskedLoadSDNode : public MaskedLoadStoreSDNode {
|
||||
public:
|
||||
friend class SelectionDAG;
|
||||
MaskedLoadSDNode(unsigned Order, DebugLoc dl,
|
||||
SDValue *Operands, unsigned numOperands,
|
||||
SDVTList VTs, EVT MemVT, MachineMemOperand *MMO)
|
||||
: MaskedLoadStoreSDNode(ISD::MLOAD, Order, dl, Operands, numOperands,
|
||||
VTs, MemVT, MMO)
|
||||
{}
|
||||
|
||||
const SDValue &getSrc0() const { return getOperand(3); }
|
||||
static bool classof(const SDNode *N) {
|
||||
return N->getOpcode() == ISD::MLOAD;
|
||||
}
|
||||
};
|
||||
|
||||
/// MaskedStoreSDNode - This class is used to represent an MSTORE node
|
||||
///
|
||||
class MaskedStoreSDNode : public MaskedLoadStoreSDNode {
|
||||
|
||||
public:
|
||||
friend class SelectionDAG;
|
||||
MaskedStoreSDNode(unsigned Order, DebugLoc dl,
|
||||
SDValue *Operands, unsigned numOperands,
|
||||
SDVTList VTs, EVT MemVT, MachineMemOperand *MMO)
|
||||
: MaskedLoadStoreSDNode(ISD::MSTORE, Order, dl, Operands, numOperands,
|
||||
VTs, MemVT, MMO)
|
||||
{}
|
||||
|
||||
const SDValue &getData() const { return getOperand(3); }
|
||||
|
||||
static bool classof(const SDNode *N) {
|
||||
return N->getOpcode() == ISD::MSTORE;
|
||||
}
|
||||
};
|
||||
|
||||
/// MachineSDNode - An SDNode that represents everything that will be needed
|
||||
/// to construct a MachineInstr. These nodes are created during the
|
||||
/// instruction selection proper phase.
|
||||
|
@ -429,11 +429,22 @@ public:
|
||||
/// If the pointer isn't i8* it will be converted.
|
||||
CallInst *CreateLifetimeEnd(Value *Ptr, ConstantInt *Size = nullptr);
|
||||
|
||||
/// \brief Create a call to Masked Load intrinsic
|
||||
CallInst *CreateMaskedLoad(ArrayRef<Value *> Ops);
|
||||
|
||||
/// \brief Create a call to Masked Store intrinsic
|
||||
CallInst *CreateMaskedStore(ArrayRef<Value *> Ops);
|
||||
|
||||
/// \brief Create an assume intrinsic call that allows the optimizer to
|
||||
/// assume that the provided condition will be true.
|
||||
CallInst *CreateAssumption(Value *Cond);
|
||||
|
||||
private:
|
||||
/// \brief Create a call to a masked intrinsic with given Id.
|
||||
/// Masked intrinsic has only one overloaded type - data type.
|
||||
CallInst *CreateMaskedIntrinsic(unsigned Id, ArrayRef<Value *> Ops,
|
||||
Type *DataTy);
|
||||
|
||||
Value *getCastedInt8PtrValue(Value *Ptr);
|
||||
};
|
||||
|
||||
|
@ -76,7 +76,8 @@ namespace Intrinsic {
|
||||
enum IITDescriptorKind {
|
||||
Void, VarArg, MMX, Metadata, Half, Float, Double,
|
||||
Integer, Vector, Pointer, Struct,
|
||||
Argument, ExtendArgument, TruncArgument, HalfVecArgument
|
||||
Argument, ExtendArgument, TruncArgument, HalfVecArgument,
|
||||
SameVecWidthArgument
|
||||
} Kind;
|
||||
|
||||
union {
|
||||
@ -96,13 +97,15 @@ namespace Intrinsic {
|
||||
};
|
||||
unsigned getArgumentNumber() const {
|
||||
assert(Kind == Argument || Kind == ExtendArgument ||
|
||||
Kind == TruncArgument || Kind == HalfVecArgument);
|
||||
Kind == TruncArgument || Kind == HalfVecArgument ||
|
||||
Kind == SameVecWidthArgument);
|
||||
return Argument_Info >> 2;
|
||||
}
|
||||
ArgKind getArgumentKind() const {
|
||||
assert(Kind == Argument || Kind == ExtendArgument ||
|
||||
Kind == TruncArgument || Kind == HalfVecArgument);
|
||||
return (ArgKind)(Argument_Info&3);
|
||||
Kind == TruncArgument || Kind == HalfVecArgument ||
|
||||
Kind == SameVecWidthArgument);
|
||||
return (ArgKind)(Argument_Info & 3);
|
||||
}
|
||||
|
||||
static IITDescriptor get(IITDescriptorKind K, unsigned Field) {
|
||||
|
@ -112,6 +112,10 @@ class LLVMMatchType<int num>
|
||||
// the intrinsic is overloaded, so the matched type should be declared as iAny.
|
||||
class LLVMExtendedType<int num> : LLVMMatchType<num>;
|
||||
class LLVMTruncatedType<int num> : LLVMMatchType<num>;
|
||||
// Match a vector type derived from overloaded type number `num`; ElTy records
// the value type of `elty` for the derived vector.
// NOTE(review): the name suggests "same element count as type `num`, element
// type `elty`" - confirm against the intrinsic type decoder.
class LLVMVectorSameWidth<int num, LLVMType elty>
|
||||
: LLVMMatchType<num> {
|
||||
ValueType ElTy = elty.VT;
|
||||
}
|
||||
|
||||
// Match the type of another intrinsic parameter that is expected to be a
|
||||
// vector type, but change the element count to be half as many
|
||||
@ -539,6 +543,17 @@ def int_convertuu : Intrinsic<[llvm_anyint_ty],
|
||||
def int_clear_cache : Intrinsic<[], [llvm_ptr_ty, llvm_ptr_ty],
|
||||
[], "llvm.clear_cache">;
|
||||
|
||||
//===-------------------------- Masked Intrinsics -------------------------===//
|
||||
//
|
||||
// llvm.masked.store: no result. Operands are a pointer, the overloaded vector
// value, an i32, and an i1-element vector tied to overloaded type 0 (the
// mask). Marked IntrReadWriteArgMem.
// NOTE(review): the i32 is presumably the alignment - confirm against the
// lowering code that reads argument 2.
def int_masked_store : Intrinsic<[], [llvm_ptr_ty, llvm_anyvector_ty,
|
||||
llvm_i32_ty,
|
||||
LLVMVectorSameWidth<0, llvm_i1_ty>],
|
||||
[IntrReadWriteArgMem]>;
|
||||
|
||||
// llvm.masked.load: the result is the overloaded vector type. Operands are a
// pointer, a value of the result type, an i32, and an i1-element vector tied
// to overloaded type 0 (the mask). Marked IntrReadArgMem.
// NOTE(review): the second operand is presumably the pass-through value and
// the i32 the alignment - confirm against the lowering code.
def int_masked_load : Intrinsic<[llvm_anyvector_ty],
|
||||
[llvm_ptr_ty, LLVMMatchType<0>, llvm_i32_ty,
|
||||
LLVMVectorSameWidth<0, llvm_i1_ty>],
|
||||
[IntrReadArgMem]>;
|
||||
//===----------------------------------------------------------------------===//
|
||||
// Target-specific intrinsics
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
@ -188,6 +188,14 @@ def SDTIStore : SDTypeProfile<1, 3, [ // indexed store
|
||||
SDTCisSameAs<0, 2>, SDTCisPtrTy<0>, SDTCisPtrTy<3>
|
||||
]>;
|
||||
|
||||
// Type profile for a masked store node: no results and three operands.
// Operand 0 must have pointer type; operands 1 and 2 must be vectors.
def SDTMaskedStore: SDTypeProfile<0, 3, [ // masked store
|
||||
SDTCisPtrTy<0>, SDTCisVec<1>, SDTCisVec<2>
|
||||
]>;
|
||||
|
||||
// Type profile for a masked load node: one result and three operands.
// The result (0) is a vector, operand 1 has pointer type, operand 2 is a
// vector, and operand 3 has the same type as the result.
def SDTMaskedLoad: SDTypeProfile<1, 3, [ // masked load
|
||||
SDTCisVec<0>, SDTCisPtrTy<1>, SDTCisVec<2>, SDTCisSameAs<0, 3>
|
||||
]>;
|
||||
|
||||
def SDTVecShuffle : SDTypeProfile<1, 2, [
|
||||
SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>
|
||||
]>;
|
||||
@ -454,6 +462,11 @@ def atomic_load : SDNode<"ISD::ATOMIC_LOAD", SDTAtomicLoad,
|
||||
def atomic_store : SDNode<"ISD::ATOMIC_STORE", SDTAtomicStore,
|
||||
[SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
|
||||
|
||||
// Selection-DAG pattern operator for ISD::MSTORE, typed by SDTMaskedStore.
// The node has a chain, may store, and carries a memory operand.
def masked_store : SDNode<"ISD::MSTORE", SDTMaskedStore,
|
||||
[SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
|
||||
// Selection-DAG pattern operator for ISD::MLOAD, typed by SDTMaskedLoad.
// The node has a chain, may load, and carries a memory operand.
def masked_load : SDNode<"ISD::MLOAD", SDTMaskedLoad,
|
||||
[SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
|
||||
|
||||
// Do not use ld, st directly. Use load, extload, sextload, zextload, store,
|
||||
// and truncst (see below).
|
||||
def ld : SDNode<"ISD::LOAD" , SDTLoad,
|
||||
|
@ -101,6 +101,17 @@ bool TargetTransformInfo::isLegalICmpImmediate(int64_t Imm) const {
|
||||
return PrevTTI->isLegalICmpImmediate(Imm);
|
||||
}
|
||||
|
||||
/// This implementation reports masked (predicated) loads as not legal: it
/// returns false for every \p DataType and ignores \p Consecutive.
bool TargetTransformInfo::isLegalPredicatedLoad(Type *DataType,
                                                int Consecutive) const {
  return false;
}
|
||||
|
||||
/// This implementation reports masked (predicated) stores as not legal: it
/// returns false for every \p DataType and ignores \p Consecutive.
bool TargetTransformInfo::isLegalPredicatedStore(Type *DataType,
                                                 int Consecutive) const {
  return false;
}
|
||||
|
||||
|
||||
bool TargetTransformInfo::isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV,
|
||||
int64_t BaseOffset,
|
||||
bool HasBaseReg,
|
||||
|
@ -303,6 +303,8 @@ namespace {
|
||||
SDValue visitEXTRACT_SUBVECTOR(SDNode *N);
|
||||
SDValue visitVECTOR_SHUFFLE(SDNode *N);
|
||||
SDValue visitINSERT_SUBVECTOR(SDNode *N);
|
||||
SDValue visitMLOAD(SDNode *N);
|
||||
SDValue visitMSTORE(SDNode *N);
|
||||
|
||||
SDValue XformToShuffleWithZero(SDNode *N);
|
||||
SDValue ReassociateOps(unsigned Opc, SDLoc DL, SDValue LHS, SDValue RHS);
|
||||
@ -412,6 +414,7 @@ namespace {
|
||||
EVT getSetCCResultType(EVT VT) const {
|
||||
return TLI.getSetCCResultType(*DAG.getContext(), VT);
|
||||
}
|
||||
int& MLD();
|
||||
};
|
||||
}
|
||||
|
||||
@ -1351,6 +1354,8 @@ SDValue DAGCombiner::visit(SDNode *N) {
|
||||
case ISD::EXTRACT_SUBVECTOR: return visitEXTRACT_SUBVECTOR(N);
|
||||
case ISD::VECTOR_SHUFFLE: return visitVECTOR_SHUFFLE(N);
|
||||
case ISD::INSERT_SUBVECTOR: return visitINSERT_SUBVECTOR(N);
|
||||
case ISD::MLOAD: return visitMLOAD(N);
|
||||
case ISD::MSTORE: return visitMSTORE(N);
|
||||
}
|
||||
return SDValue();
|
||||
}
|
||||
@ -4771,6 +4776,162 @@ static SDValue ConvertSelectToConcatVector(SDNode *N, SelectionDAG &DAG) {
|
||||
TopHalf->isNullValue() ? RHS->getOperand(1) : LHS->getOperand(1));
|
||||
}
|
||||
|
||||
/// Combine an MSTORE node before type legalization.
///
/// If the stored data type requires splitting and the mask is produced by a
/// SETCC, split the mask, the data and the store itself into low/high halves
/// here. This prevents the type legalizer from unrolling the SETCC into
/// scalar comparisons and enables future optimizations (e.g. min/max pattern
/// matching on X86).
///
/// \returns a TokenFactor joining the two half stores, or an empty SDValue
/// when no change was made.
SDValue DAGCombiner::visitMSTORE(SDNode *N) {
  if (Level >= AfterLegalizeTypes)
    return SDValue();

  // N is dereferenced unconditionally below, so use cast<> (which asserts on
  // a mismatch) rather than dyn_cast<> (which would silently yield null).
  MaskedStoreSDNode *MST = cast<MaskedStoreSDNode>(N);
  SDValue Mask = MST->getMask();
  SDValue Data = MST->getData();
  SDLoc DL(N);

  if (Mask.getOpcode() != ISD::SETCC)
    return SDValue();

  // Check if any splitting is required.
  if (TLI.getTypeAction(*DAG.getContext(), Data.getValueType()) !=
      TargetLowering::TypeSplitVector)
    return SDValue();

  SDValue MaskLo, MaskHi, Lo, Hi;
  std::tie(MaskLo, MaskHi) = SplitVSETCC(Mask.getNode(), DAG);

  SDValue Chain = MST->getChain();
  SDValue Ptr = MST->getBasePtr();

  EVT MemoryVT = MST->getMemoryVT();
  unsigned Alignment = MST->getOriginalAlignment();

  // If the alignment is equal to the vector size, the second half is only
  // guaranteed half of that alignment.
  unsigned SecondHalfAlignment =
      (Alignment == Data->getValueType(0).getSizeInBits() / 8) ? Alignment / 2
                                                               : Alignment;

  EVT LoMemVT, HiMemVT;
  std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);

  SDValue DataLo, DataHi;
  std::tie(DataLo, DataHi) = DAG.SplitVector(Data, DL);

  // Low half: stored at the original address.
  MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
      MST->getPointerInfo(), MachineMemOperand::MOStore,
      LoMemVT.getStoreSize(), Alignment, MST->getAAInfo(), MST->getRanges());

  Lo = DAG.getMaskedStore(Chain, DL, DataLo, Ptr, MaskLo, MMO);

  // High half: stored IncrementSize bytes past the original address.
  unsigned IncrementSize = LoMemVT.getSizeInBits() / 8;
  Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
                    DAG.getConstant(IncrementSize, Ptr.getValueType()));

  // The memory operand must describe the bytes actually written, so offset
  // the pointer info by the same amount as the address.
  MMO = DAG.getMachineFunction().getMachineMemOperand(
      MST->getPointerInfo().getWithOffset(IncrementSize),
      MachineMemOperand::MOStore, HiMemVT.getStoreSize(), SecondHalfAlignment,
      MST->getAAInfo(), MST->getRanges());

  Hi = DAG.getMaskedStore(Chain, DL, DataHi, Ptr, MaskHi, MMO);

  AddToWorklist(Lo.getNode());
  AddToWorklist(Hi.getNode());

  // The two half stores are independent of each other; join their chains.
  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo, Hi);
}
|
||||
|
||||
/// Combine an MLOAD node before type legalization.
///
/// If the loaded type requires splitting and the mask is produced by a
/// SETCC, split the mask, the pass-through value and the load itself into
/// low/high halves here. This prevents the type legalizer from unrolling the
/// SETCC into scalar comparisons and enables future optimizations (e.g.
/// min/max pattern matching on X86).
///
/// \returns merged values (concatenated load result, joined chain), or an
/// empty SDValue when no change was made.
SDValue DAGCombiner::visitMLOAD(SDNode *N) {
  if (Level >= AfterLegalizeTypes)
    return SDValue();

  // N is dereferenced unconditionally below, so use cast<> (which asserts on
  // a mismatch) rather than dyn_cast<> (which would silently yield null).
  MaskedLoadSDNode *MLD = cast<MaskedLoadSDNode>(N);
  SDValue Mask = MLD->getMask();
  SDLoc DL(N);

  if (Mask.getOpcode() != ISD::SETCC)
    return SDValue();

  EVT VT = N->getValueType(0);

  // Check if any splitting is required.
  if (TLI.getTypeAction(*DAG.getContext(), VT) !=
      TargetLowering::TypeSplitVector)
    return SDValue();

  SDValue MaskLo, MaskHi, Lo, Hi;
  std::tie(MaskLo, MaskHi) = SplitVSETCC(Mask.getNode(), DAG);

  SDValue Src0 = MLD->getSrc0();
  SDValue Src0Lo, Src0Hi;
  std::tie(Src0Lo, Src0Hi) = DAG.SplitVector(Src0, DL);

  EVT LoVT, HiVT;
  std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(MLD->getValueType(0));

  SDValue Chain = MLD->getChain();
  SDValue Ptr = MLD->getBasePtr();
  EVT MemoryVT = MLD->getMemoryVT();
  unsigned Alignment = MLD->getOriginalAlignment();

  // If the alignment is equal to the vector size, the second half is only
  // guaranteed half of that alignment.
  unsigned SecondHalfAlignment =
      (Alignment == MLD->getValueType(0).getSizeInBits() / 8) ? Alignment / 2
                                                              : Alignment;

  EVT LoMemVT, HiMemVT;
  std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);

  // Low half: loaded from the original address.
  MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
      MLD->getPointerInfo(), MachineMemOperand::MOLoad,
      LoMemVT.getStoreSize(), Alignment, MLD->getAAInfo(), MLD->getRanges());

  Lo = DAG.getMaskedLoad(LoVT, DL, Chain, Ptr, MaskLo, Src0Lo, MMO);

  // High half: loaded IncrementSize bytes past the original address.
  unsigned IncrementSize = LoMemVT.getSizeInBits() / 8;
  Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
                    DAG.getConstant(IncrementSize, Ptr.getValueType()));

  // The memory operand must describe the bytes actually read, so offset the
  // pointer info by the same amount as the address.
  MMO = DAG.getMachineFunction().getMachineMemOperand(
      MLD->getPointerInfo().getWithOffset(IncrementSize),
      MachineMemOperand::MOLoad, HiMemVT.getStoreSize(), SecondHalfAlignment,
      MLD->getAAInfo(), MLD->getRanges());

  Hi = DAG.getMaskedLoad(HiVT, DL, Chain, Ptr, MaskHi, Src0Hi, MMO);

  AddToWorklist(Lo.getNode());
  AddToWorklist(Hi.getNode());

  // Build a factor node to remember that this load is independent of the
  // other one.
  Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo.getValue(1),
                      Hi.getValue(1));

  // Legalized the chain result - switch anything that used the old chain to
  // use the new one.
  DAG.ReplaceAllUsesOfValueWith(SDValue(MLD, 1), Chain);

  SDValue LoadRes = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);

  SDValue RetOps[] = { LoadRes, Chain };
  return DAG.getMergeValues(RetOps, DL);
}
|
||||
|
||||
SDValue DAGCombiner::visitVSELECT(SDNode *N) {
|
||||
SDValue N0 = N->getOperand(0);
|
||||
SDValue N1 = N->getOperand(1);
|
||||
|
@ -825,6 +825,10 @@ bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) {
|
||||
case ISD::SINT_TO_FP: Res = PromoteIntOp_SINT_TO_FP(N); break;
|
||||
case ISD::STORE: Res = PromoteIntOp_STORE(cast<StoreSDNode>(N),
|
||||
OpNo); break;
|
||||
case ISD::MSTORE: Res = PromoteIntOp_MSTORE(cast<MaskedStoreSDNode>(N),
|
||||
OpNo); break;
|
||||
case ISD::MLOAD: Res = PromoteIntOp_MLOAD(cast<MaskedLoadSDNode>(N),
|
||||
OpNo); break;
|
||||
case ISD::TRUNCATE: Res = PromoteIntOp_TRUNCATE(N); break;
|
||||
case ISD::FP16_TO_FP:
|
||||
case ISD::UINT_TO_FP: Res = PromoteIntOp_UINT_TO_FP(N); break;
|
||||
@ -1091,6 +1095,25 @@ SDValue DAGTypeLegalizer::PromoteIntOp_STORE(StoreSDNode *N, unsigned OpNo){
|
||||
N->getMemoryVT(), N->getMemOperand());
|
||||
}
|
||||
|
||||
/// Promote the mask operand (operand 2) of a masked store. The mask is
/// promoted as a target boolean relative to the type of the stored data
/// (operand 3), and the node is updated in place with the new mask.
SDValue DAGTypeLegalizer::PromoteIntOp_MSTORE(MaskedStoreSDNode *N,
                                              unsigned OpNo) {
  assert(OpNo == 2 && "Only know how to promote the mask!");

  const EVT StoredVT = N->getOperand(3).getValueType();
  SDValue PromotedMask = PromoteTargetBoolean(N->getOperand(OpNo), StoredVT);

  // Copy the operand list and swap in the promoted mask.
  SmallVector<SDValue, 4> Operands(N->op_begin(), N->op_end());
  Operands[OpNo] = PromotedMask;
  return SDValue(DAG.UpdateNodeOperands(N, Operands), 0);
}
|
||||
|
||||
/// Promote the mask operand (operand 2) of a masked load. The mask is
/// promoted as a target boolean relative to the load's result type, and the
/// node is updated in place with the new mask.
SDValue DAGTypeLegalizer::PromoteIntOp_MLOAD(MaskedLoadSDNode *N,
                                             unsigned OpNo) {
  assert(OpNo == 2 && "Only know how to promote the mask!");

  const EVT LoadedVT = N->getValueType(0);
  SDValue PromotedMask = PromoteTargetBoolean(N->getOperand(OpNo), LoadedVT);

  // Copy the operand list and swap in the promoted mask.
  SmallVector<SDValue, 4> Operands(N->op_begin(), N->op_end());
  Operands[OpNo] = PromotedMask;
  return SDValue(DAG.UpdateNodeOperands(N, Operands), 0);
}
|
||||
|
||||
SDValue DAGTypeLegalizer::PromoteIntOp_TRUNCATE(SDNode *N) {
|
||||
SDValue Op = GetPromotedInteger(N->getOperand(0));
|
||||
return DAG.getNode(ISD::TRUNCATE, SDLoc(N), N->getValueType(0), Op);
|
||||
|
@ -285,6 +285,8 @@ private:
|
||||
SDValue PromoteIntOp_TRUNCATE(SDNode *N);
|
||||
SDValue PromoteIntOp_UINT_TO_FP(SDNode *N);
|
||||
SDValue PromoteIntOp_ZERO_EXTEND(SDNode *N);
|
||||
SDValue PromoteIntOp_MSTORE(MaskedStoreSDNode *N, unsigned OpNo);
|
||||
SDValue PromoteIntOp_MLOAD(MaskedLoadSDNode *N, unsigned OpNo);
|
||||
|
||||
void PromoteSetCCOperands(SDValue &LHS,SDValue &RHS, ISD::CondCode Code);
|
||||
|
||||
@ -578,6 +580,7 @@ private:
|
||||
void SplitVecRes_FPOWI(SDNode *N, SDValue &Lo, SDValue &Hi);
|
||||
void SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo, SDValue &Hi);
|
||||
void SplitVecRes_LOAD(LoadSDNode *N, SDValue &Lo, SDValue &Hi);
|
||||
void SplitVecRes_MLOAD(MaskedLoadSDNode *N, SDValue &Lo, SDValue &Hi);
|
||||
void SplitVecRes_SCALAR_TO_VECTOR(SDNode *N, SDValue &Lo, SDValue &Hi);
|
||||
void SplitVecRes_SIGN_EXTEND_INREG(SDNode *N, SDValue &Lo, SDValue &Hi);
|
||||
void SplitVecRes_SETCC(SDNode *N, SDValue &Lo, SDValue &Hi);
|
||||
@ -594,6 +597,7 @@ private:
|
||||
SDValue SplitVecOp_EXTRACT_SUBVECTOR(SDNode *N);
|
||||
SDValue SplitVecOp_EXTRACT_VECTOR_ELT(SDNode *N);
|
||||
SDValue SplitVecOp_STORE(StoreSDNode *N, unsigned OpNo);
|
||||
SDValue SplitVecOp_MSTORE(MaskedStoreSDNode *N, unsigned OpNo);
|
||||
SDValue SplitVecOp_CONCAT_VECTORS(SDNode *N);
|
||||
SDValue SplitVecOp_TRUNCATE(SDNode *N);
|
||||
SDValue SplitVecOp_VSETCC(SDNode *N);
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -4917,6 +4917,60 @@ SelectionDAG::getIndexedStore(SDValue OrigStore, SDLoc dl, SDValue Base,
|
||||
return SDValue(N, 0);
|
||||
}
|
||||
|
||||
/// Create (or find via CSE) an MLOAD node producing a value of type \p VT
/// plus a chain, with operands (Chain, Ptr, Mask, Src0). If an equivalent
/// node already exists, its alignment is refined from \p MMO and that node
/// is returned instead of a new one.
SDValue SelectionDAG::getMaskedLoad(EVT VT, SDLoc dl, SDValue Chain,
                                    SDValue Ptr, SDValue Mask, SDValue Src0,
                                    MachineMemOperand *MMO) {
  SDVTList VTs = getVTList(VT, MVT::Other);
  SDValue Ops[] = { Chain, Ptr, Mask, Src0 };

  // Build the CSE key: opcode/types/operands, memory type, flags and the
  // address space.
  FoldingSetNodeID NodeID;
  AddNodeIDNode(NodeID, ISD::MLOAD, VTs, Ops);
  NodeID.AddInteger(VT.getRawBits());
  NodeID.AddInteger(encodeMemSDNodeFlags(ISD::NON_EXTLOAD, ISD::UNINDEXED,
                                         MMO->isVolatile(),
                                         MMO->isNonTemporal(),
                                         MMO->isInvariant()));
  NodeID.AddInteger(MMO->getPointerInfo().getAddrSpace());

  void *InsertPos = nullptr;
  SDNode *Existing = CSEMap.FindNodeOrInsertPos(NodeID, InsertPos);
  if (Existing) {
    cast<MaskedLoadSDNode>(Existing)->refineAlignment(MMO);
    return SDValue(Existing, 0);
  }

  SDNode *NewNode = new (NodeAllocator)
      MaskedLoadSDNode(dl.getIROrder(), dl.getDebugLoc(), Ops, 4, VTs, VT,
                       MMO);
  CSEMap.InsertNode(NewNode, InsertPos);
  InsertNode(NewNode);
  return SDValue(NewNode, 0);
}
|
||||
|
||||
/// Create (or find via CSE) an MSTORE node producing only a chain, with
/// operands (Chain, Ptr, Mask, Val). The memory type is the type of \p Val.
/// If an equivalent node already exists, its alignment is refined from
/// \p MMO and that node is returned instead of a new one.
SDValue SelectionDAG::getMaskedStore(SDValue Chain, SDLoc dl, SDValue Val,
                                     SDValue Ptr, SDValue Mask,
                                     MachineMemOperand *MMO) {
  assert(Chain.getValueType() == MVT::Other &&
         "Invalid chain type");

  EVT VT = Val.getValueType();
  SDVTList VTs = getVTList(MVT::Other);
  SDValue Ops[] = { Chain, Ptr, Mask, Val };

  // Build the CSE key: opcode/types/operands, memory type, flags and the
  // address space.
  FoldingSetNodeID NodeID;
  AddNodeIDNode(NodeID, ISD::MSTORE, VTs, Ops);
  NodeID.AddInteger(VT.getRawBits());
  NodeID.AddInteger(encodeMemSDNodeFlags(false, ISD::UNINDEXED,
                                         MMO->isVolatile(),
                                         MMO->isNonTemporal(),
                                         MMO->isInvariant()));
  NodeID.AddInteger(MMO->getPointerInfo().getAddrSpace());

  void *InsertPos = nullptr;
  SDNode *Existing = CSEMap.FindNodeOrInsertPos(NodeID, InsertPos);
  if (Existing) {
    cast<MaskedStoreSDNode>(Existing)->refineAlignment(MMO);
    return SDValue(Existing, 0);
  }

  SDNode *NewNode = new (NodeAllocator)
      MaskedStoreSDNode(dl.getIROrder(), dl.getDebugLoc(), Ops, 4, VTs, VT,
                        MMO);
  CSEMap.InsertNode(NewNode, InsertPos);
  InsertNode(NewNode);
  return SDValue(NewNode, 0);
}
|
||||
|
||||
SDValue SelectionDAG::getVAArg(EVT VT, SDLoc dl,
|
||||
SDValue Chain, SDValue Ptr,
|
||||
SDValue SV,
|
||||
|
@ -3613,6 +3613,70 @@ void SelectionDAGBuilder::visitStore(const StoreInst &I) {
|
||||
DAG.setRoot(StoreNode);
|
||||
}
|
||||
|
||||
void SelectionDAGBuilder::visitMaskedStore(const CallInst &I) {
|
||||
SDLoc sdl = getCurSDLoc();
|
||||
|
||||
Value *PtrOperand = I.getArgOperand(0);
|
||||
SDValue Ptr = getValue(PtrOperand);
|
||||
SDValue Src0 = getValue(I.getArgOperand(1));
|
||||
SDValue Mask = getValue(I.getArgOperand(3));
|
||||
EVT VT = Src0.getValueType();
|
||||
unsigned Alignment = (cast<ConstantInt>(I.getArgOperand(2)))->getZExtValue();
|
||||
if (!Alignment)
|
||||
Alignment = DAG.getEVTAlignment(VT);
|
||||
|
||||
AAMDNodes AAInfo;
|
||||
I.getAAMetadata(AAInfo);
|
||||
|
||||
MachineMemOperand *MMO =
|
||||
DAG.getMachineFunction().
|
||||
getMachineMemOperand(MachinePointerInfo(PtrOperand),
|
||||
MachineMemOperand::MOStore, VT.getStoreSize(),
|
||||
Alignment, AAInfo);
|
||||
SDValue StoreNode = DAG.getMaskedStore(getRoot(), sdl, Src0, Ptr, Mask, MMO);
|
||||
DAG.setRoot(StoreNode);
|
||||
setValue(&I, StoreNode);
|
||||
}
|
||||
|
||||
void SelectionDAGBuilder::visitMaskedLoad(const CallInst &I) {
|
||||
SDLoc sdl = getCurSDLoc();
|
||||
|
||||
Value *PtrOperand = I.getArgOperand(0);
|
||||
SDValue Ptr = getValue(PtrOperand);
|
||||
SDValue Src0 = getValue(I.getArgOperand(1));
|
||||
SDValue Mask = getValue(I.getArgOperand(3));
|
||||
|
||||
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
|
||||
EVT VT = TLI.getValueType(I.getType());
|
||||
unsigned Alignment = (cast<ConstantInt>(I.getArgOperand(2)))->getZExtValue();
|
||||
if (!Alignment)
|
||||
Alignment = DAG.getEVTAlignment(VT);
|
||||
|
||||
AAMDNodes AAInfo;
|
||||
I.getAAMetadata(AAInfo);
|
||||
const MDNode *Ranges = I.getMetadata(LLVMContext::MD_range);
|
||||
|
||||
SDValue InChain = DAG.getRoot();
|
||||
if (AA->pointsToConstantMemory(
|
||||
AliasAnalysis::Location(PtrOperand,
|
||||
AA->getTypeStoreSize(I.getType()),
|
||||
AAInfo))) {
|
||||
// Do not serialize (non-volatile) loads of constant memory with anything.
|
||||
InChain = DAG.getEntryNode();
|
||||
}
|
||||
|
||||
MachineMemOperand *MMO =
|
||||
DAG.getMachineFunction().
|
||||
getMachineMemOperand(MachinePointerInfo(PtrOperand),
|
||||
MachineMemOperand::MOLoad, VT.getStoreSize(),
|
||||
Alignment, AAInfo, Ranges);
|
||||
|
||||
SDValue Load = DAG.getMaskedLoad(VT, sdl, InChain, Ptr, Mask, Src0, MMO);
|
||||
SDValue OutChain = Load.getValue(1);
|
||||
DAG.setRoot(OutChain);
|
||||
setValue(&I, Load);
|
||||
}
|
||||
|
||||
void SelectionDAGBuilder::visitAtomicCmpXchg(const AtomicCmpXchgInst &I) {
|
||||
SDLoc dl = getCurSDLoc();
|
||||
AtomicOrdering SuccessOrder = I.getSuccessOrdering();
|
||||
@ -4914,6 +4978,12 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
case Intrinsic::masked_load:
|
||||
visitMaskedLoad(I);
|
||||
return nullptr;
|
||||
case Intrinsic::masked_store:
|
||||
visitMaskedStore(I);
|
||||
return nullptr;
|
||||
case Intrinsic::x86_mmx_pslli_w:
|
||||
case Intrinsic::x86_mmx_pslli_d:
|
||||
case Intrinsic::x86_mmx_pslli_q:
|
||||
|
@ -756,6 +756,8 @@ private:
|
||||
void visitAlloca(const AllocaInst &I);
|
||||
void visitLoad(const LoadInst &I);
|
||||
void visitStore(const StoreInst &I);
|
||||
void visitMaskedLoad(const CallInst &I);
|
||||
void visitMaskedStore(const CallInst &I);
|
||||
void visitAtomicCmpXchg(const AtomicCmpXchgInst &I);
|
||||
void visitAtomicRMW(const AtomicRMWInst &I);
|
||||
void visitFence(const FenceInst &I);
|
||||
|
@ -269,6 +269,8 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
|
||||
// Other operators
|
||||
case ISD::LOAD: return "load";
|
||||
case ISD::STORE: return "store";
|
||||
case ISD::MLOAD: return "masked_load";
|
||||
case ISD::MSTORE: return "masked_store";
|
||||
case ISD::VAARG: return "vaarg";
|
||||
case ISD::VACOPY: return "vacopy";
|
||||
case ISD::VAEND: return "vaend";
|
||||
|
@ -537,7 +537,8 @@ enum IIT_Info {
|
||||
IIT_ANYPTR = 26,
|
||||
IIT_V1 = 27,
|
||||
IIT_VARARG = 28,
|
||||
IIT_HALF_VEC_ARG = 29
|
||||
IIT_HALF_VEC_ARG = 29,
|
||||
IIT_SAME_VEC_WIDTH_ARG = 30
|
||||
};
|
||||
|
||||
|
||||
@ -645,6 +646,12 @@ static void DecodeIITType(unsigned &NextElt, ArrayRef<unsigned char> Infos,
|
||||
ArgInfo));
|
||||
return;
|
||||
}
|
||||
case IIT_SAME_VEC_WIDTH_ARG: {
|
||||
unsigned ArgInfo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]);
|
||||
OutputTable.push_back(IITDescriptor::get(IITDescriptor::SameVecWidthArgument,
|
||||
ArgInfo));
|
||||
return;
|
||||
}
|
||||
case IIT_EMPTYSTRUCT:
|
||||
OutputTable.push_back(IITDescriptor::get(IITDescriptor::Struct, 0));
|
||||
return;
|
||||
@ -752,7 +759,14 @@ static Type *DecodeFixedType(ArrayRef<Intrinsic::IITDescriptor> &Infos,
|
||||
case IITDescriptor::HalfVecArgument:
|
||||
return VectorType::getHalfElementsVectorType(cast<VectorType>(
|
||||
Tys[D.getArgumentNumber()]));
|
||||
}
|
||||
case IITDescriptor::SameVecWidthArgument:
|
||||
Type *EltTy = DecodeFixedType(Infos, Tys, Context);
|
||||
Type *Ty = Tys[D.getArgumentNumber()];
|
||||
if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
|
||||
return VectorType::get(EltTy, VTy->getNumElements());
|
||||
}
|
||||
llvm_unreachable("unhandled");
|
||||
}
|
||||
llvm_unreachable("unhandled");
|
||||
}
|
||||
|
||||
|
@ -183,3 +183,29 @@ CallInst *IRBuilderBase::CreateAssumption(Value *Cond) {
|
||||
return createCallHelper(FnAssume, Ops, this);
|
||||
}
|
||||
|
||||
/// Create a call to a Masked Load intrinsic.
|
||||
/// Ops - an array of operands.
|
||||
CallInst *IRBuilderBase::CreateMaskedLoad(ArrayRef<Value *> Ops) {
|
||||
// The only one overloaded type - the type of passthru value in this case
|
||||
Type *DataTy = Ops[1]->getType();
|
||||
return CreateMaskedIntrinsic(Intrinsic::masked_load, Ops, DataTy);
|
||||
}
|
||||
|
||||
/// Create a call to a Masked Store intrinsic.
|
||||
/// Ops - an array of operands.
|
||||
CallInst *IRBuilderBase::CreateMaskedStore(ArrayRef<Value *> Ops) {
|
||||
// DataTy - type of the data to be stored - the only one overloaded type
|
||||
Type *DataTy = Ops[1]->getType();
|
||||
return CreateMaskedIntrinsic(Intrinsic::masked_store, Ops, DataTy);
|
||||
}
|
||||
|
||||
/// Create a call to a Masked intrinsic, with given intrinsic Id,
|
||||
/// an array of operands - Ops, and one overloaded type - DataTy
|
||||
CallInst *IRBuilderBase::CreateMaskedIntrinsic(unsigned Id,
|
||||
ArrayRef<Value *> Ops,
|
||||
Type *DataTy) {
|
||||
Module *M = BB->getParent()->getParent();
|
||||
Type *OverloadedTypes[] = { DataTy };
|
||||
Value *TheFn = Intrinsic::getDeclaration(M, (Intrinsic::ID)Id, OverloadedTypes);
|
||||
return createCallHelper(TheFn, Ops, this);
|
||||
}
|
||||
|
@ -2405,6 +2405,19 @@ bool Verifier::VerifyIntrinsicType(Type *Ty,
|
||||
!isa<VectorType>(ArgTys[D.getArgumentNumber()]) ||
|
||||
VectorType::getHalfElementsVectorType(
|
||||
cast<VectorType>(ArgTys[D.getArgumentNumber()])) != Ty;
|
||||
case IITDescriptor::SameVecWidthArgument: {
|
||||
if (D.getArgumentNumber() >= ArgTys.size())
|
||||
return true;
|
||||
VectorType * ReferenceType =
|
||||
dyn_cast<VectorType>(ArgTys[D.getArgumentNumber()]);
|
||||
VectorType *ThisArgType = dyn_cast<VectorType>(Ty);
|
||||
if (!ThisArgType || !ReferenceType ||
|
||||
(ReferenceType->getVectorNumElements() !=
|
||||
ThisArgType->getVectorNumElements()))
|
||||
return true;
|
||||
return VerifyIntrinsicType(ThisArgType->getVectorElementType(),
|
||||
Infos, ArgTys);
|
||||
}
|
||||
}
|
||||
llvm_unreachable("unhandled");
|
||||
}
|
||||
|
@ -1321,13 +1321,21 @@ void X86TargetLowering::resetOperationActions() {
|
||||
|
||||
// Extract subvector is special because the value type
|
||||
// (result) is 128-bit but the source is 256-bit wide.
|
||||
if (VT.is128BitVector())
|
||||
if (VT.is128BitVector()) {
|
||||
if (VT.getScalarSizeInBits() >= 32) {
|
||||
setOperationAction(ISD::MLOAD, VT, Custom);
|
||||
setOperationAction(ISD::MSTORE, VT, Custom);
|
||||
}
|
||||
setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
|
||||
|
||||
}
|
||||
// Do not attempt to custom lower other non-256-bit vectors
|
||||
if (!VT.is256BitVector())
|
||||
continue;
|
||||
|
||||
if (VT.getScalarSizeInBits() >= 32) {
|
||||
setOperationAction(ISD::MLOAD, VT, Legal);
|
||||
setOperationAction(ISD::MSTORE, VT, Legal);
|
||||
}
|
||||
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
|
||||
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
|
||||
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
|
||||
@ -1494,9 +1502,13 @@ void X86TargetLowering::resetOperationActions() {
|
||||
unsigned EltSize = VT.getVectorElementType().getSizeInBits();
|
||||
// Extract subvector is special because the value type
|
||||
// (result) is 256/128-bit but the source is 512-bit wide.
|
||||
if (VT.is128BitVector() || VT.is256BitVector())
|
||||
if (VT.is128BitVector() || VT.is256BitVector()) {
|
||||
setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
|
||||
|
||||
if ( EltSize >= 32) {
|
||||
setOperationAction(ISD::MLOAD, VT, Legal);
|
||||
setOperationAction(ISD::MSTORE, VT, Legal);
|
||||
}
|
||||
}
|
||||
if (VT.getVectorElementType() == MVT::i1)
|
||||
setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
|
||||
|
||||
@ -1512,6 +1524,8 @@ void X86TargetLowering::resetOperationActions() {
|
||||
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
|
||||
setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
|
||||
setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
|
||||
setOperationAction(ISD::MLOAD, VT, Legal);
|
||||
setOperationAction(ISD::MSTORE, VT, Legal);
|
||||
}
|
||||
}
|
||||
for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
|
||||
|
@ -2097,6 +2097,41 @@ def: Pat<(int_x86_avx512_mask_storeu_pd_512 addr:$ptr, (v8f64 VR512:$src),
|
||||
(VMOVUPDZmrk addr:$ptr, (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)),
|
||||
VR512:$src)>;
|
||||
|
||||
def: Pat<(masked_store addr:$ptr, VK8WM:$mask, (v8f32 VR256:$src)),
|
||||
(VMOVUPSZmrk addr:$ptr,
|
||||
(v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)),
|
||||
(INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256:$src, sub_ymm))>;
|
||||
|
||||
def: Pat<(v8f32 (masked_load addr:$ptr, VK8WM:$mask, undef)),
|
||||
(v8f32 (EXTRACT_SUBREG (v16f32 (VMOVUPSZrmkz
|
||||
(v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)), addr:$ptr)), sub_ymm))>;
|
||||
|
||||
def: Pat<(masked_store addr:$ptr, VK16WM:$mask, (v16f32 VR512:$src)),
|
||||
(VMOVUPSZmrk addr:$ptr, VK16WM:$mask, VR512:$src)>;
|
||||
|
||||
def: Pat<(masked_store addr:$ptr, VK8WM:$mask, (v8f64 VR512:$src)),
|
||||
(VMOVUPDZmrk addr:$ptr, VK8WM:$mask, VR512:$src)>;
|
||||
|
||||
def: Pat<(v16f32 (masked_load addr:$ptr, VK16WM:$mask, undef)),
|
||||
(VMOVUPSZrmkz VK16WM:$mask, addr:$ptr)>;
|
||||
|
||||
def: Pat<(v16f32 (masked_load addr:$ptr, VK16WM:$mask,
|
||||
(bc_v16f32 (v16i32 immAllZerosV)))),
|
||||
(VMOVUPSZrmkz VK16WM:$mask, addr:$ptr)>;
|
||||
|
||||
def: Pat<(v16f32 (masked_load addr:$ptr, VK16WM:$mask, (v16f32 VR512:$src0))),
|
||||
(VMOVUPSZrmk VR512:$src0, VK16WM:$mask, addr:$ptr)>;
|
||||
|
||||
def: Pat<(v8f64 (masked_load addr:$ptr, VK8WM:$mask, undef)),
|
||||
(VMOVUPDZrmkz VK8WM:$mask, addr:$ptr)>;
|
||||
|
||||
def: Pat<(v8f64 (masked_load addr:$ptr, VK8WM:$mask,
|
||||
(bc_v8f64 (v16i32 immAllZerosV)))),
|
||||
(VMOVUPDZrmkz VK8WM:$mask, addr:$ptr)>;
|
||||
|
||||
def: Pat<(v8f64 (masked_load addr:$ptr, VK8WM:$mask, (v8f64 VR512:$src0))),
|
||||
(VMOVUPDZrmk VR512:$src0, VK8WM:$mask, addr:$ptr)>;
|
||||
|
||||
defm VMOVDQA32 : avx512_load_vl<0x6F, "vmovdqa32", "alignedload", "i", "32",
|
||||
"16", "8", "4", SSEPackedInt, HasAVX512>,
|
||||
avx512_store_vl<0x7F, "vmovdqa32", "alignedstore",
|
||||
@ -2171,6 +2206,46 @@ def : Pat<(v16i32 (vselect VK16WM:$mask, (v16i32 immAllZerosV),
|
||||
(VMOVDQU32Zrrkz (KNOTWrr VK16WM:$mask), VR512:$src)>;
|
||||
}
|
||||
|
||||
def: Pat<(v16i32 (masked_load addr:$ptr, VK16WM:$mask, (v16i32 immAllZerosV))),
|
||||
(VMOVDQU32Zrmkz VK16WM:$mask, addr:$ptr)>;
|
||||
|
||||
def: Pat<(v16i32 (masked_load addr:$ptr, VK16WM:$mask, undef)),
|
||||
(VMOVDQU32Zrmkz VK16WM:$mask, addr:$ptr)>;
|
||||
|
||||
def: Pat<(v16i32 (masked_load addr:$ptr, VK16WM:$mask, (v16i32 VR512:$src0))),
|
||||
(VMOVDQU32Zrmk VR512:$src0, VK16WM:$mask, addr:$ptr)>;
|
||||
|
||||
def: Pat<(v8i64 (masked_load addr:$ptr, VK8WM:$mask,
|
||||
(bc_v8i64 (v16i32 immAllZerosV)))),
|
||||
(VMOVDQU64Zrmkz VK8WM:$mask, addr:$ptr)>;
|
||||
|
||||
def: Pat<(v8i64 (masked_load addr:$ptr, VK8WM:$mask, undef)),
|
||||
(VMOVDQU64Zrmkz VK8WM:$mask, addr:$ptr)>;
|
||||
|
||||
def: Pat<(v8i64 (masked_load addr:$ptr, VK8WM:$mask, (v8i64 VR512:$src0))),
|
||||
(VMOVDQU64Zrmk VR512:$src0, VK8WM:$mask, addr:$ptr)>;
|
||||
|
||||
def: Pat<(masked_store addr:$ptr, VK16WM:$mask, (v16i32 VR512:$src)),
|
||||
(VMOVDQU32Zmrk addr:$ptr, VK16WM:$mask, VR512:$src)>;
|
||||
|
||||
def: Pat<(masked_store addr:$ptr, VK8WM:$mask, (v8i64 VR512:$src)),
|
||||
(VMOVDQU64Zmrk addr:$ptr, VK8WM:$mask, VR512:$src)>;
|
||||
|
||||
// SKX replacement
|
||||
def: Pat<(masked_store addr:$ptr, VK8WM:$mask, (v8i32 VR256:$src)),
|
||||
(VMOVDQU32Z256mrk addr:$ptr, VK8WM:$mask, VR256:$src)>;
|
||||
|
||||
// KNL replacement
|
||||
def: Pat<(masked_store addr:$ptr, VK8WM:$mask, (v8i32 VR256:$src)),
|
||||
(VMOVDQU32Zmrk addr:$ptr,
|
||||
(v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)),
|
||||
(INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256:$src, sub_ymm))>;
|
||||
|
||||
def: Pat<(v8i32 (masked_load addr:$ptr, VK8WM:$mask, undef)),
|
||||
(v8i32 (EXTRACT_SUBREG (v16i32 (VMOVDQU32Zrmkz
|
||||
(v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)), addr:$ptr)), sub_ymm))>;
|
||||
|
||||
|
||||
// Move Int Doubleword to Packed Double Int
|
||||
//
|
||||
def VMOVDI2PDIZrr : AVX512BI<0x6E, MRMSrcReg, (outs VR128X:$dst), (ins GR32:$src),
|
||||
|
@ -9260,6 +9260,61 @@ defm VPMASKMOVQ : avx2_pmovmask<"vpmaskmovq",
|
||||
int_x86_avx2_maskstore_q,
|
||||
int_x86_avx2_maskstore_q_256>, VEX_W;
|
||||
|
||||
def: Pat<(masked_store addr:$ptr, (v8i32 VR256:$mask), (v8f32 VR256:$src)),
|
||||
(VPMASKMOVDYmr addr:$ptr, VR256:$mask, VR256:$src)>;
|
||||
|
||||
def: Pat<(masked_store addr:$ptr, (v8i32 VR256:$mask), (v8i32 VR256:$src)),
|
||||
(VPMASKMOVDYmr addr:$ptr, VR256:$mask, VR256:$src)>;
|
||||
|
||||
def: Pat<(v8f32 (masked_load addr:$ptr, (v8i32 VR256:$mask), undef)),
|
||||
(VPMASKMOVDYrm VR256:$mask, addr:$ptr)>;
|
||||
|
||||
def: Pat<(v8f32 (masked_load addr:$ptr, (v8i32 VR256:$mask),
|
||||
(bc_v8f32 (v8i32 immAllZerosV)))),
|
||||
(VPMASKMOVDYrm VR256:$mask, addr:$ptr)>;
|
||||
|
||||
def: Pat<(v8f32 (masked_load addr:$ptr, (v8i32 VR256:$mask), (v8f32 VR256:$src0))),
|
||||
(VBLENDVPSYrr VR256:$src0, (VPMASKMOVDYrm VR256:$mask, addr:$ptr),
|
||||
VR256:$mask)>;
|
||||
|
||||
def: Pat<(v8i32 (masked_load addr:$ptr, (v8i32 VR256:$mask), undef)),
|
||||
(VPMASKMOVDYrm VR256:$mask, addr:$ptr)>;
|
||||
|
||||
def: Pat<(v8i32 (masked_load addr:$ptr, (v8i32 VR256:$mask), (v8i32 immAllZerosV))),
|
||||
(VPMASKMOVDYrm VR256:$mask, addr:$ptr)>;
|
||||
|
||||
def: Pat<(v8i32 (masked_load addr:$ptr, (v8i32 VR256:$mask), (v8i32 VR256:$src0))),
|
||||
(VBLENDVPSYrr VR256:$src0, (VPMASKMOVDYrm VR256:$mask, addr:$ptr),
|
||||
VR256:$mask)>;
|
||||
|
||||
def: Pat<(masked_store addr:$ptr, (v4i64 VR256:$mask), (v4f64 VR256:$src)),
|
||||
(VPMASKMOVQYmr addr:$ptr, VR256:$mask, VR256:$src)>;
|
||||
|
||||
def: Pat<(masked_store addr:$ptr, (v4i64 VR256:$mask), (v4i64 VR256:$src)),
|
||||
(VPMASKMOVQYmr addr:$ptr, VR256:$mask, VR256:$src)>;
|
||||
|
||||
def: Pat<(v4f64 (masked_load addr:$ptr, (v4i64 VR256:$mask), undef)),
|
||||
(VPMASKMOVQYrm VR256:$mask, addr:$ptr)>;
|
||||
|
||||
def: Pat<(v4f64 (masked_load addr:$ptr, (v4i64 VR256:$mask),
|
||||
(v4f64 immAllZerosV))),
|
||||
(VPMASKMOVQYrm VR256:$mask, addr:$ptr)>;
|
||||
|
||||
def: Pat<(v4f64 (masked_load addr:$ptr, (v4i64 VR256:$mask), (v4f64 VR256:$src0))),
|
||||
(VBLENDVPDYrr VR256:$src0, (VPMASKMOVQYrm VR256:$mask, addr:$ptr),
|
||||
VR256:$mask)>;
|
||||
|
||||
def: Pat<(v4i64 (masked_load addr:$ptr, (v4i64 VR256:$mask), undef)),
|
||||
(VPMASKMOVQYrm VR256:$mask, addr:$ptr)>;
|
||||
|
||||
def: Pat<(v4i64 (masked_load addr:$ptr, (v4i64 VR256:$mask),
|
||||
(bc_v4i64 (v8i32 immAllZerosV)))),
|
||||
(VPMASKMOVQYrm VR256:$mask, addr:$ptr)>;
|
||||
|
||||
def: Pat<(v4i64 (masked_load addr:$ptr, (v4i64 VR256:$mask), (v4i64 VR256:$src0))),
|
||||
(VBLENDVPDYrr VR256:$src0, (VPMASKMOVQYrm VR256:$mask, addr:$ptr),
|
||||
VR256:$mask)>;
|
||||
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// Variable Bit Shifts
|
||||
|
@ -111,6 +111,8 @@ public:
|
||||
Type *Ty) const override;
|
||||
unsigned getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
|
||||
Type *Ty) const override;
|
||||
bool isLegalPredicatedLoad (Type *DataType, int Consecutive) const;
|
||||
bool isLegalPredicatedStore(Type *DataType, int Consecutive) const;
|
||||
|
||||
/// @}
|
||||
};
|
||||
@ -1156,3 +1158,19 @@ unsigned X86TTI::getIntImmCost(Intrinsic::ID IID, unsigned Idx,
|
||||
}
|
||||
return X86TTI::getIntImmCost(Imm, Ty);
|
||||
}
|
||||
|
||||
/// Report whether a masked (predicated) vector load of \p DataType can be
/// emitted for the current subtarget.
///
/// \param DataType    Type being loaded; only its scalar size in bits is
///                    inspected.
/// \param Consecutive 0 is rejected; any non-zero value is accepted.
///                    NOTE(review): the vectorizer passes the result of
///                    isConsecutivePtr() here - confirm its exact encoding.
/// \returns true when the scalar width is at least 32 bits, Consecutive is
///          non-zero, and the subtarget reports AVX2 or AVX-512.
bool X86TTI::isLegalPredicatedLoad(Type *DataType, int Consecutive) const {
|
||||
int ScalarWidth = DataType->getScalarSizeInBits();
|
||||
|
||||
// TODO: AVX-512 gather/scatter also handles strided and random accesses,
// so the Consecutive restriction could be relaxed for that subtarget.
|
||||
if ((ScalarWidth < 32) || (Consecutive == 0))
|
||||
return false;
|
||||
// Only AVX2 and AVX-512 subtargets are accepted.
if (ST->hasAVX512() || ST->hasAVX2())
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
/// Report whether a masked (predicated) vector store of \p DataType can be
/// emitted. The answer is delegated to isLegalPredicatedLoad, so loads and
/// stores are subject to exactly the same checks.
bool X86TTI::isLegalPredicatedStore(Type *DataType, int Consecutive) const {
|
||||
return isLegalPredicatedLoad(DataType, Consecutive);
|
||||
}
|
||||
|
||||
|
@ -580,9 +580,10 @@ public:
|
||||
|
||||
LoopVectorizationLegality(Loop *L, ScalarEvolution *SE, const DataLayout *DL,
|
||||
DominatorTree *DT, TargetLibraryInfo *TLI,
|
||||
AliasAnalysis *AA, Function *F)
|
||||
AliasAnalysis *AA, Function *F,
|
||||
const TargetTransformInfo *TTI)
|
||||
: NumLoads(0), NumStores(0), NumPredStores(0), TheLoop(L), SE(SE), DL(DL),
|
||||
DT(DT), TLI(TLI), AA(AA), TheFunction(F), Induction(nullptr),
|
||||
DT(DT), TLI(TLI), AA(AA), TheFunction(F), TTI(TTI), Induction(nullptr),
|
||||
WidestIndTy(nullptr), HasFunNoNaNAttr(false), MaxSafeDepDistBytes(-1U) {
|
||||
}
|
||||
|
||||
@ -768,6 +769,15 @@ public:
|
||||
}
|
||||
SmallPtrSet<Value *, 8>::iterator strides_end() { return StrideSet.end(); }
|
||||
|
||||
/// Ask the target (through TTI) whether a store of \p DataType via \p Ptr can
/// be emitted as a masked store. The value of isConsecutivePtr(Ptr) is passed
/// to the target as the second argument. TTI is dereferenced without a null
/// check.
bool canPredicateStore(Type *DataType, Value *Ptr) {
|
||||
return TTI->isLegalPredicatedStore(DataType, isConsecutivePtr(Ptr));
|
||||
}
|
||||
/// Ask the target (through TTI) whether a load of \p DataType via \p Ptr can
/// be emitted as a masked load. The value of isConsecutivePtr(Ptr) is passed
/// to the target as the second argument. TTI is dereferenced without a null
/// check.
bool canPredicateLoad(Type *DataType, Value *Ptr) {
|
||||
return TTI->isLegalPredicatedLoad(DataType, isConsecutivePtr(Ptr));
|
||||
}
|
||||
/// Return true if \p I has been recorded in MaskedOp.
/// Despite the "set" prefix this is a pure lookup; MaskedOp is not modified.
bool setMaskedOp(const Instruction* I) {
|
||||
return (MaskedOp.find(I) != MaskedOp.end());
|
||||
}
|
||||
private:
|
||||
/// Check if a single basic block loop is vectorizable.
|
||||
/// At this point we know that this is a loop with a constant trip count
|
||||
@ -840,6 +850,8 @@ private:
|
||||
AliasAnalysis *AA;
|
||||
/// Parent function
|
||||
Function *TheFunction;
|
||||
/// Target Transform Info
|
||||
const TargetTransformInfo *TTI;
|
||||
|
||||
// --- vectorization state --- //
|
||||
|
||||
@ -871,6 +883,10 @@ private:
|
||||
|
||||
ValueToValueMap Strides;
|
||||
SmallPtrSet<Value *, 8> StrideSet;
|
||||
|
||||
/// While vectorizing these instructions we have to generate a
|
||||
/// call to an appropriate masked intrinsic
|
||||
std::set<const Instruction*> MaskedOp;
|
||||
};
|
||||
|
||||
/// LoopVectorizationCostModel - estimates the expected speedups due to
|
||||
@ -1375,7 +1391,7 @@ struct LoopVectorize : public FunctionPass {
|
||||
}
|
||||
|
||||
// Check if it is legal to vectorize the loop.
|
||||
LoopVectorizationLegality LVL(L, SE, DL, DT, TLI, AA, F);
|
||||
LoopVectorizationLegality LVL(L, SE, DL, DT, TLI, AA, F, TTI);
|
||||
if (!LVL.canVectorize()) {
|
||||
DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
|
||||
emitMissedWarning(F, L, Hints);
|
||||
@ -1763,7 +1779,8 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
|
||||
unsigned ScalarAllocatedSize = DL->getTypeAllocSize(ScalarDataTy);
|
||||
unsigned VectorElementSize = DL->getTypeStoreSize(DataTy)/VF;
|
||||
|
||||
if (SI && Legal->blockNeedsPredication(SI->getParent()))
|
||||
if (SI && Legal->blockNeedsPredication(SI->getParent()) &&
|
||||
!Legal->setMaskedOp(SI))
|
||||
return scalarizeInstruction(Instr, true);
|
||||
|
||||
if (ScalarAllocatedSize != VectorElementSize)
|
||||
@ -1857,8 +1874,25 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
|
||||
|
||||
Value *VecPtr = Builder.CreateBitCast(PartPtr,
|
||||
DataTy->getPointerTo(AddressSpace));
|
||||
StoreInst *NewSI =
|
||||
Builder.CreateAlignedStore(StoredVal[Part], VecPtr, Alignment);
|
||||
|
||||
Instruction *NewSI;
|
||||
if (Legal->setMaskedOp(SI)) {
|
||||
Type *I8PtrTy =
|
||||
Builder.getInt8PtrTy(PartPtr->getType()->getPointerAddressSpace());
|
||||
|
||||
Value *I8Ptr = Builder.CreateBitCast(PartPtr, I8PtrTy);
|
||||
|
||||
VectorParts Cond = createEdgeMask(SI->getParent()->getSinglePredecessor(),
|
||||
SI->getParent());
|
||||
SmallVector <Value *, 8> Ops;
|
||||
Ops.push_back(I8Ptr);
|
||||
Ops.push_back(StoredVal[Part]);
|
||||
Ops.push_back(Builder.getInt32(Alignment));
|
||||
Ops.push_back(Cond[Part]);
|
||||
NewSI = Builder.CreateMaskedStore(Ops);
|
||||
}
|
||||
else
|
||||
NewSI = Builder.CreateAlignedStore(StoredVal[Part], VecPtr, Alignment);
|
||||
propagateMetadata(NewSI, SI);
|
||||
}
|
||||
return;
|
||||
@ -1873,14 +1907,31 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
|
||||
|
||||
if (Reverse) {
|
||||
// If the address is consecutive but reversed, then the
|
||||
// wide store needs to start at the last vector element.
|
||||
// wide load needs to start at the last vector element.
|
||||
PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF));
|
||||
PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF));
|
||||
}
|
||||
|
||||
Value *VecPtr = Builder.CreateBitCast(PartPtr,
|
||||
DataTy->getPointerTo(AddressSpace));
|
||||
LoadInst *NewLI = Builder.CreateAlignedLoad(VecPtr, Alignment, "wide.load");
|
||||
Instruction* NewLI;
|
||||
if (Legal->setMaskedOp(LI)) {
|
||||
Type *I8PtrTy =
|
||||
Builder.getInt8PtrTy(PartPtr->getType()->getPointerAddressSpace());
|
||||
|
||||
Value *I8Ptr = Builder.CreateBitCast(PartPtr, I8PtrTy);
|
||||
|
||||
VectorParts SrcMask = createBlockInMask(LI->getParent());
|
||||
SmallVector <Value *, 8> Ops;
|
||||
Ops.push_back(I8Ptr);
|
||||
Ops.push_back(UndefValue::get(DataTy));
|
||||
Ops.push_back(Builder.getInt32(Alignment));
|
||||
Ops.push_back(SrcMask[Part]);
|
||||
NewLI = Builder.CreateMaskedLoad(Ops);
|
||||
}
|
||||
else {
|
||||
Value *VecPtr = Builder.CreateBitCast(PartPtr,
|
||||
DataTy->getPointerTo(AddressSpace));
|
||||
NewLI = Builder.CreateAlignedLoad(VecPtr, Alignment, "wide.load");
|
||||
}
|
||||
propagateMetadata(NewLI, LI);
|
||||
Entry[Part] = Reverse ? reverseVector(NewLI) : NewLI;
|
||||
}
|
||||
@ -5304,8 +5355,15 @@ bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB,
|
||||
// We might be able to hoist the load.
|
||||
if (it->mayReadFromMemory()) {
|
||||
LoadInst *LI = dyn_cast<LoadInst>(it);
|
||||
if (!LI || !SafePtrs.count(LI->getPointerOperand()))
|
||||
if (!LI)
|
||||
return false;
|
||||
if (!SafePtrs.count(LI->getPointerOperand())) {
|
||||
if (canPredicateLoad(LI->getType(), LI->getPointerOperand())) {
|
||||
MaskedOp.insert(LI);
|
||||
continue;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// We don't predicate stores at the moment.
|
||||
@ -5313,10 +5371,20 @@ bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB,
|
||||
StoreInst *SI = dyn_cast<StoreInst>(it);
|
||||
// We only support predication of stores in basic blocks with one
|
||||
// predecessor.
|
||||
if (!SI || ++NumPredStores > NumberOfStoresToPredicate ||
|
||||
!SafePtrs.count(SI->getPointerOperand()) ||
|
||||
!SI->getParent()->getSinglePredecessor())
|
||||
if (!SI)
|
||||
return false;
|
||||
|
||||
if (++NumPredStores > NumberOfStoresToPredicate ||
|
||||
!SafePtrs.count(SI->getPointerOperand()) ||
|
||||
!SI->getParent()->getSinglePredecessor()) {
|
||||
if (canPredicateStore(SI->getValueOperand()->getType(),
|
||||
SI->getPointerOperand())) {
|
||||
MaskedOp.insert(SI);
|
||||
--NumPredStores;
|
||||
continue;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
}
|
||||
if (it->mayThrow())
|
||||
return false;
|
||||
@ -5380,7 +5448,7 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {
|
||||
MaxVectorSize = 1;
|
||||
}
|
||||
|
||||
assert(MaxVectorSize <= 32 && "Did not expect to pack so many elements"
|
||||
assert(MaxVectorSize <= 64 && "Did not expect to pack so many elements"
|
||||
" into one vector!");
|
||||
|
||||
unsigned VF = MaxVectorSize;
|
||||
@ -5441,7 +5509,7 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {
|
||||
// the vector elements.
|
||||
float VectorCost = expectedCost(i) / (float)i;
|
||||
DEBUG(dbgs() << "LV: Vector loop of width " << i << " costs: " <<
|
||||
(int)VectorCost << ".\n");
|
||||
VectorCost << ".\n");
|
||||
if (VectorCost < Cost) {
|
||||
Cost = VectorCost;
|
||||
Width = i;
|
||||
|
73
test/CodeGen/X86/masked_memop.ll
Normal file
73
test/CodeGen/X86/masked_memop.ll
Normal file
@ -0,0 +1,73 @@
|
||||
; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=knl < %s | FileCheck %s -check-prefix=AVX512
|
||||
; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=core-avx2 < %s | FileCheck %s -check-prefix=AVX2
|
||||
|
||||
; AVX512-LABEL: test1
|
||||
; AVX512: vmovdqu32 (%rdi), %zmm0 {%k1} {z}
|
||||
|
||||
; AVX2-LABEL: test1
|
||||
; AVX2: vpmaskmovd 32(%rdi)
|
||||
; AVX2: vpmaskmovd (%rdi)
|
||||
; AVX2-NOT: blend
|
||||
|
||||
define <16 x i32> @test1(<16 x i32> %trigger, i8* %addr) {
|
||||
%mask = icmp eq <16 x i32> %trigger, zeroinitializer
|
||||
%res = call <16 x i32> @llvm.masked.load.v16i32(i8* %addr, <16 x i32>undef, i32 4, <16 x i1>%mask)
|
||||
ret <16 x i32> %res
|
||||
}
|
||||
|
||||
; AVX512-LABEL: test2
|
||||
; AVX512: vmovdqu32 (%rdi), %zmm0 {%k1} {z}
|
||||
|
||||
; AVX2-LABEL: test2
|
||||
; AVX2: vpmaskmovd {{.*}}(%rdi)
|
||||
; AVX2: vpmaskmovd {{.*}}(%rdi)
|
||||
; AVX2-NOT: blend
|
||||
define <16 x i32> @test2(<16 x i32> %trigger, i8* %addr) {
|
||||
%mask = icmp eq <16 x i32> %trigger, zeroinitializer
|
||||
%res = call <16 x i32> @llvm.masked.load.v16i32(i8* %addr, <16 x i32>zeroinitializer, i32 4, <16 x i1>%mask)
|
||||
ret <16 x i32> %res
|
||||
}
|
||||
|
||||
; AVX512-LABEL: test3
|
||||
; AVX512: vmovdqu32 %zmm1, (%rdi) {%k1}
|
||||
|
||||
define void @test3(<16 x i32> %trigger, i8* %addr, <16 x i32> %val) {
|
||||
%mask = icmp eq <16 x i32> %trigger, zeroinitializer
|
||||
call void @llvm.masked.store.v16i32(i8* %addr, <16 x i32>%val, i32 4, <16 x i1>%mask)
|
||||
ret void
|
||||
}
|
||||
|
||||
; AVX512-LABEL: test4
|
||||
; AVX512: vmovups (%rdi), %zmm{{.*{%k[1-7]}}}
|
||||
|
||||
; AVX2-LABEL: test4
|
||||
; AVX2: vpmaskmovd {{.*}}(%rdi)
|
||||
; AVX2: vpmaskmovd {{.*}}(%rdi)
|
||||
; AVX2: blend
|
||||
define <16 x float> @test4(<16 x i32> %trigger, i8* %addr, <16 x float> %dst) {
|
||||
%mask = icmp eq <16 x i32> %trigger, zeroinitializer
|
||||
%res = call <16 x float> @llvm.masked.load.v16f32(i8* %addr, <16 x float>%dst, i32 4, <16 x i1>%mask)
|
||||
ret <16 x float> %res
|
||||
}
|
||||
|
||||
; AVX512-LABEL: test5
|
||||
; AVX512: vmovupd (%rdi), %zmm1 {%k1}
|
||||
|
||||
; AVX2-LABEL: test5
|
||||
; AVX2: vpmaskmovq
|
||||
; AVX2: vblendvpd
|
||||
; AVX2: vpmaskmovq
|
||||
; AVX2: vblendvpd
|
||||
define <8 x double> @test5(<8 x i32> %trigger, i8* %addr, <8 x double> %dst) {
|
||||
%mask = icmp eq <8 x i32> %trigger, zeroinitializer
|
||||
%res = call <8 x double> @llvm.masked.load.v8f64(i8* %addr, <8 x double>%dst, i32 4, <8 x i1>%mask)
|
||||
ret <8 x double> %res
|
||||
}
|
||||
|
||||
declare <16 x i32> @llvm.masked.load.v16i32(i8*, <16 x i32>, i32, <16 x i1>)
|
||||
declare void @llvm.masked.store.v16i32(i8*, <16 x i32>, i32, <16 x i1>)
|
||||
declare <16 x float> @llvm.masked.load.v16f32(i8*, <16 x float>, i32, <16 x i1>)
|
||||
declare void @llvm.masked.store.v16f32(i8*, <16 x float>, i32, <16 x i1>)
|
||||
declare <8 x double> @llvm.masked.load.v8f64(i8*, <8 x double>, i32, <8 x i1>)
|
||||
declare void @llvm.masked.store.v8f64(i8*, <8 x double>, i32, <8 x i1>)
|
||||
|
83
test/Transforms/LoopVectorize/X86/mask1.ll
Normal file
83
test/Transforms/LoopVectorize/X86/mask1.ll
Normal file
@ -0,0 +1,83 @@
|
||||
; RUN: opt < %s -O3 -mcpu=corei7-avx -S | FileCheck %s -check-prefix=AVX1
|
||||
; RUN: opt < %s -O3 -mcpu=core-avx2 -S | FileCheck %s -check-prefix=AVX2
|
||||
; RUN: opt < %s -O3 -mcpu=knl -S | FileCheck %s -check-prefix=AVX512
|
||||
|
||||
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
|
||||
target triple = "x86_64-pc_linux"
|
||||
|
||||
; The source code:
|
||||
;
|
||||
;void foo(int *A, int *B, int *trigger) {
|
||||
;
|
||||
; for (int i=0; i<10000; i++) {
|
||||
; if (trigger[i] < 100) {
|
||||
; A[i] = B[i] + trigger[i];
|
||||
; }
|
||||
; }
|
||||
;}
|
||||
|
||||
|
||||
;AVX2: llvm.masked.load.v8i32
|
||||
;AVX2: llvm.masked.store.v8i32
|
||||
;AVX512: llvm.masked.load.v16i32
|
||||
;AVX512: llvm.masked.store.v16i32
|
||||
;AVX1-NOT: llvm.masked
|
||||
|
||||
; Function Attrs: nounwind uwtable
|
||||
define void @foo(i32* %A, i32* %B, i32* %trigger) {
|
||||
entry:
|
||||
%A.addr = alloca i32*, align 8
|
||||
%B.addr = alloca i32*, align 8
|
||||
%trigger.addr = alloca i32*, align 8
|
||||
%i = alloca i32, align 4
|
||||
store i32* %A, i32** %A.addr, align 8
|
||||
store i32* %B, i32** %B.addr, align 8
|
||||
store i32* %trigger, i32** %trigger.addr, align 8
|
||||
store i32 0, i32* %i, align 4
|
||||
br label %for.cond
|
||||
|
||||
for.cond: ; preds = %for.inc, %entry
|
||||
%0 = load i32* %i, align 4
|
||||
%cmp = icmp slt i32 %0, 10000
|
||||
br i1 %cmp, label %for.body, label %for.end
|
||||
|
||||
for.body: ; preds = %for.cond
|
||||
%1 = load i32* %i, align 4
|
||||
%idxprom = sext i32 %1 to i64
|
||||
%2 = load i32** %trigger.addr, align 8
|
||||
%arrayidx = getelementptr inbounds i32* %2, i64 %idxprom
|
||||
%3 = load i32* %arrayidx, align 4
|
||||
%cmp1 = icmp slt i32 %3, 100
|
||||
br i1 %cmp1, label %if.then, label %if.end
|
||||
|
||||
if.then: ; preds = %for.body
|
||||
%4 = load i32* %i, align 4
|
||||
%idxprom2 = sext i32 %4 to i64
|
||||
%5 = load i32** %B.addr, align 8
|
||||
%arrayidx3 = getelementptr inbounds i32* %5, i64 %idxprom2
|
||||
%6 = load i32* %arrayidx3, align 4
|
||||
%7 = load i32* %i, align 4
|
||||
%idxprom4 = sext i32 %7 to i64
|
||||
%8 = load i32** %trigger.addr, align 8
|
||||
%arrayidx5 = getelementptr inbounds i32* %8, i64 %idxprom4
|
||||
%9 = load i32* %arrayidx5, align 4
|
||||
%add = add nsw i32 %6, %9
|
||||
%10 = load i32* %i, align 4
|
||||
%idxprom6 = sext i32 %10 to i64
|
||||
%11 = load i32** %A.addr, align 8
|
||||
%arrayidx7 = getelementptr inbounds i32* %11, i64 %idxprom6
|
||||
store i32 %add, i32* %arrayidx7, align 4
|
||||
br label %if.end
|
||||
|
||||
if.end: ; preds = %if.then, %for.body
|
||||
br label %for.inc
|
||||
|
||||
for.inc: ; preds = %if.end
|
||||
%12 = load i32* %i, align 4
|
||||
%inc = add nsw i32 %12, 1
|
||||
store i32 %inc, i32* %i, align 4
|
||||
br label %for.cond
|
||||
|
||||
for.end: ; preds = %for.cond
|
||||
ret void
|
||||
}
|
84
test/Transforms/LoopVectorize/X86/mask2.ll
Normal file
84
test/Transforms/LoopVectorize/X86/mask2.ll
Normal file
@ -0,0 +1,84 @@
|
||||
; RUN: opt < %s -O3 -mcpu=corei7-avx -S | FileCheck %s -check-prefix=AVX1
|
||||
; RUN: opt < %s -O3 -mcpu=core-avx2 -S | FileCheck %s -check-prefix=AVX2
|
||||
; RUN: opt < %s -O3 -mcpu=knl -S | FileCheck %s -check-prefix=AVX512
|
||||
|
||||
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
|
||||
target triple = "x86_64-pc_linux"
|
||||
|
||||
; The source code:
|
||||
;
|
||||
;void foo(float *A, float *B, int *trigger) {
|
||||
;
|
||||
; for (int i=0; i<10000; i++) {
|
||||
; if (trigger[i] < 100) {
|
||||
; A[i] = B[i] + trigger[i];
|
||||
; }
|
||||
; }
|
||||
;}
|
||||
|
||||
|
||||
;AVX2: llvm.masked.load.v8f32
|
||||
;AVX2: llvm.masked.store.v8f32
|
||||
;AVX512: llvm.masked.load.v16f32
|
||||
;AVX512: llvm.masked.store.v16f32
|
||||
;AVX1-NOT: llvm.masked
|
||||
|
||||
; Function Attrs: nounwind uwtable
|
||||
define void @foo(float* %A, float* %B, i32* %trigger) {
|
||||
entry:
|
||||
%A.addr = alloca float*, align 8
|
||||
%B.addr = alloca float*, align 8
|
||||
%trigger.addr = alloca i32*, align 8
|
||||
%i = alloca i32, align 4
|
||||
store float* %A, float** %A.addr, align 8
|
||||
store float* %B, float** %B.addr, align 8
|
||||
store i32* %trigger, i32** %trigger.addr, align 8
|
||||
store i32 0, i32* %i, align 4
|
||||
br label %for.cond
|
||||
|
||||
for.cond: ; preds = %for.inc, %entry
|
||||
%0 = load i32* %i, align 4
|
||||
%cmp = icmp slt i32 %0, 10000
|
||||
br i1 %cmp, label %for.body, label %for.end
|
||||
|
||||
for.body: ; preds = %for.cond
|
||||
%1 = load i32* %i, align 4
|
||||
%idxprom = sext i32 %1 to i64
|
||||
%2 = load i32** %trigger.addr, align 8
|
||||
%arrayidx = getelementptr inbounds i32* %2, i64 %idxprom
|
||||
%3 = load i32* %arrayidx, align 4
|
||||
%cmp1 = icmp slt i32 %3, 100
|
||||
br i1 %cmp1, label %if.then, label %if.end
|
||||
|
||||
if.then: ; preds = %for.body
|
||||
%4 = load i32* %i, align 4
|
||||
%idxprom2 = sext i32 %4 to i64
|
||||
%5 = load float** %B.addr, align 8
|
||||
%arrayidx3 = getelementptr inbounds float* %5, i64 %idxprom2
|
||||
%6 = load float* %arrayidx3, align 4
|
||||
%7 = load i32* %i, align 4
|
||||
%idxprom4 = sext i32 %7 to i64
|
||||
%8 = load i32** %trigger.addr, align 8
|
||||
%arrayidx5 = getelementptr inbounds i32* %8, i64 %idxprom4
|
||||
%9 = load i32* %arrayidx5, align 4
|
||||
%conv = sitofp i32 %9 to float
|
||||
%add = fadd float %6, %conv
|
||||
%10 = load i32* %i, align 4
|
||||
%idxprom6 = sext i32 %10 to i64
|
||||
%11 = load float** %A.addr, align 8
|
||||
%arrayidx7 = getelementptr inbounds float* %11, i64 %idxprom6
|
||||
store float %add, float* %arrayidx7, align 4
|
||||
br label %if.end
|
||||
|
||||
if.end: ; preds = %if.then, %for.body
|
||||
br label %for.inc
|
||||
|
||||
for.inc: ; preds = %if.end
|
||||
%12 = load i32* %i, align 4
|
||||
%inc = add nsw i32 %12, 1
|
||||
store i32 %inc, i32* %i, align 4
|
||||
br label %for.cond
|
||||
|
||||
for.end: ; preds = %for.cond
|
||||
ret void
|
||||
}
|
84
test/Transforms/LoopVectorize/X86/mask3.ll
Normal file
84
test/Transforms/LoopVectorize/X86/mask3.ll
Normal file
@ -0,0 +1,84 @@
|
||||
; RUN: opt < %s -O3 -mcpu=corei7-avx -S | FileCheck %s -check-prefix=AVX1
|
||||
; RUN: opt < %s -O3 -mcpu=core-avx2 -S | FileCheck %s -check-prefix=AVX2
|
||||
; RUN: opt < %s -O3 -mcpu=knl -S | FileCheck %s -check-prefix=AVX512
|
||||
|
||||
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
|
||||
target triple = "x86_64-pc_linux"
|
||||
|
||||
; The source code:
|
||||
;
|
||||
;void foo(double *A, double *B, int *trigger) {
|
||||
;
|
||||
; for (int i=0; i<10000; i++) {
|
||||
; if (trigger[i] < 100) {
|
||||
; A[i] = B[i] + trigger[i];
|
||||
; }
|
||||
; }
|
||||
;}
|
||||
|
||||
|
||||
;AVX2: llvm.masked.load.v4f64
|
||||
;AVX2: llvm.masked.store.v4f64
|
||||
;AVX512: llvm.masked.load.v8f64
|
||||
;AVX512: llvm.masked.store.v8f64
|
||||
;AVX1-NOT: llvm.masked
|
||||
|
||||
; Function Attrs: nounwind uwtable
|
||||
define void @foo(double* %A, double* %B, i32* %trigger) #0 {
|
||||
entry:
|
||||
%A.addr = alloca double*, align 8
|
||||
%B.addr = alloca double*, align 8
|
||||
%trigger.addr = alloca i32*, align 8
|
||||
%i = alloca i32, align 4
|
||||
store double* %A, double** %A.addr, align 8
|
||||
store double* %B, double** %B.addr, align 8
|
||||
store i32* %trigger, i32** %trigger.addr, align 8
|
||||
store i32 0, i32* %i, align 4
|
||||
br label %for.cond
|
||||
|
||||
for.cond: ; preds = %for.inc, %entry
|
||||
%0 = load i32* %i, align 4
|
||||
%cmp = icmp slt i32 %0, 10000
|
||||
br i1 %cmp, label %for.body, label %for.end
|
||||
|
||||
for.body: ; preds = %for.cond
|
||||
%1 = load i32* %i, align 4
|
||||
%idxprom = sext i32 %1 to i64
|
||||
%2 = load i32** %trigger.addr, align 8
|
||||
%arrayidx = getelementptr inbounds i32* %2, i64 %idxprom
|
||||
%3 = load i32* %arrayidx, align 4
|
||||
%cmp1 = icmp slt i32 %3, 100
|
||||
br i1 %cmp1, label %if.then, label %if.end
|
||||
|
||||
if.then: ; preds = %for.body
|
||||
%4 = load i32* %i, align 4
|
||||
%idxprom2 = sext i32 %4 to i64
|
||||
%5 = load double** %B.addr, align 8
|
||||
%arrayidx3 = getelementptr inbounds double* %5, i64 %idxprom2
|
||||
%6 = load double* %arrayidx3, align 8
|
||||
%7 = load i32* %i, align 4
|
||||
%idxprom4 = sext i32 %7 to i64
|
||||
%8 = load i32** %trigger.addr, align 8
|
||||
%arrayidx5 = getelementptr inbounds i32* %8, i64 %idxprom4
|
||||
%9 = load i32* %arrayidx5, align 4
|
||||
%conv = sitofp i32 %9 to double
|
||||
%add = fadd double %6, %conv
|
||||
%10 = load i32* %i, align 4
|
||||
%idxprom6 = sext i32 %10 to i64
|
||||
%11 = load double** %A.addr, align 8
|
||||
%arrayidx7 = getelementptr inbounds double* %11, i64 %idxprom6
|
||||
store double %add, double* %arrayidx7, align 8
|
||||
br label %if.end
|
||||
|
||||
if.end: ; preds = %if.then, %for.body
|
||||
br label %for.inc
|
||||
|
||||
for.inc: ; preds = %if.end
|
||||
%12 = load i32* %i, align 4
|
||||
%inc = add nsw i32 %12, 1
|
||||
store i32 %inc, i32* %i, align 4
|
||||
br label %for.cond
|
||||
|
||||
for.end: ; preds = %for.cond
|
||||
ret void
|
||||
}
|
83
test/Transforms/LoopVectorize/X86/mask4.ll
Normal file
83
test/Transforms/LoopVectorize/X86/mask4.ll
Normal file
@ -0,0 +1,83 @@
|
||||
; RUN: opt < %s -O3 -mcpu=corei7-avx -S | FileCheck %s -check-prefix=AVX1
|
||||
; RUN: opt < %s -O3 -mcpu=core-avx2 -S | FileCheck %s -check-prefix=AVX2
|
||||
; RUN: opt < %s -O3 -mcpu=knl -S | FileCheck %s -check-prefix=AVX512
|
||||
|
||||
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
|
||||
target triple = "x86_64-pc_linux"
|
||||
|
||||
; The source code:
|
||||
;
|
||||
;void foo(double *A, double *B, int *trigger) {
|
||||
;
|
||||
; for (int i=0; i<10000; i++) {
|
||||
; if (trigger[i] < 100) {
|
||||
; A[i] = B[i*2] + trigger[i]; << non-consecutive access
|
||||
; }
|
||||
; }
|
||||
;}
|
||||
|
||||
|
||||
;AVX2-NOT: llvm.masked
|
||||
;AVX512-NOT: llvm.masked
|
||||
;AVX1-NOT: llvm.masked
|
||||
|
||||
; Function Attrs: nounwind uwtable
|
||||
define void @foo(double* %A, double* %B, i32* %trigger) {
|
||||
entry:
|
||||
%A.addr = alloca double*, align 8
|
||||
%B.addr = alloca double*, align 8
|
||||
%trigger.addr = alloca i32*, align 8
|
||||
%i = alloca i32, align 4
|
||||
store double* %A, double** %A.addr, align 8
|
||||
store double* %B, double** %B.addr, align 8
|
||||
store i32* %trigger, i32** %trigger.addr, align 8
|
||||
store i32 0, i32* %i, align 4
|
||||
br label %for.cond
|
||||
|
||||
for.cond: ; preds = %for.inc, %entry
|
||||
%0 = load i32* %i, align 4
|
||||
%cmp = icmp slt i32 %0, 10000
|
||||
br i1 %cmp, label %for.body, label %for.end
|
||||
|
||||
for.body: ; preds = %for.cond
|
||||
%1 = load i32* %i, align 4
|
||||
%idxprom = sext i32 %1 to i64
|
||||
%2 = load i32** %trigger.addr, align 8
|
||||
%arrayidx = getelementptr inbounds i32* %2, i64 %idxprom
|
||||
%3 = load i32* %arrayidx, align 4
|
||||
%cmp1 = icmp slt i32 %3, 100
|
||||
br i1 %cmp1, label %if.then, label %if.end
|
||||
|
||||
if.then: ; preds = %for.body
|
||||
%4 = load i32* %i, align 4
|
||||
%mul = mul nsw i32 %4, 2
|
||||
%idxprom2 = sext i32 %mul to i64
|
||||
%5 = load double** %B.addr, align 8
|
||||
%arrayidx3 = getelementptr inbounds double* %5, i64 %idxprom2
|
||||
%6 = load double* %arrayidx3, align 8
|
||||
%7 = load i32* %i, align 4
|
||||
%idxprom4 = sext i32 %7 to i64
|
||||
%8 = load i32** %trigger.addr, align 8
|
||||
%arrayidx5 = getelementptr inbounds i32* %8, i64 %idxprom4
|
||||
%9 = load i32* %arrayidx5, align 4
|
||||
%conv = sitofp i32 %9 to double
|
||||
%add = fadd double %6, %conv
|
||||
%10 = load i32* %i, align 4
|
||||
%idxprom6 = sext i32 %10 to i64
|
||||
%11 = load double** %A.addr, align 8
|
||||
%arrayidx7 = getelementptr inbounds double* %11, i64 %idxprom6
|
||||
store double %add, double* %arrayidx7, align 8
|
||||
br label %if.end
|
||||
|
||||
if.end: ; preds = %if.then, %for.body
|
||||
br label %for.inc
|
||||
|
||||
for.inc: ; preds = %if.end
|
||||
%12 = load i32* %i, align 4
|
||||
%inc = add nsw i32 %12, 1
|
||||
store i32 %inc, i32* %i, align 4
|
||||
br label %for.cond
|
||||
|
||||
for.end: ; preds = %for.cond
|
||||
ret void
|
||||
}
|
@ -539,7 +539,8 @@ CodeGenIntrinsic::CodeGenIntrinsic(Record *R) {
|
||||
// variants with iAny types; otherwise, if the intrinsic is not
|
||||
// overloaded, all the types can be specified directly.
|
||||
assert(((!TyEl->isSubClassOf("LLVMExtendedType") &&
|
||||
!TyEl->isSubClassOf("LLVMTruncatedType")) ||
|
||||
!TyEl->isSubClassOf("LLVMTruncatedType") &&
|
||||
!TyEl->isSubClassOf("LLVMVectorSameWidth")) ||
|
||||
VT == MVT::iAny || VT == MVT::vAny) &&
|
||||
"Expected iAny or vAny type");
|
||||
} else
|
||||
|
@ -257,7 +257,8 @@ enum IIT_Info {
|
||||
IIT_ANYPTR = 26,
|
||||
IIT_V1 = 27,
|
||||
IIT_VARARG = 28,
|
||||
IIT_HALF_VEC_ARG = 29
|
||||
IIT_HALF_VEC_ARG = 29,
|
||||
IIT_SAME_VEC_WIDTH_ARG = 30
|
||||
};
|
||||
|
||||
|
||||
@ -305,6 +306,13 @@ static void EncodeFixedType(Record *R, std::vector<unsigned char> &ArgCodes,
|
||||
Sig.push_back(IIT_TRUNC_ARG);
|
||||
else if (R->isSubClassOf("LLVMHalfElementsVectorType"))
|
||||
Sig.push_back(IIT_HALF_VEC_ARG);
|
||||
else if (R->isSubClassOf("LLVMVectorSameWidth")) {
|
||||
Sig.push_back(IIT_SAME_VEC_WIDTH_ARG);
|
||||
Sig.push_back((Number << 2) | ArgCodes[Number]);
|
||||
MVT::SimpleValueType VT = getValueType(R->getValueAsDef("ElTy"));
|
||||
EncodeFixedValueType(VT, Sig);
|
||||
return;
|
||||
}
|
||||
else
|
||||
Sig.push_back(IIT_ARG);
|
||||
return Sig.push_back((Number << 2) | ArgCodes[Number]);
|
||||
|
Loading…
x
Reference in New Issue
Block a user