Introduce experimental generic intrinsics for horizontal vector reductions.

- This change allows targets to opt in to using them instead of the log2
  shufflevector algorithm.
- The common shuffle-reduction code in the SLP and Loop vectorizers is
  factored out into LoopUtils, giving both a unified interface for generating
  reductions regardless of the preference of the target. LoopUtils now uses TTI
  to determine which kinds of reductions the target wants to handle.
- For CodeGen, basic legalization support is added.

Differential Revision: https://reviews.llvm.org/D30086



git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@302514 91177308-0d34-0410-b5e6-96231b3b80d8
Amara Emerson 2017-05-09 10:43:25 +00:00
parent 64dbd2ef25
commit 8f1f7ce9d1
19 changed files with 960 additions and 66 deletions


@@ -11687,6 +11687,338 @@ Examples:
%r2 = call float @llvm.fmuladd.f32(float %a, float %b, float %c) ; yields float:r2 = (a * b) + c
Experimental Vector Reduction Intrinsics
----------------------------------------
Horizontal reductions of vectors can be expressed using the following
intrinsics. Each one takes a vector operand as an input and applies its
respective operation across all elements of the vector, returning a single
scalar result of the same element type. The ``fadd`` and ``fmul`` variants
additionally take a scalar accumulator operand, which is used only for
ordered reductions.
'``llvm.experimental.vector.reduce.add.*``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
declare i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> %a)
declare i64 @llvm.experimental.vector.reduce.add.i64.v2i64(<2 x i64> %a)
Overview:
"""""""""
The '``llvm.experimental.vector.reduce.add.*``' intrinsics do an integer ``ADD``
reduction of a vector, returning the result as a scalar. The return type matches
the element-type of the vector input.
Arguments:
""""""""""
The argument to this intrinsic must be a vector of integer values.
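Examples:
"""""""""
For illustration, a reduction of a hypothetical ``<4 x i32>`` value ``%v``
(the value name is ours, not part of the definition):
.. code-block:: llvm
      %sum = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> %v) ; sums the four i32 lanes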
'``llvm.experimental.vector.reduce.fadd.*``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
declare float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float %acc, <4 x float> %a)
declare double @llvm.experimental.vector.reduce.fadd.f64.v2f64(double %acc, <2 x double> %a)
Overview:
"""""""""
The '``llvm.experimental.vector.reduce.fadd.*``' intrinsics do a floating point
``ADD`` reduction of a vector, returning the result as a scalar. The return type
matches the element-type of the vector input.
If the intrinsic call has fast-math flags, then the reduction need not preserve
the associativity of an equivalent scalarized counterpart. If it does not have
fast-math flags, then the reduction will be *ordered*, implying that the
operation respects the associativity of a scalarized reduction.
Arguments:
""""""""""
The first argument to this intrinsic is a scalar accumulator value, which is
only used when there are no fast-math flags attached. This argument may be undef
when fast-math flags are used.
The second argument must be a vector of floating point values.
Examples:
"""""""""
.. code-block:: llvm
%fast = call fast float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %input) ; fast reduction
%ord = call float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float %acc, <4 x float> %input) ; ordered reduction
'``llvm.experimental.vector.reduce.mul.*``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
declare i32 @llvm.experimental.vector.reduce.mul.i32.v4i32(<4 x i32> %a)
declare i64 @llvm.experimental.vector.reduce.mul.i64.v2i64(<2 x i64> %a)
Overview:
"""""""""
The '``llvm.experimental.vector.reduce.mul.*``' intrinsics do an integer ``MUL``
reduction of a vector, returning the result as a scalar. The return type matches
the element-type of the vector input.
Arguments:
""""""""""
The argument to this intrinsic must be a vector of integer values.
'``llvm.experimental.vector.reduce.fmul.*``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
declare float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float %acc, <4 x float> %a)
declare double @llvm.experimental.vector.reduce.fmul.f64.v2f64(double %acc, <2 x double> %a)
Overview:
"""""""""
The '``llvm.experimental.vector.reduce.fmul.*``' intrinsics do a floating point
``MUL`` reduction of a vector, returning the result as a scalar. The return type
matches the element-type of the vector input.
If the intrinsic call has fast-math flags, then the reduction need not preserve
the associativity of an equivalent scalarized counterpart. If it does not have
fast-math flags, then the reduction will be *ordered*, implying that the
operation respects the associativity of a scalarized reduction.
Arguments:
""""""""""
The first argument to this intrinsic is a scalar accumulator value, which is
only used when there are no fast-math flags attached. This argument may be undef
when fast-math flags are used.
The second argument must be a vector of floating point values.
Examples:
"""""""""
.. code-block:: llvm
%fast = call fast float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float undef, <4 x float> %input) ; fast reduction
%ord = call float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float %acc, <4 x float> %input) ; ordered reduction
'``llvm.experimental.vector.reduce.and.*``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
declare i32 @llvm.experimental.vector.reduce.and.i32.v4i32(<4 x i32> %a)
Overview:
"""""""""
The '``llvm.experimental.vector.reduce.and.*``' intrinsics do a bitwise ``AND``
reduction of a vector, returning the result as a scalar. The return type matches
the element-type of the vector input.
Arguments:
""""""""""
The argument to this intrinsic must be a vector of integer values.
'``llvm.experimental.vector.reduce.or.*``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
declare i32 @llvm.experimental.vector.reduce.or.i32.v4i32(<4 x i32> %a)
Overview:
"""""""""
The '``llvm.experimental.vector.reduce.or.*``' intrinsics do a bitwise ``OR`` reduction
of a vector, returning the result as a scalar. The return type matches the
element-type of the vector input.
Arguments:
""""""""""
The argument to this intrinsic must be a vector of integer values.
'``llvm.experimental.vector.reduce.xor.*``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
declare i32 @llvm.experimental.vector.reduce.xor.i32.v4i32(<4 x i32> %a)
Overview:
"""""""""
The '``llvm.experimental.vector.reduce.xor.*``' intrinsics do a bitwise ``XOR``
reduction of a vector, returning the result as a scalar. The return type matches
the element-type of the vector input.
Arguments:
""""""""""
The argument to this intrinsic must be a vector of integer values.
'``llvm.experimental.vector.reduce.smax.*``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
declare i32 @llvm.experimental.vector.reduce.smax.i32.v4i32(<4 x i32> %a)
Overview:
"""""""""
The '``llvm.experimental.vector.reduce.smax.*``' intrinsics do a signed integer
``MAX`` reduction of a vector, returning the result as a scalar. The return type
matches the element-type of the vector input.
Arguments:
""""""""""
The argument to this intrinsic must be a vector of integer values.
'``llvm.experimental.vector.reduce.smin.*``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
declare i32 @llvm.experimental.vector.reduce.smin.i32.v4i32(<4 x i32> %a)
Overview:
"""""""""
The '``llvm.experimental.vector.reduce.smin.*``' intrinsics do a signed integer
``MIN`` reduction of a vector, returning the result as a scalar. The return type
matches the element-type of the vector input.
Arguments:
""""""""""
The argument to this intrinsic must be a vector of integer values.
'``llvm.experimental.vector.reduce.umax.*``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
declare i32 @llvm.experimental.vector.reduce.umax.i32.v4i32(<4 x i32> %a)
Overview:
"""""""""
The '``llvm.experimental.vector.reduce.umax.*``' intrinsics do an unsigned
integer ``MAX`` reduction of a vector, returning the result as a scalar. The
return type matches the element-type of the vector input.
Arguments:
""""""""""
The argument to this intrinsic must be a vector of integer values.
'``llvm.experimental.vector.reduce.umin.*``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
declare i32 @llvm.experimental.vector.reduce.umin.i32.v4i32(<4 x i32> %a)
Overview:
"""""""""
The '``llvm.experimental.vector.reduce.umin.*``' intrinsics do an unsigned
integer ``MIN`` reduction of a vector, returning the result as a scalar. The
return type matches the element-type of the vector input.
Arguments:
""""""""""
The argument to this intrinsic must be a vector of integer values.
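Examples:
"""""""""
A sketch covering the integer min/max family, assuming a hypothetical
``<4 x i32>`` value ``%v`` (the value name is ours):
.. code-block:: llvm
      %smax = call i32 @llvm.experimental.vector.reduce.smax.i32.v4i32(<4 x i32> %v) ; signed maximum of the lanes
      %umin = call i32 @llvm.experimental.vector.reduce.umin.i32.v4i32(<4 x i32> %v) ; unsigned minimum of the lanes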
'``llvm.experimental.vector.reduce.fmax.*``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
declare float @llvm.experimental.vector.reduce.fmax.f32.v4f32(<4 x float> %a)
declare double @llvm.experimental.vector.reduce.fmax.f64.v2f64(<2 x double> %a)
Overview:
"""""""""
The '``llvm.experimental.vector.reduce.fmax.*``' intrinsics do a floating point
``MAX`` reduction of a vector, returning the result as a scalar. The return type
matches the element-type of the vector input.
If the intrinsic call has the ``nnan`` fast-math flag then the operation can
assume that NaNs are not present in the input vector.
Arguments:
""""""""""
The argument to this intrinsic must be a vector of floating point values.
'``llvm.experimental.vector.reduce.fmin.*``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
declare float @llvm.experimental.vector.reduce.fmin.f32.v4f32(<4 x float> %a)
declare double @llvm.experimental.vector.reduce.fmin.f64.v2f64(<2 x double> %a)
Overview:
"""""""""
The '``llvm.experimental.vector.reduce.fmin.*``' intrinsics do a floating point
``MIN`` reduction of a vector, returning the result as a scalar. The return type
matches the element-type of the vector input.
If the intrinsic call has the ``nnan`` fast-math flag then the operation can
assume that NaNs are not present in the input vector.
Arguments:
""""""""""
The argument to this intrinsic must be a vector of floating point values.
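Examples:
"""""""""
A sketch, assuming a hypothetical ``<4 x float>`` value ``%v``; the ``nnan``
flag lets the target assume the input is NaN-free:
.. code-block:: llvm
      %max = call nnan float @llvm.experimental.vector.reduce.fmax.f32.v4f32(<4 x float> %v) ; max, NaNs assumed absent
      %min = call float @llvm.experimental.vector.reduce.fmin.f32.v4f32(<4 x float> %v)      ; min, NaN-aware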
Half Precision Floating Point Intrinsics
----------------------------------------


@@ -740,6 +740,19 @@ public:
unsigned ChainSizeInBytes,
VectorType *VecTy) const;
/// Flags describing the kind of vector reduction.
struct ReductionFlags {
ReductionFlags() : IsMaxOp(false), IsSigned(false), NoNaN(false) {}
bool IsMaxOp; ///< If the op is a min/max kind, true if it's a max operation.
bool IsSigned; ///< Whether the operation is a signed int reduction.
bool NoNaN; ///< If op is an fp min/max, whether NaNs may be present.
};
/// \returns True if the target wants to handle the given reduction idiom in
/// the intrinsics form instead of the shuffle form.
bool useReductionIntrinsic(unsigned Opcode, Type *Ty,
ReductionFlags Flags) const;
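As a minimal sketch (this hypothetical target and its TTI class are ours, not
part of the patch), a backend opts in by overriding the hook:
bool MyTargetTTIImpl::useReductionIntrinsic(unsigned Opcode, Type *Ty,
                                            TTI::ReductionFlags Flags) const {
  if (!Ty->isVectorTy())
    return false;
  switch (Opcode) {
  case Instruction::Add:
    return true;        // prefer llvm.experimental.vector.reduce.add
  case Instruction::FCmp:
    return Flags.NoNaN; // fp min/max only when NaNs are known absent
  default:
    return false;       // otherwise keep the shuffle sequence
  }
}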
/// @}
private:
@@ -895,6 +908,8 @@ public:
virtual unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize,
unsigned ChainSizeInBytes,
VectorType *VecTy) const = 0;
virtual bool useReductionIntrinsic(unsigned Opcode, Type *Ty,
ReductionFlags) const = 0;
};
template <typename T>
@@ -1200,6 +1215,10 @@ public:
VectorType *VecTy) const override {
return Impl.getStoreVectorFactor(VF, StoreSize, ChainSizeInBytes, VecTy);
}
bool useReductionIntrinsic(unsigned Opcode, Type *Ty,
ReductionFlags Flags) const override {
return Impl.useReductionIntrinsic(Opcode, Ty, Flags);
}
};
template <typename T>


@@ -456,6 +456,12 @@ public:
VectorType *VecTy) const {
return VF;
}
bool useReductionIntrinsic(unsigned Opcode, Type *Ty,
TTI::ReductionFlags Flags) const {
return false;
}
protected:
// Obtain the minimum required size to hold the value (without the sign)
// In case of a vector it returns the min required size for one element.


@@ -783,6 +783,20 @@ namespace ISD {
/// known nonzero constant. The only operand here is the chain.
GET_DYNAMIC_AREA_OFFSET,
/// Generic reduction nodes. These nodes represent horizontal vector
/// reduction operations, producing a scalar result.
/// The STRICT variants perform reductions in sequential order. The first
/// operand is an initial scalar accumulator value, and the second operand
/// is the vector to reduce.
VECREDUCE_STRICT_FADD, VECREDUCE_STRICT_FMUL,
/// These reductions are non-strict, and have a single vector operand.
VECREDUCE_FADD, VECREDUCE_FMUL,
VECREDUCE_ADD, VECREDUCE_MUL,
VECREDUCE_AND, VECREDUCE_OR, VECREDUCE_XOR,
VECREDUCE_SMAX, VECREDUCE_SMIN, VECREDUCE_UMAX, VECREDUCE_UMIN,
/// FMIN/FMAX nodes can have flags, for NaN/NoNaN variants.
VECREDUCE_FMAX, VECREDUCE_FMIN,
/// BUILTIN_OP_END - This must be the last enum value in this list.
/// The target-specific pre-isel opcode values start here.
BUILTIN_OP_END


@@ -454,6 +454,45 @@ public:
MDNode *ScopeTag = nullptr,
MDNode *NoAliasTag = nullptr);
/// \brief Create a vector fadd reduction intrinsic of the source vector.
/// The first parameter is a scalar accumulator value for ordered reductions.
CallInst *CreateFAddReduce(Value *Acc, Value *Src);
/// \brief Create a vector fmul reduction intrinsic of the source vector.
/// The first parameter is a scalar accumulator value for ordered reductions.
CallInst *CreateFMulReduce(Value *Acc, Value *Src);
/// \brief Create a vector int add reduction intrinsic of the source vector.
CallInst *CreateAddReduce(Value *Src);
/// \brief Create a vector int mul reduction intrinsic of the source vector.
CallInst *CreateMulReduce(Value *Src);
/// \brief Create a vector int AND reduction intrinsic of the source vector.
CallInst *CreateAndReduce(Value *Src);
/// \brief Create a vector int OR reduction intrinsic of the source vector.
CallInst *CreateOrReduce(Value *Src);
/// \brief Create a vector int XOR reduction intrinsic of the source vector.
CallInst *CreateXorReduce(Value *Src);
/// \brief Create a vector integer max reduction intrinsic of the source
/// vector.
CallInst *CreateIntMaxReduce(Value *Src, bool IsSigned = false);
/// \brief Create a vector integer min reduction intrinsic of the source
/// vector.
CallInst *CreateIntMinReduce(Value *Src, bool IsSigned = false);
/// \brief Create a vector float max reduction intrinsic of the source
/// vector.
CallInst *CreateFPMaxReduce(Value *Src, bool NoNaN = false);
/// \brief Create a vector float min reduction intrinsic of the source
/// vector.
CallInst *CreateFPMinReduce(Value *Src, bool NoNaN = false);
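A minimal usage sketch (illustrative only; the function and value names are
ours, and the builder is assumed to be positioned inside a function):
// IntVec: <N x i32> value, FPVec: <N x float> value, FPAcc: float accumulator.
static llvm::Value *emitReductions(llvm::IRBuilder<> &B, llvm::Value *IntVec,
                                   llvm::Value *FPVec, llvm::Value *FPAcc) {
  llvm::Value *Sum = B.CreateAddReduce(IntVec);        // integer add reduction
  llvm::Value *SMax = B.CreateIntMaxReduce(IntVec, /*IsSigned=*/true);
  llvm::Value *Ord = B.CreateFAddReduce(FPAcc, FPVec); // ordered fadd reduction
  llvm::Value *FMax = B.CreateFPMaxReduce(FPVec, /*NoNaN=*/true); // carries 'nnan'
  (void)SMax; (void)Ord; (void)FMax;
  return Sum;
}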
/// \brief Create a lifetime.start intrinsic.
///
/// If the pointer isn't i8* it will be converted.


@@ -812,6 +812,50 @@ def int_memcpy_element_atomic : Intrinsic<[],
[IntrArgMemOnly, NoCapture<0>, NoCapture<1>,
WriteOnly<0>, ReadOnly<1>]>;
//===------------------------ Reduction Intrinsics ------------------------===//
//
def int_experimental_vector_reduce_fadd : Intrinsic<[llvm_anyfloat_ty],
[llvm_anyfloat_ty,
llvm_anyvector_ty],
[IntrNoMem]>;
def int_experimental_vector_reduce_fmul : Intrinsic<[llvm_anyfloat_ty],
[llvm_anyfloat_ty,
llvm_anyvector_ty],
[IntrNoMem]>;
def int_experimental_vector_reduce_add : Intrinsic<[llvm_anyint_ty],
[llvm_anyvector_ty],
[IntrNoMem]>;
def int_experimental_vector_reduce_mul : Intrinsic<[llvm_anyint_ty],
[llvm_anyvector_ty],
[IntrNoMem]>;
def int_experimental_vector_reduce_and : Intrinsic<[llvm_anyint_ty],
[llvm_anyvector_ty],
[IntrNoMem]>;
def int_experimental_vector_reduce_or : Intrinsic<[llvm_anyint_ty],
[llvm_anyvector_ty],
[IntrNoMem]>;
def int_experimental_vector_reduce_xor : Intrinsic<[llvm_anyint_ty],
[llvm_anyvector_ty],
[IntrNoMem]>;
def int_experimental_vector_reduce_smax : Intrinsic<[llvm_anyint_ty],
[llvm_anyvector_ty],
[IntrNoMem]>;
def int_experimental_vector_reduce_smin : Intrinsic<[llvm_anyint_ty],
[llvm_anyvector_ty],
[IntrNoMem]>;
def int_experimental_vector_reduce_umax : Intrinsic<[llvm_anyint_ty],
[llvm_anyvector_ty],
[IntrNoMem]>;
def int_experimental_vector_reduce_umin : Intrinsic<[llvm_anyint_ty],
[llvm_anyvector_ty],
[IntrNoMem]>;
def int_experimental_vector_reduce_fmax : Intrinsic<[llvm_anyfloat_ty],
[llvm_anyvector_ty],
[IntrNoMem]>;
def int_experimental_vector_reduce_fmin : Intrinsic<[llvm_anyfloat_ty],
[llvm_anyvector_ty],
[IntrNoMem]>;
//===----- Intrinsics that are used to provide predicate information -----===//
def int_ssa_copy : Intrinsic<[llvm_any_ty], [LLVMMatchType<0>],


@@ -21,6 +21,7 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/EHPersonalities.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
@@ -42,6 +43,7 @@ class PredIteratorCache;
class ScalarEvolution;
class SCEV;
class TargetLibraryInfo;
class TargetTransformInfo;
/// \brief Captures loop safety information.
/// It keeps information on whether the loop or its header may throw an exception.
@@ -489,6 +491,30 @@ bool canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT,
LoopSafetyInfo *SafetyInfo,
OptimizationRemarkEmitter *ORE = nullptr);
/// Create a target reduction of the given vector. The reduction operation
/// is described by the \p Opcode parameter. min/max reductions require
/// additional information supplied in \p Flags.
/// The target is queried to determine if intrinsics or shuffle sequences are
/// required to implement the reduction.
Value *
createSimpleTargetReduction(IRBuilder<> &B, const TargetTransformInfo *TTI,
unsigned Opcode, Value *Src,
TargetTransformInfo::ReductionFlags Flags =
TargetTransformInfo::ReductionFlags(),
ArrayRef<Value *> RedOps = ArrayRef<Value *>());
/// Create a generic target reduction using a recurrence descriptor \p Desc.
/// The target is queried to determine if intrinsics or shuffle sequences are
/// required to implement the reduction.
Value *createTargetReduction(IRBuilder<> &B, const TargetTransformInfo *TTI,
RecurrenceDescriptor &Desc, Value *Src,
bool NoNaN = false);
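For illustration (a sketch, not part of the header; Builder, TTI and Vec are
assumed to be in scope): a client requests a signed-max reduction and lets the
helper consult TTI to pick the intrinsic or shuffle form:
TargetTransformInfo::ReductionFlags Flags;
Flags.IsMaxOp = true;   // min/max kind: max
Flags.IsSigned = true;  // signed comparison
Value *Rdx = createSimpleTargetReduction(Builder, TTI, Instruction::ICmp,
                                         Vec, Flags);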
/// Get the intersection (logical and) of all of the potential IR flags
/// of each scalar operation (VL) that will be converted into a vector (I).
/// Flag set: NSW, NUW, exact, and all of fast-math.
void propagateIRFlags(Value *I, ArrayRef<Value *> VL);
} // end namespace llvm
#endif // LLVM_TRANSFORMS_UTILS_LOOPUTILS_H


@@ -500,6 +500,12 @@ unsigned TargetTransformInfo::getStoreVectorFactor(unsigned VF,
return TTIImpl->getStoreVectorFactor(VF, StoreSize, ChainSizeInBytes, VecTy);
}
bool TargetTransformInfo::useReductionIntrinsic(unsigned Opcode,
Type *Ty, ReductionFlags Flags) const {
return TTIImpl->useReductionIntrinsic(Opcode, Ty, Flags);
}
TargetTransformInfo::Concept::~Concept() {}
TargetIRAnalysis::TargetIRAnalysis() : TTICallback(&getDefaultTTI) {}


@@ -23,6 +23,7 @@
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/IRBuilder.h"
using namespace llvm;
using namespace llvm::PatternMatch;


@@ -675,6 +675,7 @@ private:
// Vector Operand Splitting: <128 x ty> -> 2 x <64 x ty>.
bool SplitVectorOperand(SDNode *N, unsigned OpNo);
SDValue SplitVecOp_VSELECT(SDNode *N, unsigned OpNo);
SDValue SplitVecOp_VECREDUCE(SDNode *N, unsigned OpNo);
SDValue SplitVecOp_UnaryOp(SDNode *N);
SDValue SplitVecOp_TruncateHelper(SDNode *N);


@@ -1513,6 +1513,22 @@ bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) {
case ISD::ZERO_EXTEND_VECTOR_INREG:
Res = SplitVecOp_ExtVecInRegOp(N);
break;
case ISD::VECREDUCE_FADD:
case ISD::VECREDUCE_FMUL:
case ISD::VECREDUCE_ADD:
case ISD::VECREDUCE_MUL:
case ISD::VECREDUCE_AND:
case ISD::VECREDUCE_OR:
case ISD::VECREDUCE_XOR:
case ISD::VECREDUCE_SMAX:
case ISD::VECREDUCE_SMIN:
case ISD::VECREDUCE_UMAX:
case ISD::VECREDUCE_UMIN:
case ISD::VECREDUCE_FMAX:
case ISD::VECREDUCE_FMIN:
Res = SplitVecOp_VECREDUCE(N, OpNo);
break;
}
}
@@ -1565,6 +1581,48 @@ SDValue DAGTypeLegalizer::SplitVecOp_VSELECT(SDNode *N, unsigned OpNo) {
return DAG.getNode(ISD::CONCAT_VECTORS, DL, Src0VT, LoSelect, HiSelect);
}
SDValue DAGTypeLegalizer::SplitVecOp_VECREDUCE(SDNode *N, unsigned OpNo) {
EVT ResVT = N->getValueType(0);
SDValue Lo, Hi;
SDLoc dl(N);
SDValue VecOp = N->getOperand(OpNo);
EVT VecVT = VecOp.getValueType();
assert(VecVT.isVector() && "Can only split reduce vector operand");
GetSplitVector(VecOp, Lo, Hi);
EVT LoOpVT, HiOpVT;
std::tie(LoOpVT, HiOpVT) = DAG.GetSplitDestVTs(VecVT);
bool NoNaN = N->getFlags().hasNoNaNs();
unsigned CombineOpc = 0;
switch (N->getOpcode()) {
case ISD::VECREDUCE_FADD: CombineOpc = ISD::FADD; break;
case ISD::VECREDUCE_FMUL: CombineOpc = ISD::FMUL; break;
case ISD::VECREDUCE_ADD: CombineOpc = ISD::ADD; break;
case ISD::VECREDUCE_MUL: CombineOpc = ISD::MUL; break;
case ISD::VECREDUCE_AND: CombineOpc = ISD::AND; break;
case ISD::VECREDUCE_OR: CombineOpc = ISD::OR; break;
case ISD::VECREDUCE_XOR: CombineOpc = ISD::XOR; break;
case ISD::VECREDUCE_SMAX: CombineOpc = ISD::SMAX; break;
case ISD::VECREDUCE_SMIN: CombineOpc = ISD::SMIN; break;
case ISD::VECREDUCE_UMAX: CombineOpc = ISD::UMAX; break;
case ISD::VECREDUCE_UMIN: CombineOpc = ISD::UMIN; break;
case ISD::VECREDUCE_FMAX:
CombineOpc = NoNaN ? ISD::FMAXNUM : ISD::FMAXNAN;
break;
case ISD::VECREDUCE_FMIN:
CombineOpc = NoNaN ? ISD::FMINNUM : ISD::FMINNAN;
break;
default:
llvm_unreachable("Unexpected reduce ISD node");
}
// Combine the two split halves with the corresponding binary operation, then
// perform the same reduction on the resulting half-width vector.
SDValue Partial = DAG.getNode(CombineOpc, dl, LoOpVT, Lo, Hi);
return DAG.getNode(N->getOpcode(), dl, ResVT, Partial);
}
SDValue DAGTypeLegalizer::SplitVecOp_UnaryOp(SDNode *N) {
// The result has a legal vector type, but the input needs splitting.
EVT ResVT = N->getValueType(0);


@@ -5970,7 +5970,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
unsigned NumOps = Ops.size();
switch (NumOps) {
case 0: return getNode(Opcode, DL, VT);
-  case 1: return getNode(Opcode, DL, VT, Ops[0]);
+  case 1: return getNode(Opcode, DL, VT, Ops[0], Flags);
case 2: return getNode(Opcode, DL, VT, Ops[0], Ops[1], Flags);
case 3: return getNode(Opcode, DL, VT, Ops[0], Ops[1], Ops[2]);
default: break;


@@ -5737,6 +5737,24 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
case Intrinsic::experimental_deoptimize:
LowerDeoptimizeCall(&I);
return nullptr;
case Intrinsic::experimental_vector_reduce_fadd:
case Intrinsic::experimental_vector_reduce_fmul:
case Intrinsic::experimental_vector_reduce_add:
case Intrinsic::experimental_vector_reduce_mul:
case Intrinsic::experimental_vector_reduce_and:
case Intrinsic::experimental_vector_reduce_or:
case Intrinsic::experimental_vector_reduce_xor:
case Intrinsic::experimental_vector_reduce_smax:
case Intrinsic::experimental_vector_reduce_smin:
case Intrinsic::experimental_vector_reduce_umax:
case Intrinsic::experimental_vector_reduce_umin:
case Intrinsic::experimental_vector_reduce_fmax:
case Intrinsic::experimental_vector_reduce_fmin: {
visitVectorReduce(I, Intrinsic);
return nullptr;
}
}
}
@@ -7616,6 +7634,76 @@ void SelectionDAGBuilder::visitPatchpoint(ImmutableCallSite CS,
FuncInfo.MF->getFrameInfo().setHasPatchPoint();
}
void SelectionDAGBuilder::visitVectorReduce(const CallInst &I,
unsigned Intrinsic) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
SDValue Op1 = getValue(I.getArgOperand(0));
SDValue Op2;
if (I.getNumArgOperands() > 1)
Op2 = getValue(I.getArgOperand(1));
SDLoc dl = getCurSDLoc();
EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType());
SDValue Res;
FastMathFlags FMF;
if (isa<FPMathOperator>(I))
FMF = I.getFastMathFlags();
SDNodeFlags SDFlags;
SDFlags.setNoNaNs(FMF.noNaNs());
switch (Intrinsic) {
case Intrinsic::experimental_vector_reduce_fadd:
if (FMF.unsafeAlgebra())
Res = DAG.getNode(ISD::VECREDUCE_FADD, dl, VT, Op2);
else
Res = DAG.getNode(ISD::VECREDUCE_STRICT_FADD, dl, VT, Op1, Op2);
break;
case Intrinsic::experimental_vector_reduce_fmul:
if (FMF.unsafeAlgebra())
Res = DAG.getNode(ISD::VECREDUCE_FMUL, dl, VT, Op2);
else
Res = DAG.getNode(ISD::VECREDUCE_STRICT_FMUL, dl, VT, Op1, Op2);
break;
case Intrinsic::experimental_vector_reduce_add:
Res = DAG.getNode(ISD::VECREDUCE_ADD, dl, VT, Op1);
break;
case Intrinsic::experimental_vector_reduce_mul:
Res = DAG.getNode(ISD::VECREDUCE_MUL, dl, VT, Op1);
break;
case Intrinsic::experimental_vector_reduce_and:
Res = DAG.getNode(ISD::VECREDUCE_AND, dl, VT, Op1);
break;
case Intrinsic::experimental_vector_reduce_or:
Res = DAG.getNode(ISD::VECREDUCE_OR, dl, VT, Op1);
break;
case Intrinsic::experimental_vector_reduce_xor:
Res = DAG.getNode(ISD::VECREDUCE_XOR, dl, VT, Op1);
break;
case Intrinsic::experimental_vector_reduce_smax:
Res = DAG.getNode(ISD::VECREDUCE_SMAX, dl, VT, Op1);
break;
case Intrinsic::experimental_vector_reduce_smin:
Res = DAG.getNode(ISD::VECREDUCE_SMIN, dl, VT, Op1);
break;
case Intrinsic::experimental_vector_reduce_umax:
Res = DAG.getNode(ISD::VECREDUCE_UMAX, dl, VT, Op1);
break;
case Intrinsic::experimental_vector_reduce_umin:
Res = DAG.getNode(ISD::VECREDUCE_UMIN, dl, VT, Op1);
break;
case Intrinsic::experimental_vector_reduce_fmax: {
Res = DAG.getNode(ISD::VECREDUCE_FMAX, dl, VT, Op1, SDFlags);
break;
}
case Intrinsic::experimental_vector_reduce_fmin: {
Res = DAG.getNode(ISD::VECREDUCE_FMIN, dl, VT, Op1, SDFlags);
break;
}
default:
llvm_unreachable("Unhandled vector reduce intrinsic");
}
setValue(&I, Res);
}
/// Returns an AttributeList representing the attributes applied to the return
/// value of the given call.
static AttributeList getReturnAttrs(TargetLowering::CallLoweringInfo &CLI) {


@@ -909,6 +909,8 @@ private:
void visitGCRelocate(const GCRelocateInst &I);
void visitGCResult(const GCResultInst &I);
void visitVectorReduce(const CallInst &I, unsigned Intrinsic);
void visitUserOp1(const Instruction &I) {
llvm_unreachable("UserOp1 should not exist at instruction selection time!");
}


@@ -346,6 +346,19 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
case ISD::SETFALSE: return "setfalse";
case ISD::SETFALSE2: return "setfalse2";
}
case ISD::VECREDUCE_FADD: return "vecreduce_fadd";
case ISD::VECREDUCE_FMUL: return "vecreduce_fmul";
case ISD::VECREDUCE_ADD: return "vecreduce_add";
case ISD::VECREDUCE_MUL: return "vecreduce_mul";
case ISD::VECREDUCE_AND: return "vecreduce_and";
case ISD::VECREDUCE_OR: return "vecreduce_or";
case ISD::VECREDUCE_XOR: return "vecreduce_xor";
case ISD::VECREDUCE_SMAX: return "vecreduce_smax";
case ISD::VECREDUCE_SMIN: return "vecreduce_smin";
case ISD::VECREDUCE_UMAX: return "vecreduce_umax";
case ISD::VECREDUCE_UMIN: return "vecreduce_umin";
case ISD::VECREDUCE_FMAX: return "vecreduce_fmax";
case ISD::VECREDUCE_FMIN: return "vecreduce_fmin";
}
}


@@ -161,6 +161,94 @@ CreateMemMove(Value *Dst, Value *Src, Value *Size, unsigned Align,
return CI;
}
static CallInst *getReductionIntrinsic(IRBuilderBase *Builder, Intrinsic::ID ID,
Value *Src) {
Module *M = Builder->GetInsertBlock()->getParent()->getParent();
Value *Ops[] = {Src};
Type *Tys[] = { Src->getType()->getVectorElementType(), Src->getType() };
auto Decl = Intrinsic::getDeclaration(M, ID, Tys);
return createCallHelper(Decl, Ops, Builder);
}
CallInst *IRBuilderBase::CreateFAddReduce(Value *Acc, Value *Src) {
Module *M = GetInsertBlock()->getParent()->getParent();
Value *Ops[] = {Acc, Src};
Type *Tys[] = {Src->getType()->getVectorElementType(), Acc->getType(),
Src->getType()};
auto Decl = Intrinsic::getDeclaration(
M, Intrinsic::experimental_vector_reduce_fadd, Tys);
return createCallHelper(Decl, Ops, this);
}
CallInst *IRBuilderBase::CreateFMulReduce(Value *Acc, Value *Src) {
Module *M = GetInsertBlock()->getParent()->getParent();
Value *Ops[] = {Acc, Src};
Type *Tys[] = {Src->getType()->getVectorElementType(), Acc->getType(),
Src->getType()};
auto Decl = Intrinsic::getDeclaration(
M, Intrinsic::experimental_vector_reduce_fmul, Tys);
return createCallHelper(Decl, Ops, this);
}
CallInst *IRBuilderBase::CreateAddReduce(Value *Src) {
return getReductionIntrinsic(this, Intrinsic::experimental_vector_reduce_add,
Src);
}
CallInst *IRBuilderBase::CreateMulReduce(Value *Src) {
return getReductionIntrinsic(this, Intrinsic::experimental_vector_reduce_mul,
Src);
}
CallInst *IRBuilderBase::CreateAndReduce(Value *Src) {
return getReductionIntrinsic(this, Intrinsic::experimental_vector_reduce_and,
Src);
}
CallInst *IRBuilderBase::CreateOrReduce(Value *Src) {
return getReductionIntrinsic(this, Intrinsic::experimental_vector_reduce_or,
Src);
}
CallInst *IRBuilderBase::CreateXorReduce(Value *Src) {
return getReductionIntrinsic(this, Intrinsic::experimental_vector_reduce_xor,
Src);
}
CallInst *IRBuilderBase::CreateIntMaxReduce(Value *Src, bool IsSigned) {
auto ID = IsSigned ? Intrinsic::experimental_vector_reduce_smax
: Intrinsic::experimental_vector_reduce_umax;
return getReductionIntrinsic(this, ID, Src);
}
CallInst *IRBuilderBase::CreateIntMinReduce(Value *Src, bool IsSigned) {
auto ID = IsSigned ? Intrinsic::experimental_vector_reduce_smin
: Intrinsic::experimental_vector_reduce_umin;
return getReductionIntrinsic(this, ID, Src);
}
CallInst *IRBuilderBase::CreateFPMaxReduce(Value *Src, bool NoNaN) {
auto Rdx = getReductionIntrinsic(
this, Intrinsic::experimental_vector_reduce_fmax, Src);
if (NoNaN) {
FastMathFlags FMF;
FMF.setNoNaNs();
Rdx->setFastMathFlags(FMF);
}
return Rdx;
}
CallInst *IRBuilderBase::CreateFPMinReduce(Value *Src, bool NoNaN) {
auto Rdx = getReductionIntrinsic(
this, Intrinsic::experimental_vector_reduce_fmin, Src);
if (NoNaN) {
FastMathFlags FMF;
FMF.setNoNaNs();
Rdx->setFastMathFlags(FMF);
}
return Rdx;
}
CallInst *IRBuilderBase::CreateLifetimeStart(Value *Ptr, ConstantInt *Size) {
assert(isa<PointerType>(Ptr->getType()) &&
"lifetime.start only applies to pointers.");


@@ -18,6 +18,7 @@
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
@@ -1112,3 +1113,204 @@ Optional<unsigned> llvm::getLoopEstimatedTripCount(Loop *L) {
else
return (FalseVal + (TrueVal / 2)) / TrueVal;
}
/// \brief Adds a 'fast' flag to floating point operations.
static Value *addFastMathFlag(Value *V) {
if (isa<FPMathOperator>(V)) {
FastMathFlags Flags;
Flags.setUnsafeAlgebra();
cast<Instruction>(V)->setFastMathFlags(Flags);
}
return V;
}
// Helper to generate a log2 shuffle reduction.
static Value *
getShuffleReduction(IRBuilder<> &Builder, Value *Src, unsigned Op,
RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
RecurrenceDescriptor::MRK_Invalid,
ArrayRef<Value *> RedOps = ArrayRef<Value *>()) {
unsigned VF = Src->getType()->getVectorNumElements();
// VF is a power of 2 so we can emit the reduction using log2(VF) shuffles
// and vector ops, reducing the set of values being computed by half each
// round.
assert(isPowerOf2_32(VF) &&
"Reduction emission only supported for pow2 vectors!");
Value *TmpVec = Src;
SmallVector<Constant *, 32> ShuffleMask(VF, nullptr);
for (unsigned i = VF; i != 1; i >>= 1) {
// Move the upper half of the vector to the lower half.
for (unsigned j = 0; j != i / 2; ++j)
ShuffleMask[j] = Builder.getInt32(i / 2 + j);
// Fill the rest of the mask with undef.
std::fill(&ShuffleMask[i / 2], ShuffleMask.end(),
UndefValue::get(Builder.getInt32Ty()));
Value *Shuf = Builder.CreateShuffleVector(
TmpVec, UndefValue::get(TmpVec->getType()),
ConstantVector::get(ShuffleMask), "rdx.shuf");
if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
// Floating point operations had to be 'fast' to enable the reduction.
TmpVec = addFastMathFlag(Builder.CreateBinOp((Instruction::BinaryOps)Op,
TmpVec, Shuf, "bin.rdx"));
} else {
assert(MinMaxKind != RecurrenceDescriptor::MRK_Invalid &&
"Invalid min/max");
TmpVec = RecurrenceDescriptor::createMinMaxOp(Builder, MinMaxKind, TmpVec,
Shuf);
}
if (!RedOps.empty())
propagateIRFlags(TmpVec, RedOps);
}
// The result is in the first element of the vector.
return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
}
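For reference, for a fast fadd over a <4 x float> %src this helper emits IR of
the following shape (a sketch; value names follow the code above):
  %rdx.shuf = shufflevector <4 x float> %src, <4 x float> undef,
                            <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %bin.rdx = fadd fast <4 x float> %src, %rdx.shuf
  %rdx.shuf1 = shufflevector <4 x float> %bin.rdx, <4 x float> undef,
                             <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx2 = fadd fast <4 x float> %bin.rdx, %rdx.shuf1
  %r = extractelement <4 x float> %bin.rdx2, i32 0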
/// Create a simple vector reduction specified by an opcode and some
/// flags (if generating min/max reductions).
Value *llvm::createSimpleTargetReduction(
IRBuilder<> &Builder, const TargetTransformInfo *TTI, unsigned Opcode,
Value *Src, TargetTransformInfo::ReductionFlags Flags,
ArrayRef<Value *> RedOps) {
assert(isa<VectorType>(Src->getType()) && "Type must be a vector");
Value *ScalarUdf = UndefValue::get(Src->getType()->getVectorElementType());
std::function<Value*()> BuildFunc;
using RD = RecurrenceDescriptor;
RD::MinMaxRecurrenceKind MinMaxKind = RD::MRK_Invalid;
// TODO: Support creating ordered reductions.
FastMathFlags FMFUnsafe;
FMFUnsafe.setUnsafeAlgebra();
switch (Opcode) {
case Instruction::Add:
BuildFunc = [&]() { return Builder.CreateAddReduce(Src); };
break;
case Instruction::Mul:
BuildFunc = [&]() { return Builder.CreateMulReduce(Src); };
break;
case Instruction::And:
BuildFunc = [&]() { return Builder.CreateAndReduce(Src); };
break;
case Instruction::Or:
BuildFunc = [&]() { return Builder.CreateOrReduce(Src); };
break;
case Instruction::Xor:
BuildFunc = [&]() { return Builder.CreateXorReduce(Src); };
break;
case Instruction::FAdd:
BuildFunc = [&]() {
auto Rdx = Builder.CreateFAddReduce(ScalarUdf, Src);
cast<CallInst>(Rdx)->setFastMathFlags(FMFUnsafe);
return Rdx;
};
break;
case Instruction::FMul:
BuildFunc = [&]() {
auto Rdx = Builder.CreateFMulReduce(ScalarUdf, Src);
cast<CallInst>(Rdx)->setFastMathFlags(FMFUnsafe);
return Rdx;
};
break;
case Instruction::ICmp:
if (Flags.IsMaxOp) {
MinMaxKind = Flags.IsSigned ? RD::MRK_SIntMax : RD::MRK_UIntMax;
BuildFunc = [&]() {
return Builder.CreateIntMaxReduce(Src, Flags.IsSigned);
};
} else {
MinMaxKind = Flags.IsSigned ? RD::MRK_SIntMin : RD::MRK_UIntMin;
BuildFunc = [&]() {
return Builder.CreateIntMinReduce(Src, Flags.IsSigned);
};
}
break;
case Instruction::FCmp:
if (Flags.IsMaxOp) {
MinMaxKind = RD::MRK_FloatMax;
BuildFunc = [&]() { return Builder.CreateFPMaxReduce(Src, Flags.NoNaN); };
} else {
MinMaxKind = RD::MRK_FloatMin;
BuildFunc = [&]() { return Builder.CreateFPMinReduce(Src, Flags.NoNaN); };
}
break;
default:
llvm_unreachable("Unhandled opcode");
break;
}
if (TTI->useReductionIntrinsic(Opcode, Src->getType(), Flags))
return BuildFunc();
return getShuffleReduction(Builder, Src, Opcode, MinMaxKind, RedOps);
}
/// Create a vector reduction using a given recurrence descriptor.
Value *llvm::createTargetReduction(IRBuilder<> &Builder,
const TargetTransformInfo *TTI,
RecurrenceDescriptor &Desc, Value *Src,
bool NoNaN) {
// TODO: Support in-order reductions based on the recurrence descriptor.
RecurrenceDescriptor::RecurrenceKind RecKind = Desc.getRecurrenceKind();
TargetTransformInfo::ReductionFlags Flags;
Flags.NoNaN = NoNaN;
auto getSimpleRdx = [&](unsigned Opc) {
return createSimpleTargetReduction(Builder, TTI, Opc, Src, Flags);
};
switch (RecKind) {
case RecurrenceDescriptor::RK_FloatAdd:
return getSimpleRdx(Instruction::FAdd);
case RecurrenceDescriptor::RK_FloatMult:
return getSimpleRdx(Instruction::FMul);
case RecurrenceDescriptor::RK_IntegerAdd:
return getSimpleRdx(Instruction::Add);
case RecurrenceDescriptor::RK_IntegerMult:
return getSimpleRdx(Instruction::Mul);
case RecurrenceDescriptor::RK_IntegerAnd:
return getSimpleRdx(Instruction::And);
case RecurrenceDescriptor::RK_IntegerOr:
return getSimpleRdx(Instruction::Or);
case RecurrenceDescriptor::RK_IntegerXor:
return getSimpleRdx(Instruction::Xor);
case RecurrenceDescriptor::RK_IntegerMinMax: {
switch (Desc.getMinMaxRecurrenceKind()) {
case RecurrenceDescriptor::MRK_SIntMax:
Flags.IsSigned = true;
Flags.IsMaxOp = true;
break;
case RecurrenceDescriptor::MRK_UIntMax:
Flags.IsMaxOp = true;
break;
case RecurrenceDescriptor::MRK_SIntMin:
Flags.IsSigned = true;
break;
case RecurrenceDescriptor::MRK_UIntMin:
break;
default:
llvm_unreachable("Unhandled MRK");
}
return getSimpleRdx(Instruction::ICmp);
}
case RecurrenceDescriptor::RK_FloatMinMax: {
Flags.IsMaxOp =
Desc.getMinMaxRecurrenceKind() == RecurrenceDescriptor::MRK_FloatMax;
return getSimpleRdx(Instruction::FCmp);
}
default:
llvm_unreachable("Unhandled RecKind");
}
}
void llvm::propagateIRFlags(Value *I, ArrayRef<Value *> VL) {
if (auto *VecOp = dyn_cast<Instruction>(I)) {
if (auto *I0 = dyn_cast<Instruction>(VL[0])) {
// VecOp is initialized to the 0th scalar, so start counting from index
// '1'.
VecOp->copyIRFlags(I0);
for (int i = 1, e = VL.size(); i < e; ++i) {
if (auto *Scalar = dyn_cast<Instruction>(VL[i]))
VecOp->andIRFlags(Scalar);
}
}
}
}


@@ -1700,6 +1700,9 @@ public:
/// access that can be widened.
bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);
// Returns true if the NoNaN attribute is set on the function.
bool hasFunNoNaNAttr() const { return HasFunNoNaNAttr; }
private:
/// Check if a single basic block loop is vectorizable.
/// At this point we know that this is a loop with a constant trip count
@@ -4258,39 +4261,9 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
}
if (VF > 1) {
-    // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles
-    // and vector ops, reducing the set of values being computed by half each
-    // round.
-    assert(isPowerOf2_32(VF) &&
-           "Reduction emission only supported for pow2 vectors!");
-    Value *TmpVec = ReducedPartRdx;
-    SmallVector<Constant *, 32> ShuffleMask(VF, nullptr);
-    for (unsigned i = VF; i != 1; i >>= 1) {
-      // Move the upper half of the vector to the lower half.
-      for (unsigned j = 0; j != i / 2; ++j)
-        ShuffleMask[j] = Builder.getInt32(i / 2 + j);
-      // Fill the rest of the mask with undef.
-      std::fill(&ShuffleMask[i / 2], ShuffleMask.end(),
-                UndefValue::get(Builder.getInt32Ty()));
-      Value *Shuf = Builder.CreateShuffleVector(
-          TmpVec, UndefValue::get(TmpVec->getType()),
-          ConstantVector::get(ShuffleMask), "rdx.shuf");
-      if (Op != Instruction::ICmp && Op != Instruction::FCmp)
-        // Floating point operations had to be 'fast' to enable the reduction.
-        TmpVec = addFastMathFlag(Builder.CreateBinOp(
-            (Instruction::BinaryOps)Op, TmpVec, Shuf, "bin.rdx"));
-      else
-        TmpVec = RecurrenceDescriptor::createMinMaxOp(Builder, MinMaxKind,
-                                                      TmpVec, Shuf);
-    }
-    // The result is in the first element of the vector.
+    bool NoNaN = Legal->hasFunNoNaNAttr();
     ReducedPartRdx =
-        Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
+        createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);
// If the reduction can be performed in a smaller type, we need to extend
// the reduction to the wider type before we branch to the original loop.
if (Phi->getType() != RdxDesc.getRecurrenceType())


@@ -41,6 +41,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/GraphWriter.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Vectorize.h"
#include <algorithm>
#include <memory>
@@ -212,23 +213,6 @@ static unsigned getSameOpcode(ArrayRef<Value *> VL) {
return Opcode;
}
-/// Get the intersection (logical and) of all of the potential IR flags
-/// of each scalar operation (VL) that will be converted into a vector (I).
-/// Flag set: NSW, NUW, exact, and all of fast-math.
-static void propagateIRFlags(Value *I, ArrayRef<Value *> VL) {
-  if (auto *VecOp = dyn_cast<Instruction>(I)) {
-    if (auto *I0 = dyn_cast<Instruction>(VL[0])) {
-      // VecOVp is initialized to the 0th scalar, so start counting from index
-      // '1'.
-      VecOp->copyIRFlags(I0);
-      for (int i = 1, e = VL.size(); i < e; ++i) {
-        if (auto *Scalar = dyn_cast<Instruction>(VL[i]))
-          VecOp->andIRFlags(Scalar);
-      }
-    }
-  }
-}
/// \returns true if all of the values in \p VL have the same type or false
/// otherwise.
static bool allSameType(ArrayRef<Value *> VL) {
@@ -4513,7 +4497,7 @@ public:
// Emit a reduction.
     Value *ReducedSubTree =
-        emitReduction(VectorizedRoot, Builder, ReduxWidth, ReductionOps);
+        emitReduction(VectorizedRoot, Builder, ReduxWidth, ReductionOps, TTI);
if (VectorizedTree) {
Builder.SetCurrentDebugLocation(Loc);
VectorizedTree = Builder.CreateBinOp(ReductionOpcode, VectorizedTree,
@@ -4583,33 +4567,31 @@ private:
/// \brief Emit a horizontal reduction of the vectorized value.
Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder,
-                       unsigned ReduxWidth, ArrayRef<Value *> RedOps) {
+                       unsigned ReduxWidth, ArrayRef<Value *> RedOps,
+                       const TargetTransformInfo *TTI) {
     assert(VectorizedValue && "Need to have a vectorized tree node");
     assert(isPowerOf2_32(ReduxWidth) &&
            "We only handle power-of-two reductions for now");
+    if (!IsPairwiseReduction)
+      return createSimpleTargetReduction(
+          Builder, TTI, ReductionOpcode, VectorizedValue,
+          TargetTransformInfo::ReductionFlags(), RedOps);
     Value *TmpVec = VectorizedValue;
     for (unsigned i = ReduxWidth / 2; i != 0; i >>= 1) {
-      if (IsPairwiseReduction) {
-        Value *LeftMask =
-            createRdxShuffleMask(ReduxWidth, i, true, true, Builder);
-        Value *RightMask =
-            createRdxShuffleMask(ReduxWidth, i, true, false, Builder);
-        Value *LeftShuf = Builder.CreateShuffleVector(
-            TmpVec, UndefValue::get(TmpVec->getType()), LeftMask, "rdx.shuf.l");
-        Value *RightShuf = Builder.CreateShuffleVector(
-            TmpVec, UndefValue::get(TmpVec->getType()), (RightMask),
-            "rdx.shuf.r");
-        TmpVec = Builder.CreateBinOp(ReductionOpcode, LeftShuf, RightShuf,
-                                     "bin.rdx");
-      } else {
-        Value *UpperHalf =
-            createRdxShuffleMask(ReduxWidth, i, false, false, Builder);
-        Value *Shuf = Builder.CreateShuffleVector(
-            TmpVec, UndefValue::get(TmpVec->getType()), UpperHalf, "rdx.shuf");
-        TmpVec = Builder.CreateBinOp(ReductionOpcode, TmpVec, Shuf, "bin.rdx");
-      }
+      Value *LeftMask =
+          createRdxShuffleMask(ReduxWidth, i, true, true, Builder);
+      Value *RightMask =
+          createRdxShuffleMask(ReduxWidth, i, true, false, Builder);
+      Value *LeftShuf = Builder.CreateShuffleVector(
+          TmpVec, UndefValue::get(TmpVec->getType()), LeftMask, "rdx.shuf.l");
+      Value *RightShuf = Builder.CreateShuffleVector(
+          TmpVec, UndefValue::get(TmpVec->getType()), (RightMask),
+          "rdx.shuf.r");
+      TmpVec =
+          Builder.CreateBinOp(ReductionOpcode, LeftShuf, RightShuf, "bin.rdx");
       propagateIRFlags(TmpVec, RedOps);
     }