Change semantics of fadd/fmul vector reductions.
This patch changes how LLVM handles the accumulator/start value in the reduction, by never ignoring it regardless of the presence of fast-math flags on callsites.

This change introduces the following new intrinsics to replace the existing ones:

    llvm.experimental.vector.reduce.fadd -> llvm.experimental.vector.reduce.v2.fadd
    llvm.experimental.vector.reduce.fmul -> llvm.experimental.vector.reduce.v2.fmul

and adds functionality to auto-upgrade existing LLVM IR and bitcode.

Reviewers: RKSimon, greened, dmgreen, nikic, simoll, aemerson

Reviewed By: nikic

Differential Revision: https://reviews.llvm.org/D60261

llvm-svn: 363035
This commit is contained in:
parent
9315b83b7d
commit
e1e23467af
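For context, a minimal sketch of what the auto-upgrade does to existing IR (value names here are illustrative; the exact behaviour is exercised by the new test/Bitcode/upgrade-vecreduce-intrinsics.ll below). A `fast` call to the old intrinsic, whose start value was previously ignored, is rewritten to the v2 intrinsic with the neutral element as the start value, while a non-fast call keeps its accumulator:

    ; before upgrade
    %r1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float %acc, <4 x float> %in)
    %r2 = call float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float %acc, <4 x float> %in)

    ; after upgrade
    %r1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> %in)
    %r2 = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float %acc, <4 x float> %in)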
@@ -13733,37 +13733,34 @@ Arguments:
""""""""""
The argument to this intrinsic must be a vector of integer values.

'``llvm.experimental.vector.reduce.fadd.*``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
'``llvm.experimental.vector.reduce.v2.fadd.*``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Syntax:
"""""""

::

declare float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float %acc, <4 x float> %a)
declare double @llvm.experimental.vector.reduce.fadd.f64.v2f64(double %acc, <2 x double> %a)
declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float %start_value, <4 x float> %a)
declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64(double %start_value, <2 x double> %a)

Overview:
"""""""""

The '``llvm.experimental.vector.reduce.fadd.*``' intrinsics do a floating-point
The '``llvm.experimental.vector.reduce.v2.fadd.*``' intrinsics do a floating-point
``ADD`` reduction of a vector, returning the result as a scalar. The return type
matches the element-type of the vector input.

If the intrinsic call has fast-math flags, then the reduction will not preserve
the associativity of an equivalent scalarized counterpart. If it does not have
fast-math flags, then the reduction will be *ordered*, implying that the
operation respects the associativity of a scalarized reduction.
If the intrinsic call has the 'reassoc' or 'fast' flags set, then the
reduction will not preserve the associativity of an equivalent scalarized
counterpart. Otherwise the reduction will be *ordered*, thus implying that
the operation respects the associativity of a scalarized reduction.


Arguments:
""""""""""
The first argument to this intrinsic is a scalar accumulator value, which is
only used when there are no fast-math flags attached. This argument may be undef
when fast-math flags are used. The type of the accumulator matches the
element-type of the vector input.

The first argument to this intrinsic is a scalar start value for the reduction.
The type of the start value matches the element-type of the vector input.
The second argument must be a vector of floating-point values.

Examples:
@@ -13771,8 +13768,8 @@ Examples:

.. code-block:: llvm

%fast = call fast float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %input) ; fast reduction
%ord = call float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float %acc, <4 x float> %input) ; ordered reduction
%unord = call reassoc float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.0, <4 x float> %input) ; unordered reduction
%ord = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float %start_value, <4 x float> %input) ; ordered reduction


'``llvm.experimental.vector.reduce.mul.*``' Intrinsic
@@ -13797,37 +13794,34 @@ Arguments:
""""""""""
The argument to this intrinsic must be a vector of integer values.

'``llvm.experimental.vector.reduce.fmul.*``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
'``llvm.experimental.vector.reduce.v2.fmul.*``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Syntax:
"""""""

::

declare float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float %acc, <4 x float> %a)
declare double @llvm.experimental.vector.reduce.fmul.f64.v2f64(double %acc, <2 x double> %a)
declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float %start_value, <4 x float> %a)
declare double @llvm.experimental.vector.reduce.v2.fmul.f64.v2f64(double %start_value, <2 x double> %a)

Overview:
"""""""""

The '``llvm.experimental.vector.reduce.fmul.*``' intrinsics do a floating-point
The '``llvm.experimental.vector.reduce.v2.fmul.*``' intrinsics do a floating-point
``MUL`` reduction of a vector, returning the result as a scalar. The return type
matches the element-type of the vector input.

If the intrinsic call has fast-math flags, then the reduction will not preserve
the associativity of an equivalent scalarized counterpart. If it does not have
fast-math flags, then the reduction will be *ordered*, implying that the
operation respects the associativity of a scalarized reduction.
If the intrinsic call has the 'reassoc' or 'fast' flags set, then the
reduction will not preserve the associativity of an equivalent scalarized
counterpart. Otherwise the reduction will be *ordered*, thus implying that
the operation respects the associativity of a scalarized reduction.


Arguments:
""""""""""
The first argument to this intrinsic is a scalar accumulator value, which is
only used when there are no fast-math flags attached. This argument may be undef
when fast-math flags are used. The type of the accumulator matches the
element-type of the vector input.

The first argument to this intrinsic is a scalar start value for the reduction.
The type of the start value matches the element-type of the vector input.
The second argument must be a vector of floating-point values.

Examples:
@@ -13835,8 +13829,8 @@ Examples:

.. code-block:: llvm

%fast = call fast float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float undef, <4 x float> %input) ; fast reduction
%ord = call float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float %acc, <4 x float> %input) ; ordered reduction
%unord = call reassoc float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float 1.0, <4 x float> %input) ; unordered reduction
%ord = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float %start_value, <4 x float> %input) ; ordered reduction

'``llvm.experimental.vector.reduce.and.*``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

@@ -1070,8 +1070,8 @@ public:
case Intrinsic::experimental_vector_reduce_and:
case Intrinsic::experimental_vector_reduce_or:
case Intrinsic::experimental_vector_reduce_xor:
case Intrinsic::experimental_vector_reduce_fadd:
case Intrinsic::experimental_vector_reduce_fmul:
case Intrinsic::experimental_vector_reduce_v2_fadd:
case Intrinsic::experimental_vector_reduce_v2_fmul:
case Intrinsic::experimental_vector_reduce_smax:
case Intrinsic::experimental_vector_reduce_smin:
case Intrinsic::experimental_vector_reduce_fmax:
@@ -1261,12 +1261,16 @@ public:
case Intrinsic::experimental_vector_reduce_xor:
return ConcreteTTI->getArithmeticReductionCost(Instruction::Xor, Tys[0],
/*IsPairwiseForm=*/false);
case Intrinsic::experimental_vector_reduce_fadd:
return ConcreteTTI->getArithmeticReductionCost(Instruction::FAdd, Tys[0],
/*IsPairwiseForm=*/false);
case Intrinsic::experimental_vector_reduce_fmul:
return ConcreteTTI->getArithmeticReductionCost(Instruction::FMul, Tys[0],
/*IsPairwiseForm=*/false);
case Intrinsic::experimental_vector_reduce_v2_fadd:
return ConcreteTTI->getArithmeticReductionCost(
Instruction::FAdd, Tys[0],
/*IsPairwiseForm=*/false); // FIXME: Add new flag for cost of strict
// reductions.
case Intrinsic::experimental_vector_reduce_v2_fmul:
return ConcreteTTI->getArithmeticReductionCost(
Instruction::FMul, Tys[0],
/*IsPairwiseForm=*/false); // FIXME: Add new flag for cost of strict
// reductions.
case Intrinsic::experimental_vector_reduce_smax:
case Intrinsic::experimental_vector_reduce_smin:
case Intrinsic::experimental_vector_reduce_fmax:

@@ -1140,14 +1140,14 @@ def int_memset_element_unordered_atomic

//===------------------------ Reduction Intrinsics ------------------------===//
//
def int_experimental_vector_reduce_fadd : Intrinsic<[llvm_anyfloat_ty],
[LLVMMatchType<0>,
llvm_anyvector_ty],
[IntrNoMem]>;
def int_experimental_vector_reduce_fmul : Intrinsic<[llvm_anyfloat_ty],
[LLVMMatchType<0>,
llvm_anyvector_ty],
[IntrNoMem]>;
def int_experimental_vector_reduce_v2_fadd : Intrinsic<[llvm_anyfloat_ty],
[LLVMMatchType<0>,
llvm_anyvector_ty],
[IntrNoMem]>;
def int_experimental_vector_reduce_v2_fmul : Intrinsic<[llvm_anyfloat_ty],
[LLVMMatchType<0>,
llvm_anyvector_ty],
[IntrNoMem]>;
def int_experimental_vector_reduce_add : Intrinsic<[llvm_anyint_ty],
[llvm_anyvector_ty],
[IntrNoMem]>;

@@ -29,9 +29,9 @@ namespace {

unsigned getOpcode(Intrinsic::ID ID) {
switch (ID) {
case Intrinsic::experimental_vector_reduce_fadd:
case Intrinsic::experimental_vector_reduce_v2_fadd:
return Instruction::FAdd;
case Intrinsic::experimental_vector_reduce_fmul:
case Intrinsic::experimental_vector_reduce_v2_fmul:
return Instruction::FMul;
case Intrinsic::experimental_vector_reduce_add:
return Instruction::Add;
@@ -83,22 +83,33 @@ bool expandReductions(Function &F, const TargetTransformInfo *TTI) {
Worklist.push_back(II);

for (auto *II : Worklist) {
if (!TTI->shouldExpandReduction(II))
continue;

FastMathFlags FMF =
isa<FPMathOperator>(II) ? II->getFastMathFlags() : FastMathFlags{};
Intrinsic::ID ID = II->getIntrinsicID();
RecurrenceDescriptor::MinMaxRecurrenceKind MRK = getMRK(ID);

Value *Rdx = nullptr;
IRBuilder<> Builder(II);
bool IsOrdered = false;
Value *Acc = nullptr;
Value *Vec = nullptr;
auto ID = II->getIntrinsicID();
auto MRK = RecurrenceDescriptor::MRK_Invalid;
IRBuilder<>::FastMathFlagGuard FMFGuard(Builder);
Builder.setFastMathFlags(FMF);
switch (ID) {
case Intrinsic::experimental_vector_reduce_fadd:
case Intrinsic::experimental_vector_reduce_fmul:
case Intrinsic::experimental_vector_reduce_v2_fadd:
case Intrinsic::experimental_vector_reduce_v2_fmul: {
// FMFs must be attached to the call, otherwise it's an ordered reduction
// and it can't be handled by generating a shuffle sequence.
if (!II->getFastMathFlags().isFast())
IsOrdered = true;
Acc = II->getArgOperand(0);
Vec = II->getArgOperand(1);
break;
Value *Acc = II->getArgOperand(0);
Value *Vec = II->getArgOperand(1);
if (!FMF.allowReassoc())
Rdx = getOrderedReduction(Builder, Acc, Vec, getOpcode(ID), MRK);
else {
Rdx = getShuffleReduction(Builder, Vec, getOpcode(ID), MRK);
Rdx = Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(ID),
Acc, Rdx, "bin.rdx");
}
} break;
case Intrinsic::experimental_vector_reduce_add:
case Intrinsic::experimental_vector_reduce_mul:
case Intrinsic::experimental_vector_reduce_and:
@@ -109,23 +120,13 @@ bool expandReductions(Function &F, const TargetTransformInfo *TTI) {
case Intrinsic::experimental_vector_reduce_umax:
case Intrinsic::experimental_vector_reduce_umin:
case Intrinsic::experimental_vector_reduce_fmax:
case Intrinsic::experimental_vector_reduce_fmin:
Vec = II->getArgOperand(0);
MRK = getMRK(ID);
break;
case Intrinsic::experimental_vector_reduce_fmin: {
Value *Vec = II->getArgOperand(0);
Rdx = getShuffleReduction(Builder, Vec, getOpcode(ID), MRK);
} break;
default:
continue;
}
if (!TTI->shouldExpandReduction(II))
continue;
// Propagate FMF using the builder.
FastMathFlags FMF =
isa<FPMathOperator>(II) ? II->getFastMathFlags() : FastMathFlags{};
IRBuilder<>::FastMathFlagGuard FMFGuard(Builder);
Builder.setFastMathFlags(FMF);
Value *Rdx =
IsOrdered ? getOrderedReduction(Builder, Acc, Vec, getOpcode(ID), MRK)
: getShuffleReduction(Builder, Vec, getOpcode(ID), MRK);
II->replaceAllUsesWith(Rdx);
II->eraseFromParent();
Changed = true;

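As a point of reference, here is a minimal hand-written sketch (not part of the patch) of the roughly equivalent scalar form of the two expansion shapes produced above, assuming a hypothetical <4 x float> input %v and a start value %start. An ordered reduction chains the operations sequentially starting from %start, while a reassoc/fast reduction builds a shuffle tree and folds %start in with a single fadd at the end:

    %e0 = extractelement <4 x float> %v, i32 0
    %e1 = extractelement <4 x float> %v, i32 1
    %e2 = extractelement <4 x float> %v, i32 2
    %e3 = extractelement <4 x float> %v, i32 3
    ; ordered (no reassoc): sequential chain, start value first
    %o1 = fadd float %start, %e0
    %o2 = fadd float %o1, %e1
    %o3 = fadd float %o2, %e2
    %ord = fadd float %o3, %e3
    ; reassoc/fast: pairwise tree, then fold in the start value
    %t0 = fadd fast float %e0, %e2
    %t1 = fadd fast float %e1, %e3
    %t2 = fadd fast float %t0, %t1
    %unord = fadd fast float %start, %t2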
@@ -6736,8 +6736,8 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
LowerDeoptimizeCall(&I);
return;

case Intrinsic::experimental_vector_reduce_fadd:
case Intrinsic::experimental_vector_reduce_fmul:
case Intrinsic::experimental_vector_reduce_v2_fadd:
case Intrinsic::experimental_vector_reduce_v2_fmul:
case Intrinsic::experimental_vector_reduce_add:
case Intrinsic::experimental_vector_reduce_mul:
case Intrinsic::experimental_vector_reduce_and:
@@ -8795,15 +8795,17 @@ void SelectionDAGBuilder::visitVectorReduce(const CallInst &I,
FMF = I.getFastMathFlags();

switch (Intrinsic) {
case Intrinsic::experimental_vector_reduce_fadd:
if (FMF.isFast())
Res = DAG.getNode(ISD::VECREDUCE_FADD, dl, VT, Op2);
case Intrinsic::experimental_vector_reduce_v2_fadd:
if (FMF.allowReassoc())
Res = DAG.getNode(ISD::FADD, dl, VT, Op1,
DAG.getNode(ISD::VECREDUCE_FADD, dl, VT, Op2));
else
Res = DAG.getNode(ISD::VECREDUCE_STRICT_FADD, dl, VT, Op1, Op2);
break;
case Intrinsic::experimental_vector_reduce_fmul:
if (FMF.isFast())
Res = DAG.getNode(ISD::VECREDUCE_FMUL, dl, VT, Op2);
case Intrinsic::experimental_vector_reduce_v2_fmul:
if (FMF.allowReassoc())
Res = DAG.getNode(ISD::FMUL, dl, VT, Op1,
DAG.getNode(ISD::VECREDUCE_FMUL, dl, VT, Op2));
else
Res = DAG.getNode(ISD::VECREDUCE_STRICT_FMUL, dl, VT, Op1, Op2);
break;

@@ -602,6 +602,26 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
}
break;
}
case 'e': {
SmallVector<StringRef, 2> Groups;
Regex R("^experimental.vector.reduce.([a-z]+)\\.[fi][0-9]+");
if (R.match(Name, &Groups)) {
Intrinsic::ID ID = Intrinsic::not_intrinsic;
if (Groups[1] == "fadd")
ID = Intrinsic::experimental_vector_reduce_v2_fadd;
if (Groups[1] == "fmul")
ID = Intrinsic::experimental_vector_reduce_v2_fmul;

if (ID != Intrinsic::not_intrinsic) {
rename(F);
auto Args = F->getFunctionType()->params();
Type *Tys[] = {F->getFunctionType()->getReturnType(), Args[1]};
NewFn = Intrinsic::getDeclaration(F->getParent(), ID, Tys);
return true;
}
}
break;
}
case 'i':
case 'l': {
bool IsLifetimeStart = Name.startswith("lifetime.start");
@@ -3467,7 +3487,28 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
DefaultCase();
return;
}

case Intrinsic::experimental_vector_reduce_v2_fmul: {
SmallVector<Value *, 2> Args;
if (CI->isFast())
Args.push_back(ConstantFP::get(CI->getOperand(0)->getType(), 1.0));
else
Args.push_back(CI->getOperand(0));
Args.push_back(CI->getOperand(1));
NewCall = Builder.CreateCall(NewFn, Args);
cast<Instruction>(NewCall)->copyFastMathFlags(CI);
break;
}
case Intrinsic::experimental_vector_reduce_v2_fadd: {
SmallVector<Value *, 2> Args;
if (CI->isFast())
Args.push_back(Constant::getNullValue(CI->getOperand(0)->getType()));
else
Args.push_back(CI->getOperand(0));
Args.push_back(CI->getOperand(1));
NewCall = Builder.CreateCall(NewFn, Args);
cast<Instruction>(NewCall)->copyFastMathFlags(CI);
break;
}
case Intrinsic::arm_neon_vld1:
case Intrinsic::arm_neon_vld2:
case Intrinsic::arm_neon_vld3:

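The same substitution is exercised in IR form by the Bitcode upgrade test added below; sketched here for the fmul case (value names are illustrative): a `fast` call to the retired intrinsic gets the multiplicative identity as its start value, while a non-fast call keeps its original accumulator operand:

    ; before upgrade
    %r = call fast float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float %acc, <4 x float> %in)
    ; after upgrade
    %r = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float 1.000000e+00, <4 x float> %in)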
@@ -323,7 +323,7 @@ CallInst *IRBuilderBase::CreateFAddReduce(Value *Acc, Value *Src) {
Value *Ops[] = {Acc, Src};
Type *Tys[] = {Acc->getType(), Src->getType()};
auto Decl = Intrinsic::getDeclaration(
M, Intrinsic::experimental_vector_reduce_fadd, Tys);
M, Intrinsic::experimental_vector_reduce_v2_fadd, Tys);
return createCallHelper(Decl, Ops, this);
}

@@ -332,7 +332,7 @@ CallInst *IRBuilderBase::CreateFMulReduce(Value *Acc, Value *Src) {
Value *Ops[] = {Acc, Src};
Type *Tys[] = {Acc->getType(), Src->getType()};
auto Decl = Intrinsic::getDeclaration(
M, Intrinsic::experimental_vector_reduce_fmul, Tys);
M, Intrinsic::experimental_vector_reduce_v2_fmul, Tys);
return createCallHelper(Decl, Ops, this);
}

@@ -801,13 +801,9 @@ Value *llvm::createSimpleTargetReduction(
ArrayRef<Value *> RedOps) {
assert(isa<VectorType>(Src->getType()) && "Type must be a vector");

Value *ScalarUdf = UndefValue::get(Src->getType()->getVectorElementType());
std::function<Value *()> BuildFunc;
using RD = RecurrenceDescriptor;
RD::MinMaxRecurrenceKind MinMaxKind = RD::MRK_Invalid;
// TODO: Support creating ordered reductions.
FastMathFlags FMFFast;
FMFFast.setFast();

switch (Opcode) {
case Instruction::Add:
@@ -827,15 +823,15 @@ Value *llvm::createSimpleTargetReduction(
break;
case Instruction::FAdd:
BuildFunc = [&]() {
auto Rdx = Builder.CreateFAddReduce(ScalarUdf, Src);
cast<CallInst>(Rdx)->setFastMathFlags(FMFFast);
auto Rdx = Builder.CreateFAddReduce(
Constant::getNullValue(Src->getType()->getVectorElementType()), Src);
return Rdx;
};
break;
case Instruction::FMul:
BuildFunc = [&]() {
auto Rdx = Builder.CreateFMulReduce(ScalarUdf, Src);
cast<CallInst>(Rdx)->setFastMathFlags(FMFFast);
Type *Ty = Src->getType()->getVectorElementType();
auto Rdx = Builder.CreateFMulReduce(ConstantFP::get(Ty, 1.0), Src);
return Rdx;
};
break;

@@ -1,34 +1,34 @@
; RUN: not opt -S < %s 2>&1 | FileCheck %s

; CHECK: Intrinsic has incorrect argument type!
; CHECK-NEXT: float (double, <2 x double>)* @llvm.experimental.vector.reduce.fadd.f32.f64.v2f64
; CHECK-NEXT: float (double, <2 x double>)* @llvm.experimental.vector.reduce.v2.fadd.f32.f64.v2f64
define float @fadd_invalid_scalar_res(double %acc, <2 x double> %in) {
%res = call float @llvm.experimental.vector.reduce.fadd.f32.f64.v2f64(double %acc, <2 x double> %in)
%res = call float @llvm.experimental.vector.reduce.v2.fadd.f32.f64.v2f64(double %acc, <2 x double> %in)
ret float %res
}

; CHECK: Intrinsic has incorrect argument type!
; CHECK-NEXT: double (float, <2 x double>)* @llvm.experimental.vector.reduce.fadd.f64.f32.v2f64
; CHECK-NEXT: double (float, <2 x double>)* @llvm.experimental.vector.reduce.v2.fadd.f64.f32.v2f64
define double @fadd_invalid_scalar_start(float %acc, <2 x double> %in) {
%res = call double @llvm.experimental.vector.reduce.fadd.f64.f32.v2f64(float %acc, <2 x double> %in)
%res = call double @llvm.experimental.vector.reduce.v2.fadd.f64.f32.v2f64(float %acc, <2 x double> %in)
ret double %res
}

; CHECK: Intrinsic has incorrect argument type!
; CHECK-NEXT: <2 x double> (double, <2 x double>)* @llvm.experimental.vector.reduce.fadd.v2f64.f64.v2f64
; CHECK-NEXT: <2 x double> (double, <2 x double>)* @llvm.experimental.vector.reduce.v2.fadd.v2f64.f64.v2f64
define <2 x double> @fadd_invalid_vector_res(double %acc, <2 x double> %in) {
%res = call <2 x double> @llvm.experimental.vector.reduce.fadd.v2f64.f64.v2f64(double %acc, <2 x double> %in)
%res = call <2 x double> @llvm.experimental.vector.reduce.v2.fadd.v2f64.f64.v2f64(double %acc, <2 x double> %in)
ret <2 x double> %res
}

; CHECK: Intrinsic has incorrect argument type!
; CHECK-NEXT: double (<2 x double>, <2 x double>)* @llvm.experimental.vector.reduce.fadd.f64.v2f64.v2f64
; CHECK-NEXT: double (<2 x double>, <2 x double>)* @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64.v2f64
define double @fadd_invalid_vector_start(<2 x double> %in, <2 x double> %acc) {
%res = call double @llvm.experimental.vector.reduce.fadd.f64.v2f64.v2f64(<2 x double> %acc, <2 x double> %in)
%res = call double @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64.v2f64(<2 x double> %acc, <2 x double> %in)
ret double %res
}

declare float @llvm.experimental.vector.reduce.fadd.f32.f64.v2f64(double %acc, <2 x double> %in)
declare double @llvm.experimental.vector.reduce.fadd.f64.f32.v2f64(float %acc, <2 x double> %in)
declare double @llvm.experimental.vector.reduce.fadd.f64.v2f64.v2f64(<2 x double> %acc, <2 x double> %in)
declare <2 x double> @llvm.experimental.vector.reduce.fadd.v2f64.f64.v2f64(double %acc, <2 x double> %in)
declare float @llvm.experimental.vector.reduce.v2.fadd.f32.f64.v2f64(double %acc, <2 x double> %in)
declare double @llvm.experimental.vector.reduce.v2.fadd.f64.f32.v2f64(float %acc, <2 x double> %in)
declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64.v2f64(<2 x double> %acc, <2 x double> %in)
declare <2 x double> @llvm.experimental.vector.reduce.v2.fadd.v2f64.f64.v2f64(double %acc, <2 x double> %in)

test/Bitcode/upgrade-vecreduce-intrinsics.ll (new file, 64 lines)
@@ -0,0 +1,64 @@
; RUN: opt -S < %s | FileCheck %s
; RUN: llvm-dis < %s.bc | FileCheck %s

define float @fadd_acc(<4 x float> %in, float %acc) {
; CHECK-LABEL: @fadd_acc
; CHECK: %res = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float %acc, <4 x float> %in)
%res = call float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float %acc, <4 x float> %in)
ret float %res
}

define float @fadd_undef(<4 x float> %in) {
; CHECK-LABEL: @fadd_undef
; CHECK: %res = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float undef, <4 x float> %in)
%res = call float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %in)
ret float %res
}

define float @fadd_fast_acc(<4 x float> %in, float %acc) {
; CHECK-LABEL: @fadd_fast_acc
; CHECK: %res = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> %in)
%res = call fast float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float %acc, <4 x float> %in)
ret float %res
}

define float @fadd_fast_undef(<4 x float> %in) {
; CHECK-LABEL: @fadd_fast_undef
; CHECK: %res = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> %in)
%res = call fast float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %in)
ret float %res
}

define float @fmul_acc(<4 x float> %in, float %acc) {
; CHECK-LABEL: @fmul_acc
; CHECK: %res = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float %acc, <4 x float> %in)
%res = call float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float %acc, <4 x float> %in)
ret float %res
}

define float @fmul_undef(<4 x float> %in) {
; CHECK-LABEL: @fmul_undef
; CHECK: %res = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float undef, <4 x float> %in)
%res = call float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float undef, <4 x float> %in)
ret float %res
}

define float @fmul_fast_acc(<4 x float> %in, float %acc) {
; CHECK-LABEL: @fmul_fast_acc
; CHECK: %res = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float 1.000000e+00, <4 x float> %in)
%res = call fast float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float %acc, <4 x float> %in)
ret float %res
}

define float @fmul_fast_undef(<4 x float> %in) {
; CHECK-LABEL: @fmul_fast_undef
; CHECK: %res = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float 1.000000e+00, <4 x float> %in)
%res = call fast float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float undef, <4 x float> %in)
ret float %res
}

declare float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float, <4 x float>)
; CHECK: declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float, <4 x float>)

declare float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float, <4 x float>)
; CHECK: declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float, <4 x float>)
test/Bitcode/upgrade-vecreduce-intrinsics.ll.bc (new binary file, not shown)
@@ -1,20 +1,20 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s --check-prefix=CHECK

declare half @llvm.experimental.vector.reduce.fadd.f16.v1f16(half, <1 x half>)
declare float @llvm.experimental.vector.reduce.fadd.f32.v1f32(float, <1 x float>)
declare double @llvm.experimental.vector.reduce.fadd.f64.v1f64(double, <1 x double>)
declare fp128 @llvm.experimental.vector.reduce.fadd.f128.v1f128(fp128, <1 x fp128>)
declare half @llvm.experimental.vector.reduce.v2.fadd.f16.v1f16(half, <1 x half>)
declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v1f32(float, <1 x float>)
declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v1f64(double, <1 x double>)
declare fp128 @llvm.experimental.vector.reduce.v2.fadd.f128.v1f128(fp128, <1 x fp128>)

declare float @llvm.experimental.vector.reduce.fadd.f32.v3f32(float, <3 x float>)
declare fp128 @llvm.experimental.vector.reduce.fadd.f128.v2f128(fp128, <2 x fp128>)
declare float @llvm.experimental.vector.reduce.fadd.f32.v16f32(float, <16 x float>)
declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v3f32(float, <3 x float>)
declare fp128 @llvm.experimental.vector.reduce.v2.fadd.f128.v2f128(fp128, <2 x fp128>)
declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v16f32(float, <16 x float>)

define half @test_v1f16(<1 x half> %a) nounwind {
; CHECK-LABEL: test_v1f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ret
%b = call fast nnan half @llvm.experimental.vector.reduce.fadd.f16.v1f16(half 0.0, <1 x half> %a)
%b = call fast nnan half @llvm.experimental.vector.reduce.v2.fadd.f16.v1f16(half 0.0, <1 x half> %a)
ret half %b
}

@@ -24,7 +24,7 @@ define float @test_v1f32(<1 x float> %a) nounwind {
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0
; CHECK-NEXT: ret
%b = call fast nnan float @llvm.experimental.vector.reduce.fadd.f32.v1f32(float 0.0, <1 x float> %a)
%b = call fast nnan float @llvm.experimental.vector.reduce.v2.fadd.f32.v1f32(float 0.0, <1 x float> %a)
ret float %b
}

@@ -32,7 +32,7 @@ define double @test_v1f64(<1 x double> %a) nounwind {
; CHECK-LABEL: test_v1f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ret
%b = call fast nnan double @llvm.experimental.vector.reduce.fadd.f64.v1f64(double 0.0, <1 x double> %a)
%b = call fast nnan double @llvm.experimental.vector.reduce.v2.fadd.f64.v1f64(double 0.0, <1 x double> %a)
ret double %b
}

@@ -40,7 +40,7 @@ define fp128 @test_v1f128(<1 x fp128> %a) nounwind {
; CHECK-LABEL: test_v1f128:
; CHECK: // %bb.0:
; CHECK-NEXT: ret
%b = call fast nnan fp128 @llvm.experimental.vector.reduce.fadd.f128.v1f128(fp128 zeroinitializer, <1 x fp128> %a)
%b = call fast nnan fp128 @llvm.experimental.vector.reduce.v2.fadd.f128.v1f128(fp128 zeroinitializer, <1 x fp128> %a)
ret fp128 %b
}

@@ -53,7 +53,7 @@ define float @test_v3f32(<3 x float> %a) nounwind {
; CHECK-NEXT: fadd v0.2s, v0.2s, v1.2s
; CHECK-NEXT: faddp s0, v0.2s
; CHECK-NEXT: ret
%b = call fast nnan float @llvm.experimental.vector.reduce.fadd.f32.v3f32(float 0.0, <3 x float> %a)
%b = call fast nnan float @llvm.experimental.vector.reduce.v2.fadd.f32.v3f32(float 0.0, <3 x float> %a)
ret float %b
}

@@ -64,7 +64,7 @@ define fp128 @test_v2f128(<2 x fp128> %a) nounwind {
; CHECK-NEXT: bl __addtf3
; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
%b = call fast nnan fp128 @llvm.experimental.vector.reduce.fadd.f128.v2f128(fp128 zeroinitializer, <2 x fp128> %a)
%b = call fast nnan fp128 @llvm.experimental.vector.reduce.v2.fadd.f128.v2f128(fp128 zeroinitializer, <2 x fp128> %a)
ret fp128 %b
}

@@ -78,6 +78,6 @@ define float @test_v16f32(<16 x float> %a) nounwind {
; CHECK-NEXT: fadd v0.2s, v0.2s, v1.2s
; CHECK-NEXT: faddp s0, v0.2s
; CHECK-NEXT: ret
%b = call fast nnan float @llvm.experimental.vector.reduce.fadd.f32.v16f32(float 0.0, <16 x float> %a)
%b = call fast nnan float @llvm.experimental.vector.reduce.v2.fadd.f32.v16f32(float 0.0, <16 x float> %a)
ret float %b
}

@@ -5,7 +5,7 @@ define float @add_HalfS(<2 x float> %bin.rdx) {
; CHECK-LABEL: add_HalfS:
; CHECK: faddp s0, v0.2s
; CHECK-NEXT: ret
%r = call fast float @llvm.experimental.vector.reduce.fadd.f32.v2f32(float undef, <2 x float> %bin.rdx)
%r = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v2f32(float 0.0, <2 x float> %bin.rdx)
ret float %r
}

@@ -23,7 +23,7 @@ define half @add_HalfH(<4 x half> %bin.rdx) {
; CHECKNOFP16-NOT: fadd h{{[0-9]+}}
; CHECKNOFP16-NOT: fadd v{{[0-9]+}}.{{[0-9]}}h
; CHECKNOFP16: ret
%r = call fast half @llvm.experimental.vector.reduce.fadd.f16.v4f16(half undef, <4 x half> %bin.rdx)
%r = call fast half @llvm.experimental.vector.reduce.v2.fadd.f16.v4f16(half 0.0, <4 x half> %bin.rdx)
ret half %r
}

@@ -45,7 +45,7 @@ define half @add_H(<8 x half> %bin.rdx) {
; CHECKNOFP16-NOT: fadd h{{[0-9]+}}
; CHECKNOFP16-NOT: fadd v{{[0-9]+}}.{{[0-9]}}h
; CHECKNOFP16: ret
%r = call fast half @llvm.experimental.vector.reduce.fadd.f16.v8f16(half undef, <8 x half> %bin.rdx)
%r = call fast half @llvm.experimental.vector.reduce.v2.fadd.f16.v8f16(half 0.0, <8 x half> %bin.rdx)
ret half %r
}

@@ -55,7 +55,7 @@ define float @add_S(<4 x float> %bin.rdx) {
; CHECK-NEXT: fadd v0.2s, v0.2s, v1.2s
; CHECK-NEXT: faddp s0, v0.2s
; CHECK-NEXT: ret
%r = call fast float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %bin.rdx)
%r = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.0, <4 x float> %bin.rdx)
ret float %r
}

@@ -63,7 +63,7 @@ define double @add_D(<2 x double> %bin.rdx) {
; CHECK-LABEL: add_D:
; CHECK: faddp d0, v0.2d
; CHECK-NEXT: ret
%r = call fast double @llvm.experimental.vector.reduce.fadd.f64.v2f64(double undef, <2 x double> %bin.rdx)
%r = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64(double 0.0, <2 x double> %bin.rdx)
ret double %r
}

@@ -84,7 +84,7 @@ define half @add_2H(<16 x half> %bin.rdx) {
; CHECKNOFP16-NOT: fadd h{{[0-9]+}}
; CHECKNOFP16-NOT: fadd v{{[0-9]+}}.{{[0-9]}}h
; CHECKNOFP16: ret
%r = call fast half @llvm.experimental.vector.reduce.fadd.f16.v16f16(half undef, <16 x half> %bin.rdx)
%r = call fast half @llvm.experimental.vector.reduce.v2.fadd.f16.v16f16(half 0.0, <16 x half> %bin.rdx)
ret half %r
}

@@ -95,7 +95,7 @@ define float @add_2S(<8 x float> %bin.rdx) {
; CHECK-NEXT: fadd v0.2s, v0.2s, v1.2s
; CHECK-NEXT: faddp s0, v0.2s
; CHECK-NEXT: ret
%r = call fast float @llvm.experimental.vector.reduce.fadd.f32.v8f32(float undef, <8 x float> %bin.rdx)
%r = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float 0.0, <8 x float> %bin.rdx)
ret float %r
}

@@ -104,16 +104,16 @@ define double @add_2D(<4 x double> %bin.rdx) {
; CHECK: fadd v0.2d, v0.2d, v1.2d
; CHECK-NEXT: faddp d0, v0.2d
; CHECK-NEXT: ret
%r = call fast double @llvm.experimental.vector.reduce.fadd.f64.v4f64(double undef, <4 x double> %bin.rdx)
%r = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v4f64(double 0.0, <4 x double> %bin.rdx)
ret double %r
}

; Function Attrs: nounwind readnone
declare half @llvm.experimental.vector.reduce.fadd.f16.v4f16(half, <4 x half>)
declare half @llvm.experimental.vector.reduce.fadd.f16.v8f16(half, <8 x half>)
declare half @llvm.experimental.vector.reduce.fadd.f16.v16f16(half, <16 x half>)
declare float @llvm.experimental.vector.reduce.fadd.f32.v2f32(float, <2 x float>)
declare float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float, <4 x float>)
declare float @llvm.experimental.vector.reduce.fadd.f32.v8f32(float, <8 x float>)
declare double @llvm.experimental.vector.reduce.fadd.f64.v2f64(double, <2 x double>)
declare double @llvm.experimental.vector.reduce.fadd.f64.v4f64(double, <4 x double>)
declare half @llvm.experimental.vector.reduce.v2.fadd.f16.v4f16(half, <4 x half>)
declare half @llvm.experimental.vector.reduce.v2.fadd.f16.v8f16(half, <8 x half>)
declare half @llvm.experimental.vector.reduce.v2.fadd.f16.v16f16(half, <16 x half>)
declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v2f32(float, <2 x float>)
declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float, <4 x float>)
declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float, <8 x float>)
declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64(double, <2 x double>)
declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v4f64(double, <4 x double>)

@@ -7,8 +7,8 @@ declare i64 @llvm.experimental.vector.reduce.and.i64.v2i64(<2 x i64>)
declare i64 @llvm.experimental.vector.reduce.or.i64.v2i64(<2 x i64>)
declare i64 @llvm.experimental.vector.reduce.xor.i64.v2i64(<2 x i64>)

declare float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float, <4 x float>)
declare float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float, <4 x float>)
declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float, <4 x float>)
declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float, <4 x float>)

declare i64 @llvm.experimental.vector.reduce.smax.i64.v2i64(<2 x i64>)
declare i64 @llvm.experimental.vector.reduce.smin.i64.v2i64(<2 x i64>)
@@ -92,10 +92,11 @@ define float @fadd_f32(<4 x float> %vec) {
; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: [[BIN_RDX2:%.*]] = fadd fast <4 x float> [[BIN_RDX]], [[RDX_SHUF1]]
; CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0
; CHECK-NEXT: ret float [[TMP0]]
; CHECK-NEXT: [[TMP1:%.*]] = fadd fast float 0.000000e+00, [[TMP0]]
; CHECK-NEXT: ret float [[TMP1]]
;
entry:
%r = call fast float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %vec)
%r = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.0, <4 x float> %vec)
ret float %r
}

@@ -107,10 +108,11 @@ define float @fadd_f32_accum(float %accum, <4 x float> %vec) {
; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: [[BIN_RDX2:%.*]] = fadd fast <4 x float> [[BIN_RDX]], [[RDX_SHUF1]]
; CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0
; CHECK-NEXT: ret float [[TMP0]]
; CHECK-NEXT: [[TMP1:%.*]] = fadd fast float %accum, [[TMP0]]
; CHECK-NEXT: ret float [[TMP1]]
;
entry:
%r = call fast float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float %accum, <4 x float> %vec)
%r = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float %accum, <4 x float> %vec)
ret float %r
}

@@ -128,7 +130,7 @@ define float @fadd_f32_strict(<4 x float> %vec) {
; CHECK-NEXT: ret float [[BIN_RDX3]]
;
entry:
%r = call float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %vec)
%r = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float undef, <4 x float> %vec)
ret float %r
}

@@ -146,7 +148,7 @@ define float @fadd_f32_strict_accum(float %accum, <4 x float> %vec) {
; CHECK-NEXT: ret float [[BIN_RDX3]]
;
entry:
%r = call float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float %accum, <4 x float> %vec)
%r = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float %accum, <4 x float> %vec)
ret float %r
}

@@ -158,10 +160,11 @@ define float @fmul_f32(<4 x float> %vec) {
; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: [[BIN_RDX2:%.*]] = fmul fast <4 x float> [[BIN_RDX]], [[RDX_SHUF1]]
; CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0
; CHECK-NEXT: ret float [[TMP0]]
; CHECK-NEXT: [[TMP1:%.*]] = fmul fast float 1.000000e+00, [[TMP0]]
; CHECK-NEXT: ret float [[TMP1]]
;
entry:
%r = call fast float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float undef, <4 x float> %vec)
%r = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float 1.0, <4 x float> %vec)
ret float %r
}

@@ -173,10 +176,11 @@ define float @fmul_f32_accum(float %accum, <4 x float> %vec) {
; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: [[BIN_RDX2:%.*]] = fmul fast <4 x float> [[BIN_RDX]], [[RDX_SHUF1]]
; CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0
; CHECK-NEXT: ret float [[TMP0]]
; CHECK-NEXT: [[TMP1:%.*]] = fmul fast float %accum, [[TMP0]]
; CHECK-NEXT: ret float [[TMP1]]
;
entry:
%r = call fast float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float %accum, <4 x float> %vec)
%r = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float %accum, <4 x float> %vec)
ret float %r
}

@@ -194,7 +198,7 @@ define float @fmul_f32_strict(<4 x float> %vec) {
; CHECK-NEXT: ret float [[BIN_RDX3]]
;
entry:
%r = call float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float undef, <4 x float> %vec)
%r = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float undef, <4 x float> %vec)
ret float %r
}

@@ -212,7 +216,7 @@ define float @fmul_f32_strict_accum(float %accum, <4 x float> %vec) {
; CHECK-NEXT: ret float [[BIN_RDX3]]
;
entry:
%r = call float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float %accum, <4 x float> %vec)
%r = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float %accum, <4 x float> %vec)
ret float %r
}

@@ -1628,8 +1628,8 @@ define float @extract_extract01_v4f32_fadd_f32_uses3(<4 x float> %x, float* %p1,
; Repeat tests from general reductions to verify output for hoppy targets:
; PR38971: https://bugs.llvm.org/show_bug.cgi?id=38971

declare float @llvm.experimental.vector.reduce.fadd.f32.v8f32(float, <8 x float>)
declare double @llvm.experimental.vector.reduce.fadd.f64.v4f64(double, <4 x double>)
declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float, <8 x float>)
declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v4f64(double, <4 x double>)

define float @fadd_reduce_v8f32(float %a0, <8 x float> %a1) {
; SSE3-SLOW-LABEL: fadd_reduce_v8f32:
@@ -1638,40 +1638,44 @@ define float @fadd_reduce_v8f32(float %a0, <8 x float> %a1) {
; SSE3-SLOW-NEXT: movaps %xmm1, %xmm2
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE3-SLOW-NEXT: addps %xmm1, %xmm2
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm2[1,1,3,3]
; SSE3-SLOW-NEXT: addss %xmm2, %xmm0
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE3-SLOW-NEXT: addss %xmm2, %xmm1
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: fadd_reduce_v8f32:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: addps %xmm2, %xmm1
; SSE3-FAST-NEXT: movaps %xmm1, %xmm0
; SSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE3-FAST-NEXT: addps %xmm1, %xmm0
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: movaps %xmm1, %xmm2
; SSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE3-FAST-NEXT: addps %xmm1, %xmm2
; SSE3-FAST-NEXT: haddps %xmm2, %xmm2
; SSE3-FAST-NEXT: addss %xmm2, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: fadd_reduce_v8f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm0
; AVX-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: fadd_reduce_v8f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vextractf128 $1, %ymm1, %xmm0
; AVX-FAST-NEXT: vaddps %xmm0, %xmm1, %xmm0
; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX-FAST-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-FAST-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1
; AVX-FAST-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
%r = call fast float @llvm.experimental.vector.reduce.fadd.f32.v8f32(float %a0, <8 x float> %a1)
%r = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float %a0, <8 x float> %a1)
ret float %r
}

@@ -1679,35 +1683,38 @@ define double @fadd_reduce_v4f64(double %a0, <4 x double> %a1) {
; SSE3-SLOW-LABEL: fadd_reduce_v4f64:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: addpd %xmm2, %xmm1
; SSE3-SLOW-NEXT: movapd %xmm1, %xmm0
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE3-SLOW-NEXT: addsd %xmm1, %xmm0
; SSE3-SLOW-NEXT: movapd %xmm1, %xmm2
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE3-SLOW-NEXT: addsd %xmm1, %xmm2
; SSE3-SLOW-NEXT: addsd %xmm2, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: fadd_reduce_v4f64:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: movapd %xmm1, %xmm0
; SSE3-FAST-NEXT: addpd %xmm2, %xmm0
; SSE3-FAST-NEXT: haddpd %xmm0, %xmm0
; SSE3-FAST-NEXT: addpd %xmm2, %xmm1
; SSE3-FAST-NEXT: haddpd %xmm1, %xmm1
; SSE3-FAST-NEXT: addsd %xmm1, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: fadd_reduce_v4f64:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm0
; AVX-SLOW-NEXT: vaddpd %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX-SLOW-NEXT: vaddpd %xmm2, %xmm1, %xmm1
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1
; AVX-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: fadd_reduce_v4f64:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vextractf128 $1, %ymm1, %xmm0
; AVX-FAST-NEXT: vaddpd %xmm0, %xmm1, %xmm0
; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX-FAST-NEXT: vaddpd %xmm2, %xmm1, %xmm1
; AVX-FAST-NEXT: vhaddpd %xmm1, %xmm1, %xmm1
; AVX-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
%r = call fast double @llvm.experimental.vector.reduce.fadd.f64.v4f64(double %a0, <4 x double> %a1)
%r = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v4f64(double %a0, <4 x double> %a1)
ret double %r
}

@ -14,40 +14,46 @@
|
||||
define float @test_v2f32(float %a0, <2 x float> %a1) {
|
||||
; SSE2-LABEL: test_v2f32:
|
||||
; SSE2: # %bb.0:
|
||||
; SSE2-NEXT: movaps %xmm1, %xmm0
|
||||
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3]
|
||||
; SSE2-NEXT: addss %xmm1, %xmm0
|
||||
; SSE2-NEXT: movaps %xmm1, %xmm2
|
||||
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[2,3]
|
||||
; SSE2-NEXT: addss %xmm1, %xmm2
|
||||
; SSE2-NEXT: addss %xmm2, %xmm0
|
||||
; SSE2-NEXT: retq
|
||||
;
|
||||
; SSE41-LABEL: test_v2f32:
|
||||
; SSE41: # %bb.0:
|
||||
; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
|
||||
; SSE41-NEXT: addss %xmm1, %xmm0
|
||||
; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
|
||||
; SSE41-NEXT: addss %xmm1, %xmm2
|
||||
; SSE41-NEXT: addss %xmm2, %xmm0
|
||||
; SSE41-NEXT: retq
|
||||
;
|
||||
; AVX1-SLOW-LABEL: test_v2f32:
|
||||
; AVX1-SLOW: # %bb.0:
|
||||
; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
|
||||
; AVX1-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm0
|
||||
; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
|
||||
; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1
|
||||
; AVX1-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
|
||||
; AVX1-SLOW-NEXT: retq
|
||||
;
|
||||
; AVX1-FAST-LABEL: test_v2f32:
|
||||
; AVX1-FAST: # %bb.0:
|
||||
; AVX1-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm0
|
||||
; AVX1-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1
|
||||
; AVX1-FAST-NEXT: vaddss %xmm1, %xmm0, %xmm0
|
||||
; AVX1-FAST-NEXT: retq
|
||||
;
|
||||
; AVX2-LABEL: test_v2f32:
|
||||
; AVX2: # %bb.0:
|
||||
; AVX2-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
|
||||
; AVX2-NEXT: vaddss %xmm0, %xmm1, %xmm0
|
||||
; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
|
||||
; AVX2-NEXT: vaddss %xmm2, %xmm1, %xmm1
|
||||
; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm0
|
||||
; AVX2-NEXT: retq
|
||||
;
|
||||
; AVX512-LABEL: test_v2f32:
|
||||
; AVX512: # %bb.0:
|
||||
; AVX512-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
|
||||
; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0
|
||||
; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
|
||||
; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1
|
||||
; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
|
||||
; AVX512-NEXT: retq
|
||||
%1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.v2f32(float %a0, <2 x float> %a1)
|
||||
%1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v2f32(float %a0, <2 x float> %a1)
|
||||
ret float %1
|
||||
}
|
||||
|
||||
@ -57,9 +63,10 @@ define float @test_v4f32(float %a0, <4 x float> %a1) {
|
||||
; SSE2-NEXT: movaps %xmm1, %xmm2
|
||||
; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
|
||||
; SSE2-NEXT: addps %xmm1, %xmm2
|
||||
; SSE2-NEXT: movaps %xmm2, %xmm0
|
||||
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[2,3]
|
||||
; SSE2-NEXT: addss %xmm2, %xmm0
|
||||
; SSE2-NEXT: movaps %xmm2, %xmm1
|
||||
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[2,3]
|
||||
; SSE2-NEXT: addss %xmm2, %xmm1
|
||||
; SSE2-NEXT: addss %xmm1, %xmm0
|
||||
; SSE2-NEXT: retq
|
||||
;
|
||||
; SSE41-LABEL: test_v4f32:
|
||||
@ -67,41 +74,46 @@ define float @test_v4f32(float %a0, <4 x float> %a1) {
|
||||
; SSE41-NEXT: movaps %xmm1, %xmm2
|
||||
; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
|
||||
; SSE41-NEXT: addps %xmm1, %xmm2
|
||||
; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm2[1,1,3,3]
|
||||
; SSE41-NEXT: addss %xmm2, %xmm0
|
||||
; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
|
||||
; SSE41-NEXT: addss %xmm2, %xmm1
|
||||
; SSE41-NEXT: addss %xmm1, %xmm0
|
||||
; SSE41-NEXT: retq
|
||||
;
|
||||
; AVX1-SLOW-LABEL: test_v4f32:
|
||||
; AVX1-SLOW: # %bb.0:
|
||||
; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
|
||||
; AVX1-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0
|
||||
; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
|
||||
; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
|
||||
; AVX1-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1
|
||||
; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
|
||||
; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1
|
||||
; AVX1-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
|
||||
; AVX1-SLOW-NEXT: retq
|
||||
;
|
||||
; AVX1-FAST-LABEL: test_v4f32:
|
||||
; AVX1-FAST: # %bb.0:
|
||||
; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
|
||||
; AVX1-FAST-NEXT: vaddps %xmm0, %xmm1, %xmm0
|
||||
; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
|
||||
; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
|
||||
; AVX1-FAST-NEXT: vaddps %xmm2, %xmm1, %xmm1
|
||||
; AVX1-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1
|
||||
; AVX1-FAST-NEXT: vaddss %xmm1, %xmm0, %xmm0
|
||||
; AVX1-FAST-NEXT: retq
|
||||
;
|
||||
; AVX2-LABEL: test_v4f32:
|
||||
; AVX2: # %bb.0:
|
||||
; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
|
||||
; AVX2-NEXT: vaddps %xmm0, %xmm1, %xmm0
|
||||
; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
|
||||
; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
|
||||
; AVX2-NEXT: vaddps %xmm2, %xmm1, %xmm1
|
||||
; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
|
||||
; AVX2-NEXT: vaddss %xmm2, %xmm1, %xmm1
|
||||
; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm0
|
||||
; AVX2-NEXT: retq
|
||||
;
|
||||
; AVX512-LABEL: test_v4f32:
|
||||
; AVX512: # %bb.0:
|
||||
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
|
||||
; AVX512-NEXT: vaddps %xmm0, %xmm1, %xmm0
|
||||
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
|
||||
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
|
||||
; AVX512-NEXT: vaddps %xmm2, %xmm1, %xmm1
|
||||
; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
|
||||
; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1
|
||||
; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
|
||||
; AVX512-NEXT: retq
|
||||
%1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float %a0, <4 x float> %a1)
|
||||
%1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float %a0, <4 x float> %a1)
|
||||
ret float %1
|
||||
}
|
||||
|
||||
@ -112,9 +124,10 @@ define float @test_v8f32(float %a0, <8 x float> %a1) {
|
||||
; SSE2-NEXT: movaps %xmm1, %xmm2
|
||||
; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
|
||||
; SSE2-NEXT: addps %xmm1, %xmm2
|
||||
; SSE2-NEXT: movaps %xmm2, %xmm0
|
||||
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[2,3]
|
||||
; SSE2-NEXT: addss %xmm2, %xmm0
|
||||
; SSE2-NEXT: movaps %xmm2, %xmm1
|
||||
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[2,3]
|
||||
; SSE2-NEXT: addss %xmm2, %xmm1
|
||||
; SSE2-NEXT: addss %xmm1, %xmm0
|
||||
; SSE2-NEXT: retq
|
||||
;
|
||||
; SSE41-LABEL: test_v8f32:
|
||||
@ -123,53 +136,58 @@ define float @test_v8f32(float %a0, <8 x float> %a1) {
|
||||
; SSE41-NEXT: movaps %xmm1, %xmm2
|
||||
; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
|
||||
; SSE41-NEXT: addps %xmm1, %xmm2
|
||||
; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm2[1,1,3,3]
|
||||
; SSE41-NEXT: addss %xmm2, %xmm0
|
||||
; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
|
||||
; SSE41-NEXT: addss %xmm2, %xmm1
|
||||
; SSE41-NEXT: addss %xmm1, %xmm0
|
||||
; SSE41-NEXT: retq
|
||||
;
|
||||
; AVX1-SLOW-LABEL: test_v8f32:
|
||||
; AVX1-SLOW: # %bb.0:
|
||||
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm0
|
||||
; AVX1-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0
|
||||
; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
|
||||
; AVX1-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
|
||||
; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
|
||||
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
|
||||
; AVX1-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1
|
||||
; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
|
||||
; AVX1-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1
|
||||
; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
|
||||
; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1
|
||||
; AVX1-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
|
||||
; AVX1-SLOW-NEXT: vzeroupper
|
||||
; AVX1-SLOW-NEXT: retq
|
||||
;
|
||||
; AVX1-FAST-LABEL: test_v8f32:
|
||||
; AVX1-FAST: # %bb.0:
|
||||
; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm0
|
||||
; AVX1-FAST-NEXT: vaddps %xmm0, %xmm1, %xmm0
|
||||
; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
|
||||
; AVX1-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0
|
||||
; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
|
||||
; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm2
|
||||
; AVX1-FAST-NEXT: vaddps %xmm2, %xmm1, %xmm1
|
||||
; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
|
||||
; AVX1-FAST-NEXT: vaddps %xmm2, %xmm1, %xmm1
|
||||
; AVX1-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1
|
||||
; AVX1-FAST-NEXT: vaddss %xmm1, %xmm0, %xmm0
|
||||
; AVX1-FAST-NEXT: vzeroupper
|
||||
; AVX1-FAST-NEXT: retq
|
||||
;
|
||||
; AVX2-LABEL: test_v8f32:
|
||||
; AVX2: # %bb.0:
|
||||
; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm0
|
||||
; AVX2-NEXT: vaddps %xmm0, %xmm1, %xmm0
|
||||
; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
|
||||
; AVX2-NEXT: vaddps %xmm1, %xmm0, %xmm0
|
||||
; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
|
||||
; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2
|
||||
; AVX2-NEXT: vaddps %xmm2, %xmm1, %xmm1
|
||||
; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
|
||||
; AVX2-NEXT: vaddps %xmm2, %xmm1, %xmm1
|
||||
; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
|
||||
; AVX2-NEXT: vaddss %xmm2, %xmm1, %xmm1
|
||||
; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm0
|
||||
; AVX2-NEXT: vzeroupper
|
||||
; AVX2-NEXT: retq
|
||||
;
|
||||
; AVX512-LABEL: test_v8f32:
|
||||
; AVX512: # %bb.0:
|
||||
; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm0
|
||||
; AVX512-NEXT: vaddps %xmm0, %xmm1, %xmm0
|
||||
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
|
||||
; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0
|
||||
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
|
||||
; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2
|
||||
; AVX512-NEXT: vaddps %xmm2, %xmm1, %xmm1
|
||||
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
|
||||
; AVX512-NEXT: vaddps %xmm2, %xmm1, %xmm1
|
||||
; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
|
||||
; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1
|
||||
; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
|
||||
; AVX512-NEXT: vzeroupper
|
||||
; AVX512-NEXT: retq
|
||||
  %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.v8f32(float %a0, <8 x float> %a1)
  %1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float %a0, <8 x float> %a1)
  ret float %1
}
|
||||
|
||||
@ -182,9 +200,10 @@ define float @test_v16f32(float %a0, <16 x float> %a1) {
|
||||
; SSE2-NEXT: movaps %xmm1, %xmm2
|
||||
; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
|
||||
; SSE2-NEXT: addps %xmm1, %xmm2
|
||||
; SSE2-NEXT: movaps %xmm2, %xmm0
|
||||
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[2,3]
|
||||
; SSE2-NEXT: addss %xmm2, %xmm0
|
||||
; SSE2-NEXT: movaps %xmm2, %xmm1
|
||||
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[2,3]
|
||||
; SSE2-NEXT: addss %xmm2, %xmm1
|
||||
; SSE2-NEXT: addss %xmm1, %xmm0
|
||||
; SSE2-NEXT: retq
|
||||
;
|
||||
; SSE41-LABEL: test_v16f32:
|
||||
@ -195,58 +214,63 @@ define float @test_v16f32(float %a0, <16 x float> %a1) {
|
||||
; SSE41-NEXT: movaps %xmm1, %xmm2
|
||||
; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
|
||||
; SSE41-NEXT: addps %xmm1, %xmm2
|
||||
; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm2[1,1,3,3]
|
||||
; SSE41-NEXT: addss %xmm2, %xmm0
|
||||
; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
|
||||
; SSE41-NEXT: addss %xmm2, %xmm1
|
||||
; SSE41-NEXT: addss %xmm1, %xmm0
|
||||
; SSE41-NEXT: retq
|
||||
;
|
||||
; AVX1-SLOW-LABEL: test_v16f32:
|
||||
; AVX1-SLOW: # %bb.0:
|
||||
; AVX1-SLOW-NEXT: vaddps %ymm2, %ymm1, %ymm0
|
||||
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
|
||||
; AVX1-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
|
||||
; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
|
||||
; AVX1-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
|
||||
; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
|
||||
; AVX1-SLOW-NEXT: vaddps %ymm2, %ymm1, %ymm1
|
||||
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
|
||||
; AVX1-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1
|
||||
; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
|
||||
; AVX1-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1
|
||||
; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
|
||||
; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1
|
||||
; AVX1-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
|
||||
; AVX1-SLOW-NEXT: vzeroupper
|
||||
; AVX1-SLOW-NEXT: retq
|
||||
;
|
||||
; AVX1-FAST-LABEL: test_v16f32:
|
||||
; AVX1-FAST: # %bb.0:
|
||||
; AVX1-FAST-NEXT: vaddps %ymm2, %ymm1, %ymm0
|
||||
; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
|
||||
; AVX1-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0
|
||||
; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
|
||||
; AVX1-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0
|
||||
; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
|
||||
; AVX1-FAST-NEXT: vaddps %ymm2, %ymm1, %ymm1
|
||||
; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm2
|
||||
; AVX1-FAST-NEXT: vaddps %xmm2, %xmm1, %xmm1
|
||||
; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
|
||||
; AVX1-FAST-NEXT: vaddps %xmm2, %xmm1, %xmm1
|
||||
; AVX1-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1
|
||||
; AVX1-FAST-NEXT: vaddss %xmm1, %xmm0, %xmm0
|
||||
; AVX1-FAST-NEXT: vzeroupper
|
||||
; AVX1-FAST-NEXT: retq
|
||||
;
|
||||
; AVX2-LABEL: test_v16f32:
|
||||
; AVX2: # %bb.0:
|
||||
; AVX2-NEXT: vaddps %ymm2, %ymm1, %ymm0
|
||||
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
|
||||
; AVX2-NEXT: vaddps %xmm1, %xmm0, %xmm0
|
||||
; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
|
||||
; AVX2-NEXT: vaddps %xmm1, %xmm0, %xmm0
|
||||
; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
|
||||
; AVX2-NEXT: vaddps %ymm2, %ymm1, %ymm1
|
||||
; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2
|
||||
; AVX2-NEXT: vaddps %xmm2, %xmm1, %xmm1
|
||||
; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
|
||||
; AVX2-NEXT: vaddps %xmm2, %xmm1, %xmm1
|
||||
; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
|
||||
; AVX2-NEXT: vaddss %xmm2, %xmm1, %xmm1
|
||||
; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm0
|
||||
; AVX2-NEXT: vzeroupper
|
||||
; AVX2-NEXT: retq
|
||||
;
|
||||
; AVX512-LABEL: test_v16f32:
|
||||
; AVX512: # %bb.0:
|
||||
; AVX512-NEXT: vextractf64x4 $1, %zmm1, %ymm0
|
||||
; AVX512-NEXT: vaddps %zmm0, %zmm1, %zmm0
|
||||
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
|
||||
; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0
|
||||
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
|
||||
; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0
|
||||
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
|
||||
; AVX512-NEXT: vextractf64x4 $1, %zmm1, %ymm2
|
||||
; AVX512-NEXT: vaddps %zmm2, %zmm1, %zmm1
|
||||
; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2
|
||||
; AVX512-NEXT: vaddps %xmm2, %xmm1, %xmm1
|
||||
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
|
||||
; AVX512-NEXT: vaddps %xmm2, %xmm1, %xmm1
|
||||
; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
|
||||
; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1
|
||||
; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
|
||||
; AVX512-NEXT: vzeroupper
|
||||
; AVX512-NEXT: retq
|
||||
  %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.v16f32(float %a0, <16 x float> %a1)
  %1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v16f32(float %a0, <16 x float> %a1)
  ret float %1
}
|
||||
|
||||
@ -291,7 +315,7 @@ define float @test_v2f32_zero(<2 x float> %a0) {
|
||||
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
|
||||
; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
|
||||
; AVX512-NEXT: retq
|
||||
  %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.v2f32(float 0.0, <2 x float> %a0)
  %1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v2f32(float 0.0, <2 x float> %a0)
  ret float %1
}
|
||||
|
||||
@ -346,7 +370,7 @@ define float @test_v4f32_zero(<4 x float> %a0) {
|
||||
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
|
||||
; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
|
||||
; AVX512-NEXT: retq
|
||||
  %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float 0.0, <4 x float> %a0)
  %1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.0, <4 x float> %a0)
  ret float %1
}
|
||||
|
||||
@ -415,7 +439,7 @@ define float @test_v8f32_zero(<8 x float> %a0) {
|
||||
; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
|
||||
; AVX512-NEXT: vzeroupper
|
||||
; AVX512-NEXT: retq
|
||||
  %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.v8f32(float 0.0, <8 x float> %a0)
  %1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float 0.0, <8 x float> %a0)
  ret float %1
}
|
||||
|
||||
@ -493,7 +517,7 @@ define float @test_v16f32_zero(<16 x float> %a0) {
|
||||
; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
|
||||
; AVX512-NEXT: vzeroupper
|
||||
; AVX512-NEXT: retq
|
||||
  %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.v16f32(float 0.0, <16 x float> %a0)
  %1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v16f32(float 0.0, <16 x float> %a0)
  ret float %1
}
|
||||
|
||||
@ -538,7 +562,7 @@ define float @test_v2f32_undef(<2 x float> %a0) {
|
||||
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
|
||||
; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
|
||||
; AVX512-NEXT: retq
|
||||
  %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.v2f32(float undef, <2 x float> %a0)
  %1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v2f32(float 0.0, <2 x float> %a0)
  ret float %1
}
|
||||
|
||||
@ -593,7 +617,7 @@ define float @test_v4f32_undef(<4 x float> %a0) {
|
||||
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
|
||||
; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
|
||||
; AVX512-NEXT: retq
|
||||
  %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %a0)
  %1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.0, <4 x float> %a0)
  ret float %1
}
|
||||
|
||||
@ -662,7 +686,7 @@ define float @test_v8f32_undef(<8 x float> %a0) {
|
||||
; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
|
||||
; AVX512-NEXT: vzeroupper
|
||||
; AVX512-NEXT: retq
|
||||
  %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.v8f32(float undef, <8 x float> %a0)
  %1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float 0.0, <8 x float> %a0)
  ret float %1
}
|
||||
|
||||
@ -740,7 +764,7 @@ define float @test_v16f32_undef(<16 x float> %a0) {
|
||||
; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
|
||||
; AVX512-NEXT: vzeroupper
|
||||
; AVX512-NEXT: retq
|
||||
  %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.v16f32(float undef, <16 x float> %a0)
  %1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v16f32(float 0.0, <16 x float> %a0)
  ret float %1
}
|
||||
|
||||
@ -751,34 +775,39 @@ define float @test_v16f32_undef(<16 x float> %a0) {
|
||||
define double @test_v2f64(double %a0, <2 x double> %a1) {
|
||||
; SSE-LABEL: test_v2f64:
|
||||
; SSE: # %bb.0:
|
||||
; SSE-NEXT: movapd %xmm1, %xmm0
|
||||
; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
|
||||
; SSE-NEXT: addsd %xmm1, %xmm0
|
||||
; SSE-NEXT: movapd %xmm1, %xmm2
|
||||
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
|
||||
; SSE-NEXT: addsd %xmm1, %xmm2
|
||||
; SSE-NEXT: addsd %xmm2, %xmm0
|
||||
; SSE-NEXT: retq
|
||||
;
|
||||
; AVX1-SLOW-LABEL: test_v2f64:
|
||||
; AVX1-SLOW: # %bb.0:
|
||||
; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
|
||||
; AVX1-SLOW-NEXT: vaddsd %xmm0, %xmm1, %xmm0
|
||||
; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
|
||||
; AVX1-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1
|
||||
; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
|
||||
; AVX1-SLOW-NEXT: retq
|
||||
;
|
||||
; AVX1-FAST-LABEL: test_v2f64:
|
||||
; AVX1-FAST: # %bb.0:
|
||||
; AVX1-FAST-NEXT: vhaddpd %xmm1, %xmm1, %xmm0
|
||||
; AVX1-FAST-NEXT: vhaddpd %xmm1, %xmm1, %xmm1
|
||||
; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0
|
||||
; AVX1-FAST-NEXT: retq
|
||||
;
|
||||
; AVX2-LABEL: test_v2f64:
|
||||
; AVX2: # %bb.0:
|
||||
; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
|
||||
; AVX2-NEXT: vaddsd %xmm0, %xmm1, %xmm0
|
||||
; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
|
||||
; AVX2-NEXT: vaddsd %xmm2, %xmm1, %xmm1
|
||||
; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
|
||||
; AVX2-NEXT: retq
|
||||
;
|
||||
; AVX512-LABEL: test_v2f64:
|
||||
; AVX512: # %bb.0:
|
||||
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
|
||||
; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm0
|
||||
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
|
||||
; AVX512-NEXT: vaddsd %xmm2, %xmm1, %xmm1
|
||||
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
|
||||
; AVX512-NEXT: retq
|
||||
  %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.v2f64(double %a0, <2 x double> %a1)
  %1 = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64(double %a0, <2 x double> %a1)
  ret double %1
}
|
||||
|
||||
@ -786,46 +815,51 @@ define double @test_v4f64(double %a0, <4 x double> %a1) {
|
||||
; SSE-LABEL: test_v4f64:
|
||||
; SSE: # %bb.0:
|
||||
; SSE-NEXT: addpd %xmm2, %xmm1
|
||||
; SSE-NEXT: movapd %xmm1, %xmm0
|
||||
; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
|
||||
; SSE-NEXT: addsd %xmm1, %xmm0
|
||||
; SSE-NEXT: movapd %xmm1, %xmm2
|
||||
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
|
||||
; SSE-NEXT: addsd %xmm1, %xmm2
|
||||
; SSE-NEXT: addsd %xmm2, %xmm0
|
||||
; SSE-NEXT: retq
|
||||
;
|
||||
; AVX1-SLOW-LABEL: test_v4f64:
|
||||
; AVX1-SLOW: # %bb.0:
|
||||
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm0
|
||||
; AVX1-SLOW-NEXT: vaddpd %xmm0, %xmm1, %xmm0
|
||||
; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
|
||||
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
|
||||
; AVX1-SLOW-NEXT: vaddpd %xmm2, %xmm1, %xmm1
|
||||
; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
|
||||
; AVX1-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1
|
||||
; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
|
||||
; AVX1-SLOW-NEXT: vzeroupper
|
||||
; AVX1-SLOW-NEXT: retq
|
||||
;
|
||||
; AVX1-FAST-LABEL: test_v4f64:
|
||||
; AVX1-FAST: # %bb.0:
|
||||
; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm0
|
||||
; AVX1-FAST-NEXT: vaddpd %xmm0, %xmm1, %xmm0
|
||||
; AVX1-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
|
||||
; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm2
|
||||
; AVX1-FAST-NEXT: vaddpd %xmm2, %xmm1, %xmm1
|
||||
; AVX1-FAST-NEXT: vhaddpd %xmm1, %xmm1, %xmm1
|
||||
; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0
|
||||
; AVX1-FAST-NEXT: vzeroupper
|
||||
; AVX1-FAST-NEXT: retq
|
||||
;
|
||||
; AVX2-LABEL: test_v4f64:
|
||||
; AVX2: # %bb.0:
|
||||
; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm0
|
||||
; AVX2-NEXT: vaddpd %xmm0, %xmm1, %xmm0
|
||||
; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
|
||||
; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2
|
||||
; AVX2-NEXT: vaddpd %xmm2, %xmm1, %xmm1
|
||||
; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
|
||||
; AVX2-NEXT: vaddsd %xmm2, %xmm1, %xmm1
|
||||
; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
|
||||
; AVX2-NEXT: vzeroupper
|
||||
; AVX2-NEXT: retq
|
||||
;
|
||||
; AVX512-LABEL: test_v4f64:
|
||||
; AVX512: # %bb.0:
|
||||
; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm0
|
||||
; AVX512-NEXT: vaddpd %xmm0, %xmm1, %xmm0
|
||||
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
|
||||
; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2
|
||||
; AVX512-NEXT: vaddpd %xmm2, %xmm1, %xmm1
|
||||
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
|
||||
; AVX512-NEXT: vaddsd %xmm2, %xmm1, %xmm1
|
||||
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
|
||||
; AVX512-NEXT: vzeroupper
|
||||
; AVX512-NEXT: retq
|
||||
  %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.v4f64(double %a0, <4 x double> %a1)
  %1 = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v4f64(double %a0, <4 x double> %a1)
  ret double %1
}
|
||||
|
||||
@ -835,51 +869,56 @@ define double @test_v8f64(double %a0, <8 x double> %a1) {
|
||||
; SSE-NEXT: addpd %xmm4, %xmm2
|
||||
; SSE-NEXT: addpd %xmm3, %xmm1
|
||||
; SSE-NEXT: addpd %xmm2, %xmm1
|
||||
; SSE-NEXT: movapd %xmm1, %xmm0
|
||||
; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
|
||||
; SSE-NEXT: addsd %xmm1, %xmm0
|
||||
; SSE-NEXT: movapd %xmm1, %xmm2
|
||||
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
|
||||
; SSE-NEXT: addsd %xmm1, %xmm2
|
||||
; SSE-NEXT: addsd %xmm2, %xmm0
|
||||
; SSE-NEXT: retq
|
||||
;
|
||||
; AVX1-SLOW-LABEL: test_v8f64:
|
||||
; AVX1-SLOW: # %bb.0:
|
||||
; AVX1-SLOW-NEXT: vaddpd %ymm2, %ymm1, %ymm0
|
||||
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
|
||||
; AVX1-SLOW-NEXT: vaddpd %xmm1, %xmm0, %xmm0
|
||||
; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
|
||||
; AVX1-SLOW-NEXT: vaddpd %ymm2, %ymm1, %ymm1
|
||||
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
|
||||
; AVX1-SLOW-NEXT: vaddpd %xmm2, %xmm1, %xmm1
|
||||
; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
|
||||
; AVX1-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1
|
||||
; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
|
||||
; AVX1-SLOW-NEXT: vzeroupper
|
||||
; AVX1-SLOW-NEXT: retq
|
||||
;
|
||||
; AVX1-FAST-LABEL: test_v8f64:
|
||||
; AVX1-FAST: # %bb.0:
|
||||
; AVX1-FAST-NEXT: vaddpd %ymm2, %ymm1, %ymm0
|
||||
; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
|
||||
; AVX1-FAST-NEXT: vaddpd %xmm1, %xmm0, %xmm0
|
||||
; AVX1-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
|
||||
; AVX1-FAST-NEXT: vaddpd %ymm2, %ymm1, %ymm1
|
||||
; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm2
|
||||
; AVX1-FAST-NEXT: vaddpd %xmm2, %xmm1, %xmm1
|
||||
; AVX1-FAST-NEXT: vhaddpd %xmm1, %xmm1, %xmm1
|
||||
; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0
|
||||
; AVX1-FAST-NEXT: vzeroupper
|
||||
; AVX1-FAST-NEXT: retq
|
||||
;
|
||||
; AVX2-LABEL: test_v8f64:
|
||||
; AVX2: # %bb.0:
|
||||
; AVX2-NEXT: vaddpd %ymm2, %ymm1, %ymm0
|
||||
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
|
||||
; AVX2-NEXT: vaddpd %xmm1, %xmm0, %xmm0
|
||||
; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
|
||||
; AVX2-NEXT: vaddpd %ymm2, %ymm1, %ymm1
|
||||
; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2
|
||||
; AVX2-NEXT: vaddpd %xmm2, %xmm1, %xmm1
|
||||
; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
|
||||
; AVX2-NEXT: vaddsd %xmm2, %xmm1, %xmm1
|
||||
; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
|
||||
; AVX2-NEXT: vzeroupper
|
||||
; AVX2-NEXT: retq
|
||||
;
|
||||
; AVX512-LABEL: test_v8f64:
|
||||
; AVX512: # %bb.0:
|
||||
; AVX512-NEXT: vextractf64x4 $1, %zmm1, %ymm0
|
||||
; AVX512-NEXT: vaddpd %zmm0, %zmm1, %zmm0
|
||||
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
|
||||
; AVX512-NEXT: vaddpd %xmm1, %xmm0, %xmm0
|
||||
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
|
||||
; AVX512-NEXT: vextractf64x4 $1, %zmm1, %ymm2
|
||||
; AVX512-NEXT: vaddpd %zmm2, %zmm1, %zmm1
|
||||
; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2
|
||||
; AVX512-NEXT: vaddpd %xmm2, %xmm1, %xmm1
|
||||
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
|
||||
; AVX512-NEXT: vaddsd %xmm2, %xmm1, %xmm1
|
||||
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
|
||||
; AVX512-NEXT: vzeroupper
|
||||
; AVX512-NEXT: retq
|
||||
  %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.v8f64(double %a0, <8 x double> %a1)
  %1 = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v8f64(double %a0, <8 x double> %a1)
  ret double %1
}
|
||||
|
||||
@ -893,58 +932,63 @@ define double @test_v16f64(double %a0, <16 x double> %a1) {
|
||||
; SSE-NEXT: addpd {{[0-9]+}}(%rsp), %xmm4
|
||||
; SSE-NEXT: addpd %xmm2, %xmm4
|
||||
; SSE-NEXT: addpd %xmm1, %xmm4
|
||||
; SSE-NEXT: movapd %xmm4, %xmm0
|
||||
; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1]
|
||||
; SSE-NEXT: addsd %xmm4, %xmm0
|
||||
; SSE-NEXT: movapd %xmm4, %xmm1
|
||||
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1]
|
||||
; SSE-NEXT: addsd %xmm4, %xmm1
|
||||
; SSE-NEXT: addsd %xmm1, %xmm0
|
||||
; SSE-NEXT: retq
|
||||
;
|
||||
; AVX1-SLOW-LABEL: test_v16f64:
|
||||
; AVX1-SLOW: # %bb.0:
|
||||
; AVX1-SLOW-NEXT: vaddpd %ymm4, %ymm2, %ymm0
|
||||
; AVX1-SLOW-NEXT: vaddpd %ymm4, %ymm2, %ymm2
|
||||
; AVX1-SLOW-NEXT: vaddpd %ymm3, %ymm1, %ymm1
|
||||
; AVX1-SLOW-NEXT: vaddpd %ymm0, %ymm1, %ymm0
|
||||
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
|
||||
; AVX1-SLOW-NEXT: vaddpd %xmm1, %xmm0, %xmm0
|
||||
; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
|
||||
; AVX1-SLOW-NEXT: vaddpd %ymm2, %ymm1, %ymm1
|
||||
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
|
||||
; AVX1-SLOW-NEXT: vaddpd %xmm2, %xmm1, %xmm1
|
||||
; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
|
||||
; AVX1-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1
|
||||
; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
|
||||
; AVX1-SLOW-NEXT: vzeroupper
|
||||
; AVX1-SLOW-NEXT: retq
|
||||
;
|
||||
; AVX1-FAST-LABEL: test_v16f64:
|
||||
; AVX1-FAST: # %bb.0:
|
||||
; AVX1-FAST-NEXT: vaddpd %ymm4, %ymm2, %ymm0
|
||||
; AVX1-FAST-NEXT: vaddpd %ymm4, %ymm2, %ymm2
|
||||
; AVX1-FAST-NEXT: vaddpd %ymm3, %ymm1, %ymm1
|
||||
; AVX1-FAST-NEXT: vaddpd %ymm0, %ymm1, %ymm0
|
||||
; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
|
||||
; AVX1-FAST-NEXT: vaddpd %xmm1, %xmm0, %xmm0
|
||||
; AVX1-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
|
||||
; AVX1-FAST-NEXT: vaddpd %ymm2, %ymm1, %ymm1
|
||||
; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm2
|
||||
; AVX1-FAST-NEXT: vaddpd %xmm2, %xmm1, %xmm1
|
||||
; AVX1-FAST-NEXT: vhaddpd %xmm1, %xmm1, %xmm1
|
||||
; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0
|
||||
; AVX1-FAST-NEXT: vzeroupper
|
||||
; AVX1-FAST-NEXT: retq
|
||||
;
|
||||
; AVX2-LABEL: test_v16f64:
|
||||
; AVX2: # %bb.0:
|
||||
; AVX2-NEXT: vaddpd %ymm4, %ymm2, %ymm0
|
||||
; AVX2-NEXT: vaddpd %ymm4, %ymm2, %ymm2
|
||||
; AVX2-NEXT: vaddpd %ymm3, %ymm1, %ymm1
|
||||
; AVX2-NEXT: vaddpd %ymm0, %ymm1, %ymm0
|
||||
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
|
||||
; AVX2-NEXT: vaddpd %xmm1, %xmm0, %xmm0
|
||||
; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
|
||||
; AVX2-NEXT: vaddpd %ymm2, %ymm1, %ymm1
|
||||
; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2
|
||||
; AVX2-NEXT: vaddpd %xmm2, %xmm1, %xmm1
|
||||
; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
|
||||
; AVX2-NEXT: vaddsd %xmm2, %xmm1, %xmm1
|
||||
; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
|
||||
; AVX2-NEXT: vzeroupper
|
||||
; AVX2-NEXT: retq
|
||||
;
|
||||
; AVX512-LABEL: test_v16f64:
|
||||
; AVX512: # %bb.0:
|
||||
; AVX512-NEXT: vaddpd %zmm2, %zmm1, %zmm0
|
||||
; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
|
||||
; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0
|
||||
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
|
||||
; AVX512-NEXT: vaddpd %xmm1, %xmm0, %xmm0
|
||||
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
|
||||
; AVX512-NEXT: vaddpd %zmm2, %zmm1, %zmm1
|
||||
; AVX512-NEXT: vextractf64x4 $1, %zmm1, %ymm2
|
||||
; AVX512-NEXT: vaddpd %zmm2, %zmm1, %zmm1
|
||||
; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2
|
||||
; AVX512-NEXT: vaddpd %xmm2, %xmm1, %xmm1
|
||||
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
|
||||
; AVX512-NEXT: vaddsd %xmm2, %xmm1, %xmm1
|
||||
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
|
||||
; AVX512-NEXT: vzeroupper
|
||||
; AVX512-NEXT: retq
|
||||
  %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.v16f64(double %a0, <16 x double> %a1)
  %1 = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v16f64(double %a0, <16 x double> %a1)
  ret double %1
}
|
||||
|
||||
@ -983,7 +1027,7 @@ define double @test_v2f64_zero(<2 x double> %a0) {
|
||||
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
|
||||
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
|
||||
; AVX512-NEXT: retq
|
||||
  %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.v2f64(double 0.0, <2 x double> %a0)
  %1 = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64(double 0.0, <2 x double> %a0)
  ret double %1
}
|
||||
|
||||
@ -1031,7 +1075,7 @@ define double @test_v4f64_zero(<4 x double> %a0) {
|
||||
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
|
||||
; AVX512-NEXT: vzeroupper
|
||||
; AVX512-NEXT: retq
|
||||
  %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.v4f64(double 0.0, <4 x double> %a0)
  %1 = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v4f64(double 0.0, <4 x double> %a0)
  ret double %1
}
|
||||
|
||||
@ -1086,7 +1130,7 @@ define double @test_v8f64_zero(<8 x double> %a0) {
|
||||
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
|
||||
; AVX512-NEXT: vzeroupper
|
||||
; AVX512-NEXT: retq
|
||||
  %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.v8f64(double 0.0, <8 x double> %a0)
  %1 = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v8f64(double 0.0, <8 x double> %a0)
  ret double %1
}
|
||||
|
||||
@ -1151,7 +1195,7 @@ define double @test_v16f64_zero(<16 x double> %a0) {
|
||||
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
|
||||
; AVX512-NEXT: vzeroupper
|
||||
; AVX512-NEXT: retq
|
||||
  %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.v16f64(double 0.0, <16 x double> %a0)
  %1 = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v16f64(double 0.0, <16 x double> %a0)
  ret double %1
}
|
||||
|
||||
@ -1190,7 +1234,7 @@ define double @test_v2f64_undef(<2 x double> %a0) {
|
||||
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
|
||||
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
|
||||
; AVX512-NEXT: retq
|
||||
  %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.v2f64(double undef, <2 x double> %a0)
  %1 = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64(double 0.0, <2 x double> %a0)
  ret double %1
}
|
||||
|
||||
@ -1238,7 +1282,7 @@ define double @test_v4f64_undef(<4 x double> %a0) {
|
||||
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
|
||||
; AVX512-NEXT: vzeroupper
|
||||
; AVX512-NEXT: retq
|
||||
  %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.v4f64(double undef, <4 x double> %a0)
  %1 = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v4f64(double 0.0, <4 x double> %a0)
  ret double %1
}
|
||||
|
||||
@ -1293,7 +1337,7 @@ define double @test_v8f64_undef(<8 x double> %a0) {
|
||||
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
|
||||
; AVX512-NEXT: vzeroupper
|
||||
; AVX512-NEXT: retq
|
||||
  %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.v8f64(double undef, <8 x double> %a0)
  %1 = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v8f64(double 0.0, <8 x double> %a0)
  ret double %1
}
|
||||
|
||||
@ -1358,16 +1402,16 @@ define double @test_v16f64_undef(<16 x double> %a0) {
|
||||
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
|
||||
; AVX512-NEXT: vzeroupper
|
||||
; AVX512-NEXT: retq
|
||||
  %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.v16f64(double undef, <16 x double> %a0)
  %1 = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v16f64(double 0.0, <16 x double> %a0)
  ret double %1
}

declare float @llvm.experimental.vector.reduce.fadd.f32.v2f32(float, <2 x float>)
declare float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float, <4 x float>)
declare float @llvm.experimental.vector.reduce.fadd.f32.v8f32(float, <8 x float>)
declare float @llvm.experimental.vector.reduce.fadd.f32.v16f32(float, <16 x float>)
declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v2f32(float, <2 x float>)
declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float, <4 x float>)
declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float, <8 x float>)
declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v16f32(float, <16 x float>)

declare double @llvm.experimental.vector.reduce.fadd.f64.v2f64(double, <2 x double>)
declare double @llvm.experimental.vector.reduce.fadd.f64.v4f64(double, <4 x double>)
declare double @llvm.experimental.vector.reduce.fadd.f64.v8f64(double, <8 x double>)
declare double @llvm.experimental.vector.reduce.fadd.f64.v16f64(double, <16 x double>)
declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64(double, <2 x double>)
declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v4f64(double, <4 x double>)
declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v8f64(double, <8 x double>)
declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v16f64(double, <16 x double>)
|
@ -39,7 +39,7 @@ define float @test_v2f32(float %a0, <2 x float> %a1) {
|
||||
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
|
||||
; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
|
||||
; AVX512-NEXT: retq
|
||||
  %1 = call float @llvm.experimental.vector.reduce.fadd.f32.v2f32(float %a0, <2 x float> %a1)
  %1 = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v2f32(float %a0, <2 x float> %a1)
  ret float %1
}
|
||||
|
||||
@ -90,7 +90,7 @@ define float @test_v4f32(float %a0, <4 x float> %a1) {
|
||||
; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
|
||||
; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
|
||||
; AVX512-NEXT: retq
|
||||
  %1 = call float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float %a0, <4 x float> %a1)
  %1 = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float %a0, <4 x float> %a1)
  ret float %1
}
|
||||
|
||||
@ -176,7 +176,7 @@ define float @test_v8f32(float %a0, <8 x float> %a1) {
|
||||
; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
|
||||
; AVX512-NEXT: vzeroupper
|
||||
; AVX512-NEXT: retq
|
||||
  %1 = call float @llvm.experimental.vector.reduce.fadd.f32.v8f32(float %a0, <8 x float> %a1)
  %1 = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float %a0, <8 x float> %a1)
  ret float %1
}
|
||||
|
||||
@ -327,7 +327,7 @@ define float @test_v16f32(float %a0, <16 x float> %a1) {
|
||||
; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
|
||||
; AVX512-NEXT: vzeroupper
|
||||
; AVX512-NEXT: retq
|
||||
  %1 = call float @llvm.experimental.vector.reduce.fadd.f32.v16f32(float %a0, <16 x float> %a1)
  %1 = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v16f32(float %a0, <16 x float> %a1)
  ret float %1
}
|
||||
|
||||
@ -367,7 +367,7 @@ define float @test_v2f32_zero(<2 x float> %a0) {
|
||||
; AVX512-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
|
||||
; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0
|
||||
; AVX512-NEXT: retq
|
||||
  %1 = call float @llvm.experimental.vector.reduce.fadd.f32.v2f32(float 0.0, <2 x float> %a0)
  %1 = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v2f32(float 0.0, <2 x float> %a0)
  ret float %1
}
|
||||
|
||||
@ -422,7 +422,7 @@ define float @test_v4f32_zero(<4 x float> %a0) {
|
||||
; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
|
||||
; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0
|
||||
; AVX512-NEXT: retq
|
||||
  %1 = call float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float 0.0, <4 x float> %a0)
  %1 = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.0, <4 x float> %a0)
  ret float %1
}
|
||||
|
||||
@ -512,7 +512,7 @@ define float @test_v8f32_zero(<8 x float> %a0) {
|
||||
; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0
|
||||
; AVX512-NEXT: vzeroupper
|
||||
; AVX512-NEXT: retq
|
||||
  %1 = call float @llvm.experimental.vector.reduce.fadd.f32.v8f32(float 0.0, <8 x float> %a0)
  %1 = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float 0.0, <8 x float> %a0)
  ret float %1
}
|
||||
|
||||
@ -667,7 +667,7 @@ define float @test_v16f32_zero(<16 x float> %a0) {
|
||||
; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0
|
||||
; AVX512-NEXT: vzeroupper
|
||||
; AVX512-NEXT: retq
|
||||
  %1 = call float @llvm.experimental.vector.reduce.fadd.f32.v16f32(float 0.0, <16 x float> %a0)
  %1 = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v16f32(float 0.0, <16 x float> %a0)
  ret float %1
}
|
||||
|
||||
@ -699,7 +699,7 @@ define float @test_v2f32_undef(<2 x float> %a0) {
|
||||
; AVX512-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
|
||||
; AVX512-NEXT: vaddss {{.*}}(%rip), %xmm0, %xmm0
|
||||
; AVX512-NEXT: retq
|
||||
  %1 = call float @llvm.experimental.vector.reduce.fadd.f32.v2f32(float undef, <2 x float> %a0)
  %1 = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v2f32(float undef, <2 x float> %a0)
  ret float %1
}
|
||||
|
||||
@ -746,7 +746,7 @@ define float @test_v4f32_undef(<4 x float> %a0) {
|
||||
; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
|
||||
; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0
|
||||
; AVX512-NEXT: retq
|
||||
  %1 = call float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %a0)
  %1 = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float undef, <4 x float> %a0)
  ret float %1
}
|
||||
|
||||
@ -828,7 +828,7 @@ define float @test_v8f32_undef(<8 x float> %a0) {
|
||||
; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0
|
||||
; AVX512-NEXT: vzeroupper
|
||||
; AVX512-NEXT: retq
|
||||
  %1 = call float @llvm.experimental.vector.reduce.fadd.f32.v8f32(float undef, <8 x float> %a0)
  %1 = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float undef, <8 x float> %a0)
  ret float %1
}
|
||||
|
||||
@ -975,7 +975,7 @@ define float @test_v16f32_undef(<16 x float> %a0) {
|
||||
; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0
|
||||
; AVX512-NEXT: vzeroupper
|
||||
; AVX512-NEXT: retq
|
||||
  %1 = call float @llvm.experimental.vector.reduce.fadd.f32.v16f32(float undef, <16 x float> %a0)
  %1 = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v16f32(float undef, <16 x float> %a0)
  ret float %1
}
|
||||
|
||||
@ -1004,7 +1004,7 @@ define double @test_v2f64(double %a0, <2 x double> %a1) {
|
||||
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
|
||||
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
|
||||
; AVX512-NEXT: retq
|
||||
  %1 = call double @llvm.experimental.vector.reduce.fadd.f64.v2f64(double %a0, <2 x double> %a1)
  %1 = call double @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64(double %a0, <2 x double> %a1)
  ret double %1
}
|
||||
|
||||
@ -1042,7 +1042,7 @@ define double @test_v4f64(double %a0, <4 x double> %a1) {
|
||||
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
|
||||
; AVX512-NEXT: vzeroupper
|
||||
; AVX512-NEXT: retq
|
||||
  %1 = call double @llvm.experimental.vector.reduce.fadd.f64.v4f64(double %a0, <4 x double> %a1)
  %1 = call double @llvm.experimental.vector.reduce.v2.fadd.f64.v4f64(double %a0, <4 x double> %a1)
  ret double %1
}
|
||||
|
||||
@ -1101,7 +1101,7 @@ define double @test_v8f64(double %a0, <8 x double> %a1) {
|
||||
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
|
||||
; AVX512-NEXT: vzeroupper
|
||||
; AVX512-NEXT: retq
|
||||
  %1 = call double @llvm.experimental.vector.reduce.fadd.f64.v8f64(double %a0, <8 x double> %a1)
  %1 = call double @llvm.experimental.vector.reduce.v2.fadd.f64.v8f64(double %a0, <8 x double> %a1)
  ret double %1
}
|
||||
|
||||
@ -1202,7 +1202,7 @@ define double @test_v16f64(double %a0, <16 x double> %a1) {
|
||||
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
|
||||
; AVX512-NEXT: vzeroupper
|
||||
; AVX512-NEXT: retq
|
||||
  %1 = call double @llvm.experimental.vector.reduce.fadd.f64.v16f64(double %a0, <16 x double> %a1)
  %1 = call double @llvm.experimental.vector.reduce.v2.fadd.f64.v16f64(double %a0, <16 x double> %a1)
  ret double %1
}
|
||||
|
||||
@ -1234,7 +1234,7 @@ define double @test_v2f64_zero(<2 x double> %a0) {
|
||||
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
|
||||
; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm0
|
||||
; AVX512-NEXT: retq
|
||||
  %1 = call double @llvm.experimental.vector.reduce.fadd.f64.v2f64(double 0.0, <2 x double> %a0)
  %1 = call double @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64(double 0.0, <2 x double> %a0)
  ret double %1
}
|
||||
|
||||
@ -1275,7 +1275,7 @@ define double @test_v4f64_zero(<4 x double> %a0) {
|
||||
; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm0
|
||||
; AVX512-NEXT: vzeroupper
|
||||
; AVX512-NEXT: retq
|
||||
  %1 = call double @llvm.experimental.vector.reduce.fadd.f64.v4f64(double 0.0, <4 x double> %a0)
  %1 = call double @llvm.experimental.vector.reduce.v2.fadd.f64.v4f64(double 0.0, <4 x double> %a0)
  ret double %1
}
|
||||
|
||||
@ -1337,7 +1337,7 @@ define double @test_v8f64_zero(<8 x double> %a0) {
|
||||
; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm0
|
||||
; AVX512-NEXT: vzeroupper
|
||||
; AVX512-NEXT: retq
|
||||
  %1 = call double @llvm.experimental.vector.reduce.fadd.f64.v8f64(double 0.0, <8 x double> %a0)
  %1 = call double @llvm.experimental.vector.reduce.v2.fadd.f64.v8f64(double 0.0, <8 x double> %a0)
  ret double %1
}
|
||||
|
||||
@ -1440,7 +1440,7 @@ define double @test_v16f64_zero(<16 x double> %a0) {
|
||||
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
|
||||
; AVX512-NEXT: vzeroupper
|
||||
; AVX512-NEXT: retq
|
||||
  %1 = call double @llvm.experimental.vector.reduce.fadd.f64.v16f64(double 0.0, <16 x double> %a0)
  %1 = call double @llvm.experimental.vector.reduce.v2.fadd.f64.v16f64(double 0.0, <16 x double> %a0)
  ret double %1
}
|
||||
|
||||
@ -1466,7 +1466,7 @@ define double @test_v2f64_undef(<2 x double> %a0) {
|
||||
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
|
||||
; AVX512-NEXT: vaddsd {{.*}}(%rip), %xmm0, %xmm0
|
||||
; AVX512-NEXT: retq
|
||||
  %1 = call double @llvm.experimental.vector.reduce.fadd.f64.v2f64(double undef, <2 x double> %a0)
  %1 = call double @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64(double undef, <2 x double> %a0)
  ret double %1
}
|
||||
|
||||
@ -1501,7 +1501,7 @@ define double @test_v4f64_undef(<4 x double> %a0) {
|
||||
; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm0
|
||||
; AVX512-NEXT: vzeroupper
|
||||
; AVX512-NEXT: retq
|
||||
  %1 = call double @llvm.experimental.vector.reduce.fadd.f64.v4f64(double undef, <4 x double> %a0)
  %1 = call double @llvm.experimental.vector.reduce.v2.fadd.f64.v4f64(double undef, <4 x double> %a0)
  ret double %1
}
|
||||
|
||||
@ -1557,7 +1557,7 @@ define double @test_v8f64_undef(<8 x double> %a0) {
|
||||
; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm0
|
||||
; AVX512-NEXT: vzeroupper
|
||||
; AVX512-NEXT: retq
|
||||
  %1 = call double @llvm.experimental.vector.reduce.fadd.f64.v8f64(double undef, <8 x double> %a0)
  %1 = call double @llvm.experimental.vector.reduce.v2.fadd.f64.v8f64(double undef, <8 x double> %a0)
  ret double %1
}
|
||||
|
||||
@ -1654,16 +1654,16 @@ define double @test_v16f64_undef(<16 x double> %a0) {
|
||||
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
|
||||
; AVX512-NEXT: vzeroupper
|
||||
; AVX512-NEXT: retq
|
||||
  %1 = call double @llvm.experimental.vector.reduce.fadd.f64.v16f64(double undef, <16 x double> %a0)
  %1 = call double @llvm.experimental.vector.reduce.v2.fadd.f64.v16f64(double undef, <16 x double> %a0)
  ret double %1
}

declare float @llvm.experimental.vector.reduce.fadd.f32.v2f32(float, <2 x float>)
declare float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float, <4 x float>)
declare float @llvm.experimental.vector.reduce.fadd.f32.v8f32(float, <8 x float>)
declare float @llvm.experimental.vector.reduce.fadd.f32.v16f32(float, <16 x float>)
declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v2f32(float, <2 x float>)
declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float, <4 x float>)
declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float, <8 x float>)
declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v16f32(float, <16 x float>)

declare double @llvm.experimental.vector.reduce.fadd.f64.v2f64(double, <2 x double>)
declare double @llvm.experimental.vector.reduce.fadd.f64.v4f64(double, <4 x double>)
declare double @llvm.experimental.vector.reduce.fadd.f64.v8f64(double, <8 x double>)
declare double @llvm.experimental.vector.reduce.fadd.f64.v16f64(double, <16 x double>)
declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64(double, <2 x double>)
declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v4f64(double, <4 x double>)
declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v8f64(double, <8 x double>)
declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v16f64(double, <16 x double>)
|
@ -13,29 +13,33 @@
|
||||
define float @test_v2f32(float %a0, <2 x float> %a1) {
|
||||
; SSE2-LABEL: test_v2f32:
|
||||
; SSE2: # %bb.0:
|
||||
; SSE2-NEXT: movaps %xmm1, %xmm0
|
||||
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3]
|
||||
; SSE2-NEXT: mulss %xmm1, %xmm0
|
||||
; SSE2-NEXT: movaps %xmm1, %xmm2
|
||||
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[2,3]
|
||||
; SSE2-NEXT: mulss %xmm1, %xmm2
|
||||
; SSE2-NEXT: mulss %xmm2, %xmm0
|
||||
; SSE2-NEXT: retq
|
||||
;
|
||||
; SSE41-LABEL: test_v2f32:
|
||||
; SSE41: # %bb.0:
|
||||
; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
|
||||
; SSE41-NEXT: mulss %xmm1, %xmm0
|
||||
; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
|
||||
; SSE41-NEXT: mulss %xmm1, %xmm2
|
||||
; SSE41-NEXT: mulss %xmm2, %xmm0
|
||||
; SSE41-NEXT: retq
|
||||
;
|
||||
; AVX-LABEL: test_v2f32:
|
||||
; AVX: # %bb.0:
|
||||
; AVX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
|
||||
; AVX-NEXT: vmulss %xmm0, %xmm1, %xmm0
|
||||
; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
|
||||
; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1
|
||||
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
|
||||
; AVX-NEXT: retq
|
||||
;
|
||||
; AVX512-LABEL: test_v2f32:
|
||||
; AVX512: # %bb.0:
|
||||
; AVX512-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
|
||||
; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0
|
||||
; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
|
||||
; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
|
||||
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
|
||||
; AVX512-NEXT: retq
|
||||
  %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.v2f32(float %a0, <2 x float> %a1)
  %1 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v2f32(float %a0, <2 x float> %a1)
  ret float %1
}
|
||||
|
||||
@ -45,9 +49,10 @@ define float @test_v4f32(float %a0, <4 x float> %a1) {
|
||||
; SSE2-NEXT: movaps %xmm1, %xmm2
|
||||
; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
|
||||
; SSE2-NEXT: mulps %xmm1, %xmm2
|
||||
; SSE2-NEXT: movaps %xmm2, %xmm0
|
||||
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[2,3]
|
||||
; SSE2-NEXT: mulss %xmm2, %xmm0
|
||||
; SSE2-NEXT: movaps %xmm2, %xmm1
|
||||
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[2,3]
|
||||
; SSE2-NEXT: mulss %xmm2, %xmm1
|
||||
; SSE2-NEXT: mulss %xmm1, %xmm0
|
||||
; SSE2-NEXT: retq
|
||||
;
|
||||
; SSE41-LABEL: test_v4f32:
|
||||
@ -55,26 +60,29 @@ define float @test_v4f32(float %a0, <4 x float> %a1) {
|
||||
; SSE41-NEXT: movaps %xmm1, %xmm2
|
||||
; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
|
||||
; SSE41-NEXT: mulps %xmm1, %xmm2
|
||||
; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm2[1,1,3,3]
|
||||
; SSE41-NEXT: mulss %xmm2, %xmm0
|
||||
; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
|
||||
; SSE41-NEXT: mulss %xmm2, %xmm1
|
||||
; SSE41-NEXT: mulss %xmm1, %xmm0
|
||||
; SSE41-NEXT: retq
|
||||
;
|
||||
; AVX-LABEL: test_v4f32:
|
||||
; AVX: # %bb.0:
|
||||
; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
|
||||
; AVX-NEXT: vmulps %xmm0, %xmm1, %xmm0
|
||||
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
|
||||
; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
|
||||
; AVX-NEXT: vmulps %xmm2, %xmm1, %xmm1
|
||||
; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
|
||||
; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1
|
||||
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
|
||||
; AVX-NEXT: retq
|
||||
;
|
||||
; AVX512-LABEL: test_v4f32:
|
||||
; AVX512: # %bb.0:
|
||||
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
|
||||
; AVX512-NEXT: vmulps %xmm0, %xmm1, %xmm0
|
||||
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
|
||||
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
|
||||
; AVX512-NEXT: vmulps %xmm2, %xmm1, %xmm1
|
||||
; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
|
||||
; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
|
||||
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
|
||||
; AVX512-NEXT: retq
|
||||
  %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float %a0, <4 x float> %a1)
  %1 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float %a0, <4 x float> %a1)
  ret float %1
}
|
||||
|
||||
@ -85,9 +93,10 @@ define float @test_v8f32(float %a0, <8 x float> %a1) {
|
||||
; SSE2-NEXT: movaps %xmm1, %xmm2
|
||||
; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
|
||||
; SSE2-NEXT: mulps %xmm1, %xmm2
|
||||
; SSE2-NEXT: movaps %xmm2, %xmm0
|
||||
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[2,3]
|
||||
; SSE2-NEXT: mulss %xmm2, %xmm0
|
||||
; SSE2-NEXT: movaps %xmm2, %xmm1
|
||||
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[2,3]
|
||||
; SSE2-NEXT: mulss %xmm2, %xmm1
|
||||
; SSE2-NEXT: mulss %xmm1, %xmm0
|
||||
; SSE2-NEXT: retq
|
||||
;
|
||||
; SSE41-LABEL: test_v8f32:
|
||||
@ -96,32 +105,35 @@ define float @test_v8f32(float %a0, <8 x float> %a1) {
|
||||
; SSE41-NEXT: movaps %xmm1, %xmm2
|
||||
; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
|
||||
; SSE41-NEXT: mulps %xmm1, %xmm2
|
||||
; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm2[1,1,3,3]
|
||||
; SSE41-NEXT: mulss %xmm2, %xmm0
|
||||
; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
|
||||
; SSE41-NEXT: mulss %xmm2, %xmm1
|
||||
; SSE41-NEXT: mulss %xmm1, %xmm0
|
||||
; SSE41-NEXT: retq
|
||||
;
|
||||
; AVX-LABEL: test_v8f32:
|
||||
; AVX: # %bb.0:
|
||||
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm0
|
||||
; AVX-NEXT: vmulps %xmm0, %xmm1, %xmm0
|
||||
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
|
||||
; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
|
||||
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
|
||||
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
|
||||
; AVX-NEXT: vmulps %xmm2, %xmm1, %xmm1
|
||||
; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
|
||||
; AVX-NEXT: vmulps %xmm2, %xmm1, %xmm1
|
||||
; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
|
||||
; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1
|
||||
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
|
||||
; AVX-NEXT: vzeroupper
|
||||
; AVX-NEXT: retq
|
||||
;
|
||||
; AVX512-LABEL: test_v8f32:
|
||||
; AVX512: # %bb.0:
|
||||
; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm0
|
||||
; AVX512-NEXT: vmulps %xmm0, %xmm1, %xmm0
|
||||
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
|
||||
; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
|
||||
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
|
||||
; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2
|
||||
; AVX512-NEXT: vmulps %xmm2, %xmm1, %xmm1
|
||||
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
|
||||
; AVX512-NEXT: vmulps %xmm2, %xmm1, %xmm1
|
||||
; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
|
||||
; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
|
||||
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
|
||||
; AVX512-NEXT: vzeroupper
|
||||
; AVX512-NEXT: retq
|
||||
  %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.v8f32(float %a0, <8 x float> %a1)
  %1 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v8f32(float %a0, <8 x float> %a1)
  ret float %1
}
|
||||
|
||||
@ -134,9 +146,10 @@ define float @test_v16f32(float %a0, <16 x float> %a1) {
|
||||
; SSE2-NEXT: movaps %xmm1, %xmm2
|
||||
; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
|
||||
; SSE2-NEXT: mulps %xmm1, %xmm2
|
||||
; SSE2-NEXT: movaps %xmm2, %xmm0
|
||||
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[2,3]
|
||||
; SSE2-NEXT: mulss %xmm2, %xmm0
|
||||
; SSE2-NEXT: movaps %xmm2, %xmm1
|
||||
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[2,3]
|
||||
; SSE2-NEXT: mulss %xmm2, %xmm1
|
||||
; SSE2-NEXT: mulss %xmm1, %xmm0
|
||||
; SSE2-NEXT: retq
|
||||
;
|
||||
; SSE41-LABEL: test_v16f32:
|
||||
@ -147,35 +160,38 @@ define float @test_v16f32(float %a0, <16 x float> %a1) {
|
||||
; SSE41-NEXT: movaps %xmm1, %xmm2
|
||||
; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
|
||||
; SSE41-NEXT: mulps %xmm1, %xmm2
|
||||
; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm2[1,1,3,3]
|
||||
; SSE41-NEXT: mulss %xmm2, %xmm0
|
||||
; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
|
||||
; SSE41-NEXT: mulss %xmm2, %xmm1
|
||||
; SSE41-NEXT: mulss %xmm1, %xmm0
|
||||
; SSE41-NEXT: retq
|
||||
;
|
||||
; AVX-LABEL: test_v16f32:
|
||||
; AVX: # %bb.0:
|
||||
; AVX-NEXT: vmulps %ymm2, %ymm1, %ymm0
|
||||
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
|
||||
; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
|
||||
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
|
||||
; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
|
||||
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
|
||||
; AVX-NEXT: vmulps %ymm2, %ymm1, %ymm1
|
||||
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
|
||||
; AVX-NEXT: vmulps %xmm2, %xmm1, %xmm1
|
||||
; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
|
||||
; AVX-NEXT: vmulps %xmm2, %xmm1, %xmm1
|
||||
; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
|
||||
; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1
|
||||
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
|
||||
; AVX-NEXT: vzeroupper
|
||||
; AVX-NEXT: retq
|
||||
;
|
||||
; AVX512-LABEL: test_v16f32:
|
||||
; AVX512: # %bb.0:
|
||||
; AVX512-NEXT: vextractf64x4 $1, %zmm1, %ymm0
|
||||
; AVX512-NEXT: vmulps %zmm0, %zmm1, %zmm0
|
||||
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
|
||||
; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
|
||||
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
|
||||
; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
|
||||
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
|
||||
; AVX512-NEXT: vextractf64x4 $1, %zmm1, %ymm2
|
||||
; AVX512-NEXT: vmulps %zmm2, %zmm1, %zmm1
|
||||
; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2
|
||||
; AVX512-NEXT: vmulps %xmm2, %xmm1, %xmm1
|
||||
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
|
||||
; AVX512-NEXT: vmulps %xmm2, %xmm1, %xmm1
|
||||
; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
|
||||
; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
|
||||
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
|
||||
; AVX512-NEXT: vzeroupper
|
||||
; AVX512-NEXT: retq
|
||||
  %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.v16f32(float %a0, <16 x float> %a1)
  %1 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v16f32(float %a0, <16 x float> %a1)
  ret float %1
}
|
||||
|
||||
@ -209,7 +225,7 @@ define float @test_v2f32_zero(<2 x float> %a0) {
|
||||
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
|
||||
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
|
||||
; AVX512-NEXT: retq
|
||||
  %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.v2f32(float 1.0, <2 x float> %a0)
  %1 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v2f32(float 1.0, <2 x float> %a0)
  ret float %1
}
|
||||
|
||||
@ -249,7 +265,7 @@ define float @test_v4f32_zero(<4 x float> %a0) {
|
||||
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
|
||||
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
|
||||
; AVX512-NEXT: retq
|
||||
  %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %a0)
  %1 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float 1.0, <4 x float> %a0)
  ret float %1
}
|
||||
|
||||
@ -297,7 +313,7 @@ define float @test_v8f32_zero(<8 x float> %a0) {
|
||||
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
|
||||
; AVX512-NEXT: vzeroupper
|
||||
; AVX512-NEXT: retq
|
||||
  %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.v8f32(float 1.0, <8 x float> %a0)
  %1 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v8f32(float 1.0, <8 x float> %a0)
  ret float %1
}
|
||||
|
||||
@ -352,7 +368,7 @@ define float @test_v16f32_zero(<16 x float> %a0) {
|
||||
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
|
||||
; AVX512-NEXT: vzeroupper
|
||||
; AVX512-NEXT: retq
|
||||
  %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.v16f32(float 1.0, <16 x float> %a0)
  %1 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v16f32(float 1.0, <16 x float> %a0)
  ret float %1
}
|
||||
|
||||
@ -386,7 +402,7 @@ define float @test_v2f32_undef(<2 x float> %a0) {
|
||||
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
|
||||
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
|
||||
; AVX512-NEXT: retq
|
||||
  %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.v2f32(float undef, <2 x float> %a0)
  %1 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v2f32(float 1.0, <2 x float> %a0)
  ret float %1
}
|
||||
|
||||
@ -426,7 +442,7 @@ define float @test_v4f32_undef(<4 x float> %a0) {
|
||||
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
|
||||
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
|
||||
; AVX512-NEXT: retq
|
||||
  %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float undef, <4 x float> %a0)
  %1 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float 1.0, <4 x float> %a0)
  ret float %1
}
|
||||
|
||||
@ -474,7 +490,7 @@ define float @test_v8f32_undef(<8 x float> %a0) {
|
||||
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
|
||||
; AVX512-NEXT: vzeroupper
|
||||
; AVX512-NEXT: retq
|
||||
  %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.v8f32(float undef, <8 x float> %a0)
  %1 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v8f32(float 1.0, <8 x float> %a0)
  ret float %1
}
|
||||
|
||||
@ -529,7 +545,7 @@ define float @test_v16f32_undef(<16 x float> %a0) {
|
||||
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
|
||||
; AVX512-NEXT: vzeroupper
|
||||
; AVX512-NEXT: retq
|
||||
  %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.v16f32(float undef, <16 x float> %a0)
  %1 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v16f32(float 1.0, <16 x float> %a0)
  ret float %1
}
|
||||
|
||||
@ -540,23 +556,26 @@ define float @test_v16f32_undef(<16 x float> %a0) {
|
||||
define double @test_v2f64(double %a0, <2 x double> %a1) {
|
||||
; SSE-LABEL: test_v2f64:
|
||||
; SSE: # %bb.0:
|
||||
; SSE-NEXT: movapd %xmm1, %xmm0
|
||||
; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
|
||||
; SSE-NEXT: mulsd %xmm1, %xmm0
|
||||
; SSE-NEXT: movapd %xmm1, %xmm2
|
||||
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
|
||||
; SSE-NEXT: mulsd %xmm1, %xmm2
|
||||
; SSE-NEXT: mulsd %xmm2, %xmm0
|
||||
; SSE-NEXT: retq
|
||||
;
|
||||
; AVX-LABEL: test_v2f64:
|
||||
; AVX: # %bb.0:
|
||||
; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
|
||||
; AVX-NEXT: vmulsd %xmm0, %xmm1, %xmm0
|
||||
; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
|
||||
; AVX-NEXT: vmulsd %xmm2, %xmm1, %xmm1
|
||||
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
|
||||
; AVX-NEXT: retq
|
||||
;
|
||||
; AVX512-LABEL: test_v2f64:
|
||||
; AVX512: # %bb.0:
|
||||
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
|
||||
; AVX512-NEXT: vmulsd %xmm0, %xmm1, %xmm0
|
||||
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
|
||||
; AVX512-NEXT: vmulsd %xmm2, %xmm1, %xmm1
|
||||
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
|
||||
; AVX512-NEXT: retq
|
||||
  %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.v2f64(double %a0, <2 x double> %a1)
  %1 = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.v2f64(double %a0, <2 x double> %a1)
  ret double %1
}
|
||||
|
||||
@ -564,29 +583,32 @@ define double @test_v4f64(double %a0, <4 x double> %a1) {
|
||||
; SSE-LABEL: test_v4f64:
|
||||
; SSE: # %bb.0:
|
||||
; SSE-NEXT: mulpd %xmm2, %xmm1
|
||||
; SSE-NEXT: movapd %xmm1, %xmm0
|
||||
; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
|
||||
; SSE-NEXT: mulsd %xmm1, %xmm0
|
||||
; SSE-NEXT: movapd %xmm1, %xmm2
|
||||
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
|
||||
; SSE-NEXT: mulsd %xmm1, %xmm2
|
||||
; SSE-NEXT: mulsd %xmm2, %xmm0
|
||||
; SSE-NEXT: retq
|
||||
;
|
||||
; AVX-LABEL: test_v4f64:
|
||||
; AVX: # %bb.0:
|
||||
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm0
|
||||
; AVX-NEXT: vmulpd %xmm0, %xmm1, %xmm0
|
||||
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
|
||||
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
|
||||
; AVX-NEXT: vmulpd %xmm2, %xmm1, %xmm1
|
||||
; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
|
||||
; AVX-NEXT: vmulsd %xmm2, %xmm1, %xmm1
|
||||
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
|
||||
; AVX-NEXT: vzeroupper
|
||||
; AVX-NEXT: retq
|
||||
;
|
||||
; AVX512-LABEL: test_v4f64:
|
||||
; AVX512: # %bb.0:
|
||||
; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm0
|
||||
; AVX512-NEXT: vmulpd %xmm0, %xmm1, %xmm0
|
||||
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
|
||||
; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2
|
||||
; AVX512-NEXT: vmulpd %xmm2, %xmm1, %xmm1
|
||||
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
|
||||
; AVX512-NEXT: vmulsd %xmm2, %xmm1, %xmm1
|
||||
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
|
||||
; AVX512-NEXT: vzeroupper
|
||||
; AVX512-NEXT: retq
|
||||
  %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.v4f64(double %a0, <4 x double> %a1)
  %1 = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.v4f64(double %a0, <4 x double> %a1)
  ret double %1
}
|
||||
|
||||
@ -596,32 +618,35 @@ define double @test_v8f64(double %a0, <8 x double> %a1) {
; SSE-NEXT: mulpd %xmm4, %xmm2
; SSE-NEXT: mulpd %xmm3, %xmm1
; SSE-NEXT: mulpd %xmm2, %xmm1
; SSE-NEXT: movapd %xmm1, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE-NEXT: mulsd %xmm1, %xmm0
; SSE-NEXT: movapd %xmm1, %xmm2
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE-NEXT: mulsd %xmm1, %xmm2
; SSE-NEXT: mulsd %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v8f64:
; AVX: # %bb.0:
; AVX-NEXT: vmulpd %ymm2, %ymm1, %ymm0
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulpd %ymm2, %ymm1, %ymm1
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX-NEXT: vmulpd %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT: vmulsd %xmm2, %xmm1, %xmm1
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v8f64:
; AVX512: # %bb.0:
; AVX512-NEXT: vextractf64x4 $1, %zmm1, %ymm0
; AVX512-NEXT: vmulpd %zmm0, %zmm1, %zmm0
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vextractf64x4 $1, %zmm1, %ymm2
; AVX512-NEXT: vmulpd %zmm2, %zmm1, %zmm1
; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX512-NEXT: vmulpd %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT: vmulsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.v8f64(double %a0, <8 x double> %a1)
%1 = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.v8f64(double %a0, <8 x double> %a1)
ret double %1
}

@ -635,35 +660,38 @@ define double @test_v16f64(double %a0, <16 x double> %a1) {
; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm4
; SSE-NEXT: mulpd %xmm2, %xmm4
; SSE-NEXT: mulpd %xmm1, %xmm4
; SSE-NEXT: movapd %xmm4, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1]
; SSE-NEXT: mulsd %xmm4, %xmm0
; SSE-NEXT: movapd %xmm4, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1]
; SSE-NEXT: mulsd %xmm4, %xmm1
; SSE-NEXT: mulsd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v16f64:
; AVX: # %bb.0:
; AVX-NEXT: vmulpd %ymm4, %ymm2, %ymm0
; AVX-NEXT: vmulpd %ymm4, %ymm2, %ymm2
; AVX-NEXT: vmulpd %ymm3, %ymm1, %ymm1
; AVX-NEXT: vmulpd %ymm0, %ymm1, %ymm0
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulpd %ymm2, %ymm1, %ymm1
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX-NEXT: vmulpd %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT: vmulsd %xmm2, %xmm1, %xmm1
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v16f64:
; AVX512: # %bb.0:
; AVX512-NEXT: vmulpd %zmm2, %zmm1, %zmm0
; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vmulpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulpd %zmm2, %zmm1, %zmm1
; AVX512-NEXT: vextractf64x4 $1, %zmm1, %ymm2
; AVX512-NEXT: vmulpd %zmm2, %zmm1, %zmm1
; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX512-NEXT: vmulpd %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT: vmulsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.v16f64(double %a0, <16 x double> %a1)
%1 = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.v16f64(double %a0, <16 x double> %a1)
ret double %1
}

@ -691,7 +719,7 @@ define double @test_v2f64_zero(<2 x double> %a0) {
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
%1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.v2f64(double 1.0, <2 x double> %a0)
%1 = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.v2f64(double 1.0, <2 x double> %a0)
ret double %1
}

@ -722,7 +750,7 @@ define double @test_v4f64_zero(<4 x double> %a0) {
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.v4f64(double 1.0, <4 x double> %a0)
%1 = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.v4f64(double 1.0, <4 x double> %a0)
ret double %1
}

@ -758,7 +786,7 @@ define double @test_v8f64_zero(<8 x double> %a0) {
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.v8f64(double 1.0, <8 x double> %a0)
%1 = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.v8f64(double 1.0, <8 x double> %a0)
ret double %1
}

@ -800,7 +828,7 @@ define double @test_v16f64_zero(<16 x double> %a0) {
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.v16f64(double 1.0, <16 x double> %a0)
%1 = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.v16f64(double 1.0, <16 x double> %a0)
ret double %1
}

@ -828,7 +856,7 @@ define double @test_v2f64_undef(<2 x double> %a0) {
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
%1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.v2f64(double undef, <2 x double> %a0)
%1 = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.v2f64(double 1.0, <2 x double> %a0)
ret double %1
}

@ -859,7 +887,7 @@ define double @test_v4f64_undef(<4 x double> %a0) {
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.v4f64(double undef, <4 x double> %a0)
%1 = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.v4f64(double 1.0, <4 x double> %a0)
ret double %1
}

@ -895,7 +923,7 @@ define double @test_v8f64_undef(<8 x double> %a0) {
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.v8f64(double undef, <8 x double> %a0)
%1 = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.v8f64(double 1.0, <8 x double> %a0)
ret double %1
}

@ -937,16 +965,16 @@ define double @test_v16f64_undef(<16 x double> %a0) {
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.v16f64(double undef, <16 x double> %a0)
%1 = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.v16f64(double 1.0, <16 x double> %a0)
ret double %1
}

declare float @llvm.experimental.vector.reduce.fmul.f32.v2f32(float, <2 x float>)
declare float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float, <4 x float>)
declare float @llvm.experimental.vector.reduce.fmul.f32.v8f32(float, <8 x float>)
declare float @llvm.experimental.vector.reduce.fmul.f32.v16f32(float, <16 x float>)
declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v2f32(float, <2 x float>)
declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float, <4 x float>)
declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v8f32(float, <8 x float>)
declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v16f32(float, <16 x float>)

declare double @llvm.experimental.vector.reduce.fmul.f64.v2f64(double, <2 x double>)
declare double @llvm.experimental.vector.reduce.fmul.f64.v4f64(double, <4 x double>)
declare double @llvm.experimental.vector.reduce.fmul.f64.v8f64(double, <8 x double>)
declare double @llvm.experimental.vector.reduce.fmul.f64.v16f64(double, <16 x double>)
declare double @llvm.experimental.vector.reduce.v2.fmul.f64.v2f64(double, <2 x double>)
declare double @llvm.experimental.vector.reduce.v2.fmul.f64.v4f64(double, <4 x double>)
declare double @llvm.experimental.vector.reduce.v2.fmul.f64.v8f64(double, <8 x double>)
declare double @llvm.experimental.vector.reduce.v2.fmul.f64.v16f64(double, <16 x double>)

@ -38,7 +38,7 @@ define float @test_v2f32(float %a0, <2 x float> %a1) {
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
%1 = call float @llvm.experimental.vector.reduce.fmul.f32.v2f32(float %a0, <2 x float> %a1)
%1 = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v2f32(float %a0, <2 x float> %a1)
ret float %1
}

@ -89,7 +89,7 @@ define float @test_v4f32(float %a0, <4 x float> %a1) {
; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
%1 = call float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float %a0, <4 x float> %a1)
%1 = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float %a0, <4 x float> %a1)
ret float %1
}

@ -175,7 +175,7 @@ define float @test_v8f32(float %a0, <8 x float> %a1) {
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = call float @llvm.experimental.vector.reduce.fmul.f32.v8f32(float %a0, <8 x float> %a1)
%1 = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v8f32(float %a0, <8 x float> %a1)
ret float %1
}

@ -326,7 +326,7 @@ define float @test_v16f32(float %a0, <16 x float> %a1) {
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = call float @llvm.experimental.vector.reduce.fmul.f32.v16f32(float %a0, <16 x float> %a1)
%1 = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v16f32(float %a0, <16 x float> %a1)
ret float %1
}

@ -360,7 +360,7 @@ define float @test_v2f32_one(<2 x float> %a0) {
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
%1 = call float @llvm.experimental.vector.reduce.fmul.f32.v2f32(float 1.0, <2 x float> %a0)
%1 = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v2f32(float 1.0, <2 x float> %a0)
ret float %1
}

@ -407,7 +407,7 @@ define float @test_v4f32_one(<4 x float> %a0) {
; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0
; AVX512-NEXT: retq
%1 = call float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %a0)
%1 = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float 1.0, <4 x float> %a0)
ret float %1
}

@ -489,7 +489,7 @@ define float @test_v8f32_one(<8 x float> %a0) {
; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = call float @llvm.experimental.vector.reduce.fmul.f32.v8f32(float 1.0, <8 x float> %a0)
%1 = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v8f32(float 1.0, <8 x float> %a0)
ret float %1
}

@ -636,7 +636,7 @@ define float @test_v16f32_one(<16 x float> %a0) {
; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = call float @llvm.experimental.vector.reduce.fmul.f32.v16f32(float 1.0, <16 x float> %a0)
%1 = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v16f32(float 1.0, <16 x float> %a0)
ret float %1
}

@ -668,7 +668,7 @@ define float @test_v2f32_undef(<2 x float> %a0) {
; AVX512-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX512-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
%1 = call float @llvm.experimental.vector.reduce.fmul.f32.v2f32(float undef, <2 x float> %a0)
%1 = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v2f32(float undef, <2 x float> %a0)
ret float %1
}

@ -715,7 +715,7 @@ define float @test_v4f32_undef(<4 x float> %a0) {
; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0
; AVX512-NEXT: retq
%1 = call float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float undef, <4 x float> %a0)
%1 = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float undef, <4 x float> %a0)
ret float %1
}

@ -797,7 +797,7 @@ define float @test_v8f32_undef(<8 x float> %a0) {
; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = call float @llvm.experimental.vector.reduce.fmul.f32.v8f32(float undef, <8 x float> %a0)
%1 = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v8f32(float undef, <8 x float> %a0)
ret float %1
}

@ -944,7 +944,7 @@ define float @test_v16f32_undef(<16 x float> %a0) {
; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = call float @llvm.experimental.vector.reduce.fmul.f32.v16f32(float undef, <16 x float> %a0)
%1 = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v16f32(float undef, <16 x float> %a0)
ret float %1
}

@ -973,7 +973,7 @@ define double @test_v2f64(double %a0, <2 x double> %a1) {
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
%1 = call double @llvm.experimental.vector.reduce.fmul.f64.v2f64(double %a0, <2 x double> %a1)
%1 = call double @llvm.experimental.vector.reduce.v2.fmul.f64.v2f64(double %a0, <2 x double> %a1)
ret double %1
}

@ -1011,7 +1011,7 @@ define double @test_v4f64(double %a0, <4 x double> %a1) {
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = call double @llvm.experimental.vector.reduce.fmul.f64.v4f64(double %a0, <4 x double> %a1)
%1 = call double @llvm.experimental.vector.reduce.v2.fmul.f64.v4f64(double %a0, <4 x double> %a1)
ret double %1
}

@ -1070,7 +1070,7 @@ define double @test_v8f64(double %a0, <8 x double> %a1) {
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = call double @llvm.experimental.vector.reduce.fmul.f64.v8f64(double %a0, <8 x double> %a1)
%1 = call double @llvm.experimental.vector.reduce.v2.fmul.f64.v8f64(double %a0, <8 x double> %a1)
ret double %1
}

@ -1171,7 +1171,7 @@ define double @test_v16f64(double %a0, <16 x double> %a1) {
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = call double @llvm.experimental.vector.reduce.fmul.f64.v16f64(double %a0, <16 x double> %a1)
%1 = call double @llvm.experimental.vector.reduce.v2.fmul.f64.v16f64(double %a0, <16 x double> %a1)
ret double %1
}

@ -1199,7 +1199,7 @@ define double @test_v2f64_one(<2 x double> %a0) {
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
%1 = call double @llvm.experimental.vector.reduce.fmul.f64.v2f64(double 1.0, <2 x double> %a0)
%1 = call double @llvm.experimental.vector.reduce.v2.fmul.f64.v2f64(double 1.0, <2 x double> %a0)
ret double %1
}

@ -1236,7 +1236,7 @@ define double @test_v4f64_one(<4 x double> %a0) {
; AVX512-NEXT: vmulsd %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = call double @llvm.experimental.vector.reduce.fmul.f64.v4f64(double 1.0, <4 x double> %a0)
%1 = call double @llvm.experimental.vector.reduce.v2.fmul.f64.v4f64(double 1.0, <4 x double> %a0)
ret double %1
}

@ -1294,7 +1294,7 @@ define double @test_v8f64_one(<8 x double> %a0) {
; AVX512-NEXT: vmulsd %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = call double @llvm.experimental.vector.reduce.fmul.f64.v8f64(double 1.0, <8 x double> %a0)
%1 = call double @llvm.experimental.vector.reduce.v2.fmul.f64.v8f64(double 1.0, <8 x double> %a0)
ret double %1
}

@ -1392,7 +1392,7 @@ define double @test_v16f64_one(<16 x double> %a0) {
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = call double @llvm.experimental.vector.reduce.fmul.f64.v16f64(double 1.0, <16 x double> %a0)
%1 = call double @llvm.experimental.vector.reduce.v2.fmul.f64.v16f64(double 1.0, <16 x double> %a0)
ret double %1
}

@ -1418,7 +1418,7 @@ define double @test_v2f64_undef(<2 x double> %a0) {
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT: vmulsd {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
%1 = call double @llvm.experimental.vector.reduce.fmul.f64.v2f64(double undef, <2 x double> %a0)
%1 = call double @llvm.experimental.vector.reduce.v2.fmul.f64.v2f64(double undef, <2 x double> %a0)
ret double %1
}

@ -1453,7 +1453,7 @@ define double @test_v4f64_undef(<4 x double> %a0) {
; AVX512-NEXT: vmulsd %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = call double @llvm.experimental.vector.reduce.fmul.f64.v4f64(double undef, <4 x double> %a0)
%1 = call double @llvm.experimental.vector.reduce.v2.fmul.f64.v4f64(double undef, <4 x double> %a0)
ret double %1
}

@ -1509,7 +1509,7 @@ define double @test_v8f64_undef(<8 x double> %a0) {
; AVX512-NEXT: vmulsd %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = call double @llvm.experimental.vector.reduce.fmul.f64.v8f64(double undef, <8 x double> %a0)
%1 = call double @llvm.experimental.vector.reduce.v2.fmul.f64.v8f64(double undef, <8 x double> %a0)
ret double %1
}

@ -1606,16 +1606,16 @@ define double @test_v16f64_undef(<16 x double> %a0) {
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = call double @llvm.experimental.vector.reduce.fmul.f64.v16f64(double undef, <16 x double> %a0)
%1 = call double @llvm.experimental.vector.reduce.v2.fmul.f64.v16f64(double undef, <16 x double> %a0)
ret double %1
}

declare float @llvm.experimental.vector.reduce.fmul.f32.v2f32(float, <2 x float>)
declare float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float, <4 x float>)
declare float @llvm.experimental.vector.reduce.fmul.f32.v8f32(float, <8 x float>)
declare float @llvm.experimental.vector.reduce.fmul.f32.v16f32(float, <16 x float>)
declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v2f32(float, <2 x float>)
declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float, <4 x float>)
declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v8f32(float, <8 x float>)
declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v16f32(float, <16 x float>)

declare double @llvm.experimental.vector.reduce.fmul.f64.v2f64(double, <2 x double>)
declare double @llvm.experimental.vector.reduce.fmul.f64.v4f64(double, <4 x double>)
declare double @llvm.experimental.vector.reduce.fmul.f64.v8f64(double, <8 x double>)
declare double @llvm.experimental.vector.reduce.fmul.f64.v16f64(double, <16 x double>)
declare double @llvm.experimental.vector.reduce.v2.fmul.f64.v2f64(double, <2 x double>)
declare double @llvm.experimental.vector.reduce.v2.fmul.f64.v4f64(double, <4 x double>)
declare double @llvm.experimental.vector.reduce.v2.fmul.f64.v8f64(double, <8 x double>)
declare double @llvm.experimental.vector.reduce.v2.fmul.f64.v16f64(double, <16 x double>)