mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2025-04-03 16:21:41 +00:00
[DAG] propagate FMF for all FPMathOperators
This is a simple hack based on what's proposed in D37686, but we can extend it if needed in follow-ups. It gets us most of the FMF functionality that we want without adding any state bits to the flags. It also intentionally leaves out non-FMF flags (nsw, etc) to minimize the patch. It should provide a superset of the functionality from D46563 - the extra tests show propagation and codegen diffs for fcmp, vecreduce, and FP libcalls. The PPC log2() test shows the limits of this most basic approach - we only applied 'afn' to the last node created for the call. AFAIK, there aren't any libcall optimizations based on the flags currently, so that shouldn't make any difference. Differential Revision: https://reviews.llvm.org/D46854 llvm-svn: 332358
This commit is contained in:
parent
3ab4e9e769
commit
a66bd4e046
@ -1069,6 +1069,22 @@ void SelectionDAGBuilder::visit(const Instruction &I) {
|
|||||||
|
|
||||||
visit(I.getOpcode(), I);
|
visit(I.getOpcode(), I);
|
||||||
|
|
||||||
|
if (auto *FPMO = dyn_cast<FPMathOperator>(&I)) {
|
||||||
|
// Propagate the fast-math-flags of this IR instruction to the DAG node that
|
||||||
|
// maps to this instruction.
|
||||||
|
// TODO: We could handle all flags (nsw, etc) here.
|
||||||
|
// TODO: If an IR instruction maps to >1 node, only the final node will have
|
||||||
|
// flags set.
|
||||||
|
if (SDNode *Node = getNodeForIRValue(&I)) {
|
||||||
|
SDNodeFlags IncomingFlags;
|
||||||
|
IncomingFlags.copyFMF(*FPMO);
|
||||||
|
if (!Node->getFlags().isDefined())
|
||||||
|
Node->setFlags(IncomingFlags);
|
||||||
|
else
|
||||||
|
Node->intersectFlagsWith(IncomingFlags);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (!isa<TerminatorInst>(&I) && !HasTailCall &&
|
if (!isa<TerminatorInst>(&I) && !HasTailCall &&
|
||||||
!isStatepoint(&I)) // statepoints handle their exports internally
|
!isStatepoint(&I)) // statepoints handle their exports internally
|
||||||
CopyToExportRegsIfNeeded(&I);
|
CopyToExportRegsIfNeeded(&I);
|
||||||
@ -2753,9 +2769,6 @@ void SelectionDAGBuilder::visitBinary(const User &I, unsigned Opcode) {
|
|||||||
Flags.setVectorReduction(true);
|
Flags.setVectorReduction(true);
|
||||||
LLVM_DEBUG(dbgs() << "Detected a reduction operation:" << I << "\n");
|
LLVM_DEBUG(dbgs() << "Detected a reduction operation:" << I << "\n");
|
||||||
}
|
}
|
||||||
if (auto *FPOp = dyn_cast<FPMathOperator>(&I)) {
|
|
||||||
Flags.copyFMF(*FPOp);
|
|
||||||
}
|
|
||||||
|
|
||||||
SDValue Op1 = getValue(I.getOperand(0));
|
SDValue Op1 = getValue(I.getOperand(0));
|
||||||
SDValue Op2 = getValue(I.getOperand(1));
|
SDValue Op2 = getValue(I.getOperand(1));
|
||||||
@ -2851,13 +2864,12 @@ void SelectionDAGBuilder::visitFCmp(const User &I) {
|
|||||||
predicate = FCmpInst::Predicate(FC->getPredicate());
|
predicate = FCmpInst::Predicate(FC->getPredicate());
|
||||||
SDValue Op1 = getValue(I.getOperand(0));
|
SDValue Op1 = getValue(I.getOperand(0));
|
||||||
SDValue Op2 = getValue(I.getOperand(1));
|
SDValue Op2 = getValue(I.getOperand(1));
|
||||||
ISD::CondCode Condition = getFCmpCondCode(predicate);
|
|
||||||
|
|
||||||
// FIXME: Fcmp instructions have fast-math-flags in IR, so we should use them.
|
ISD::CondCode Condition = getFCmpCondCode(predicate);
|
||||||
// FIXME: We should propagate the fast-math-flags to the DAG node itself for
|
auto *FPMO = dyn_cast<FPMathOperator>(&I);
|
||||||
// further optimization, but currently FMF is only applicable to binary nodes.
|
if ((FPMO && FPMO->hasNoNaNs()) || TM.Options.NoNaNsFPMath)
|
||||||
if (TM.Options.NoNaNsFPMath)
|
|
||||||
Condition = getFCmpCodeWithoutNaN(Condition);
|
Condition = getFCmpCodeWithoutNaN(Condition);
|
||||||
|
|
||||||
EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
|
EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
|
||||||
I.getType());
|
I.getType());
|
||||||
setValue(&I, DAG.getSetCC(getCurSDLoc(), DestVT, Op1, Op2, Condition));
|
setValue(&I, DAG.getSetCC(getCurSDLoc(), DestVT, Op1, Op2, Condition));
|
||||||
@ -8082,8 +8094,6 @@ void SelectionDAGBuilder::visitVectorReduce(const CallInst &I,
|
|||||||
FastMathFlags FMF;
|
FastMathFlags FMF;
|
||||||
if (isa<FPMathOperator>(I))
|
if (isa<FPMathOperator>(I))
|
||||||
FMF = I.getFastMathFlags();
|
FMF = I.getFastMathFlags();
|
||||||
SDNodeFlags SDFlags;
|
|
||||||
SDFlags.setNoNaNs(FMF.noNaNs());
|
|
||||||
|
|
||||||
switch (Intrinsic) {
|
switch (Intrinsic) {
|
||||||
case Intrinsic::experimental_vector_reduce_fadd:
|
case Intrinsic::experimental_vector_reduce_fadd:
|
||||||
@ -8126,10 +8136,10 @@ void SelectionDAGBuilder::visitVectorReduce(const CallInst &I,
|
|||||||
Res = DAG.getNode(ISD::VECREDUCE_UMIN, dl, VT, Op1);
|
Res = DAG.getNode(ISD::VECREDUCE_UMIN, dl, VT, Op1);
|
||||||
break;
|
break;
|
||||||
case Intrinsic::experimental_vector_reduce_fmax:
|
case Intrinsic::experimental_vector_reduce_fmax:
|
||||||
Res = DAG.getNode(ISD::VECREDUCE_FMAX, dl, VT, Op1, SDFlags);
|
Res = DAG.getNode(ISD::VECREDUCE_FMAX, dl, VT, Op1);
|
||||||
break;
|
break;
|
||||||
case Intrinsic::experimental_vector_reduce_fmin:
|
case Intrinsic::experimental_vector_reduce_fmin:
|
||||||
Res = DAG.getNode(ISD::VECREDUCE_FMIN, dl, VT, Op1, SDFlags);
|
Res = DAG.getNode(ISD::VECREDUCE_FMIN, dl, VT, Op1);
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
llvm_unreachable("Unhandled vector reduce intrinsic");
|
llvm_unreachable("Unhandled vector reduce intrinsic");
|
||||||
|
@ -687,6 +687,13 @@ public:
|
|||||||
SDValue getValue(const Value *V);
|
SDValue getValue(const Value *V);
|
||||||
bool findValue(const Value *V) const;
|
bool findValue(const Value *V) const;
|
||||||
|
|
||||||
|
/// Return the SDNode for the specified IR value if it exists.
|
||||||
|
SDNode *getNodeForIRValue(const Value *V) {
|
||||||
|
if (NodeMap.find(V) == NodeMap.end())
|
||||||
|
return nullptr;
|
||||||
|
return NodeMap[V].getNode();
|
||||||
|
}
|
||||||
|
|
||||||
SDValue getNonRegisterValue(const Value *V);
|
SDValue getNonRegisterValue(const Value *V);
|
||||||
SDValue getValueImpl(const Value *V);
|
SDValue getValueImpl(const Value *V);
|
||||||
|
|
||||||
|
@ -9,10 +9,9 @@
|
|||||||
; CHECK-NEXT: Analyzing result type: v4f64
|
; CHECK-NEXT: Analyzing result type: v4f64
|
||||||
; CHECK-NEXT: Split node result: [[VFOUR]]: v4f64 = BUILD_VECTOR
|
; CHECK-NEXT: Split node result: [[VFOUR]]: v4f64 = BUILD_VECTOR
|
||||||
|
|
||||||
; FIXME: We dropped the 'reassoc' flag.
|
|
||||||
; CHECK: Legalizing node: [[VTWO:t.*]]: v2f64 = BUILD_VECTOR
|
; CHECK: Legalizing node: [[VTWO:t.*]]: v2f64 = BUILD_VECTOR
|
||||||
; CHECK: Legally typed node: [[VTWO]]: v2f64 = BUILD_VECTOR
|
; CHECK: Legally typed node: [[VTWO]]: v2f64 = BUILD_VECTOR
|
||||||
; CHECK: Legalizing node: t26: v2f64 = fmaxnum nnan [[VTWO]], [[VTWO]]
|
; CHECK: Legalizing node: t26: v2f64 = fmaxnum nnan reassoc [[VTWO]], [[VTWO]]
|
||||||
|
|
||||||
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
|
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
|
||||||
target triple = "aarch64--linux-gnu"
|
target triple = "aarch64--linux-gnu"
|
||||||
|
@ -208,7 +208,7 @@ for.end:
|
|||||||
|
|
||||||
; CHECK-LABEL: VCMPBRCC:
|
; CHECK-LABEL: VCMPBRCC:
|
||||||
|
|
||||||
; CHECK-SOFT: bl __aeabi_fcmple
|
; CHECK-SOFT: bl __aeabi_fcmpgt
|
||||||
; CHECK-SOFT: cmp r0, #0
|
; CHECK-SOFT: cmp r0, #0
|
||||||
|
|
||||||
; CHECK-SOFTFP-FP16: vcvtb.f32.f16 [[S2:s[0-9]]], [[S2]]
|
; CHECK-SOFTFP-FP16: vcvtb.f32.f16 [[S2:s[0-9]]], [[S2]]
|
||||||
|
@ -156,7 +156,7 @@ define float @fmul_fadd_fast2(float %x, float %y, float %z) {
|
|||||||
; This is the minimum FMF needed for this transform - the FMA allows reassociation.
|
; This is the minimum FMF needed for this transform - the FMA allows reassociation.
|
||||||
|
|
||||||
; FMFDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'fmul_fma_reassoc1:'
|
; FMFDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'fmul_fma_reassoc1:'
|
||||||
; FMFDEBUG: fma {{t[0-9]+}}
|
; FMFDEBUG: fma reassoc {{t[0-9]+}}
|
||||||
; FMFDEBUG: Type-legalized selection DAG: %bb.0 'fmul_fma_reassoc1:'
|
; FMFDEBUG: Type-legalized selection DAG: %bb.0 'fmul_fma_reassoc1:'
|
||||||
|
|
||||||
; GLOBALDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'fmul_fma_reassoc1:'
|
; GLOBALDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'fmul_fma_reassoc1:'
|
||||||
@ -192,7 +192,7 @@ define float @fmul_fma_reassoc1(float %x) {
|
|||||||
; This shouldn't change anything - the intermediate fmul result is now also flagged.
|
; This shouldn't change anything - the intermediate fmul result is now also flagged.
|
||||||
|
|
||||||
; FMFDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'fmul_fma_reassoc2:'
|
; FMFDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'fmul_fma_reassoc2:'
|
||||||
; FMFDEBUG: fma {{t[0-9]+}}
|
; FMFDEBUG: fma reassoc {{t[0-9]+}}
|
||||||
; FMFDEBUG: Type-legalized selection DAG: %bb.0 'fmul_fma_reassoc2:'
|
; FMFDEBUG: Type-legalized selection DAG: %bb.0 'fmul_fma_reassoc2:'
|
||||||
|
|
||||||
; GLOBALDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'fmul_fma_reassoc2:'
|
; GLOBALDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'fmul_fma_reassoc2:'
|
||||||
@ -228,7 +228,7 @@ define float @fmul_fma_reassoc2(float %x) {
|
|||||||
; The FMA is now fully 'fast'. This implies that reassociation is allowed.
|
; The FMA is now fully 'fast'. This implies that reassociation is allowed.
|
||||||
|
|
||||||
; FMFDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'fmul_fma_fast1:'
|
; FMFDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'fmul_fma_fast1:'
|
||||||
; FMFDEBUG: fma {{t[0-9]+}}
|
; FMFDEBUG: fma nnan ninf nsz arcp contract afn reassoc {{t[0-9]+}}
|
||||||
; FMFDEBUG: Type-legalized selection DAG: %bb.0 'fmul_fma_fast1:'
|
; FMFDEBUG: Type-legalized selection DAG: %bb.0 'fmul_fma_fast1:'
|
||||||
|
|
||||||
; GLOBALDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'fmul_fma_fast1:'
|
; GLOBALDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'fmul_fma_fast1:'
|
||||||
@ -264,7 +264,7 @@ define float @fmul_fma_fast1(float %x) {
|
|||||||
; This shouldn't change anything - the intermediate fmul result is now also flagged.
|
; This shouldn't change anything - the intermediate fmul result is now also flagged.
|
||||||
|
|
||||||
; FMFDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'fmul_fma_fast2:'
|
; FMFDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'fmul_fma_fast2:'
|
||||||
; FMFDEBUG: fma {{t[0-9]+}}
|
; FMFDEBUG: fma nnan ninf nsz arcp contract afn reassoc {{t[0-9]+}}
|
||||||
; FMFDEBUG: Type-legalized selection DAG: %bb.0 'fmul_fma_fast2:'
|
; FMFDEBUG: Type-legalized selection DAG: %bb.0 'fmul_fma_fast2:'
|
||||||
|
|
||||||
; GLOBALDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'fmul_fma_fast2:'
|
; GLOBALDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'fmul_fma_fast2:'
|
||||||
@ -300,7 +300,7 @@ define float @fmul_fma_fast2(float %x) {
|
|||||||
; Reduced precision for sqrt is allowed - should use estimate and NR iterations.
|
; Reduced precision for sqrt is allowed - should use estimate and NR iterations.
|
||||||
|
|
||||||
; FMFDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'sqrt_afn:'
|
; FMFDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'sqrt_afn:'
|
||||||
; FMFDEBUG: fsqrt {{t[0-9]+}}
|
; FMFDEBUG: fsqrt afn {{t[0-9]+}}
|
||||||
; FMFDEBUG: Type-legalized selection DAG: %bb.0 'sqrt_afn:'
|
; FMFDEBUG: Type-legalized selection DAG: %bb.0 'sqrt_afn:'
|
||||||
|
|
||||||
; GLOBALDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'sqrt_afn:'
|
; GLOBALDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'sqrt_afn:'
|
||||||
@ -340,7 +340,7 @@ define float @sqrt_afn(float %x) {
|
|||||||
; The call is now fully 'fast'. This implies that approximation is allowed.
|
; The call is now fully 'fast'. This implies that approximation is allowed.
|
||||||
|
|
||||||
; FMFDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'sqrt_fast:'
|
; FMFDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'sqrt_fast:'
|
||||||
; FMFDEBUG: fsqrt {{t[0-9]+}}
|
; FMFDEBUG: fsqrt nnan ninf nsz arcp contract afn reassoc {{t[0-9]+}}
|
||||||
; FMFDEBUG: Type-legalized selection DAG: %bb.0 'sqrt_fast:'
|
; FMFDEBUG: Type-legalized selection DAG: %bb.0 'sqrt_fast:'
|
||||||
|
|
||||||
; GLOBALDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'sqrt_fast:'
|
; GLOBALDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'sqrt_fast:'
|
||||||
@ -391,10 +391,8 @@ define double @fcmp_nnan(double %a, double %y, double %z) {
|
|||||||
; FMF-LABEL: fcmp_nnan:
|
; FMF-LABEL: fcmp_nnan:
|
||||||
; FMF: # %bb.0:
|
; FMF: # %bb.0:
|
||||||
; FMF-NEXT: xxlxor 0, 0, 0
|
; FMF-NEXT: xxlxor 0, 0, 0
|
||||||
; FMF-NEXT: fcmpu 0, 1, 1
|
; FMF-NEXT: xscmpudp 0, 1, 0
|
||||||
; FMF-NEXT: fcmpu 1, 1, 0
|
; FMF-NEXT: blt 0, .LBB12_2
|
||||||
; FMF-NEXT: cror 20, 4, 3
|
|
||||||
; FMF-NEXT: bc 12, 20, .LBB12_2
|
|
||||||
; FMF-NEXT: # %bb.1:
|
; FMF-NEXT: # %bb.1:
|
||||||
; FMF-NEXT: fmr 3, 2
|
; FMF-NEXT: fmr 3, 2
|
||||||
; FMF-NEXT: .LBB12_2:
|
; FMF-NEXT: .LBB12_2:
|
||||||
@ -421,13 +419,13 @@ define double @fcmp_nnan(double %a, double %y, double %z) {
|
|||||||
; FMFDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'log2_approx:'
|
; FMFDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'log2_approx:'
|
||||||
; FMFDEBUG: ch,glue = PPCISD::CALL_NOP t11, TargetGlobalAddress:i64<double (double)* @log2>
|
; FMFDEBUG: ch,glue = PPCISD::CALL_NOP t11, TargetGlobalAddress:i64<double (double)* @log2>
|
||||||
; FMFDEBUG: ch,glue = callseq_end t15, TargetConstant:i64<32>, TargetConstant:i64<0>, t15:1
|
; FMFDEBUG: ch,glue = callseq_end t15, TargetConstant:i64<32>, TargetConstant:i64<0>, t15:1
|
||||||
; FMFDEBUG: f64,ch,glue = CopyFromReg t16, Register:f64 $f1, t16:1
|
; FMFDEBUG: f64,ch,glue = CopyFromReg afn t16, Register:f64 $f1, t16:1
|
||||||
; FMFDEBUG: Type-legalized selection DAG: %bb.0 'log2_approx:'
|
; FMFDEBUG: Type-legalized selection DAG: %bb.0 'log2_approx:'
|
||||||
|
|
||||||
; GLOBALDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'log2_approx:'
|
; GLOBALDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'log2_approx:'
|
||||||
; GLOBALDEBUG: ch,glue = PPCISD::CALL_NOP t11, TargetGlobalAddress:i64<double (double)* @log2>
|
; GLOBALDEBUG: ch,glue = PPCISD::CALL_NOP t11, TargetGlobalAddress:i64<double (double)* @log2>
|
||||||
; GLOBALDEBUG: ch,glue = callseq_end t15, TargetConstant:i64<32>, TargetConstant:i64<0>, t15:1
|
; GLOBALDEBUG: ch,glue = callseq_end t15, TargetConstant:i64<32>, TargetConstant:i64<0>, t15:1
|
||||||
; GLOBALDEBUG: f64,ch,glue = CopyFromReg t16, Register:f64 $f1, t16:1
|
; GLOBALDEBUG: f64,ch,glue = CopyFromReg afn t16, Register:f64 $f1, t16:1
|
||||||
; GLOBALDEBUG: Type-legalized selection DAG: %bb.0 'log2_approx:'
|
; GLOBALDEBUG: Type-legalized selection DAG: %bb.0 'log2_approx:'
|
||||||
|
|
||||||
declare double @log2(double)
|
declare double @log2(double)
|
||||||
|
@ -285,49 +285,33 @@ define <8 x double> @test_intrinsic_fmax_v8f64(<8 x double> %x, <8 x double> %y)
|
|||||||
ret <8 x double> %z
|
ret <8 x double> %z
|
||||||
}
|
}
|
||||||
|
|
||||||
; FIXME: The IR-level FMF should propagate to the node. With nnan, there's no need to blend.
|
; The IR-level FMF propagate to the node. With nnan, there's no need to blend.
|
||||||
|
|
||||||
define double @maxnum_intrinsic_nnan_fmf_f64(double %a, double %b) {
|
define double @maxnum_intrinsic_nnan_fmf_f64(double %a, double %b) {
|
||||||
; SSE-LABEL: maxnum_intrinsic_nnan_fmf_f64:
|
; SSE-LABEL: maxnum_intrinsic_nnan_fmf_f64:
|
||||||
; SSE: # %bb.0:
|
; SSE: # %bb.0:
|
||||||
; SSE-NEXT: movapd %xmm0, %xmm2
|
; SSE-NEXT: maxsd %xmm1, %xmm0
|
||||||
; SSE-NEXT: cmpunordsd %xmm0, %xmm2
|
|
||||||
; SSE-NEXT: movapd %xmm2, %xmm3
|
|
||||||
; SSE-NEXT: andpd %xmm1, %xmm3
|
|
||||||
; SSE-NEXT: maxsd %xmm0, %xmm1
|
|
||||||
; SSE-NEXT: andnpd %xmm1, %xmm2
|
|
||||||
; SSE-NEXT: orpd %xmm3, %xmm2
|
|
||||||
; SSE-NEXT: movapd %xmm2, %xmm0
|
|
||||||
; SSE-NEXT: retq
|
; SSE-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX-LABEL: maxnum_intrinsic_nnan_fmf_f64:
|
; AVX-LABEL: maxnum_intrinsic_nnan_fmf_f64:
|
||||||
; AVX: # %bb.0:
|
; AVX: # %bb.0:
|
||||||
; AVX-NEXT: vmaxsd %xmm0, %xmm1, %xmm2
|
; AVX-NEXT: vmaxsd %xmm1, %xmm0, %xmm0
|
||||||
; AVX-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm0
|
|
||||||
; AVX-NEXT: vblendvpd %xmm0, %xmm1, %xmm2, %xmm0
|
|
||||||
; AVX-NEXT: retq
|
; AVX-NEXT: retq
|
||||||
%r = tail call nnan double @llvm.maxnum.f64(double %a, double %b)
|
%r = tail call nnan double @llvm.maxnum.f64(double %a, double %b)
|
||||||
ret double %r
|
ret double %r
|
||||||
}
|
}
|
||||||
|
|
||||||
; FIXME: Make sure vectors work too.
|
; Make sure vectors work too.
|
||||||
|
|
||||||
define <4 x float> @maxnum_intrinsic_nnan_fmf_f432(<4 x float> %a, <4 x float> %b) {
|
define <4 x float> @maxnum_intrinsic_nnan_fmf_f432(<4 x float> %a, <4 x float> %b) {
|
||||||
; SSE-LABEL: maxnum_intrinsic_nnan_fmf_f432:
|
; SSE-LABEL: maxnum_intrinsic_nnan_fmf_f432:
|
||||||
; SSE: # %bb.0:
|
; SSE: # %bb.0:
|
||||||
; SSE-NEXT: movaps %xmm1, %xmm2
|
; SSE-NEXT: maxps %xmm1, %xmm0
|
||||||
; SSE-NEXT: maxps %xmm0, %xmm2
|
|
||||||
; SSE-NEXT: cmpunordps %xmm0, %xmm0
|
|
||||||
; SSE-NEXT: andps %xmm0, %xmm1
|
|
||||||
; SSE-NEXT: andnps %xmm2, %xmm0
|
|
||||||
; SSE-NEXT: orps %xmm1, %xmm0
|
|
||||||
; SSE-NEXT: retq
|
; SSE-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX-LABEL: maxnum_intrinsic_nnan_fmf_f432:
|
; AVX-LABEL: maxnum_intrinsic_nnan_fmf_f432:
|
||||||
; AVX: # %bb.0:
|
; AVX: # %bb.0:
|
||||||
; AVX-NEXT: vmaxps %xmm0, %xmm1, %xmm2
|
; AVX-NEXT: vmaxps %xmm1, %xmm0, %xmm0
|
||||||
; AVX-NEXT: vcmpunordps %xmm0, %xmm0, %xmm0
|
|
||||||
; AVX-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0
|
|
||||||
; AVX-NEXT: retq
|
; AVX-NEXT: retq
|
||||||
%r = tail call nnan <4 x float> @llvm.maxnum.v4f32(<4 x float> %a, <4 x float> %b)
|
%r = tail call nnan <4 x float> @llvm.maxnum.v4f32(<4 x float> %a, <4 x float> %b)
|
||||||
ret <4 x float> %r
|
ret <4 x float> %r
|
||||||
|
@ -277,49 +277,33 @@ define <8 x double> @test_intrinsic_fmin_v8f64(<8 x double> %x, <8 x double> %y)
|
|||||||
ret <8 x double> %z
|
ret <8 x double> %z
|
||||||
}
|
}
|
||||||
|
|
||||||
; FIXME: The IR-level FMF should propagate to the node. With nnan, there's no need to blend.
|
; The IR-level FMF propagate to the node. With nnan, there's no need to blend.
|
||||||
|
|
||||||
define float @minnum_intrinsic_nnan_fmf_f32(float %a, float %b) {
|
define float @minnum_intrinsic_nnan_fmf_f32(float %a, float %b) {
|
||||||
; SSE-LABEL: minnum_intrinsic_nnan_fmf_f32:
|
; SSE-LABEL: minnum_intrinsic_nnan_fmf_f32:
|
||||||
; SSE: # %bb.0:
|
; SSE: # %bb.0:
|
||||||
; SSE-NEXT: movaps %xmm0, %xmm2
|
; SSE-NEXT: minss %xmm1, %xmm0
|
||||||
; SSE-NEXT: cmpunordss %xmm0, %xmm2
|
|
||||||
; SSE-NEXT: movaps %xmm2, %xmm3
|
|
||||||
; SSE-NEXT: andps %xmm1, %xmm3
|
|
||||||
; SSE-NEXT: minss %xmm0, %xmm1
|
|
||||||
; SSE-NEXT: andnps %xmm1, %xmm2
|
|
||||||
; SSE-NEXT: orps %xmm3, %xmm2
|
|
||||||
; SSE-NEXT: movaps %xmm2, %xmm0
|
|
||||||
; SSE-NEXT: retq
|
; SSE-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX-LABEL: minnum_intrinsic_nnan_fmf_f32:
|
; AVX-LABEL: minnum_intrinsic_nnan_fmf_f32:
|
||||||
; AVX: # %bb.0:
|
; AVX: # %bb.0:
|
||||||
; AVX-NEXT: vminss %xmm0, %xmm1, %xmm2
|
; AVX-NEXT: vminss %xmm1, %xmm0, %xmm0
|
||||||
; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm0
|
|
||||||
; AVX-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0
|
|
||||||
; AVX-NEXT: retq
|
; AVX-NEXT: retq
|
||||||
%r = tail call nnan float @llvm.minnum.f32(float %a, float %b)
|
%r = tail call nnan float @llvm.minnum.f32(float %a, float %b)
|
||||||
ret float %r
|
ret float %r
|
||||||
}
|
}
|
||||||
|
|
||||||
; FIXME: Make sure vectors work too.
|
; Make sure vectors work too.
|
||||||
|
|
||||||
define <2 x double> @minnum_intrinsic_nnan_fmf_v2f64(<2 x double> %a, <2 x double> %b) {
|
define <2 x double> @minnum_intrinsic_nnan_fmf_v2f64(<2 x double> %a, <2 x double> %b) {
|
||||||
; SSE-LABEL: minnum_intrinsic_nnan_fmf_v2f64:
|
; SSE-LABEL: minnum_intrinsic_nnan_fmf_v2f64:
|
||||||
; SSE: # %bb.0:
|
; SSE: # %bb.0:
|
||||||
; SSE-NEXT: movapd %xmm1, %xmm2
|
; SSE-NEXT: minpd %xmm1, %xmm0
|
||||||
; SSE-NEXT: minpd %xmm0, %xmm2
|
|
||||||
; SSE-NEXT: cmpunordpd %xmm0, %xmm0
|
|
||||||
; SSE-NEXT: andpd %xmm0, %xmm1
|
|
||||||
; SSE-NEXT: andnpd %xmm2, %xmm0
|
|
||||||
; SSE-NEXT: orpd %xmm1, %xmm0
|
|
||||||
; SSE-NEXT: retq
|
; SSE-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX-LABEL: minnum_intrinsic_nnan_fmf_v2f64:
|
; AVX-LABEL: minnum_intrinsic_nnan_fmf_v2f64:
|
||||||
; AVX: # %bb.0:
|
; AVX: # %bb.0:
|
||||||
; AVX-NEXT: vminpd %xmm0, %xmm1, %xmm2
|
; AVX-NEXT: vminpd %xmm1, %xmm0, %xmm0
|
||||||
; AVX-NEXT: vcmpunordpd %xmm0, %xmm0, %xmm0
|
|
||||||
; AVX-NEXT: vblendvpd %xmm0, %xmm1, %xmm2, %xmm0
|
|
||||||
; AVX-NEXT: retq
|
; AVX-NEXT: retq
|
||||||
%r = tail call nnan <2 x double> @llvm.minnum.v2f64(<2 x double> %a, <2 x double> %b)
|
%r = tail call nnan <2 x double> @llvm.minnum.v2f64(<2 x double> %a, <2 x double> %b)
|
||||||
ret <2 x double> %r
|
ret <2 x double> %r
|
||||||
|
@ -8,9 +8,7 @@ declare <4 x double> @llvm.maxnum.v4f64(<4 x double> %x, <4 x double> %y)
|
|||||||
define <4 x double> @via_minnum(<4 x double> %x, <4 x double> %y) {
|
define <4 x double> @via_minnum(<4 x double> %x, <4 x double> %y) {
|
||||||
; CHECK-LABEL: via_minnum:
|
; CHECK-LABEL: via_minnum:
|
||||||
; CHECK: # %bb.0:
|
; CHECK: # %bb.0:
|
||||||
; CHECK-NEXT: vminpd %ymm0, %ymm1, %ymm2
|
; CHECK-NEXT: vminpd %ymm1, %ymm0, %ymm0
|
||||||
; CHECK-NEXT: vcmpunordpd %ymm0, %ymm0, %ymm0
|
|
||||||
; CHECK-NEXT: vblendvpd %ymm0, %ymm1, %ymm2, %ymm0
|
|
||||||
; CHECK-NEXT: retq
|
; CHECK-NEXT: retq
|
||||||
%z = call fast <4 x double> @llvm.minnum.v4f64(<4 x double> %x, <4 x double> %y) readnone
|
%z = call fast <4 x double> @llvm.minnum.v4f64(<4 x double> %x, <4 x double> %y) readnone
|
||||||
ret <4 x double> %z
|
ret <4 x double> %z
|
||||||
@ -19,9 +17,7 @@ define <4 x double> @via_minnum(<4 x double> %x, <4 x double> %y) {
|
|||||||
define <4 x double> @via_maxnum(<4 x double> %x, <4 x double> %y) {
|
define <4 x double> @via_maxnum(<4 x double> %x, <4 x double> %y) {
|
||||||
; CHECK-LABEL: via_maxnum:
|
; CHECK-LABEL: via_maxnum:
|
||||||
; CHECK: # %bb.0:
|
; CHECK: # %bb.0:
|
||||||
; CHECK-NEXT: vmaxpd %ymm0, %ymm1, %ymm2
|
; CHECK-NEXT: vmaxpd %ymm1, %ymm0, %ymm0
|
||||||
; CHECK-NEXT: vcmpunordpd %ymm0, %ymm0, %ymm0
|
|
||||||
; CHECK-NEXT: vblendvpd %ymm0, %ymm1, %ymm2, %ymm0
|
|
||||||
; CHECK-NEXT: retq
|
; CHECK-NEXT: retq
|
||||||
%z = call fast <4 x double> @llvm.maxnum.v4f64(<4 x double> %x, <4 x double> %y) readnone
|
%z = call fast <4 x double> @llvm.maxnum.v4f64(<4 x double> %x, <4 x double> %y) readnone
|
||||||
ret <4 x double> %z
|
ret <4 x double> %z
|
||||||
|
Loading…
x
Reference in New Issue
Block a user