[Legalize][X86] Improve nnan fmin/fmax vector reduction

Use +/-Inf or +/-Largest as the neutral element for nnan fmin/fmax
reductions. This avoids dropping any FMF flags. Preserving the
nnan flag in particular is important to get a good lowering on X86.

Differential Revision: https://reviews.llvm.org/D87586
Nikita Popov 2020-09-12 23:10:15 +02:00
parent 108cc54563
commit a2179f553d
5 changed files with 50 additions and 148 deletions
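
Note on the neutral element: lanes added by widening must not change the
reduction result. Without nnan that forces NaN (maxnum/minnum treat NaN as
missing data); with nnan, +/-Inf works; with nnan and ninf, +/-Largest does.
A minimal standalone C++ sketch of that selection (the helper is illustrative
and not part of the patch, which builds an APFloat in the element type's
semantics instead of hard-coding float):

#include <limits>

// Illustrative only: mirrors the FMF-based neutral-element choice for a
// float fmin/fmax reduction.
static float reductionNeutral(bool NoNaNs, bool NoInfs, bool IsMax) {
  float Neutral = !NoNaNs ? std::numeric_limits<float>::quiet_NaN()
                : !NoInfs ? std::numeric_limits<float>::infinity()
                          : std::numeric_limits<float>::max();
  // For fmax the neutral element is the smallest value, so flip the sign.
  return IsMax ? -Neutral : Neutral;
}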

View File

@@ -4794,20 +4794,18 @@ SDValue DAGTypeLegalizer::WidenVecOp_VECREDUCE(SDNode *N) {
   case ISD::VECREDUCE_FMUL:
     NeutralElem = DAG.getConstantFP(1.0, dl, ElemVT);
     break;
-  case ISD::VECREDUCE_FMAX:
-    // This has maxnum semantics, so NaN represents missing data. We must clear
-    // 'nnan' if it was set because the NaN would be a poison value.
-    NeutralElem = DAG.getConstantFP(
-        std::numeric_limits<double>::quiet_NaN(), dl, ElemVT);
-    Flags.setNoNaNs(false);
-    break;
-  case ISD::VECREDUCE_FMIN:
-    // This has minnum semantics, so NaN represents missing data. We must clear
-    // 'nnan' if it was set because the NaN would be a poison value.
-    NeutralElem = DAG.getConstantFP(
-        std::numeric_limits<double>::quiet_NaN(), dl, ElemVT);
-    Flags.setNoNaNs(false);
-    break;
+  case ISD::VECREDUCE_FMAX:
+  case ISD::VECREDUCE_FMIN: {
+    // Neutral element for fminnum is NaN, Inf or FLT_MAX, depending on FMF.
+    const fltSemantics &Semantics = DAG.EVTToAPFloatSemantics(ElemVT);
+    APFloat NeutralAF = !Flags.hasNoNaNs() ? APFloat::getQNaN(Semantics) :
+                        !Flags.hasNoInfs() ? APFloat::getInf(Semantics) :
+                        APFloat::getLargest(Semantics);
+    if (N->getOpcode() == ISD::VECREDUCE_FMAX)
+      NeutralAF.changeSign();
+    NeutralElem = DAG.getConstantFP(NeutralAF, dl, ElemVT);
+    break;
+  }
   }
 
   // Pad the vector with the neutral element.
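
Illustration of why the new padding is safe (a standalone sketch, not from the
patch): a <3 x float> nnan fmax operand widened to <4 x float> gets -Inf in
the pad lane, which can never win the max, so the result is unchanged and the
flags can be kept:

#include <algorithm>
#include <cassert>
#include <limits>

int main() {
  // <3 x float> operand widened to <4 x float>; lane 3 is the pad.
  const float NegInf = -std::numeric_limits<float>::infinity();
  float Widened[4] = {1.0f, 5.0f, 2.0f, NegInf};
  // With nnan, a plain max matches maxnum semantics lane by lane.
  float Result = Widened[0];
  for (int I = 1; I < 4; ++I)
    Result = std::max(Result, Widened[I]);
  assert(Result == 5.0f && "pad lane does not affect the reduction");
  return 0;
}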

View File

@@ -47,7 +47,7 @@ define fp128 @test_v1f128(<1 x fp128> %a) nounwind {
 define float @test_v3f32(<3 x float> %a) nounwind {
 ; CHECK-LABEL: test_v3f32:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #2143289344
+; CHECK-NEXT: mov w8, #-8388608
 ; CHECK-NEXT: fmov s1, w8
 ; CHECK-NEXT: mov v0.s[3], v1.s[0]
 ; CHECK-NEXT: fmaxnmv s0, v0.4s
@@ -59,7 +59,7 @@ define float @test_v3f32(<3 x float> %a) nounwind {
 define float @test_v3f32_ninf(<3 x float> %a) nounwind {
 ; CHECK-LABEL: test_v3f32_ninf:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #2143289344
+; CHECK-NEXT: mov w8, #-8388609
 ; CHECK-NEXT: fmov s1, w8
 ; CHECK-NEXT: mov v0.s[3], v1.s[0]
 ; CHECK-NEXT: fmaxnmv s0, v0.4s

View File

@@ -47,7 +47,7 @@ define fp128 @test_v1f128(<1 x fp128> %a) nounwind {
 define float @test_v3f32(<3 x float> %a) nounwind {
 ; CHECK-LABEL: test_v3f32:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #2143289344
+; CHECK-NEXT: mov w8, #2139095040
 ; CHECK-NEXT: fmov s1, w8
 ; CHECK-NEXT: mov v0.s[3], v1.s[0]
 ; CHECK-NEXT: fminnmv s0, v0.4s
@@ -59,7 +59,7 @@ define float @test_v3f32(<3 x float> %a) nounwind {
 define float @test_v3f32_ninf(<3 x float> %a) nounwind {
 ; CHECK-LABEL: test_v3f32_ninf:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #2143289344
+; CHECK-NEXT: mov w8, #2139095039
 ; CHECK-NEXT: fmov s1, w8
 ; CHECK-NEXT: mov v0.s[3], v1.s[0]
 ; CHECK-NEXT: fminnmv s0, v0.4s
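
The new immediates in both AArch64 files are exactly the bit patterns of the
chosen neutral elements: 0xFF800000 (-Inf) and 0xFF7FFFFF (-FLT_MAX) for fmax,
0x7F800000 (+Inf) and 0x7F7FFFFF (+FLT_MAX) for fmin. A quick standalone check
(assumes C++20 for std::bit_cast):

#include <bit>
#include <cstdint>
#include <cstdio>

int main() {
  // fmax nnan pads with -Inf, fmax nnan ninf with -FLT_MAX;
  // fmin nnan pads with +Inf, fmin nnan ninf with +FLT_MAX.
  const std::int32_t Imms[] = {-8388608, -8388609, 2139095040, 2139095039};
  for (std::int32_t Imm : Imms)
    std::printf("%11d = 0x%08X = %g\n", Imm,
                static_cast<unsigned>(Imm), std::bit_cast<float>(Imm));
  // -8388608   = 0xFF800000 = -inf    -8388609   = 0xFF7FFFFF = -3.40282e+38
  // 2139095040 = 0x7F800000 = inf     2139095039 = 0x7F7FFFFF = 3.40282e+38
}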

View File

@@ -13,46 +13,27 @@
 define float @test_v2f32(<2 x float> %a0) {
 ; SSE2-LABEL: test_v2f32:
 ; SSE2: # %bb.0:
-; SSE2-NEXT: movaps %xmm0, %xmm2
-; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[1,1]
 ; SSE2-NEXT: movaps %xmm0, %xmm1
-; SSE2-NEXT: cmpunordss %xmm0, %xmm1
-; SSE2-NEXT: movaps %xmm1, %xmm3
-; SSE2-NEXT: andps %xmm2, %xmm3
-; SSE2-NEXT: maxss %xmm0, %xmm2
-; SSE2-NEXT: andnps %xmm2, %xmm1
-; SSE2-NEXT: orps %xmm3, %xmm1
-; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
+; SSE2-NEXT: maxss %xmm1, %xmm0
 ; SSE2-NEXT: retq
 ;
 ; SSE41-LABEL: test_v2f32:
 ; SSE41: # %bb.0:
-; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSE41-NEXT: movaps %xmm0, %xmm1
-; SSE41-NEXT: cmpunordss %xmm0, %xmm1
-; SSE41-NEXT: movaps %xmm1, %xmm3
-; SSE41-NEXT: andps %xmm2, %xmm3
-; SSE41-NEXT: maxss %xmm0, %xmm2
-; SSE41-NEXT: andnps %xmm2, %xmm1
-; SSE41-NEXT: orps %xmm3, %xmm1
-; SSE41-NEXT: movaps %xmm1, %xmm0
+; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSE41-NEXT: maxss %xmm1, %xmm0
 ; SSE41-NEXT: retq
 ;
 ; AVX-LABEL: test_v2f32:
 ; AVX: # %bb.0:
 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX-NEXT: vmaxss %xmm0, %xmm1, %xmm2
-; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0
+; AVX-NEXT: vmaxss %xmm1, %xmm0, %xmm0
 ; AVX-NEXT: retq
 ;
 ; AVX512-LABEL: test_v2f32:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; AVX512-NEXT: vmaxss %xmm0, %xmm2, %xmm1
-; AVX512-NEXT: vcmpunordss %xmm0, %xmm0, %k1
-; AVX512-NEXT: vmovss %xmm2, %xmm1, %xmm1 {%k1}
-; AVX512-NEXT: vmovaps %xmm1, %xmm0
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX512-NEXT: vmaxss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: retq
 %1 = call nnan float @llvm.experimental.vector.reduce.fmax.v2f32(<2 x float> %a0)
 ret float %1
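
Why preserving nnan pays off on X86: MAXSS/MINSS return the second source
operand whenever a comparison is unordered, while the reduction intrinsic has
maxnum/minnum semantics that ignore NaN. The removed cmpunordss/blendvps
sequences existed only to bridge that gap; under nnan the two semantics
coincide, so they vanish. A standalone sketch of the two behaviors
(illustrative helpers, not LLVM code):

#include <cmath>

// x86 MAXSS semantics: if the comparison is unordered (a NaN is present),
// the second source operand wins.
static float maxss(float Dst, float Src) {
  return Dst > Src ? Dst : Src;
}

// maxnum semantics of the reduction intrinsic: NaN is missing data.
static float maxnum(float A, float B) {
  if (std::isnan(A)) return B;
  if (std::isnan(B)) return A;
  return A > B ? A : B;
}
// With nnan inputs both functions agree, so maxss alone suffices.
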
@@ -302,65 +283,37 @@ define double @test_v3f64(<3 x double> %a0) {
 ; SSE2: # %bb.0:
 ; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; SSE2-NEXT: shufpd {{.*#+}} xmm2 = xmm2[0],mem[1]
-; SSE2-NEXT: movapd %xmm2, %xmm1
-; SSE2-NEXT: maxpd %xmm0, %xmm1
-; SSE2-NEXT: cmpunordpd %xmm0, %xmm0
-; SSE2-NEXT: andpd %xmm0, %xmm2
-; SSE2-NEXT: andnpd %xmm1, %xmm0
-; SSE2-NEXT: orpd %xmm2, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; SSE2-NEXT: maxpd %xmm2, %xmm0
 ; SSE2-NEXT: movapd %xmm0, %xmm1
-; SSE2-NEXT: cmpunordsd %xmm0, %xmm1
-; SSE2-NEXT: movapd %xmm1, %xmm3
-; SSE2-NEXT: andpd %xmm2, %xmm3
-; SSE2-NEXT: maxsd %xmm0, %xmm2
-; SSE2-NEXT: andnpd %xmm2, %xmm1
-; SSE2-NEXT: orpd %xmm3, %xmm1
-; SSE2-NEXT: movapd %xmm1, %xmm0
+; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE2-NEXT: maxsd %xmm1, %xmm0
 ; SSE2-NEXT: retq
 ;
 ; SSE41-LABEL: test_v3f64:
 ; SSE41: # %bb.0:
 ; SSE41-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; SSE41-NEXT: blendpd {{.*#+}} xmm2 = xmm2[0],mem[1]
-; SSE41-NEXT: movapd %xmm2, %xmm1
-; SSE41-NEXT: maxpd %xmm0, %xmm1
-; SSE41-NEXT: cmpunordpd %xmm0, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: movapd %xmm1, %xmm2
-; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
-; SSE41-NEXT: movapd %xmm1, %xmm0
-; SSE41-NEXT: cmpunordsd %xmm1, %xmm0
-; SSE41-NEXT: movapd %xmm0, %xmm3
-; SSE41-NEXT: andpd %xmm2, %xmm3
-; SSE41-NEXT: maxsd %xmm1, %xmm2
-; SSE41-NEXT: andnpd %xmm2, %xmm0
-; SSE41-NEXT: orpd %xmm3, %xmm0
+; SSE41-NEXT: maxpd %xmm2, %xmm0
+; SSE41-NEXT: movapd %xmm0, %xmm1
+; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE41-NEXT: maxsd %xmm1, %xmm0
 ; SSE41-NEXT: retq
 ;
 ; AVX-LABEL: test_v3f64:
 ; AVX: # %bb.0:
 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-NEXT: vmaxsd %xmm0, %xmm1, %xmm2
-; AVX-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm3
-; AVX-NEXT: vblendvpd %xmm3, %xmm1, %xmm2, %xmm1
-; AVX-NEXT: vcmpunordsd %xmm1, %xmm1, %xmm2
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
 ; AVX-NEXT: vmaxsd %xmm1, %xmm0, %xmm1
-; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX-NEXT: vmaxsd %xmm0, %xmm1, %xmm0
 ; AVX-NEXT: vzeroupper
 ; AVX-NEXT: retq
 ;
 ; AVX512-LABEL: test_v3f64:
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512-NEXT: vmaxsd %xmm0, %xmm1, %xmm2
-; AVX512-NEXT: vcmpunordsd %xmm0, %xmm0, %k1
-; AVX512-NEXT: vmovsd %xmm1, %xmm2, %xmm2 {%k1}
-; AVX512-NEXT: vcmpunordsd %xmm2, %xmm2, %k1
-; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vmaxsd %xmm2, %xmm1, %xmm0
-; AVX512-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1}
+; AVX512-NEXT: vmaxsd %xmm1, %xmm0, %xmm1
+; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512-NEXT: vmaxsd %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
 %1 = call nnan double @llvm.experimental.vector.reduce.fmax.v3f64(<3 x double> %a0)

View File

@@ -21,46 +21,27 @@ define float @test_v1f32(<1 x float> %a0) {
 define float @test_v2f32(<2 x float> %a0) {
 ; SSE2-LABEL: test_v2f32:
 ; SSE2: # %bb.0:
-; SSE2-NEXT: movaps %xmm0, %xmm2
-; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[1,1]
 ; SSE2-NEXT: movaps %xmm0, %xmm1
-; SSE2-NEXT: cmpunordss %xmm0, %xmm1
-; SSE2-NEXT: movaps %xmm1, %xmm3
-; SSE2-NEXT: andps %xmm2, %xmm3
-; SSE2-NEXT: minss %xmm0, %xmm2
-; SSE2-NEXT: andnps %xmm2, %xmm1
-; SSE2-NEXT: orps %xmm3, %xmm1
-; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
+; SSE2-NEXT: minss %xmm1, %xmm0
 ; SSE2-NEXT: retq
 ;
 ; SSE41-LABEL: test_v2f32:
 ; SSE41: # %bb.0:
-; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSE41-NEXT: movaps %xmm0, %xmm1
-; SSE41-NEXT: cmpunordss %xmm0, %xmm1
-; SSE41-NEXT: movaps %xmm1, %xmm3
-; SSE41-NEXT: andps %xmm2, %xmm3
-; SSE41-NEXT: minss %xmm0, %xmm2
-; SSE41-NEXT: andnps %xmm2, %xmm1
-; SSE41-NEXT: orps %xmm3, %xmm1
-; SSE41-NEXT: movaps %xmm1, %xmm0
+; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSE41-NEXT: minss %xmm1, %xmm0
 ; SSE41-NEXT: retq
 ;
 ; AVX-LABEL: test_v2f32:
 ; AVX: # %bb.0:
 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX-NEXT: vminss %xmm0, %xmm1, %xmm2
-; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0
+; AVX-NEXT: vminss %xmm1, %xmm0, %xmm0
 ; AVX-NEXT: retq
 ;
 ; AVX512-LABEL: test_v2f32:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; AVX512-NEXT: vminss %xmm0, %xmm2, %xmm1
-; AVX512-NEXT: vcmpunordss %xmm0, %xmm0, %k1
-; AVX512-NEXT: vmovss %xmm2, %xmm1, %xmm1 {%k1}
-; AVX512-NEXT: vmovaps %xmm1, %xmm0
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX512-NEXT: vminss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: retq
 %1 = call nnan float @llvm.experimental.vector.reduce.fmin.v2f32(<2 x float> %a0)
 ret float %1
@@ -72,20 +53,9 @@ define float @test_v3f32(<3 x float> %a0) {
 ; SSE2-NEXT: movaps %xmm0, %xmm2
 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[1,1]
 ; SSE2-NEXT: movaps %xmm0, %xmm1
-; SSE2-NEXT: cmpunordss %xmm0, %xmm1
-; SSE2-NEXT: movaps %xmm1, %xmm3
-; SSE2-NEXT: andps %xmm2, %xmm3
-; SSE2-NEXT: minss %xmm0, %xmm2
-; SSE2-NEXT: andnps %xmm2, %xmm1
-; SSE2-NEXT: orps %xmm3, %xmm1
+; SSE2-NEXT: minss %xmm2, %xmm1
 ; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
-; SSE2-NEXT: movaps %xmm0, %xmm2
-; SSE2-NEXT: minss %xmm1, %xmm2
-; SSE2-NEXT: cmpunordss %xmm1, %xmm1
-; SSE2-NEXT: movaps %xmm1, %xmm3
-; SSE2-NEXT: andnps %xmm2, %xmm3
-; SSE2-NEXT: andps %xmm0, %xmm1
-; SSE2-NEXT: orps %xmm3, %xmm1
+; SSE2-NEXT: minss %xmm0, %xmm1
 ; SSE2-NEXT: movaps %xmm1, %xmm0
 ; SSE2-NEXT: retq
 ;
@@ -93,45 +63,26 @@ define float @test_v3f32(<3 x float> %a0) {
 ; SSE41: # %bb.0:
 ; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
 ; SSE41-NEXT: movaps %xmm0, %xmm1
-; SSE41-NEXT: cmpunordss %xmm0, %xmm1
-; SSE41-NEXT: movaps %xmm1, %xmm3
-; SSE41-NEXT: andps %xmm2, %xmm3
-; SSE41-NEXT: minss %xmm0, %xmm2
-; SSE41-NEXT: andnps %xmm2, %xmm1
-; SSE41-NEXT: orps %xmm3, %xmm1
+; SSE41-NEXT: minss %xmm2, %xmm1
 ; SSE41-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
-; SSE41-NEXT: movaps %xmm0, %xmm2
-; SSE41-NEXT: minss %xmm1, %xmm2
-; SSE41-NEXT: cmpunordss %xmm1, %xmm1
-; SSE41-NEXT: movaps %xmm1, %xmm3
-; SSE41-NEXT: andnps %xmm2, %xmm3
-; SSE41-NEXT: andps %xmm0, %xmm1
-; SSE41-NEXT: orps %xmm3, %xmm1
+; SSE41-NEXT: minss %xmm0, %xmm1
 ; SSE41-NEXT: movaps %xmm1, %xmm0
 ; SSE41-NEXT: retq
 ;
 ; AVX-LABEL: test_v3f32:
 ; AVX: # %bb.0:
 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX-NEXT: vminss %xmm0, %xmm1, %xmm2
-; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm3
-; AVX-NEXT: vblendvps %xmm3, %xmm1, %xmm2, %xmm1
-; AVX-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2
-; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX-NEXT: vminss %xmm1, %xmm0, %xmm1
-; AVX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX-NEXT: vminss %xmm0, %xmm1, %xmm0
 ; AVX-NEXT: retq
 ;
 ; AVX512-LABEL: test_v3f32:
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512-NEXT: vminss %xmm0, %xmm1, %xmm2
-; AVX512-NEXT: vcmpunordss %xmm0, %xmm0, %k1
-; AVX512-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1}
-; AVX512-NEXT: vcmpunordss %xmm2, %xmm2, %k1
-; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512-NEXT: vminss %xmm2, %xmm1, %xmm0
-; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1}
+; AVX512-NEXT: vminss %xmm1, %xmm0, %xmm1
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-NEXT: vminss %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT: retq
 %1 = call nnan float @llvm.experimental.vector.reduce.fmin.v3f32(<3 x float> %a0)
 ret float %1