[X86][SSE] LowerUINT_TO_FP_i64 - only use HADDPD for size/fast-hops

We were always generating a single-source HADDPD, but really we should only do this if shouldUseHorizontalOp says it's a good idea.
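
For reference, a minimal IR reproducer of the affected path (an illustrative sketch; the diff's own u64_to_d test below exercises the same pattern):

  ; On SSE3+ targets, the final horizontal add of the two partial doubles
  ; is now lowered as shuffle + addsd unless optsize or fast-hops apply.
  define double @u64_to_f64(i64 %x) {
    %conv = uitofp i64 %x to double
    ret double %conv
  }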

Differential Revision: https://reviews.llvm.org/D69175

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@375341 91177308-0d34-0410-b5e6-96231b3b80d8
Simon Pilgrim 2019-10-19 11:53:48 +00:00
parent 0b2777c548
commit 39580be1ee
5 changed files with 79 additions and 44 deletions


@@ -18510,6 +18510,16 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
return Result;
}

/// Horizontal vector math instructions may be slower than normal math with
/// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch
/// implementation, and likely shuffle complexity of the alternate sequence.
static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
bool IsOptimizingSize = DAG.getMachineFunction().getFunction().hasOptSize();
bool HasFastHOps = Subtarget.hasFastHorizontalOps();
return !IsSingleSource || IsOptimizingSize || HasFastHOps;
}

/// 64-bit unsigned integer to double expansion.
static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
@@ -18564,8 +18574,7 @@ static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG,
SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
SDValue Result;
if (Subtarget.hasSSE3()) {
// FIXME: The 'haddpd' instruction may be slower than 'shuffle + addsd'.
if (shouldUseHorizontalOp(true, DAG, Subtarget)) {
Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
} else {
SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1});
@@ -19623,16 +19632,6 @@ static SDValue LowerSTRICT_FP_ROUND(SDValue Op, SelectionDAG &DAG) {
return Op;
}

/// Horizontal vector math instructions may be slower than normal math with
/// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch
/// implementation, and likely shuffle complexity of the alternate sequence.
static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
bool IsOptimizingSize = DAG.getMachineFunction().getFunction().hasOptSize();
bool HasFastHOps = Subtarget.hasFastHorizontalOps();
return !IsSingleSource || IsOptimizingSize || HasFastHOps;
}

/// Depending on uarch and/or optimizing for size, we might prefer to use a
/// vector operation in place of the typical scalar operation.
static SDValue lowerAddSubToHorizontalOp(SDValue Op, SelectionDAG &DAG,


@@ -1841,7 +1841,8 @@ define <2 x double> @test_mm_cvtu64_sd(<2 x double> %__A, i64 %__B) {
; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X86-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; X86-NEXT: vsubpd {{\.LCPI.*}}, %xmm1, %xmm1
; X86-NEXT: vhaddpd %xmm1, %xmm1, %xmm1
; X86-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; X86-NEXT: vaddsd %xmm1, %xmm2, %xmm1
; X86-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; X86-NEXT: retl
;


@@ -1,8 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSSE3,SSSE3-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops | FileCheck %s --check-prefixes=SSE,SSSE3,SSSE3-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2

define float @pr26491(<4 x float> %a0) {
; SSE2-LABEL: pr26491:
@@ -58,37 +60,68 @@ define <4 x double> @PR41414(i64 %x, <4 x double> %y) {
; SSE2-NEXT: addpd %xmm2, %xmm1
; SSE2-NEXT: retq
;
; SSSE3-LABEL: PR41414:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movq %rdi, %xmm2
; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
; SSSE3-NEXT: subpd {{.*}}(%rip), %xmm2
; SSSE3-NEXT: haddpd %xmm2, %xmm2
; SSSE3-NEXT: divpd %xmm2, %xmm1
; SSSE3-NEXT: divpd %xmm2, %xmm0
; SSSE3-NEXT: xorpd %xmm2, %xmm2
; SSSE3-NEXT: addpd %xmm2, %xmm0
; SSSE3-NEXT: addpd %xmm2, %xmm1
; SSSE3-NEXT: retq
; SSSE3-SLOW-LABEL: PR41414:
; SSSE3-SLOW: # %bb.0:
; SSSE3-SLOW-NEXT: movq %rdi, %xmm2
; SSSE3-SLOW-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
; SSSE3-SLOW-NEXT: subpd {{.*}}(%rip), %xmm2
; SSSE3-SLOW-NEXT: movapd %xmm2, %xmm3
; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1]
; SSSE3-SLOW-NEXT: addpd %xmm2, %xmm3
; SSSE3-SLOW-NEXT: movddup {{.*#+}} xmm2 = xmm3[0,0]
; SSSE3-SLOW-NEXT: divpd %xmm2, %xmm1
; SSSE3-SLOW-NEXT: divpd %xmm2, %xmm0
; SSSE3-SLOW-NEXT: xorpd %xmm2, %xmm2
; SSSE3-SLOW-NEXT: addpd %xmm2, %xmm0
; SSSE3-SLOW-NEXT: addpd %xmm2, %xmm1
; SSSE3-SLOW-NEXT: retq
;
; AVX1-LABEL: PR41414:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovq %rdi, %xmm1
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; AVX1-NEXT: vsubpd {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vhaddpd %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
; AVX1-NEXT: vdivpd %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX1-NEXT: retq
; SSSE3-FAST-LABEL: PR41414:
; SSSE3-FAST: # %bb.0:
; SSSE3-FAST-NEXT: movq %rdi, %xmm2
; SSSE3-FAST-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
; SSSE3-FAST-NEXT: subpd {{.*}}(%rip), %xmm2
; SSSE3-FAST-NEXT: haddpd %xmm2, %xmm2
; SSSE3-FAST-NEXT: divpd %xmm2, %xmm1
; SSSE3-FAST-NEXT: divpd %xmm2, %xmm0
; SSSE3-FAST-NEXT: xorpd %xmm2, %xmm2
; SSSE3-FAST-NEXT: addpd %xmm2, %xmm0
; SSSE3-FAST-NEXT: addpd %xmm2, %xmm1
; SSSE3-FAST-NEXT: retq
;
; AVX1-SLOW-LABEL: PR41414:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vmovq %rdi, %xmm1
; AVX1-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; AVX1-SLOW-NEXT: vsubpd {{.*}}(%rip), %xmm1, %xmm1
; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX1-SLOW-NEXT: vaddpd %xmm1, %xmm2, %xmm1
; AVX1-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0]
; AVX1-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
; AVX1-SLOW-NEXT: vdivpd %ymm1, %ymm0, %ymm0
; AVX1-SLOW-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; AVX1-SLOW-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: PR41414:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vmovq %rdi, %xmm1
; AVX1-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; AVX1-FAST-NEXT: vsubpd {{.*}}(%rip), %xmm1, %xmm1
; AVX1-FAST-NEXT: vhaddpd %xmm1, %xmm1, %xmm1
; AVX1-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
; AVX1-FAST-NEXT: vdivpd %ymm1, %ymm0, %ymm0
; AVX1-FAST-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; AVX1-FAST-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: PR41414:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovq %rdi, %xmm1
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; AVX2-NEXT: vsubpd {{.*}}(%rip), %xmm1, %xmm1
; AVX2-NEXT: vhaddpd %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX2-NEXT: vaddsd %xmm1, %xmm2, %xmm1
; AVX2-NEXT: vbroadcastsd %xmm1, %ymm1
; AVX2-NEXT: vdivpd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vxorpd %xmm1, %xmm1, %xmm1


@@ -9,7 +9,8 @@ define <4 x double> @PR43402(i64 %x) {
; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; CHECK-NEXT: vsubpd {{\.LCPI.*}}, %xmm0, %xmm0
; CHECK-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; CHECK-NEXT: vaddsd %xmm0, %xmm1, %xmm0
; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0
; CHECK-NEXT: retl
%conv = uitofp i64 %x to double


@@ -610,8 +610,9 @@ define double @u64_to_d(i64 %a) nounwind {
; AVX512F_32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX512F_32-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX512F_32-NEXT: vsubpd {{\.LCPI.*}}, %xmm0, %xmm0
; AVX512F_32-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX512F_32-NEXT: vmovlpd %xmm0, (%esp)
; AVX512F_32-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512F_32-NEXT: vaddsd %xmm0, %xmm1, %xmm0
; AVX512F_32-NEXT: vmovsd %xmm0, (%esp)
; AVX512F_32-NEXT: fldl (%esp)
; AVX512F_32-NEXT: movl %ebp, %esp
; AVX512F_32-NEXT: popl %ebp