[X86][SSE] LowerUINT_TO_FP_i64 - only use HADDPD for size/fast-hops
We were always generating a single-source HADDPD, but really we should only do this if shouldUseHorizontalOp says it's a good idea.

Differential Revision: https://reviews.llvm.org/D69175

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@375341 91177308-0d34-0410-b5e6-96231b3b80d8
commit 39580be1ee
parent 0b2777c548
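For context, LowerUINT_TO_FP_i64 expands the conversion without any 64-bit integer-to-FP instruction: it plants the two 32-bit halves of the input under fixed exponents, subtracts the bias constants with SUBPD, and then sums the two partial doubles. That final sum is the operation this patch gates: HADDPD only when shouldUseHorizontalOp approves (optimizing for size, or the target has fast-hops), otherwise a shuffle plus add. A rough scalar sketch of the trick follows; it is illustrative only (not the LLVM code), the function name is made up, and it assumes C++17 hex-float literals:

#include <cstdint>
#include <cstring>

// Scalar sketch of the bit trick that LowerUINT_TO_FP_i64 vectorizes.
// The low 32 bits sit under exponent 2^52 and the high 32 bits under 2^84;
// subtracting those same constants recovers the exact partial values, and
// the final addition is the step this patch chooses how to emit.
double u64_to_double_sketch(uint64_t X) {
  uint64_t LoBits = 0x4330000000000000ULL | (X & 0xffffffffULL); // 2^52 + lo
  uint64_t HiBits = 0x4530000000000000ULL | (X >> 32);           // 2^84 + hi*2^32
  double Lo, Hi;
  std::memcpy(&Lo, &LoBits, sizeof(Lo));
  std::memcpy(&Hi, &HiBits, sizeof(Hi));
  Lo -= 0x1p52; // SUBPD lane 0: recover lo exactly
  Hi -= 0x1p84; // SUBPD lane 1: recover hi * 2^32 exactly
  return Hi + Lo; // the horizontal add (HADDPD, or shuffle + ADDSD)
}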
@@ -18510,6 +18510,16 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
return Result;
}

/// Horizontal vector math instructions may be slower than normal math with
/// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch
/// implementation, and likely shuffle complexity of the alternate sequence.
static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
bool IsOptimizingSize = DAG.getMachineFunction().getFunction().hasOptSize();
bool HasFastHOps = Subtarget.hasFastHorizontalOps();
return !IsSingleSource || IsOptimizingSize || HasFastHOps;
}

/// 64-bit unsigned integer to double expansion.
static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
@@ -18564,8 +18574,7 @@ static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG,
SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
SDValue Result;

if (Subtarget.hasSSE3()) {
// FIXME: The 'haddpd' instruction may be slower than 'shuffle + addsd'.
if (shouldUseHorizontalOp(true, DAG, Subtarget)) {
Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
} else {
SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1});
@@ -19623,16 +19632,6 @@ static SDValue LowerSTRICT_FP_ROUND(SDValue Op, SelectionDAG &DAG) {
return Op;
}

/// Horizontal vector math instructions may be slower than normal math with
/// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch
/// implementation, and likely shuffle complexity of the alternate sequence.
static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
bool IsOptimizingSize = DAG.getMachineFunction().getFunction().hasOptSize();
bool HasFastHOps = Subtarget.hasFastHorizontalOps();
return !IsSingleSource || IsOptimizingSize || HasFastHOps;
}

/// Depending on uarch and/or optimizing for size, we might prefer to use a
/// vector operation in place of the typical scalar operation.
static SDValue lowerAddSubToHorizontalOp(SDValue Op, SelectionDAG &DAG,
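The test updates below pick up the new behaviour. A minimal standalone reproducer would be a bare uitofp compiled with and without the fast-hops attribute; only the fast-hops run (or a size-optimized function) should still select the single-source haddpd. The file and function names here are made up for illustration and are not part of this commit; the RUN lines mirror the ones added below.

; repro.ll - illustrative only, not part of this commit.
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops
define double @u64_to_f64(i64 %x) {
  %conv = uitofp i64 %x to double
  ret double %conv
}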
@@ -1841,7 +1841,8 @@ define <2 x double> @test_mm_cvtu64_sd(<2 x double> %__A, i64 %__B) {
; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X86-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; X86-NEXT: vsubpd {{\.LCPI.*}}, %xmm1, %xmm1
; X86-NEXT: vhaddpd %xmm1, %xmm1, %xmm1
; X86-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; X86-NEXT: vaddsd %xmm1, %xmm2, %xmm1
; X86-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; X86-NEXT: retl
;
@@ -1,8 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSSE3,SSSE3-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops | FileCheck %s --check-prefixes=SSE,SSSE3,SSSE3-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2

define float @pr26491(<4 x float> %a0) {
; SSE2-LABEL: pr26491:
@@ -58,37 +60,68 @@ define <4 x double> @PR41414(i64 %x, <4 x double> %y) {
; SSE2-NEXT: addpd %xmm2, %xmm1
; SSE2-NEXT: retq
;
; SSSE3-LABEL: PR41414:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movq %rdi, %xmm2
; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
; SSSE3-NEXT: subpd {{.*}}(%rip), %xmm2
; SSSE3-NEXT: haddpd %xmm2, %xmm2
; SSSE3-NEXT: divpd %xmm2, %xmm1
; SSSE3-NEXT: divpd %xmm2, %xmm0
; SSSE3-NEXT: xorpd %xmm2, %xmm2
; SSSE3-NEXT: addpd %xmm2, %xmm0
; SSSE3-NEXT: addpd %xmm2, %xmm1
; SSSE3-NEXT: retq
; SSSE3-SLOW-LABEL: PR41414:
; SSSE3-SLOW: # %bb.0:
; SSSE3-SLOW-NEXT: movq %rdi, %xmm2
; SSSE3-SLOW-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
; SSSE3-SLOW-NEXT: subpd {{.*}}(%rip), %xmm2
; SSSE3-SLOW-NEXT: movapd %xmm2, %xmm3
; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1]
; SSSE3-SLOW-NEXT: addpd %xmm2, %xmm3
; SSSE3-SLOW-NEXT: movddup {{.*#+}} xmm2 = xmm3[0,0]
; SSSE3-SLOW-NEXT: divpd %xmm2, %xmm1
; SSSE3-SLOW-NEXT: divpd %xmm2, %xmm0
; SSSE3-SLOW-NEXT: xorpd %xmm2, %xmm2
; SSSE3-SLOW-NEXT: addpd %xmm2, %xmm0
; SSSE3-SLOW-NEXT: addpd %xmm2, %xmm1
; SSSE3-SLOW-NEXT: retq
;
; AVX1-LABEL: PR41414:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovq %rdi, %xmm1
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; AVX1-NEXT: vsubpd {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vhaddpd %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
; AVX1-NEXT: vdivpd %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX1-NEXT: retq
; SSSE3-FAST-LABEL: PR41414:
; SSSE3-FAST: # %bb.0:
; SSSE3-FAST-NEXT: movq %rdi, %xmm2
; SSSE3-FAST-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
; SSSE3-FAST-NEXT: subpd {{.*}}(%rip), %xmm2
; SSSE3-FAST-NEXT: haddpd %xmm2, %xmm2
; SSSE3-FAST-NEXT: divpd %xmm2, %xmm1
; SSSE3-FAST-NEXT: divpd %xmm2, %xmm0
; SSSE3-FAST-NEXT: xorpd %xmm2, %xmm2
; SSSE3-FAST-NEXT: addpd %xmm2, %xmm0
; SSSE3-FAST-NEXT: addpd %xmm2, %xmm1
; SSSE3-FAST-NEXT: retq
;
; AVX1-SLOW-LABEL: PR41414:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vmovq %rdi, %xmm1
; AVX1-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; AVX1-SLOW-NEXT: vsubpd {{.*}}(%rip), %xmm1, %xmm1
; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX1-SLOW-NEXT: vaddpd %xmm1, %xmm2, %xmm1
; AVX1-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0]
; AVX1-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
; AVX1-SLOW-NEXT: vdivpd %ymm1, %ymm0, %ymm0
; AVX1-SLOW-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; AVX1-SLOW-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: PR41414:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vmovq %rdi, %xmm1
; AVX1-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; AVX1-FAST-NEXT: vsubpd {{.*}}(%rip), %xmm1, %xmm1
; AVX1-FAST-NEXT: vhaddpd %xmm1, %xmm1, %xmm1
; AVX1-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
; AVX1-FAST-NEXT: vdivpd %ymm1, %ymm0, %ymm0
; AVX1-FAST-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; AVX1-FAST-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: PR41414:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovq %rdi, %xmm1
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; AVX2-NEXT: vsubpd {{.*}}(%rip), %xmm1, %xmm1
; AVX2-NEXT: vhaddpd %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX2-NEXT: vaddsd %xmm1, %xmm2, %xmm1
; AVX2-NEXT: vbroadcastsd %xmm1, %ymm1
; AVX2-NEXT: vdivpd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vxorpd %xmm1, %xmm1, %xmm1
@@ -9,7 +9,8 @@ define <4 x double> @PR43402(i64 %x) {
; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; CHECK-NEXT: vsubpd {{\.LCPI.*}}, %xmm0, %xmm0
; CHECK-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; CHECK-NEXT: vaddsd %xmm0, %xmm1, %xmm0
; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0
; CHECK-NEXT: retl
%conv = uitofp i64 %x to double
@@ -610,8 +610,9 @@ define double @u64_to_d(i64 %a) nounwind {
; AVX512F_32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX512F_32-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX512F_32-NEXT: vsubpd {{\.LCPI.*}}, %xmm0, %xmm0
; AVX512F_32-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX512F_32-NEXT: vmovlpd %xmm0, (%esp)
; AVX512F_32-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512F_32-NEXT: vaddsd %xmm0, %xmm1, %xmm0
; AVX512F_32-NEXT: vmovsd %xmm0, (%esp)
; AVX512F_32-NEXT: fldl (%esp)
; AVX512F_32-NEXT: movl %ebp, %esp
; AVX512F_32-NEXT: popl %ebp