mirror of
https://github.com/RPCSX/llvm.git
synced 2025-01-08 13:00:43 +00:00
17c4ba4fe4
On modern Intel processors, hardware SQRT is in many cases faster than RSQRT followed by Newton-Raphson (NR) refinement. The patch introduces a simple heuristic to choose between the hardware SQRT instruction and Newton-Raphson software estimation. The patch treats scalars and vectors differently. The heuristic is that for scalars the compiler should optimize for latency, while for vectors it should optimize for throughput. It is based on the assumption that throughput-bound code is likely to be vectorized. Basically, the patch disables scalar NR for big cores and disables NR completely for Skylake. Firstly, scalar SQRT has shorter latency than NR code on big cores. Secondly, vector SQRT has been greatly improved in Skylake and has better throughput compared to NR. Differential Revision: https://reviews.llvm.org/D21379 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@277725 91177308-0d34-0410-b5e6-96231b3b80d8
58 lines
2.2 KiB
LLVM
58 lines
2.2 KiB
LLVM
; Each llc invocation below compiles this file for x86-64 at -O2 and pipes the
; output to FileCheck with one scalar check prefix and one vector check prefix:
;   -mcpu=nehalem                    -> SCALAR-EST, VECTOR-EST
;   -mcpu=sandybridge, broadwell     -> SCALAR-ACC, VECTOR-EST
;   -mcpu=skylake                    -> SCALAR-ACC, VECTOR-ACC
; The last two invocations pass -mattr instead of -mcpu so that
; fast-scalar-fsqrt and fast-vector-fsqrt are toggled independently.
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -O2 -mcpu=nehalem | FileCheck %s --check-prefix=SCALAR-EST --check-prefix=VECTOR-EST
|
|
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -O2 -mcpu=sandybridge | FileCheck %s --check-prefix=SCALAR-ACC --check-prefix=VECTOR-EST
|
|
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -O2 -mcpu=broadwell | FileCheck %s --check-prefix=SCALAR-ACC --check-prefix=VECTOR-EST
|
|
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -O2 -mcpu=skylake | FileCheck %s --check-prefix=SCALAR-ACC --check-prefix=VECTOR-ACC
|
|
|
|
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -O2 -mattr=+fast-scalar-fsqrt,-fast-vector-fsqrt | FileCheck %s --check-prefix=SCALAR-ACC --check-prefix=VECTOR-EST
|
|
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -O2 -mattr=-fast-scalar-fsqrt,+fast-vector-fsqrt | FileCheck %s --check-prefix=SCALAR-EST --check-prefix=VECTOR-ACC
|
|
|
|
; Square-root intrinsics for a scalar float and for 4- and 8-element float
; vectors; they are called by foo_x1, foo_x4 and foo_x8 respectively.
declare float @llvm.sqrt.f32(float) #0
|
|
declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) #0
|
|
declare <8 x float> @llvm.sqrt.v8f32(<8 x float>) #0
|
|
|
|
; Scalar case: square root of a single float through llvm.sqrt.f32.
;
; With the SCALAR-EST prefix, the first instruction after the entry-block label
; must be an rsqrtss whose first operand is %xmm0; retq only has to appear
; somewhere later, so any refinement code in between is not pinned down.
;
; With the SCALAR-ACC prefix, the entry block must consist of a sqrtss or
; vsqrtss whose first operand is %xmm0, immediately followed by retq.
define float @foo_x1(float %f) #0 {
|
|
; SCALAR-EST-LABEL: foo_x1:
|
|
; SCALAR-EST: # BB#0:
|
|
; SCALAR-EST-NEXT: rsqrtss %xmm0
|
|
; SCALAR-EST: retq
|
|
;
|
|
; SCALAR-ACC-LABEL: foo_x1:
|
|
; SCALAR-ACC: # BB#0:
|
|
; SCALAR-ACC-NEXT: {{^ *v?sqrtss %xmm0}}
|
|
; SCALAR-ACC-NEXT: retq
|
|
%call = tail call float @llvm.sqrt.f32(float %f) #1
|
|
ret float %call
|
|
}
|
|
|
|
; 128-bit vector case: square root of four floats through llvm.sqrt.v4f32.
;
; With the VECTOR-EST prefix, the first instruction after the entry-block label
; must be an rsqrtps whose first operand is %xmm0; retq only has to appear
; somewhere later.
;
; With the VECTOR-ACC prefix, the entry block must consist of a sqrtps or
; vsqrtps whose first operand is %xmm0, immediately followed by retq.
define <4 x float> @foo_x4(<4 x float> %f) #0 {
|
|
; VECTOR-EST-LABEL: foo_x4:
|
|
; VECTOR-EST: # BB#0:
|
|
; VECTOR-EST-NEXT: rsqrtps %xmm0
|
|
; VECTOR-EST: retq
|
|
;
|
|
; VECTOR-ACC-LABEL: foo_x4:
|
|
; VECTOR-ACC: # BB#0:
|
|
; VECTOR-ACC-NEXT: {{^ *v?sqrtps %xmm0}}
|
|
; VECTOR-ACC-NEXT: retq
|
|
%call = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %f) #1
|
|
ret <4 x float> %call
|
|
}
|
|
|
|
; 256-bit vector case: square root of eight floats through llvm.sqrt.v8f32.
;
; The checks here are deliberately looser than in foo_x4:
;  - With the VECTOR-EST prefix, the rsqrtps check names no register.
;  - With the VECTOR-ACC prefix, the first instruction must be a sqrtps or
;    vsqrtps on %xmm0 or %ymm0, and retq need not follow directly; the only
;    requirement on what lies between them is that it contains no "rsqrt".
; NOTE(review): presumably this accommodates targets where the 8-element vector
; is split into two 128-bit operations, giving more than one sqrt instruction
; -- confirm against the llc output for the non-AVX invocations.
define <8 x float> @foo_x8(<8 x float> %f) #0 {
|
|
; VECTOR-EST-LABEL: foo_x8:
|
|
; VECTOR-EST: # BB#0:
|
|
; VECTOR-EST-NEXT: rsqrtps
|
|
; VECTOR-EST: retq
|
|
;
|
|
; VECTOR-ACC-LABEL: foo_x8:
|
|
; VECTOR-ACC: # BB#0:
|
|
; VECTOR-ACC-NEXT: {{^ *v?sqrtps %[xy]mm0}}
|
|
; VECTOR-ACC-NOT: rsqrt
|
|
; VECTOR-ACC: retq
|
|
%call = tail call <8 x float> @llvm.sqrt.v8f32(<8 x float> %f) #1
|
|
ret <8 x float> %call
|
|
}
|
|
|
|
; Group #0 is attached to the intrinsic declarations and to all three test
; functions.
; NOTE(review): "unsafe-fp-math" is presumably what permits the rsqrt-based
; estimate sequence that the *-EST checks look for -- confirm against the X86
; backend.
attributes #0 = { "unsafe-fp-math"="true" }
|
|
; Group #1 is attached to each llvm.sqrt call site.
attributes #1 = { nounwind readnone }
|