mirror of
https://github.com/RPCS3/llvm.git
synced 2024-11-27 21:50:29 +00:00
f48b1beeec
This fixes two issues in x86 fptoui lowering. 1) Makes conversions from f80 go through the right path on AVX-512. 2) Implements an inline sequence for fptoui i64 instead of a library call. This improves performance by 6X on SSE3+ and 3X otherwise. Incidentally, it also removes the use of ftol2 for fptoui, which was wrong to begin with, as ftol2 converts to a signed i64, producing wrong results for values >= 2^63. Patch by: mitch.l.bodart@intel.com Differential Revision: http://reviews.llvm.org/D11316 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@245924 91177308-0d34-0410-b5e6-96231b3b80d8
35 lines
812 B
LLVM
35 lines
812 B
LLVM
; RUN: llc < %s -mcpu=core-avx-i -mtriple=i386-pc-win32 | FileCheck %s
; The line above compiles this file with llc for a 32-bit Windows triple and
; the core-avx-i CPU, then pipes the generated assembly into FileCheck, which
; matches it against the directives placed after each function below.
|
|
|
|
; Aggregate made of an array of 64 <8 x float> vectors followed by a single
; <8 x float>. It is only used as the type of a stack allocation in @equal.
%struct_type = type { [64 x <8 x float>], <8 x float> }
|
|
|
|
; Function Attrs: nounwind readnone
|
|
; x86 AVX intrinsic taking a 256-bit <8 x float> vector and returning an i32.
; Called from @equal.
declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>)
|
|
|
|
; Function Attrs: nounwind
|
|
; @equal: reinterprets the <8 x i32> argument as <8 x float>, passes it to the
; AVX movmsk intrinsic and returns the resulting i32.
; The two allocas are never read or written. NOTE(review): they presumably
; exist only to make the frame large enough that a stack probe (_chkstk) call
; is emitted on the win32 target - confirm against the original bug report.
define i32 @equal(<8 x i32> %A) {
|
|
allocas:
|
|
; Unused stack slot: array of 64 <8 x i32> vectors.
%first_alloc = alloca [64 x <8 x i32>]
|
|
; Unused stack slot: one %struct_type (64 + 1 vectors of 8 floats).
%second_alloc = alloca %struct_type
|
|
|
|
; Same bits, viewed as floats so they match the intrinsic's parameter type.
%A1 = bitcast <8 x i32> %A to <8 x float>
|
|
%A2 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %A1)
|
|
ret i32 %A2
|
|
}
|
|
|
|
; Expected output for @equal, in order: the function label, then a reference
; to _chkstk with no vzeroupper emitted before it, then the return.
; CHECK: equal
|
|
; CHECK-NOT: vzeroupper
|
|
; CHECK: _chkstk
|
|
; CHECK: ret
|
|
|
|
; @foo: converts the double %x to an unsigned 64-bit integer, stores it
; through %p, and returns %y + %y as a 256-bit <8 x float> vector.
; The function mixes a scalar fptoui-to-i64 conversion with 256-bit vector
; arithmetic; the directives after the body verify that no vzeroupper is
; emitted ahead of the conversion code.
define <8 x float> @foo(<8 x float> %y, i64* %p, double %x) {
|
|
; Unsigned double -> i64 conversion; this is the operation under test.
%i = fptoui double %x to i64
|
|
store i64 %i, i64* %p
|
|
; 256-bit vector add producing the return value.
%ret = fadd <8 x float> %y, %y
|
|
ret <8 x float> %ret
|
|
}
|
|
|
|
; Expected output for @foo, in order: the function label, then an instruction
; whose text contains "cvtt" or "fist" (the conversion) with no vzeroupper
; emitted before it, then the return.
; CHECK: foo
|
|
; CHECK-NOT: vzeroupper
|
|
; CHECK: {{cvtt|fist}}
|
|
; CHECK: ret
|