mirror of
https://github.com/RPCS3/llvm.git
synced 2025-01-17 23:44:43 +00:00
X86: Enable SSE memory intrinsics even when stack alignment is less than 16 bytes.
The stack realignment code was fixed to work when there is stack realignment and a dynamic alloca is present so this shouldn't cause correctness issues anymore.

Note that this also enables generation of AVX instructions for memset under the assumptions:
- Unaligned loads/stores are always fast on CPUs supporting AVX
- AVX is not slower than SSE

We may need some tweaked heuristics if one of those assumptions turns out not to be true.

Effectively reverts r58317. Part of PR2962.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@167967 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent
97d19ebe5b
commit
2dbe929685
@ -1362,18 +1362,14 @@ X86TargetLowering::getOptimalMemOpType(uint64_t Size,
|
||||
bool IsZeroVal,
|
||||
bool MemcpyStrSrc,
|
||||
MachineFunction &MF) const {
|
||||
// FIXME: This turns off use of xmm stores for memset/memcpy on targets like
|
||||
// linux. This is because the stack realignment code can't handle certain
|
||||
// cases like PR2962. This should be removed when PR2962 is fixed.
|
||||
const Function *F = MF.getFunction();
|
||||
if (IsZeroVal &&
|
||||
!F->getFnAttributes().hasAttribute(Attributes::NoImplicitFloat)) {
|
||||
if (Size >= 16 &&
|
||||
(Subtarget->isUnalignedMemAccessFast() ||
|
||||
((DstAlign == 0 || DstAlign >= 16) &&
|
||||
(SrcAlign == 0 || SrcAlign >= 16))) &&
|
||||
Subtarget->getStackAlignment() >= 16) {
|
||||
if (Subtarget->getStackAlignment() >= 32) {
|
||||
(SrcAlign == 0 || SrcAlign >= 16)))) {
|
||||
if (Size >= 32) {
|
||||
if (Subtarget->hasAVX2())
|
||||
return MVT::v8i32;
|
||||
if (Subtarget->hasAVX())
|
||||
@ -1385,7 +1381,6 @@ X86TargetLowering::getOptimalMemOpType(uint64_t Size,
|
||||
return MVT::v4f32;
|
||||
} else if (!MemcpyStrSrc && Size >= 8 &&
|
||||
!Subtarget->is64Bit() &&
|
||||
Subtarget->getStackAlignment() >= 8 &&
|
||||
Subtarget->hasSSE2()) {
|
||||
// Do not use f64 to lower memcpy if source is string constant. It's
|
||||
// better to use i32 to avoid the loads.
|
||||
|
@ -1,22 +0,0 @@
|
||||
; Linux doesn't support stack realignment for functions with allocas (PR2888).
|
||||
; Until it does, we shouldn't use movaps to access the stack. On targets with
|
||||
; sufficiently aligned stack (e.g. darwin) we should.
|
||||
; PR8969 - make 32-bit linux have a 16-byte aligned stack
|
||||
; RUN: llc < %s -mtriple=i386-pc-linux-gnu -mcpu=yonah | grep movaps | count 2
|
||||
; RUN: llc < %s -mtriple=i686-apple-darwin9 -mcpu=yonah | grep movaps | count 2
|
||||
|
||||
|
||||
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32"
|
||||
target triple = "i386-pc-linux-gnu"
|
||||
|
||||
define void @foo(i32 %t) nounwind {
|
||||
%tmp1210 = alloca i8, i32 32, align 4
|
||||
call void @llvm.memset.p0i8.i64(i8* %tmp1210, i8 0, i64 32, i32 4, i1 false)
|
||||
%x = alloca i8, i32 %t
|
||||
call void @dummy(i8* %x)
|
||||
ret void
|
||||
}
|
||||
|
||||
declare void @dummy(i8*)
|
||||
|
||||
declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) nounwind
|
@ -1,4 +1,5 @@
|
||||
; RUN: llc < %s -mattr=+sse2 -mtriple=i686-apple-darwin -mcpu=core2 | FileCheck %s -check-prefix=SSE2
|
||||
; RUN: llc < %s -mattr=+sse2 -mtriple=i686-pc-mingw32 -mcpu=core2 | FileCheck %s -check-prefix=SSE2
|
||||
; RUN: llc < %s -mattr=+sse,-sse2 -mtriple=i686-apple-darwin -mcpu=core2 | FileCheck %s -check-prefix=SSE1
|
||||
; RUN: llc < %s -mattr=-sse -mtriple=i686-apple-darwin -mcpu=core2 | FileCheck %s -check-prefix=NOSSE
|
||||
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core2 | FileCheck %s -check-prefix=X86-64
|
||||
|
77
test/CodeGen/X86/memset-sse-stack-realignment.ll
Normal file
77
test/CodeGen/X86/memset-sse-stack-realignment.ll
Normal file
@ -0,0 +1,77 @@
|
||||
; Make sure that we realign the stack. Mingw32 uses 4 byte stack alignment, we
|
||||
; need 16 bytes for SSE and 32 bytes for AVX.
|
||||
|
||||
; RUN: llc < %s -mtriple=i386-pc-mingw32 -mcpu=pentium2 | FileCheck %s -check-prefix=NOSSE
|
||||
; RUN: llc < %s -mtriple=i386-pc-mingw32 -mcpu=pentium3 | FileCheck %s -check-prefix=SSE1
|
||||
; RUN: llc < %s -mtriple=i386-pc-mingw32 -mcpu=yonah | FileCheck %s -check-prefix=SSE2
|
||||
; RUN: llc < %s -mtriple=i386-pc-mingw32 -mcpu=corei7-avx | FileCheck %s -check-prefix=AVX1
|
||||
; RUN: llc < %s -mtriple=i386-pc-mingw32 -mcpu=core-avx2 | FileCheck %s -check-prefix=AVX2
|
||||
|
||||
define void @test1(i32 %t) nounwind {
|
||||
%tmp1210 = alloca i8, i32 32, align 4
|
||||
call void @llvm.memset.p0i8.i64(i8* %tmp1210, i8 0, i64 32, i32 4, i1 false)
|
||||
%x = alloca i8, i32 %t
|
||||
call void @dummy(i8* %x)
|
||||
ret void
|
||||
|
||||
; NOSSE: test1:
|
||||
; NOSSE-NOT: and
|
||||
; NOSSE: movl $0
|
||||
|
||||
; SSE1: test1:
|
||||
; SSE1: andl $-16
|
||||
; SSE1: movl %esp, %esi
|
||||
; SSE1: movaps
|
||||
|
||||
; SSE2: test1:
|
||||
; SSE2: andl $-16
|
||||
; SSE2: movl %esp, %esi
|
||||
; SSE2: movaps
|
||||
|
||||
; AVX1: test1:
|
||||
; AVX1: andl $-32
|
||||
; AVX1: movl %esp, %esi
|
||||
; AVX1: vmovaps %ymm
|
||||
|
||||
; AVX2: test1:
|
||||
; AVX2: andl $-32
|
||||
; AVX2: movl %esp, %esi
|
||||
; AVX2: vmovaps %ymm
|
||||
|
||||
}
|
||||
|
||||
define void @test2(i32 %t) nounwind {
|
||||
%tmp1210 = alloca i8, i32 16, align 4
|
||||
call void @llvm.memset.p0i8.i64(i8* %tmp1210, i8 0, i64 16, i32 4, i1 false)
|
||||
%x = alloca i8, i32 %t
|
||||
call void @dummy(i8* %x)
|
||||
ret void
|
||||
|
||||
; NOSSE: test2:
|
||||
; NOSSE-NOT: and
|
||||
; NOSSE: movl $0
|
||||
|
||||
; SSE1: test2:
|
||||
; SSE1: andl $-16
|
||||
; SSE1: movl %esp, %esi
|
||||
; SSE1: movaps
|
||||
|
||||
; SSE2: test2:
|
||||
; SSE2: andl $-16
|
||||
; SSE2: movl %esp, %esi
|
||||
; SSE2: movaps
|
||||
|
||||
; AVX1: test2:
|
||||
; AVX1: andl $-16
|
||||
; AVX1: movl %esp, %esi
|
||||
; AVX1: vmovaps %xmm
|
||||
|
||||
; AVX2: test2:
|
||||
; AVX2: andl $-16
|
||||
; AVX2: movl %esp, %esi
|
||||
; AVX2: vmovaps %xmm
|
||||
}
|
||||
|
||||
declare void @dummy(i8*)
|
||||
|
||||
declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) nounwind
|
@ -1,5 +1,6 @@
|
||||
; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=nehalem | grep movups | count 5
|
||||
; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=core2 | grep movl | count 20
|
||||
; RUN: llc < %s -mtriple=i386-pc-mingw32 -mcpu=core2 | grep movl | count 20
|
||||
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core2 | grep movq | count 10
|
||||
|
||||
define void @bork() nounwind {
|
||||
|
Loading…
x
Reference in New Issue
Block a user