mirror of
https://github.com/RPCSX/llvm.git
synced 2025-01-23 20:57:21 +00:00
Enable FeatureFastUAMem for btver2
Allow unaligned 16-byte memop codegen for btver2. No functional changes for any other subtargets.

Replace the existing supposed small memcpy test with an actual test of a small memcpy. The previous test wasn't using FileCheck either.

This patch should allow us to close PR21541 (http://llvm.org/bugs/show_bug.cgi?id=21541).

Differential Revision: http://reviews.llvm.org/D6360

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@222925 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent
f716393db9
commit
c5992119fc
@ -79,6 +79,10 @@ def FeatureSlowBTMem : SubtargetFeature<"slow-bt-mem", "IsBTMemSlow", "true",
|
||||
"Bit testing of memory is slow">;
|
||||
def FeatureSlowSHLD : SubtargetFeature<"slow-shld", "IsSHLDSlow", "true",
|
||||
"SHLD instruction is slow">;
|
||||
// FIXME: This is a 16-byte (SSE/AVX) feature; we should rename it to make that
|
||||
// explicit. Also, it seems this would be the default state for most chips
|
||||
// going forward, so it would probably be better to negate the logic and
|
||||
// match the 32-byte "slow mem" feature below.
|
||||
def FeatureFastUAMem : SubtargetFeature<"fast-unaligned-mem",
|
||||
"IsUAMemFast", "true",
|
||||
"Fast unaligned memory access">;
|
||||
@ -361,8 +365,10 @@ def : ProcessorModel<"btver2", BtVer2Model,
|
||||
[FeatureAVX, FeatureSSE4A, FeatureCMPXCHG16B,
|
||||
FeaturePRFCHW, FeatureAES, FeaturePCLMUL,
|
||||
FeatureBMI, FeatureF16C, FeatureMOVBE,
|
||||
FeatureLZCNT, FeaturePOPCNT, FeatureSlowSHLD,
|
||||
FeatureUseSqrtEst, FeatureUseRecipEst]>;
|
||||
FeatureLZCNT, FeaturePOPCNT, FeatureFastUAMem,
|
||||
FeatureSlowSHLD, FeatureUseSqrtEst, FeatureUseRecipEst]>;
|
||||
|
||||
// TODO: We should probably add 'FeatureFastUAMem' to all of the AMD chips.
|
||||
|
||||
// Bulldozer
|
||||
def : Proc<"bdver1", [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B,
|
||||
|
@ -1,20 +1,25 @@
|
||||
; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=core2 | grep movsd | count 8
|
||||
; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=nehalem | grep movups | count 2
|
||||
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core2 | FileCheck %s --check-prefix=CORE2
|
||||
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=nehalem | FileCheck %s --check-prefix=NEHALEM
|
||||
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=btver2 | FileCheck %s --check-prefix=BTVER2
|
||||
|
||||
define void @ccosl({ x86_fp80, x86_fp80 }* noalias sret %agg.result, { x86_fp80, x86_fp80 }* byval align 4 %z) nounwind {
|
||||
entry:
|
||||
%iz = alloca { x86_fp80, x86_fp80 } ; <{ x86_fp80, x86_fp80 }*> [#uses=3]
|
||||
%tmp1 = getelementptr { x86_fp80, x86_fp80 }* %z, i32 0, i32 1 ; <x86_fp80*> [#uses=1]
|
||||
%tmp2 = load x86_fp80* %tmp1, align 16 ; <x86_fp80> [#uses=1]
|
||||
%tmp3 = fsub x86_fp80 0xK80000000000000000000, %tmp2 ; <x86_fp80> [#uses=1]
|
||||
%tmp4 = getelementptr { x86_fp80, x86_fp80 }* %iz, i32 0, i32 1 ; <x86_fp80*> [#uses=1]
|
||||
%real = getelementptr { x86_fp80, x86_fp80 }* %iz, i32 0, i32 0 ; <x86_fp80*> [#uses=1]
|
||||
%tmp6 = getelementptr { x86_fp80, x86_fp80 }* %z, i32 0, i32 0 ; <x86_fp80*> [#uses=1]
|
||||
%tmp7 = load x86_fp80* %tmp6, align 16 ; <x86_fp80> [#uses=1]
|
||||
store x86_fp80 %tmp3, x86_fp80* %real, align 16
|
||||
store x86_fp80 %tmp7, x86_fp80* %tmp4, align 16
|
||||
call void @ccoshl( { x86_fp80, x86_fp80 }* noalias sret %agg.result, { x86_fp80, x86_fp80 }* byval align 4 %iz ) nounwind
|
||||
ret void
|
||||
declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1)
|
||||
|
||||
; Copy 16 bytes from %b to %a through the memcpy intrinsic with an alignment
; argument of 1, i.e. no alignment guarantee on either pointer. The checks
; below pin the expected lowering per -mcpu value: four 8-byte movq
; instructions on core2, and a movups load/store pair on nehalem and btver2.
define void @copy16bytes(i8* nocapture %a, i8* nocapture readonly %b) {
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %a, i8* %b, i64 16, i32 1, i1 false)
  ret void

  ; The function label is spelled once per prefix used by the RUN lines. A bare
  ; CHECK-LABEL would never be matched, because every RUN line passes an
  ; explicit --check-prefix and therefore does not enable the default CHECK
  ; prefix.

  ; CORE2-LABEL: copy16bytes
  ; CORE2: movq
  ; CORE2-NEXT: movq
  ; CORE2-NEXT: movq
  ; CORE2-NEXT: movq
  ; CORE2-NEXT: retq

  ; NEHALEM-LABEL: copy16bytes
  ; NEHALEM: movups
  ; NEHALEM-NEXT: movups
  ; NEHALEM-NEXT: retq

  ; BTVER2-LABEL: copy16bytes
  ; BTVER2: movups
  ; BTVER2-NEXT: movups
  ; BTVER2-NEXT: retq
}
|
||||
|
||||
declare void @ccoshl({ x86_fp80, x86_fp80 }* noalias sret , { x86_fp80, x86_fp80 }* byval align 4 ) nounwind
|
||||
|
Loading…
x
Reference in New Issue
Block a user