[x86] Back out one aspect of r311318: don't generically set FeatureSlowUAMem32.

The idea was to mark things that are slow on widely available processors as slow in the generic CPU so that the code generated for that CPU would be fast across those processors. However, for this feature that doesn't work out very well at all.

The problem here is that you can very easily enable AVX or AVX2 on top of this generic CPU. For example, this can happen just by using AVX2 intrinsics from Clang within a region of code guarded by a dynamic CPU feature test. When you do that, the generated code with SlowUAMem32 set is... amazingly slower. The problem is that there really aren't very good alternatives to the unaligned loads, and so our vector codegen regresses significantly.

The other issue is that there are plenty of AMD CPUs with AVX1 that don't set FeatureSlowUAMem32, so we shouldn't just check for AVX2 instead of this special feature. =/ It would be nice for the target attribute logic to be able to enable/disable more than one feature at a time and control this in a more fine-grained and useful way, but that doesn't seem easy.

Given that only Sandybridge and Ivybridge set this feature, for now I'm just backing it out of the generic CPU. That has the additional advantage of returning to the previous state, which people seemed vaguely happy with.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@311740 91177308-0d34-0410-b5e6-96231b3b80d8
parent c2588cb4b1
commit 31f4977889
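To make the failure mode concrete, below is a minimal, hypothetical sketch of the pattern the message describes: AVX2 intrinsics used from Clang behind a dynamic CPU feature test, compiled against the generic x86-64 CPU. All names and build flags here are illustrative, not from this commit. Before this change, the AVX2 path inherited slow-unaligned-mem-32 from the generic CPU, so its unaligned 256-bit memory accesses were split into 128-bit halves.

/* Hypothetical example (not from this commit): AVX2 intrinsics behind a
 * dynamic CPU feature test. Build sketch: clang -O2 -march=x86-64 scale.c */
#include <immintrin.h>

__attribute__((target("avx2")))
static void scale_avx2(float *dst, const float *src, float k, int n) {
  __m256 vk = _mm256_set1_ps(k);
  int i = 0;
  for (; i + 8 <= n; i += 8) {
    /* With slow-unaligned-mem-32 inherited from the generic CPU, this
     * unaligned 256-bit load used to be split into vmovups xmm +
     * vinsertf128, even though this function is compiled as AVX2. */
    __m256 v = _mm256_loadu_ps(src + i);
    _mm256_storeu_ps(dst + i, _mm256_mul_ps(v, vk));
  }
  for (; i < n; i++)  /* scalar tail */
    dst[i] = src[i] * k;
}

static void scale_scalar(float *dst, const float *src, float k, int n) {
  for (int i = 0; i < n; i++)
    dst[i] = src[i] * k;
}

void scale(float *dst, const float *src, float k, int n) {
  if (__builtin_cpu_supports("avx2"))
    scale_avx2(dst, src, k, n);  /* dynamic CPU feature test */
  else
    scale_scalar(dst, src, k, n);
}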
lib/Target/X86/X86.td
@@ -874,8 +874,7 @@ def : ProcessorModel<"x86-64", SandyBridgeModel, [
   Feature64Bit,
   FeatureSlow3OpsLEA,
   FeatureSlowBTMem,
-  FeatureSlowIncDec,
-  FeatureSlowUAMem32
+  FeatureSlowIncDec
 ]>;
 
 //===----------------------------------------------------------------------===//
test/CodeGen/X86/avx-schedule.ll
@@ -752,9 +752,7 @@ define <8 x float> @test_cvtdq2ps(<8 x i32> %a0, <8 x i32> *%a1) {
 ; GENERIC-LABEL: test_cvtdq2ps:
 ; GENERIC:       # BB#0:
 ; GENERIC-NEXT:    vcvtdq2ps %ymm0, %ymm0 # sched: [3:1.00]
-; GENERIC-NEXT:    vmovaps (%rdi), %xmm1 # sched: [6:0.50]
-; GENERIC-NEXT:    vinsertf128 $1, 16(%rdi), %ymm1, %ymm1 # sched: [7:0.50]
-; GENERIC-NEXT:    vcvtdq2ps %ymm1, %ymm1 # sched: [3:1.00]
+; GENERIC-NEXT:    vcvtdq2ps (%rdi), %ymm1 # sched: [10:1.00]
 ; GENERIC-NEXT:    vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -1956,11 +1954,9 @@ define <8 x float> @test_movsldup(<8 x float> %a0, <8 x float> *%a1) {
 define <4 x double> @test_movupd(<4 x double> *%a0, <4 x double> *%a1) {
 ; GENERIC-LABEL: test_movupd:
 ; GENERIC:       # BB#0:
-; GENERIC-NEXT:    vmovups (%rdi), %xmm0 # sched: [6:0.50]
-; GENERIC-NEXT:    vinsertf128 $1, 16(%rdi), %ymm0, %ymm0 # sched: [7:0.50]
+; GENERIC-NEXT:    vmovupd (%rdi), %ymm0 # sched: [7:0.50]
 ; GENERIC-NEXT:    vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
-; GENERIC-NEXT:    vextractf128 $1, %ymm0, 16(%rsi) # sched: [5:1.00]
-; GENERIC-NEXT:    vmovupd %xmm0, (%rsi) # sched: [5:1.00]
+; GENERIC-NEXT:    vmovupd %ymm0, (%rsi) # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SANDY-LABEL: test_movupd:
@@ -2001,11 +1997,9 @@ define <4 x double> @test_movupd(<4 x double> *%a0, <4 x double> *%a1) {
 define <8 x float> @test_movups(<8 x float> *%a0, <8 x float> *%a1) {
 ; GENERIC-LABEL: test_movups:
 ; GENERIC:       # BB#0:
-; GENERIC-NEXT:    vmovups (%rdi), %xmm0 # sched: [6:0.50]
-; GENERIC-NEXT:    vinsertf128 $1, 16(%rdi), %ymm0, %ymm0 # sched: [7:0.50]
+; GENERIC-NEXT:    vmovups (%rdi), %ymm0 # sched: [7:0.50]
 ; GENERIC-NEXT:    vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
-; GENERIC-NEXT:    vextractf128 $1, %ymm0, 16(%rsi) # sched: [5:1.00]
-; GENERIC-NEXT:    vmovups %xmm0, (%rsi) # sched: [5:1.00]
+; GENERIC-NEXT:    vmovups %ymm0, (%rsi) # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SANDY-LABEL: test_movups:
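For reference, the test_movupd/test_movups checks above correspond roughly to C code like the following hypothetical sketch (function name invented; compile sketch: clang -O2 -march=x86-64 -mavx): an unaligned 256-bit load, an add, and an unaligned 256-bit store. Before this change, each ymm memory access was split into two xmm halves, as the removed CHECK lines show; afterwards a single unaligned ymm move suffices.

/* Hypothetical C analogue of test_movups (not the actual test source). */
#include <immintrin.h>

void double_unaligned(const float *src, float *dst) {
  __m256 v = _mm256_loadu_ps(src);  /* was: vmovups xmm + vinsertf128 */
  v = _mm256_add_ps(v, v);          /* vaddps ymm */
  _mm256_storeu_ps(dst, v);         /* was: vextractf128 + vmovups xmm */
}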
test/CodeGen/X86/vector-shuffle-512-v16.ll
@@ -299,8 +299,7 @@ define <4 x i32> @test_v16i32_0_1_2_12 (<16 x i32> %v) {
 define <8 x float> @shuffle_v16f32_extract_256(float* %RET, float* %a) {
 ; ALL-LABEL: shuffle_v16f32_extract_256:
 ; ALL:       # BB#0:
-; ALL-NEXT:    vmovups 32(%rsi), %xmm0
-; ALL-NEXT:    vinsertf128 $1, 48(%rsi), %ymm0, %ymm0
+; ALL-NEXT:    vmovups 32(%rsi), %ymm0
 ; ALL-NEXT:    retq
   %ptr_a = bitcast float* %a to <16 x float>*
   %v_a = load <16 x float>, <16 x float>* %ptr_a, align 4