[x86] Back out one aspect of r311318: don't generically set

FeatureSlowUAMem32. The idea was to mark things that are slow on widely available processors as slow in the generic CPU so that the code generated for that CPU would be fast across those processors. However, for this feature that doesn't work out very well at all. The problem here is that you can very easily enable AVX or AVX2 on top of this generic CPU. For example, this can happen just by using AVX2 intrinsics from Clang within a region of code guarded by a dynamic CPU feature test. When you do that, the generated code with SlowUAMem32 set is ... amazingly slower. The problem is that there really aren't very good alternatives to the unaligned loads, and so our vector codegen regresses significantly. The other issue is that there are plenty of AMD CPUs with AVX1 that don't set FeatureSlowUAMem32 and so we shouldn't just check for AVX2 instead of this special feature. =/ It would be nice to have the target attribute logic be able to enable/disable more than just one feature at a time and control this in a more fine-grained and useful way, but that doesn't seem easy. Given that it is only Sandybridge and Ivybridge that set this feature, for now I'm just backing it out of the generic CPU. That has the additional advantage of going back to the previous state that people seemed vaguely happy with. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@311740 91177308-0d34-0410-b5e6-96231b3b80d8
2025-02-19 10:15:00 +00:00 · 2017-08-25 00:56:05 +00:00 · 2017-08-25 00:56:05 +00:00 · 31f4977889
commit 31f4977889
parent c2588cb4b1
3 changed files with 7 additions and 15 deletions
--- a/lib/Target/X86/X86.td
+++ b/lib/Target/X86/X86.td
@ -874,8 +874,7 @@ def : ProcessorModel<"x86-64", SandyBridgeModel, [
  Feature64Bit,
  FeatureSlow3OpsLEA,
  FeatureSlowBTMem,
-  FeatureSlowIncDec,
-  FeatureSlowUAMem32
+  FeatureSlowIncDec
 ]>;

 //===----------------------------------------------------------------------===//
--- a/test/CodeGen/X86/avx-schedule.ll
+++ b/test/CodeGen/X86/avx-schedule.ll
@ -752,9 +752,7 @@ define <8 x float> @test_cvtdq2ps(<8 x i32> %a0, <8 x i32> *%a1) {
 ; GENERIC-LABEL: test_cvtdq2ps:
 ; GENERIC:       # BB#0:
 ; GENERIC-NEXT:    vcvtdq2ps %ymm0, %ymm0 # sched: [3:1.00]
-; GENERIC-NEXT:    vmovaps (%rdi), %xmm1 # sched: [6:0.50]
-; GENERIC-NEXT:    vinsertf128 $1, 16(%rdi), %ymm1, %ymm1 # sched: [7:0.50]
-; GENERIC-NEXT:    vcvtdq2ps %ymm1, %ymm1 # sched: [3:1.00]
+; GENERIC-NEXT:    vcvtdq2ps (%rdi), %ymm1 # sched: [10:1.00]
 ; GENERIC-NEXT:    vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@ -1956,11 +1954,9 @@ define <8 x float> @test_movsldup(<8 x float> %a0, <8 x float> *%a1) {
 define <4 x double> @test_movupd(<4 x double> *%a0, <4 x double> *%a1) {
 ; GENERIC-LABEL: test_movupd:
 ; GENERIC:       # BB#0:
-; GENERIC-NEXT:    vmovups (%rdi), %xmm0 # sched: [6:0.50]
-; GENERIC-NEXT:    vinsertf128 $1, 16(%rdi), %ymm0, %ymm0 # sched: [7:0.50]
+; GENERIC-NEXT:    vmovupd (%rdi), %ymm0 # sched: [7:0.50]
 ; GENERIC-NEXT:    vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
-; GENERIC-NEXT:    vextractf128 $1, %ymm0, 16(%rsi) # sched: [5:1.00]
-; GENERIC-NEXT:    vmovupd %xmm0, (%rsi) # sched: [5:1.00]
+; GENERIC-NEXT:    vmovupd %ymm0, (%rsi) # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SANDY-LABEL: test_movupd:
@ -2001,11 +1997,9 @@ define <4 x double> @test_movupd(<4 x double> *%a0, <4 x double> *%a1) {
 define <8 x float> @test_movups(<8 x float> *%a0, <8 x float> *%a1) {
 ; GENERIC-LABEL: test_movups:
 ; GENERIC:       # BB#0:
-; GENERIC-NEXT:    vmovups (%rdi), %xmm0 # sched: [6:0.50]
-; GENERIC-NEXT:    vinsertf128 $1, 16(%rdi), %ymm0, %ymm0 # sched: [7:0.50]
+; GENERIC-NEXT:    vmovups (%rdi), %ymm0 # sched: [7:0.50]
 ; GENERIC-NEXT:    vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
-; GENERIC-NEXT:    vextractf128 $1, %ymm0, 16(%rsi) # sched: [5:1.00]
-; GENERIC-NEXT:    vmovups %xmm0, (%rsi) # sched: [5:1.00]
+; GENERIC-NEXT:    vmovups %ymm0, (%rsi) # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SANDY-LABEL: test_movups:
--- a/test/CodeGen/X86/vector-shuffle-512-v16.ll
+++ b/test/CodeGen/X86/vector-shuffle-512-v16.ll
@ -299,8 +299,7 @@ define <4 x i32> @test_v16i32_0_1_2_12 (<16 x i32> %v) {
 define <8 x float> @shuffle_v16f32_extract_256(float* %RET, float* %a) {
 ; ALL-LABEL: shuffle_v16f32_extract_256:
 ; ALL:       # BB#0:
-; ALL-NEXT:    vmovups 32(%rsi), %xmm0
-; ALL-NEXT:    vinsertf128 $1, 48(%rsi), %ymm0, %ymm0
+; ALL-NEXT:    vmovups 32(%rsi), %ymm0
 ; ALL-NEXT:    retq
  %ptr_a = bitcast float* %a to <16 x float>*
  %v_a = load <16 x float>, <16 x float>* %ptr_a, align 4