[X86] Enable the post-RA-scheduler for clang's default 32-bit cpu.

For compilations with no explicit cpu specified, this exhibits nice gains on
Silvermont, with neutral performance on big cores.

Differential Revision: http://reviews.llvm.org/D19138

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@267809 91177308-0d34-0410-b5e6-96231b3b80d8
2024-12-03 01:12:53 +00:00 · 2016-04-27 22:52:35 +00:00 · 2016-04-27 22:52:35 +00:00 · 3fdd252bbd
commit 3fdd252bbd
parent 0d974f894f
3 changed files with 76 additions and 12 deletions
--- a/lib/Target/X86/X86.td
+++ b/lib/Target/X86/X86.td
@@ -276,12 +276,28 @@ def : Proc<"pentium3",        [FeatureX87, FeatureSlowUAMem16, FeatureMMX,
                               FeatureSSE1, FeatureFXSR]>;
 def : Proc<"pentium3m",       [FeatureX87, FeatureSlowUAMem16, FeatureMMX,
                               FeatureSSE1, FeatureFXSR, FeatureSlowBTMem]>;
-def : Proc<"pentium-m",       [FeatureX87, FeatureSlowUAMem16, FeatureMMX,
-                               FeatureSSE2, FeatureFXSR, FeatureSlowBTMem]>;
-def : Proc<"pentium4",        [FeatureX87, FeatureSlowUAMem16, FeatureMMX,
-                               FeatureSSE2, FeatureFXSR]>;
-def : Proc<"pentium4m",       [FeatureX87, FeatureSlowUAMem16, FeatureMMX,
-                               FeatureSSE2, FeatureFXSR, FeatureSlowBTMem]>;
+
+// Enable the PostRAScheduler for SSE2 and SSE3 class cpus.
+// The intent is to enable it for pentium4 which is the current default
+// processor in a vanilla 32-bit clang compilation when no specific
+// architecture is specified.  This generally gives a nice performance
+// increase on silvermont, with largely neutral behavior on other
+// contemporary large core processors.
+// pentium-m, pentium4m, prescott and nocona are included as a preventative
+// measure to avoid performance surprises, in case clang's default cpu
+// changes slightly.
+
+def : ProcessorModel<"pentium-m", GenericPostRAModel,
+                     [FeatureX87, FeatureSlowUAMem16, FeatureMMX,
+                      FeatureSSE2, FeatureFXSR, FeatureSlowBTMem]>;
+
+def : ProcessorModel<"pentium4", GenericPostRAModel,
+                     [FeatureX87, FeatureSlowUAMem16, FeatureMMX,
+                      FeatureSSE2, FeatureFXSR]>;
+
+def : ProcessorModel<"pentium4m", GenericPostRAModel,
+                     [FeatureX87, FeatureSlowUAMem16, FeatureMMX,
+                      FeatureSSE2, FeatureFXSR, FeatureSlowBTMem]>;

 // Intel Quark.
 def : Proc<"lakemont",        []>;
@@ -292,10 +308,10 @@ def : ProcessorModel<"yonah", SandyBridgeModel,
                      FeatureFXSR, FeatureSlowBTMem]>;

 // NetBurst.
-def : Proc<"prescott",
-           [FeatureX87, FeatureSlowUAMem16, FeatureMMX, FeatureSSE3,
-            FeatureFXSR, FeatureSlowBTMem]>;
-def : Proc<"nocona", [
+def : ProcessorModel<"prescott", GenericPostRAModel,
+                     [FeatureX87, FeatureSlowUAMem16, FeatureMMX, FeatureSSE3,
+                      FeatureFXSR, FeatureSlowBTMem]>;
+def : ProcessorModel<"nocona", GenericPostRAModel, [
  FeatureX87,
  FeatureSlowUAMem16,
  FeatureMMX,
--- a/lib/Target/X86/X86Schedule.td
+++ b/lib/Target/X86/X86Schedule.td
@@ -633,8 +633,9 @@ def IIC_NOP : InstrItinClass;
 // latencies. Since these latencies are not used for pipeline hazards,
 // they do not need to be exact.
 //
-// The GenericModel contains no instruction itineraries.
-def GenericModel : SchedMachineModel {
+// The GenericX86Model contains no instruction itineraries
+// and disables PostRAScheduler.
+class GenericX86Model : SchedMachineModel {
  let IssueWidth = 4;
  let MicroOpBufferSize = 32;
  let LoadLatency = 4;
@@ -643,6 +644,13 @@ def GenericModel : SchedMachineModel {
  let CompleteModel = 0;
 }

+def GenericModel : GenericX86Model;
+
+// Define a model with the PostRAScheduler enabled.
+def GenericPostRAModel : GenericX86Model {
+  let PostRAScheduler = 1;
+}
+
 include "X86ScheduleAtom.td"
 include "X86SchedSandyBridge.td"
 include "X86SchedHaswell.td"
--- a/test/CodeGen/X86/post-ra-sched.ll
+++ b/test/CodeGen/X86/post-ra-sched.ll
@@ -0,0 +1,40 @@
+; RUN: llc < %s -mtriple=i386 -mcpu=pentium4 | FileCheck %s
+; RUN: llc < %s -mtriple=i386 -mcpu=pentium4m | FileCheck %s
+; RUN: llc < %s -mtriple=i386 -mcpu=pentium-m | FileCheck %s
+; RUN: llc < %s -mtriple=i386 -mcpu=prescott | FileCheck %s
+; RUN: llc < %s -mtriple=i386 -mcpu=nocona | FileCheck %s
+;
+; Verify that scheduling puts some distance between a load feeding into
+; the address of another load, and that second load.  This currently
+; happens during the post-RA-scheduler, which should be enabled by
+; default with the above specified cpus.
+
+@ptrs = external global [0 x i32*], align 4
+@idxa = common global i32 0, align 4
+@idxb = common global i32 0, align 4
+@res = common global i32 0, align 4
+
+define void @addindirect() {
+; CHECK-LABEL: addindirect:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    movl idxb, %ecx
+; CHECK-NEXT:    movl idxa, %eax
+; CHECK-NEXT:    movl ptrs(,%ecx,4), %ecx
+; CHECK-NEXT:    movl ptrs(,%eax,4), %eax
+; CHECK-NEXT:    movl (%ecx), %ecx
+; CHECK-NEXT:    addl (%eax), %ecx
+; CHECK-NEXT:    movl %ecx, res
+; CHECK-NEXT:    retl
+entry:
+  %0 = load i32, i32* @idxa, align 4
+  %arrayidx = getelementptr inbounds [0 x i32*], [0 x i32*]* @ptrs, i32 0, i32 %0
+  %1 = load i32*, i32** %arrayidx, align 4
+  %2 = load i32, i32* %1, align 4
+  %3 = load i32, i32* @idxb, align 4
+  %arrayidx1 = getelementptr inbounds [0 x i32*], [0 x i32*]* @ptrs, i32 0, i32 %3
+  %4 = load i32*, i32** %arrayidx1, align 4
+  %5 = load i32, i32* %4, align 4
+  %add = add i32 %5, %2
+  store i32 %add, i32* @res, align 4
+  ret void
+}