mirror of
https://github.com/RPCS3/llvm.git
synced 2024-12-03 01:12:53 +00:00
[X86] Enable the post-RA-scheduler for clang's default 32-bit cpu.
For compilations with no explicit cpu specified, this exhibits nice gains on Silvermont, with neutral performance on big cores. Differential Revision: http://reviews.llvm.org/D19138 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@267809 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent
0d974f894f
commit
3fdd252bbd
@ -276,12 +276,28 @@ def : Proc<"pentium3", [FeatureX87, FeatureSlowUAMem16, FeatureMMX,
|
||||
FeatureSSE1, FeatureFXSR]>;
|
||||
def : Proc<"pentium3m", [FeatureX87, FeatureSlowUAMem16, FeatureMMX,
|
||||
FeatureSSE1, FeatureFXSR, FeatureSlowBTMem]>;
|
||||
def : Proc<"pentium-m", [FeatureX87, FeatureSlowUAMem16, FeatureMMX,
|
||||
FeatureSSE2, FeatureFXSR, FeatureSlowBTMem]>;
|
||||
def : Proc<"pentium4", [FeatureX87, FeatureSlowUAMem16, FeatureMMX,
|
||||
FeatureSSE2, FeatureFXSR]>;
|
||||
def : Proc<"pentium4m", [FeatureX87, FeatureSlowUAMem16, FeatureMMX,
|
||||
FeatureSSE2, FeatureFXSR, FeatureSlowBTMem]>;
|
||||
|
||||
// Enable the PostRAScheduler for SSE2 and SSE3 class cpus.
|
||||
// The intent is to enable it for pentium4 which is the current default
|
||||
// processor in a vanilla 32-bit clang compilation when no specific
|
||||
// architecture is specified. This generally gives a nice performance
|
||||
// increase on silvermont, with largely neutral behavior on other
|
||||
// contemporary large core processors.
|
||||
// pentium-m, pentium4m, prescott and nocona are included as a preventative
|
||||
// measure to avoid performance surprises, in case clang's default cpu
|
||||
// changes slightly.
|
||||
|
||||
def : ProcessorModel<"pentium-m", GenericPostRAModel,
|
||||
[FeatureX87, FeatureSlowUAMem16, FeatureMMX,
|
||||
FeatureSSE2, FeatureFXSR, FeatureSlowBTMem]>;
|
||||
|
||||
def : ProcessorModel<"pentium4", GenericPostRAModel,
|
||||
[FeatureX87, FeatureSlowUAMem16, FeatureMMX,
|
||||
FeatureSSE2, FeatureFXSR]>;
|
||||
|
||||
def : ProcessorModel<"pentium4m", GenericPostRAModel,
|
||||
[FeatureX87, FeatureSlowUAMem16, FeatureMMX,
|
||||
FeatureSSE2, FeatureFXSR, FeatureSlowBTMem]>;
|
||||
|
||||
// Intel Quark.
|
||||
def : Proc<"lakemont", []>;
|
||||
@ -292,10 +308,10 @@ def : ProcessorModel<"yonah", SandyBridgeModel,
|
||||
FeatureFXSR, FeatureSlowBTMem]>;
|
||||
|
||||
// NetBurst.
|
||||
def : Proc<"prescott",
|
||||
[FeatureX87, FeatureSlowUAMem16, FeatureMMX, FeatureSSE3,
|
||||
FeatureFXSR, FeatureSlowBTMem]>;
|
||||
def : Proc<"nocona", [
|
||||
def : ProcessorModel<"prescott", GenericPostRAModel,
|
||||
[FeatureX87, FeatureSlowUAMem16, FeatureMMX, FeatureSSE3,
|
||||
FeatureFXSR, FeatureSlowBTMem]>;
|
||||
def : ProcessorModel<"nocona", GenericPostRAModel, [
|
||||
FeatureX87,
|
||||
FeatureSlowUAMem16,
|
||||
FeatureMMX,
|
||||
|
@ -633,8 +633,9 @@ def IIC_NOP : InstrItinClass;
|
||||
// latencies. Since these latencies are not used for pipeline hazards,
|
||||
// they do not need to be exact.
|
||||
//
|
||||
// The GenericModel contains no instruction itineraries.
|
||||
def GenericModel : SchedMachineModel {
|
||||
// The GenericX86Model contains no instruction itineraries
|
||||
// and disables PostRAScheduler.
|
||||
class GenericX86Model : SchedMachineModel {
|
||||
let IssueWidth = 4;
|
||||
let MicroOpBufferSize = 32;
|
||||
let LoadLatency = 4;
|
||||
@ -643,6 +644,13 @@ def GenericModel : SchedMachineModel {
|
||||
let CompleteModel = 0;
|
||||
}
|
||||
|
||||
def GenericModel : GenericX86Model;
|
||||
|
||||
// Define a model with the PostRAScheduler enabled.
|
||||
def GenericPostRAModel : GenericX86Model {
|
||||
let PostRAScheduler = 1;
|
||||
}
|
||||
|
||||
include "X86ScheduleAtom.td"
|
||||
include "X86SchedSandyBridge.td"
|
||||
include "X86SchedHaswell.td"
|
||||
|
40
test/CodeGen/X86/post-ra-sched.ll
Normal file
40
test/CodeGen/X86/post-ra-sched.ll
Normal file
@ -0,0 +1,40 @@
|
||||
; RUN: llc < %s -mtriple=i386 -mcpu=pentium4 | FileCheck %s
|
||||
; RUN: llc < %s -mtriple=i386 -mcpu=pentium4m | FileCheck %s
|
||||
; RUN: llc < %s -mtriple=i386 -mcpu=pentium-m | FileCheck %s
|
||||
; RUN: llc < %s -mtriple=i386 -mcpu=prescott | FileCheck %s
|
||||
; RUN: llc < %s -mtriple=i386 -mcpu=nocona | FileCheck %s
|
||||
;
|
||||
; Verify that scheduling puts some distance between a load feeding into
|
||||
; the address of another load, and that second load. This currently
|
||||
; happens during the post-RA-scheduler, which should be enabled by
|
||||
; default with the above specified cpus.
|
||||
|
||||
@ptrs = external global [0 x i32*], align 4
|
||||
@idxa = common global i32 0, align 4
|
||||
@idxb = common global i32 0, align 4
|
||||
@res = common global i32 0, align 4
|
||||
|
||||
define void @addindirect() {
|
||||
; CHECK-LABEL: addindirect:
|
||||
; CHECK: # BB#0: # %entry
|
||||
; CHECK-NEXT: movl idxb, %ecx
|
||||
; CHECK-NEXT: movl idxa, %eax
|
||||
; CHECK-NEXT: movl ptrs(,%ecx,4), %ecx
|
||||
; CHECK-NEXT: movl ptrs(,%eax,4), %eax
|
||||
; CHECK-NEXT: movl (%ecx), %ecx
|
||||
; CHECK-NEXT: addl (%eax), %ecx
|
||||
; CHECK-NEXT: movl %ecx, res
|
||||
; CHECK-NEXT: retl
|
||||
entry:
|
||||
%0 = load i32, i32* @idxa, align 4
|
||||
%arrayidx = getelementptr inbounds [0 x i32*], [0 x i32*]* @ptrs, i32 0, i32 %0
|
||||
%1 = load i32*, i32** %arrayidx, align 4
|
||||
%2 = load i32, i32* %1, align 4
|
||||
%3 = load i32, i32* @idxb, align 4
|
||||
%arrayidx1 = getelementptr inbounds [0 x i32*], [0 x i32*]* @ptrs, i32 0, i32 %3
|
||||
%4 = load i32*, i32** %arrayidx1, align 4
|
||||
%5 = load i32, i32* %4, align 4
|
||||
%add = add i32 %5, %2
|
||||
store i32 %add, i32* @res, align 4
|
||||
ret void
|
||||
}
|
Loading…
Reference in New Issue
Block a user