mirror of
https://github.com/RPCS3/llvm.git
synced 2025-01-27 21:53:56 +00:00
On recent Intel microarchitectures, folding loads into some unary SSE
instructions can be non-optimal. To be precise, we should avoid folding loads if
the instruction only updates part of the destination register and the
non-updated part is not needed, e.g. cvtss2sd, sqrtss. Unfolding the load from
these instructions breaks the partial register dependency and can improve
performance, e.g.

    movss (%rdi), %xmm0
    cvtss2sd %xmm0, %xmm0

instead of

    cvtss2sd (%rdi), %xmm0

An alternative method to break the dependency is to clear the register first, e.g.

    xorps %xmm0, %xmm0
    cvtss2sd (%rdi), %xmm0

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@91672 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent
3a5d409f3c
commit
400073d546
@ -57,6 +57,8 @@ def Feature64Bit : SubtargetFeature<"64bit", "HasX86_64", "true",
|
||||
"Support 64-bit instructions">;
|
||||
def FeatureSlowBTMem : SubtargetFeature<"slow-bt-mem", "IsBTMemSlow", "true",
|
||||
"Bit testing of memory is slow">;
|
||||
def FeatureBreakSSEDep : SubtargetFeature<"break-sse-dep", "BreakSSEDep","true",
|
||||
"Should break SSE partial update dep with load / xorps">;
|
||||
def FeatureSSE4A : SubtargetFeature<"sse4a", "HasSSE4A", "true",
|
||||
"Support SSE 4a instructions">;
|
||||
|
||||
@ -86,17 +88,27 @@ def : Proc<"pentium2", [FeatureMMX, FeatureCMOV]>;
|
||||
def : Proc<"pentium3", [FeatureSSE1]>;
|
||||
def : Proc<"pentium-m", [FeatureSSE2, FeatureSlowBTMem]>;
|
||||
def : Proc<"pentium4", [FeatureSSE2]>;
|
||||
def : Proc<"x86-64", [FeatureSSE2, Feature64Bit, FeatureSlowBTMem]>;
|
||||
def : Proc<"yonah", [FeatureSSE3, FeatureSlowBTMem]>;
|
||||
def : Proc<"prescott", [FeatureSSE3, FeatureSlowBTMem]>;
|
||||
def : Proc<"nocona", [FeatureSSE3, Feature64Bit, FeatureSlowBTMem]>;
|
||||
def : Proc<"core2", [FeatureSSSE3, Feature64Bit, FeatureSlowBTMem]>;
|
||||
def : Proc<"penryn", [FeatureSSE41, Feature64Bit, FeatureSlowBTMem]>;
|
||||
def : Proc<"atom", [FeatureSSE3, Feature64Bit, FeatureSlowBTMem]>;
|
||||
def : Proc<"corei7", [FeatureSSE42, Feature64Bit, FeatureSlowBTMem]>;
|
||||
def : Proc<"nehalem", [FeatureSSE42, Feature64Bit, FeatureSlowBTMem]>;
|
||||
def : Proc<"x86-64", [FeatureSSE2, Feature64Bit, FeatureSlowBTMem,
|
||||
FeatureBreakSSEDep]>;
|
||||
def : Proc<"yonah", [FeatureSSE3, FeatureSlowBTMem,
|
||||
FeatureBreakSSEDep]>;
|
||||
def : Proc<"prescott", [FeatureSSE3, FeatureSlowBTMem,
|
||||
FeatureBreakSSEDep]>;
|
||||
def : Proc<"nocona", [FeatureSSE3, Feature64Bit, FeatureSlowBTMem,
|
||||
FeatureBreakSSEDep]>;
|
||||
def : Proc<"core2", [FeatureSSSE3, Feature64Bit, FeatureSlowBTMem,
|
||||
FeatureBreakSSEDep]>;
|
||||
def : Proc<"penryn", [FeatureSSE41, Feature64Bit, FeatureSlowBTMem,
|
||||
FeatureBreakSSEDep]>;
|
||||
def : Proc<"atom", [FeatureSSE3, Feature64Bit, FeatureSlowBTMem,
|
||||
FeatureBreakSSEDep]>;
|
||||
def : Proc<"corei7", [FeatureSSE42, Feature64Bit, FeatureSlowBTMem,
|
||||
FeatureBreakSSEDep]>;
|
||||
def : Proc<"nehalem", [FeatureSSE42, Feature64Bit, FeatureSlowBTMem,
|
||||
FeatureBreakSSEDep]>;
|
||||
// Sandy Bridge does not have FMA
|
||||
def : Proc<"sandybridge", [FeatureSSE42, FeatureAVX, Feature64Bit]>;
|
||||
def : Proc<"sandybridge", [FeatureSSE42, FeatureAVX, Feature64Bit,
|
||||
FeatureBreakSSEDep]>;
|
||||
|
||||
def : Proc<"k6", [FeatureMMX]>;
|
||||
def : Proc<"k6-2", [FeatureMMX, Feature3DNow]>;
|
||||
|
@ -2370,6 +2370,23 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
|
||||
// Check switch flag
|
||||
if (NoFusing) return NULL;
|
||||
|
||||
if (TM.getSubtarget<X86Subtarget>().shouldBreakSSEDep())
|
||||
switch (MI->getOpcode()) {
|
||||
case X86::CVTSD2SSrr:
|
||||
case X86::Int_CVTSD2SSrr:
|
||||
case X86::CVTSS2SDrr:
|
||||
case X86::Int_CVTSS2SDrr:
|
||||
case X86::RCPSSr:
|
||||
case X86::RCPSSr_Int:
|
||||
case X86::ROUNDSDr_Int:
|
||||
case X86::ROUNDSSr_Int:
|
||||
case X86::RSQRTSSr:
|
||||
case X86::RSQRTSSr_Int:
|
||||
case X86::SQRTSSr:
|
||||
case X86::SQRTSSr_Int:
|
||||
return 0;
|
||||
}
|
||||
|
||||
const MachineFrameInfo *MFI = MF.getFrameInfo();
|
||||
unsigned Size = MFI->getObjectSize(FrameIndex);
|
||||
unsigned Alignment = MFI->getObjectAlignment(FrameIndex);
|
||||
@ -2405,6 +2422,23 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
|
||||
// Check switch flag
|
||||
if (NoFusing) return NULL;
|
||||
|
||||
if (TM.getSubtarget<X86Subtarget>().shouldBreakSSEDep())
|
||||
switch (MI->getOpcode()) {
|
||||
case X86::CVTSD2SSrr:
|
||||
case X86::Int_CVTSD2SSrr:
|
||||
case X86::CVTSS2SDrr:
|
||||
case X86::Int_CVTSS2SDrr:
|
||||
case X86::RCPSSr:
|
||||
case X86::RCPSSr_Int:
|
||||
case X86::ROUNDSDr_Int:
|
||||
case X86::ROUNDSSr_Int:
|
||||
case X86::RSQRTSSr:
|
||||
case X86::RSQRTSSr_Int:
|
||||
case X86::SQRTSSr:
|
||||
case X86::SQRTSSr_Int:
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Determine the alignment of the load.
|
||||
unsigned Alignment = 0;
|
||||
if (LoadMI->hasOneMemOperand())
|
||||
|
@ -301,6 +301,8 @@ def IsStatic : Predicate<"TM.getRelocationModel() == Reloc::Static">;
|
||||
def OptForSpeed : Predicate<"!OptForSize">;
|
||||
def FastBTMem : Predicate<"!Subtarget->isBTMemSlow()">;
|
||||
def CallImmAddr : Predicate<"Subtarget->IsLegalToCallImmediateAddr(TM)">;
|
||||
def SSEBreakDep : Predicate<"Subtarget->shouldBreakSSEDep() && !OptForSize">;
|
||||
def NoSSEBreakDep: Predicate<"!Subtarget->shouldBreakSSEDep() || OptForSize">;
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// X86 Instruction Format Definitions.
|
||||
|
@ -824,9 +824,10 @@ multiclass sse1_fp_unop_rm<bits<8> opc, string OpcodeStr,
|
||||
}
|
||||
|
||||
// Scalar operation, mem.
|
||||
def SSm : SSI<opc, MRMSrcMem, (outs FR32:$dst), (ins f32mem:$src),
|
||||
def SSm : I<opc, MRMSrcMem, (outs FR32:$dst), (ins f32mem:$src),
|
||||
!strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
|
||||
[(set FR32:$dst, (OpNode (load addr:$src)))]>;
|
||||
[(set FR32:$dst, (OpNode (load addr:$src)))]>, XS,
|
||||
Requires<[HasSSE1, NoSSEBreakDep]>;
|
||||
|
||||
// Vector operation, reg.
|
||||
def PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
|
||||
@ -1116,9 +1117,10 @@ def CVTTSD2SIrm : SDI<0x2C, MRMSrcMem, (outs GR32:$dst), (ins f64mem:$src),
|
||||
def CVTSD2SSrr : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src),
|
||||
"cvtsd2ss\t{$src, $dst|$dst, $src}",
|
||||
[(set FR32:$dst, (fround FR64:$src))]>;
|
||||
def CVTSD2SSrm : SDI<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src),
|
||||
def CVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src),
|
||||
"cvtsd2ss\t{$src, $dst|$dst, $src}",
|
||||
[(set FR32:$dst, (fround (loadf64 addr:$src)))]>;
|
||||
[(set FR32:$dst, (fround (loadf64 addr:$src)))]>, XD,
|
||||
Requires<[HasSSE2, NoSSEBreakDep]>;
|
||||
def CVTSI2SDrr : SDI<0x2A, MRMSrcReg, (outs FR64:$dst), (ins GR32:$src),
|
||||
"cvtsi2sd\t{$src, $dst|$dst, $src}",
|
||||
[(set FR64:$dst, (sint_to_fp GR32:$src))]>;
|
||||
@ -1155,7 +1157,10 @@ def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src),
|
||||
def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src),
|
||||
"cvtss2sd\t{$src, $dst|$dst, $src}",
|
||||
[(set FR64:$dst, (extloadf32 addr:$src))]>, XS,
|
||||
Requires<[HasSSE2]>;
|
||||
Requires<[HasSSE2, NoSSEBreakDep]>;
|
||||
|
||||
def : Pat<(extloadf32 addr:$src),
|
||||
(CVTSS2SDrr (MOVSSrm addr:$src))>, Requires<[SSEBreakDep]>;
|
||||
|
||||
// Match intrinsics which expect XMM operand(s).
|
||||
def Int_CVTSD2SIrr : SDI<0x2D, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src),
|
||||
@ -3220,13 +3225,14 @@ multiclass sse41_fp_unop_rm<bits<8> opcps, bits<8> opcpd,
|
||||
OpSize;
|
||||
|
||||
// Vector intrinsic operation, mem
|
||||
def PSm_Int : SS4AIi8<opcps, MRMSrcMem,
|
||||
def PSm_Int : Ii8<opcps, MRMSrcMem,
|
||||
(outs VR128:$dst), (ins f128mem:$src1, i32i8imm:$src2),
|
||||
!strconcat(OpcodeStr,
|
||||
"ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
|
||||
[(set VR128:$dst,
|
||||
(V4F32Int (memopv4f32 addr:$src1),imm:$src2))]>,
|
||||
OpSize;
|
||||
TA, OpSize,
|
||||
Requires<[HasSSE41, NoSSEBreakDep]>;
|
||||
|
||||
// Vector intrinsic operation, reg
|
||||
def PDr_Int : SS4AIi8<opcpd, MRMSrcReg,
|
||||
|
@ -266,6 +266,7 @@ void X86Subtarget::AutoDetectSubtargetFeatures() {
|
||||
unsigned Model = 0;
|
||||
DetectFamilyModel(EAX, Family, Model);
|
||||
IsBTMemSlow = IsAMD || (Family == 6 && Model >= 13);
|
||||
BreakSSEDep = IsIntel;
|
||||
|
||||
GetCpuIDAndInfo(0x80000001, &EAX, &EBX, &ECX, &EDX);
|
||||
HasX86_64 = (EDX >> 29) & 0x1;
|
||||
@ -286,6 +287,7 @@ X86Subtarget::X86Subtarget(const std::string &TT, const std::string &FS,
|
||||
, HasFMA3(false)
|
||||
, HasFMA4(false)
|
||||
, IsBTMemSlow(false)
|
||||
, BreakSSEDep(false)
|
||||
, DarwinVers(0)
|
||||
, stackAlignment(8)
|
||||
// FIXME: this is a known good value for Yonah. How about others?
|
||||
|
@ -77,6 +77,14 @@ protected:
|
||||
|
||||
/// IsBTMemSlow - True if BT (bit test) of memory instructions are slow.
|
||||
bool IsBTMemSlow;
|
||||
|
||||
/// BreakSSEDep - True if codegen should unfold load or insert xorps / pxor
|
||||
/// to break register dependency for a partial register update SSE
|
||||
/// instruction. This is needed for instructions such as CVTSS2SD which
|
||||
/// only update the lower part of the register, and the result of the updated
|
||||
/// part does not depend on the contents of the destination before the
|
||||
/// instruction, and the non-updated portion of the register is not used.
|
||||
bool BreakSSEDep;
|
||||
|
||||
/// DarwinVers - Nonzero if this is a darwin platform: the numeric
|
||||
/// version of the platform, e.g. 8 = 10.4 (Tiger), 9 = 10.5 (Leopard), etc.
|
||||
@ -142,6 +150,7 @@ public:
|
||||
bool hasFMA3() const { return HasFMA3; }
|
||||
bool hasFMA4() const { return HasFMA4; }
|
||||
bool isBTMemSlow() const { return IsBTMemSlow; }
|
||||
bool shouldBreakSSEDep() const { return BreakSSEDep; }
|
||||
|
||||
bool isTargetDarwin() const { return TargetType == isDarwin; }
|
||||
bool isTargetELF() const { return TargetType == isELF; }
|
||||
|
28
test/CodeGen/X86/break-sse-dep.ll
Normal file
28
test/CodeGen/X86/break-sse-dep.ll
Normal file
@ -0,0 +1,28 @@
|
||||
; RUN: llc < %s -march=x86-64 -mattr=+sse2,+break-sse-dep | FileCheck %s --check-prefix=YES
|
||||
; RUN: llc < %s -march=x86-64 -mattr=+sse2,-break-sse-dep | FileCheck %s --check-prefix=NO
|
||||
|
||||
define double @t1(float* nocapture %x) nounwind readonly ssp {
|
||||
entry:
|
||||
; YES: t1:
|
||||
; YES: movss (%rdi), %xmm0
|
||||
; YES: cvtss2sd %xmm0, %xmm0
|
||||
|
||||
; NO: t1:
|
||||
; NO: cvtss2sd (%rdi), %xmm0
|
||||
%0 = load float* %x, align 4
|
||||
%1 = fpext float %0 to double
|
||||
ret double %1
|
||||
}
|
||||
|
||||
define float @t2(double* nocapture %x) nounwind readonly ssp {
|
||||
entry:
|
||||
; YES: t2:
|
||||
; YES: movsd (%rdi), %xmm0
|
||||
; YES: cvtsd2ss %xmm0, %xmm0
|
||||
|
||||
; NO: t2:
|
||||
; NO: cvtsd2ss (%rdi), %xmm0
|
||||
%0 = load double* %x, align 8
|
||||
%1 = fptrunc double %0 to float
|
||||
ret float %1
|
||||
}
|
Loading…
x
Reference in New Issue
Block a user