Formalize the notion that AVX and SSE are non-overlapping extensions from the compiler's point of view. Per email discussion, we either want to always use VEX-prefixed instructions or never use them, and are taking "HasAVX" to mean "Always use VEX". Passing -mattr=-avx,+sse42 should serve to restore legacy SSE support when desirable.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@121439 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Nate Begeman 2010-12-10 00:26:57 +00:00
parent 1c952b9cc9
commit 2ea8ee7c76
8 changed files with 61 additions and 50 deletions

View File

@ -116,11 +116,11 @@ def : Proc<"nehalem", [FeatureSSE42, Feature64Bit, FeatureSlowBTMem,
FeatureFastUAMem]>;
// Westmere is a similar machine to nehalem with some additional features.
// Westmere is the corei3/i5/i7 path from nehalem to sandybridge
def : Proc<"westmere", [FeatureSSE42, Feature64Bit, FeatureSlowBTMem,
FeatureFastUAMem, FeatureAES]>;
// Sandy Bridge does not have FMA
// FIXME: Wikipedia says it does... it should have AES as well.
def : Proc<"sandybridge", [FeatureSSE42, FeatureAVX, Feature64Bit]>;
def : Proc<"westmere", [FeatureSSE42, Feature64Bit, FeatureSlowBTMem,
FeatureFastUAMem, FeatureAES, FeatureCLMUL]>;
// SSE is not listed here since llvm treats AVX as a reimplementation of SSE,
// rather than a superset.
def : Proc<"sandybridge", [FeatureAVX, FeatureAES, FeatureCLMUL, Feature64Bit]>;
def : Proc<"k6", [FeatureMMX]>;
def : Proc<"k6-2", [FeatureMMX, Feature3DNow]>;

View File

@ -61,7 +61,7 @@ def RetCC_X86_32_C : CallingConv<[
// weirdly; this is really the sse-regparm calling convention) in which
// case they use XMM0, otherwise it is the same as the common X86 calling
// conv.
CCIfInReg<CCIfSubtarget<"hasSSE2()",
CCIfInReg<CCIfSubtarget<"hasXMMInt()",
CCIfType<[f32, f64], CCAssignToReg<[XMM0,XMM1,XMM2]>>>>,
CCIfType<[f32,f64], CCAssignToReg<[ST0, ST1]>>,
CCDelegateTo<RetCC_X86Common>
@ -73,8 +73,8 @@ def RetCC_X86_32_Fast : CallingConv<[
// SSE2.
// This can happen when a float, 2 x float, or 3 x float vector is split by
// target lowering, and is returned in 1-3 sse regs.
CCIfType<[f32], CCIfSubtarget<"hasSSE2()", CCAssignToReg<[XMM0,XMM1,XMM2]>>>,
CCIfType<[f64], CCIfSubtarget<"hasSSE2()", CCAssignToReg<[XMM0,XMM1,XMM2]>>>,
CCIfType<[f32], CCIfSubtarget<"hasXMMInt()", CCAssignToReg<[XMM0,XMM1,XMM2]>>>,
CCIfType<[f64], CCIfSubtarget<"hasXMMInt()", CCAssignToReg<[XMM0,XMM1,XMM2]>>>,
// For integers, ECX can be used as an extra return register
CCIfType<[i8], CCAssignToReg<[AL, DL, CL]>>,
@ -163,12 +163,12 @@ def CC_X86_64_C : CallingConv<[
// registers on Darwin.
CCIfType<[x86mmx],
CCIfSubtarget<"isTargetDarwin()",
CCIfSubtarget<"hasSSE2()",
CCIfSubtarget<"hasXMMInt()",
CCPromoteToType<v2i64>>>>,
// The first 8 FP/Vector arguments are passed in XMM registers.
CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
CCIfSubtarget<"hasSSE1()",
CCIfSubtarget<"hasXMM()",
CCAssignToReg<[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7]>>>,
// The first 8 256-bit vector arguments are passed in YMM registers.
@ -245,7 +245,7 @@ def CC_X86_64_GHC : CallingConv<[
// Pass in STG registers: F1, F2, F3, F4, D1, D2
CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
CCIfSubtarget<"hasSSE1()",
CCIfSubtarget<"hasXMM()",
CCAssignToReg<[XMM1, XMM2, XMM3, XMM4, XMM5, XMM6]>>>
]>;
@ -263,7 +263,7 @@ def CC_X86_32_Common : CallingConv<[
// The first 3 float or double arguments, if marked 'inreg' and if the call
// is not a vararg call and if SSE2 is available, are passed in SSE registers.
CCIfNotVarArg<CCIfInReg<CCIfType<[f32,f64],
CCIfSubtarget<"hasSSE2()",
CCIfSubtarget<"hasXMMInt()",
CCAssignToReg<[XMM0,XMM1,XMM2]>>>>>,
// The first 3 __m64 (except for v1i64) vector arguments are passed in mmx
@ -362,7 +362,7 @@ def CC_X86_32_FastCC : CallingConv<[
// The first 3 float or double arguments, if the call is not a vararg
// call and if SSE2 is available, are passed in SSE registers.
CCIfNotVarArg<CCIfType<[f32,f64],
CCIfSubtarget<"hasSSE2()",
CCIfSubtarget<"hasXMMInt()",
CCAssignToReg<[XMM0,XMM1,XMM2]>>>>,
// Doubles get 8-byte slots that are 8-byte aligned.

View File

@ -81,8 +81,8 @@ static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) {
X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
: TargetLowering(TM, createTLOF(TM)) {
Subtarget = &TM.getSubtarget<X86Subtarget>();
X86ScalarSSEf64 = Subtarget->hasSSE2();
X86ScalarSSEf32 = Subtarget->hasSSE1();
X86ScalarSSEf64 = Subtarget->hasXMMInt();
X86ScalarSSEf32 = Subtarget->hasXMM();
X86StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP;
RegInfo = TM.getRegisterInfo();
@ -356,7 +356,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::SRL_PARTS , MVT::i64 , Custom);
}
if (Subtarget->hasSSE1())
if (Subtarget->hasXMM())
setOperationAction(ISD::PREFETCH , MVT::Other, Legal);
// We may not have a libcall for MEMBARRIER so we should lower this.
@ -664,7 +664,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::BITCAST, MVT::v2i32, Expand);
setOperationAction(ISD::BITCAST, MVT::v1i64, Expand);
if (!UseSoftFloat && Subtarget->hasSSE1()) {
if (!UseSoftFloat && Subtarget->hasXMM()) {
addRegisterClass(MVT::v4f32, X86::VR128RegisterClass);
setOperationAction(ISD::FADD, MVT::v4f32, Legal);
@ -681,7 +681,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::VSETCC, MVT::v4f32, Custom);
}
if (!UseSoftFloat && Subtarget->hasSSE2()) {
if (!UseSoftFloat && Subtarget->hasXMMInt()) {
addRegisterClass(MVT::v2f64, X86::VR128RegisterClass);
// FIXME: Unfortunately -soft-float and -no-implicit-float means XMM
@ -1043,7 +1043,7 @@ unsigned X86TargetLowering::getByValTypeAlignment(const Type *Ty) const {
}
unsigned Align = 4;
if (Subtarget->hasSSE1())
if (Subtarget->hasXMM())
getMaxByValAlign(Ty, Align);
return Align;
}
@ -1084,7 +1084,7 @@ X86TargetLowering::getOptimalMemOpType(uint64_t Size,
} else if (!MemcpyStrSrc && Size >= 8 &&
!Subtarget->is64Bit() &&
Subtarget->getStackAlignment() >= 8 &&
Subtarget->hasSSE2()) {
Subtarget->hasXMMInt()) {
// Do not use f64 to lower memcpy if source is string constant. It's
// better to use i32 to avoid the loads.
return MVT::f64;
@ -1272,14 +1272,14 @@ X86TargetLowering::LowerReturn(SDValue Chain,
// or SSE or MMX vectors.
if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
(Subtarget->is64Bit() && !Subtarget->hasSSE1())) {
(Subtarget->is64Bit() && !Subtarget->hasXMM())) {
report_fatal_error("SSE register return with SSE disabled");
}
// Likewise we can't return F64 values with SSE1 only. gcc does so, but
// llvm-gcc has never done it right and no one has noticed, so this
// should be OK for now.
if (ValVT == MVT::f64 &&
(Subtarget->is64Bit() && !Subtarget->hasSSE2()))
(Subtarget->is64Bit() && !Subtarget->hasXMMInt()))
report_fatal_error("SSE2 register return with SSE2 disabled");
// Returns in ST0/ST1 are handled specially: these are pushed as operands to
@ -1391,7 +1391,7 @@ X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
// If this is x86-64, and we disabled SSE, we can't return FP values
if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasXMM())) {
report_fatal_error("SSE register return with SSE disabled");
}
@ -1700,11 +1700,11 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
TotalNumIntRegs);
bool NoImplicitFloatOps = Fn->hasFnAttr(Attribute::NoImplicitFloat);
assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
assert(!(NumXMMRegs && !Subtarget->hasXMM()) &&
"SSE register cannot be used when SSE is disabled!");
assert(!(NumXMMRegs && UseSoftFloat && NoImplicitFloatOps) &&
"SSE register cannot be used when SSE is disabled!");
if (UseSoftFloat || NoImplicitFloatOps || !Subtarget->hasSSE1())
if (UseSoftFloat || NoImplicitFloatOps || !Subtarget->hasXMM())
// Kernel mode asks for SSE to be disabled, so don't push them
// on the stack.
TotalNumXMMRegs = 0;
@ -2055,7 +2055,7 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
};
unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
assert((Subtarget->hasSSE1() || !NumXMMRegs)
assert((Subtarget->hasXMM() || !NumXMMRegs)
&& "SSE registers cannot be used when SSE is disabled");
Chain = DAG.getCopyToReg(Chain, dl, X86::AL,
@ -7635,7 +7635,7 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
assert(!UseSoftFloat &&
!(DAG.getMachineFunction()
.getFunction()->hasFnAttr(Attribute::NoImplicitFloat)) &&
Subtarget->hasSSE1());
Subtarget->hasXMM());
}
// Insert VAARG_64 node into the DAG
@ -11689,7 +11689,7 @@ TargetLowering::ConstraintWeight
break;
case 'x':
case 'Y':
if ((type->getPrimitiveSizeInBits() == 128) && Subtarget->hasSSE1())
if ((type->getPrimitiveSizeInBits() == 128) && Subtarget->hasXMM())
weight = CW_Register;
break;
case 'I':
@ -11759,9 +11759,9 @@ LowerXConstraint(EVT ConstraintVT) const {
// FP X constraints get lowered to SSE1/2 registers if available, otherwise
// 'f' like normal targets.
if (ConstraintVT.isFloatingPoint()) {
if (Subtarget->hasSSE2())
if (Subtarget->hasXMMInt())
return "Y";
if (Subtarget->hasSSE1())
if (Subtarget->hasXMM())
return "x";
}
@ -11991,10 +11991,10 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
if (!Subtarget->hasMMX()) break;
return std::make_pair(0U, X86::VR64RegisterClass);
case 'Y': // SSE_REGS if SSE2 allowed
if (!Subtarget->hasSSE2()) break;
if (!Subtarget->hasXMMInt()) break;
// FALL THROUGH.
case 'x': // SSE_REGS if SSE1 allowed
if (!Subtarget->hasSSE1()) break;
if (!Subtarget->hasXMM()) break;
switch (VT.getSimpleVT().SimpleTy) {
default: break;

View File

@ -400,26 +400,26 @@ def tls64addr : ComplexPattern<i64, 5, "SelectTLSADDRAddr",
def HasCMov : Predicate<"Subtarget->hasCMov()">;
def NoCMov : Predicate<"!Subtarget->hasCMov()">;
// FIXME: temporary hack to let codegen assert or generate poor code in case
// no AVX version of the desired intructions is present, this is better for
// incremental dev (without fallbacks it's easier to spot what's missing)
def HasMMX : Predicate<"Subtarget->hasMMX() && !Subtarget->hasAVX()">;
def HasMMX : Predicate<"Subtarget->hasMMX()">;
def Has3DNow : Predicate<"Subtarget->has3DNow()">;
def Has3DNowA : Predicate<"Subtarget->has3DNowA()">;
def HasSSE1 : Predicate<"Subtarget->hasSSE1() && !Subtarget->hasAVX()">;
def HasSSE2 : Predicate<"Subtarget->hasSSE2() && !Subtarget->hasAVX()">;
def HasSSE3 : Predicate<"Subtarget->hasSSE3() && !Subtarget->hasAVX()">;
def HasSSSE3 : Predicate<"Subtarget->hasSSSE3() && !Subtarget->hasAVX()">;
def HasSSE41 : Predicate<"Subtarget->hasSSE41() && !Subtarget->hasAVX()">;
def HasSSE42 : Predicate<"Subtarget->hasSSE42() && !Subtarget->hasAVX()">;
def HasSSE4A : Predicate<"Subtarget->hasSSE4A() && !Subtarget->hasAVX()">;
def HasSSE1 : Predicate<"Subtarget->hasSSE1()">;
def HasSSE2 : Predicate<"Subtarget->hasSSE2()">;
def HasSSE3 : Predicate<"Subtarget->hasSSE3()">;
def HasSSSE3 : Predicate<"Subtarget->hasSSSE3()">;
def HasSSE41 : Predicate<"Subtarget->hasSSE41()">;
def HasSSE42 : Predicate<"Subtarget->hasSSE42()">;
def HasSSE4A : Predicate<"Subtarget->hasSSE4A()">;
def HasAVX : Predicate<"Subtarget->hasAVX()">;
def HasXMMInt : Predicate<"Subtarget->hasXMMInt()">;
def HasAES : Predicate<"Subtarget->hasAES()">;
def HasCLMUL : Predicate<"Subtarget->hasCLMUL()">;
def HasFMA3 : Predicate<"Subtarget->hasFMA3()">;
def HasFMA4 : Predicate<"Subtarget->hasFMA4()">;
def FPStackf32 : Predicate<"!Subtarget->hasSSE1()">;
def FPStackf64 : Predicate<"!Subtarget->hasSSE2()">;
def FPStackf32 : Predicate<"!Subtarget->hasXMM()">;
def FPStackf64 : Predicate<"!Subtarget->hasXMMInt()">;
def In32BitMode : Predicate<"!Subtarget->is64Bit()">, AssemblerPredicate;
def In64BitMode : Predicate<"Subtarget->is64Bit()">, AssemblerPredicate;
def IsWin64 : Predicate<"Subtarget->isTargetWin64()">;
@ -436,7 +436,6 @@ def OptForSize : Predicate<"OptForSize">;
def OptForSpeed : Predicate<"!OptForSize">;
def FastBTMem : Predicate<"!Subtarget->isBTMemSlow()">;
def CallImmAddr : Predicate<"Subtarget->IsLegalToCallImmediateAddr(TM)">;
def HasAES : Predicate<"Subtarget->hasAES()">;
//===----------------------------------------------------------------------===//
// X86 Instruction Format Definitions.

View File

@ -712,6 +712,8 @@ def VCVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst),
"vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[]>, XD, Requires<[HasAVX, OptForSize]>, VEX_4V;
}
def : Pat<(f32 (fround FR64:$src)), (VCVTSD2SSrr FR64:$src, FR64:$src)>;
def CVTSD2SSrr : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src),
"cvtsd2ss\t{$src, $dst|$dst, $src}",
[(set FR32:$dst, (fround FR64:$src))]>;
@ -739,6 +741,8 @@ def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst),
"vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[]>, XS, VEX_4V, Requires<[HasAVX, OptForSize]>;
}
def : Pat<(f64 (fextend FR32:$src)), (VCVTSS2SDrr FR32:$src, FR32:$src)>;
def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src),
"cvtss2sd\t{$src, $dst|$dst, $src}",
[(set FR64:$dst, (fextend FR32:$src))]>, XS,
@ -3680,7 +3684,7 @@ let Predicates = [HasSSE2] in
(CVTSS2SDrm addr:$src)>;
// bit_convert
let Predicates = [HasSSE2] in {
let Predicates = [HasXMMInt] in {
def : Pat<(v2i64 (bitconvert (v4i32 VR128:$src))), (v2i64 VR128:$src)>;
def : Pat<(v2i64 (bitconvert (v8i16 VR128:$src))), (v2i64 VR128:$src)>;
def : Pat<(v2i64 (bitconvert (v16i8 VR128:$src))), (v2i64 VR128:$src)>;
@ -3713,6 +3717,10 @@ let Predicates = [HasSSE2] in {
def : Pat<(v2f64 (bitconvert (v4f32 VR128:$src))), (v2f64 VR128:$src)>;
}
let Predicates = [HasAVX] in {
def : Pat<(v4f64 (bitconvert (v8f32 VR256:$src))), (v4f64 VR256:$src)>;
}
// Move scalar to XMM zero-extended
// movd to XMM register zero-extends
let AddedComplexity = 15 in {

View File

@ -256,13 +256,13 @@ void X86Subtarget::AutoDetectSubtargetFeatures() {
if ((ECX >> 9) & 1) X86SSELevel = SSSE3;
if ((ECX >> 19) & 1) X86SSELevel = SSE41;
if ((ECX >> 20) & 1) X86SSELevel = SSE42;
if ((ECX >> 28) & 1) { HasAVX = true; X86SSELevel = NoMMXSSE; }
bool IsIntel = memcmp(text.c, "GenuineIntel", 12) == 0;
bool IsAMD = !IsIntel && memcmp(text.c, "AuthenticAMD", 12) == 0;
HasCLMUL = IsIntel && ((ECX >> 1) & 0x1);
HasFMA3 = IsIntel && ((ECX >> 12) & 0x1);
HasAVX = ((ECX >> 28) & 0x1);
HasAES = IsIntel && ((ECX >> 25) & 0x1);
if (IsIntel || IsAMD) {
@ -316,11 +316,13 @@ X86Subtarget::X86Subtarget(const std::string &TT, const std::string &FS,
ParseSubtargetFeatures(FS, CPU);
// All X86-64 CPUs also have SSE2, however user might request no SSE via
// -mattr, so don't force SSELevel here.
if (HasAVX)
X86SSELevel = NoMMXSSE;
} else {
// Otherwise, use CPUID to auto-detect feature set.
AutoDetectSubtargetFeatures();
// Make sure SSE2 is enabled; it is available on all X86-64 CPUs.
if (Is64Bit && X86SSELevel < SSE2)
if (Is64Bit && !HasAVX && X86SSELevel < SSE2)
X86SSELevel = SSE2;
}

View File

@ -155,6 +155,8 @@ public:
bool has3DNowA() const { return X863DNowLevel >= ThreeDNowA; }
bool hasPOPCNT() const { return HasPOPCNT; }
bool hasAVX() const { return HasAVX; }
bool hasXMM() const { return hasSSE1() || hasAVX(); }
bool hasXMMInt() const { return hasSSE2() || hasAVX(); }
bool hasAES() const { return HasAES; }
bool hasCLMUL() const { return HasCLMUL; }
bool hasFMA3() const { return HasFMA3; }

View File

@ -4,7 +4,7 @@
define void @zero() nounwind ssp {
entry:
; CHECK: vpxor
; CHECK: vxorps
; CHECK: vmovaps
store <4 x float> zeroinitializer, <4 x float>* @z, align 16
ret void