SSE 4.1 Intrinsics and detection

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@46681 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Nate Begeman 2008-02-03 07:18:54 +00:00
parent fcd8e9e3a2
commit 63ec90a6a8
7 changed files with 269 additions and 1 deletions

View File

@ -673,6 +673,156 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem]>;
}
//===----------------------------------------------------------------------===//
// SSE4.1
// FP rounding ops.
// Each intrinsic lists a vector type twice followed by an i32 operand, and is
// marked IntrNoMem (it does not access memory).
let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
  // Scalar single precision.
  def int_x86_sse41_round_ss :
        GCCBuiltin<"__builtin_ia32_roundss">,
        Intrinsic<[llvm_v4f32_ty, llvm_v4f32_ty, llvm_i32_ty], [IntrNoMem]>;
  // Packed single precision.
  def int_x86_sse41_round_ps :
        GCCBuiltin<"__builtin_ia32_roundps">,
        Intrinsic<[llvm_v4f32_ty, llvm_v4f32_ty, llvm_i32_ty], [IntrNoMem]>;
  // Scalar double precision.
  def int_x86_sse41_round_sd :
        GCCBuiltin<"__builtin_ia32_roundsd">,
        Intrinsic<[llvm_v2f64_ty, llvm_v2f64_ty, llvm_i32_ty], [IntrNoMem]>;
  // Packed double precision.
  def int_x86_sse41_round_pd :
        GCCBuiltin<"__builtin_ia32_roundpd">,
        Intrinsic<[llvm_v2f64_ty, llvm_v2f64_ty, llvm_i32_ty], [IntrNoMem]>;
}
// Vector sign and zero extend.
// These are pure register-to-register conversions, so they are marked
// IntrNoMem like the other SSE4.1 intrinsics that do not touch memory; without
// a property list they would be treated as possibly accessing memory.
let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
  // Sign-extending forms.
  def int_x86_sse41_pmovsxbd : GCCBuiltin<"__builtin_ia32_pmovsxbd128">,
              Intrinsic<[llvm_v4i32_ty, llvm_v16i8_ty], [IntrNoMem]>;
  def int_x86_sse41_pmovsxbq : GCCBuiltin<"__builtin_ia32_pmovsxbq128">,
              Intrinsic<[llvm_v2i64_ty, llvm_v16i8_ty], [IntrNoMem]>;
  def int_x86_sse41_pmovsxbw : GCCBuiltin<"__builtin_ia32_pmovsxbw128">,
              Intrinsic<[llvm_v8i16_ty, llvm_v16i8_ty], [IntrNoMem]>;
  def int_x86_sse41_pmovsxdq : GCCBuiltin<"__builtin_ia32_pmovsxdq128">,
              Intrinsic<[llvm_v2i64_ty, llvm_v4i32_ty], [IntrNoMem]>;
  def int_x86_sse41_pmovsxwd : GCCBuiltin<"__builtin_ia32_pmovsxwd128">,
              Intrinsic<[llvm_v4i32_ty, llvm_v8i16_ty], [IntrNoMem]>;
  def int_x86_sse41_pmovsxwq : GCCBuiltin<"__builtin_ia32_pmovsxwq128">,
              Intrinsic<[llvm_v2i64_ty, llvm_v8i16_ty], [IntrNoMem]>;
  // Zero-extending forms.
  def int_x86_sse41_pmovzxbd : GCCBuiltin<"__builtin_ia32_pmovzxbd128">,
              Intrinsic<[llvm_v4i32_ty, llvm_v16i8_ty], [IntrNoMem]>;
  def int_x86_sse41_pmovzxbq : GCCBuiltin<"__builtin_ia32_pmovzxbq128">,
              Intrinsic<[llvm_v2i64_ty, llvm_v16i8_ty], [IntrNoMem]>;
  def int_x86_sse41_pmovzxbw : GCCBuiltin<"__builtin_ia32_pmovzxbw128">,
              Intrinsic<[llvm_v8i16_ty, llvm_v16i8_ty], [IntrNoMem]>;
  def int_x86_sse41_pmovzxdq : GCCBuiltin<"__builtin_ia32_pmovzxdq128">,
              Intrinsic<[llvm_v2i64_ty, llvm_v4i32_ty], [IntrNoMem]>;
  def int_x86_sse41_pmovzxwd : GCCBuiltin<"__builtin_ia32_pmovzxwd128">,
              Intrinsic<[llvm_v4i32_ty, llvm_v8i16_ty], [IntrNoMem]>;
  def int_x86_sse41_pmovzxwq : GCCBuiltin<"__builtin_ia32_pmovzxwq128">,
              Intrinsic<[llvm_v2i64_ty, llvm_v8i16_ty], [IntrNoMem]>;
}
// Vector min element.
// Operates on a v8i16 value only, so it is marked IntrNoMem; without a
// property list it would be treated as possibly accessing memory.
let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
  def int_x86_sse41_phminposuw : GCCBuiltin<"__builtin_ia32_phminposuw128">,
              Intrinsic<[llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
}
// Vector compare, min, max.
// All of these take two vectors of the same type and return a vector of that
// type. They are marked IntrNoMem because they only operate on their vector
// operands; without a property list they would be treated as possibly
// accessing memory.
let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
  // 64-bit element equality compare.
  def int_x86_sse41_pcmpeqq : GCCBuiltin<"__builtin_ia32_pcmpeqq">,
              Intrinsic<[llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty],
                        [IntrNoMem]>;
  // Maximum.
  def int_x86_sse41_pmaxsb : GCCBuiltin<"__builtin_ia32_pmaxsb128">,
              Intrinsic<[llvm_v16i8_ty, llvm_v16i8_ty, llvm_v16i8_ty],
                        [IntrNoMem]>;
  def int_x86_sse41_pmaxsd : GCCBuiltin<"__builtin_ia32_pmaxsd128">,
              Intrinsic<[llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty],
                        [IntrNoMem]>;
  def int_x86_sse41_pmaxud : GCCBuiltin<"__builtin_ia32_pmaxud128">,
              Intrinsic<[llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty],
                        [IntrNoMem]>;
  def int_x86_sse41_pmaxuw : GCCBuiltin<"__builtin_ia32_pmaxuw128">,
              Intrinsic<[llvm_v8i16_ty, llvm_v8i16_ty, llvm_v8i16_ty],
                        [IntrNoMem]>;
  // Minimum.
  def int_x86_sse41_pminsb : GCCBuiltin<"__builtin_ia32_pminsb128">,
              Intrinsic<[llvm_v16i8_ty, llvm_v16i8_ty, llvm_v16i8_ty],
                        [IntrNoMem]>;
  def int_x86_sse41_pminsd : GCCBuiltin<"__builtin_ia32_pminsd128">,
              Intrinsic<[llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty],
                        [IntrNoMem]>;
  def int_x86_sse41_pminud : GCCBuiltin<"__builtin_ia32_pminud128">,
              Intrinsic<[llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty],
                        [IntrNoMem]>;
  def int_x86_sse41_pminuw : GCCBuiltin<"__builtin_ia32_pminuw128">,
              Intrinsic<[llvm_v8i16_ty, llvm_v8i16_ty, llvm_v8i16_ty],
                        [IntrNoMem]>;
}
// Vector pack.
// Takes two v4i32 values and yields a v8i16; marked IntrNoMem because only the
// vector operands are involved (without a property list it would be treated as
// possibly accessing memory).
let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
  def int_x86_sse41_packusdw : GCCBuiltin<"__builtin_ia32_packusdw128">,
              Intrinsic<[llvm_v8i16_ty, llvm_v4i32_ty, llvm_v4i32_ty],
                        [IntrNoMem]>;
}
// Vector multiply.
// Both take two v4i32 values; pmuldq yields v2i64, pmulld yields v4i32.
// Marked IntrNoMem because only the vector operands are involved (without a
// property list they would be treated as possibly accessing memory).
let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
  def int_x86_sse41_pmuldq : GCCBuiltin<"__builtin_ia32_pmuldq128">,
              Intrinsic<[llvm_v2i64_ty, llvm_v4i32_ty, llvm_v4i32_ty],
                        [IntrNoMem]>;
  def int_x86_sse41_pmulld : GCCBuiltin<"__builtin_ia32_pmulld128">,
              Intrinsic<[llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty],
                        [IntrNoMem]>;
}
// Vector extract.
// Each takes a vector plus an i32 operand and yields a scalar integer.
// Marked IntrNoMem because these intrinsic forms have no pointer operand
// (without a property list they would be treated as possibly accessing
// memory).
let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
  def int_x86_sse41_pextrb : GCCBuiltin<"__builtin_ia32_vec_ext_v16qi">,
              Intrinsic<[llvm_i32_ty, llvm_v16i8_ty, llvm_i32_ty],
                        [IntrNoMem]>;
  def int_x86_sse41_pextrd : GCCBuiltin<"__builtin_ia32_vec_ext_v4si">,
              Intrinsic<[llvm_i32_ty, llvm_v4i32_ty, llvm_i32_ty],
                        [IntrNoMem]>;
  def int_x86_sse41_pextrq : GCCBuiltin<"__builtin_ia32_vec_ext_v2di">,
              Intrinsic<[llvm_i64_ty, llvm_v2i64_ty, llvm_i32_ty],
                        [IntrNoMem]>;
  // Note the i32 (not float) result type for the extracted v4f32 element.
  def int_x86_sse41_extractps : GCCBuiltin<"__builtin_ia32_extractps128">,
              Intrinsic<[llvm_i32_ty, llvm_v4f32_ty, llvm_i32_ty],
                        [IntrNoMem]>;
}
// Vector insert.
// Each takes a vector, a value to insert and an i32 operand, and yields a
// vector of the same type as the first operand. Marked IntrNoMem because
// these intrinsic forms have no pointer operand (without a property list they
// would be treated as possibly accessing memory).
let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
  def int_x86_sse41_pinsrb : GCCBuiltin<"__builtin_ia32_vec_set_v16qi">,
              Intrinsic<[llvm_v16i8_ty, llvm_v16i8_ty, llvm_i32_ty,
                         llvm_i32_ty], [IntrNoMem]>;
  def int_x86_sse41_pinsrd : GCCBuiltin<"__builtin_ia32_vec_set_v4si">,
              Intrinsic<[llvm_v4i32_ty, llvm_v4i32_ty, llvm_i32_ty,
                         llvm_i32_ty], [IntrNoMem]>;
  // The inserted value is i64 here; the trailing operand stays i32.
  def int_x86_sse41_pinsrq : GCCBuiltin<"__builtin_ia32_vec_set_v2di">,
              Intrinsic<[llvm_v2i64_ty, llvm_v2i64_ty, llvm_i64_ty,
                         llvm_i32_ty], [IntrNoMem]>;
  // The inserted value is itself a v4f32 vector.
  def int_x86_sse41_insertps : GCCBuiltin<"__builtin_ia32_insertps128">,
              Intrinsic<[llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty,
                         llvm_i32_ty], [IntrNoMem]>;
}
// Vector blend.
// Each takes two source vectors plus a selector operand: an i32 for
// pblendw/blendpd/blendps, a third vector for pblendvb/blendvpd/blendvps.
// Marked IntrNoMem because only the listed operands are involved (without a
// property list they would be treated as possibly accessing memory).
let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
  def int_x86_sse41_pblendvb : GCCBuiltin<"__builtin_ia32_pblendvb128">,
              Intrinsic<[llvm_v16i8_ty, llvm_v16i8_ty, llvm_v16i8_ty,
                         llvm_v16i8_ty], [IntrNoMem]>;
  def int_x86_sse41_pblendw : GCCBuiltin<"__builtin_ia32_pblendw128">,
              Intrinsic<[llvm_v8i16_ty, llvm_v8i16_ty, llvm_v8i16_ty,
                         llvm_i32_ty], [IntrNoMem]>;
  def int_x86_sse41_blendpd : GCCBuiltin<"__builtin_ia32_blendpd">,
              Intrinsic<[llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty,
                         llvm_i32_ty], [IntrNoMem]>;
  def int_x86_sse41_blendps : GCCBuiltin<"__builtin_ia32_blendps">,
              Intrinsic<[llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty,
                         llvm_i32_ty], [IntrNoMem]>;
  def int_x86_sse41_blendvpd : GCCBuiltin<"__builtin_ia32_blendvpd">,
              Intrinsic<[llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty,
                         llvm_v2f64_ty], [IntrNoMem]>;
  def int_x86_sse41_blendvps : GCCBuiltin<"__builtin_ia32_blendvps">,
              Intrinsic<[llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty,
                         llvm_v4f32_ty], [IntrNoMem]>;
}
// Vector dot product.
// Each takes two FP vectors plus an i32 operand and yields a vector of the
// same type. Marked IntrNoMem, matching the FP rounding intrinsics, because
// only the listed operands are involved (without a property list they would
// be treated as possibly accessing memory).
let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
  def int_x86_sse41_dppd : GCCBuiltin<"__builtin_ia32_dppd">,
              Intrinsic<[llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty,
                         llvm_i32_ty], [IntrNoMem]>;
  def int_x86_sse41_dpps : GCCBuiltin<"__builtin_ia32_dpps">,
              Intrinsic<[llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty,
                         llvm_i32_ty], [IntrNoMem]>;
}
// Vector sum of absolute differences.
// Takes two v16i8 vectors plus an i32 operand. Marked IntrNoMem because only
// the listed operands are involved (without a property list it would be
// treated as possibly accessing memory).
// NOTE(review): the result is typed v16i8 here; MPSADBW is documented as
// producing eight 16-bit sums, so confirm whether v8i16 is the intended
// result type before relying on element-wise use of the result.
let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
  def int_x86_sse41_mpsadbw : GCCBuiltin<"__builtin_ia32_mpsadbw128">,
              Intrinsic<[llvm_v16i8_ty, llvm_v16i8_ty, llvm_v16i8_ty,
                         llvm_i32_ty], [IntrNoMem]>;
}
// Streaming (non-temporal) load.
// Takes a pointer and yields a v2i64. The intrinsic only reads memory, so it
// is marked IntrReadMem rather than being left with the default (no property
// list), which would treat it as possibly writing memory.
let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
  def int_x86_sse41_movntdqa : GCCBuiltin<"__builtin_ia32_movntdqa">,
              Intrinsic<[llvm_v2i64_ty, llvm_ptr_ty], [IntrReadMem]>;
}
//===----------------------------------------------------------------------===//
// MMX

View File

@ -34,6 +34,12 @@ def FeatureSSE3 : SubtargetFeature<"sse3", "X86SSELevel", "SSE3",
// SSE levels above SSE3. Each one sets X86SSELevel to the named value and
// lists the previous level in its trailing feature list, forming the chain
// SSE3 -> SSSE3 -> SSE41 -> SSE42.
def FeatureSSSE3  : SubtargetFeature<"ssse3", "X86SSELevel", "SSSE3",
                                     "Enable SSSE3 instructions",
                                     [FeatureSSE3]>;
def FeatureSSE41  : SubtargetFeature<"sse41", "X86SSELevel", "SSE41",
                                     "Enable SSE 4.1 instructions",
                                     [FeatureSSSE3]>;
def FeatureSSE42  : SubtargetFeature<"sse42", "X86SSELevel", "SSE42",
                                     "Enable SSE 4.2 instructions",
                                     [FeatureSSE41]>;
// 3DNow! is tracked separately through X863DNowLevel.
def Feature3DNow  : SubtargetFeature<"3dnow", "X863DNowLevel", "ThreeDNow",
                                     "Enable 3DNow! instructions">;
def Feature3DNowA : SubtargetFeature<"3dnowa", "X863DNowLevel", "ThreeDNowA",
@ -66,6 +72,7 @@ def : Proc<"yonah", [FeatureSSE3]>;
// Processor name -> feature list. "penryn" is the entry that selects
// FeatureSSE41.
def : Proc<"prescott",  [FeatureSSE3]>;
def : Proc<"nocona",    [FeatureSSE3]>;
def : Proc<"core2",     [FeatureSSSE3]>;
def : Proc<"penryn",    [FeatureSSE41]>;
def : Proc<"k6",        [FeatureMMX]>;
def : Proc<"k6-2",      [FeatureMMX, Feature3DNow]>;

View File

@ -1266,3 +1266,13 @@ def MOVSDto64rr : RPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
// Store form: the pattern stores the FR64 source, reinterpreted via bitconvert
// as an i64, to the i64mem destination. There are no register outputs.
def MOVSDto64mr : RPDI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
"mov{d|q}\t{$src, $dst|$dst, $src}",
[(store (i64 (bitconvert FR64:$src)), addr:$dst)]>;
//===----------------------------------------------------------------------===//
// X86-64 SSE4.1 Instructions
//===----------------------------------------------------------------------===//
// PEXTRB, unary, TA, 0x14, REX.W
// PEXTRW, unary, TA, 0x15, REX.W
// PEXTRQ, unary, TA, 0x16, REX.W
// EXTRACTPS, unary, TA, 0x17, REX.W
// PINSRQ, 2addr, binary, TA, 0x22, REX.W

View File

@ -166,6 +166,8 @@ def HasSSE1 : Predicate<"Subtarget->hasSSE1()">;
// Subtarget predicates. Each wraps a condition string written against the
// Subtarget object; HasSSE41/HasSSE42 test the two SSE4 levels.
def HasSSE2      : Predicate<"Subtarget->hasSSE2()">;
def HasSSE3      : Predicate<"Subtarget->hasSSE3()">;
def HasSSSE3     : Predicate<"Subtarget->hasSSSE3()">;
def HasSSE41     : Predicate<"Subtarget->hasSSE41()">;
def HasSSE42     : Predicate<"Subtarget->hasSSE42()">;
// Negated SSE checks: these hold when SSE1 / SSE2 is NOT available.
def FPStackf32   : Predicate<"!Subtarget->hasSSE1()">;
def FPStackf64   : Predicate<"!Subtarget->hasSSE2()">;
// Holds when the subtarget is not 64-bit.
def In32BitMode  : Predicate<"!Subtarget->is64Bit()">;

View File

@ -3038,3 +3038,98 @@ def : Pat<(store (v8i16 VR128:$src), addr:$dst),
(MOVUPSmr addr:$dst, VR128:$src)>, Requires<[HasSSE2]>;
// A store of a v16i8 value held in VR128 is selected as MOVUPSmr; the pattern
// applies only when HasSSE2 holds.
def : Pat<(store (v16i8 VR128:$src), addr:$dst),
(MOVUPSmr addr:$dst, VR128:$src)>, Requires<[HasSSE2]>;
//===----------------------------------------------------------------------===//
// SSE4.1 Instructions
//===----------------------------------------------------------------------===//
// SSE4.1 Instruction Templates:
//
//   SS418I - SSE 4.1 instructions with T8 prefix.
//   SS41AI - SSE 4.1 instructions with TA prefix.
//
// Both forward their arguments unchanged to I and require HasSSE41.
//
class SS418I<bits<8> o, Format F, dag outs, dag ins, string asm,
             list<dag> pattern>
      : I<o, F, outs, ins, asm, pattern>,
        T8,
        Requires<[HasSSE41]>;

// NOTE(review): the users of SS41AI in this file carry an i32imm operand while
// this template derives from plain I with only the TA prefix - confirm that
// the immediate byte and any mandatory prefix are encoded as intended.
class SS41AI<bits<8> o, Format F, dag outs, dag ins, string asm,
             list<dag> pattern>
      : I<o, F, outs, ins, asm, pattern>,
        TA,
        Requires<[HasSSE41]>;
// sse41_fp_unop_rm - Intrinsic-only forms of an SSE4.1 FP operation that takes
// one source (register or memory) plus an i32 immediate.
//
//   opcss/opcps/opcsd/opcpd - opcodes of the ss, ps, sd and pd variants.
//   OpcodeStr               - mnemonic stem; "ss"/"ps"/"sd"/"pd" is appended.
//   F32Int/V4F32Int/F64Int/V2F64Int - intrinsics matched by each variant.
//   Commutable              - forwarded to isCommutable on the register forms.
//
// Eight defs are produced: {SS,PS,SD,PD}{r,m}_Int.
//
// The packed memory forms match memopv4f32 / memopv2f64 rather than a plain
// load: a folded SSE memory operand must be 16-byte aligned, so an arbitrary
// (possibly unaligned) load must not be folded into the instruction.
//
// NOTE(review): the encoding side (mandatory prefix byte and the trailing
// imm8) is determined by SS41AI, which is not changed here - confirm it.
multiclass sse41_fp_unop_rm<bits<8> opcss, bits<8> opcps,
                            bits<8> opcsd, bits<8> opcpd,
                            string OpcodeStr,
                            Intrinsic F32Int,
                            Intrinsic V4F32Int,
                            Intrinsic F64Int,
                            Intrinsic V2F64Int,
                            bit Commutable = 0> {
  // Scalar single-precision intrinsic, register source.
  def SSr_Int : SS41AI<opcss, MRMSrcReg,
                    (outs VR128:$dst), (ins VR128:$src1, i32imm:$src2),
                    !strconcat(OpcodeStr,
                    "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                    [(set VR128:$dst, (F32Int VR128:$src1, imm:$src2))]> {
    let isCommutable = Commutable;
  }

  // Scalar single-precision intrinsic, memory source.
  def SSm_Int : SS41AI<opcss, MRMSrcMem,
                    (outs VR128:$dst), (ins ssmem:$src1, i32imm:$src2),
                    !strconcat(OpcodeStr,
                    "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                    [(set VR128:$dst,
                          (F32Int sse_load_f32:$src1, imm:$src2))]>;

  // Packed single-precision intrinsic, register source.
  def PSr_Int : SS41AI<opcps, MRMSrcReg,
                    (outs VR128:$dst), (ins VR128:$src1, i32imm:$src2),
                    !strconcat(OpcodeStr,
                    "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                    [(set VR128:$dst, (V4F32Int VR128:$src1, imm:$src2))]> {
    let isCommutable = Commutable;
  }

  // Packed single-precision intrinsic, memory source (aligned loads only).
  def PSm_Int : SS41AI<opcps, MRMSrcMem,
                    (outs VR128:$dst), (ins f128mem:$src1, i32imm:$src2),
                    !strconcat(OpcodeStr,
                    "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                    [(set VR128:$dst,
                          (V4F32Int (memopv4f32 addr:$src1), imm:$src2))]>;

  // Scalar double-precision intrinsic, register source.
  def SDr_Int : SS41AI<opcsd, MRMSrcReg,
                    (outs VR128:$dst), (ins VR128:$src1, i32imm:$src2),
                    !strconcat(OpcodeStr,
                    "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                    [(set VR128:$dst, (F64Int VR128:$src1, imm:$src2))]> {
    let isCommutable = Commutable;
  }

  // Scalar double-precision intrinsic, memory source.
  def SDm_Int : SS41AI<opcsd, MRMSrcMem,
                    (outs VR128:$dst), (ins sdmem:$src1, i32imm:$src2),
                    !strconcat(OpcodeStr,
                    "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                    [(set VR128:$dst,
                          (F64Int sse_load_f64:$src1, imm:$src2))]>;

  // Packed double-precision intrinsic, register source.
  def PDr_Int : SS41AI<opcpd, MRMSrcReg,
                    (outs VR128:$dst), (ins VR128:$src1, i32imm:$src2),
                    !strconcat(OpcodeStr,
                    "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                    [(set VR128:$dst, (V2F64Int VR128:$src1, imm:$src2))]> {
    let isCommutable = Commutable;
  }

  // Packed double-precision intrinsic, memory source (aligned loads only).
  def PDm_Int : SS41AI<opcpd, MRMSrcMem,
                    (outs VR128:$dst), (ins f128mem:$src1, i32imm:$src2),
                    !strconcat(OpcodeStr,
                    "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                    [(set VR128:$dst,
                          (V2F64Int (memopv2f64 addr:$src1), imm:$src2))]>;
}
// FP round - roundss, roundps, roundsd, roundpd.
// Instantiates sse41_fp_unop_rm under the ROUND prefix; Commutable is left at
// its default.
defm ROUND : sse41_fp_unop_rm<
                 0x0A, 0x08, 0x0B, 0x09,   // opcss, opcps, opcsd, opcpd
                 "round",
                 int_x86_sse41_round_ss, int_x86_sse41_round_ps,
                 int_x86_sse41_round_sd, int_x86_sse41_round_pd>;

View File

@ -114,6 +114,8 @@ void X86Subtarget::AutoDetectSubtargetFeatures() {
if ((EDX >> 26) & 0x1) X86SSELevel = SSE2;
if (ECX & 0x1) X86SSELevel = SSE3;
if ((ECX >> 9) & 0x1) X86SSELevel = SSSE3;
if ((ECX >> 19) & 0x1) X86SSELevel = SSE41;
if ((ECX >> 20) & 0x1) X86SSELevel = SSE42;
if (memcmp(text.c, "GenuineIntel", 12) == 0 ||
memcmp(text.c, "AuthenticAMD", 12) == 0) {

View File

@ -38,7 +38,7 @@ public:
};
protected:
// SSE feature levels. Enumerator order is significant: the hasSSE*()
// accessors compare X86SSELevel against these values with >=, so each level
// must come after every level it includes.
enum X86SSEEnum {
NoMMXSSE, MMX, SSE1, SSE2, SSE3, SSSE3
NoMMXSSE, MMX, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42
};
enum X863DNowEnum {
@ -127,6 +127,8 @@ public:
/// True when the detected/selected SSE level is at least SSE2.
bool hasSSE2() const {
  return X86SSELevel >= SSE2;
}

/// True when the SSE level is at least SSE3.
bool hasSSE3() const {
  return X86SSELevel >= SSE3;
}

/// True when the SSE level is at least SSSE3.
bool hasSSSE3() const {
  return X86SSELevel >= SSSE3;
}

/// True when the SSE level is at least SSE41.
bool hasSSE41() const {
  return X86SSELevel >= SSE41;
}

/// True when the SSE level is at least SSE42.
bool hasSSE42() const {
  return X86SSELevel >= SSE42;
}

/// True when the 3DNow! level is at least ThreeDNow.
bool has3DNow() const {
  return X863DNowLevel >= ThreeDNow;
}

/// True when the 3DNow! level is at least ThreeDNowA.
bool has3DNowA() const {
  return X863DNowLevel >= ThreeDNowA;
}