Fix patterns for SSE4.1 move and sign extend instructions. Also add instructions which fold VZEXT_MOVL and VZEXT_LOAD.

llvm-svn: 56594
This commit is contained in:
Evan Cheng 2008-09-24 23:27:55 +00:00
parent 1d0ed88bf9
commit efd1f614ff
3 changed files with 133 additions and 6 deletions

View File

@ -243,6 +243,16 @@ def loadi16 : PatFrag<(ops node:$ptr), (i16 (ld node:$ptr)), [{
return false;
}]>;
def loadi16_anyext : PatFrag<(ops node:$ptr), (i32 (ld node:$ptr)), [{
LoadSDNode *LD = cast<LoadSDNode>(N);
if (LD->getAddressingMode() != ISD::UNINDEXED)
return false;
ISD::LoadExtType ExtType = LD->getExtensionType();
if (ExtType == ISD::EXTLOAD)
return LD->getAlignment() >= 2 && !LD->isVolatile();
return false;
}]>;
def loadi32 : PatFrag<(ops node:$ptr), (i32 (ld node:$ptr)), [{
LoadSDNode *LD = cast<LoadSDNode>(N);
if (LD->getAddressingMode() != ISD::UNINDEXED)

View File

@ -162,6 +162,17 @@ def bc_v8i16 : PatFrag<(ops node:$in), (v8i16 (bitconvert node:$in))>;
def bc_v4i32 : PatFrag<(ops node:$in), (v4i32 (bitconvert node:$in))>;
def bc_v2i64 : PatFrag<(ops node:$in), (v2i64 (bitconvert node:$in))>;
def vzmovl_v2i64 : PatFrag<(ops node:$src),
(bitconvert (v2i64 (X86vzmovl
(v2i64 (scalar_to_vector (loadi64 node:$src))))))>;
def vzmovl_v4i32 : PatFrag<(ops node:$src),
(bitconvert (v4i32 (X86vzmovl
(v4i32 (scalar_to_vector (loadi32 node:$src))))))>;
def vzload_v2i64 : PatFrag<(ops node:$src),
(bitconvert (v2i64 (X86vzload node:$src)))>;
def fp32imm0 : PatLeaf<(f32 fpimm), [{
return N->isExactlyValue(+0.0);
}]>;
@ -3368,8 +3379,9 @@ multiclass SS41I_binop_rm_int8<bits<8> opc, string OpcodeStr, Intrinsic IntId> {
def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set VR128:$dst,
(IntId (bitconvert (v4i32 (load addr:$src)))))]>, OpSize;
[(set VR128:$dst,
(IntId (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))]>,
OpSize;
}
defm PMOVSXBW : SS41I_binop_rm_int8<0x20, "pmovsxbw", int_x86_sse41_pmovsxbw>;
@ -3379,6 +3391,38 @@ defm PMOVZXBW : SS41I_binop_rm_int8<0x30, "pmovzxbw", int_x86_sse41_pmovzxbw>;
defm PMOVZXWD : SS41I_binop_rm_int8<0x33, "pmovzxwd", int_x86_sse41_pmovzxwd>;
defm PMOVZXDQ : SS41I_binop_rm_int8<0x35, "pmovzxdq", int_x86_sse41_pmovzxdq>;
// Common patterns involving scalar load.
def : Pat<(int_x86_sse41_pmovsxbw (vzmovl_v2i64 addr:$src)),
(PMOVSXBWrm addr:$src)>, Requires<[HasSSE41]>;
def : Pat<(int_x86_sse41_pmovsxbw (vzload_v2i64 addr:$src)),
(PMOVSXBWrm addr:$src)>, Requires<[HasSSE41]>;
def : Pat<(int_x86_sse41_pmovsxwd (vzmovl_v2i64 addr:$src)),
(PMOVSXWDrm addr:$src)>, Requires<[HasSSE41]>;
def : Pat<(int_x86_sse41_pmovsxwd (vzload_v2i64 addr:$src)),
(PMOVSXWDrm addr:$src)>, Requires<[HasSSE41]>;
def : Pat<(int_x86_sse41_pmovsxdq (vzmovl_v2i64 addr:$src)),
(PMOVSXDQrm addr:$src)>, Requires<[HasSSE41]>;
def : Pat<(int_x86_sse41_pmovsxdq (vzload_v2i64 addr:$src)),
(PMOVSXDQrm addr:$src)>, Requires<[HasSSE41]>;
def : Pat<(int_x86_sse41_pmovzxbw (vzmovl_v2i64 addr:$src)),
(PMOVZXBWrm addr:$src)>, Requires<[HasSSE41]>;
def : Pat<(int_x86_sse41_pmovzxbw (vzload_v2i64 addr:$src)),
(PMOVZXBWrm addr:$src)>, Requires<[HasSSE41]>;
def : Pat<(int_x86_sse41_pmovzxwd (vzmovl_v2i64 addr:$src)),
(PMOVZXWDrm addr:$src)>, Requires<[HasSSE41]>;
def : Pat<(int_x86_sse41_pmovzxwd (vzload_v2i64 addr:$src)),
(PMOVZXWDrm addr:$src)>, Requires<[HasSSE41]>;
def : Pat<(int_x86_sse41_pmovzxdq (vzmovl_v2i64 addr:$src)),
(PMOVZXDQrm addr:$src)>, Requires<[HasSSE41]>;
def : Pat<(int_x86_sse41_pmovzxdq (vzload_v2i64 addr:$src)),
(PMOVZXDQrm addr:$src)>, Requires<[HasSSE41]>;
multiclass SS41I_binop_rm_int4<bits<8> opc, string OpcodeStr, Intrinsic IntId> {
def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
@ -3386,8 +3430,9 @@ multiclass SS41I_binop_rm_int4<bits<8> opc, string OpcodeStr, Intrinsic IntId> {
def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set VR128:$dst,
(IntId (bitconvert (v4i32 (load addr:$src)))))]>, OpSize;
[(set VR128:$dst,
(IntId (bitconvert (v4i32 (scalar_to_vector (loadi32 addr:$src))))))]>,
OpSize;
}
defm PMOVSXBD : SS41I_binop_rm_int4<0x21, "pmovsxbd", int_x86_sse41_pmovsxbd>;
@ -3395,20 +3440,45 @@ defm PMOVSXWQ : SS41I_binop_rm_int4<0x24, "pmovsxwq", int_x86_sse41_pmovsxwq>;
defm PMOVZXBD : SS41I_binop_rm_int4<0x31, "pmovzxbd", int_x86_sse41_pmovzxbd>;
defm PMOVZXWQ : SS41I_binop_rm_int4<0x34, "pmovzxwq", int_x86_sse41_pmovzxwq>;
// Common patterns involving scalar load
def : Pat<(int_x86_sse41_pmovsxbd (vzmovl_v4i32 addr:$src)),
(PMOVSXBDrm addr:$src)>;
def : Pat<(int_x86_sse41_pmovsxwq (vzmovl_v4i32 addr:$src)),
(PMOVSXWQrm addr:$src)>;
def : Pat<(int_x86_sse41_pmovzxbd (vzmovl_v4i32 addr:$src)),
(PMOVZXBDrm addr:$src)>;
def : Pat<(int_x86_sse41_pmovzxwq (vzmovl_v4i32 addr:$src)),
(PMOVZXWQrm addr:$src)>;
multiclass SS41I_binop_rm_int2<bits<8> opc, string OpcodeStr, Intrinsic IntId> {
def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set VR128:$dst, (IntId VR128:$src))]>, OpSize;
// Expecting a i16 load any extended to i32 value.
def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), (ins i16mem:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set VR128:$dst,
(IntId (bitconvert (v4i32 (load addr:$src)))))]>, OpSize;
[(set VR128:$dst, (IntId (bitconvert
(v4i32 (scalar_to_vector (loadi16_anyext addr:$src))))))]>,
OpSize;
}
defm PMOVSXBQ : SS41I_binop_rm_int2<0x22, "pmovsxbq", int_x86_sse41_pmovsxbq>;
defm PMOVZXBQ : SS41I_binop_rm_int2<0x32, "pmovsxbq", int_x86_sse41_pmovzxbq>;
// Common patterns involving scalar load
def : Pat<(int_x86_sse41_pmovsxbq
(bitconvert (v4i32 (X86vzmovl
(v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
(PMOVSXBQrm addr:$src)>;
def : Pat<(int_x86_sse41_pmovzxbq
(bitconvert (v4i32 (X86vzmovl
(v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
(PMOVZXBQrm addr:$src)>;
/// SS41I_binop_ext8 - SSE 4.1 extract 8 bits to 32 bit reg or 8 bit mem
multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> {

View File

@ -0,0 +1,47 @@
; RUN: llvm-as < %s | llc -march=x86 -mattr=sse41 | not grep movd
; RUN: llvm-as < %s | llc -march=x86 -mattr=sse41 | not grep movq
; RUN: llvm-as < %s | llc -march=x86 -mattr=sse41 | grep pmovsxbd
; RUN: llvm-as < %s | llc -march=x86 -mattr=sse41 | grep pmovsxwd
; RUN: llvm-as < %s | llc -march=x86 -mattr=sse41 | grep pmovsxbq
; RUN: llvm-as < %s | llc -march=x86-64 -mattr=sse41 | grep movq | count 1
define <2 x i64> @t1(i32* %p) nounwind {
entry:
%0 = load i32* %p, align 4 ; <i32> [#uses=1]
%1 = insertelement <4 x i32> undef, i32 %0, i32 0 ; <<4 x i32>> [#uses=1]
%2 = insertelement <4 x i32> %1, i32 0, i32 1 ; <<4 x i32>> [#uses=1]
%3 = insertelement <4 x i32> %2, i32 0, i32 2 ; <<4 x i32>> [#uses=1]
%4 = insertelement <4 x i32> %3, i32 0, i32 3 ; <<4 x i32>> [#uses=1]
%5 = bitcast <4 x i32> %4 to <16 x i8> ; <<16 x i8>> [#uses=1]
%6 = tail call <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8> %5) nounwind readnone ; <<4 x i32>> [#uses=1]
%7 = bitcast <4 x i32> %6 to <2 x i64> ; <<2 x i64>> [#uses=1]
ret <2 x i64> %7
}
declare <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8>) nounwind readnone
define <2 x i64> @t2(i64* %p) nounwind readonly {
entry:
%0 = load i64* %p ; <i64> [#uses=1]
%tmp2 = insertelement <2 x i64> zeroinitializer, i64 %0, i32 0 ; <<2 x i64>> [#uses=1]
%1 = bitcast <2 x i64> %tmp2 to <8 x i16> ; <<8 x i16>> [#uses=1]
%2 = tail call <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16> %1) nounwind readnone ; <<4 x i32>> [#uses=1]
%3 = bitcast <4 x i32> %2 to <2 x i64> ; <<2 x i64>> [#uses=1]
ret <2 x i64> %3
}
declare <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16>) nounwind readnone
@gv = external global i16 ; <i16*> [#uses=1]
define <2 x i64> @t3() nounwind {
entry:
%0 = load i16* @gv, align 2 ; <i16> [#uses=1]
%1 = insertelement <8 x i16> undef, i16 %0, i32 0 ; <<8 x i16>> [#uses=1]
%2 = bitcast <8 x i16> %1 to <16 x i8> ; <<16 x i8>> [#uses=1]
%3 = tail call <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8> %2) nounwind readnone ; <<2 x i64>> [#uses=1]
ret <2 x i64> %3
}
declare <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8>) nounwind readnone