[X86][SSE1] Add MOVLHPS/MOVHLPS lowering and memory folding support
As discussed on PR26491, this patch adds support for lowering v4f32 shuffles to the MOVLHPS/MOVHLPS instructions. It also adds support for memory folding with their MOVLPS/MOVHPS load equivalents.

This first patch only really helps SSE1 targets, as SSE2+ targets will widen the shuffle mask and use v2f64 equivalents (although they still combine to MOVLHPS/MOVHLPS for v2f64 splats). This will have to be addressed in a future patch, most likely when we add support for binary target shuffle combines.

Differential Revision: http://reviews.llvm.org/D16956

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@260168 91177308-0d34-0410-b5e6-96231b3b80d8
Parent: 68c43d458e
Commit: 8464bb8c41
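
As a rough illustration of why the {0, 1, 4, 5} and {2, 3, 6, 7} masks map onto these instructions, their lane semantics can be modelled in plain C++. This is an explanatory sketch, not code from the patch; the type and function names are invented for the example.

#include <array>

using V4F32 = std::array<float, 4>; // one XMM register viewed as 4 floats

// MOVLHPS dst, src: keep dst's low half, copy src's low half into dst's
// high half -- shuffle mask {0, 1, 4, 5} on the concatenation dst:src.
V4F32 movlhps(V4F32 dst, V4F32 src) {
  return {dst[0], dst[1], src[0], src[1]};
}

// MOVHLPS dst, src: copy src's high half into dst's low half, keep dst's
// high half. Mask {2, 3, 6, 7} on V1:V2 equals movhlps(V2, V1), which is
// why the lowering below passes the operands in V2, V1 order.
V4F32 movhlps(V4F32 dst, V4F32 src) {
  return {src[2], src[3], dst[2], dst[3]};
}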
lib/Target/X86/X86ISelLowering.cpp

@@ -9022,6 +9022,12 @@ static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
         return BlendPerm;
   }
 
+  // Use low/high mov instructions.
+  if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5}))
+    return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
+  if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 6, 7}))
+    return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
+
   // Use dedicated unpack instructions for masks that match their pattern.
   if (SDValue V =
           lowerVectorShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
lib/Target/X86/X86InstrInfo.cpp

@@ -998,6 +998,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::MINSDrr_Int,      X86::MINSDrm_Int,      0 },
     { X86::MINSSrr,          X86::MINSSrm,          0 },
     { X86::MINSSrr_Int,      X86::MINSSrm_Int,      0 },
+    { X86::MOVLHPSrr,        X86::MOVHPSrm,         TB_NO_REVERSE },
     { X86::MPSADBWrri,       X86::MPSADBWrmi,       TB_ALIGN_16 },
     { X86::MULPDrr,          X86::MULPDrm,          TB_ALIGN_16 },
     { X86::MULPSrr,          X86::MULPSrm,          TB_ALIGN_16 },
@@ -1298,6 +1299,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VMINSDrr_Int,     X86::VMINSDrm_Int,     0 },
     { X86::VMINSSrr,         X86::VMINSSrm,         0 },
     { X86::VMINSSrr_Int,     X86::VMINSSrm_Int,     0 },
+    { X86::VMOVLHPSrr,       X86::VMOVHPSrm,        TB_NO_REVERSE },
     { X86::VMPSADBWrri,      X86::VMPSADBWrmi,      0 },
     { X86::VMULPDrr,         X86::VMULPDrm,         0 },
     { X86::VMULPSrr,         X86::VMULPSrm,         0 },
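
These table entries pair the register form (V)MOVLHPSrr with the load form (V)MOVHPSrm: when MOVLHPS's second operand comes from a load, only its low 64 bits are used, which is exactly what MOVHPS loads into the high half. A sketch of that equivalence, reusing the hypothetical scalar model from above (TB_NO_REVERSE marks the fold as one-way, since the narrow 64-bit load form cannot be unfolded back into a 128-bit register operand):

// MOVHPS dst, [mem]: 64-bit load into dst's high half.
V4F32 movhps_load(V4F32 dst, const float *mem) {
  return {dst[0], dst[1], mem[0], mem[1]};
}

// Folding: movlhps(dst, load128(mem)) reads only the low half of the
// loaded value, so it equals movhps_load(dst, mem); the 128-bit load can
// be replaced by the narrower 64-bit one.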
@@ -5531,6 +5533,23 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom(
       }
     }
     break;
+  case X86::MOVHLPSrr:
+  case X86::VMOVHLPSrr:
+    // Move the upper 64-bits of the second operand to the lower 64-bits.
+    // To fold the load, adjust the pointer to the upper and use (V)MOVLPS.
+    // TODO: In most cases AVX doesn't have a 8-byte alignment requirement.
+    if (OpNum == 2) {
+      unsigned RCSize = getRegClass(MI->getDesc(), OpNum, &RI, MF)->getSize();
+      if (Size <= RCSize && 8 <= Align) {
+        unsigned NewOpCode =
+            (MI->getOpcode() == X86::VMOVHLPSrr ? X86::VMOVLPSrm
+                                                : X86::MOVLPSrm);
+        MachineInstr *NewMI =
+            FuseInst(MF, NewOpCode, OpNum, MOs, InsertPt, MI, *this, 8);
+        return NewMI;
+      }
+    }
+    break;
   };
 
   return nullptr;
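
The comment in the new case captures the trick: MOVHLPS reads only the upper 64 bits of its second operand, so when that operand comes from memory the fold can instead issue a 64-bit (V)MOVLPS load from 8 bytes past the original address, which is why FuseInst is passed a final pointer offset of 8 and why 8 <= Align is required. In the hypothetical scalar model used earlier (not LLVM code):

// MOVLPS dst, [mem]: 64-bit load into dst's low half; the high half of
// dst is preserved.
V4F32 movlps_load(V4F32 dst, const float *mem) {
  return {mem[0], mem[1], dst[2], dst[3]};
}

// movhlps(dst, load128(mem)) == movlps_load(dst, mem + 2) // +2 floats = +8 bytes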
@@ -5741,6 +5760,10 @@ static bool hasPartialRegUpdate(unsigned Opcode) {
   case X86::CVTSS2SDrm:
   case X86::Int_CVTSS2SDrr:
   case X86::Int_CVTSS2SDrm:
+  case X86::MOVHPDrm:
+  case X86::MOVHPSrm:
+  case X86::MOVLPDrm:
+  case X86::MOVLPSrm:
   case X86::RCPSSr:
   case X86::RCPSSm:
   case X86::RCPSSr_Int:
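
For context on the hasPartialRegUpdate addition (a reading of the change, not stated in the commit message): the newly foldable MOVLPS/MOVHPS-style loads write only half of the destination register, as the movlps_load sketch above shows by preserving dst[2] and dst[3], so the result carries a dependency on the register's previous contents. Listing these opcodes here lets the dependency-breaking machinery treat them as partial register updates.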
test/CodeGen/X86/vector-shuffle-sse1.ll

@@ -94,7 +94,7 @@ define <4 x float> @shuffle_v4f32_1133(<4 x float> %a, <4 x float> %b) {
 define <4 x float> @shuffle_v4f32_0145(<4 x float> %a, <4 x float> %b) {
 ; SSE1-LABEL: shuffle_v4f32_0145:
 ; SSE1:       # BB#0:
-; SSE1-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,1]
+; SSE1-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; SSE1-NEXT:    retq
   %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
   ret <4 x float> %shuffle
@@ -102,8 +102,7 @@ define <4 x float> @shuffle_v4f32_0145(<4 x float> %a, <4 x float> %b) {
 define <4 x float> @shuffle_v4f32_6723(<4 x float> %a, <4 x float> %b) {
 ; SSE1-LABEL: shuffle_v4f32_6723:
 ; SSE1:       # BB#0:
-; SSE1-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,3],xmm0[2,3]
-; SSE1-NEXT:    movaps %xmm1, %xmm0
+; SSE1-NEXT:    movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
 ; SSE1-NEXT:    retq
   %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
   ret <4 x float> %shuffle
@@ -211,7 +210,7 @@ define <4 x float> @insert_mem_lo_v4f32(<2 x float>* %ptr, <4 x float> %b) {
 ; SSE1-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
 ; SSE1-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
 ; SSE1-NEXT:    xorps %xmm2, %xmm2
-; SSE1-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
+; SSE1-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
 ; SSE1-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
 ; SSE1-NEXT:    movaps %xmm1, %xmm0
 ; SSE1-NEXT:    retq
@@ -232,8 +231,8 @@ define <4 x float> @insert_mem_hi_v4f32(<2 x float>* %ptr, <4 x float> %b) {
 ; SSE1-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
 ; SSE1-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
 ; SSE1-NEXT:    xorps %xmm2, %xmm2
-; SSE1-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
-; SSE1-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,1]
+; SSE1-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE1-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; SSE1-NEXT:    retq
   %a = load <2 x float>, <2 x float>* %ptr
   %v = shufflevector <2 x float> %a, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
@@ -251,3 +250,21 @@ define <4 x float> @shuffle_mem_v4f32_3210(<4 x float>* %ptr) {
   %shuffle = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
   ret <4 x float> %shuffle
 }
+define <4 x float> @shuffle_mem_v4f32_0145(<4 x float> %a, <4 x float>* %pb) {
+; SSE1-LABEL: shuffle_mem_v4f32_0145:
+; SSE1:       # BB#0:
+; SSE1-NEXT:    movhps (%rdi), %xmm0
+; SSE1-NEXT:    retq
+  %b = load <4 x float>, <4 x float>* %pb, align 16
+  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+  ret <4 x float> %shuffle
+}
+define <4 x float> @shuffle_mem_v4f32_6723(<4 x float> %a, <4 x float>* %pb) {
+; SSE1-LABEL: shuffle_mem_v4f32_6723:
+; SSE1:       # BB#0:
+; SSE1-NEXT:    movlps 8(%rdi), %xmm0
+; SSE1-NEXT:    retq
+  %b = load <4 x float>, <4 x float>* %pb, align 16
+  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
+  ret <4 x float> %shuffle
+}