From 5f96ea6159852c07b5652ad38ace6b0aab375441 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Tue, 17 Jul 2018 20:16:18 +0000 Subject: [PATCH] [X86] Add patterns for folding full vector load into MOVHPS and MOVLPS with SSE1 only. llvm-svn: 337320 --- lib/Target/X86/X86InstrAVX512.td | 4 ++- lib/Target/X86/X86InstrSSE.td | 37 +++++++++++++++---------- test/CodeGen/X86/vector-shuffle-sse1.ll | 4 +-- 3 files changed, 26 insertions(+), 19 deletions(-) diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td index 3a1f840139e..2035e49720f 100644 --- a/lib/Target/X86/X86InstrAVX512.td +++ b/lib/Target/X86/X86InstrAVX512.td @@ -6452,7 +6452,9 @@ multiclass avx512_mov_hilo_packed opc, string OpcodeStr, Sched<[SchedWriteFShuffle.XMM.Folded, ReadAfterLd]>, EVEX_4V; } -defm VMOVHPSZ128 : avx512_mov_hilo_packed<0x16, "vmovhps", X86Movlhps, +// No patterns for MOVLPS/MOVHPS as the Movlhps node should only be created in +// SSE1. And the MOVLPS pattern is even more complex. +defm VMOVHPSZ128 : avx512_mov_hilo_packed<0x16, "vmovhps", null_frag, v4f32x_info>, EVEX_CD8<32, CD8VT2>, PS; defm VMOVHPDZ128 : avx512_mov_hilo_packed<0x16, "vmovhpd", X86Unpckl, v2f64x_info>, EVEX_CD8<64, CD8VT1>, PD, VEX_W; diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index 448be0eda0e..3797d91fb31 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -661,19 +661,16 @@ let Predicates = [UseSSE1] in { // SSE 1 & 2 - Move Low packed FP Instructions //===----------------------------------------------------------------------===// -multiclass sse12_mov_hilo_packed_baseopc, SDNode psnode, SDNode pdnode, +multiclass sse12_mov_hilo_packed_baseopc, SDNode pdnode, string base_opc, string asm_opr> { + // No pattern as they need to be special cased between high and low. 
let hasSideEffects = 0, mayLoad = 1 in def PSrm : PI, PS, - Sched<[SchedWriteFShuffle.XMM.Folded, ReadAfterLd]>; + (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2), + !strconcat(base_opc, "s", asm_opr), + [], SSEPackedSingle>, PS, + Sched<[SchedWriteFShuffle.XMM.Folded, ReadAfterLd]>; - let hasSideEffects = 0, mayLoad = 1 in def PDrm : PIopc, SDNode psnode, SDNode pdnode, Sched<[SchedWriteFShuffle.XMM.Folded, ReadAfterLd]>; } -multiclass sse12_mov_hilo_packedopc, SDPatternOperator psnode, - SDPatternOperator pdnode, string base_opc> { +multiclass sse12_mov_hilo_packedopc, SDPatternOperator pdnode, + string base_opc> { let Predicates = [UseAVX] in - defm V#NAME : sse12_mov_hilo_packed_base, VEX_4V, VEX_WIG; let Constraints = "$src1 = $dst" in - defm NAME : sse12_mov_hilo_packed_base; } -defm MOVL : sse12_mov_hilo_packed<0x12, null_frag, X86Movsd, "movlp">; +defm MOVL : sse12_mov_hilo_packed<0x12, X86Movsd, "movlp">; let SchedRW = [WriteFStore] in { let Predicates = [UseAVX] in { @@ -725,13 +722,18 @@ let Predicates = [UseSSE1] in { def : Pat<(store (i64 (extractelt (bc_v2i64 (v4f32 VR128:$src2)), (iPTR 0))), addr:$src1), (MOVLPSmr addr:$src1, VR128:$src2)>; + + // This pattern helps select MOVLPS on SSE1 only targets. With SSE2 we'll + // end up with a movsd or blend instead of shufp. 
+ def : Pat<(X86Shufp (memopv4f32 addr:$src2), VR128:$src1, (i8 -28)), + (MOVLPSrm VR128:$src1, addr:$src2)>; } //===----------------------------------------------------------------------===// // SSE 1 & 2 - Move Hi packed FP Instructions //===----------------------------------------------------------------------===// -defm MOVH : sse12_mov_hilo_packed<0x16, X86Movlhps, X86Unpckl, "movhp">; +defm MOVH : sse12_mov_hilo_packed<0x16, X86Unpckl, "movhp">; let SchedRW = [WriteFStore] in { // v2f64 extract element 1 is always custom lowered to unpack high to low @@ -796,6 +798,11 @@ let Predicates = [UseSSE1] in { def : Pat<(X86Movlhps VR128:$src1, (bc_v4f32 (v2i64 (X86vzload addr:$src2)))), (MOVHPSrm VR128:$src1, addr:$src2)>; + + // This pattern helps select MOVHPS on SSE1 only targets. With SSE2 we'll + // end up with a movsd or blend instead of shufp. + def : Pat<(X86Movlhps VR128:$src1, (memopv4f32 addr:$src2)), + (MOVHPSrm VR128:$src1, addr:$src2)>; } let Predicates = [UseSSE2] in { diff --git a/test/CodeGen/X86/vector-shuffle-sse1.ll b/test/CodeGen/X86/vector-shuffle-sse1.ll index 5f8c21bd852..dda46e062d5 100644 --- a/test/CodeGen/X86/vector-shuffle-sse1.ll +++ b/test/CodeGen/X86/vector-shuffle-sse1.ll @@ -298,9 +298,7 @@ define <4 x float> @shuffle_mem_v4f32_6723(<4 x float> %a, <4 x float>* %pb) { define <4 x float> @shuffle_mem_v4f32_4523(<4 x float> %a, <4 x float>* %pb) { ; SSE1-LABEL: shuffle_mem_v4f32_4523: ; SSE1: # %bb.0: -; SSE1-NEXT: movaps (%rdi), %xmm1 -; SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] -; SSE1-NEXT: movaps %xmm1, %xmm0 +; SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] ; SSE1-NEXT: retq %b = load <4 x float>, <4 x float>* %pb, align 16 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32>