From fe2d3ebbd3d27214f835e117b4c3800ae7aed80f Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Sun, 7 Feb 2016 15:39:22 +0000
Subject: [PATCH] [X86][SSE] Added support for MOVHPD/MOVLPD + MOVHPS/MOVLPS
 shuffle decoding.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@260034 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../X86/InstPrinter/X86InstComments.cpp       | 32 ++++++++++
 lib/Target/X86/Utils/X86ShuffleDecode.cpp     | 11 ++++
 lib/Target/X86/Utils/X86ShuffleDecode.h       |  5 ++
 test/CodeGen/X86/buildvec-insertvec.ll        |  2 +-
 .../X86/merge-consecutive-loads-256.ll        |  6 +-
 test/CodeGen/X86/vec_insert-2.ll              | 60 +++++++++++++------
 test/CodeGen/X86/vector-shuffle-128-v2.ll     | 14 ++---
 test/CodeGen/X86/vector-shuffle-128-v4.ll     | 14 ++---
 test/CodeGen/X86/vector-shuffle-256-v8.ll     |  6 +-
 test/CodeGen/X86/vector-shuffle-combining.ll  |  4 +-
 10 files changed, 114 insertions(+), 40 deletions(-)

diff --git a/lib/Target/X86/InstPrinter/X86InstComments.cpp b/lib/Target/X86/InstPrinter/X86InstComments.cpp
index 04a937807ea..5e0dd4f7bfc 100644
--- a/lib/Target/X86/InstPrinter/X86InstComments.cpp
+++ b/lib/Target/X86/InstPrinter/X86InstComments.cpp
@@ -257,6 +257,38 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
     DecodeMOVHLPSMask(2, ShuffleMask);
     break;
 
+  case X86::MOVHPDrm:
+  case X86::VMOVHPDrm:
+  case X86::VMOVHPDZ128rm:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeInsertElementMask(MVT::v2f64, 1, 1, ShuffleMask);
+    break;
+
+  case X86::MOVHPSrm:
+  case X86::VMOVHPSrm:
+  case X86::VMOVHPSZ128rm:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeInsertElementMask(MVT::v4f32, 2, 2, ShuffleMask);
+    break;
+
+  case X86::MOVLPDrm:
+  case X86::VMOVLPDrm:
+  case X86::VMOVLPDZ128rm:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeInsertElementMask(MVT::v2f64, 0, 1, ShuffleMask);
+    break;
+
+  case X86::MOVLPSrm:
+  case X86::VMOVLPSrm:
+  case X86::VMOVLPSZ128rm:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeInsertElementMask(MVT::v4f32, 0, 2, ShuffleMask);
+    break;
+
   CASE_MOVDUP(MOVSLDUP, r)
     Src1Name = getRegName(MI->getOperand(MI->getNumOperands() - 1).getReg());
     // FALL THROUGH.
diff --git a/lib/Target/X86/Utils/X86ShuffleDecode.cpp b/lib/Target/X86/Utils/X86ShuffleDecode.cpp
index 08c039f3dec..e2e3f8cf0ea 100644
--- a/lib/Target/X86/Utils/X86ShuffleDecode.cpp
+++ b/lib/Target/X86/Utils/X86ShuffleDecode.cpp
@@ -44,6 +44,17 @@ void DecodeINSERTPSMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
   if (ZMask & 8) ShuffleMask[3] = SM_SentinelZero;
 }
 
+void DecodeInsertElementMask(MVT VT, unsigned Idx, unsigned Len,
+                             SmallVectorImpl<int> &ShuffleMask) {
+  unsigned NumElts = VT.getVectorNumElements();
+  assert((Idx + Len) <= NumElts && "Insertion out of range");
+
+  for (unsigned i = 0; i != NumElts; ++i)
+    ShuffleMask.push_back(i);
+  for (unsigned i = 0; i != Len; ++i)
+    ShuffleMask[Idx + i] = NumElts + i;
+}
+
 // <3,1> or <6,7,2,3>
 void DecodeMOVHLPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask) {
   for (unsigned i = NElts / 2; i != NElts; ++i)
diff --git a/lib/Target/X86/Utils/X86ShuffleDecode.h b/lib/Target/X86/Utils/X86ShuffleDecode.h
index 185256fd1be..f0fd04ac8b2 100644
--- a/lib/Target/X86/Utils/X86ShuffleDecode.h
+++ b/lib/Target/X86/Utils/X86ShuffleDecode.h
@@ -29,6 +29,11 @@ enum { SM_SentinelUndef = -1, SM_SentinelZero = -2 };
 
 void DecodeINSERTPSMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
 
+// Insert the bottom Len elements from a second source into a vector starting
+// at element Idx.
+void DecodeInsertElementMask(MVT VT, unsigned Idx, unsigned Len,
+                             SmallVectorImpl<int> &ShuffleMask);
+
 // <3,1> or <6,7,2,3>
 void DecodeMOVHLPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask);
 
diff --git a/test/CodeGen/X86/buildvec-insertvec.ll b/test/CodeGen/X86/buildvec-insertvec.ll
index fd7290d5817..2ee33a1a902 100644
--- a/test/CodeGen/X86/buildvec-insertvec.ll
+++ b/test/CodeGen/X86/buildvec-insertvec.ll
@@ -47,7 +47,7 @@ entry:
 define <2 x double> @test_negative_zero_2(<2 x double> %A) {
 ; CHECK-LABEL: test_negative_zero_2:
 ; CHECK:       # BB#0: # %entry
-; CHECK-NEXT:    movhpd {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
 ; CHECK-NEXT:    retq
 entry:
   %0 = extractelement <2 x double> %A, i32 0
diff --git a/test/CodeGen/X86/merge-consecutive-loads-256.ll b/test/CodeGen/X86/merge-consecutive-loads-256.ll
index f7354b5ae6a..29d03b1e3c0 100644
--- a/test/CodeGen/X86/merge-consecutive-loads-256.ll
+++ b/test/CodeGen/X86/merge-consecutive-loads-256.ll
@@ -275,7 +275,7 @@ define <8 x float> @merge_8f32_2f32_23z5(<2 x float>* %ptr) nounwind uwtable noi
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vmovupd 16(%rdi), %xmm0
 ; AVX1-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:    vmovhpd 40(%rdi), %xmm1, %xmm1
+; AVX1-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
@@ -283,7 +283,7 @@ define <8 x float> @merge_8f32_2f32_23z5(<2 x float>* %ptr) nounwind uwtable noi
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    vmovupd 16(%rdi), %xmm0
 ; AVX2-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vmovhpd 40(%rdi), %xmm1, %xmm1
+; AVX2-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
 ; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
 ;
@@ -291,7 +291,7 @@ define <8 x float> @merge_8f32_2f32_23z5(<2 x float>* %ptr) nounwind uwtable noi
 ; AVX512F:       # BB#0:
 ; AVX512F-NEXT:    vmovups 16(%rdi), %xmm0
 ; AVX512F-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT:    vmovhpd 40(%rdi), %xmm1, %xmm1
+; AVX512F-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
 ; AVX512F-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    retq
 ;
diff --git a/test/CodeGen/X86/vec_insert-2.ll b/test/CodeGen/X86/vec_insert-2.ll
index fe20a474f59..2e6654185de 100644
--- a/test/CodeGen/X86/vec_insert-2.ll
+++ b/test/CodeGen/X86/vec_insert-2.ll
@@ -1,42 +1,68 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2,-sse4.1 | FileCheck --check-prefix=X32 %s
-; RUN: llc < %s -march=x86-64 -mattr=+sse2,-sse4.1 | FileCheck --check-prefix=X64 %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse2,-sse4.1 | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2,-sse4.1 | FileCheck %s --check-prefix=X64
 
 define <4 x float> @t1(float %s, <4 x float> %tmp) nounwind {
 ; X32-LABEL: t1:
-; X32: shufps $36
-; X32: ret
-
+; X32:       # BB#0:
+; X32-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
+; X32-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
+; X32-NEXT:    retl
+;
+; X64-LABEL: t1:
+; X64:       # BB#0:
+; X64-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,0]
+; X64-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
+; X64-NEXT:    movaps %xmm1, %xmm0
+; X64-NEXT:    retq
   %tmp1 = insertelement <4 x float> %tmp, float %s, i32 3
   ret <4 x float> %tmp1
 }
 
 define <4 x i32> @t2(i32 %s, <4 x i32> %tmp) nounwind {
 ; X32-LABEL: t2:
-; X32: shufps $36
-; X32: ret
-
+; X32:       # BB#0:
+; X32-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
+; X32-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
+; X32-NEXT:    retl
+;
+; X64-LABEL: t2:
+; X64:       # BB#0:
+; X64-NEXT:    movd %edi, %xmm1
+; X64-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
+; X64-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
+; X64-NEXT:    retq
   %tmp1 = insertelement <4 x i32> %tmp, i32 %s, i32 3
   ret <4 x i32> %tmp1
 }
 
 define <2 x double> @t3(double %s, <2 x double> %tmp) nounwind {
 ; X32-LABEL: t3:
-; X32: movhpd
-; X32: ret
-
+; X32:       # BB#0:
+; X32-NEXT:    movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; X32-NEXT:    retl
+;
 ; X64-LABEL: t3:
-; X64: unpcklpd
-; X64: ret
-
+; X64:       # BB#0:
+; X64-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; X64-NEXT:    movapd %xmm1, %xmm0
+; X64-NEXT:    retq
   %tmp1 = insertelement <2 x double> %tmp, double %s, i32 1
   ret <2 x double> %tmp1
 }
 
 define <8 x i16> @t4(i16 %s, <8 x i16> %tmp) nounwind {
 ; X32-LABEL: t4:
-; X32: pinsrw
-; X32: ret
-
+; X32:       # BB#0:
+; X32-NEXT:    pinsrw $5, {{[0-9]+}}(%esp), %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: t4:
+; X64:       # BB#0:
+; X64-NEXT:    pinsrw $5, %edi, %xmm0
+; X64-NEXT:    retq
   %tmp1 = insertelement <8 x i16> %tmp, i16 %s, i32 5
   ret <8 x i16> %tmp1
 }
diff --git a/test/CodeGen/X86/vector-shuffle-128-v2.ll b/test/CodeGen/X86/vector-shuffle-128-v2.ll
index a2aa2025e70..e5b5c82e978 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v2.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v2.ll
@@ -1077,17 +1077,17 @@ define <2 x i64> @insert_reg_lo_v2i64(i64 %a, <2 x i64> %b) {
 define <2 x i64> @insert_mem_lo_v2i64(i64* %ptr, <2 x i64> %b) {
 ; SSE2-LABEL: insert_mem_lo_v2i64:
 ; SSE2:       # BB#0:
-; SSE2-NEXT:    movlpd (%rdi), %xmm0
+; SSE2-NEXT:    movlpd {{.*#+}} xmm0 = mem[0],xmm0[1]
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: insert_mem_lo_v2i64:
 ; SSE3:       # BB#0:
-; SSE3-NEXT:    movlpd (%rdi), %xmm0
+; SSE3-NEXT:    movlpd {{.*#+}} xmm0 = mem[0],xmm0[1]
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: insert_mem_lo_v2i64:
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    movlpd (%rdi), %xmm0
+; SSSE3-NEXT:    movlpd {{.*#+}} xmm0 = mem[0],xmm0[1]
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: insert_mem_lo_v2i64:
@@ -1173,12 +1173,12 @@ define <2 x double> @insert_reg_lo_v2f64(double %a, <2 x double> %b) {
 define <2 x double> @insert_mem_lo_v2f64(double* %ptr, <2 x double> %b) {
 ; SSE-LABEL: insert_mem_lo_v2f64:
 ; SSE:       # BB#0:
-; SSE-NEXT:    movlpd (%rdi), %xmm0
+; SSE-NEXT:    movlpd {{.*#+}} xmm0 = mem[0],xmm0[1]
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: insert_mem_lo_v2f64:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vmovlpd (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    vmovlpd {{.*#+}} xmm0 = mem[0],xmm0[1]
 ; AVX-NEXT:    retq
   %a = load double, double* %ptr
   %v = insertelement <2 x double> undef, double %a, i32 0
@@ -1205,12 +1205,12 @@ define <2 x double> @insert_reg_hi_v2f64(double %a, <2 x double> %b) {
 define <2 x double> @insert_mem_hi_v2f64(double* %ptr, <2 x double> %b) {
 ; SSE-LABEL: insert_mem_hi_v2f64:
 ; SSE:       # BB#0:
-; SSE-NEXT:    movhpd (%rdi), %xmm0
+; SSE-NEXT:    movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: insert_mem_hi_v2f64:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vmovhpd (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
 ; AVX-NEXT:    retq
   %a = load double, double* %ptr
   %v = insertelement <2 x double> undef, double %a, i32 0
diff --git a/test/CodeGen/X86/vector-shuffle-128-v4.ll b/test/CodeGen/X86/vector-shuffle-128-v4.ll
index 2192f13dd06..35c35401d91 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v4.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v4.ll
@@ -1956,17 +1956,17 @@ define <4 x i32> @insert_reg_lo_v4i32(i64 %a, <4 x i32> %b) {
 define <4 x i32> @insert_mem_lo_v4i32(<2 x i32>* %ptr, <4 x i32> %b) {
 ; SSE2-LABEL: insert_mem_lo_v4i32:
 ; SSE2:       # BB#0:
-; SSE2-NEXT:    movlpd (%rdi), %xmm0
+; SSE2-NEXT:    movlpd {{.*#+}} xmm0 = mem[0],xmm0[1]
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: insert_mem_lo_v4i32:
 ; SSE3:       # BB#0:
-; SSE3-NEXT:    movlpd (%rdi), %xmm0
+; SSE3-NEXT:    movlpd {{.*#+}} xmm0 = mem[0],xmm0[1]
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: insert_mem_lo_v4i32:
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    movlpd (%rdi), %xmm0
+; SSSE3-NEXT:    movlpd {{.*#+}} xmm0 = mem[0],xmm0[1]
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: insert_mem_lo_v4i32:
@@ -2048,12 +2048,12 @@ define <4 x float> @insert_reg_lo_v4f32(double %a, <4 x float> %b) {
 define <4 x float> @insert_mem_lo_v4f32(<2 x float>* %ptr, <4 x float> %b) {
 ; SSE-LABEL: insert_mem_lo_v4f32:
 ; SSE:       # BB#0:
-; SSE-NEXT:    movlpd (%rdi), %xmm0
+; SSE-NEXT:    movlpd {{.*#+}} xmm0 = mem[0],xmm0[1]
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: insert_mem_lo_v4f32:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vmovlpd (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    vmovlpd {{.*#+}} xmm0 = mem[0],xmm0[1]
 ; AVX-NEXT:    retq
   %a = load <2 x float>, <2 x float>* %ptr
   %v = shufflevector <2 x float> %a, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
@@ -2081,12 +2081,12 @@ define <4 x float> @insert_reg_hi_v4f32(double %a, <4 x float> %b) {
 define <4 x float> @insert_mem_hi_v4f32(<2 x float>* %ptr, <4 x float> %b) {
 ; SSE-LABEL: insert_mem_hi_v4f32:
 ; SSE:       # BB#0:
-; SSE-NEXT:    movhpd (%rdi), %xmm0
+; SSE-NEXT:    movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: insert_mem_hi_v4f32:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vmovhpd (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
 ; AVX-NEXT:    retq
   %a = load <2 x float>, <2 x float>* %ptr
   %v = shufflevector <2 x float> %a, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
diff --git a/test/CodeGen/X86/vector-shuffle-256-v8.ll b/test/CodeGen/X86/vector-shuffle-256-v8.ll
index 485b79c5cfc..1908a5ae207 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v8.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v8.ll
@@ -2192,7 +2192,7 @@ define <8 x float> @concat_v2f32_1(<2 x float>* %tmp64, <2 x float>* %tmp65) {
 ; ALL-LABEL: concat_v2f32_1:
 ; ALL:       # BB#0: # %entry
 ; ALL-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
-; ALL-NEXT:    vmovhpd (%rsi), %xmm0, %xmm0
+; ALL-NEXT:    vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
 ; ALL-NEXT:    retq
 entry:
   %tmp74 = load <2 x float>, <2 x float>* %tmp65, align 8
@@ -2207,7 +2207,7 @@ define <8 x float> @concat_v2f32_2(<2 x float>* %tmp64, <2 x float>* %tmp65) {
 ; ALL-LABEL: concat_v2f32_2:
 ; ALL:       # BB#0: # %entry
 ; ALL-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
-; ALL-NEXT:    vmovhpd (%rsi), %xmm0, %xmm0
+; ALL-NEXT:    vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
 ; ALL-NEXT:    retq
 entry:
   %tmp74 = load <2 x float>, <2 x float>* %tmp65, align 8
@@ -2220,7 +2220,7 @@ define <8 x float> @concat_v2f32_3(<2 x float>* %tmp64, <2 x float>* %tmp65) {
 ; ALL-LABEL: concat_v2f32_3:
 ; ALL:       # BB#0: # %entry
 ; ALL-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
-; ALL-NEXT:    vmovhpd (%rsi), %xmm0, %xmm0
+; ALL-NEXT:    vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
 ; ALL-NEXT:    retq
 entry:
   %tmp74 = load <2 x float>, <2 x float>* %tmp65, align 8
diff --git a/test/CodeGen/X86/vector-shuffle-combining.ll b/test/CodeGen/X86/vector-shuffle-combining.ll
index 75ce9753525..c1160c00c24 100644
--- a/test/CodeGen/X86/vector-shuffle-combining.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining.ll
@@ -1783,13 +1783,13 @@ define <8 x float> @combine_test22(<2 x float>* %a, <2 x float>* %b) {
 ; SSE-LABEL: combine_test22:
 ; SSE:       # BB#0:
 ; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; SSE-NEXT:    movhpd (%rsi), %xmm0
+; SSE-NEXT:    movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: combine_test22:
 ; AVX:       # BB#0:
 ; AVX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX-NEXT:    vmovhpd (%rsi), %xmm0, %xmm0
+; AVX-NEXT:    vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
 ; AVX-NEXT:    retq
 ; Current AVX2 lowering of this is still awful, not adding a test case.
   %1 = load <2 x float>, <2 x float>* %a, align 8
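
For reference, below is a minimal standalone sketch of the masks DecodeInsertElementMask produces for the four (VT, Idx, Len) combinations wired up in X86InstComments.cpp above. It is not part of the patch: the MVT argument is replaced by a plain element count so it compiles outside the LLVM tree, and decodeInsertElementMask/main are illustrative names only.

#include <cassert>
#include <cstdio>
#include <vector>

// Standalone restatement of the new decoder: start from the identity mask
// over the destination's NumElts elements, then overwrite Len elements
// starting at Idx with elements of the second (memory) source, which the
// shuffle-mask convention numbers NumElts, NumElts+1, ...
static std::vector<int> decodeInsertElementMask(unsigned NumElts, unsigned Idx,
                                                unsigned Len) {
  assert((Idx + Len) <= NumElts && "Insertion out of range");
  std::vector<int> Mask;
  for (unsigned i = 0; i != NumElts; ++i)
    Mask.push_back(i);
  for (unsigned i = 0; i != Len; ++i)
    Mask[Idx + i] = NumElts + i;
  return Mask;
}

int main() {
  // The four instruction groups decoded by the patch.
  struct { const char *Name; unsigned NumElts, Idx, Len; } Cases[] = {
      {"movhpd (v2f64)", 2, 1, 1}, // xmm0 = xmm0[0],mem[0]
      {"movhps (v4f32)", 4, 2, 2}, // xmm0 = xmm0[0,1],mem[0,1]
      {"movlpd (v2f64)", 2, 0, 1}, // xmm0 = mem[0],xmm0[1]
      {"movlps (v4f32)", 4, 0, 2}, // xmm0 = mem[0,1],xmm0[2,3]
  };
  for (const auto &C : Cases) {
    std::printf("%-16s:", C.Name);
    for (int M : decodeInsertElementMask(C.NumElts, C.Idx, C.Len))
      std::printf(" %d", M);
    std::printf("\n");
  }
  return 0;
}

Since destination elements are numbered 0..NumElts-1 and memory elements from NumElts upward, movhpd yields the mask <0,2>, which the comment printer renders as xmm0[0],mem[0] - exactly the form asserted by the updated CHECK lines in the tests above.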