[X86][SSE] Use (V)PINSRB for direct byte insertion into v16i8 build vectors on SSE4.1 targets

This patch allows SSE4.1 targets to use (V)PINSRB to build v16i8 vectors by inserting i8 scalars directly into an XMM register, instead of merging pairs of i8 scalars into an i16 and inserting them with the SSE2 PINSRW instruction.

This also allows byte loads to be folded into the insertion and reduces scalar register usage.
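For illustration, here is a minimal intrinsics sketch (not part of this patch) that materializes the first two bytes of a v16i8 value both ways; the function names and fixed lane indices are placeholders, and the SSE4.1 variant needs -msse4.1 (or equivalent) to compile:

#include <emmintrin.h>  // SSE2: _mm_insert_epi16 (PINSRW)
#include <smmintrin.h>  // SSE4.1: _mm_insert_epi8 (PINSRB)
#include <cstdint>

// Pre-SSE4.1: merge a pair of i8 scalars into an i16 in a scalar register,
// then insert the combined word with PINSRW (extra shift/or scalar work).
__m128i insert_two_bytes_sse2(__m128i v, uint8_t b0, uint8_t b1) {
  int word = (unsigned)b0 | ((unsigned)b1 << 8);
  return _mm_insert_epi16(v, word, 0);  // pinsrw $0
}

// SSE4.1: insert each i8 scalar directly with PINSRB; a byte load feeding
// b0/b1 can be folded into the instruction's memory operand.
__m128i insert_two_bytes_sse41(__m128i v, uint8_t b0, uint8_t b1) {
  v = _mm_insert_epi8(v, b0, 0);  // pinsrb $0
  v = _mm_insert_epi8(v, b1, 1);  // pinsrb $1
  return v;
}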

Differential Revision: http://reviews.llvm.org/D8839

llvm-svn: 234193
Simon Pilgrim 2015-04-06 18:39:00 +00:00
parent d56c59218e
commit a5a17c0b23
3 changed files with 115 additions and 68 deletions


@@ -4460,6 +4460,29 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
  SDLoc dl(Op);
  SDValue V;
  bool First = true;
  // SSE4.1 - use PINSRB to insert each byte directly.
  if (Subtarget->hasSSE41()) {
    for (unsigned i = 0; i < 16; ++i) {
      bool isNonZero = (NonZeros & (1 << i)) != 0;
      if (isNonZero) {
        if (First) {
          if (NumZero)
            V = getZeroVector(MVT::v16i8, Subtarget, DAG, dl);
          else
            V = DAG.getUNDEF(MVT::v16i8);
          First = false;
        }
        V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
                        MVT::v16i8, V, Op.getOperand(i),
                        DAG.getIntPtrConstant(i));
      }
    }
    return V;
  }
  // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
  for (unsigned i = 0; i < 16; ++i) {
    bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
    if (ThisIsNonZero && First) {


@@ -100,37 +100,29 @@ define <8 x i8> @foo3_8(<8 x float> %src) {
;
; CHECK-WIDE-LABEL: foo3_8:
; CHECK-WIDE: ## BB#0:
; CHECK-WIDE-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
; CHECK-WIDE-NEXT: vcvttss2si %xmm1, %eax
; CHECK-WIDE-NEXT: shll $8, %eax
; CHECK-WIDE-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; CHECK-WIDE-NEXT: vcvttss2si %xmm1, %ecx
; CHECK-WIDE-NEXT: movzbl %cl, %ecx
; CHECK-WIDE-NEXT: orl %eax, %ecx
; CHECK-WIDE-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-WIDE-NEXT: vcvttss2si %xmm1, %eax
; CHECK-WIDE-NEXT: shll $8, %eax
; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %edx
; CHECK-WIDE-NEXT: movzbl %dl, %edx
; CHECK-WIDE-NEXT: orl %eax, %edx
; CHECK-WIDE-NEXT: vpinsrw $0, %edx, %xmm0, %xmm1
; CHECK-WIDE-NEXT: vpinsrw $1, %ecx, %xmm1, %xmm1
; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %eax
; CHECK-WIDE-NEXT: vpinsrb $0, %eax, %xmm0, %xmm1
; CHECK-WIDE-NEXT: vmovshdup %xmm0, %xmm2 ## xmm2 = xmm0[1,1,3,3]
; CHECK-WIDE-NEXT: vcvttss2si %xmm2, %eax
; CHECK-WIDE-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
; CHECK-WIDE-NEXT: vpermilpd $1, %xmm0, %xmm2 ## xmm2 = xmm0[1,0]
; CHECK-WIDE-NEXT: vcvttss2si %xmm2, %eax
; CHECK-WIDE-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
; CHECK-WIDE-NEXT: vpermilps $231, %xmm0, %xmm2 ## xmm2 = xmm0[3,1,2,3]
; CHECK-WIDE-NEXT: vcvttss2si %xmm2, %eax
; CHECK-WIDE-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
; CHECK-WIDE-NEXT: vextractf128 $1, %ymm0, %xmm0
; CHECK-WIDE-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %eax
; CHECK-WIDE-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
; CHECK-WIDE-NEXT: vmovshdup %xmm0, %xmm2 ## xmm2 = xmm0[1,1,3,3]
; CHECK-WIDE-NEXT: vcvttss2si %xmm2, %eax
; CHECK-WIDE-NEXT: shll $8, %eax
; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %ecx
; CHECK-WIDE-NEXT: movzbl %cl, %ecx
; CHECK-WIDE-NEXT: orl %eax, %ecx
; CHECK-WIDE-NEXT: vpinsrw $2, %ecx, %xmm1, %xmm1
; CHECK-WIDE-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3]
; CHECK-WIDE-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
; CHECK-WIDE-NEXT: vpermilpd $1, %xmm0, %xmm2 ## xmm2 = xmm0[1,0]
; CHECK-WIDE-NEXT: vcvttss2si %xmm2, %eax
; CHECK-WIDE-NEXT: shll $8, %eax
; CHECK-WIDE-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %ecx
; CHECK-WIDE-NEXT: movzbl %cl, %ecx
; CHECK-WIDE-NEXT: orl %eax, %ecx
; CHECK-WIDE-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm0
; CHECK-WIDE-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
; CHECK-WIDE-NEXT: vpermilps $231, %xmm0, %xmm0 ## xmm0 = xmm0[3,1,2,3]
; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %eax
; CHECK-WIDE-NEXT: vpinsrb $7, %eax, %xmm1, %xmm0
; CHECK-WIDE-NEXT: vzeroupper
; CHECK-WIDE-NEXT: retl
%res = fptosi <8 x float> %src to <8 x i8>
@@ -145,21 +137,17 @@ define <4 x i8> @foo3_4(<4 x float> %src) {
;
; CHECK-WIDE-LABEL: foo3_4:
; CHECK-WIDE: ## BB#0:
; CHECK-WIDE-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
; CHECK-WIDE-NEXT: vcvttss2si %xmm1, %eax
; CHECK-WIDE-NEXT: shll $8, %eax
; CHECK-WIDE-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; CHECK-WIDE-NEXT: vcvttss2si %xmm1, %ecx
; CHECK-WIDE-NEXT: movzbl %cl, %ecx
; CHECK-WIDE-NEXT: orl %eax, %ecx
; CHECK-WIDE-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-WIDE-NEXT: vcvttss2si %xmm1, %eax
; CHECK-WIDE-NEXT: shll $8, %eax
; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %edx
; CHECK-WIDE-NEXT: movzbl %dl, %edx
; CHECK-WIDE-NEXT: orl %eax, %edx
; CHECK-WIDE-NEXT: vpinsrw $0, %edx, %xmm0, %xmm0
; CHECK-WIDE-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0
; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %eax
; CHECK-WIDE-NEXT: vpinsrb $0, %eax, %xmm0, %xmm1
; CHECK-WIDE-NEXT: vmovshdup %xmm0, %xmm2 ## xmm2 = xmm0[1,1,3,3]
; CHECK-WIDE-NEXT: vcvttss2si %xmm2, %eax
; CHECK-WIDE-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
; CHECK-WIDE-NEXT: vpermilpd $1, %xmm0, %xmm2 ## xmm2 = xmm0[1,0]
; CHECK-WIDE-NEXT: vcvttss2si %xmm2, %eax
; CHECK-WIDE-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
; CHECK-WIDE-NEXT: vpermilps $231, %xmm0, %xmm0 ## xmm0 = xmm0[3,1,2,3]
; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %eax
; CHECK-WIDE-NEXT: vpinsrb $3, %eax, %xmm1, %xmm0
; CHECK-WIDE-NEXT: retl
%res = fptosi <4 x float> %src to <4 x i8>
ret <4 x i8> %res


@@ -651,18 +651,30 @@ define <16 x i8> @shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(
}
define <16 x i8> @shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(i8 %i) {
; SSE-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
; SSE: # BB#0:
; SSE-NEXT: shll $8, %edi
; SSE-NEXT: pxor %xmm0, %xmm0
; SSE-NEXT: pinsrw $2, %edi, %xmm0
; SSE-NEXT: retq
; SSE2-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
; SSE2: # BB#0:
; SSE2-NEXT: shll $8, %edi
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: pinsrw $2, %edi, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
; SSSE3: # BB#0:
; SSSE3-NEXT: shll $8, %edi
; SSSE3-NEXT: pxor %xmm0, %xmm0
; SSSE3-NEXT: pinsrw $2, %edi, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
; SSE41: # BB#0:
; SSE41-NEXT: pxor %xmm0, %xmm0
; SSE41-NEXT: pinsrb $5, %edi, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
; AVX: # BB#0:
; AVX-NEXT: shll $8, %edi
; AVX-NEXT: vpxor %xmm0, %xmm0
; AVX-NEXT: vpinsrw $2, %edi, %xmm0
; AVX-NEXT: vpinsrb $5, %edi, %xmm0
; AVX-NEXT: retq
%a = insertelement <16 x i8> undef, i8 %i, i32 0
%shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -670,18 +682,30 @@ define <16 x i8> @shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(
}
define <16 x i8> @shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16(i8 %i) {
; SSE-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16:
; SSE: # BB#0:
; SSE-NEXT: shll $8, %edi
; SSE-NEXT: pxor %xmm0, %xmm0
; SSE-NEXT: pinsrw $7, %edi, %xmm0
; SSE-NEXT: retq
; SSE2-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16:
; SSE2: # BB#0:
; SSE2-NEXT: shll $8, %edi
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: pinsrw $7, %edi, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16:
; SSSE3: # BB#0:
; SSSE3-NEXT: shll $8, %edi
; SSSE3-NEXT: pxor %xmm0, %xmm0
; SSSE3-NEXT: pinsrw $7, %edi, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16:
; SSE41: # BB#0:
; SSE41-NEXT: pxor %xmm0, %xmm0
; SSE41-NEXT: pinsrb $15, %edi, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16:
; AVX: # BB#0:
; AVX-NEXT: shll $8, %edi
; AVX-NEXT: vpxor %xmm0, %xmm0
; AVX-NEXT: vpinsrw $7, %edi, %xmm0
; AVX-NEXT: vpinsrb $15, %edi, %xmm0
; AVX-NEXT: retq
%a = insertelement <16 x i8> undef, i8 %i, i32 0
%shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 16>
@@ -689,18 +713,30 @@ define <16 x i8> @shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16(
}
define <16 x i8> @shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(i8 %i) {
; SSE-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
; SSE: # BB#0:
; SSE-NEXT: movzbl %dil, %eax
; SSE-NEXT: pxor %xmm0, %xmm0
; SSE-NEXT: pinsrw $1, %eax, %xmm0
; SSE-NEXT: retq
; SSE2-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
; SSE2: # BB#0:
; SSE2-NEXT: movzbl %dil, %eax
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: pinsrw $1, %eax, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
; SSSE3: # BB#0:
; SSSE3-NEXT: movzbl %dil, %eax
; SSSE3-NEXT: pxor %xmm0, %xmm0
; SSSE3-NEXT: pinsrw $1, %eax, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
; SSE41: # BB#0:
; SSE41-NEXT: pxor %xmm0, %xmm0
; SSE41-NEXT: pinsrb $2, %edi, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
; AVX: # BB#0:
; AVX-NEXT: movzbl %dil, %eax
; AVX-NEXT: vpxor %xmm0, %xmm0
; AVX-NEXT: vpinsrw $1, %eax, %xmm0
; AVX-NEXT: vpinsrb $2, %edi, %xmm0
; AVX-NEXT: retq
%a = insertelement <16 x i8> undef, i8 %i, i32 3
%shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 19, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>