mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-12-02 00:16:25 +00:00
[X86][SSE] Use (V)PINSRB for direct byte insertion in 16i8 buildvector on SSE4.1 targets
This patch allows SSE4.1 targets to use (V)PINSRB to create 16i8 vectors by inserting i8 scalars directly into an XMM register instead of merging pairs of i8 scalars into an i16 and using the SSE2 PINSRW instruction. This allows folding of byte loads and reduces scalar register usage as well. Differential Revision: http://reviews.llvm.org/D8839 llvm-svn: 234193
This commit is contained in:
parent
d56c59218e
commit
a5a17c0b23
@ -4460,6 +4460,29 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
|
||||
SDLoc dl(Op);
|
||||
SDValue V;
|
||||
bool First = true;
|
||||
|
||||
// SSE4.1 - use PINSRB to insert each byte directly.
|
||||
if (Subtarget->hasSSE41()) {
|
||||
for (unsigned i = 0; i < 16; ++i) {
|
||||
bool isNonZero = (NonZeros & (1 << i)) != 0;
|
||||
if (isNonZero) {
|
||||
if (First) {
|
||||
if (NumZero)
|
||||
V = getZeroVector(MVT::v16i8, Subtarget, DAG, dl);
|
||||
else
|
||||
V = DAG.getUNDEF(MVT::v16i8);
|
||||
First = false;
|
||||
}
|
||||
V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
|
||||
MVT::v16i8, V, Op.getOperand(i),
|
||||
DAG.getIntPtrConstant(i));
|
||||
}
|
||||
}
|
||||
|
||||
return V;
|
||||
}
|
||||
|
||||
// Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
|
||||
for (unsigned i = 0; i < 16; ++i) {
|
||||
bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
|
||||
if (ThisIsNonZero && First) {
|
||||
|
@ -100,37 +100,29 @@ define <8 x i8> @foo3_8(<8 x float> %src) {
|
||||
;
|
||||
; CHECK-WIDE-LABEL: foo3_8:
|
||||
; CHECK-WIDE: ## BB#0:
|
||||
; CHECK-WIDE-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
|
||||
; CHECK-WIDE-NEXT: vcvttss2si %xmm1, %eax
|
||||
; CHECK-WIDE-NEXT: shll $8, %eax
|
||||
; CHECK-WIDE-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
|
||||
; CHECK-WIDE-NEXT: vcvttss2si %xmm1, %ecx
|
||||
; CHECK-WIDE-NEXT: movzbl %cl, %ecx
|
||||
; CHECK-WIDE-NEXT: orl %eax, %ecx
|
||||
; CHECK-WIDE-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
|
||||
; CHECK-WIDE-NEXT: vcvttss2si %xmm1, %eax
|
||||
; CHECK-WIDE-NEXT: shll $8, %eax
|
||||
; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %edx
|
||||
; CHECK-WIDE-NEXT: movzbl %dl, %edx
|
||||
; CHECK-WIDE-NEXT: orl %eax, %edx
|
||||
; CHECK-WIDE-NEXT: vpinsrw $0, %edx, %xmm0, %xmm1
|
||||
; CHECK-WIDE-NEXT: vpinsrw $1, %ecx, %xmm1, %xmm1
|
||||
; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %eax
|
||||
; CHECK-WIDE-NEXT: vpinsrb $0, %eax, %xmm0, %xmm1
|
||||
; CHECK-WIDE-NEXT: vmovshdup %xmm0, %xmm2 ## xmm2 = xmm0[1,1,3,3]
|
||||
; CHECK-WIDE-NEXT: vcvttss2si %xmm2, %eax
|
||||
; CHECK-WIDE-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
|
||||
; CHECK-WIDE-NEXT: vpermilpd $1, %xmm0, %xmm2 ## xmm2 = xmm0[1,0]
|
||||
; CHECK-WIDE-NEXT: vcvttss2si %xmm2, %eax
|
||||
; CHECK-WIDE-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
|
||||
; CHECK-WIDE-NEXT: vpermilps $231, %xmm0, %xmm2 ## xmm2 = xmm0[3,1,2,3]
|
||||
; CHECK-WIDE-NEXT: vcvttss2si %xmm2, %eax
|
||||
; CHECK-WIDE-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
|
||||
; CHECK-WIDE-NEXT: vextractf128 $1, %ymm0, %xmm0
|
||||
; CHECK-WIDE-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
|
||||
; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %eax
|
||||
; CHECK-WIDE-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
|
||||
; CHECK-WIDE-NEXT: vmovshdup %xmm0, %xmm2 ## xmm2 = xmm0[1,1,3,3]
|
||||
; CHECK-WIDE-NEXT: vcvttss2si %xmm2, %eax
|
||||
; CHECK-WIDE-NEXT: shll $8, %eax
|
||||
; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %ecx
|
||||
; CHECK-WIDE-NEXT: movzbl %cl, %ecx
|
||||
; CHECK-WIDE-NEXT: orl %eax, %ecx
|
||||
; CHECK-WIDE-NEXT: vpinsrw $2, %ecx, %xmm1, %xmm1
|
||||
; CHECK-WIDE-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3]
|
||||
; CHECK-WIDE-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
|
||||
; CHECK-WIDE-NEXT: vpermilpd $1, %xmm0, %xmm2 ## xmm2 = xmm0[1,0]
|
||||
; CHECK-WIDE-NEXT: vcvttss2si %xmm2, %eax
|
||||
; CHECK-WIDE-NEXT: shll $8, %eax
|
||||
; CHECK-WIDE-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
|
||||
; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %ecx
|
||||
; CHECK-WIDE-NEXT: movzbl %cl, %ecx
|
||||
; CHECK-WIDE-NEXT: orl %eax, %ecx
|
||||
; CHECK-WIDE-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm0
|
||||
; CHECK-WIDE-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
|
||||
; CHECK-WIDE-NEXT: vpermilps $231, %xmm0, %xmm0 ## xmm0 = xmm0[3,1,2,3]
|
||||
; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %eax
|
||||
; CHECK-WIDE-NEXT: vpinsrb $7, %eax, %xmm1, %xmm0
|
||||
; CHECK-WIDE-NEXT: vzeroupper
|
||||
; CHECK-WIDE-NEXT: retl
|
||||
%res = fptosi <8 x float> %src to <8 x i8>
|
||||
@ -145,21 +137,17 @@ define <4 x i8> @foo3_4(<4 x float> %src) {
|
||||
;
|
||||
; CHECK-WIDE-LABEL: foo3_4:
|
||||
; CHECK-WIDE: ## BB#0:
|
||||
; CHECK-WIDE-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
|
||||
; CHECK-WIDE-NEXT: vcvttss2si %xmm1, %eax
|
||||
; CHECK-WIDE-NEXT: shll $8, %eax
|
||||
; CHECK-WIDE-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
|
||||
; CHECK-WIDE-NEXT: vcvttss2si %xmm1, %ecx
|
||||
; CHECK-WIDE-NEXT: movzbl %cl, %ecx
|
||||
; CHECK-WIDE-NEXT: orl %eax, %ecx
|
||||
; CHECK-WIDE-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
|
||||
; CHECK-WIDE-NEXT: vcvttss2si %xmm1, %eax
|
||||
; CHECK-WIDE-NEXT: shll $8, %eax
|
||||
; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %edx
|
||||
; CHECK-WIDE-NEXT: movzbl %dl, %edx
|
||||
; CHECK-WIDE-NEXT: orl %eax, %edx
|
||||
; CHECK-WIDE-NEXT: vpinsrw $0, %edx, %xmm0, %xmm0
|
||||
; CHECK-WIDE-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0
|
||||
; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %eax
|
||||
; CHECK-WIDE-NEXT: vpinsrb $0, %eax, %xmm0, %xmm1
|
||||
; CHECK-WIDE-NEXT: vmovshdup %xmm0, %xmm2 ## xmm2 = xmm0[1,1,3,3]
|
||||
; CHECK-WIDE-NEXT: vcvttss2si %xmm2, %eax
|
||||
; CHECK-WIDE-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
|
||||
; CHECK-WIDE-NEXT: vpermilpd $1, %xmm0, %xmm2 ## xmm2 = xmm0[1,0]
|
||||
; CHECK-WIDE-NEXT: vcvttss2si %xmm2, %eax
|
||||
; CHECK-WIDE-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
|
||||
; CHECK-WIDE-NEXT: vpermilps $231, %xmm0, %xmm0 ## xmm0 = xmm0[3,1,2,3]
|
||||
; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %eax
|
||||
; CHECK-WIDE-NEXT: vpinsrb $3, %eax, %xmm1, %xmm0
|
||||
; CHECK-WIDE-NEXT: retl
|
||||
%res = fptosi <4 x float> %src to <4 x i8>
|
||||
ret <4 x i8> %res
|
||||
|
@ -651,18 +651,30 @@ define <16 x i8> @shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(
|
||||
}
|
||||
|
||||
define <16 x i8> @shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(i8 %i) {
|
||||
; SSE-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
|
||||
; SSE: # BB#0:
|
||||
; SSE-NEXT: shll $8, %edi
|
||||
; SSE-NEXT: pxor %xmm0, %xmm0
|
||||
; SSE-NEXT: pinsrw $2, %edi, %xmm0
|
||||
; SSE-NEXT: retq
|
||||
|
||||
; SSE2-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
|
||||
; SSE2: # BB#0:
|
||||
; SSE2-NEXT: shll $8, %edi
|
||||
; SSE2-NEXT: pxor %xmm0, %xmm0
|
||||
; SSE2-NEXT: pinsrw $2, %edi, %xmm0
|
||||
; SSE2-NEXT: retq
|
||||
;
|
||||
; SSSE3-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
|
||||
; SSSE3: # BB#0:
|
||||
; SSSE3-NEXT: shll $8, %edi
|
||||
; SSSE3-NEXT: pxor %xmm0, %xmm0
|
||||
; SSSE3-NEXT: pinsrw $2, %edi, %xmm0
|
||||
; SSSE3-NEXT: retq
|
||||
;
|
||||
; SSE41-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
|
||||
; SSE41: # BB#0:
|
||||
; SSE41-NEXT: pxor %xmm0, %xmm0
|
||||
; SSE41-NEXT: pinsrb $5, %edi, %xmm0
|
||||
; SSE41-NEXT: retq
|
||||
;
|
||||
; AVX-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
|
||||
; AVX: # BB#0:
|
||||
; AVX-NEXT: shll $8, %edi
|
||||
; AVX-NEXT: vpxor %xmm0, %xmm0
|
||||
; AVX-NEXT: vpinsrw $2, %edi, %xmm0
|
||||
; AVX-NEXT: vpinsrb $5, %edi, %xmm0
|
||||
; AVX-NEXT: retq
|
||||
%a = insertelement <16 x i8> undef, i8 %i, i32 0
|
||||
%shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
|
||||
@ -670,18 +682,30 @@ define <16 x i8> @shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(
|
||||
}
|
||||
|
||||
define <16 x i8> @shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16(i8 %i) {
|
||||
; SSE-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16:
|
||||
; SSE: # BB#0:
|
||||
; SSE-NEXT: shll $8, %edi
|
||||
; SSE-NEXT: pxor %xmm0, %xmm0
|
||||
; SSE-NEXT: pinsrw $7, %edi, %xmm0
|
||||
; SSE-NEXT: retq
|
||||
; SSE2-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16:
|
||||
; SSE2: # BB#0:
|
||||
; SSE2-NEXT: shll $8, %edi
|
||||
; SSE2-NEXT: pxor %xmm0, %xmm0
|
||||
; SSE2-NEXT: pinsrw $7, %edi, %xmm0
|
||||
; SSE2-NEXT: retq
|
||||
;
|
||||
; SSSE3-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16:
|
||||
; SSSE3: # BB#0:
|
||||
; SSSE3-NEXT: shll $8, %edi
|
||||
; SSSE3-NEXT: pxor %xmm0, %xmm0
|
||||
; SSSE3-NEXT: pinsrw $7, %edi, %xmm0
|
||||
; SSSE3-NEXT: retq
|
||||
;
|
||||
; SSE41-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16:
|
||||
; SSE41: # BB#0:
|
||||
; SSE41-NEXT: pxor %xmm0, %xmm0
|
||||
; SSE41-NEXT: pinsrb $15, %edi, %xmm0
|
||||
; SSE41-NEXT: retq
|
||||
;
|
||||
; AVX-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16:
|
||||
; AVX: # BB#0:
|
||||
; AVX-NEXT: shll $8, %edi
|
||||
; AVX-NEXT: vpxor %xmm0, %xmm0
|
||||
; AVX-NEXT: vpinsrw $7, %edi, %xmm0
|
||||
; AVX-NEXT: vpinsrb $15, %edi, %xmm0
|
||||
; AVX-NEXT: retq
|
||||
%a = insertelement <16 x i8> undef, i8 %i, i32 0
|
||||
%shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 16>
|
||||
@ -689,18 +713,30 @@ define <16 x i8> @shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16(
|
||||
}
|
||||
|
||||
define <16 x i8> @shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(i8 %i) {
|
||||
; SSE-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
|
||||
; SSE: # BB#0:
|
||||
; SSE-NEXT: movzbl %dil, %eax
|
||||
; SSE-NEXT: pxor %xmm0, %xmm0
|
||||
; SSE-NEXT: pinsrw $1, %eax, %xmm0
|
||||
; SSE-NEXT: retq
|
||||
; SSE2-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
|
||||
; SSE2: # BB#0:
|
||||
; SSE2-NEXT: movzbl %dil, %eax
|
||||
; SSE2-NEXT: pxor %xmm0, %xmm0
|
||||
; SSE2-NEXT: pinsrw $1, %eax, %xmm0
|
||||
; SSE2-NEXT: retq
|
||||
;
|
||||
; SSSE3-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
|
||||
; SSSE3: # BB#0:
|
||||
; SSSE3-NEXT: movzbl %dil, %eax
|
||||
; SSSE3-NEXT: pxor %xmm0, %xmm0
|
||||
; SSSE3-NEXT: pinsrw $1, %eax, %xmm0
|
||||
; SSSE3-NEXT: retq
|
||||
;
|
||||
; SSE41-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
|
||||
; SSE41: # BB#0:
|
||||
; SSE41-NEXT: pxor %xmm0, %xmm0
|
||||
; SSE41-NEXT: pinsrb $2, %edi, %xmm0
|
||||
; SSE41-NEXT: retq
|
||||
;
|
||||
; AVX-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
|
||||
; AVX: # BB#0:
|
||||
; AVX-NEXT: movzbl %dil, %eax
|
||||
; AVX-NEXT: vpxor %xmm0, %xmm0
|
||||
; AVX-NEXT: vpinsrw $1, %eax, %xmm0
|
||||
; AVX-NEXT: vpinsrb $2, %edi, %xmm0
|
||||
; AVX-NEXT: retq
|
||||
%a = insertelement <16 x i8> undef, i8 %i, i32 3
|
||||
%shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 19, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
|
||||
|
Loading…
Reference in New Issue
Block a user