[x86] Add initial basic support for forming blends of v16i8 vectors.
This blend instruction is ... really lame. The register usage is insane. As a consequence this is probably only *barely* better than 2 pshufbs followed by a por, and that mostly because it only has to read from a single memory location. However, this doesn't fix as much as I kind of expected, so more to go. Pretty sure that the ordering and delegation of v16i8 is just really, really bad.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@229373 91177308-0d34-0410-b5e6-96231b3b80d8
parent ab497238cb
commit 29679ccc12
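The instruction this lowering now emits, SSE4.1 pblendvb, picks each result byte by the high bit of the corresponding mask byte; the non-VEX form takes that mask implicitly in %xmm0, which is what forces the movdqa copies in the updated SSE41 test below and is the "insane" register usage the message complains about. A minimal scalar model of the byte blend, assuming only the instruction's documented semantics:

#include <cstdint>

// Scalar model of pblendvb: the high bit of each mask byte selects the lane
// from src2 (bit set) or src1 (bit clear). The [255,...,0,...] vectors in
// the updated FileCheck lines below are per-byte selectors in exactly this
// sense.
void pblendvb_model(const uint8_t src1[16], const uint8_t src2[16],
                    const uint8_t mask[16], uint8_t dst[16]) {
  for (int i = 0; i < 16; ++i)
    dst[i] = (mask[i] & 0x80) ? src2[i] : src1[i];
}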
lib/Target/X86/X86ISelLowering.cpp

@@ -7525,11 +7525,14 @@ static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1,
     }
   }
     // FALLTHROUGH
+  case MVT::v16i8:
   case MVT::v32i8: {
-    assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
     // Scale the blend by the number of bytes per element.
-    int Scale =  VT.getScalarSizeInBits() / 8;
-    assert(Mask.size() * Scale == 32 && "Not a 256-bit vector!");
+    int Scale = VT.getScalarSizeInBits() / 8;
+
+    // This form of blend is always done on bytes. Compute the byte vector
+    // type.
+    MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
 
     // Compute the VSELECT mask. Note that VSELECT is really confusing in the
     // mix of LLVM's code generator and the x86 backend. We tell the code
@@ -7542,19 +7545,19 @@ static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1,
     // the LLVM model for boolean values in vector elements gets the relevant
     // bit set, it is set backwards and over constrained relative to x86's
     // actual model.
-    SDValue VSELECTMask[32];
+    SmallVector<SDValue, 32> VSELECTMask;
     for (int i = 0, Size = Mask.size(); i < Size; ++i)
       for (int j = 0; j < Scale; ++j)
-        VSELECTMask[Scale * i + j] =
+        VSELECTMask.push_back(
             Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
-                        : DAG.getConstant(Mask[i] < Size ? -1 : 0, MVT::i8);
+                        : DAG.getConstant(Mask[i] < Size ? -1 : 0, MVT::i8));
 
-    V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V1);
-    V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V2);
+    V1 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V1);
+    V2 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V2);
     return DAG.getNode(
         ISD::BITCAST, DL, VT,
-        DAG.getNode(ISD::VSELECT, DL, MVT::v32i8,
-                    DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, VSELECTMask),
+        DAG.getNode(ISD::VSELECT, DL, BlendVT,
+                    DAG.getNode(ISD::BUILD_VECTOR, DL, BlendVT, VSELECTMask),
                     V1, V2));
   }
 
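Outside the SelectionDAG machinery, the mask widening in the loop above amounts to the following sketch, with int8_t standing in for the SDValue constants and 0 standing in for undef lanes (an illustrative assumption, not the backend's code):

#include <cstdint>
#include <vector>

// Expand a shuffle mask of Size elements into Size * Scale byte selectors:
// -1 (all ones) for lanes taken from the first input, 0 for lanes taken
// from the second, matching the inverted sense the comment above describes.
std::vector<int8_t> scaleBlendMask(const std::vector<int> &Mask, int Scale) {
  std::vector<int8_t> Bytes;
  Bytes.reserve(Mask.size() * Scale);
  int Size = static_cast<int>(Mask.size());
  for (int M : Mask)
    for (int J = 0; J < Scale; ++J)
      Bytes.push_back(M < 0 ? 0 // undef lane; any value is acceptable
                            : (M < Size ? -1 : 0));
  return Bytes;
}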
@@ -9969,6 +9972,15 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
     }
   }
 
+  // If both V1 and V2 are in use and we can use a direct blend, do so. This
+  // avoids using blends to handle blends-with-zero which is important as
+  // a single pshufb is significantly faster for that.
+  if (V1InUse && V2InUse && Subtarget->hasSSE41())
+    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i8, V1, V2, Mask,
+                                                  Subtarget, DAG))
+      return Blend;
+
+
   if (V1InUse)
     V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, V1,
                      DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V1Mask));
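The hunk above orders the paths deliberately. A toy model of that dispatch follows; the names and the always-succeeding blend are simplifying assumptions, since in the real code lowerVectorShuffleAsBlend can fail and control falls through to the PSHUFB path:

// Which lowering a v16i8 shuffle takes under the new code.
enum class V16I8Lowering { DirectBlend, SinglePshufb, PshufbPorPair };

V16I8Lowering pickLowering(bool V1InUse, bool V2InUse, bool HasSSE41) {
  // Only reach for pblendvb when both inputs contribute lanes; a
  // blend-with-zero is significantly faster as a single pshufb.
  if (V1InUse && V2InUse && HasSSE41)
    return V16I8Lowering::DirectBlend;
  if (V1InUse != V2InUse)
    return V16I8Lowering::SinglePshufb;
  return V16I8Lowering::PshufbPorPair; // two pshufbs joined by a por
}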
test/CodeGen/X86/vector-shuffle-128-v16.ll

@@ -530,16 +530,16 @@ define <16 x i8> @shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31(
 ;
 ; SSE41-LABEL: shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31:
 ; SSE41:       # BB#0:
-; SSE41-NEXT:    pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[4],zero,zero,xmm1[7],zero,zero,zero,zero,xmm1[12],zero,zero,xmm1[15]
-; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3],zero,xmm0[5,6],zero,xmm0[8,9,10,11],zero,xmm0[13,14],zero
-; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [255,255,255,255,0,255,255,0,255,255,255,255,0,255,255,0]
+; SSE41-NEXT:    pblendvb %xmm2, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[4],zero,zero,xmm1[7],zero,zero,zero,zero,xmm1[12],zero,zero,xmm1[15]
-; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3],zero,xmm0[5,6],zero,xmm0[8,9,10,11],zero,xmm0[13,14],zero
-; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,255,255,255,0,255,255,0,255,255,255,255,0,255,255,0]
+; AVX-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    retq
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 20, i32 5, i32 6, i32 23, i32 8, i32 9, i32 10, i32 11, i32 28, i32 13, i32 14, i32 31>
   ret <16 x i8> %shuffle
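Feeding this test's shuffle mask through the scaleBlendMask sketch from earlier reproduces the pblendvb constant the new CHECK lines expect. A hypothetical harness, assuming the earlier function is compiled into the same program (-1 prints as 255):

#include <cstdint>
#include <cstdio>
#include <vector>

std::vector<int8_t> scaleBlendMask(const std::vector<int> &Mask, int Scale);

int main() {
  // Indices >= 16 pick from %b and map to 0; the rest map to -1 (255).
  std::vector<int> Mask = {0, 1, 2,  3,  20, 5,  6,  23,
                           8, 9, 10, 11, 28, 13, 14, 31};
  for (int8_t B : scaleBlendMask(Mask, /*Scale=*/1))
    std::printf("%d,", B & 0xFF);
  std::printf("\n"); // 255,255,255,255,0,255,255,0,255,255,255,255,0,255,255,0,
  return 0;
}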
test/CodeGen/X86/vector-shuffle-256-v32.ll

@@ -1492,27 +1492,25 @@ define <32 x i8> @shuffle_v32i8_08_08_08_08_08_08_08_08_uu_uu_uu_uu_uu_uu_uu_uu_
 define <32 x i8> @shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_20_19_52_19_49_54_37_32_48_42_59_07_36_34_36_39(<32 x i8> %a, <32 x i8> %b) {
 ; AVX1-LABEL: shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_20_19_52_19_49_54_37_32_48_42_59_07_36_34_36_39:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,xmm0[u],zero,xmm0[u,u,u,u,u,u,u,7,u,u,u,u]
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm4 = xmm3[4,3,u,3,u,u,u,u,u,u,u],zero,xmm3[u,u,u,u]
-; AVX1-NEXT:    vpor %xmm2, %xmm4, %xmm2
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[0,1],zero,xmm2[3],zero,zero,zero,zero,zero,zero,zero,xmm2[11],zero,zero,zero,zero
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm5 = xmm4[u,u,4,u,1,6],zero,zero,xmm4[0],zero,xmm4[11,u],zero,zero,zero,zero
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm6 = xmm1[u,u],zero,xmm1[u],zero,zero,xmm1[5,0],zero,xmm1[10],zero,xmm1[u,4,2,4,7]
-; AVX1-NEXT:    vpor %xmm5, %xmm6, %xmm5
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[2],zero,xmm5[4,5,6,7,8,9,10],zero,xmm5[12,13,14,15]
-; AVX1-NEXT:    vpor %xmm2, %xmm5, %xmm2
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[u,u,u,u,1,6,13,u,u],zero,xmm3[u,u]
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,12,13,u,u,u,u],zero,zero,zero,xmm0[u,u,12,u,u]
-; AVX1-NEXT:    vpor %xmm3, %xmm0, %xmm0
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[2,3],zero,zero,zero,zero,xmm0[8,9,10],zero,zero,xmm0[13],zero,zero
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,xmm4[u,u],zero,zero,xmm4[12],zero,xmm4[u,u,u],zero,zero,xmm4[u,0,3]
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm3 = xmm2[u,u,4,u,1,6],zero,zero,xmm2[0],zero,xmm2[11,u],zero,zero,zero,zero
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm4 = xmm1[u,u],zero,xmm1[u],zero,zero,xmm1[5,0],zero,xmm1[10],zero,xmm1[u,4,2,4,7]
+; AVX1-NEXT:    vpor %xmm3, %xmm4, %xmm3
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,xmm0[u],zero,xmm0[u,u,u,u,u,u,u,7,u,u,u,u]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm6 = xmm5[4,3,u,3,u,u,u,u,u,u,u],zero,xmm5[u,u,u,u]
+; AVX1-NEXT:    vpor %xmm4, %xmm6, %xmm4
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,0,255,0,255,255,255,255,255,255,255,0,255,255,255,255]
+; AVX1-NEXT:    vpblendvb %xmm6, %xmm3, %xmm4, %xmm3
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[u,u],zero,zero,xmm2[12],zero,xmm2[u,u,u],zero,zero,xmm2[u,0,3]
 ; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[10,13,u,u,3,3],zero,xmm1[8,u,u,u,12,1,u],zero,zero
-; AVX1-NEXT:    vpor %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,1],zero,zero,xmm1[4,5,6,7],zero,zero,zero,xmm1[11,12],zero,xmm1[14,15]
-; AVX1-NEXT:    vpor %xmm0, %xmm1, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    vpor %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = xmm5[u,u],zero,zero,xmm5[u,u,u,u,1,6,13,u,u],zero,xmm5[u,u]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,12,13,u,u,u,u],zero,zero,zero,xmm0[u,u,12,u,u]
+; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,255,0,0,255,255,255,255,0,0,0,255,255,0,255,255]
+; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_20_19_52_19_49_54_37_32_48_42_59_07_36_34_36_39: