[x86] Teach the new vector shuffle lowering about VBROADCAST and

VPBROADCAST.

This has the somewhat expected pervasive impact. I don't know why
I forgot about this. Everything seems good with lots of significant
improvements in the tests.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@218724 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Chandler Carruth 2014-10-01 00:41:21 +00:00
parent a7ac986332
commit afe75172b1
10 changed files with 437 additions and 263 deletions

View File

@ -7824,6 +7824,57 @@ static SDValue lowerVectorShuffleAsElementInsertion(
return V2;
}
/// \brief Try to lower broadcast of a single element.
///
/// For convenience, this code also bundles all of the subtarget feature set
/// filtering. While a little annoying to re-dispatch on type here, there isn't
/// a convenient way to factor it out.
static SDValue lowerVectorShuffleAsBroadcast(MVT VT, SDLoc DL, SDValue V,
ArrayRef<int> Mask,
const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
if (!Subtarget->hasAVX())
return SDValue();
if (VT.isInteger() && !Subtarget->hasAVX2())
return SDValue();
// Check that the mask is a broadcast.
int BroadcastIdx = -1;
for (int M : Mask)
if (M >= 0 && BroadcastIdx == -1)
BroadcastIdx = M;
else if (M >= 0 && M != BroadcastIdx)
return SDValue();
assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
"a sorted mask where the broadcast "
"comes from V1.");
// Check if this is a broadcast of a scalar load -- those are more widely
// supported than broadcasting in-register values.
if (V.getOpcode() == ISD::BUILD_VECTOR ||
(V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
SDValue BroadcastV = V.getOperand(BroadcastIdx);
if (ISD::isNON_EXTLoad(BroadcastV.getNode())) {
// We can directly broadcast from memory.
return DAG.getNode(X86ISD::VBROADCAST, DL, VT, BroadcastV);
}
}
// We can't broadcast from a register w/o AVX2.
if (!Subtarget->hasAVX2())
return SDValue();
// Check if this is a broadcast of a BUILD_VECTOR which we can always handle,
// or is a broadcast of the zero element.
if (V.getOpcode() == ISD::BUILD_VECTOR)
V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, V.getOperand(BroadcastIdx));
else if (BroadcastIdx != 0)
return SDValue();
return DAG.getNode(X86ISD::VBROADCAST, DL, VT, V);
}
/// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
///
/// This is the basis function for the 2-lane 64-bit shuffles as we have full
@ -7900,6 +7951,11 @@ static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
if (isSingleInputShuffleMask(Mask)) {
// Check for being able to broadcast a single element.
if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v2i64, DL, V1,
Mask, Subtarget, DAG))
return Broadcast;
// Straight shuffle of a single input vector. For everything from SSE2
// onward this has a single fast instruction with no scary immediates.
// We have to map the mask as it is actually a v4i32 shuffle instruction.
@ -8057,6 +8113,11 @@ static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
if (NumV2Elements == 0) {
// Check for being able to broadcast a single element.
if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4f32, DL, V1,
Mask, Subtarget, DAG))
return Broadcast;
if (Subtarget->hasAVX()) {
// If we have AVX, we can use VPERMILPS which will allow folding a load
// into the shuffle.
@ -8157,6 +8218,11 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
if (NumV2Elements == 0) {
// Check for being able to broadcast a single element.
if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4i32, DL, V1,
Mask, Subtarget, DAG))
return Broadcast;
// Straight shuffle of a single input vector. For everything from SSE2
// onward this has a single fast instruction with no scary immediates.
// We coerce the shuffle pattern to be compatible with UNPCK instructions
@ -8253,6 +8319,11 @@ static SDValue lowerV8I16SingleInputVectorShuffle(
MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
// Check for being able to broadcast a single element.
if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8i16, DL, V,
Mask, Subtarget, DAG))
return Broadcast;
// Use dedicated unpack instructions for masks that match their pattern.
if (isShuffleEquivalent(Mask, 0, 0, 1, 1, 2, 2, 3, 3))
return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V, V);
@ -9036,6 +9107,11 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
// For single-input shuffles, there are some nicer lowering tricks we can use.
if (NumV2Elements == 0) {
// Check for being able to broadcast a single element.
if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v16i8, DL, V1,
Mask, Subtarget, DAG))
return Broadcast;
// Check whether we can widen this to an i16 shuffle by duplicating bytes.
// Notably, this handles splat and partial-splat shuffles more efficiently.
// However, it only makes sense if the pre-duplication shuffle simplifies
@ -9455,6 +9531,11 @@ static SDValue lowerV4F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
if (isSingleInputShuffleMask(Mask)) {
// Check for being able to broadcast a single element.
if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4f64, DL, V1,
Mask, Subtarget, DAG))
return Broadcast;
if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
// Non-half-crossing single input shuffles can be lowerid with an
// interleaved permutation.
@ -9538,6 +9619,11 @@ static SDValue lowerV4I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
Subtarget, DAG))
return Blend;
// Check for being able to broadcast a single element.
if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4i64, DL, V1,
Mask, Subtarget, DAG))
return Broadcast;
// When the shuffle is mirrored between the 128-bit lanes of the unit, we can
// use lower latency instructions that will operate on both 128-bit lanes.
SmallVector<int, 2> RepeatedMask;
@ -9592,6 +9678,11 @@ static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
Subtarget, DAG))
return Blend;
// Check for being able to broadcast a single element.
if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8f32, DL, V1,
Mask, Subtarget, DAG))
return Broadcast;
// If the shuffle mask is repeated in each 128-bit lane, we have many more
// options to efficiently lower the shuffle.
SmallVector<int, 4> RepeatedMask;
@ -9665,6 +9756,11 @@ static SDValue lowerV8I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
Subtarget, DAG))
return Blend;
// Check for being able to broadcast a single element.
if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8i32, DL, V1,
Mask, Subtarget, DAG))
return Broadcast;
// If the shuffle mask is repeated in each 128-bit lane we can use more
// efficient instructions that mirror the shuffles across the two 128-bit
// lanes.
@ -9714,6 +9810,11 @@ static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
assert(Subtarget->hasAVX2() && "We can only lower v16i16 with AVX2!");
// Check for being able to broadcast a single element.
if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v16i16, DL, V1,
Mask, Subtarget, DAG))
return Broadcast;
// There are no generalized cross-lane shuffle operations available on i16
// element types.
if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask))
@ -9779,6 +9880,11 @@ static SDValue lowerV32I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
assert(Subtarget->hasAVX2() && "We can only lower v32i8 with AVX2!");
// Check for being able to broadcast a single element.
if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v32i8, DL, V1,
Mask, Subtarget, DAG))
return Broadcast;
// There are no generalized cross-lane shuffle operations available on i8
// element types.
if (is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask))

View File

@ -8711,6 +8711,27 @@ let Predicates = [HasAVX2] in {
def : Pat<(v4f64 (X86VBroadcast (v2f64 VR128:$src))),
(VBROADCASTSDYrr VR128:$src)>;
// Provide aliases for broadcast from the same regitser class that
// automatically does the extract.
def : Pat<(v32i8 (X86VBroadcast (v32i8 VR256:$src))),
(VPBROADCASTBYrr (v16i8 (EXTRACT_SUBREG (v32i8 VR256:$src),
sub_xmm)))>;
def : Pat<(v16i16 (X86VBroadcast (v16i16 VR256:$src))),
(VPBROADCASTWYrr (v8i16 (EXTRACT_SUBREG (v16i16 VR256:$src),
sub_xmm)))>;
def : Pat<(v8i32 (X86VBroadcast (v8i32 VR256:$src))),
(VPBROADCASTDYrr (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src),
sub_xmm)))>;
def : Pat<(v4i64 (X86VBroadcast (v4i64 VR256:$src))),
(VPBROADCASTQYrr (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src),
sub_xmm)))>;
def : Pat<(v8f32 (X86VBroadcast (v8f32 VR256:$src))),
(VBROADCASTSSYrr (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src),
sub_xmm)))>;
def : Pat<(v4f64 (X86VBroadcast (v4f64 VR256:$src))),
(VBROADCASTSDYrr (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src),
sub_xmm)))>;
// Provide fallback in case the load node that is used in the patterns above
// is used by additional users, which prevents the pattern selection.
let AddedComplexity = 20 in {

View File

@ -36,11 +36,16 @@ define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(
; SSE41-NEXT: pshufb %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX: # BB#0:
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
; AVX1-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX1: # BB#0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX2: # BB#0:
; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0
; AVX2-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <16 x i8> %shuffle
}
@ -200,10 +205,15 @@ define <16 x i8> @shuffle_v16i8_0101010101010101(<16 x i8> %a, <16 x i8> %b) {
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_0101010101010101:
; AVX: # BB#0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX-NEXT: retq
; AVX1-LABEL: shuffle_v16i8_0101010101010101:
; AVX1: # BB#0:
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i8_0101010101010101:
; AVX2: # BB#0:
; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0
; AVX2-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
ret <16 x i8> %shuffle
}
@ -231,12 +241,18 @@ define <16 x i8> @shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07(
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07:
; AVX: # BB#0:
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX-NEXT: retq
; AVX1-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07:
; AVX1: # BB#0:
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07:
; AVX2: # BB#0:
; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1
; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX2-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 0, i32 16, i32 1, i32 16, i32 2, i32 16, i32 3, i32 16, i32 4, i32 16, i32 5, i32 16, i32 6, i32 16, i32 7>
ret <16 x i8> %shuffle
}

View File

@ -14,10 +14,15 @@ define <2 x i64> @shuffle_v2i64_00(<2 x i64> %a, <2 x i64> %b) {
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v2i64_00:
; AVX: # BB#0:
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX-NEXT: retq
; AVX1-LABEL: shuffle_v2i64_00:
; AVX1: # BB#0:
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v2i64_00:
; AVX2: # BB#0:
; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0
; AVX2-NEXT: retq
%shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 0>
ret <2 x i64> %shuffle
}
@ -53,10 +58,15 @@ define <2 x i64> @shuffle_v2i64_22(<2 x i64> %a, <2 x i64> %b) {
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v2i64_22:
; AVX: # BB#0:
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
; AVX-NEXT: retq
; AVX1-LABEL: shuffle_v2i64_22:
; AVX1: # BB#0:
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v2i64_22:
; AVX2: # BB#0:
; AVX2-NEXT: vpbroadcastq %xmm1, %xmm0
; AVX2-NEXT: retq
%shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 2, i32 2>
ret <2 x i64> %shuffle
}

View File

@ -77,10 +77,15 @@ define <8 x i16> @shuffle_v8i16_00000000(<8 x i16> %a, <8 x i16> %b) {
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_00000000:
; AVX: # BB#0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX-NEXT: retq
; AVX1-LABEL: shuffle_v8i16_00000000:
; AVX1: # BB#0:
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v8i16_00000000:
; AVX2: # BB#0:
; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0
; AVX2-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <8 x i16> %shuffle
}
@ -1169,12 +1174,19 @@ define <8 x i16> @shuffle_v8i16_XXXXcde3(<8 x i16> %a, <8 x i16> %b) {
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_XXXXcde3:
; AVX: # BB#0:
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,4,5,6,7,0,1,4,5,8,9,14,15]
; AVX-NEXT: retq
; AVX1-LABEL: shuffle_v8i16_XXXXcde3:
; AVX1: # BB#0:
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,4,5,6,7,0,1,4,5,8,9,14,15]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v8i16_XXXXcde3:
; AVX2: # BB#0:
; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0
; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,4,5,6,7,0,1,4,5,8,9,14,15]
; AVX2-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 12, i32 13, i32 14, i32 3>
ret <8 x i16> %shuffle
}
@ -1205,12 +1217,19 @@ define <8 x i16> @shuffle_v8i16_cde3XXXX(<8 x i16> %a, <8 x i16> %b) {
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_cde3XXXX:
; AVX: # BB#0:
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,14,15,8,9,14,15,12,13,14,15]
; AVX-NEXT: retq
; AVX1-LABEL: shuffle_v8i16_cde3XXXX:
; AVX1: # BB#0:
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,14,15,8,9,14,15,12,13,14,15]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v8i16_cde3XXXX:
; AVX2: # BB#0:
; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0
; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,14,15,8,9,14,15,12,13,14,15]
; AVX2-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 12, i32 13, i32 14, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
ret <8 x i16> %shuffle
}
@ -1254,16 +1273,27 @@ define <8 x i16> @shuffle_v8i16_012dcde3(<8 x i16> %a, <8 x i16> %b) {
; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_012dcde3:
; AVX: # BB#0:
; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
; AVX-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,4,5,8,9,14,15,8,9,14,15,12,13,14,15]
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,6,7,8,9,0,1,0,1,2,3]
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX-NEXT: retq
; AVX1-LABEL: shuffle_v8i16_012dcde3:
; AVX1: # BB#0:
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,4,5,8,9,14,15,8,9,14,15,12,13,14,15]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,6,7,8,9,0,1,0,1,2,3]
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v8i16_012dcde3:
; AVX2: # BB#0:
; AVX2-NEXT: vpbroadcastq %xmm0, %xmm2
; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,4,5,8,9,14,15,8,9,14,15,12,13,14,15]
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,6,7,8,9,0,1,0,1,2,3]
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX2-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 13, i32 12, i32 13, i32 14, i32 3>
ret <8 x i16> %shuffle
}

View File

@ -12,8 +12,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_0
;
; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX2: # BB#0:
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0
; AVX2-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <16 x i16> %shuffle
@ -31,10 +30,8 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_0
;
; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00:
; AVX2: # BB#0:
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,4,4]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,4]
; AVX2-NEXT: vpbroadcastw %xmm0, %xmm1
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1]
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0>
@ -51,7 +48,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_0
;
; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00:
; AVX2: # BB#0:
; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX2-NEXT: vpbroadcastw %xmm0, %xmm1
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1]
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT: retq
@ -69,7 +66,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_0
;
; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00:
; AVX2: # BB#0:
; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX2-NEXT: vpbroadcastw %xmm0, %xmm1
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1]
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT: retq
@ -87,7 +84,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_0
;
; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00:
; AVX2: # BB#0:
; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX2-NEXT: vpbroadcastw %xmm0, %xmm1
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1]
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT: retq
@ -105,7 +102,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_0
;
; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00:
; AVX2: # BB#0:
; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX2-NEXT: vpbroadcastw %xmm0, %xmm1
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1]
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT: retq
@ -123,7 +120,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_0
;
; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00:
; AVX2: # BB#0:
; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX2-NEXT: vpbroadcastw %xmm0, %xmm1
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT: retq
@ -141,7 +138,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_0
;
; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
; AVX2: # BB#0:
; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX2-NEXT: vpbroadcastw %xmm0, %xmm1
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT: retq
@ -165,7 +162,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_0
; AVX2: # BB#0:
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
@ -189,7 +186,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_0
; AVX2: # BB#0:
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,2,3,u,u,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,u,u,0,1,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,0,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
@ -214,7 +211,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_0
; AVX2: # BB#0:
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,4,5,u,u,u,u,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,u,u,0,1,0,1,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,255,255,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
@ -239,7 +236,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_0
; AVX2: # BB#0:
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,6,7,u,u,u,u,u,u,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,u,u,0,1,0,1,0,1,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
@ -264,7 +261,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_0
; AVX2: # BB#0:
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,8,9,u,u,u,u,u,u,u,u,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,u,u,0,1,0,1,0,1,0,1,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,255,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
@ -289,7 +286,7 @@ define <16 x i16> @shuffle_v16i16_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_0
; AVX2: # BB#0:
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,10,11,u,u,u,u,u,u,u,u,u,u,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,u,u,0,1,0,1,0,1,0,1,0,1,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
@ -315,7 +312,7 @@ define <16 x i16> @shuffle_v16i16_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_0
; AVX2: # BB#0:
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,12,13,u,u,u,u,u,u,u,u,u,u,u,u,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,u,u,0,1,0,1,0,1,0,1,0,1,0,1,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
@ -341,7 +338,7 @@ define <16 x i16> @shuffle_v16i16_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_0
; AVX2: # BB#0:
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[14,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,0,1,0,1,0,1,0,1,0,1,0,1,0,1,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
@ -735,8 +732,8 @@ define <16 x i16> @shuffle_v16i16_00_16_00_16_00_16_00_16_00_16_00_16_00_16_00_1
;
; AVX2-LABEL: shuffle_v16i16_00_16_00_16_00_16_00_16_00_16_00_16_00_16_00_16:
; AVX2: # BB#0:
; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,4,5,0,1,0,1,0,1,12,13,0,1]
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1
; AVX2-NEXT: vpbroadcastd %xmm0, %xmm0
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-NEXT: retq

View File

@ -13,9 +13,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
;
; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX2: # BB#0:
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
; AVX2-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
@ -32,8 +30,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
;
; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00:
; AVX2: # BB#0:
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm1
; AVX2-NEXT: vpbroadcastb %xmm0, %xmm1
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0]
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT: retq
@ -52,8 +49,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
;
; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00:
; AVX2: # BB#0:
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm1
; AVX2-NEXT: vpbroadcastb %xmm0, %xmm1
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0]
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT: retq
@ -72,8 +68,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
;
; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00:
; AVX2: # BB#0:
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm1
; AVX2-NEXT: vpbroadcastb %xmm0, %xmm1
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0]
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT: retq
@ -92,8 +87,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
;
; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00:
; AVX2: # BB#0:
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm1
; AVX2-NEXT: vpbroadcastb %xmm0, %xmm1
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0]
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT: retq
@ -112,8 +106,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
;
; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00:
; AVX2: # BB#0:
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm1
; AVX2-NEXT: vpbroadcastb %xmm0, %xmm1
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0]
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT: retq
@ -132,8 +125,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
;
; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00:
; AVX2: # BB#0:
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm1
; AVX2-NEXT: vpbroadcastb %xmm0, %xmm1
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0]
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT: retq
@ -152,8 +144,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
;
; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
; AVX2: # BB#0:
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm1
; AVX2-NEXT: vpbroadcastb %xmm0, %xmm1
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0]
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT: retq
@ -172,8 +163,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
;
; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00:
; AVX2: # BB#0:
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm1
; AVX2-NEXT: vpbroadcastb %xmm0, %xmm1
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0]
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT: retq
@ -192,8 +182,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
;
; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_00:
; AVX2: # BB#0:
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm1
; AVX2-NEXT: vpbroadcastb %xmm0, %xmm1
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0]
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT: retq
@ -212,8 +201,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
;
; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00:
; AVX2: # BB#0:
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm1
; AVX2-NEXT: vpbroadcastb %xmm0, %xmm1
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,10,0,0,0,0,0,0,0,0,0,0]
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT: retq
@ -232,8 +220,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
;
; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00:
; AVX2: # BB#0:
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm1
; AVX2-NEXT: vpbroadcastb %xmm0, %xmm1
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,11,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT: retq
@ -252,8 +239,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
;
; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX2: # BB#0:
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm1
; AVX2-NEXT: vpbroadcastb %xmm0, %xmm1
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,12,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT: retq
@ -272,8 +258,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
;
; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX2: # BB#0:
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm1
; AVX2-NEXT: vpbroadcastb %xmm0, %xmm1
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,13,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT: retq
@ -292,8 +277,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
;
; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX2: # BB#0:
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm1
; AVX2-NEXT: vpbroadcastb %xmm0, %xmm1
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT: retq
@ -317,8 +301,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
; AVX2-NEXT: movl $15, %eax
; AVX2-NEXT: vmovd %eax, %xmm1
; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm1
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
@ -339,12 +322,12 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_
;
; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX2: # BB#0:
; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2
; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
@ -364,12 +347,11 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_17_00_
;
; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_17_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX2: # BB#0:
; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm1
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,1,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,1,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 17, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
@ -389,12 +371,11 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_18_00_00_
;
; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_18_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX2: # BB#0:
; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm1
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,2,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,2,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 18, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
@ -414,12 +395,11 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_19_00_00_00_
;
; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_19_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX2: # BB#0:
; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm1
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,3,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,3,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 19, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
@ -439,12 +419,11 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_20_00_00_00_00_
;
; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_20_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX2: # BB#0:
; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm1
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,4,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,4,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 20, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
@ -464,12 +443,11 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_21_00_00_00_00_00_
;
; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_21_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX2: # BB#0:
; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm1
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,5,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,5,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 21, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
@ -489,12 +467,11 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_22_00_00_00_00_00_00_
;
; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_22_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX2: # BB#0:
; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm1
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,6,u,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,6,u,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 22, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
@ -514,12 +491,11 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_23_00_00_00_00_00_00_00_
;
; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_23_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX2: # BB#0:
; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm1
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,7,u,u,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,7,u,u,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 23, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
@ -539,12 +515,11 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_24_00_00_00_00_00_00_00_00_
;
; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_24_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX2: # BB#0:
; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm1
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,8,u,u,u,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,8,u,u,u,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 24, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
@ -564,12 +539,11 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_25_00_00_00_00_00_00_00_00_00_
;
; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_25_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX2: # BB#0:
; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm1
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,9,u,u,u,u,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,9,u,u,u,u,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 25, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
@ -589,12 +563,11 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_26_00_00_00_00_00_00_00_00_00_00_
;
; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_26_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX2: # BB#0:
; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm1
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,10,u,u,u,u,u,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,10,u,u,u,u,u,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 26, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
@ -614,12 +587,11 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_27_00_00_00_00_00_00_00_00_00_00_00_
;
; AVX2-LABEL: shuffle_v32i8_00_00_00_00_27_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX2: # BB#0:
; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm1
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,11,u,u,u,u,u,u,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,11,u,u,u,u,u,u,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 27, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
@ -639,12 +611,11 @@ define <32 x i8> @shuffle_v32i8_00_00_00_28_00_00_00_00_00_00_00_00_00_00_00_00_
;
; AVX2-LABEL: shuffle_v32i8_00_00_00_28_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX2: # BB#0:
; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm1
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,12,u,u,u,u,u,u,u,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,12,u,u,u,u,u,u,u,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 28, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
@ -664,12 +635,11 @@ define <32 x i8> @shuffle_v32i8_00_00_29_00_00_00_00_00_00_00_00_00_00_00_00_00_
;
; AVX2-LABEL: shuffle_v32i8_00_00_29_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX2: # BB#0:
; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm1
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,13,u,u,u,u,u,u,u,u,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,13,u,u,u,u,u,u,u,u,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 29, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
@ -689,12 +659,11 @@ define <32 x i8> @shuffle_v32i8_00_30_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
;
; AVX2-LABEL: shuffle_v32i8_00_30_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX2: # BB#0:
; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm1
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 30, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
@ -722,7 +691,7 @@ define <32 x i8> @shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
; AVX2-NEXT: vpxor %ymm3, %ymm3, %ymm3
; AVX2-NEXT: vinserti128 $0, %xmm2, %ymm3, %ymm2
; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
@ -1037,9 +1006,10 @@ define <32 x i8> @shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_
;
; AVX2-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32:
; AVX2: # BB#0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1
; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-NEXT: retq

View File

@ -12,7 +12,7 @@ define <4 x double> @shuffle_v4f64_0000(<4 x double> %a, <4 x double> %b) {
;
; AVX2-LABEL: shuffle_v4f64_0000:
; AVX2: # BB#0:
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,0]
; AVX2-NEXT: vbroadcastsd %xmm0, %ymm0
; AVX2-NEXT: retq
%shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
ret <4 x double> %shuffle
@ -184,11 +184,17 @@ define <4 x double> @shuffle_v4f64_1022(<4 x double> %a, <4 x double> %b) {
}
define <4 x double> @shuffle_v4f64_0423(<4 x double> %a, <4 x double> %b) {
; ALL-LABEL: shuffle_v4f64_0423:
; ALL: # BB#0:
; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[0,0,2,2]
; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3]
; ALL-NEXT: retq
; AVX1-LABEL: shuffle_v4f64_0423:
; AVX1: # BB#0:
; AVX1-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[0,0,2,2]
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v4f64_0423:
; AVX2: # BB#0:
; AVX2-NEXT: vbroadcastsd %xmm1, %ymm1
; AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3]
; AVX2-NEXT: retq
%shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 3>
ret <4 x double> %shuffle
}
@ -317,7 +323,7 @@ define <4 x i64> @shuffle_v4i64_0000(<4 x i64> %a, <4 x i64> %b) {
;
; AVX2-LABEL: shuffle_v4i64_0000:
; AVX2: # BB#0:
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,0]
; AVX2-NEXT: vbroadcastsd %xmm0, %ymm0
; AVX2-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
ret <4 x i64> %shuffle
@ -460,7 +466,7 @@ define <4 x i64> @shuffle_v4i64_0124(<4 x i64> %a, <4 x i64> %b) {
;
; AVX2-LABEL: shuffle_v4i64_0124:
; AVX2: # BB#0:
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0]
; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
@ -477,8 +483,8 @@ define <4 x i64> @shuffle_v4i64_0142(<4 x i64> %a, <4 x i64> %b) {
;
; AVX2-LABEL: shuffle_v4i64_0142:
; AVX2: # BB#0:
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,3]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,2]
; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX2-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 2>
@ -497,8 +503,8 @@ define <4 x i64> @shuffle_v4i64_0412(<4 x i64> %a, <4 x i64> %b) {
;
; AVX2-LABEL: shuffle_v4i64_0412:
; AVX2: # BB#0:
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,0,1,4,5,4,5]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,2]
; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
; AVX2-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 2>
@ -731,3 +737,31 @@ define <4 x double> @insert_mem_and_zero_v4f64(double* %ptr) {
%shuffle = shufflevector <4 x double> %v, <4 x double> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
ret <4 x double> %shuffle
}
define <4 x double> @splat_mem_v4f64(double* %ptr) {
; ALL-LABEL: splat_mem_v4f64:
; ALL: # BB#0:
; ALL-NEXT: vbroadcastsd (%rdi), %ymm0
; ALL-NEXT: retq
%a = load double* %ptr
%v = insertelement <4 x double> undef, double %a, i32 0
%shuffle = shufflevector <4 x double> %v, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
ret <4 x double> %shuffle
}
define <4 x i64> @splat_mem_v4i64(i64* %ptr) {
; AVX1-LABEL: splat_mem_v4i64:
; AVX1: # BB#0:
; AVX1-NEXT: vmovddup (%rdi), %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splat_mem_v4i64:
; AVX2: # BB#0:
; AVX2-NEXT: vbroadcastsd (%rdi), %ymm0
; AVX2-NEXT: retq
%a = load i64* %ptr
%v = insertelement <4 x i64> undef, i64 %a, i64 0
%shuffle = shufflevector <4 x i64> %v, <4 x i64> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
ret <4 x i64> %shuffle
}

View File

@ -12,8 +12,7 @@ define <8 x float> @shuffle_v8f32_00000000(<8 x float> %a, <8 x float> %b) {
;
; AVX2-LABEL: shuffle_v8f32_00000000:
; AVX2: # BB#0:
; AVX2-NEXT: vxorps %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vbroadcastss %xmm0, %ymm0
; AVX2-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <8 x float> %shuffle
@ -210,9 +209,8 @@ define <8 x float> @shuffle_v8f32_08080808(<8 x float> %a, <8 x float> %b) {
;
; AVX2-LABEL: shuffle_v8f32_08080808:
; AVX2: # BB#0:
; AVX2-NEXT: vxorps %ymm2, %ymm2, %ymm2
; AVX2-NEXT: vpermps %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vpermps %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vbroadcastss %xmm1, %ymm1
; AVX2-NEXT: vbroadcastss %xmm0, %ymm0
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
; AVX2-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 8, i32 0, i32 8, i32 0, i32 8, i32 0, i32 8>
@ -702,8 +700,7 @@ define <8 x i32> @shuffle_v8i32_00000000(<8 x i32> %a, <8 x i32> %b) {
;
; AVX2-LABEL: shuffle_v8i32_00000000:
; AVX2: # BB#0:
; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vbroadcastss %xmm0, %ymm0
; AVX2-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <8 x i32> %shuffle
@ -910,9 +907,8 @@ define <8 x i32> @shuffle_v8i32_08080808(<8 x i32> %a, <8 x i32> %b) {
;
; AVX2-LABEL: shuffle_v8i32_08080808:
; AVX2: # BB#0:
; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2
; AVX2-NEXT: vpermd %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,0]
; AVX2-NEXT: vpbroadcastd %xmm1, %ymm1
; AVX2-NEXT: vpbroadcastq %xmm0, %ymm0
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
; AVX2-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 8, i32 0, i32 8, i32 0, i32 8, i32 0, i32 8>

View File

@ -6,7 +6,7 @@ target triple = "x86_64-unknown-unknown"
define <8 x double> @shuffle_v8f64_00000000(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_00000000:
; ALL: # BB#0:
; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,0]
; ALL-NEXT: vbroadcastsd %xmm0, %ymm0
; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
@ -16,7 +16,7 @@ define <8 x double> @shuffle_v8f64_00000000(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_00000010(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_00000010:
; ALL: # BB#0:
; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[0,0,0,0]
; ALL-NEXT: vbroadcastsd %xmm0, %ymm1
; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0]
; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
@ -27,7 +27,7 @@ define <8 x double> @shuffle_v8f64_00000010(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_00000200(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_00000200:
; ALL: # BB#0:
; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[0,0,0,0]
; ALL-NEXT: vbroadcastsd %xmm0, %ymm1
; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,0]
; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
@ -38,7 +38,7 @@ define <8 x double> @shuffle_v8f64_00000200(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_00003000(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_00003000:
; ALL: # BB#0:
; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[0,0,0,0]
; ALL-NEXT: vbroadcastsd %xmm0, %ymm1
; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,0,0,0]
; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
@ -50,10 +50,9 @@ define <8 x double> @shuffle_v8f64_00040000(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_00040000:
; ALL: # BB#0:
; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,0]
; ALL-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[0,0,0,3]
; ALL-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3]
; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,0]
; ALL-NEXT: vbroadcastsd %xmm1, %ymm1
; ALL-NEXT: vbroadcastsd %xmm0, %ymm0
; ALL-NEXT: vblendpd {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3]
; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0>
@ -65,9 +64,8 @@ define <8 x double> @shuffle_v8f64_00500000(<8 x double> %a, <8 x double> %b) {
; ALL: # BB#0:
; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,3]
; ALL-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[0,0,2,0]
; ALL-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3]
; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,0]
; ALL-NEXT: vbroadcastsd %xmm0, %ymm0
; ALL-NEXT: vblendpd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3]
; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0>
@ -79,9 +77,8 @@ define <8 x double> @shuffle_v8f64_06000000(<8 x double> %a, <8 x double> %b) {
; ALL: # BB#0:
; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
; ALL-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[0,1,0,0]
; ALL-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3]
; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,0]
; ALL-NEXT: vbroadcastsd %xmm0, %ymm0
; ALL-NEXT: vblendpd {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2,3]
; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
@ -93,7 +90,7 @@ define <8 x double> @shuffle_v8f64_70000000(<8 x double> %a, <8 x double> %b) {
; ALL: # BB#0:
; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,1,2,3]
; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,0]
; ALL-NEXT: vbroadcastsd %xmm0, %ymm0
; ALL-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm0[1,2,3]
; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
@ -127,7 +124,7 @@ define <8 x double> @shuffle_v8f64_00112233(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_00001111(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_00001111:
; ALL: # BB#0:
; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[0,0,0,0]
; ALL-NEXT: vbroadcastsd %xmm0, %ymm1
; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,1,1]
; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
@ -151,8 +148,8 @@ define <8 x double> @shuffle_v8f64_81a3c5e7(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_08080808(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_08080808:
; ALL: # BB#0:
; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,0]
; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
; ALL-NEXT: vbroadcastsd %xmm1, %ymm1
; ALL-NEXT: vbroadcastsd %xmm0, %ymm0
; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; ALL-NEXT: retq
@ -164,12 +161,12 @@ define <8 x double> @shuffle_v8f64_08084c4c(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_08084c4c:
; ALL: # BB#0:
; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm2
; ALL-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,2,0]
; ALL-NEXT: vbroadcastsd %xmm2, %ymm2
; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm3
; ALL-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3]
; ALL-NEXT: vbroadcastsd %xmm3, %ymm3
; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3]
; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,0]
; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
; ALL-NEXT: vbroadcastsd %xmm1, %ymm1
; ALL-NEXT: vbroadcastsd %xmm0, %ymm0
; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
; ALL-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
; ALL-NEXT: retq
@ -182,9 +179,9 @@ define <8 x double> @shuffle_v8f64_8823cc67(<8 x double> %a, <8 x double> %b) {
; ALL: # BB#0:
; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm2
; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm3
; ALL-NEXT: vpermilpd {{.*#+}} ymm3 = ymm3[0,0,2,2]
; ALL-NEXT: vbroadcastsd %xmm3, %ymm3
; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3]
; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[0,0,2,2]
; ALL-NEXT: vbroadcastsd %xmm1, %ymm1
; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
; ALL-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
; ALL-NEXT: retq
@ -513,7 +510,7 @@ define <8 x double> @shuffle_v8f64_10004444(<8 x double> %a, <8 x double> %b) {
; ALL: # BB#0:
; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[1,0,0,0]
; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,0]
; ALL-NEXT: vbroadcastsd %xmm0, %ymm0
; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
@ -638,7 +635,7 @@ define <8 x double> @shuffle_v8f64_002u6u44(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_00uu66uu(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_00uu66uu:
; ALL: # BB#0:
; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[0,0,2,2]
; ALL-NEXT: vbroadcastsd %xmm0, %ymm1
; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,3]
; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
@ -694,16 +691,16 @@ define <8 x double> @shuffle_v8f64_uuu3uu66(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_c348cda0(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_c348cda0:
; ALL: # BB#0:
; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm2
; ALL-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
; ALL-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[0,3,2,3]
; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3]
; ALL-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[0,3,2,3]
; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm3
; ALL-NEXT: vbroadcastsd %xmm3, %ymm3
; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3]
; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm3
; ALL-NEXT: vpermpd {{.*#+}} ymm4 = ymm1[0,1,2,0]
; ALL-NEXT: vbroadcastsd %xmm1, %ymm4
; ALL-NEXT: vblendpd {{.*#+}} ymm4 = ymm3[0,1,2],ymm4[3]
; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm4[0],ymm2[1,2],ymm4[3]
; ALL-NEXT: vblendpd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2],ymm3[3]
; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,2,0]
; ALL-NEXT: vbroadcastsd %xmm0, %ymm0
; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3]
; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm2, %zmm0
; ALL-NEXT: retq
@ -734,7 +731,7 @@ define <8 x double> @shuffle_v8f64_f511235a(<8 x double> %a, <8 x double> %b) {
define <8 x i64> @shuffle_v8i64_00000000(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_00000000:
; ALL: # BB#0:
; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,0]
; ALL-NEXT: vpbroadcastq %xmm0, %ymm0
; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
@ -744,7 +741,7 @@ define <8 x i64> @shuffle_v8i64_00000000(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_00000010(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_00000010:
; ALL: # BB#0:
; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,0,0]
; ALL-NEXT: vpbroadcastq %xmm0, %ymm1
; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
@ -755,7 +752,7 @@ define <8 x i64> @shuffle_v8i64_00000010(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_00000200(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_00000200:
; ALL: # BB#0:
; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,0,0]
; ALL-NEXT: vpbroadcastq %xmm0, %ymm1
; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,0]
; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
@ -766,7 +763,7 @@ define <8 x i64> @shuffle_v8i64_00000200(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_00003000(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_00003000:
; ALL: # BB#0:
; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,0,0]
; ALL-NEXT: vpbroadcastq %xmm0, %ymm1
; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,0,0,0]
; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
@ -778,10 +775,9 @@ define <8 x i64> @shuffle_v8i64_00040000(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_00040000:
; ALL: # BB#0:
; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0]
; ALL-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,0,0,3]
; ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,0]
; ALL-NEXT: vpbroadcastq %xmm1, %ymm1
; ALL-NEXT: vpbroadcastq %xmm0, %ymm0
; ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0>
@ -793,9 +789,8 @@ define <8 x i64> @shuffle_v8i64_00500000(<8 x i64> %a, <8 x i64> %b) {
; ALL: # BB#0:
; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
; ALL-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,0,2,0]
; ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7]
; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,0]
; ALL-NEXT: vpbroadcastq %xmm0, %ymm0
; ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0>
@ -807,9 +802,8 @@ define <8 x i64> @shuffle_v8i64_06000000(<8 x i64> %a, <8 x i64> %b) {
; ALL: # BB#0:
; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; ALL-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,0]
; ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7]
; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,0]
; ALL-NEXT: vpbroadcastq %xmm0, %ymm0
; ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
@ -821,7 +815,7 @@ define <8 x i64> @shuffle_v8i64_70000000(<8 x i64> %a, <8 x i64> %b) {
; ALL: # BB#0:
; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,1,2,3]
; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,0]
; ALL-NEXT: vpbroadcastq %xmm0, %ymm0
; ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm0[2,3,4,5,6,7]
; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
@ -855,7 +849,7 @@ define <8 x i64> @shuffle_v8i64_00112233(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_00001111(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_00001111:
; ALL: # BB#0:
; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,0,0]
; ALL-NEXT: vpbroadcastq %xmm0, %ymm1
; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1]
; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
@ -879,8 +873,8 @@ define <8 x i64> @shuffle_v8i64_81a3c5e7(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_08080808(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_08080808:
; ALL: # BB#0:
; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,0]
; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3]
; ALL-NEXT: vpbroadcastq %xmm1, %ymm1
; ALL-NEXT: vpbroadcastq %xmm0, %ymm0
; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; ALL-NEXT: retq
@ -892,12 +886,12 @@ define <8 x i64> @shuffle_v8i64_08084c4c(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_08084c4c:
; ALL: # BB#0:
; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm2
; ALL-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,0]
; ALL-NEXT: vpbroadcastq %xmm2, %ymm2
; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; ALL-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,3]
; ALL-NEXT: vpbroadcastq %xmm3, %ymm3
; ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,0]
; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3]
; ALL-NEXT: vpbroadcastq %xmm1, %ymm1
; ALL-NEXT: vpbroadcastq %xmm0, %ymm0
; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; ALL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; ALL-NEXT: retq
@ -910,9 +904,9 @@ define <8 x i64> @shuffle_v8i64_8823cc67(<8 x i64> %a, <8 x i64> %b) {
; ALL: # BB#0:
; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm3
; ALL-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,1,0,1,4,5,4,5]
; ALL-NEXT: vpbroadcastq %xmm3, %ymm3
; ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,0,1,4,5,4,5]
; ALL-NEXT: vpbroadcastq %xmm1, %ymm1
; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; ALL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; ALL-NEXT: retq
@ -1241,7 +1235,7 @@ define <8 x i64> @shuffle_v8i64_10004444(<8 x i64> %a, <8 x i64> %b) {
; ALL: # BB#0:
; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[1,0,0,0]
; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,0]
; ALL-NEXT: vpbroadcastq %xmm0, %ymm0
; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
@ -1366,7 +1360,7 @@ define <8 x i64> @shuffle_v8i64_002u6u44(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_00uu66uu(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_00uu66uu:
; ALL: # BB#0:
; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[0,1,0,1,4,5,4,5]
; ALL-NEXT: vpbroadcastq %xmm0, %ymm1
; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@ -1427,8 +1421,8 @@ define <8 x i64> @shuffle_v8i64_6caa87e5(<8 x i64> %a, <8 x i64> %b) {
; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; ALL-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,3,2,1]
; ALL-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5],ymm4[6,7]
; ALL-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,0,1,4,5,4,5]
; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,0,1,4,5,4,5]
; ALL-NEXT: vpbroadcastq %xmm2, %ymm2
; ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7]
; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]