mirror of
https://github.com/RPCS3/llvm.git
synced 2025-01-17 23:44:43 +00:00
[x86] Teach the x86 vector shuffle lowering to detect mergeable 128-bit
lanes. By special casing these we can often either reduce the total number of shuffles significantly or reduce the number of (high latency on Haswell) AVX2 shuffles that potentially cross 128-bit lanes. Even when these don't actually cross lanes, they have much higher latency to support that. Doing two of them and a blend is worse than doing a single insert across the 128-bit lanes to blend and then doing a single interleaved shuffle. While this seems like a narrow case, it kept cropping up on me and the difference is *huge* as you can see in many of the test cases. I first hit this trying to perfectly fix the interleaving shuffle patterns used by Halide for AVX2. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@222533 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent
a5f4576510
commit
bd357588a1
@ -9984,6 +9984,104 @@ static SDValue lowerV2X128VectorShuffle(SDLoc DL, MVT VT, SDValue V1,
|
||||
DAG.getConstant(PermMask, MVT::i8));
|
||||
}
|
||||
|
||||
/// \brief Lower a vector shuffle by first fixing the 128-bit lanes and then
|
||||
/// shuffling each lane.
|
||||
///
|
||||
/// This will only succeed when the result of fixing the 128-bit lanes results
|
||||
/// in a single-input non-lane-crossing shuffle with a repeating shuffle mask in
|
||||
/// each 128-bit lanes. This handles many cases where we can quickly blend away
|
||||
/// the lane crosses early and then use simpler shuffles within each lane.
|
||||
///
|
||||
/// FIXME: It might be worthwhile at some point to support this without
|
||||
/// requiring the 128-bit lane-relative shuffles to be repeating, but currently
|
||||
/// in x86 only floating point has interesting non-repeating shuffles, and even
|
||||
/// those are still *marginally* more expensive.
|
||||
static SDValue lowerVectorShuffleByMerging128BitLanes(
|
||||
SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
|
||||
const X86Subtarget *Subtarget, SelectionDAG &DAG) {
|
||||
assert(is128BitLaneCrossingShuffleMask(VT, Mask) &&
|
||||
"This is only useful when there are cross-128-bit-lane shuffles.");
|
||||
|
||||
int Size = Mask.size();
|
||||
int LaneSize = 128 / VT.getScalarSizeInBits();
|
||||
int NumLanes = Size / LaneSize;
|
||||
assert(NumLanes > 1 && "Only handles 256-bit and wider shuffles.");
|
||||
|
||||
// See if we can build a hypothetical 128-bit lane-fixing shuffle mask. Also
|
||||
// check whether the in-128-bit lane shuffles share a repeating pattern.
|
||||
SmallVector<int, 4> Lanes;
|
||||
Lanes.resize(NumLanes, -1);
|
||||
SmallVector<int, 4> InLaneMask;
|
||||
InLaneMask.resize(LaneSize, -1);
|
||||
for (int i = 0; i < Size; ++i) {
|
||||
if (Mask[i] < 0)
|
||||
continue;
|
||||
|
||||
int j = i / LaneSize;
|
||||
|
||||
if (Lanes[j] < 0) {
|
||||
// First entry we've seen for this lane.
|
||||
Lanes[j] = Mask[i] / LaneSize;
|
||||
} else if (Lanes[j] != Mask[i] / LaneSize) {
|
||||
// This doesn't match the lane selected previously!
|
||||
return SDValue();
|
||||
}
|
||||
|
||||
// Check that within each lane we have a consistent shuffle mask.
|
||||
int k = i % LaneSize;
|
||||
if (InLaneMask[k] < 0) {
|
||||
InLaneMask[k] = Mask[i] % LaneSize;
|
||||
} else if (InLaneMask[k] != Mask[i] % LaneSize) {
|
||||
// This doesn't fit a repeating in-lane mask.
|
||||
return SDValue();
|
||||
}
|
||||
}
|
||||
|
||||
// First shuffle the lanes into place.
|
||||
MVT LaneVT = MVT::getVectorVT(VT.isFloatingPoint() ? MVT::f64 : MVT::i64,
|
||||
VT.getSizeInBits() / 64);
|
||||
SmallVector<int, 8> LaneMask;
|
||||
LaneMask.resize(NumLanes * 2, -1);
|
||||
for (int i = 0; i < NumLanes; ++i)
|
||||
if (Lanes[i] >= 0) {
|
||||
LaneMask[2 * i + 0] = 2*Lanes[i] + 0;
|
||||
LaneMask[2 * i + 1] = 2*Lanes[i] + 1;
|
||||
}
|
||||
|
||||
V1 = DAG.getNode(ISD::BITCAST, DL, LaneVT, V1);
|
||||
V2 = DAG.getNode(ISD::BITCAST, DL, LaneVT, V2);
|
||||
SDValue LaneShuffle = DAG.getVectorShuffle(LaneVT, DL, V1, V2, LaneMask);
|
||||
|
||||
// Cast it back to the type we actually want.
|
||||
LaneShuffle = DAG.getNode(ISD::BITCAST, DL, VT, LaneShuffle);
|
||||
|
||||
// Now do a simple shuffle that isn't lane crossing.
|
||||
SmallVector<int, 8> NewMask;
|
||||
NewMask.resize(Size, -1);
|
||||
for (int i = 0; i < Size; ++i)
|
||||
if (Mask[i] >= 0)
|
||||
NewMask[i] = (i / LaneSize) * LaneSize + Mask[i] % LaneSize;
|
||||
assert(!is128BitLaneCrossingShuffleMask(VT, NewMask) &&
|
||||
"Must not introduce lane crosses at this point!");
|
||||
|
||||
return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask);
|
||||
}
|
||||
|
||||
/// \brief Test whether the specified input (0 or 1) is in-place blended by the
|
||||
/// given mask.
|
||||
///
|
||||
/// This returns true if the elements from a particular input are already in the
|
||||
/// slot required by the given mask and require no permutation.
|
||||
static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
|
||||
assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
|
||||
int Size = Mask.size();
|
||||
for (int i = 0; i < Size; ++i)
|
||||
if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/// \brief Handle lowering of 4-lane 64-bit floating point shuffles.
|
||||
///
|
||||
/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
|
||||
@ -10068,6 +10166,17 @@ static SDValue lowerV4F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
|
||||
DAG.getConstant(SHUFPDMask, MVT::i8));
|
||||
}
|
||||
|
||||
// Try to simplify this by merging 128-bit lanes to enable a lane-based
|
||||
// shuffle. However, if we have AVX2 and either input is already in place,
|
||||
// we will be able to shuffle even across lanes the other input in a single
|
||||
// instruction so skip this pattern.
|
||||
if (is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask) &&
|
||||
!(Subtarget->hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
|
||||
isShuffleMaskInputInPlace(1, Mask))))
|
||||
if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
|
||||
DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
|
||||
return Result;
|
||||
|
||||
// If we have AVX2 then we always want to lower with a blend because at v4 we
|
||||
// can fully permute the elements.
|
||||
if (Subtarget->hasAVX2())
|
||||
@ -10138,6 +10247,17 @@ static SDValue lowerV4I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
|
||||
return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
|
||||
getV4X86ShuffleImm8ForMask(Mask, DAG));
|
||||
|
||||
// Try to simplify this by merging 128-bit lanes to enable a lane-based
|
||||
// shuffle. However, if we have AVX2 and either input is already in place,
|
||||
// we will be able to shuffle even across lanes the other input in a single
|
||||
// instruction so skip this pattern.
|
||||
if (is128BitLaneCrossingShuffleMask(MVT::v4i64, Mask) &&
|
||||
!(Subtarget->hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
|
||||
isShuffleMaskInputInPlace(1, Mask))))
|
||||
if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
|
||||
DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
|
||||
return Result;
|
||||
|
||||
// Otherwise fall back on generic blend lowering.
|
||||
return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2,
|
||||
Mask, DAG);
|
||||
@ -10215,6 +10335,13 @@ static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
|
||||
DAG);
|
||||
}
|
||||
|
||||
// Try to simplify this by merging 128-bit lanes to enable a lane-based
|
||||
// shuffle.
|
||||
if (is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask))
|
||||
if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
|
||||
DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
|
||||
return Result;
|
||||
|
||||
// If we have AVX2 then we always want to lower with a blend because at v8 we
|
||||
// can fully permute the elements.
|
||||
if (Subtarget->hasAVX2())
|
||||
@ -10278,6 +10405,13 @@ static SDValue lowerV8I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
|
||||
DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask), V1);
|
||||
}
|
||||
|
||||
// Try to simplify this by merging 128-bit lanes to enable a lane-based
|
||||
// shuffle.
|
||||
if (is128BitLaneCrossingShuffleMask(MVT::v8i32, Mask))
|
||||
if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
|
||||
DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
|
||||
return Result;
|
||||
|
||||
// Otherwise fall back on generic blend lowering.
|
||||
return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2,
|
||||
Mask, DAG);
|
||||
@ -10305,9 +10439,17 @@ static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
|
||||
|
||||
// There are no generalized cross-lane shuffle operations available on i16
|
||||
// element types.
|
||||
if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask))
|
||||
if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) {
|
||||
// Try to simplify this by merging 128-bit lanes to enable a lane-based
|
||||
// shuffle.
|
||||
if (!isSingleInputShuffleMask(Mask))
|
||||
if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
|
||||
DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
|
||||
return Result;
|
||||
|
||||
return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2,
|
||||
Mask, DAG);
|
||||
}
|
||||
|
||||
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
|
||||
Subtarget, DAG))
|
||||
@ -10374,9 +10516,17 @@ static SDValue lowerV32I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
|
||||
|
||||
// There are no generalized cross-lane shuffle operations available on i8
|
||||
// element types.
|
||||
if (is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask))
|
||||
return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2,
|
||||
Mask, DAG);
|
||||
if (is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) {
|
||||
// Try to simplify this by merging 128-bit lanes to enable a lane-based
|
||||
// shuffle.
|
||||
if (!isSingleInputShuffleMask(Mask))
|
||||
if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
|
||||
DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
|
||||
return Result;
|
||||
|
||||
return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2, Mask,
|
||||
DAG);
|
||||
}
|
||||
|
||||
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
|
||||
Subtarget, DAG))
|
||||
|
@ -182,20 +182,11 @@ entry:
|
||||
;;;; Cases we must not select vperm2f128
|
||||
|
||||
define <8 x float> @G(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
|
||||
; AVX1-LABEL: G:
|
||||
; AVX1: ## BB#0: ## %entry
|
||||
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
|
||||
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
|
||||
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,2,3]
|
||||
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
|
||||
; AVX1-NEXT: retq
|
||||
;
|
||||
; AVX2-LABEL: G:
|
||||
; AVX2: ## BB#0: ## %entry
|
||||
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1,0,1]
|
||||
; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,2,3,4,4,6,7]
|
||||
; AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3]
|
||||
; AVX2-NEXT: retq
|
||||
; ALL-LABEL: G:
|
||||
; ALL: ## BB#0: ## %entry
|
||||
; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
|
||||
; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,3,4,4,6,7]
|
||||
; ALL-NEXT: retq
|
||||
entry:
|
||||
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 12, i32 undef, i32 15>
|
||||
ret <8 x float> %shuffle
|
||||
|
@ -1278,11 +1278,8 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_04_04_04_04_16_16_16_16_20_20_20_2
|
||||
;
|
||||
; AVX2-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_16_16_16_16_20_20_20_20:
|
||||
; AVX2: # BB#0:
|
||||
; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
|
||||
; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
|
||||
; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
|
||||
; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
|
||||
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
|
||||
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,8,9,8,9,8,9,8,9,16,17,16,17,16,17,16,17,24,25,24,25,24,25,24,25]
|
||||
; AVX2-NEXT: retq
|
||||
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4, i32 16, i32 16, i32 16, i32 16, i32 20, i32 20, i32 20, i32 20>
|
||||
ret <16 x i16> %shuffle
|
||||
@ -1301,13 +1298,8 @@ define <16 x i16> @shuffle_v16i16_08_08_08_08_12_12_12_12_16_16_16_16_20_20_20_2
|
||||
;
|
||||
; AVX2-LABEL: shuffle_v16i16_08_08_08_08_12_12_12_12_16_16_16_16_20_20_20_20:
|
||||
; AVX2: # BB#0:
|
||||
; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
|
||||
; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
|
||||
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
|
||||
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
|
||||
; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
|
||||
; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
|
||||
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
|
||||
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
|
||||
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,8,9,8,9,8,9,8,9,16,17,16,17,16,17,16,17,24,25,24,25,24,25,24,25]
|
||||
; AVX2-NEXT: retq
|
||||
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 8, i32 8, i32 8, i32 8, i32 12, i32 12, i32 12, i32 12, i32 16, i32 16, i32 16, i32 16, i32 20, i32 20, i32 20, i32 20>
|
||||
ret <16 x i16> %shuffle
|
||||
@ -1327,13 +1319,8 @@ define <16 x i16> @shuffle_v16i16_08_08_08_08_12_12_12_12_24_24_24_24_28_28_28_2
|
||||
;
|
||||
; AVX2-LABEL: shuffle_v16i16_08_08_08_08_12_12_12_12_24_24_24_24_28_28_28_28:
|
||||
; AVX2: # BB#0:
|
||||
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
|
||||
; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
|
||||
; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
|
||||
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
|
||||
; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
|
||||
; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
|
||||
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
|
||||
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
|
||||
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,8,9,8,9,8,9,8,9,16,17,16,17,16,17,16,17,24,25,24,25,24,25,24,25]
|
||||
; AVX2-NEXT: retq
|
||||
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 8, i32 8, i32 8, i32 8, i32 12, i32 12, i32 12, i32 12, i32 24, i32 24, i32 24, i32 24, i32 28, i32 28, i32 28, i32 28>
|
||||
ret <16 x i16> %shuffle
|
||||
|
@ -1572,10 +1572,8 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_
|
||||
;
|
||||
; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_32_32_32_32_32_32_32_32_40_40_40_40_40_40_40_40:
|
||||
; AVX2: # BB#0:
|
||||
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8]
|
||||
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
|
||||
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
|
||||
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
|
||||
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24]
|
||||
; AVX2-NEXT: retq
|
||||
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40>
|
||||
ret <32 x i8> %shuffle
|
||||
@ -1593,12 +1591,8 @@ define <32 x i8> @shuffle_v32i8_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24_
|
||||
;
|
||||
; AVX2-LABEL: shuffle_v32i8_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24_32_32_32_32_32_32_32_32_40_40_40_40_40_40_40_40:
|
||||
; AVX2: # BB#0:
|
||||
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8]
|
||||
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
|
||||
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
|
||||
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
|
||||
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
|
||||
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
|
||||
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
|
||||
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24]
|
||||
; AVX2-NEXT: retq
|
||||
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40>
|
||||
ret <32 x i8> %shuffle
|
||||
@ -1617,12 +1611,8 @@ define <32 x i8> @shuffle_v32i8_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24_
|
||||
;
|
||||
; AVX2-LABEL: shuffle_v32i8_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24_48_48_48_48_48_48_48_48_56_56_56_56_56_56_56_56:
|
||||
; AVX2: # BB#0:
|
||||
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
|
||||
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8]
|
||||
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
|
||||
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
|
||||
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
|
||||
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
|
||||
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
|
||||
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24]
|
||||
; AVX2-NEXT: retq
|
||||
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56>
|
||||
ret <32 x i8> %shuffle
|
||||
|
@ -301,58 +301,31 @@ define <4 x double> @shuffle_v4f64_0167(<4 x double> %a, <4 x double> %b) {
|
||||
}
|
||||
|
||||
define <4 x double> @shuffle_v4f64_1054(<4 x double> %a, <4 x double> %b) {
|
||||
; AVX1-LABEL: shuffle_v4f64_1054:
|
||||
; AVX1: # BB#0:
|
||||
; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
|
||||
; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
|
||||
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
|
||||
; AVX1-NEXT: retq
|
||||
;
|
||||
; AVX2-LABEL: shuffle_v4f64_1054:
|
||||
; AVX2: # BB#0:
|
||||
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,0]
|
||||
; AVX2-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,2]
|
||||
; AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
|
||||
; AVX2-NEXT: retq
|
||||
; ALL-LABEL: shuffle_v4f64_1054:
|
||||
; ALL: # BB#0:
|
||||
; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
|
||||
; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
|
||||
; ALL-NEXT: retq
|
||||
%shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 0, i32 5, i32 4>
|
||||
ret <4 x double> %shuffle
|
||||
}
|
||||
|
||||
define <4 x double> @shuffle_v4f64_3254(<4 x double> %a, <4 x double> %b) {
|
||||
; AVX1-LABEL: shuffle_v4f64_3254:
|
||||
; AVX1: # BB#0:
|
||||
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
|
||||
; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
|
||||
; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
|
||||
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
|
||||
; AVX1-NEXT: retq
|
||||
;
|
||||
; AVX2-LABEL: shuffle_v4f64_3254:
|
||||
; AVX2: # BB#0:
|
||||
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,0]
|
||||
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,2,2,3]
|
||||
; AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
|
||||
; AVX2-NEXT: retq
|
||||
; ALL-LABEL: shuffle_v4f64_3254:
|
||||
; ALL: # BB#0:
|
||||
; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
|
||||
; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
|
||||
; ALL-NEXT: retq
|
||||
%shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 3, i32 2, i32 5, i32 4>
|
||||
ret <4 x double> %shuffle
|
||||
}
|
||||
|
||||
define <4 x double> @shuffle_v4f64_3276(<4 x double> %a, <4 x double> %b) {
|
||||
; AVX1-LABEL: shuffle_v4f64_3276:
|
||||
; AVX1: # BB#0:
|
||||
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
|
||||
; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
|
||||
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
|
||||
; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
|
||||
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
|
||||
; AVX1-NEXT: retq
|
||||
;
|
||||
; AVX2-LABEL: shuffle_v4f64_3276:
|
||||
; AVX2: # BB#0:
|
||||
; AVX2-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[0,0,3,2]
|
||||
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,2,2,3]
|
||||
; AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
|
||||
; AVX2-NEXT: retq
|
||||
; ALL-LABEL: shuffle_v4f64_3276:
|
||||
; ALL: # BB#0:
|
||||
; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
|
||||
; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
|
||||
; ALL-NEXT: retq
|
||||
%shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 3, i32 2, i32 7, i32 6>
|
||||
ret <4 x double> %shuffle
|
||||
}
|
||||
@ -687,16 +660,14 @@ define <4 x i64> @shuffle_v4i64_1251(<4 x i64> %a, <4 x i64> %b) {
|
||||
define <4 x i64> @shuffle_v4i64_1054(<4 x i64> %a, <4 x i64> %b) {
|
||||
; AVX1-LABEL: shuffle_v4i64_1054:
|
||||
; AVX1: # BB#0:
|
||||
; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
|
||||
; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
|
||||
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
|
||||
; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
|
||||
; AVX1-NEXT: retq
|
||||
;
|
||||
; AVX2-LABEL: shuffle_v4i64_1054:
|
||||
; AVX2: # BB#0:
|
||||
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,0]
|
||||
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
|
||||
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
|
||||
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
|
||||
; AVX2-NEXT: retq
|
||||
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 0, i32 5, i32 4>
|
||||
ret <4 x i64> %shuffle
|
||||
@ -705,17 +676,14 @@ define <4 x i64> @shuffle_v4i64_1054(<4 x i64> %a, <4 x i64> %b) {
|
||||
define <4 x i64> @shuffle_v4i64_3254(<4 x i64> %a, <4 x i64> %b) {
|
||||
; AVX1-LABEL: shuffle_v4i64_3254:
|
||||
; AVX1: # BB#0:
|
||||
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
|
||||
; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
|
||||
; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
|
||||
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
|
||||
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
|
||||
; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
|
||||
; AVX1-NEXT: retq
|
||||
;
|
||||
; AVX2-LABEL: shuffle_v4i64_3254:
|
||||
; AVX2: # BB#0:
|
||||
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,0]
|
||||
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,2,2,3]
|
||||
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
|
||||
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
|
||||
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
|
||||
; AVX2-NEXT: retq
|
||||
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 3, i32 2, i32 5, i32 4>
|
||||
ret <4 x i64> %shuffle
|
||||
@ -724,18 +692,14 @@ define <4 x i64> @shuffle_v4i64_3254(<4 x i64> %a, <4 x i64> %b) {
|
||||
define <4 x i64> @shuffle_v4i64_3276(<4 x i64> %a, <4 x i64> %b) {
|
||||
; AVX1-LABEL: shuffle_v4i64_3276:
|
||||
; AVX1: # BB#0:
|
||||
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
|
||||
; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
|
||||
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
|
||||
; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
|
||||
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
|
||||
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
|
||||
; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
|
||||
; AVX1-NEXT: retq
|
||||
;
|
||||
; AVX2-LABEL: shuffle_v4i64_3276:
|
||||
; AVX2: # BB#0:
|
||||
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,0,1,6,7,4,5]
|
||||
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,2,2,3]
|
||||
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
|
||||
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
|
||||
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
|
||||
; AVX2-NEXT: retq
|
||||
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 3, i32 2, i32 7, i32 6>
|
||||
ret <4 x i64> %shuffle
|
||||
|
@ -736,20 +736,11 @@ define <8 x float> @shuffle_v8f32_76543210(<8 x float> %a, <8 x float> %b) {
|
||||
}
|
||||
|
||||
define <8 x float> @shuffle_v8f32_3210ba98(<8 x float> %a, <8 x float> %b) {
|
||||
; AVX1-LABEL: shuffle_v8f32_3210ba98:
|
||||
; AVX1: # BB#0:
|
||||
; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
|
||||
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,2,1,0]
|
||||
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
|
||||
; AVX1-NEXT: retq
|
||||
;
|
||||
; AVX2-LABEL: shuffle_v8f32_3210ba98:
|
||||
; AVX2: # BB#0:
|
||||
; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = <u,u,u,u,3,2,1,0>
|
||||
; AVX2-NEXT: vpermps %ymm1, %ymm2, %ymm1
|
||||
; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
|
||||
; AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
|
||||
; AVX2-NEXT: retq
|
||||
; ALL-LABEL: shuffle_v8f32_3210ba98:
|
||||
; ALL: # BB#0:
|
||||
; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
|
||||
; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
|
||||
; ALL-NEXT: retq
|
||||
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 11, i32 10, i32 9, i32 8>
|
||||
ret <8 x float> %shuffle
|
||||
}
|
||||
@ -774,43 +765,21 @@ define <8 x float> @shuffle_v8f32_3210fedc(<8 x float> %a, <8 x float> %b) {
|
||||
}
|
||||
|
||||
define <8 x float> @shuffle_v8f32_7654fedc(<8 x float> %a, <8 x float> %b) {
|
||||
; AVX1-LABEL: shuffle_v8f32_7654fedc:
|
||||
; AVX1: # BB#0:
|
||||
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
|
||||
; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
|
||||
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
|
||||
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,2,1,0]
|
||||
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
|
||||
; AVX1-NEXT: retq
|
||||
;
|
||||
; AVX2-LABEL: shuffle_v8f32_7654fedc:
|
||||
; AVX2: # BB#0:
|
||||
; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = <7,6,5,4,u,u,u,u>
|
||||
; AVX2-NEXT: vpermps %ymm0, %ymm2, %ymm0
|
||||
; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[3,2,1,0,7,6,5,4]
|
||||
; AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
|
||||
; AVX2-NEXT: retq
|
||||
; ALL-LABEL: shuffle_v8f32_7654fedc:
|
||||
; ALL: # BB#0:
|
||||
; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
|
||||
; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
|
||||
; ALL-NEXT: retq
|
||||
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 15, i32 14, i32 13, i32 12>
|
||||
ret <8 x float> %shuffle
|
||||
}
|
||||
|
||||
define <8 x float> @shuffle_v8f32_fedc7654(<8 x float> %a, <8 x float> %b) {
|
||||
; AVX1-LABEL: shuffle_v8f32_fedc7654:
|
||||
; AVX1: # BB#0:
|
||||
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
|
||||
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,2,1,0]
|
||||
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
|
||||
; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
|
||||
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
|
||||
; AVX1-NEXT: retq
|
||||
;
|
||||
; AVX2-LABEL: shuffle_v8f32_fedc7654:
|
||||
; AVX2: # BB#0:
|
||||
; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = <7,6,5,4,u,u,u,u>
|
||||
; AVX2-NEXT: vpermps %ymm1, %ymm2, %ymm1
|
||||
; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
|
||||
; AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
|
||||
; AVX2-NEXT: retq
|
||||
; ALL-LABEL: shuffle_v8f32_fedc7654:
|
||||
; ALL: # BB#0:
|
||||
; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
|
||||
; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
|
||||
; ALL-NEXT: retq
|
||||
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 15, i32 14, i32 13, i32 12, i32 7, i32 6, i32 5, i32 4>
|
||||
ret <8 x float> %shuffle
|
||||
}
|
||||
@ -1789,17 +1758,14 @@ define <8 x i32> @shuffle_v8i32_76543210(<8 x i32> %a, <8 x i32> %b) {
|
||||
define <8 x i32> @shuffle_v8i32_3210ba98(<8 x i32> %a, <8 x i32> %b) {
|
||||
; AVX1-LABEL: shuffle_v8i32_3210ba98:
|
||||
; AVX1: # BB#0:
|
||||
; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
|
||||
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,2,1,0]
|
||||
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
|
||||
; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
|
||||
; AVX1-NEXT: retq
|
||||
;
|
||||
; AVX2-LABEL: shuffle_v8i32_3210ba98:
|
||||
; AVX2: # BB#0:
|
||||
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <u,u,u,u,3,2,1,0>
|
||||
; AVX2-NEXT: vpermd %ymm1, %ymm2, %ymm1
|
||||
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
|
||||
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
|
||||
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
|
||||
; AVX2-NEXT: retq
|
||||
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 11, i32 10, i32 9, i32 8>
|
||||
ret <8 x i32> %shuffle
|
||||
@ -1827,19 +1793,14 @@ define <8 x i32> @shuffle_v8i32_3210fedc(<8 x i32> %a, <8 x i32> %b) {
|
||||
define <8 x i32> @shuffle_v8i32_7654fedc(<8 x i32> %a, <8 x i32> %b) {
|
||||
; AVX1-LABEL: shuffle_v8i32_7654fedc:
|
||||
; AVX1: # BB#0:
|
||||
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
|
||||
; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
|
||||
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
|
||||
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,2,1,0]
|
||||
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
|
||||
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
|
||||
; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
|
||||
; AVX1-NEXT: retq
|
||||
;
|
||||
; AVX2-LABEL: shuffle_v8i32_7654fedc:
|
||||
; AVX2: # BB#0:
|
||||
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <7,6,5,4,u,u,u,u>
|
||||
; AVX2-NEXT: vpermd %ymm0, %ymm2, %ymm0
|
||||
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,2,1,0,7,6,5,4]
|
||||
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
|
||||
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
|
||||
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
|
||||
; AVX2-NEXT: retq
|
||||
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 15, i32 14, i32 13, i32 12>
|
||||
ret <8 x i32> %shuffle
|
||||
@ -1848,19 +1809,14 @@ define <8 x i32> @shuffle_v8i32_7654fedc(<8 x i32> %a, <8 x i32> %b) {
|
||||
define <8 x i32> @shuffle_v8i32_fedc7654(<8 x i32> %a, <8 x i32> %b) {
|
||||
; AVX1-LABEL: shuffle_v8i32_fedc7654:
|
||||
; AVX1: # BB#0:
|
||||
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
|
||||
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,2,1,0]
|
||||
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
|
||||
; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
|
||||
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
|
||||
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
|
||||
; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
|
||||
; AVX1-NEXT: retq
|
||||
;
|
||||
; AVX2-LABEL: shuffle_v8i32_fedc7654:
|
||||
; AVX2: # BB#0:
|
||||
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <7,6,5,4,u,u,u,u>
|
||||
; AVX2-NEXT: vpermd %ymm1, %ymm2, %ymm1
|
||||
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
|
||||
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
|
||||
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
|
||||
; AVX2-NEXT: retq
|
||||
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 15, i32 14, i32 13, i32 12, i32 7, i32 6, i32 5, i32 4>
|
||||
ret <8 x i32> %shuffle
|
||||
|
@ -208,15 +208,13 @@ define <8 x double> @shuffle_v8f64_9832dc76(<8 x double> %a, <8 x double> %b) {
|
||||
define <8 x double> @shuffle_v8f64_9810dc54(<8 x double> %a, <8 x double> %b) {
|
||||
; ALL-LABEL: shuffle_v8f64_9810dc54:
|
||||
; ALL: # BB#0:
|
||||
; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm2
|
||||
; ALL-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,1,0]
|
||||
; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm3
|
||||
; ALL-NEXT: vpermilpd {{.*#+}} ymm3 = ymm3[1,0,2,2]
|
||||
; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3]
|
||||
; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,0]
|
||||
; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[1,0,2,2]
|
||||
; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
|
||||
; ALL-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
|
||||
; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm2
|
||||
; ALL-NEXT: vpermilpd {{.*#+}} ymm2 = ymm2[1,0,3,2]
|
||||
; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm1
|
||||
; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
|
||||
; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
|
||||
; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
|
||||
; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm2, %zmm0
|
||||
; ALL-NEXT: retq
|
||||
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 9, i32 8, i32 1, i32 0, i32 13, i32 12, i32 5, i32 4>
|
||||
ret <8 x double> %shuffle
|
||||
@ -930,15 +928,13 @@ define <8 x i64> @shuffle_v8i64_9832dc76(<8 x i64> %a, <8 x i64> %b) {
|
||||
define <8 x i64> @shuffle_v8i64_9810dc54(<8 x i64> %a, <8 x i64> %b) {
|
||||
; ALL-LABEL: shuffle_v8i64_9810dc54:
|
||||
; ALL: # BB#0:
|
||||
; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
|
||||
; ALL-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,0]
|
||||
; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm3
|
||||
; ALL-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,3,0,1,6,7,4,5]
|
||||
; ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
|
||||
; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,0]
|
||||
; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,0,1,6,7,4,5]
|
||||
; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
|
||||
; ALL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
|
||||
; ALL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm2
|
||||
; ALL-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5]
|
||||
; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm1
|
||||
; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
|
||||
; ALL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
|
||||
; ALL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
|
||||
; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
|
||||
; ALL-NEXT: retq
|
||||
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 9, i32 8, i32 1, i32 0, i32 13, i32 12, i32 5, i32 4>
|
||||
ret <8 x i64> %shuffle
|
||||
|
@ -2463,19 +2463,16 @@ define <8 x i32> @combine_unneeded_subvector2(<8 x i32> %a, <8 x i32> %b) {
|
||||
; AVX1: # BB#0:
|
||||
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
|
||||
; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
|
||||
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
|
||||
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,2,1,0]
|
||||
; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
|
||||
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
|
||||
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
|
||||
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
|
||||
; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
|
||||
; AVX1-NEXT: retq
|
||||
;
|
||||
; AVX2-LABEL: combine_unneeded_subvector2:
|
||||
; AVX2: # BB#0:
|
||||
; AVX2-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0
|
||||
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <7,6,5,4,u,u,u,u>
|
||||
; AVX2-NEXT: vpermd %ymm1, %ymm2, %ymm1
|
||||
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
|
||||
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
|
||||
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
|
||||
; AVX2-NEXT: retq
|
||||
%c = add <8 x i32> %a, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
|
||||
%d = shufflevector <8 x i32> %b, <8 x i32> %c, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 15, i32 14, i32 13, i32 12>
|
||||
|
Loading…
x
Reference in New Issue
Block a user