[X86] lowerShuffleAsVALIGN - extend to recognize basic shifted element masks

Try to use VALIGN as a cross-lane version of VSHLDQ/VSRLDQ
This commit is contained in:
Simon Pilgrim 2023-08-30 18:32:40 +01:00
parent d3d71b8d5b
commit 967d95382d
3 changed files with 48 additions and 28 deletions

View File

@ -11214,6 +11214,7 @@ static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1,
/// rotate* of the vector lanes.
static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
const APInt &Zeroable,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
@ -11225,11 +11226,40 @@ static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT, SDValue V1,
SDValue Lo = V1, Hi = V2;
int Rotation = matchShuffleAsElementRotate(Lo, Hi, Mask);
-if (Rotation <= 0)
+if (0 < Rotation)
+return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
+DAG.getTargetConstant(Rotation, DL, MVT::i8));
+// See if we can use VALIGN as a cross-lane version of VSHLDQ/VSRLDQ.
+// TODO: Pull this out as a matchShuffleAsElementShift helper?
+// TODO: We can probably make this more aggressive and use shift-pairs like
+// lowerShuffleAsByteShiftMask.
+unsigned NumElts = Mask.size();
+unsigned ZeroLo = Zeroable.countr_one();
+unsigned ZeroHi = Zeroable.countl_one();
+assert((ZeroLo + ZeroHi) < NumElts && "Zeroable shuffle detected");
+if (!ZeroLo && !ZeroHi)
 return SDValue();
-return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
-DAG.getTargetConstant(Rotation, DL, MVT::i8));
+if (ZeroLo) {
+SDValue Src = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
+int Low = Mask[ZeroLo] < (int)NumElts ? 0 : NumElts;
+if (isSequentialOrUndefInRange(Mask, ZeroLo, NumElts - ZeroLo, Low))
+return DAG.getNode(X86ISD::VALIGN, DL, VT, Src,
+getZeroVector(VT, Subtarget, DAG, DL),
+DAG.getTargetConstant(NumElts - ZeroLo, DL, MVT::i8));
+}
+if (ZeroHi) {
+SDValue Src = Mask[0] < (int)NumElts ? V1 : V2;
+int Low = Mask[0] < (int)NumElts ? 0 : NumElts;
+if (isSequentialOrUndefInRange(Mask, 0, NumElts - ZeroHi, Low + ZeroHi))
+return DAG.getNode(X86ISD::VALIGN, DL, VT,
+getZeroVector(VT, Subtarget, DAG, DL), Src,
+DAG.getTargetConstant(ZeroHi, DL, MVT::i8));
+}
+return SDValue();
}
/// Try to lower a vector shuffle as a byte shift sequence.
@ -12625,7 +12655,7 @@ static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
if (Subtarget.hasSSSE3()) {
if (Subtarget.hasVLX())
if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v2i64, V1, V2, Mask,
-Subtarget, DAG))
+Zeroable, Subtarget, DAG))
return Rotate;
if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v2i64, V1, V2, Mask,
@ -12940,7 +12970,7 @@ static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
if (Subtarget.hasSSSE3()) {
if (Subtarget.hasVLX())
if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i32, V1, V2, Mask,
-Subtarget, DAG))
+Zeroable, Subtarget, DAG))
return Rotate;
if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i32, V1, V2, Mask,
@ -15674,7 +15704,7 @@ static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// If we have VLX support, we can use VALIGN or VEXPAND.
if (Subtarget.hasVLX()) {
if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i64, V1, V2, Mask,
-Subtarget, DAG))
+Zeroable, Subtarget, DAG))
return Rotate;
if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask, V1, V2,
@ -15936,7 +15966,7 @@ static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// If we have VLX support, we can use VALIGN or EXPAND.
if (Subtarget.hasVLX()) {
if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i32, V1, V2, Mask,
-Subtarget, DAG))
+Zeroable, Subtarget, DAG))
return Rotate;
if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask, V1, V2,
@ -16585,7 +16615,7 @@ static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// Try to use VALIGN.
if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i64, V1, V2, Mask,
-Subtarget, DAG))
+Zeroable, Subtarget, DAG))
return Rotate;
// Try to use PALIGNR.
@ -16669,7 +16699,7 @@ static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// Try to use VALIGN.
if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v16i32, V1, V2, Mask,
-Subtarget, DAG))
+Zeroable, Subtarget, DAG))
return Rotate;
// Try to use byte rotation instructions.

View File

@ -139,9 +139,8 @@ define <16 x i32> @PR42819(ptr %a0) {
; AVX512-LABEL: PR42819:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqu (%rdi), %ymm0
-; AVX512-NEXT: movw $-8192, %ax # imm = 0xE000
-; AVX512-NEXT: kmovw %eax, %k1
-; AVX512-NEXT: vpexpandd %zmm0, %zmm0 {%k1} {z}
+; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512-NEXT: valignd {{.*#+}} zmm0 = zmm1[3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0,1,2]
; AVX512-NEXT: retq
%1 = load <8 x i32>, ptr %a0, align 4
%2 = shufflevector <8 x i32> %1, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>

View File

@ -1665,19 +1665,11 @@ define <8 x i64> @shuffle_v8i64_6caa87e5(<8 x i64> %a, <8 x i64> %b) {
}
define <8 x i64> @shuffle_v8i64_34567zzz(<8 x i64> %a, <8 x i64> %b) {
-; AVX512F-LABEL: shuffle_v8i64_34567zzz:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,4,5,6,7,13,14,15]
-; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
-; AVX512F-NEXT: retq
-;
-; AVX512F-32-LABEL: shuffle_v8i64_34567zzz:
-; AVX512F-32: # %bb.0:
-; AVX512F-32-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,0,4,0,5,0,6,0,7,0,13,0,14,0,15,0]
-; AVX512F-32-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
-; AVX512F-32-NEXT: retl
+; ALL-LABEL: shuffle_v8i64_34567zzz:
+; ALL: # %bb.0:
+; ALL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; ALL-NEXT: valignq {{.*#+}} zmm0 = zmm0[3,4,5,6,7],zmm1[0,1,2]
+; ALL-NEXT: ret{{[l|q]}}
%valign = shufflevector <8 x i64> %a, <8 x i64> zeroinitializer, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 8, i32 8>
ret <8 x i64> %valign
}
@ -1685,9 +1677,8 @@ define <8 x i64> @shuffle_v8i64_34567zzz(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_zz012345(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_zz012345:
; ALL: # %bb.0:
-; ALL-NEXT: movb $-4, %al
-; ALL-NEXT: kmovw %eax, %k1
-; ALL-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z}
+; ALL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; ALL-NEXT: valignq {{.*#+}} zmm0 = zmm1[6,7],zmm0[0,1,2,3,4,5]
; ALL-NEXT: ret{{[l|q]}}
%valign = shufflevector <8 x i64> %a, <8 x i64> zeroinitializer, <8 x i32> <i32 8, i32 8, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
ret <8 x i64> %valign