[X86] lower1BitShuffle - fold permute(setcc(x,y)) -> setcc(permute(x),permute(y)) for 32/64-bit element vectors

Noticed in #77459 - for wider element types, it's usually better to pre-shuffle the comparison arguments if we can, as we already do for broadcasts
Simon Pilgrim 2024-01-10 12:35:39 +00:00
parent 7775375003
commit cc21aa1922
3 changed files with 22 additions and 36 deletions
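
As a minimal illustration (a hand-written sketch of the affected pattern, not the literal test body), the fold applies to IR such as:

  %cmp = icmp eq <4 x i32> %a0, %a1
  %rev = shufflevector <4 x i1> %cmp, <4 x i1> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>

Shuffling the <4 x i1> mask directly forces a round trip through a mask register (vpmovm2d / vpmovd2m); shuffling the <4 x i32> operands first lets a single vpcmpeqd produce the already-reversed mask, as the reverse_cmp_v4i1 diff below shows.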


@@ -17224,6 +17224,7 @@ static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
          "Cannot lower 512-bit vectors w/o basic ISA!");
 
   int NumElts = Mask.size();
+  int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
 
   // Try to recognize shuffles that are just padding a subvector with zeros.
   int SubvecElts = 0;
@@ -17289,17 +17290,18 @@ static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
     Offset += NumElts; // Increment for next iteration.
   }
 
-  // If we're broadcasting a SETCC result, try to broadcast the ops instead.
+  // If we're performing an unary shuffle on a SETCC result, try to shuffle the
+  // ops instead.
   // TODO: What other unary shuffles would benefit from this?
-  if (isBroadcastShuffleMask(Mask) && V1.getOpcode() == ISD::SETCC &&
-      V1->hasOneUse()) {
+  if (NumV2Elements == 0 && V1.getOpcode() == ISD::SETCC && V1->hasOneUse()) {
     SDValue Op0 = V1.getOperand(0);
     SDValue Op1 = V1.getOperand(1);
     ISD::CondCode CC = cast<CondCodeSDNode>(V1.getOperand(2))->get();
     EVT OpVT = Op0.getValueType();
-    return DAG.getSetCC(
-        DL, VT, DAG.getVectorShuffle(OpVT, DL, Op0, DAG.getUNDEF(OpVT), Mask),
-        DAG.getVectorShuffle(OpVT, DL, Op1, DAG.getUNDEF(OpVT), Mask), CC);
+    if (OpVT.getScalarSizeInBits() >= 32 || isBroadcastShuffleMask(Mask))
+      return DAG.getSetCC(
+          DL, VT, DAG.getVectorShuffle(OpVT, DL, Op0, DAG.getUNDEF(OpVT), Mask),
+          DAG.getVectorShuffle(OpVT, DL, Op1, DAG.getUNDEF(OpVT), Mask), CC);
   }
 
   MVT ExtVT;


@@ -42,10 +42,9 @@ define i4 @reverse_cmp_v4i1(<4 x i32> %a0, <4 x i32> %a1) {
 ;
 ; AVX512-LABEL: reverse_cmp_v4i1:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: vpcmpeqd %xmm1, %xmm0, %k0
-; AVX512-NEXT: vpmovm2d %k0, %xmm0
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,1,0]
 ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
-; AVX512-NEXT: vpmovd2m %xmm0, %k0
+; AVX512-NEXT: vpcmpeqd %xmm1, %xmm0, %k0
 ; AVX512-NEXT: kmovd %k0, %eax
 ; AVX512-NEXT: # kill: def $al killed $al killed $eax
 ; AVX512-NEXT: retq


@@ -9,8 +9,6 @@ define <2 x i1> @shuf2i1_1_0(<2 x i1> %a) {
 ; AVX512F-LABEL: shuf2i1_1_0:
 ; AVX512F: # %bb.0:
 ; AVX512F-NEXT: vpsllq $63, %xmm0, %xmm0
-; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
-; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
 ; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
@@ -21,19 +19,15 @@ define <2 x i1> @shuf2i1_1_0(<2 x i1> %a) {
 ; AVX512VL-LABEL: shuf2i1_1_0:
 ; AVX512VL: # %bb.0:
 ; AVX512VL-NEXT: vpsllq $63, %xmm0, %xmm0
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
 ; AVX512VL-NEXT: vptestmq %xmm0, %xmm0, %k1
 ; AVX512VL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1} {z}
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; AVX512VL-NEXT: vptestmq %xmm1, %xmm1, %k1
 ; AVX512VL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
 ; AVX512VL-NEXT: retq
 ;
 ; VL_BW_DQ-LABEL: shuf2i1_1_0:
 ; VL_BW_DQ: # %bb.0:
 ; VL_BW_DQ-NEXT: vpsllq $63, %xmm0, %xmm0
-; VL_BW_DQ-NEXT: vpmovq2m %xmm0, %k0
-; VL_BW_DQ-NEXT: vpmovm2q %k0, %xmm0
 ; VL_BW_DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
 ; VL_BW_DQ-NEXT: vpmovq2m %xmm0, %k0
 ; VL_BW_DQ-NEXT: vpmovm2q %k0, %xmm0
@@ -86,10 +80,8 @@ define <2 x i1> @shuf2i1_1_2(<2 x i1> %a) {
 define <4 x i1> @shuf4i1_3_2_10(<4 x i1> %a) {
 ; AVX512F-LABEL: shuf4i1_3_2_10:
 ; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpslld $31, %xmm0, %xmm0
-; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1
-; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX512F-NEXT: vpslld $31, %xmm0, %xmm0
 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1
 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
@@ -98,21 +90,17 @@ define <4 x i1> @shuf4i1_3_2_10(<4 x i1> %a) {
 ;
 ; AVX512VL-LABEL: shuf4i1_3_2_10:
 ; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
 ; AVX512VL-NEXT: vpslld $31, %xmm0, %xmm0
 ; AVX512VL-NEXT: vptestmd %xmm0, %xmm0, %k1
 ; AVX512VL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1} {z}
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,1,0]
-; AVX512VL-NEXT: vptestmd %xmm1, %xmm1, %k1
 ; AVX512VL-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
 ; AVX512VL-NEXT: retq
 ;
 ; VL_BW_DQ-LABEL: shuf4i1_3_2_10:
 ; VL_BW_DQ: # %bb.0:
-; VL_BW_DQ-NEXT: vpslld $31, %xmm0, %xmm0
-; VL_BW_DQ-NEXT: vpmovd2m %xmm0, %k0
-; VL_BW_DQ-NEXT: vpmovm2d %k0, %xmm0
 ; VL_BW_DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; VL_BW_DQ-NEXT: vpslld $31, %xmm0, %xmm0
 ; VL_BW_DQ-NEXT: vpmovd2m %xmm0, %k0
 ; VL_BW_DQ-NEXT: vpmovm2d %k0, %xmm0
 ; VL_BW_DQ-NEXT: retq
@@ -123,11 +111,10 @@ define <4 x i1> @shuf4i1_3_2_10(<4 x i1> %a) {
 define <8 x i1> @shuf8i1_3_6_1_0_3_7_7_0(<8 x i64> %a, <8 x i64> %b, <8 x i64> %a1, <8 x i64> %b1) {
 ; AVX512F-LABEL: shuf8i1_3_6_1_0_3_7_7_0:
 ; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpcmpeqq %zmm2, %zmm0, %k1
-; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,6,1,0,3,7,7,0]
+; AVX512F-NEXT: vpermq %zmm2, %zmm1, %zmm2
 ; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0
-; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512F-NEXT: vpcmpeqq %zmm2, %zmm0, %k1
 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
@@ -136,12 +123,11 @@ define <8 x i1> @shuf8i1_3_6_1_0_3_7_7_0(<8 x i64> %a, <8 x i64> %b, <8 x i64> %
 ;
 ; AVX512VL-LABEL: shuf8i1_3_6_1_0_3_7_7_0:
 ; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,6,1,0,3,7,7,0]
+; AVX512VL-NEXT: vpermq %zmm2, %zmm1, %zmm2
+; AVX512VL-NEXT: vpermq %zmm0, %zmm1, %zmm0
 ; AVX512VL-NEXT: vpcmpeqq %zmm2, %zmm0, %k1
 ; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
-; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} {z}
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [3,6,1,0,3,7,7,0]
-; AVX512VL-NEXT: vpermd %ymm1, %ymm2, %ymm1
-; AVX512VL-NEXT: vptestmd %ymm1, %ymm1, %k1
 ; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
 ; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
 ; AVX512VL-NEXT: vzeroupper
@@ -149,11 +135,10 @@ define <8 x i1> @shuf8i1_3_6_1_0_3_7_7_0(<8 x i64> %a, <8 x i64> %b, <8 x i64> %
 ;
 ; VL_BW_DQ-LABEL: shuf8i1_3_6_1_0_3_7_7_0:
 ; VL_BW_DQ: # %bb.0:
+; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,6,1,0,3,7,7,0]
+; VL_BW_DQ-NEXT: vpermq %zmm2, %zmm1, %zmm2
+; VL_BW_DQ-NEXT: vpermq %zmm0, %zmm1, %zmm0
 ; VL_BW_DQ-NEXT: vpcmpeqq %zmm2, %zmm0, %k0
-; VL_BW_DQ-NEXT: vpmovm2d %k0, %ymm0
-; VL_BW_DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [3,6,1,0,3,7,7,0]
-; VL_BW_DQ-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; VL_BW_DQ-NEXT: vpmovd2m %ymm0, %k0
 ; VL_BW_DQ-NEXT: vpmovm2w %k0, %xmm0
 ; VL_BW_DQ-NEXT: vzeroupper
 ; VL_BW_DQ-NEXT: retq