diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 565decb601b..bb1766c5947 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -3861,6 +3861,9 @@ static bool isTargetShuffleVariableMask(unsigned Opcode) {
   case X86ISD::VPERMILPV:
   case X86ISD::VPERMIL2:
   case X86ISD::VPPERM:
+  case X86ISD::VPERMV:
+  case X86ISD::VPERMV3:
+  case X86ISD::VPERMIV3:
     return true;
   }
 }
@@ -25529,16 +25532,17 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
   if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
     // If we have a single input lane-crossing shuffle then lower to VPERMV.
+    // FIXME: Add AVX512BWVL support for v16i16.
     if (UnaryShuffle && (Depth >= 3 || HasVariableMask) && !MaskContainsZeros &&
-        Subtarget.hasAVX2() && (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) {
+        ((Subtarget.hasAVX2() &&
+          (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
+         (Subtarget.hasAVX512() &&
+          (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
+           MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
+         (Subtarget.hasBWI() && MaskVT == MVT::v32i16))) {
       MVT VPermMaskSVT = MVT::getIntegerVT(MaskEltSizeInBits);
-      SmallVector<SDValue, 64> VPermIdx;
-      for (int M : Mask)
-        VPermIdx.push_back(M < 0 ? DAG.getUNDEF(VPermMaskSVT)
-                                 : DAG.getConstant(M, DL, VPermMaskSVT));
       MVT VPermMaskVT = MVT::getVectorVT(VPermMaskSVT, NumMaskElts);
-      SDValue VPermMask = DAG.getBuildVector(VPermMaskVT, DL, VPermIdx);
+      SDValue VPermMask = getConstVector(Mask, VPermMaskVT, DAG, DL, true);
       DCI.AddToWorklist(VPermMask.getNode());
       Res = DAG.getBitcast(MaskVT, V1);
       DCI.AddToWorklist(Res.getNode());
diff --git a/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll b/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll
index 230a817dbf5..472e14bc571 100644
--- a/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll
@@ -833,17 +833,15 @@ define <32 x i16> @combine_pshufb_as_pshufhw(<32 x i16> %a0) {
   ret <32 x i16> %res0
 }
 
-define <32 x i16> @combine_pshufb_as_pshufw(<32 x i16> %a0) {
-; X32-LABEL: combine_pshufb_as_pshufw:
+define <32 x i16> @combine_vpermi2var_32i16_as_pshufb(<32 x i16> %a0) {
+; X32-LABEL: combine_vpermi2var_32i16_as_pshufb:
 ; X32:       # BB#0:
-; X32-NEXT:    vpshuflw {{.*#+}} zmm0 = zmm0[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15,17,16,19,18,20,21,22,23,25,24,27,26,28,29,30,31]
-; X32-NEXT:    vpshufhw {{.*#+}} zmm0 = zmm0[0,1,2,3,5,4,7,6,8,9,10,11,13,12,15,14,16,17,18,19,21,20,23,22,24,25,26,27,29,28,31,30]
+; X32-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13,18,19,16,17,22,23,20,21,26,27,24,25,30,31,28,29,34,35,32,33,38,39,36,37,42,43,40,41,46,47,44,45,50,51,48,49,54,55,52,53,58,59,56,57,62,63,60,61]
 ; X32-NEXT:    retl
 ;
-; X64-LABEL: combine_pshufb_as_pshufw:
+; X64-LABEL: combine_vpermi2var_32i16_as_pshufb:
 ; X64:       # BB#0:
-; X64-NEXT:    vpshuflw {{.*#+}} zmm0 = zmm0[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15,17,16,19,18,20,21,22,23,25,24,27,26,28,29,30,31]
-; X64-NEXT:    vpshufhw {{.*#+}} zmm0 = zmm0[0,1,2,3,5,4,7,6,8,9,10,11,13,12,15,14,16,17,18,19,21,20,23,22,24,25,26,27,29,28,31,30]
+; X64-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13,18,19,16,17,22,23,20,21,26,27,24,25,30,31,28,29,34,35,32,33,38,39,36,37,42,43,40,41,46,47,44,45,50,51,48,49,54,55,52,53,58,59,56,57,62,63,60,61]
 ; X64-NEXT:    retq
   %res0 = call <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16> %a0, <32 x i16> , <32 x i16> undef, i32 -1)
   %res1 = call <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16> %res0, <32 x i16> , <32 x i16> undef, i32 -1)
@@ -918,3 +916,90 @@ define <32 x i16> @combine_vpermi2var_32i16_identity(<32 x i16> %x0, <32 x i16>
   %res1 = call <32 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.512(<32 x i16> %res0, <32 x i16> , <32 x i16> %res0, i32 -1)
   ret <32 x i16> %res1
 }
+
+define <8 x double> @combine_vpermi2var_8f64_as_vpermpd(<8 x double> %x0, <8 x double> %x1) {
+; X32-LABEL: combine_vpermi2var_8f64_as_vpermpd:
+; X32:       # BB#0:
+; X32-NEXT:    vmovapd {{.*#+}} zmm1 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0]
+; X32-NEXT:    vpermpd %zmm0, %zmm1, %zmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: combine_vpermi2var_8f64_as_vpermpd:
+; X64:       # BB#0:
+; X64-NEXT:    vmovapd {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
+; X64-NEXT:    vpermpd %zmm0, %zmm1, %zmm0
+; X64-NEXT:    retq
+  %res0 = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> , <8 x double> %x1, i8 -1)
+  %res1 = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %res0, <8 x i64> , <8 x double> %res0, i8 -1)
+  ret <8 x double> %res1
+}
+
+define <8 x i64> @combine_vpermt2var_8i64_as_vpermq(<8 x i64> %x0, <8 x i64> %x1) {
+; X32-LABEL: combine_vpermt2var_8i64_as_vpermq:
+; X32:       # BB#0:
+; X32-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [3,0,2,0,1,0,0,0,7,0,6,0,5,0,4,0]
+; X32-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
+; X32-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [12,0,5,0,14,0,7,0,8,0,1,0,10,0,3,0]
+; X32-NEXT:    vpermt2q %zmm0, %zmm1, %zmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: combine_vpermt2var_8i64_as_vpermq:
+; X64:       # BB#0:
+; X64-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
+; X64-NEXT:    vpermq %zmm0, %zmm1, %zmm0
+; X64-NEXT:    retq
+  %res0 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> , <8 x i64> %x0, <8 x i64> %x1, i8 -1)
+  %res1 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> , <8 x i64> %res0, <8 x i64> %res0, i8 -1)
+  ret <8 x i64> %res1
+}
+
+define <16 x float> @combine_vpermi2var_16f32_as_vpermps(<16 x float> %x0, <16 x float> %x1) {
+; X32-LABEL: combine_vpermi2var_16f32_as_vpermps:
+; X32:       # BB#0:
+; X32-NEXT:    vmovaps {{.*#+}} zmm1 = [7,7,5,5,3,3,1,1,15,15,13,13,11,11,9,9]
+; X32-NEXT:    vpermps %zmm0, %zmm1, %zmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: combine_vpermi2var_16f32_as_vpermps:
+; X64:       # BB#0:
+; X64-NEXT:    vmovaps {{.*#+}} zmm1 = [7,7,5,5,3,3,1,1,15,15,13,13,11,11,9,9]
+; X64-NEXT:    vpermps %zmm0, %zmm1, %zmm0
+; X64-NEXT:    retq
+  %res0 = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> , <16 x float> %x1, i16 -1)
+  %res1 = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %res0, <16 x i32> , <16 x float> %res0, i16 -1)
+  ret <16 x float> %res1
+}
+
+define <16 x i32> @combine_vpermt2var_16i32_as_vpermd(<16 x i32> %x0, <16 x i32> %x1) {
+; X32-LABEL: combine_vpermt2var_16i32_as_vpermd:
+; X32:       # BB#0:
+; X32-NEXT:    vmovdqa32 {{.*#+}} zmm1 = [7,7,5,5,3,3,1,1,15,15,13,13,11,11,9,9]
+; X32-NEXT:    vpermd %zmm0, %zmm1, %zmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: combine_vpermt2var_16i32_as_vpermd:
+; X64:       # BB#0:
+; X64-NEXT:    vmovdqa32 {{.*#+}} zmm1 = [7,7,5,5,3,3,1,1,15,15,13,13,11,11,9,9]
+; X64-NEXT:    vpermd %zmm0, %zmm1, %zmm0
+; X64-NEXT:    retq
+  %res0 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> , <16 x i32> %x0, <16 x i32> %x1, i16 -1)
+  %res1 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> , <16 x i32> %res0, <16 x i32> %res0, i16 -1)
+  ret <16 x i32> %res1
+}
+
+define <32 x i16> @combine_vpermi2var_32i16_as_permw(<32 x i16> %x0, <32 x i16> %x1) {
+; X32-LABEL: combine_vpermi2var_32i16_as_permw:
+; X32:       # BB#0:
+; X32-NEXT:    vmovdqu16 {{.*#+}} zmm1 = [15,16,14,17,13,18,12,19,11,20,10,21,9,22,8,23,7,24,6,25,5,26,4,27,3,28,2,29,1,30,0,31]
+; X32-NEXT:    vpermw %zmm0, %zmm1, %zmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: combine_vpermi2var_32i16_as_permw:
+; X64:       # BB#0:
+; X64-NEXT:    vmovdqu16 {{.*#+}} zmm1 = [15,16,14,17,13,18,12,19,11,20,10,21,9,22,8,23,7,24,6,25,5,26,4,27,3,28,2,29,1,30,0,31]
+; X64-NEXT:    vpermw %zmm0, %zmm1, %zmm0
+; X64-NEXT:    retq
+  %res0 = call <32 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.512(<32 x i16> %x0, <32 x i16> , <32 x i16> %x1, i32 -1)
+  %res1 = call <32 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.512(<32 x i16> %res0, <32 x i16> , <32 x i16> %res0, i32 -1)
+  ret <32 x i16> %res1
+}