diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index dc46d0e499e..0bc6de0af79 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -1547,6 +1547,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::SUB, MVT::v32i16, Legal);
     setOperationAction(ISD::SUB, MVT::v64i8, Legal);
     setOperationAction(ISD::MUL, MVT::v32i16, Legal);
+    setOperationAction(ISD::MUL, MVT::v64i8, Custom);
     setOperationAction(ISD::MULHS, MVT::v32i16, Legal);
     setOperationAction(ISD::MULHU, MVT::v32i16, Legal);
     setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i1, Custom);
@@ -18967,6 +18968,35 @@ static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
 }
 
+/// Break a 512-bit integer operation into two new 256-bit ones and then
+/// concatenate the result back.
+static SDValue Lower512IntArith(SDValue Op, SelectionDAG &DAG) {
+  MVT VT = Op.getSimpleValueType();
+
+  assert(VT.is512BitVector() && VT.isInteger() &&
+         "Unsupported value type for operation");
+
+  unsigned NumElems = VT.getVectorNumElements();
+  SDLoc dl(Op);
+
+  // Extract the LHS vectors
+  SDValue LHS = Op.getOperand(0);
+  SDValue LHS1 = extract256BitVector(LHS, 0, DAG, dl);
+  SDValue LHS2 = extract256BitVector(LHS, NumElems / 2, DAG, dl);
+
+  // Extract the RHS vectors
+  SDValue RHS = Op.getOperand(1);
+  SDValue RHS1 = extract256BitVector(RHS, 0, DAG, dl);
+  SDValue RHS2 = extract256BitVector(RHS, NumElems / 2, DAG, dl);
+
+  MVT EltVT = VT.getVectorElementType();
+  MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
+
+  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
+                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
+                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
+}
+
 static SDValue LowerADD(SDValue Op, SelectionDAG &DAG) {
   if (Op.getValueType() == MVT::i1)
     return DAG.getNode(ISD::XOR, SDLoc(Op), Op.getValueType(),
@@ -19009,10 +19039,15 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
   SDValue A = Op.getOperand(0);
   SDValue B = Op.getOperand(1);
 
-  // Lower v16i8/v32i8 mul as sign-extension to v8i16/v16i16 vector
-  // pairs, multiply and truncate.
-  if (VT == MVT::v16i8 || VT == MVT::v32i8) {
+  // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
+  // vector pairs, multiply and truncate.
+  if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
     if (Subtarget.hasInt256()) {
+      // For 512-bit vectors, split into 256-bit vectors to allow the
+      // sign-extension to occur.
+      if (VT == MVT::v64i8)
+        return Lower512IntArith(Op, DAG);
+
       // For 256-bit vectors, split into 128-bit vectors to allow the
       // sign-extension to occur. We don't need this on AVX512BW as we can
      // safely sign-extend to v32i16.
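For reference, here is a minimal standalone C++ sketch (illustrative only, not part of the patch and not SelectionDAG code; the helper names mulHalf and mulV64I8 are invented for this example) of the value-level effect of the new path: Lower512IntArith splits the v64i8 operands into two 256-bit halves, each half is sign-extended to v32i16, multiplied, truncated back to byte lanes, and the halves are concatenated. Because the low 8 bits of the 16-bit product equal the wrapped 8-bit product, the result matches a plain element-wise i8 multiply.

// Standalone model of the new v64i8 MUL lowering (illustrative only):
// split the 512-bit vector into two 256-bit halves, sign-extend each
// byte lane to i16, multiply, truncate back to i8, then concatenate.
#include <array>
#include <cassert>
#include <cstdint>

using V64I8 = std::array<int8_t, 64>;
using V32I8 = std::array<int8_t, 32>;

// One 256-bit half: what vpmovsxbw + vpmullw + vpmovwb compute per lane.
static V32I8 mulHalf(const V32I8 &A, const V32I8 &B) {
  V32I8 R;
  for (int i = 0; i != 32; ++i) {
    int16_t Wide = int16_t(A[i]) * int16_t(B[i]); // v32i16 multiply
    R[i] = int8_t(Wide);                          // truncate back to i8
  }
  return R;
}

// The Lower512IntArith shape: extract halves, operate, CONCAT_VECTORS.
static V64I8 mulV64I8(const V64I8 &A, const V64I8 &B) {
  V32I8 ALo, AHi, BLo, BHi;
  for (int i = 0; i != 32; ++i) {
    ALo[i] = A[i]; AHi[i] = A[i + 32];
    BLo[i] = B[i]; BHi[i] = B[i + 32];
  }
  V32I8 RLo = mulHalf(ALo, BLo);
  V32I8 RHi = mulHalf(AHi, BHi);
  V64I8 R;
  for (int i = 0; i != 32; ++i) {
    R[i] = RLo[i];
    R[i + 32] = RHi[i];
  }
  return R;
}

int main() {
  V64I8 A, B;
  for (int i = 0; i != 64; ++i) {
    A[i] = int8_t(i * 3 - 100); // arbitrary values in i8 range
    B[i] = 117;                 // same constant as the mul_v64i8c test
  }
  V64I8 R = mulV64I8(A, B);
  for (int i = 0; i != 64; ++i)
    assert(R[i] == int8_t((i * 3 - 100) * 117)); // wrap-around i8 product
  return 0;
}

Splitting before the extension keeps each half small enough that a single vpmovsxbw can widen a 256-bit source into one 512-bit v32i16 register, which is the shape the updated CHECK lines below expect.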
diff --git a/test/CodeGen/X86/pmul.ll b/test/CodeGen/X86/pmul.ll
index 7370aa31fe6..657d51c05bd 100644
--- a/test/CodeGen/X86/pmul.ll
+++ b/test/CodeGen/X86/pmul.ll
@@ -929,269 +929,16 @@ define <64 x i8> @mul_v64i8c(<64 x i8> %i) nounwind {
 ;
 ; AVX512BW-LABEL: mul_v64i8c:
 ; AVX512BW: # BB#0: # %entry
-; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
-; AVX512BW-NEXT: vpextrb $1, %xmm1, %eax
-; AVX512BW-NEXT: movb $117, %cl
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %edx
-; AVX512BW-NEXT: vpextrb $0, %xmm1, %eax
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vmovd %eax, %xmm2
-; AVX512BW-NEXT: vpinsrb $1, %edx, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $2, %xmm1, %eax
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $3, %xmm1, %eax
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $4, %xmm1, %eax
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $5, %xmm1, %eax
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $6, %xmm1, %eax
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $7, %xmm1, %eax
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $8, %xmm1, %eax
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $9, %xmm1, %eax
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $10, %xmm1, %eax
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $11, %xmm1, %eax
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $12, %xmm1, %eax
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $13, %xmm1, %eax
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $14, %xmm1, %eax
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $15, %xmm1, %eax
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm2, %xmm1
-; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm2
-; AVX512BW-NEXT: vpextrb $1, %xmm2, %eax
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %edx
-; AVX512BW-NEXT: vpextrb $0, %xmm2, %eax
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vmovd %eax, %xmm3
-; AVX512BW-NEXT: vpinsrb $1, %edx, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $2, %xmm2, %eax
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $3, %xmm2, %eax
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $4, %xmm2, %eax
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $5, %xmm2, %eax
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $6, %xmm2, %eax
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $7, %xmm2, %eax
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $8, %xmm2, %eax
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $9, %xmm2, %eax
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $10, %xmm2, %eax
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $11, %xmm2, %eax
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $12, %xmm2, %eax
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $13, %xmm2, %eax
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $14, %xmm2, %eax
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $15, %xmm2, %eax
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm2
-; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
-; AVX512BW-NEXT: vextracti32x4 $1, %zmm0, %xmm2
-; AVX512BW-NEXT: vpextrb $1, %xmm2, %eax
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %edx
-; AVX512BW-NEXT: vpextrb $0, %xmm2, %eax
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vmovd %eax, %xmm3
-; AVX512BW-NEXT: vpinsrb $1, %edx, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $2, %xmm2, %eax
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $3, %xmm2, %eax
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $4, %xmm2, %eax
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $5, %xmm2, %eax
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $6, %xmm2, %eax
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $7, %xmm2, %eax
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $8, %xmm2, %eax
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $9, %xmm2, %eax
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $10, %xmm2, %eax
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $11, %xmm2, %eax
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $12, %xmm2, %eax
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $13, %xmm2, %eax
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $14, %xmm2, %eax
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $15, %xmm2, %eax
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm2
-; AVX512BW-NEXT: vpextrb $1, %xmm0, %eax
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %edx
-; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vmovd %eax, %xmm3
-; AVX512BW-NEXT: vpinsrb $1, %edx, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $2, %xmm0, %eax
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $3, %xmm0, %eax
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $4, %xmm0, %eax
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $5, %xmm0, %eax
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $6, %xmm0, %eax
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $7, %xmm0, %eax
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $8, %xmm0, %eax
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $9, %xmm0, %eax
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $10, %xmm0, %eax
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $11, %xmm0, %eax
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $12, %xmm0, %eax
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $13, %xmm0, %eax
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $14, %xmm0, %eax
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $15, %xmm0, %eax
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm0
-; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vmovaps {{.*#+}} ymm1 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
+; AVX512BW-NEXT: vpmovsxbw %ymm1, %zmm1
+; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm2
+; AVX512BW-NEXT: vpmullw %zmm1, %zmm2, %zmm2
+; AVX512BW-NEXT: vpmovwb %zmm2, %ymm2
+; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0
+; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
 ; AVX512BW-NEXT: retq
 entry:
   %A = mul <64 x i8> %i, < i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117 >
@@ -1388,335 +1135,17 @@ define <64 x i8> @mul_v64i8(<64 x i8> %i, <64 x i8> %j) nounwind {
 ;
 ; AVX512BW-LABEL: mul_v64i8:
 ; AVX512BW: # BB#0: # %entry
-; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm2
-; AVX512BW-NEXT: vpextrb $1, %xmm2, %eax
-; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm3
-; AVX512BW-NEXT: vpextrb $1, %xmm3, %ecx
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %ecx
-; AVX512BW-NEXT: vpextrb $0, %xmm2, %eax
-; AVX512BW-NEXT: vpextrb $0, %xmm3, %edx
-; AVX512BW-NEXT: mulb %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vmovd %eax, %xmm4
-; AVX512BW-NEXT: vpinsrb $1, %ecx, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $2, %xmm2, %eax
-; AVX512BW-NEXT: vpextrb $2, %xmm3, %ecx
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $3, %xmm2, %eax
-; AVX512BW-NEXT: vpextrb $3, %xmm3, %ecx
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $4, %xmm2, %eax
-; AVX512BW-NEXT: vpextrb $4, %xmm3, %ecx
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $5, %xmm2, %eax
-; AVX512BW-NEXT: vpextrb $5, %xmm3, %ecx
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $6, %xmm2, %eax
-; AVX512BW-NEXT: vpextrb $6, %xmm3, %ecx
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $7, %xmm2, %eax
-; AVX512BW-NEXT: vpextrb $7, %xmm3, %ecx
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $8, %xmm2, %eax
-; AVX512BW-NEXT: vpextrb $8, %xmm3, %ecx
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $9, %xmm2, %eax
-; AVX512BW-NEXT: vpextrb $9, %xmm3, %ecx
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $10, %xmm2, %eax
-; AVX512BW-NEXT: vpextrb $10, %xmm3, %ecx
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $11, %xmm2, %eax
-; AVX512BW-NEXT: vpextrb $11, %xmm3, %ecx
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $12, %xmm2, %eax
-; AVX512BW-NEXT: vpextrb $12, %xmm3, %ecx
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $13, %xmm2, %eax
-; AVX512BW-NEXT: vpextrb $13, %xmm3, %ecx
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $14, %xmm2, %eax
-; AVX512BW-NEXT: vpextrb $14, %xmm3, %ecx
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $15, %xmm2, %eax
-; AVX512BW-NEXT: vpextrb $15, %xmm3, %ecx
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm4, %xmm2
-; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm3
-; AVX512BW-NEXT: vpextrb $1, %xmm3, %eax
-; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, %xmm4
-; AVX512BW-NEXT: vpextrb $1, %xmm4, %ecx
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %ecx
-; AVX512BW-NEXT: vpextrb $0, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $0, %xmm4, %edx
-; AVX512BW-NEXT: mulb %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vmovd %eax, %xmm5
-; AVX512BW-NEXT: vpinsrb $1, %ecx, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $2, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $2, %xmm4, %ecx
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $3, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $3, %xmm4, %ecx
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $4, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $4, %xmm4, %ecx
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $5, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $5, %xmm4, %ecx
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $6, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $6, %xmm4, %ecx
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $7, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $7, %xmm4, %ecx
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $8, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $8, %xmm4, %ecx
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $9, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $9, %xmm4, %ecx
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $10, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $10, %xmm4, %ecx
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $11, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $11, %xmm4, %ecx
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $12, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $12, %xmm4, %ecx
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $13, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $13, %xmm4, %ecx
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $14, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $14, %xmm4, %ecx
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $15, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $15, %xmm4, %ecx
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm5, %xmm3
-; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
-; AVX512BW-NEXT: vextracti32x4 $1, %zmm0, %xmm3
-; AVX512BW-NEXT: vpextrb $1, %xmm3, %eax
-; AVX512BW-NEXT: vextracti32x4 $1, %zmm1, %xmm4
-; AVX512BW-NEXT: vpextrb $1, %xmm4, %ecx
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %ecx
-; AVX512BW-NEXT: vpextrb $0, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $0, %xmm4, %edx
-; AVX512BW-NEXT: mulb %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vmovd %eax, %xmm5
-; AVX512BW-NEXT: vpinsrb $1, %ecx, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $2, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $2, %xmm4, %ecx
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $3, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $3, %xmm4, %ecx
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $4, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $4, %xmm4, %ecx
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $5, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $5, %xmm4, %ecx
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $6, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $6, %xmm4, %ecx
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $7, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $7, %xmm4, %ecx
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $8, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $8, %xmm4, %ecx
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $9, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $9, %xmm4, %ecx
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $10, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $10, %xmm4, %ecx
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $11, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $11, %xmm4, %ecx
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $12, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $12, %xmm4, %ecx
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $13, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $13, %xmm4, %ecx
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $14, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $14, %xmm4, %ecx
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $15, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $15, %xmm4, %ecx
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm5, %xmm3
-; AVX512BW-NEXT: vpextrb $1, %xmm0, %eax
-; AVX512BW-NEXT: vpextrb $1, %xmm1, %ecx
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %ecx
-; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512BW-NEXT: vpextrb $0, %xmm1, %edx
-; AVX512BW-NEXT: mulb %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vmovd %eax, %xmm4
-; AVX512BW-NEXT: vpinsrb $1, %ecx, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $2, %xmm0, %eax
-; AVX512BW-NEXT: vpextrb $2, %xmm1, %ecx
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $3, %xmm0, %eax
-; AVX512BW-NEXT: vpextrb $3, %xmm1, %ecx
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $4, %xmm0, %eax
-; AVX512BW-NEXT: vpextrb $4, %xmm1, %ecx
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $5, %xmm0, %eax
-; AVX512BW-NEXT: vpextrb $5, %xmm1, %ecx
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $6, %xmm0, %eax
-; AVX512BW-NEXT: vpextrb $6, %xmm1, %ecx
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $7, %xmm0, %eax
-; AVX512BW-NEXT: vpextrb $7, %xmm1, %ecx
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $8, %xmm0, %eax
-; AVX512BW-NEXT: vpextrb $8, %xmm1, %ecx
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $9, %xmm0, %eax
-; AVX512BW-NEXT: vpextrb $9, %xmm1, %ecx
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $10, %xmm0, %eax
-; AVX512BW-NEXT: vpextrb $10, %xmm1, %ecx
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $11, %xmm0, %eax
-; AVX512BW-NEXT: vpextrb $11, %xmm1, %ecx
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $12, %xmm0, %eax
-; AVX512BW-NEXT: vpextrb $12, %xmm1, %ecx
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $13, %xmm0, %eax
-; AVX512BW-NEXT: vpextrb $13, %xmm1, %ecx
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $14, %xmm0, %eax
-; AVX512BW-NEXT: vpextrb $14, %xmm1, %ecx
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $15, %xmm0, %eax
-; AVX512BW-NEXT: vpextrb $15, %xmm1, %ecx
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm4, %xmm0
-; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovsxbw %ymm1, %zmm2
+; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm3
+; AVX512BW-NEXT: vpmullw %zmm2, %zmm3, %zmm2
+; AVX512BW-NEXT: vpmovwb %zmm2, %ymm2
+; AVX512BW-NEXT: vextracti64x4 $1, %zmm1, %ymm1
+; AVX512BW-NEXT: vpmovsxbw %ymm1, %zmm1
+; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0
+; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
 ; AVX512BW-NEXT: retq
 entry:
   %A = mul <64 x i8> %i, %j
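For comparison, here is a hand-written AVX-512BW intrinsics sketch of the same widen/multiply/narrow pattern the new mul_v64i8 CHECK lines expect. It is an illustration only, not code from the patch, and assumes a target with AVX512BW enabled (e.g. compiled with -mavx512bw).

// Element-wise v64i8 multiply via two v32i16 multiplies (illustrative).
#include <immintrin.h>

__m512i mul_v64i8(__m512i I, __m512i J) {
  // Low 256-bit halves: vpmovsxbw + vpmullw + vpmovwb.
  __m512i LoI = _mm512_cvtepi8_epi16(_mm512_castsi512_si256(I));
  __m512i LoJ = _mm512_cvtepi8_epi16(_mm512_castsi512_si256(J));
  __m256i Lo  = _mm512_cvtepi16_epi8(_mm512_mullo_epi16(LoI, LoJ));

  // High 256-bit halves: vextracti64x4 $1, then the same pattern.
  __m512i HiI = _mm512_cvtepi8_epi16(_mm512_extracti64x4_epi64(I, 1));
  __m512i HiJ = _mm512_cvtepi8_epi16(_mm512_extracti64x4_epi64(J, 1));
  __m256i Hi  = _mm512_cvtepi16_epi8(_mm512_mullo_epi16(HiI, HiJ));

  // Reassemble the two 256-bit results: vinserti64x4 $1.
  return _mm512_inserti64x4(_mm512_castsi256_si512(Lo), Hi, 1);
}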