From 373eadc326b8a41d24866c12155e26d78b8ec831 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 21 Dec 2016 20:00:10 +0000 Subject: [PATCH] [X86][SSE] Improve lowering of vXi64 multiplies As mentioned on PR30845, we were performing our vXi64 multiplication as: AloBlo = pmuludq(a, b); AloBhi = pmuludq(a, psrlqi(b, 32)); AhiBlo = pmuludq(psrlqi(a, 32), b); return AloBlo + psllqi(AloBhi, 32) + psllqi(AhiBlo, 32); when we could avoid one of the upper shifts with: AloBlo = pmuludq(a, b); AloBhi = pmuludq(a, psrlqi(b, 32)); AhiBlo = pmuludq(psrlqi(a, 32), b); return AloBlo + psllqi(AloBhi + AhiBlo, 32); This matches the lowering on gcc/icc. Differential Revision: https://reviews.llvm.org/D27756 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@290267 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelLowering.cpp | 45 ++- lib/Target/X86/X86TargetTransformInfo.cpp | 16 +- test/Analysis/CostModel/X86/arith.ll | 48 +-- test/CodeGen/X86/avx-arith.ll | 22 +- test/CodeGen/X86/avx512-arith.ll | 99 +++-- test/CodeGen/X86/combine-mul.ll | 53 ++- test/CodeGen/X86/pmul.ll | 171 ++++---- test/CodeGen/X86/shrink_vmul.ll | 5 +- test/CodeGen/X86/vector-trunc-math.ll | 453 ++++++++++------------ 9 files changed, 426 insertions(+), 486 deletions(-) diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index a25514f28ef..aa907dd9bd7 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -20671,13 +20671,12 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget, // AloBlo = pmuludq(a, b); // AloBhi = pmuludq(a, Bhi); // AhiBlo = pmuludq(Ahi, b); - - // AloBhi = psllqi(AloBhi, 32); - // AhiBlo = psllqi(AhiBlo, 32); - // return AloBlo + AloBhi + AhiBlo; + // + // Hi = psllqi(AloBhi + AhiBlo, 32); + // return AloBlo + Hi; APInt LowerBitsMask = APInt::getLowBitsSet(64, 32); - bool ALoiIsZero = DAG.MaskedValueIsZero(A, LowerBitsMask); - bool BLoiIsZero = DAG.MaskedValueIsZero(B, 
LowerBitsMask); + bool ALoIsZero = DAG.MaskedValueIsZero(A, LowerBitsMask); + bool BLoIsZero = DAG.MaskedValueIsZero(B, LowerBitsMask); APInt UpperBitsMask = APInt::getHighBitsSet(64, 32); bool AHiIsZero = DAG.MaskedValueIsZero(A, UpperBitsMask); @@ -20687,29 +20686,31 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget, SDValue Alo = DAG.getBitcast(MulVT, A); SDValue Blo = DAG.getBitcast(MulVT, B); - SDValue Res; + SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl); // Only multiply lo/hi halves that aren't known to be zero. - if (!ALoiIsZero && !BLoiIsZero) - Res = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Alo, Blo); + SDValue AloBlo = Zero; + if (!ALoIsZero && !BLoIsZero) + AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Alo, Blo); - if (!ALoiIsZero && !BHiIsZero) { + SDValue AloBhi = Zero; + if (!ALoIsZero && !BHiIsZero) { SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG); Bhi = DAG.getBitcast(MulVT, Bhi); - SDValue AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Alo, Bhi); - AloBhi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AloBhi, 32, DAG); - Res = (Res.getNode() ? DAG.getNode(ISD::ADD, dl, VT, Res, AloBhi) : AloBhi); + AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Alo, Bhi); } - if (!AHiIsZero && !BLoiIsZero) { + SDValue AhiBlo = Zero; + if (!AHiIsZero && !BLoIsZero) { SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG); Ahi = DAG.getBitcast(MulVT, Ahi); - SDValue AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, Blo); - AhiBlo = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AhiBlo, 32, DAG); - Res = (Res.getNode() ? DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo) : AhiBlo); + AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, Blo); } - return (Res.getNode() ? 
Res : getZeroVector(VT, Subtarget, DAG, dl)); + SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo); + Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG); + + return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi); } static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget, @@ -30011,6 +30012,14 @@ static SDValue combineVectorShift(SDNode *N, SelectionDAG &DAG, if (ShiftVal.zextOrTrunc(8).uge(NumBitsPerElt)) return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N)); + // Shift N0 by zero -> N0. + if (!ShiftVal) + return N->getOperand(0); + + // Shift zero -> zero. + if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode())) + return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N)); + // We can decode 'whole byte' logical bit shifts as shuffles. if ((ShiftVal.getZExtValue() % 8) == 0) { SDValue Op(N, 0); diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp index 6b5b5a1528e..db563c08632 100644 --- a/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/lib/Target/X86/X86TargetTransformInfo.cpp @@ -551,11 +551,11 @@ int X86TTIImpl::getArithmeticInstrCost( { ISD::SUB, MVT::v4i64, 4 }, { ISD::ADD, MVT::v4i64, 4 }, // A v4i64 multiply is custom lowered as two split v2i64 vectors that then - // are lowered as a series of long multiplies(3), shifts(4) and adds(2) + // are lowered as a series of long multiplies(3), shifts(3) and adds(2) // Because we believe v4i64 to be a legal type, we must also include the - // split factor of two in the cost table. Therefore, the cost here is 18 - // instead of 9. - { ISD::MUL, MVT::v4i64, 18 }, + // split factor of two in the cost table. Therefore, the cost here is 16 + // instead of 8. + { ISD::MUL, MVT::v4i64, 16 }, }; // Look for AVX1 lowering tricks. @@ -569,10 +569,10 @@ int X86TTIImpl::getArithmeticInstrCost( // Custom lowering of vectors. 
static const CostTblEntry CustomLowered[] = { // A v2i64/v4i64 and multiply is custom lowered as a series of long - // multiplies(3), shifts(4) and adds(2). - { ISD::MUL, MVT::v2i64, 9 }, - { ISD::MUL, MVT::v4i64, 9 }, - { ISD::MUL, MVT::v8i64, 9 } + // multiplies(3), shifts(3) and adds(2). + { ISD::MUL, MVT::v2i64, 8 }, + { ISD::MUL, MVT::v4i64, 8 }, + { ISD::MUL, MVT::v8i64, 8 } }; if (const auto *Entry = CostTableLookup(CustomLowered, ISD, LT.second)) return LT.first * Entry->Cost; diff --git a/test/Analysis/CostModel/X86/arith.ll b/test/Analysis/CostModel/X86/arith.ll index 55766e25cc4..7319efb413d 100644 --- a/test/Analysis/CostModel/X86/arith.ll +++ b/test/Analysis/CostModel/X86/arith.ll @@ -426,28 +426,28 @@ define i32 @and(i32 %arg) { ; CHECK-LABEL: 'mul' define i32 @mul(i32 %arg) { - ; SSSE3: cost of 9 {{.*}} %A = mul - ; SSE42: cost of 9 {{.*}} %A = mul - ; AVX: cost of 9 {{.*}} %A = mul - ; AVX2: cost of 9 {{.*}} %A = mul - ; AVX512F: cost of 9 {{.*}} %A = mul - ; AVX512BW: cost of 9 {{.*}} %A = mul + ; SSSE3: cost of 8 {{.*}} %A = mul + ; SSE42: cost of 8 {{.*}} %A = mul + ; AVX: cost of 8 {{.*}} %A = mul + ; AVX2: cost of 8 {{.*}} %A = mul + ; AVX512F: cost of 8 {{.*}} %A = mul + ; AVX512BW: cost of 8 {{.*}} %A = mul ; AVX512DQ: cost of 1 {{.*}} %A = mul %A = mul <2 x i64> undef, undef - ; SSSE3: cost of 18 {{.*}} %B = mul - ; SSE42: cost of 18 {{.*}} %B = mul - ; AVX: cost of 18 {{.*}} %B = mul - ; AVX2: cost of 9 {{.*}} %B = mul - ; AVX512F: cost of 9 {{.*}} %B = mul - ; AVX512BW: cost of 9 {{.*}} %B = mul + ; SSSE3: cost of 16 {{.*}} %B = mul + ; SSE42: cost of 16 {{.*}} %B = mul + ; AVX: cost of 16 {{.*}} %B = mul + ; AVX2: cost of 8 {{.*}} %B = mul + ; AVX512F: cost of 8 {{.*}} %B = mul + ; AVX512BW: cost of 8 {{.*}} %B = mul ; AVX512DQ: cost of 1 {{.*}} %B = mul %B = mul <4 x i64> undef, undef - ; SSSE3: cost of 36 {{.*}} %C = mul - ; SSE42: cost of 36 {{.*}} %C = mul - ; AVX: cost of 36 {{.*}} %C = mul - ; AVX2: cost of 18 {{.*}} %C = mul - ; 
AVX512F: cost of 9 {{.*}} %C = mul - ; AVX512BW: cost of 9 {{.*}} %C = mul + ; SSSE3: cost of 32 {{.*}} %C = mul + ; SSE42: cost of 32 {{.*}} %C = mul + ; AVX: cost of 32 {{.*}} %C = mul + ; AVX2: cost of 16 {{.*}} %C = mul + ; AVX512F: cost of 8 {{.*}} %C = mul + ; AVX512BW: cost of 8 {{.*}} %C = mul ; AVX512DQ: cost of 1 {{.*}} %C = mul %C = mul <8 x i64> undef, undef @@ -520,12 +520,12 @@ define void @mul_2i32() { ; A <2 x i32> gets expanded to a <2 x i64> vector. ; A <2 x i64> vector multiply is implemented using ; 3 PMULUDQ and 2 PADDS and 4 shifts. - ; SSSE3: cost of 9 {{.*}} %A0 = mul - ; SSE42: cost of 9 {{.*}} %A0 = mul - ; AVX: cost of 9 {{.*}} %A0 = mul - ; AVX2: cost of 9 {{.*}} %A0 = mul - ; AVX512F: cost of 9 {{.*}} %A0 = mul - ; AVX512BW: cost of 9 {{.*}} %A0 = mul + ; SSSE3: cost of 8 {{.*}} %A0 = mul + ; SSE42: cost of 8 {{.*}} %A0 = mul + ; AVX: cost of 8 {{.*}} %A0 = mul + ; AVX2: cost of 8 {{.*}} %A0 = mul + ; AVX512F: cost of 8 {{.*}} %A0 = mul + ; AVX512BW: cost of 8 {{.*}} %A0 = mul ; AVX512DQ: cost of 1 {{.*}} %A0 = mul %A0 = mul <2 x i32> undef, undef diff --git a/test/CodeGen/X86/avx-arith.ll b/test/CodeGen/X86/avx-arith.ll index 66c09e0dfa3..82d890a08cf 100644 --- a/test/CodeGen/X86/avx-arith.ll +++ b/test/CodeGen/X86/avx-arith.ll @@ -323,24 +323,22 @@ define <4 x i64> @mul_v4i64(<4 x i64> %i, <4 x i64> %j) nounwind readnone { ; CHECK: ## BB#0: ; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm3 -; CHECK-NEXT: vpmuludq %xmm2, %xmm3, %xmm4 +; CHECK-NEXT: vpsrlq $32, %xmm3, %xmm4 +; CHECK-NEXT: vpmuludq %xmm2, %xmm4, %xmm4 ; CHECK-NEXT: vpsrlq $32, %xmm2, %xmm5 ; CHECK-NEXT: vpmuludq %xmm5, %xmm3, %xmm5 -; CHECK-NEXT: vpsllq $32, %xmm5, %xmm5 -; CHECK-NEXT: vpaddq %xmm5, %xmm4, %xmm4 -; CHECK-NEXT: vpsrlq $32, %xmm3, %xmm3 +; CHECK-NEXT: vpaddq %xmm4, %xmm5, %xmm4 +; CHECK-NEXT: vpsllq $32, %xmm4, %xmm4 ; CHECK-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 -; CHECK-NEXT: vpsllq $32, %xmm2, %xmm2 -; CHECK-NEXT: 
vpaddq %xmm2, %xmm4, %xmm2 -; CHECK-NEXT: vpmuludq %xmm1, %xmm0, %xmm3 +; CHECK-NEXT: vpaddq %xmm4, %xmm2, %xmm2 +; CHECK-NEXT: vpsrlq $32, %xmm0, %xmm3 +; CHECK-NEXT: vpmuludq %xmm1, %xmm3, %xmm3 ; CHECK-NEXT: vpsrlq $32, %xmm1, %xmm4 ; CHECK-NEXT: vpmuludq %xmm4, %xmm0, %xmm4 -; CHECK-NEXT: vpsllq $32, %xmm4, %xmm4 -; CHECK-NEXT: vpaddq %xmm4, %xmm3, %xmm3 -; CHECK-NEXT: vpsrlq $32, %xmm0, %xmm0 +; CHECK-NEXT: vpaddq %xmm3, %xmm4, %xmm3 +; CHECK-NEXT: vpsllq $32, %xmm3, %xmm3 ; CHECK-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vpsllq $32, %xmm0, %xmm0 -; CHECK-NEXT: vpaddq %xmm0, %xmm3, %xmm0 +; CHECK-NEXT: vpaddq %xmm3, %xmm0, %xmm0 ; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; CHECK-NEXT: retq %x = mul <4 x i64> %i, %j diff --git a/test/CodeGen/X86/avx512-arith.ll b/test/CodeGen/X86/avx512-arith.ll index d399e808f7f..f9c7e207b78 100644 --- a/test/CodeGen/X86/avx512-arith.ll +++ b/test/CodeGen/X86/avx512-arith.ll @@ -90,41 +90,38 @@ entry: define <8 x i64> @imulq512(<8 x i64> %y, <8 x i64> %x) { ; AVX512F-LABEL: imulq512: ; AVX512F: ## BB#0: -; AVX512F-NEXT: vpmuludq %zmm0, %zmm1, %zmm2 +; AVX512F-NEXT: vpsrlq $32, %zmm1, %zmm2 +; AVX512F-NEXT: vpmuludq %zmm0, %zmm2, %zmm2 ; AVX512F-NEXT: vpsrlq $32, %zmm0, %zmm3 ; AVX512F-NEXT: vpmuludq %zmm3, %zmm1, %zmm3 -; AVX512F-NEXT: vpsllq $32, %zmm3, %zmm3 -; AVX512F-NEXT: vpsrlq $32, %zmm1, %zmm1 +; AVX512F-NEXT: vpaddq %zmm2, %zmm3, %zmm2 +; AVX512F-NEXT: vpsllq $32, %zmm2, %zmm2 ; AVX512F-NEXT: vpmuludq %zmm0, %zmm1, %zmm0 -; AVX512F-NEXT: vpsllq $32, %zmm0, %zmm0 -; AVX512F-NEXT: vpaddq %zmm0, %zmm3, %zmm0 -; AVX512F-NEXT: vpaddq %zmm0, %zmm2, %zmm0 +; AVX512F-NEXT: vpaddq %zmm2, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: imulq512: ; AVX512VL: ## BB#0: -; AVX512VL-NEXT: vpmuludq %zmm0, %zmm1, %zmm2 +; AVX512VL-NEXT: vpsrlq $32, %zmm1, %zmm2 +; AVX512VL-NEXT: vpmuludq %zmm0, %zmm2, %zmm2 ; AVX512VL-NEXT: vpsrlq $32, %zmm0, %zmm3 ; AVX512VL-NEXT: vpmuludq %zmm3, %zmm1, %zmm3 -; AVX512VL-NEXT: 
vpsllq $32, %zmm3, %zmm3 -; AVX512VL-NEXT: vpsrlq $32, %zmm1, %zmm1 +; AVX512VL-NEXT: vpaddq %zmm2, %zmm3, %zmm2 +; AVX512VL-NEXT: vpsllq $32, %zmm2, %zmm2 ; AVX512VL-NEXT: vpmuludq %zmm0, %zmm1, %zmm0 -; AVX512VL-NEXT: vpsllq $32, %zmm0, %zmm0 -; AVX512VL-NEXT: vpaddq %zmm0, %zmm3, %zmm0 -; AVX512VL-NEXT: vpaddq %zmm0, %zmm2, %zmm0 +; AVX512VL-NEXT: vpaddq %zmm2, %zmm0, %zmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: imulq512: ; AVX512BW: ## BB#0: -; AVX512BW-NEXT: vpmuludq %zmm0, %zmm1, %zmm2 +; AVX512BW-NEXT: vpsrlq $32, %zmm1, %zmm2 +; AVX512BW-NEXT: vpmuludq %zmm0, %zmm2, %zmm2 ; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm3 ; AVX512BW-NEXT: vpmuludq %zmm3, %zmm1, %zmm3 -; AVX512BW-NEXT: vpsllq $32, %zmm3, %zmm3 -; AVX512BW-NEXT: vpsrlq $32, %zmm1, %zmm1 +; AVX512BW-NEXT: vpaddq %zmm2, %zmm3, %zmm2 +; AVX512BW-NEXT: vpsllq $32, %zmm2, %zmm2 ; AVX512BW-NEXT: vpmuludq %zmm0, %zmm1, %zmm0 -; AVX512BW-NEXT: vpsllq $32, %zmm0, %zmm0 -; AVX512BW-NEXT: vpaddq %zmm0, %zmm3, %zmm0 -; AVX512BW-NEXT: vpaddq %zmm0, %zmm2, %zmm0 +; AVX512BW-NEXT: vpaddq %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: imulq512: @@ -143,41 +140,38 @@ define <8 x i64> @imulq512(<8 x i64> %y, <8 x i64> %x) { define <4 x i64> @imulq256(<4 x i64> %y, <4 x i64> %x) { ; AVX512F-LABEL: imulq256: ; AVX512F: ## BB#0: -; AVX512F-NEXT: vpmuludq %ymm0, %ymm1, %ymm2 +; AVX512F-NEXT: vpsrlq $32, %ymm1, %ymm2 +; AVX512F-NEXT: vpmuludq %ymm0, %ymm2, %ymm2 ; AVX512F-NEXT: vpsrlq $32, %ymm0, %ymm3 ; AVX512F-NEXT: vpmuludq %ymm3, %ymm1, %ymm3 -; AVX512F-NEXT: vpsllq $32, %ymm3, %ymm3 -; AVX512F-NEXT: vpsrlq $32, %ymm1, %ymm1 +; AVX512F-NEXT: vpaddq %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpsllq $32, %ymm2, %ymm2 ; AVX512F-NEXT: vpmuludq %ymm0, %ymm1, %ymm0 -; AVX512F-NEXT: vpsllq $32, %ymm0, %ymm0 -; AVX512F-NEXT: vpaddq %ymm0, %ymm3, %ymm0 -; AVX512F-NEXT: vpaddq %ymm0, %ymm2, %ymm0 +; AVX512F-NEXT: vpaddq %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: imulq256: ; AVX512VL: ## BB#0: -; 
AVX512VL-NEXT: vpmuludq %ymm0, %ymm1, %ymm2 +; AVX512VL-NEXT: vpsrlq $32, %ymm1, %ymm2 +; AVX512VL-NEXT: vpmuludq %ymm0, %ymm2, %ymm2 ; AVX512VL-NEXT: vpsrlq $32, %ymm0, %ymm3 ; AVX512VL-NEXT: vpmuludq %ymm3, %ymm1, %ymm3 -; AVX512VL-NEXT: vpsllq $32, %ymm3, %ymm3 -; AVX512VL-NEXT: vpsrlq $32, %ymm1, %ymm1 +; AVX512VL-NEXT: vpaddq %ymm2, %ymm3, %ymm2 +; AVX512VL-NEXT: vpsllq $32, %ymm2, %ymm2 ; AVX512VL-NEXT: vpmuludq %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: vpsllq $32, %ymm0, %ymm0 -; AVX512VL-NEXT: vpaddq %ymm0, %ymm3, %ymm0 -; AVX512VL-NEXT: vpaddq %ymm0, %ymm2, %ymm0 +; AVX512VL-NEXT: vpaddq %ymm2, %ymm0, %ymm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: imulq256: ; AVX512BW: ## BB#0: -; AVX512BW-NEXT: vpmuludq %ymm0, %ymm1, %ymm2 +; AVX512BW-NEXT: vpsrlq $32, %ymm1, %ymm2 +; AVX512BW-NEXT: vpmuludq %ymm0, %ymm2, %ymm2 ; AVX512BW-NEXT: vpsrlq $32, %ymm0, %ymm3 ; AVX512BW-NEXT: vpmuludq %ymm3, %ymm1, %ymm3 -; AVX512BW-NEXT: vpsllq $32, %ymm3, %ymm3 -; AVX512BW-NEXT: vpsrlq $32, %ymm1, %ymm1 +; AVX512BW-NEXT: vpaddq %ymm2, %ymm3, %ymm2 +; AVX512BW-NEXT: vpsllq $32, %ymm2, %ymm2 ; AVX512BW-NEXT: vpmuludq %ymm0, %ymm1, %ymm0 -; AVX512BW-NEXT: vpsllq $32, %ymm0, %ymm0 -; AVX512BW-NEXT: vpaddq %ymm0, %ymm3, %ymm0 -; AVX512BW-NEXT: vpaddq %ymm0, %ymm2, %ymm0 +; AVX512BW-NEXT: vpaddq %ymm2, %ymm0, %ymm0 ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: imulq256: @@ -199,41 +193,38 @@ define <4 x i64> @imulq256(<4 x i64> %y, <4 x i64> %x) { define <2 x i64> @imulq128(<2 x i64> %y, <2 x i64> %x) { ; AVX512F-LABEL: imulq128: ; AVX512F: ## BB#0: -; AVX512F-NEXT: vpmuludq %xmm0, %xmm1, %xmm2 +; AVX512F-NEXT: vpsrlq $32, %xmm1, %xmm2 +; AVX512F-NEXT: vpmuludq %xmm0, %xmm2, %xmm2 ; AVX512F-NEXT: vpsrlq $32, %xmm0, %xmm3 ; AVX512F-NEXT: vpmuludq %xmm3, %xmm1, %xmm3 -; AVX512F-NEXT: vpsllq $32, %xmm3, %xmm3 -; AVX512F-NEXT: vpsrlq $32, %xmm1, %xmm1 +; AVX512F-NEXT: vpaddq %xmm2, %xmm3, %xmm2 +; AVX512F-NEXT: vpsllq $32, %xmm2, %xmm2 ; AVX512F-NEXT: vpmuludq %xmm0, %xmm1, %xmm0 -; 
AVX512F-NEXT: vpsllq $32, %xmm0, %xmm0 -; AVX512F-NEXT: vpaddq %xmm0, %xmm3, %xmm0 -; AVX512F-NEXT: vpaddq %xmm0, %xmm2, %xmm0 +; AVX512F-NEXT: vpaddq %xmm2, %xmm0, %xmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: imulq128: ; AVX512VL: ## BB#0: -; AVX512VL-NEXT: vpmuludq %xmm0, %xmm1, %xmm2 +; AVX512VL-NEXT: vpsrlq $32, %xmm1, %xmm2 +; AVX512VL-NEXT: vpmuludq %xmm0, %xmm2, %xmm2 ; AVX512VL-NEXT: vpsrlq $32, %xmm0, %xmm3 ; AVX512VL-NEXT: vpmuludq %xmm3, %xmm1, %xmm3 -; AVX512VL-NEXT: vpsllq $32, %xmm3, %xmm3 -; AVX512VL-NEXT: vpsrlq $32, %xmm1, %xmm1 +; AVX512VL-NEXT: vpaddq %xmm2, %xmm3, %xmm2 +; AVX512VL-NEXT: vpsllq $32, %xmm2, %xmm2 ; AVX512VL-NEXT: vpmuludq %xmm0, %xmm1, %xmm0 -; AVX512VL-NEXT: vpsllq $32, %xmm0, %xmm0 -; AVX512VL-NEXT: vpaddq %xmm0, %xmm3, %xmm0 -; AVX512VL-NEXT: vpaddq %xmm0, %xmm2, %xmm0 +; AVX512VL-NEXT: vpaddq %xmm2, %xmm0, %xmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: imulq128: ; AVX512BW: ## BB#0: -; AVX512BW-NEXT: vpmuludq %xmm0, %xmm1, %xmm2 +; AVX512BW-NEXT: vpsrlq $32, %xmm1, %xmm2 +; AVX512BW-NEXT: vpmuludq %xmm0, %xmm2, %xmm2 ; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm3 ; AVX512BW-NEXT: vpmuludq %xmm3, %xmm1, %xmm3 -; AVX512BW-NEXT: vpsllq $32, %xmm3, %xmm3 -; AVX512BW-NEXT: vpsrlq $32, %xmm1, %xmm1 +; AVX512BW-NEXT: vpaddq %xmm2, %xmm3, %xmm2 +; AVX512BW-NEXT: vpsllq $32, %xmm2, %xmm2 ; AVX512BW-NEXT: vpmuludq %xmm0, %xmm1, %xmm0 -; AVX512BW-NEXT: vpsllq $32, %xmm0, %xmm0 -; AVX512BW-NEXT: vpaddq %xmm0, %xmm3, %xmm0 -; AVX512BW-NEXT: vpaddq %xmm0, %xmm2, %xmm0 +; AVX512BW-NEXT: vpaddq %xmm2, %xmm0, %xmm0 ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: imulq128: diff --git a/test/CodeGen/X86/combine-mul.ll b/test/CodeGen/X86/combine-mul.ll index 7d60a21c447..3a805828024 100644 --- a/test/CodeGen/X86/combine-mul.ll +++ b/test/CodeGen/X86/combine-mul.ll @@ -172,42 +172,39 @@ define <4 x i32> @combine_vec_mul_negpow2b(<4 x i32> %x) { define <4 x i64> @combine_vec_mul_negpow2c(<4 x i64> %x) { ; SSE-LABEL: combine_vec_mul_negpow2c: ; SSE: # 
BB#0: -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [18446744073709551615,18446744073709551614] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,4294967295] ; SSE-NEXT: movdqa %xmm0, %xmm3 ; SSE-NEXT: pmuludq %xmm2, %xmm3 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [4294967295,4294967295] -; SSE-NEXT: movdqa %xmm0, %xmm5 -; SSE-NEXT: pmuludq %xmm4, %xmm5 -; SSE-NEXT: psllq $32, %xmm5 -; SSE-NEXT: psrlq $32, %xmm0 -; SSE-NEXT: pmuludq %xmm2, %xmm0 -; SSE-NEXT: psllq $32, %xmm0 -; SSE-NEXT: paddq %xmm5, %xmm0 -; SSE-NEXT: paddq %xmm3, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [18446744073709551612,18446744073709551600] -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: pmuludq %xmm2, %xmm3 -; SSE-NEXT: pmuludq %xmm1, %xmm4 +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: psrlq $32, %xmm4 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [18446744073709551615,18446744073709551614] +; SSE-NEXT: pmuludq %xmm5, %xmm4 +; SSE-NEXT: paddq %xmm3, %xmm4 ; SSE-NEXT: psllq $32, %xmm4 -; SSE-NEXT: psrlq $32, %xmm1 -; SSE-NEXT: pmuludq %xmm2, %xmm1 -; SSE-NEXT: psllq $32, %xmm1 -; SSE-NEXT: paddq %xmm4, %xmm1 +; SSE-NEXT: pmuludq %xmm5, %xmm0 +; SSE-NEXT: paddq %xmm4, %xmm0 +; SSE-NEXT: pmuludq %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: psrlq $32, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [18446744073709551612,18446744073709551600] +; SSE-NEXT: pmuludq %xmm4, %xmm3 +; SSE-NEXT: paddq %xmm2, %xmm3 +; SSE-NEXT: psllq $32, %xmm3 +; SSE-NEXT: pmuludq %xmm4, %xmm1 ; SSE-NEXT: paddq %xmm3, %xmm1 ; SSE-NEXT: retq ; ; AVX-LABEL: combine_vec_mul_negpow2c: ; AVX: # BB#0: -; AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [18446744073709551615,18446744073709551614,18446744073709551612,18446744073709551600] -; AVX-NEXT: vpmuludq %ymm1, %ymm0, %ymm2 -; AVX-NEXT: vpbroadcastq {{.*}}(%rip), %ymm3 -; AVX-NEXT: vpmuludq %ymm3, %ymm0, %ymm3 -; AVX-NEXT: vpsllq $32, %ymm3, %ymm3 -; AVX-NEXT: vpsrlq $32, %ymm0, %ymm0 -; AVX-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vpsllq $32, %ymm0, %ymm0 -; AVX-NEXT: vpaddq %ymm0, %ymm3, %ymm0 -; 
AVX-NEXT: vpaddq %ymm0, %ymm2, %ymm0 +; AVX-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1 +; AVX-NEXT: vpmuludq %ymm1, %ymm0, %ymm1 +; AVX-NEXT: vpsrlq $32, %ymm0, %ymm2 +; AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [18446744073709551615,18446744073709551614,18446744073709551612,18446744073709551600] +; AVX-NEXT: vpmuludq %ymm3, %ymm2, %ymm2 +; AVX-NEXT: vpaddq %ymm2, %ymm1, %ymm1 +; AVX-NEXT: vpsllq $32, %ymm1, %ymm1 +; AVX-NEXT: vpmuludq %ymm3, %ymm0, %ymm0 +; AVX-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX-NEXT: retq %1 = mul <4 x i64> %x, ret <4 x i64> %1 diff --git a/test/CodeGen/X86/pmul.ll b/test/CodeGen/X86/pmul.ll index 32a7c376170..7d9ef28a090 100644 --- a/test/CodeGen/X86/pmul.ll +++ b/test/CodeGen/X86/pmul.ll @@ -256,29 +256,27 @@ define <2 x i64> @mul_v2i64(<2 x i64> %i, <2 x i64> %j) nounwind { ; SSE-LABEL: mul_v2i64: ; SSE: # BB#0: # %entry ; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: psrlq $32, %xmm2 ; SSE-NEXT: pmuludq %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm1, %xmm3 ; SSE-NEXT: psrlq $32, %xmm3 ; SSE-NEXT: pmuludq %xmm0, %xmm3 +; SSE-NEXT: paddq %xmm2, %xmm3 ; SSE-NEXT: psllq $32, %xmm3 -; SSE-NEXT: psrlq $32, %xmm0 ; SSE-NEXT: pmuludq %xmm1, %xmm0 -; SSE-NEXT: psllq $32, %xmm0 ; SSE-NEXT: paddq %xmm3, %xmm0 -; SSE-NEXT: paddq %xmm2, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: mul_v2i64: ; AVX: # BB#0: # %entry -; AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 +; AVX-NEXT: vpsrlq $32, %xmm0, %xmm2 +; AVX-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 ; AVX-NEXT: vpsrlq $32, %xmm1, %xmm3 ; AVX-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 -; AVX-NEXT: vpsllq $32, %xmm3, %xmm3 -; AVX-NEXT: vpsrlq $32, %xmm0, %xmm0 +; AVX-NEXT: vpaddq %xmm2, %xmm3, %xmm2 +; AVX-NEXT: vpsllq $32, %xmm2, %xmm2 ; AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsllq $32, %xmm0, %xmm0 -; AVX-NEXT: vpaddq %xmm0, %xmm3, %xmm0 -; AVX-NEXT: vpaddq %xmm0, %xmm2, %xmm0 +; AVX-NEXT: vpaddq %xmm2, %xmm0, %xmm0 ; AVX-NEXT: retq entry: %A = mul <2 x i64> %i, %j @@ -343,17 +341,16 @@ define <2 x i64> @mul_v2i64spill(<2 x i64> %i, <2 x 
i64> %j) nounwind { ; SSE-NEXT: callq foo ; SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: psrlq $32, %xmm2 ; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload ; SSE-NEXT: pmuludq %xmm3, %xmm2 ; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: psrlq $32, %xmm1 ; SSE-NEXT: pmuludq %xmm0, %xmm1 +; SSE-NEXT: paddq %xmm2, %xmm1 ; SSE-NEXT: psllq $32, %xmm1 -; SSE-NEXT: psrlq $32, %xmm0 ; SSE-NEXT: pmuludq %xmm3, %xmm0 -; SSE-NEXT: psllq $32, %xmm0 ; SSE-NEXT: paddq %xmm1, %xmm0 -; SSE-NEXT: paddq %xmm2, %xmm0 ; SSE-NEXT: addq $40, %rsp ; SSE-NEXT: retq ; @@ -363,18 +360,16 @@ define <2 x i64> @mul_v2i64spill(<2 x i64> %i, <2 x i64> %j) nounwind { ; AVX-NEXT: vmovaps %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; AVX-NEXT: callq foo +; AVX-NEXT: vmovdqa (%rsp), %xmm3 # 16-byte Reload +; AVX-NEXT: vpsrlq $32, %xmm3, %xmm0 ; AVX-NEXT: vmovdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload -; AVX-NEXT: vmovdqa (%rsp), %xmm4 # 16-byte Reload -; AVX-NEXT: vpmuludq %xmm2, %xmm4, %xmm0 +; AVX-NEXT: vpmuludq %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vpsrlq $32, %xmm2, %xmm1 -; AVX-NEXT: vmovdqa %xmm2, %xmm3 -; AVX-NEXT: vpmuludq %xmm1, %xmm4, %xmm1 -; AVX-NEXT: vpsllq $32, %xmm1, %xmm1 -; AVX-NEXT: vpsrlq $32, %xmm4, %xmm2 -; AVX-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vpsllq $32, %xmm2, %xmm2 -; AVX-NEXT: vpaddq %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 +; AVX-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpsllq $32, %xmm0, %xmm0 +; AVX-NEXT: vpmuludq %xmm2, %xmm3, %xmm1 +; AVX-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ; AVX-NEXT: addq $40, %rsp ; AVX-NEXT: retq entry: @@ -736,40 +731,37 @@ define <4 x i64> @mul_v4i64(<4 x i64> %i, <4 x i64> %j) nounwind { ; SSE-LABEL: mul_v4i64: ; SSE: # BB#0: # %entry ; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: psrlq $32, %xmm4 ; SSE-NEXT: pmuludq %xmm2, %xmm4 ; SSE-NEXT: movdqa %xmm2, %xmm5 ; SSE-NEXT: 
psrlq $32, %xmm5 ; SSE-NEXT: pmuludq %xmm0, %xmm5 +; SSE-NEXT: paddq %xmm4, %xmm5 ; SSE-NEXT: psllq $32, %xmm5 -; SSE-NEXT: psrlq $32, %xmm0 ; SSE-NEXT: pmuludq %xmm2, %xmm0 -; SSE-NEXT: psllq $32, %xmm0 ; SSE-NEXT: paddq %xmm5, %xmm0 -; SSE-NEXT: paddq %xmm4, %xmm0 ; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: psrlq $32, %xmm2 ; SSE-NEXT: pmuludq %xmm3, %xmm2 ; SSE-NEXT: movdqa %xmm3, %xmm4 ; SSE-NEXT: psrlq $32, %xmm4 ; SSE-NEXT: pmuludq %xmm1, %xmm4 +; SSE-NEXT: paddq %xmm2, %xmm4 ; SSE-NEXT: psllq $32, %xmm4 -; SSE-NEXT: psrlq $32, %xmm1 ; SSE-NEXT: pmuludq %xmm3, %xmm1 -; SSE-NEXT: psllq $32, %xmm1 ; SSE-NEXT: paddq %xmm4, %xmm1 -; SSE-NEXT: paddq %xmm2, %xmm1 ; SSE-NEXT: retq ; ; AVX-LABEL: mul_v4i64: ; AVX: # BB#0: # %entry -; AVX-NEXT: vpmuludq %ymm1, %ymm0, %ymm2 +; AVX-NEXT: vpsrlq $32, %ymm0, %ymm2 +; AVX-NEXT: vpmuludq %ymm1, %ymm2, %ymm2 ; AVX-NEXT: vpsrlq $32, %ymm1, %ymm3 ; AVX-NEXT: vpmuludq %ymm3, %ymm0, %ymm3 -; AVX-NEXT: vpsllq $32, %ymm3, %ymm3 -; AVX-NEXT: vpsrlq $32, %ymm0, %ymm0 +; AVX-NEXT: vpaddq %ymm2, %ymm3, %ymm2 +; AVX-NEXT: vpsllq $32, %ymm2, %ymm2 ; AVX-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vpsllq $32, %ymm0, %ymm0 -; AVX-NEXT: vpaddq %ymm0, %ymm3, %ymm0 -; AVX-NEXT: vpaddq %ymm0, %ymm2, %ymm0 +; AVX-NEXT: vpaddq %ymm2, %ymm0, %ymm0 ; AVX-NEXT: retq entry: %A = mul <4 x i64> %i, %j @@ -1423,88 +1415,83 @@ entry: define <8 x i64> @mul_v8i64_sext(<8 x i16> %val1, <8 x i32> %val2) { ; SSE2-LABEL: mul_v8i64_sext: ; SSE2: # BB#0: -; SSE2-NEXT: movdqa %xmm2, %xmm9 ; SSE2-NEXT: movdqa %xmm1, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3] +; SSE2-NEXT: movdqa %xmm8, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: psrad $16, %xmm8 +; SSE2-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1] +; SSE2-NEXT: 
pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3] +; SSE2-NEXT: movdqa %xmm9, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: psrad $16, %xmm9 +; SSE2-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7] +; SSE2-NEXT: movdqa %xmm7, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: psrad $16, %xmm7 +; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,0,1] ; SSE2-NEXT: movdqa %xmm3, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: psrad $16, %xmm3 ; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: psrad $16, %xmm1 -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm2, %xmm6 -; SSE2-NEXT: psrad $31, %xmm6 -; SSE2-NEXT: psrad $16, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE2-NEXT: movdqa %xmm0, %xmm6 -; SSE2-NEXT: psrad $31, %xmm6 -; SSE2-NEXT: psrad $16, %xmm0 -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm9[2,3,0,1] -; SSE2-NEXT: movdqa %xmm8, %xmm7 -; SSE2-NEXT: psrad $31, %xmm7 -; SSE2-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] 
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[2,3,0,1] -; SSE2-NEXT: movdqa %xmm6, %xmm7 -; SSE2-NEXT: psrad $31, %xmm7 -; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] -; SSE2-NEXT: movdqa %xmm9, %xmm7 -; SSE2-NEXT: psrad $31, %xmm7 -; SSE2-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm7[0],xmm9[1],xmm7[1] -; SSE2-NEXT: movdqa %xmm4, %xmm7 -; SSE2-NEXT: psrad $31, %xmm7 -; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1] -; SSE2-NEXT: movdqa %xmm0, %xmm7 -; SSE2-NEXT: pmuludq %xmm4, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,3,0,1] +; SSE2-NEXT: movdqa %xmm1, %xmm5 +; SSE2-NEXT: psrad $31, %xmm5 +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] +; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: psrad $31, %xmm5 +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: psrad $31, %xmm5 +; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] ; SSE2-NEXT: movdqa %xmm4, %xmm5 ; SSE2-NEXT: psrlq $32, %xmm5 ; SSE2-NEXT: pmuludq %xmm0, %xmm5 -; SSE2-NEXT: psllq $32, %xmm5 -; SSE2-NEXT: psrlq $32, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm6 +; SSE2-NEXT: psrlq $32, %xmm6 +; SSE2-NEXT: pmuludq %xmm4, %xmm6 +; SSE2-NEXT: paddq %xmm5, %xmm6 +; SSE2-NEXT: psllq $32, %xmm6 ; SSE2-NEXT: pmuludq %xmm4, %xmm0 -; SSE2-NEXT: psllq $32, %xmm0 -; SSE2-NEXT: paddq %xmm5, %xmm0 -; SSE2-NEXT: paddq %xmm7, %xmm0 +; SSE2-NEXT: paddq %xmm6, %xmm0 ; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: psrlq $32, %xmm4 +; SSE2-NEXT: pmuludq %xmm7, %xmm4 +; SSE2-NEXT: movdqa %xmm7, %xmm5 +; SSE2-NEXT: psrlq $32, %xmm5 +; SSE2-NEXT: pmuludq %xmm2, %xmm5 +; SSE2-NEXT: paddq %xmm4, %xmm5 +; SSE2-NEXT: psllq $32, %xmm5 +; SSE2-NEXT: pmuludq %xmm7, %xmm2 +; SSE2-NEXT: paddq %xmm5, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: psrlq $32, %xmm4 ; SSE2-NEXT: pmuludq %xmm9, %xmm4 ; SSE2-NEXT: movdqa %xmm9, %xmm5 ; SSE2-NEXT: psrlq $32, %xmm5 -; SSE2-NEXT: pmuludq 
%xmm2, %xmm5 -; SSE2-NEXT: psllq $32, %xmm5 -; SSE2-NEXT: psrlq $32, %xmm2 -; SSE2-NEXT: pmuludq %xmm9, %xmm2 -; SSE2-NEXT: psllq $32, %xmm2 -; SSE2-NEXT: paddq %xmm5, %xmm2 -; SSE2-NEXT: paddq %xmm4, %xmm2 -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: pmuludq %xmm6, %xmm4 -; SSE2-NEXT: movdqa %xmm6, %xmm5 -; SSE2-NEXT: psrlq $32, %xmm5 ; SSE2-NEXT: pmuludq %xmm1, %xmm5 +; SSE2-NEXT: paddq %xmm4, %xmm5 ; SSE2-NEXT: psllq $32, %xmm5 -; SSE2-NEXT: psrlq $32, %xmm1 -; SSE2-NEXT: pmuludq %xmm6, %xmm1 -; SSE2-NEXT: psllq $32, %xmm1 +; SSE2-NEXT: pmuludq %xmm9, %xmm1 ; SSE2-NEXT: paddq %xmm5, %xmm1 -; SSE2-NEXT: paddq %xmm4, %xmm1 ; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: psrlq $32, %xmm4 ; SSE2-NEXT: pmuludq %xmm8, %xmm4 ; SSE2-NEXT: movdqa %xmm8, %xmm5 ; SSE2-NEXT: psrlq $32, %xmm5 ; SSE2-NEXT: pmuludq %xmm3, %xmm5 +; SSE2-NEXT: paddq %xmm4, %xmm5 ; SSE2-NEXT: psllq $32, %xmm5 -; SSE2-NEXT: psrlq $32, %xmm3 ; SSE2-NEXT: pmuludq %xmm8, %xmm3 -; SSE2-NEXT: psllq $32, %xmm3 ; SSE2-NEXT: paddq %xmm5, %xmm3 -; SSE2-NEXT: paddq %xmm4, %xmm3 ; SSE2-NEXT: retq ; ; SSE41-LABEL: mul_v8i64_sext: diff --git a/test/CodeGen/X86/shrink_vmul.ll b/test/CodeGen/X86/shrink_vmul.ll index 58b4e986f77..d7e99afb2f5 100644 --- a/test/CodeGen/X86/shrink_vmul.ll +++ b/test/CodeGen/X86/shrink_vmul.ll @@ -457,16 +457,15 @@ define void @mul_2xi16_sext_zext(i8* nocapture readonly %a, i8* nocapture readon ; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] ; CHECK-NEXT: movdqa %xmm1, %xmm2 +; CHECK-NEXT: psrlq $32, %xmm2 ; CHECK-NEXT: pmuludq %xmm0, %xmm2 ; CHECK-NEXT: movdqa %xmm0, %xmm3 ; CHECK-NEXT: psrlq $32, %xmm3 ; CHECK-NEXT: pmuludq %xmm1, %xmm3 +; CHECK-NEXT: paddq %xmm2, %xmm3 ; CHECK-NEXT: psllq $32, %xmm3 -; CHECK-NEXT: psrlq $32, %xmm1 ; CHECK-NEXT: pmuludq %xmm0, %xmm1 -; CHECK-NEXT: psllq $32, %xmm1 ; CHECK-NEXT: paddq %xmm3, %xmm1 -; CHECK-NEXT: paddq %xmm2, %xmm1 ; 
CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] ; CHECK-NEXT: movq %xmm0, (%rax,%rdx,4) ; CHECK-NEXT: retq diff --git a/test/CodeGen/X86/vector-trunc-math.ll b/test/CodeGen/X86/vector-trunc-math.ll index 9930e683689..c49108930fa 100644 --- a/test/CodeGen/X86/vector-trunc-math.ll +++ b/test/CodeGen/X86/vector-trunc-math.ll @@ -1542,27 +1542,25 @@ define <4 x i32> @trunc_mul_v4i64_4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind { ; SSE-LABEL: trunc_mul_v4i64_4i32: ; SSE: # BB#0: ; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: psrlq $32, %xmm4 ; SSE-NEXT: pmuludq %xmm3, %xmm4 ; SSE-NEXT: movdqa %xmm3, %xmm5 ; SSE-NEXT: psrlq $32, %xmm5 ; SSE-NEXT: pmuludq %xmm1, %xmm5 +; SSE-NEXT: paddq %xmm4, %xmm5 ; SSE-NEXT: psllq $32, %xmm5 -; SSE-NEXT: psrlq $32, %xmm1 ; SSE-NEXT: pmuludq %xmm3, %xmm1 -; SSE-NEXT: psllq $32, %xmm1 ; SSE-NEXT: paddq %xmm5, %xmm1 -; SSE-NEXT: paddq %xmm4, %xmm1 ; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: psrlq $32, %xmm3 ; SSE-NEXT: pmuludq %xmm2, %xmm3 ; SSE-NEXT: movdqa %xmm2, %xmm4 ; SSE-NEXT: psrlq $32, %xmm4 ; SSE-NEXT: pmuludq %xmm0, %xmm4 +; SSE-NEXT: paddq %xmm3, %xmm4 ; SSE-NEXT: psllq $32, %xmm4 -; SSE-NEXT: psrlq $32, %xmm0 ; SSE-NEXT: pmuludq %xmm2, %xmm0 -; SSE-NEXT: psllq $32, %xmm0 ; SSE-NEXT: paddq %xmm4, %xmm0 -; SSE-NEXT: paddq %xmm3, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; SSE-NEXT: retq ; @@ -1570,39 +1568,36 @@ define <4 x i32> @trunc_mul_v4i64_4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind { ; AVX1: # BB#0: ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm4 +; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm4 +; AVX1-NEXT: vpmuludq %xmm2, %xmm4, %xmm4 ; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm5 ; AVX1-NEXT: vpmuludq %xmm5, %xmm3, %xmm5 -; AVX1-NEXT: vpsllq $32, %xmm5, %xmm5 -; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm3 +; AVX1-NEXT: vpaddq %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4 ; AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsllq 
$32, %xmm2, %xmm2 -; AVX1-NEXT: vpaddq %xmm2, %xmm5, %xmm2 -; AVX1-NEXT: vpaddq %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm3 +; AVX1-NEXT: vpaddq %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm3 +; AVX1-NEXT: vpmuludq %xmm1, %xmm3, %xmm3 ; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4 ; AVX1-NEXT: vpmuludq %xmm4, %xmm0, %xmm4 -; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4 -; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpsllq $32, %xmm3, %xmm3 ; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0 -; AVX1-NEXT: vpaddq %xmm0, %xmm4, %xmm0 -; AVX1-NEXT: vpaddq %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpaddq %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_mul_v4i64_4i32: ; AVX2: # BB#0: -; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm2 +; AVX2-NEXT: vpmuludq %ymm1, %ymm2, %ymm2 ; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm3 ; AVX2-NEXT: vpmuludq %ymm3, %ymm0, %ymm3 -; AVX2-NEXT: vpsllq $32, %ymm3, %ymm3 -; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm0 +; AVX2-NEXT: vpaddq %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2 ; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpsllq $32, %ymm0, %ymm0 -; AVX2-NEXT: vpaddq %ymm0, %ymm3, %ymm0 -; AVX2-NEXT: vpaddq %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0 @@ -1611,15 +1606,14 @@ define <4 x i32> @trunc_mul_v4i64_4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind { ; ; AVX512-LABEL: trunc_mul_v4i64_4i32: ; AVX512: # BB#0: -; AVX512-NEXT: vpmuludq %ymm1, %ymm0, %ymm2 +; AVX512-NEXT: vpsrlq $32, %ymm0, %ymm2 +; AVX512-NEXT: vpmuludq %ymm1, %ymm2, %ymm2 ; AVX512-NEXT: vpsrlq $32, %ymm1, %ymm3 ; AVX512-NEXT: vpmuludq %ymm3, %ymm0, %ymm3 -; AVX512-NEXT: vpsllq $32, %ymm3, %ymm3 -; 
AVX512-NEXT: vpsrlq $32, %ymm0, %ymm0 +; AVX512-NEXT: vpaddq %ymm2, %ymm3, %ymm2 +; AVX512-NEXT: vpsllq $32, %ymm2, %ymm2 ; AVX512-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vpsllq $32, %ymm0, %ymm0 -; AVX512-NEXT: vpaddq %ymm0, %ymm3, %ymm0 -; AVX512-NEXT: vpaddq %ymm0, %ymm2, %ymm0 +; AVX512-NEXT: vpaddq %ymm2, %ymm0, %ymm0 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX512-NEXT: retq @@ -1632,49 +1626,45 @@ define <8 x i16> @trunc_mul_v8i64_8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { ; SSE-LABEL: trunc_mul_v8i64_8i16: ; SSE: # BB#0: ; SSE-NEXT: movdqa %xmm0, %xmm8 +; SSE-NEXT: psrlq $32, %xmm8 ; SSE-NEXT: pmuludq %xmm4, %xmm8 ; SSE-NEXT: movdqa %xmm4, %xmm9 ; SSE-NEXT: psrlq $32, %xmm9 ; SSE-NEXT: pmuludq %xmm0, %xmm9 +; SSE-NEXT: paddq %xmm8, %xmm9 ; SSE-NEXT: psllq $32, %xmm9 -; SSE-NEXT: psrlq $32, %xmm0 ; SSE-NEXT: pmuludq %xmm4, %xmm0 -; SSE-NEXT: psllq $32, %xmm0 ; SSE-NEXT: paddq %xmm9, %xmm0 -; SSE-NEXT: paddq %xmm8, %xmm0 ; SSE-NEXT: movdqa %xmm1, %xmm8 +; SSE-NEXT: psrlq $32, %xmm8 ; SSE-NEXT: pmuludq %xmm5, %xmm8 ; SSE-NEXT: movdqa %xmm5, %xmm4 ; SSE-NEXT: psrlq $32, %xmm4 ; SSE-NEXT: pmuludq %xmm1, %xmm4 +; SSE-NEXT: paddq %xmm8, %xmm4 ; SSE-NEXT: psllq $32, %xmm4 -; SSE-NEXT: psrlq $32, %xmm1 ; SSE-NEXT: pmuludq %xmm5, %xmm1 -; SSE-NEXT: psllq $32, %xmm1 ; SSE-NEXT: paddq %xmm4, %xmm1 -; SSE-NEXT: paddq %xmm8, %xmm1 ; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: psrlq $32, %xmm4 ; SSE-NEXT: pmuludq %xmm6, %xmm4 ; SSE-NEXT: movdqa %xmm6, %xmm5 ; SSE-NEXT: psrlq $32, %xmm5 ; SSE-NEXT: pmuludq %xmm2, %xmm5 +; SSE-NEXT: paddq %xmm4, %xmm5 ; SSE-NEXT: psllq $32, %xmm5 -; SSE-NEXT: psrlq $32, %xmm2 ; SSE-NEXT: pmuludq %xmm6, %xmm2 -; SSE-NEXT: psllq $32, %xmm2 ; SSE-NEXT: paddq %xmm5, %xmm2 -; SSE-NEXT: paddq %xmm4, %xmm2 ; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: psrlq $32, %xmm4 ; SSE-NEXT: pmuludq %xmm7, %xmm4 ; SSE-NEXT: movdqa %xmm7, %xmm5 ; SSE-NEXT: psrlq $32, %xmm5 ; SSE-NEXT: pmuludq %xmm3, %xmm5 +; 
SSE-NEXT: paddq %xmm4, %xmm5 ; SSE-NEXT: psllq $32, %xmm5 -; SSE-NEXT: psrlq $32, %xmm3 ; SSE-NEXT: pmuludq %xmm7, %xmm3 -; SSE-NEXT: psllq $32, %xmm3 ; SSE-NEXT: paddq %xmm5, %xmm3 -; SSE-NEXT: paddq %xmm4, %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] @@ -1691,46 +1681,42 @@ define <8 x i16> @trunc_mul_v8i64_8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { ; ; AVX1-LABEL: trunc_mul_v8i64_8i16: ; AVX1: # BB#0: -; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm4 +; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm4 +; AVX1-NEXT: vpmuludq %xmm2, %xmm4, %xmm4 ; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm5 ; AVX1-NEXT: vpmuludq %xmm5, %xmm0, %xmm5 -; AVX1-NEXT: vpsllq $32, %xmm5, %xmm5 -; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm6 -; AVX1-NEXT: vpmuludq %xmm2, %xmm6, %xmm6 -; AVX1-NEXT: vpsllq $32, %xmm6, %xmm6 -; AVX1-NEXT: vpaddq %xmm6, %xmm5, %xmm5 -; AVX1-NEXT: vpaddq %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpaddq %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4 +; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm5 +; AVX1-NEXT: vpaddq %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm5 +; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm5 +; AVX1-NEXT: vpmuludq %xmm2, %xmm5, %xmm5 ; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm6 ; AVX1-NEXT: vpmuludq %xmm6, %xmm0, %xmm6 -; AVX1-NEXT: vpsllq $32, %xmm6, %xmm6 -; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpsllq $32, %xmm5, %xmm5 ; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0 -; AVX1-NEXT: vpaddq %xmm0, %xmm6, %xmm0 -; AVX1-NEXT: vpaddq %xmm0, %xmm5, %xmm0 -; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm2 +; AVX1-NEXT: vpaddq %xmm5, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm2 +; AVX1-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm5 ; AVX1-NEXT: vpmuludq %xmm5, %xmm1, %xmm5 -; 
AVX1-NEXT: vpsllq $32, %xmm5, %xmm5 -; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm6 -; AVX1-NEXT: vpmuludq %xmm3, %xmm6, %xmm6 -; AVX1-NEXT: vpsllq $32, %xmm6, %xmm6 -; AVX1-NEXT: vpaddq %xmm6, %xmm5, %xmm5 -; AVX1-NEXT: vpaddq %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpaddq %xmm2, %xmm5, %xmm2 +; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2 +; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm5 +; AVX1-NEXT: vpaddq %xmm2, %xmm5, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm5 +; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm5 +; AVX1-NEXT: vpmuludq %xmm3, %xmm5, %xmm5 ; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm6 ; AVX1-NEXT: vpmuludq %xmm6, %xmm1, %xmm6 -; AVX1-NEXT: vpsllq $32, %xmm6, %xmm6 -; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm1 +; AVX1-NEXT: vpaddq %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpsllq $32, %xmm5, %xmm5 ; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1 -; AVX1-NEXT: vpaddq %xmm1, %xmm6, %xmm1 -; AVX1-NEXT: vpaddq %xmm1, %xmm5, %xmm1 +; AVX1-NEXT: vpaddq %xmm5, %xmm1, %xmm1 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7] ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7] @@ -1744,24 +1730,22 @@ define <8 x i16> @trunc_mul_v8i64_8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { ; ; AVX2-LABEL: trunc_mul_v8i64_8i16: ; AVX2: # BB#0: -; AVX2-NEXT: vpmuludq %ymm3, %ymm1, %ymm4 +; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm4 +; AVX2-NEXT: vpmuludq %ymm3, %ymm4, %ymm4 ; AVX2-NEXT: vpsrlq $32, %ymm3, %ymm5 ; AVX2-NEXT: vpmuludq %ymm5, %ymm1, %ymm5 -; AVX2-NEXT: vpsllq $32, %ymm5, %ymm5 -; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm1 +; AVX2-NEXT: vpaddq %ymm4, %ymm5, %ymm4 +; AVX2-NEXT: vpsllq $32, %ymm4, %ymm4 ; AVX2-NEXT: vpmuludq %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpsllq $32, %ymm1, %ymm1 -; AVX2-NEXT: vpaddq %ymm1, %ymm5, %ymm1 -; AVX2-NEXT: vpaddq %ymm1, %ymm4, %ymm1 -; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm3 +; AVX2-NEXT: vpaddq 
%ymm4, %ymm1, %ymm1 +; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm3 +; AVX2-NEXT: vpmuludq %ymm2, %ymm3, %ymm3 ; AVX2-NEXT: vpsrlq $32, %ymm2, %ymm4 ; AVX2-NEXT: vpmuludq %ymm4, %ymm0, %ymm4 -; AVX2-NEXT: vpsllq $32, %ymm4, %ymm4 -; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm0 +; AVX2-NEXT: vpaddq %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vpsllq $32, %ymm3, %ymm3 ; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpsllq $32, %ymm0, %ymm0 -; AVX2-NEXT: vpaddq %ymm0, %ymm4, %ymm0 -; AVX2-NEXT: vpaddq %ymm0, %ymm3, %ymm0 +; AVX2-NEXT: vpaddq %ymm3, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] @@ -1775,15 +1759,14 @@ define <8 x i16> @trunc_mul_v8i64_8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { ; ; AVX512-LABEL: trunc_mul_v8i64_8i16: ; AVX512: # BB#0: -; AVX512-NEXT: vpmuludq %zmm1, %zmm0, %zmm2 +; AVX512-NEXT: vpsrlq $32, %zmm0, %zmm2 +; AVX512-NEXT: vpmuludq %zmm1, %zmm2, %zmm2 ; AVX512-NEXT: vpsrlq $32, %zmm1, %zmm3 ; AVX512-NEXT: vpmuludq %zmm3, %zmm0, %zmm3 -; AVX512-NEXT: vpsllq $32, %zmm3, %zmm3 -; AVX512-NEXT: vpsrlq $32, %zmm0, %zmm0 +; AVX512-NEXT: vpaddq %zmm2, %zmm3, %zmm2 +; AVX512-NEXT: vpsllq $32, %zmm2, %zmm2 ; AVX512-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpsllq $32, %zmm0, %zmm0 -; AVX512-NEXT: vpaddq %zmm0, %zmm3, %zmm0 -; AVX512-NEXT: vpaddq %zmm0, %zmm2, %zmm0 +; AVX512-NEXT: vpaddq %zmm2, %zmm0, %zmm0 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0 ; AVX512-NEXT: retq %1 = mul <8 x i64> %a0, %a1 @@ -1853,100 +1836,92 @@ define <16 x i8> @trunc_mul_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin ; SSE: # BB#0: ; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 ; SSE-NEXT: movdqa %xmm0, %xmm9 +; SSE-NEXT: psrlq $32, %xmm9 ; SSE-NEXT: pmuludq %xmm8, %xmm9 ; SSE-NEXT: movdqa %xmm8, %xmm10 ; SSE-NEXT: psrlq $32, %xmm10 ; SSE-NEXT: pmuludq %xmm0, %xmm10 -; SSE-NEXT: psrlq $32, %xmm0 -; SSE-NEXT: pmuludq %xmm8, %xmm0 -; SSE-NEXT: movdqa 
{{[0-9]+}}(%rsp), %xmm8 +; SSE-NEXT: paddq %xmm9, %xmm10 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9 ; SSE-NEXT: psllq $32, %xmm10 -; SSE-NEXT: psllq $32, %xmm0 +; SSE-NEXT: pmuludq %xmm8, %xmm0 ; SSE-NEXT: paddq %xmm10, %xmm0 -; SSE-NEXT: paddq %xmm9, %xmm0 -; SSE-NEXT: movdqa %xmm1, %xmm9 -; SSE-NEXT: pmuludq %xmm8, %xmm9 -; SSE-NEXT: movdqa %xmm8, %xmm10 +; SSE-NEXT: movdqa %xmm1, %xmm8 +; SSE-NEXT: psrlq $32, %xmm8 +; SSE-NEXT: pmuludq %xmm9, %xmm8 +; SSE-NEXT: movdqa %xmm9, %xmm10 ; SSE-NEXT: psrlq $32, %xmm10 ; SSE-NEXT: pmuludq %xmm1, %xmm10 -; SSE-NEXT: psrlq $32, %xmm1 -; SSE-NEXT: pmuludq %xmm8, %xmm1 +; SSE-NEXT: paddq %xmm8, %xmm10 ; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 ; SSE-NEXT: psllq $32, %xmm10 -; SSE-NEXT: psllq $32, %xmm1 +; SSE-NEXT: pmuludq %xmm9, %xmm1 ; SSE-NEXT: paddq %xmm10, %xmm1 -; SSE-NEXT: paddq %xmm9, %xmm1 ; SSE-NEXT: movdqa %xmm2, %xmm9 +; SSE-NEXT: psrlq $32, %xmm9 ; SSE-NEXT: pmuludq %xmm8, %xmm9 ; SSE-NEXT: movdqa %xmm8, %xmm10 ; SSE-NEXT: psrlq $32, %xmm10 ; SSE-NEXT: pmuludq %xmm2, %xmm10 -; SSE-NEXT: psrlq $32, %xmm2 -; SSE-NEXT: pmuludq %xmm8, %xmm2 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 +; SSE-NEXT: paddq %xmm9, %xmm10 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9 ; SSE-NEXT: psllq $32, %xmm10 -; SSE-NEXT: psllq $32, %xmm2 +; SSE-NEXT: pmuludq %xmm8, %xmm2 ; SSE-NEXT: paddq %xmm10, %xmm2 -; SSE-NEXT: paddq %xmm9, %xmm2 -; SSE-NEXT: movdqa %xmm3, %xmm9 -; SSE-NEXT: pmuludq %xmm8, %xmm9 -; SSE-NEXT: movdqa %xmm8, %xmm10 +; SSE-NEXT: movdqa %xmm3, %xmm8 +; SSE-NEXT: psrlq $32, %xmm8 +; SSE-NEXT: pmuludq %xmm9, %xmm8 +; SSE-NEXT: movdqa %xmm9, %xmm10 ; SSE-NEXT: psrlq $32, %xmm10 ; SSE-NEXT: pmuludq %xmm3, %xmm10 -; SSE-NEXT: psrlq $32, %xmm3 -; SSE-NEXT: pmuludq %xmm8, %xmm3 +; SSE-NEXT: paddq %xmm8, %xmm10 ; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 ; SSE-NEXT: psllq $32, %xmm10 -; SSE-NEXT: psllq $32, %xmm3 +; SSE-NEXT: pmuludq %xmm9, %xmm3 ; SSE-NEXT: paddq %xmm10, %xmm3 -; SSE-NEXT: paddq %xmm9, %xmm3 ; SSE-NEXT: 
movdqa %xmm4, %xmm9 +; SSE-NEXT: psrlq $32, %xmm9 ; SSE-NEXT: pmuludq %xmm8, %xmm9 ; SSE-NEXT: movdqa %xmm8, %xmm10 ; SSE-NEXT: psrlq $32, %xmm10 ; SSE-NEXT: pmuludq %xmm4, %xmm10 -; SSE-NEXT: psrlq $32, %xmm4 -; SSE-NEXT: pmuludq %xmm8, %xmm4 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 +; SSE-NEXT: paddq %xmm9, %xmm10 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9 ; SSE-NEXT: psllq $32, %xmm10 -; SSE-NEXT: psllq $32, %xmm4 +; SSE-NEXT: pmuludq %xmm8, %xmm4 ; SSE-NEXT: paddq %xmm10, %xmm4 -; SSE-NEXT: paddq %xmm9, %xmm4 -; SSE-NEXT: movdqa %xmm5, %xmm9 -; SSE-NEXT: pmuludq %xmm8, %xmm9 -; SSE-NEXT: movdqa %xmm8, %xmm10 +; SSE-NEXT: movdqa %xmm5, %xmm8 +; SSE-NEXT: psrlq $32, %xmm8 +; SSE-NEXT: pmuludq %xmm9, %xmm8 +; SSE-NEXT: movdqa %xmm9, %xmm10 ; SSE-NEXT: psrlq $32, %xmm10 ; SSE-NEXT: pmuludq %xmm5, %xmm10 -; SSE-NEXT: psrlq $32, %xmm5 -; SSE-NEXT: pmuludq %xmm8, %xmm5 +; SSE-NEXT: paddq %xmm8, %xmm10 ; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 ; SSE-NEXT: psllq $32, %xmm10 -; SSE-NEXT: psllq $32, %xmm5 +; SSE-NEXT: pmuludq %xmm9, %xmm5 ; SSE-NEXT: paddq %xmm10, %xmm5 -; SSE-NEXT: paddq %xmm9, %xmm5 ; SSE-NEXT: movdqa %xmm6, %xmm9 +; SSE-NEXT: psrlq $32, %xmm9 ; SSE-NEXT: pmuludq %xmm8, %xmm9 ; SSE-NEXT: movdqa %xmm8, %xmm10 ; SSE-NEXT: psrlq $32, %xmm10 ; SSE-NEXT: pmuludq %xmm6, %xmm10 -; SSE-NEXT: psrlq $32, %xmm6 -; SSE-NEXT: pmuludq %xmm8, %xmm6 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 +; SSE-NEXT: paddq %xmm9, %xmm10 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9 ; SSE-NEXT: psllq $32, %xmm10 -; SSE-NEXT: psllq $32, %xmm6 +; SSE-NEXT: pmuludq %xmm8, %xmm6 ; SSE-NEXT: paddq %xmm10, %xmm6 -; SSE-NEXT: paddq %xmm9, %xmm6 -; SSE-NEXT: movdqa %xmm7, %xmm9 -; SSE-NEXT: pmuludq %xmm8, %xmm9 -; SSE-NEXT: movdqa %xmm8, %xmm10 +; SSE-NEXT: movdqa %xmm7, %xmm8 +; SSE-NEXT: psrlq $32, %xmm8 +; SSE-NEXT: pmuludq %xmm9, %xmm8 +; SSE-NEXT: movdqa %xmm9, %xmm10 ; SSE-NEXT: psrlq $32, %xmm10 ; SSE-NEXT: pmuludq %xmm7, %xmm10 -; SSE-NEXT: psrlq $32, %xmm7 -; SSE-NEXT: pmuludq 
%xmm8, %xmm7 +; SSE-NEXT: paddq %xmm8, %xmm10 +; SSE-NEXT: pmuludq %xmm9, %xmm7 ; SSE-NEXT: psllq $32, %xmm10 -; SSE-NEXT: psllq $32, %xmm7 ; SSE-NEXT: paddq %xmm10, %xmm7 -; SSE-NEXT: paddq %xmm9, %xmm7 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] ; SSE-NEXT: pand %xmm8, %xmm7 ; SSE-NEXT: pand %xmm8, %xmm6 @@ -1967,86 +1942,78 @@ define <16 x i8> @trunc_mul_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin ; ; AVX1-LABEL: trunc_mul_v16i64_v16i8: ; AVX1: # BB#0: -; AVX1-NEXT: vpmuludq %xmm4, %xmm0, %xmm8 +; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm8 +; AVX1-NEXT: vpmuludq %xmm4, %xmm8, %xmm8 ; AVX1-NEXT: vpsrlq $32, %xmm4, %xmm9 ; AVX1-NEXT: vpmuludq %xmm9, %xmm0, %xmm9 -; AVX1-NEXT: vpsllq $32, %xmm9, %xmm9 -; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm10 -; AVX1-NEXT: vpmuludq %xmm4, %xmm10, %xmm10 -; AVX1-NEXT: vpsllq $32, %xmm10, %xmm10 -; AVX1-NEXT: vpaddq %xmm10, %xmm9, %xmm9 -; AVX1-NEXT: vpaddq %xmm9, %xmm8, %xmm8 -; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm10 +; AVX1-NEXT: vpaddq %xmm8, %xmm9, %xmm8 +; AVX1-NEXT: vpsllq $32, %xmm8, %xmm8 +; AVX1-NEXT: vpmuludq %xmm4, %xmm0, %xmm9 +; AVX1-NEXT: vpaddq %xmm8, %xmm9, %xmm8 +; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm9 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpmuludq %xmm10, %xmm0, %xmm9 -; AVX1-NEXT: vpsrlq $32, %xmm10, %xmm4 +; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm4 +; AVX1-NEXT: vpmuludq %xmm9, %xmm4, %xmm10 +; AVX1-NEXT: vpsrlq $32, %xmm9, %xmm4 ; AVX1-NEXT: vpmuludq %xmm4, %xmm0, %xmm4 +; AVX1-NEXT: vpaddq %xmm10, %xmm4, %xmm4 ; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4 -; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0 -; AVX1-NEXT: vpmuludq %xmm10, %xmm0, %xmm0 -; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0 -; AVX1-NEXT: vpaddq %xmm0, %xmm4, %xmm0 -; AVX1-NEXT: vpaddq %xmm0, %xmm9, %xmm9 -; AVX1-NEXT: vpmuludq %xmm5, %xmm1, %xmm10 -; AVX1-NEXT: vpsrlq $32, %xmm5, %xmm0 -; AVX1-NEXT: vpmuludq %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0 +; AVX1-NEXT: vpmuludq %xmm9, %xmm0, %xmm0 +; 
AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm9 ; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4 ; AVX1-NEXT: vpmuludq %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4 +; AVX1-NEXT: vpsrlq $32, %xmm5, %xmm0 +; AVX1-NEXT: vpmuludq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpaddq %xmm0, %xmm10, %xmm10 +; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0 +; AVX1-NEXT: vpmuludq %xmm5, %xmm1, %xmm4 +; AVX1-NEXT: vpaddq %xmm0, %xmm4, %xmm10 ; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm0 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vpmuludq %xmm0, %xmm1, %xmm5 +; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm5 +; AVX1-NEXT: vpmuludq %xmm0, %xmm5, %xmm5 ; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm4 ; AVX1-NEXT: vpmuludq %xmm4, %xmm1, %xmm4 +; AVX1-NEXT: vpaddq %xmm5, %xmm4, %xmm4 ; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4 -; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm1 ; AVX1-NEXT: vpmuludq %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0 -; AVX1-NEXT: vpaddq %xmm0, %xmm4, %xmm0 -; AVX1-NEXT: vpaddq %xmm0, %xmm5, %xmm1 -; AVX1-NEXT: vpmuludq %xmm6, %xmm2, %xmm0 +; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm1 +; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm0 +; AVX1-NEXT: vpmuludq %xmm6, %xmm0, %xmm0 ; AVX1-NEXT: vpsrlq $32, %xmm6, %xmm4 ; AVX1-NEXT: vpmuludq %xmm4, %xmm2, %xmm4 -; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4 -; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm5 -; AVX1-NEXT: vpmuludq %xmm6, %xmm5, %xmm5 -; AVX1-NEXT: vpsllq $32, %xmm5, %xmm5 -; AVX1-NEXT: vpaddq %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm5 +; AVX1-NEXT: vpaddq %xmm0, %xmm4, %xmm0 +; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0 +; AVX1-NEXT: vpmuludq %xmm6, %xmm2, %xmm4 +; AVX1-NEXT: vpaddq %xmm0, %xmm4, %xmm5 ; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm0 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-NEXT: vpmuludq %xmm0, %xmm2, %xmm4 +; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm4 +; AVX1-NEXT: vpmuludq %xmm0, %xmm4, %xmm4 ; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm6 ; AVX1-NEXT: vpmuludq %xmm6, %xmm2, %xmm6 -; AVX1-NEXT: vpsllq $32, %xmm6, %xmm6 
-; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm2 +; AVX1-NEXT: vpaddq %xmm4, %xmm6, %xmm4 +; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4 ; AVX1-NEXT: vpmuludq %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0 -; AVX1-NEXT: vpaddq %xmm0, %xmm6, %xmm0 -; AVX1-NEXT: vpaddq %xmm0, %xmm4, %xmm0 -; AVX1-NEXT: vpmuludq %xmm7, %xmm3, %xmm2 +; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm2 +; AVX1-NEXT: vpmuludq %xmm7, %xmm2, %xmm2 ; AVX1-NEXT: vpsrlq $32, %xmm7, %xmm4 ; AVX1-NEXT: vpmuludq %xmm4, %xmm3, %xmm4 -; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4 -; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm6 -; AVX1-NEXT: vpmuludq %xmm7, %xmm6, %xmm6 -; AVX1-NEXT: vpsllq $32, %xmm6, %xmm6 -; AVX1-NEXT: vpaddq %xmm6, %xmm4, %xmm4 -; AVX1-NEXT: vpaddq %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpaddq %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2 +; AVX1-NEXT: vpmuludq %xmm7, %xmm3, %xmm4 +; AVX1-NEXT: vpaddq %xmm2, %xmm4, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm4 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-NEXT: vpmuludq %xmm4, %xmm3, %xmm6 +; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm6 +; AVX1-NEXT: vpmuludq %xmm4, %xmm6, %xmm6 ; AVX1-NEXT: vpsrlq $32, %xmm4, %xmm7 ; AVX1-NEXT: vpmuludq %xmm7, %xmm3, %xmm7 -; AVX1-NEXT: vpsllq $32, %xmm7, %xmm7 -; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm3 +; AVX1-NEXT: vpaddq %xmm6, %xmm7, %xmm6 +; AVX1-NEXT: vpsllq $32, %xmm6, %xmm6 ; AVX1-NEXT: vpmuludq %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpsllq $32, %xmm3, %xmm3 -; AVX1-NEXT: vpaddq %xmm3, %xmm7, %xmm3 -; AVX1-NEXT: vpaddq %xmm3, %xmm6, %xmm3 +; AVX1-NEXT: vpaddq %xmm6, %xmm3, %xmm3 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] ; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 @@ -2068,42 +2035,38 @@ define <16 x i8> @trunc_mul_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin ; ; AVX2-LABEL: trunc_mul_v16i64_v16i8: ; AVX2: # BB#0: -; AVX2-NEXT: vpmuludq %ymm5, %ymm1, %ymm8 +; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm8 +; AVX2-NEXT: 
vpmuludq %ymm5, %ymm8, %ymm8 ; AVX2-NEXT: vpsrlq $32, %ymm5, %ymm9 ; AVX2-NEXT: vpmuludq %ymm9, %ymm1, %ymm9 -; AVX2-NEXT: vpsllq $32, %ymm9, %ymm9 -; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm1 +; AVX2-NEXT: vpaddq %ymm8, %ymm9, %ymm8 +; AVX2-NEXT: vpsllq $32, %ymm8, %ymm8 ; AVX2-NEXT: vpmuludq %ymm5, %ymm1, %ymm1 -; AVX2-NEXT: vpsllq $32, %ymm1, %ymm1 -; AVX2-NEXT: vpaddq %ymm1, %ymm9, %ymm1 -; AVX2-NEXT: vpaddq %ymm1, %ymm8, %ymm1 -; AVX2-NEXT: vpmuludq %ymm4, %ymm0, %ymm5 +; AVX2-NEXT: vpaddq %ymm8, %ymm1, %ymm1 +; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm5 +; AVX2-NEXT: vpmuludq %ymm4, %ymm5, %ymm5 ; AVX2-NEXT: vpsrlq $32, %ymm4, %ymm8 ; AVX2-NEXT: vpmuludq %ymm8, %ymm0, %ymm8 -; AVX2-NEXT: vpsllq $32, %ymm8, %ymm8 -; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm0 +; AVX2-NEXT: vpaddq %ymm5, %ymm8, %ymm5 +; AVX2-NEXT: vpsllq $32, %ymm5, %ymm5 ; AVX2-NEXT: vpmuludq %ymm4, %ymm0, %ymm0 -; AVX2-NEXT: vpsllq $32, %ymm0, %ymm0 -; AVX2-NEXT: vpaddq %ymm0, %ymm8, %ymm0 -; AVX2-NEXT: vpaddq %ymm0, %ymm5, %ymm0 -; AVX2-NEXT: vpmuludq %ymm7, %ymm3, %ymm4 +; AVX2-NEXT: vpaddq %ymm5, %ymm0, %ymm0 +; AVX2-NEXT: vpsrlq $32, %ymm3, %ymm4 +; AVX2-NEXT: vpmuludq %ymm7, %ymm4, %ymm4 ; AVX2-NEXT: vpsrlq $32, %ymm7, %ymm5 ; AVX2-NEXT: vpmuludq %ymm5, %ymm3, %ymm5 -; AVX2-NEXT: vpsllq $32, %ymm5, %ymm5 -; AVX2-NEXT: vpsrlq $32, %ymm3, %ymm3 +; AVX2-NEXT: vpaddq %ymm4, %ymm5, %ymm4 +; AVX2-NEXT: vpsllq $32, %ymm4, %ymm4 ; AVX2-NEXT: vpmuludq %ymm7, %ymm3, %ymm3 -; AVX2-NEXT: vpsllq $32, %ymm3, %ymm3 -; AVX2-NEXT: vpaddq %ymm3, %ymm5, %ymm3 -; AVX2-NEXT: vpaddq %ymm3, %ymm4, %ymm3 -; AVX2-NEXT: vpmuludq %ymm6, %ymm2, %ymm4 +; AVX2-NEXT: vpaddq %ymm4, %ymm3, %ymm3 +; AVX2-NEXT: vpsrlq $32, %ymm2, %ymm4 +; AVX2-NEXT: vpmuludq %ymm6, %ymm4, %ymm4 ; AVX2-NEXT: vpsrlq $32, %ymm6, %ymm5 ; AVX2-NEXT: vpmuludq %ymm5, %ymm2, %ymm5 -; AVX2-NEXT: vpsllq $32, %ymm5, %ymm5 -; AVX2-NEXT: vpsrlq $32, %ymm2, %ymm2 +; AVX2-NEXT: vpaddq %ymm4, %ymm5, %ymm4 +; AVX2-NEXT: vpsllq $32, %ymm4, %ymm4 ; AVX2-NEXT: vpmuludq %ymm6, 
%ymm2, %ymm2 -; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2 -; AVX2-NEXT: vpaddq %ymm2, %ymm5, %ymm2 -; AVX2-NEXT: vpaddq %ymm2, %ymm4, %ymm2 +; AVX2-NEXT: vpaddq %ymm4, %ymm2, %ymm2 ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] @@ -2128,24 +2091,22 @@ define <16 x i8> @trunc_mul_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin ; ; AVX512-LABEL: trunc_mul_v16i64_v16i8: ; AVX512: # BB#0: -; AVX512-NEXT: vpmuludq %zmm3, %zmm1, %zmm4 +; AVX512-NEXT: vpsrlq $32, %zmm1, %zmm4 +; AVX512-NEXT: vpmuludq %zmm3, %zmm4, %zmm4 ; AVX512-NEXT: vpsrlq $32, %zmm3, %zmm5 ; AVX512-NEXT: vpmuludq %zmm5, %zmm1, %zmm5 -; AVX512-NEXT: vpsllq $32, %zmm5, %zmm5 -; AVX512-NEXT: vpsrlq $32, %zmm1, %zmm1 +; AVX512-NEXT: vpaddq %zmm4, %zmm5, %zmm4 +; AVX512-NEXT: vpsllq $32, %zmm4, %zmm4 ; AVX512-NEXT: vpmuludq %zmm3, %zmm1, %zmm1 -; AVX512-NEXT: vpsllq $32, %zmm1, %zmm1 -; AVX512-NEXT: vpaddq %zmm1, %zmm5, %zmm1 -; AVX512-NEXT: vpaddq %zmm1, %zmm4, %zmm1 -; AVX512-NEXT: vpmuludq %zmm2, %zmm0, %zmm3 +; AVX512-NEXT: vpaddq %zmm4, %zmm1, %zmm1 +; AVX512-NEXT: vpsrlq $32, %zmm0, %zmm3 +; AVX512-NEXT: vpmuludq %zmm2, %zmm3, %zmm3 ; AVX512-NEXT: vpsrlq $32, %zmm2, %zmm4 ; AVX512-NEXT: vpmuludq %zmm4, %zmm0, %zmm4 -; AVX512-NEXT: vpsllq $32, %zmm4, %zmm4 -; AVX512-NEXT: vpsrlq $32, %zmm0, %zmm0 +; AVX512-NEXT: vpaddq %zmm3, %zmm4, %zmm3 +; AVX512-NEXT: vpsllq $32, %zmm3, %zmm3 ; AVX512-NEXT: vpmuludq %zmm2, %zmm0, %zmm0 -; AVX512-NEXT: vpsllq $32, %zmm0, %zmm0 -; AVX512-NEXT: vpaddq %zmm0, %zmm4, %zmm0 -; AVX512-NEXT: vpaddq %zmm0, %zmm3, %zmm0 +; AVX512-NEXT: vpaddq %zmm3, %zmm0, %zmm0 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512-NEXT: vpmovqd %zmm1, %ymm1 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 @@ -5101,42 +5062,40 @@ define <16 x i8> @trunc_or_const_v16i16_v16i8(<16 x i16> %a0) nounwind { define <4 x i32> @mul_add_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind { ; 
SSE-LABEL: mul_add_v4i64_v4i32: ; SSE: # BB#0: -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: psrad $31, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: psrad $31, %xmm3 -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: psrad $31, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: psrad $31, %xmm3 -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: pmuludq %xmm4, %xmm3 -; SSE-NEXT: movdqa %xmm4, %xmm5 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: psrad $31, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: psrad $31, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: psrad $31, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: psrad $31, %xmm4 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: psrlq $32, %xmm4 +; SSE-NEXT: pmuludq %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm2, %xmm5 ; SSE-NEXT: psrlq $32, %xmm5 -; SSE-NEXT: pmuludq %xmm2, %xmm5 +; SSE-NEXT: pmuludq %xmm3, %xmm5 +; SSE-NEXT: paddq %xmm4, %xmm5 ; SSE-NEXT: psllq $32, %xmm5 -; SSE-NEXT: psrlq $32, %xmm2 -; SSE-NEXT: pmuludq %xmm4, %xmm2 -; SSE-NEXT: psllq $32, %xmm2 +; SSE-NEXT: pmuludq %xmm3, %xmm2 ; SSE-NEXT: paddq %xmm5, %xmm2 -; SSE-NEXT: paddq %xmm3, %xmm2 ; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: psrlq $32, %xmm3 ; SSE-NEXT: pmuludq %xmm1, %xmm3 ; 
SSE-NEXT: movdqa %xmm1, %xmm4 ; SSE-NEXT: psrlq $32, %xmm4 ; SSE-NEXT: pmuludq %xmm0, %xmm4 +; SSE-NEXT: paddq %xmm3, %xmm4 ; SSE-NEXT: psllq $32, %xmm4 -; SSE-NEXT: psrlq $32, %xmm0 ; SSE-NEXT: pmuludq %xmm1, %xmm0 -; SSE-NEXT: psllq $32, %xmm0 ; SSE-NEXT: paddq %xmm4, %xmm0 -; SSE-NEXT: paddq %xmm3, %xmm0 ; SSE-NEXT: paddq {{.*}}(%rip), %xmm0 ; SSE-NEXT: paddq {{.*}}(%rip), %xmm2 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]