[X86] Emit a PACKUS instead of a VECTOR_SHUFFLE from LowerTRUNCATE for v16i16->v16i8.

We can't guarantee that demanded bits passing through the vector shuffle won't cause the AND in front of this to be removed. This would prevent the PACKUS from being matched during shuffle lowering. Unfortunately, this adds a packuswb to one of the vector-reduce-mul.ll tests since we were removing the shuffle via SimplifyDemandedVectorElts. We appear to have similar issues with vpmovwb on the same test case on other targets. llvm-svn: 347361
2024-11-29 22:30:33 +00:00 · 2018-11-20 22:57:48 +00:00 · 2018-11-20 22:57:48 +00:00 · 285da5c731
commit 285da5c731
parent e7be680ca1
2 changed files with 3 additions and 7 deletions
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@ -18100,19 +18100,14 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
  }

  if (VT == MVT::v16i8 && InVT == MVT::v16i16) {
-    // Use an AND to force a PACKUS.
+    // Use an AND to zero uppper bits for PACKUS.
    In = DAG.getNode(ISD::AND, DL, InVT, In, DAG.getConstant(255, DL, InVT));

    SDValue InLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16, In,
                               DAG.getIntPtrConstant(0, DL));
    SDValue InHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16, In,
                               DAG.getIntPtrConstant(8, DL));
-    InLo = DAG.getBitcast(VT, InLo);
-    InHi = DAG.getBitcast(VT, InHi);
-
-    return DAG.getVectorShuffle(VT, DL, InLo, InHi,
-                                { 0,  2,  4,  6,  8, 10, 12, 14,
-                                 16, 18, 20, 22, 24, 26, 28, 30});
+    return DAG.getNode(X86ISD::PACKUS, DL, VT, InLo, InHi);
  }

  // Handle truncation of V256 to V128 using shuffles.
--- a/test/CodeGen/X86/vector-reduce-mul.ll
+++ b/test/CodeGen/X86/vector-reduce-mul.ll
@ -1576,6 +1576,7 @@ define i8 @test_v16i8(<16 x i8> %a0) {
 ; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
 ; AVX2-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
 ; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX2-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX2-NEXT:    vzeroupper