[InstCombine] improve demanded vector elements analysis of insertelement

Recurse instead of returning on the first optimization found. Also, return early in the caller instead of continuing, because that allows another round of simplification before we potentially lose undef information from a shuffle mask by eliminating the shuffle. As noted in the review, we could probably do better and be more efficient by moving all of the demanded-elements analysis into a separate pass, but this is yet another quick fix to instcombine. Differential Revision: https://reviews.llvm.org/D37236 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@312248 91177308-0d34-0410-b5e6-96231b3b80d8
2025-02-12 13:48:45 +00:00 · 2017-08-31 15:57:17 +00:00 · 2017-08-31 15:57:17 +00:00 · da536d4e17
commit da536d4e17
parent aa74e1b97a
4 changed files with 19 additions and 26 deletions
--- a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@ -993,22 +993,23 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
      break;
    }

+    // The element inserted overwrites whatever was there, so the input demanded
+    // set is simpler than the output set.
+    unsigned IdxNo = Idx->getZExtValue();
+    APInt PreInsertDemandedElts = DemandedElts;
+    if (IdxNo < VWidth)
+      PreInsertDemandedElts.clearBit(IdxNo);
+    TmpV = SimplifyDemandedVectorElts(I->getOperand(0), PreInsertDemandedElts,
+                                      UndefElts, Depth + 1);
+    if (TmpV) { I->setOperand(0, TmpV); MadeChange = true; }
+
    // If this is inserting an element that isn't demanded, remove this
    // insertelement.
-    unsigned IdxNo = Idx->getZExtValue();
    if (IdxNo >= VWidth || !DemandedElts[IdxNo]) {
      Worklist.Add(I);
      return I->getOperand(0);
    }

-    // Otherwise, the element inserted overwrites whatever was there, so the
-    // input demanded set is simpler than the output set.
-    APInt DemandedElts2 = DemandedElts;
-    DemandedElts2.clearBit(IdxNo);
-    TmpV = SimplifyDemandedVectorElts(I->getOperand(0), DemandedElts2,
-                                      UndefElts, Depth + 1);
-    if (TmpV) { I->setOperand(0, TmpV); MadeChange = true; }
-
    // The inserted element is defined.
    UndefElts.clearBit(IdxNo);
    break;
--- a/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
+++ b/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
@ -1165,9 +1165,7 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
  if (Value *V = SimplifyDemandedVectorElts(&SVI, AllOnesEltMask, UndefElts)) {
    if (V != &SVI)
      return replaceInstUsesWith(SVI, V);
-    LHS = SVI.getOperand(0);
-    RHS = SVI.getOperand(1);
-    MadeChange = true;
+    return &SVI;
  }

  unsigned LHSWidth = LHS->getType()->getVectorNumElements();
--- a/test/Transforms/InstCombine/X86/x86-pshufb.ll
+++ b/test/Transforms/InstCombine/X86/x86-pshufb.ll
@ -485,9 +485,8 @@ define <16 x i8> @demanded_elts_insertion(<16 x i8> %InVec, <16 x i8> %BaseMask,

 define <32 x i8> @demanded_elts_insertion_avx2(<32 x i8> %InVec, <32 x i8> %BaseMask, i8 %M0, i8 %M22) {
 ; CHECK-LABEL: @demanded_elts_insertion_avx2(
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <32 x i8> %BaseMask, i8 %M0, i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %InVec, <32 x i8> [[TMP1]])
-; CHECK-NEXT:    ret <32 x i8> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %InVec, <32 x i8> %BaseMask)
+; CHECK-NEXT:    ret <32 x i8> [[TMP1]]
 ;
  %1 = insertelement <32 x i8> %BaseMask, i8 %M0, i32 0
  %2 = insertelement <32 x i8> %1, i8 %M22, i32 22
--- a/test/Transforms/InstCombine/vec_demanded_elts.ll
+++ b/test/Transforms/InstCombine/vec_demanded_elts.ll
@ -142,13 +142,11 @@ define <2 x i64> @PR24922(<2 x i64> %v) {
  ret <2 x i64> %result
 }

-; FIXME: The shuffle only demands the 0th (undef) element of 'out123', so everything should fold away.
+; The shuffle only demands the 0th (undef) element of 'out123', so everything should fold away.

 define <4 x float> @inselt_shuf_no_demand(float %a1, float %a2, float %a3) {
 ; CHECK-LABEL: @inselt_shuf_no_demand(
-; CHECK-NEXT:    [[OUT1:%.*]] = insertelement <4 x float> undef, float %a1, i32 1
-; CHECK-NEXT:    [[OUT12:%.*]] = insertelement <4 x float> [[OUT1]], float %a2, i32 2
-; CHECK-NEXT:    ret <4 x float> [[OUT12]]
+; CHECK-NEXT:    ret <4 x float> undef
 ;
  %out1 = insertelement <4 x float> undef, float %a1, i32 1
  %out12 = insertelement <4 x float> %out1, float %a2, i32 2
@ -157,13 +155,11 @@ define <4 x float> @inselt_shuf_no_demand(float %a1, float %a2, float %a3) {
  ret <4 x float> %shuffle
 }

-; FIXME: The shuffle only demands the 0th (undef) element of 'out123', so everything should fold away.
+; The shuffle only demands the 0th (undef) element of 'out123', so everything should fold away.

 define <4 x float> @inselt_shuf_no_demand_commute(float %a1, float %a2, float %a3) {
 ; CHECK-LABEL: @inselt_shuf_no_demand_commute(
-; CHECK-NEXT:    [[OUT1:%.*]] = insertelement <4 x float> undef, float %a1, i32 1
-; CHECK-NEXT:    [[OUT12:%.*]] = insertelement <4 x float> [[OUT1]], float %a2, i32 2
-; CHECK-NEXT:    ret <4 x float> [[OUT12]]
+; CHECK-NEXT:    ret <4 x float> undef
 ;
  %out1 = insertelement <4 x float> undef, float %a1, i32 1
  %out12 = insertelement <4 x float> %out1, float %a2, i32 2
@ -172,15 +168,14 @@ define <4 x float> @inselt_shuf_no_demand_commute(float %a1, float %a2, float %a
  ret <4 x float> %shuffle
 }

-; FIXME: The add uses 'out012' giving it multiple uses after the shuffle is transformed to also
+; The add uses 'out012' giving it multiple uses after the shuffle is transformed to also
 ; use 'out012'. The analysis should be able to see past that.

 define <4 x i32> @inselt_shuf_no_demand_multiuse(i32 %a0, i32 %a1, <4 x i32> %b) {
 ; CHECK-LABEL: @inselt_shuf_no_demand_multiuse(
 ; CHECK-NEXT:    [[OUT0:%.*]] = insertelement <4 x i32> undef, i32 %a0, i32 0
 ; CHECK-NEXT:    [[OUT01:%.*]] = insertelement <4 x i32> [[OUT0]], i32 %a1, i32 1
-; CHECK-NEXT:    [[OUT012:%.*]] = insertelement <4 x i32> [[OUT01]], i32 %a0, i32 2
-; CHECK-NEXT:    [[FOO:%.*]] = add <4 x i32> [[OUT012]], %b
+; CHECK-NEXT:    [[FOO:%.*]] = add <4 x i32> [[OUT01]], %b
 ; CHECK-NEXT:    ret <4 x i32> [[FOO]]
 ;
  %out0 = insertelement <4 x i32> undef, i32 %a0, i32 0