[SLP] Fix for PR30626: Compiler crash inside SLP Vectorizer.

After a successful horizontal reduction vectorization attempt for a PHI node, the vectorizer tries to update the root binary op by combining the vectorized tree and the ReductionPHI node. But during vectorization this ReductionPHI can be vectorized itself and replaced by the `undef` value, while the instruction itself is marked for deletion. This 'marked for deletion' PHI node can then be used in a new binary operation, causing a "Use still stuck around after Def is destroyed" crash upon PHI node deletion. Also the test is fixed to make it perform actual testing. Differential Revision: https://reviews.llvm.org/D25671 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@285286 91177308-0d34-0410-b5e6-96231b3b80d8
2025-03-05 03:19:11 +00:00 · 2016-10-27 12:02:28 +00:00 · 2016-10-27 12:02:28 +00:00 · d6d83fe649
commit d6d83fe649
parent 5e7dda5a1f
2 changed files with 164 additions and 62 deletions
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@ -4062,7 +4062,14 @@ class HorizontalReduction {
  SmallVector<Value *, 32> ReducedVals;

  BinaryOperator *ReductionRoot;
-  PHINode *ReductionPHI;
+  // After successful horizontal reduction vectorization attempt for PHI node
+  // vectorizer tries to update root binary op by combining vectorized tree and
+  // the ReductionPHI node. But during vectorization this ReductionPHI can be
+  // vectorized itself and replaced by the undef value, while the instruction
+  // itself is marked for deletion. This 'marked for deletion' PHI node then can
+  // be used in new binary operation, causing "Use still stuck around after Def
+  // is destroyed" crash upon PHI node deletion.
+  WeakVH ReductionPHI;

  /// The opcode of the reduction.
  unsigned ReductionOpcode;
@ -4081,8 +4088,8 @@ public:
  unsigned MinVecRegSize;

  HorizontalReduction(unsigned MinVecRegSize)
-      : ReductionRoot(nullptr), ReductionPHI(nullptr), ReductionOpcode(0),
-        ReducedValueOpcode(0), IsPairwiseReduction(false), ReduxWidth(0),
+      : ReductionRoot(nullptr), ReductionOpcode(0), ReducedValueOpcode(0),
+        IsPairwiseReduction(false), ReduxWidth(0),
        MinVecRegSize(MinVecRegSize) {}

  /// \brief Try to find a reduction tree.
@ -4247,7 +4254,7 @@ public:
                                     ReducedVals[i]);
      }
      // Update users.
-      if (ReductionPHI) {
+      if (ReductionPHI && !isa<UndefValue>(ReductionPHI)) {
        assert(ReductionRoot && "Need a reduction operation");
        ReductionRoot->setOperand(0, VectorizedTree);
        ReductionRoot->setOperand(1, ReductionPHI);
--- a/test/Transforms/SLPVectorizer/X86/horizontal.ll
+++ b/test/Transforms/SLPVectorizer/X86/horizontal.ll
@ -1,6 +1,5 @@
-; RUN: opt -slp-vectorizer -S <  %s -mtriple=x86_64-apple-macosx -mcpu=corei7-avx | FileCheck %s --check-prefix=NOSTORE
-
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -slp-vectorizer -S < %s -mtriple=x86_64-apple-macosx -mcpu=corei7-avx | FileCheck %s

 ; #include <stdint.h>
 ;
@ -15,9 +14,9 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 ;   return sum;
 ; }

-; NOSTORE-LABEL: add_red
-; NOSTORE: fmul <4 x float>
-; NOSTORE: shufflevector <4 x float>
+; CHECK-LABEL: add_red
+; CHECK: fmul <4 x float>
+; CHECK: shufflevector <4 x float>

 define i32 @add_red(float* %A, i32 %n) {
 entry:
@ -148,8 +147,8 @@ for.end:
 ; }

 ; CHECK-LABEL: long_red
-; CHECK: fmul fast <4 x float>
-; CHECK: shufflevector <4 x float>
+; CHECK: fmul fast <8 x float>
+; CHECK: shufflevector <8 x float>

 define i32 @long_red(float* noalias %A, float* noalias %B, i32 %n) {
 entry:
@ -305,6 +304,149 @@ for.end:
  ret i32 %sum.0.lcssa
 }

+; void foo(const float *arg_A, unsigned arg_B, float *array) {
+;   for (uint32_t i = 0; i < 6; ++i) {
+;     const float *ptr = arg_A + i;
+;     float w0 = array[i * 4 + 0];
+;     float w1 = array[i * 4 + 1];
+;     float w2 = array[i * 4 + 2];
+;     float w3 = array[i * 4 + 3];
+;
+;     for (unsigned j = 0; j < arg_B; ++j) {
+;       const float x1 = *ptr - (-1.1f * w0) - (1.2f * w1);
+;       const float x2 = (2.1f * x1) + (-2.2f * w0) + (2.3f * w1);
+;       const float x3 = x2 - (-3.1f * w2) - (3.2f * w3);
+;       const float x4 = x3 + (-4.0f * w2) + w3;
+;       w1 = w0;
+;       w0 = x1;
+;       w3 = w2;
+;       w2 = x3;
+;     }
+;
+;     array[i * 4 + 0] = w0;
+;     array[i * 4 + 1] = w1;
+;     array[i * 4 + 2] = w2;
+;     array[i * 4 + 3] = w3;
+;   }
+; }
+
+define void @foo(float* nocapture readonly %arg_A, i32 %arg_B, float* nocapture %array) {
+; CHECK-LABEL: @foo(
+; CHECK: fmul fast <4 x float>
+; CHECK: shufflevector <4 x float>
+;
+entry:
+  %cmp1495 = icmp eq i32 %arg_B, 0
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup15
+  ret void
+
+for.body:                                         ; preds = %for.cond.cleanup15, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.cond.cleanup15 ]
+  %0 = shl i64 %indvars.iv, 2
+  %arrayidx = getelementptr inbounds float, float* %array, i64 %0
+  %1 = load float, float* %arrayidx, align 4
+  %2 = or i64 %0, 1
+  %arrayidx4 = getelementptr inbounds float, float* %array, i64 %2
+  %3 = load float, float* %arrayidx4, align 4
+  %4 = or i64 %0, 2
+  %arrayidx8 = getelementptr inbounds float, float* %array, i64 %4
+  %5 = load float, float* %arrayidx8, align 4
+  %6 = or i64 %0, 3
+  %arrayidx12 = getelementptr inbounds float, float* %array, i64 %6
+  %7 = load float, float* %arrayidx12, align 4
+  br i1 %cmp1495, label %for.cond.cleanup15, label %for.body16.lr.ph
+
+for.body16.lr.ph:                                 ; preds = %for.body
+  %add.ptr = getelementptr inbounds float, float* %arg_A, i64 %indvars.iv
+  %8 = load float, float* %add.ptr, align 4
+  br label %for.body16
+
+for.cond.cleanup15:                               ; preds = %for.body16, %for.body
+  %w2.0.lcssa = phi float [ %5, %for.body ], [ %sub28, %for.body16 ]
+  %w3.0.lcssa = phi float [ %7, %for.body ], [ %w2.096, %for.body16 ]
+  %w1.0.lcssa = phi float [ %3, %for.body ], [ %w0.0100, %for.body16 ]
+  %w0.0.lcssa = phi float [ %1, %for.body ], [ %sub19, %for.body16 ]
+  store float %w0.0.lcssa, float* %arrayidx, align 4
+  store float %w1.0.lcssa, float* %arrayidx4, align 4
+  store float %w2.0.lcssa, float* %arrayidx8, align 4
+  store float %w3.0.lcssa, float* %arrayidx12, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond109 = icmp eq i64 %indvars.iv.next, 6
+  br i1 %exitcond109, label %for.cond.cleanup, label %for.body
+
+for.body16:                                       ; preds = %for.body16, %for.body16.lr.ph
+  %w0.0100 = phi float [ %1, %for.body16.lr.ph ], [ %sub19, %for.body16 ]
+  %w1.099 = phi float [ %3, %for.body16.lr.ph ], [ %w0.0100, %for.body16 ]
+  %j.098 = phi i32 [ 0, %for.body16.lr.ph ], [ %inc, %for.body16 ]
+  %w3.097 = phi float [ %7, %for.body16.lr.ph ], [ %w2.096, %for.body16 ]
+  %w2.096 = phi float [ %5, %for.body16.lr.ph ], [ %sub28, %for.body16 ]
+  %mul17 = fmul fast float %w0.0100, 0x3FF19999A0000000
+  %mul18.neg = fmul fast float %w1.099, 0xBFF3333340000000
+  %sub92 = fadd fast float %mul17, %mul18.neg
+  %sub19 = fadd fast float %sub92, %8
+  %mul20 = fmul fast float %sub19, 0x4000CCCCC0000000
+  %mul21.neg = fmul fast float %w0.0100, 0xC0019999A0000000
+  %mul23 = fmul fast float %w1.099, 0x4002666660000000
+  %mul25 = fmul fast float %w2.096, 0x4008CCCCC0000000
+  %mul27.neg = fmul fast float %w3.097, 0xC0099999A0000000
+  %add2293 = fadd fast float %mul27.neg, %mul25
+  %add24 = fadd fast float %add2293, %mul23
+  %sub2694 = fadd fast float %add24, %mul21.neg
+  %sub28 = fadd fast float %sub2694, %mul20
+  %inc = add nuw i32 %j.098, 1
+  %exitcond = icmp eq i32 %inc, %arg_B
+  br i1 %exitcond, label %for.cond.cleanup15, label %for.body16
+}
+
+; RUN: opt -slp-vectorizer -slp-vectorize-hor -slp-vectorize-hor-store -S < %s -mtriple=x86_64-apple-macosx -mcpu=corei7-avx | FileCheck %s --check-prefix=STORE
+
+; void foo(double * restrict A, double * restrict B, double * restrict C,
+;          int n) {
+;   for (intptr_t i=0; i < n; ++i) {
+;     C[i] = B[0] *A[i*4  ] + B[1] *A[i*4+1];
+;   }
+; }
+
+; STORE-LABEL: store_red_double
+; STORE: fmul fast <2 x double>
+; STORE: extractelement <2 x double>
+; STORE: extractelement <2 x double>
+
+define void @store_red_double(double* noalias %A, double* noalias %B, double* noalias %C, i32 %n) {
+entry:
+  %cmp17 = icmp sgt i32 %n, 0
+  br i1 %cmp17, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:
+  %0 = load double, double* %B, align 8
+  %arrayidx4 = getelementptr inbounds double, double* %B, i64 1
+  %1 = load double, double* %arrayidx4, align 8
+  %2 = sext i32 %n to i64
+  br label %for.body
+
+for.body:
+  %i.018 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+  %mul = shl nsw i64 %i.018, 2
+  %arrayidx2 = getelementptr inbounds double, double* %A, i64 %mul
+  %3 = load double, double* %arrayidx2, align 8
+  %mul3 = fmul fast double %0, %3
+  %add16 = or i64 %mul, 1
+  %arrayidx6 = getelementptr inbounds double, double* %A, i64 %add16
+  %4 = load double, double* %arrayidx6, align 8
+  %mul7 = fmul fast double %1, %4
+  %add8 = fadd fast double %mul3, %mul7
+  %arrayidx9 = getelementptr inbounds double, double* %C, i64 %i.018
+  store double %add8, double* %arrayidx9, align 8
+  %inc = add nsw i64 %i.018, 1
+  %exitcond = icmp eq i64 %inc, %2
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
 ; int foo(float * restrict A, float * restrict B, float * restrict C, int n) {
 ;   float sum = 0;
 ;   for (intptr_t i=0; i < n; ++i) {
@ -316,9 +458,9 @@ for.end:
 ;   return sum;
 ; }

-; CHECK-LABEL: store_red
-; CHECK: fmul fast <4 x float>
-; CHECK: shufflevector <4 x float>
+; STORE-LABEL: store_red
+; STORE: fmul fast <4 x float>
+; STORE: shufflevector <4 x float>

 define i32 @store_red(float* noalias %A, float* noalias %B, float* noalias %C, i32 %n) {
 entry:
@ -368,50 +510,3 @@ for.end:
  ret i32 0
 }

-
-; RUN: opt -slp-vectorizer -slp-vectorize-hor -slp-vectorize-hor-store -S <  %s -mtriple=x86_64-apple-macosx -mcpu=corei7-avx | FileCheck %s --check-prefix=STORE
-
-; void foo(double * restrict A, double * restrict B, double * restrict C,
-;          int n) {
-;   for (intptr_t i=0; i < n; ++i) {
-;     C[i] = B[0] *A[i*4  ] + B[1] *A[i*4+1];
-;   }
-; }
-
-; STORE-LABEL: store_red_double
-; STORE: fmul fast <2 x double>
-; STORE: extractelement <2 x double>
-; STORE: extractelement <2 x double>
-
-define void @store_red_double(double* noalias %A, double* noalias %B, double* noalias %C, i32 %n) {
-entry:
-  %cmp17 = icmp sgt i32 %n, 0
-  br i1 %cmp17, label %for.body.lr.ph, label %for.end
-
-for.body.lr.ph:
-  %0 = load double, double* %B, align 8
-  %arrayidx4 = getelementptr inbounds double, double* %B, i64 1
-  %1 = load double, double* %arrayidx4, align 8
-  %2 = sext i32 %n to i64
-  br label %for.body
-
-for.body:
-  %i.018 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
-  %mul = shl nsw i64 %i.018, 2
-  %arrayidx2 = getelementptr inbounds double, double* %A, i64 %mul
-  %3 = load double, double* %arrayidx2, align 8
-  %mul3 = fmul fast double %0, %3
-  %add16 = or i64 %mul, 1
-  %arrayidx6 = getelementptr inbounds double, double* %A, i64 %add16
-  %4 = load double, double* %arrayidx6, align 8
-  %mul7 = fmul fast double %1, %4
-  %add8 = fadd fast double %mul3, %mul7
-  %arrayidx9 = getelementptr inbounds double, double* %C, i64 %i.018
-  store double %add8, double* %arrayidx9, align 8
-  %inc = add nsw i64 %i.018, 1
-  %exitcond = icmp eq i64 %inc, %2
-  br i1 %exitcond, label %for.end, label %for.body
-
-for.end:
-  ret void
-}