[SLP] Fix for PR30626: Compiler crash inside SLP Vectorizer.

After a successful horizontal reduction vectorization attempt for a PHI node,
the vectorizer tries to update the root binary op by combining the vectorized
tree and the ReductionPHI node. But during vectorization this ReductionPHI
can itself be vectorized and replaced by the `undef` value, while the
instruction itself is marked for deletion. This 'marked for deletion'
PHI node can then be used in the new binary operation, causing a "Use still
stuck around after Def is destroyed" crash upon PHI node deletion.

Also the test is fixed to make it perform actual testing.

Differential Revision: https://reviews.llvm.org/D25671

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@285286 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Alexey Bataev 2016-10-27 12:02:28 +00:00
parent 5e7dda5a1f
commit d6d83fe649
2 changed files with 164 additions and 62 deletions

View File

@ -4062,7 +4062,14 @@ class HorizontalReduction {
SmallVector<Value *, 32> ReducedVals;
BinaryOperator *ReductionRoot;
PHINode *ReductionPHI;
// After a successful horizontal reduction vectorization attempt for a PHI
// node, the vectorizer tries to update the root binary op by combining the
// vectorized tree and the ReductionPHI node. But during vectorization this
// ReductionPHI can itself be vectorized and replaced by the undef value, while
// the instruction itself is marked for deletion. This 'marked for deletion'
// PHI node can then be used in the new binary operation, causing a "Use still
// stuck around after Def is destroyed" crash upon PHI node deletion.
WeakVH ReductionPHI;
/// The opcode of the reduction.
unsigned ReductionOpcode;
@ -4081,8 +4088,8 @@ public:
unsigned MinVecRegSize;
HorizontalReduction(unsigned MinVecRegSize)
: ReductionRoot(nullptr), ReductionPHI(nullptr), ReductionOpcode(0),
ReducedValueOpcode(0), IsPairwiseReduction(false), ReduxWidth(0),
: ReductionRoot(nullptr), ReductionOpcode(0), ReducedValueOpcode(0),
IsPairwiseReduction(false), ReduxWidth(0),
MinVecRegSize(MinVecRegSize) {}
/// \brief Try to find a reduction tree.
@ -4247,7 +4254,7 @@ public:
// Update users.
if (ReductionPHI) {
if (ReductionPHI && !isa<UndefValue>(ReductionPHI)) {
assert(ReductionRoot && "Need a reduction operation");
ReductionRoot->setOperand(0, VectorizedTree);
ReductionRoot->setOperand(1, ReductionPHI);

View File

@ -1,6 +1,5 @@
; RUN: opt -slp-vectorizer -S < %s -mtriple=x86_64-apple-macosx -mcpu=corei7-avx | FileCheck %s --check-prefix=NOSTORE
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -slp-vectorizer -S < %s -mtriple=x86_64-apple-macosx -mcpu=corei7-avx | FileCheck %s
; #include <stdint.h>
@ -15,9 +14,9 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
; return sum;
; }
; NOSTORE-LABEL: add_red
; NOSTORE: fmul <4 x float>
; NOSTORE: shufflevector <4 x float>
; CHECK-LABEL: add_red
; CHECK: fmul <4 x float>
; CHECK: shufflevector <4 x float>
define i32 @add_red(float* %A, i32 %n) {
@ -148,8 +147,8 @@ for.end:
; }
; CHECK-LABEL: long_red
; CHECK: fmul fast <4 x float>
; CHECK: shufflevector <4 x float>
; CHECK: fmul fast <8 x float>
; CHECK: shufflevector <8 x float>
define i32 @long_red(float* noalias %A, float* noalias %B, i32 %n) {
@ -305,6 +304,149 @@ for.end:
ret i32 %sum.0.lcssa
; void foo(const float *arg_A, unsigned arg_B, float *array) {
; for (uint32_t i = 0; i < 6; ++i) {
; const float *ptr = arg_A + i;
; float w0 = array[i * 4 + 0];
; float w1 = array[i * 4 + 1];
; float w2 = array[i * 4 + 2];
; float w3 = array[i * 4 + 3];
; for (unsigned j = 0; j < arg_B; ++j) {
; const float x1 = *ptr - (-1.1f * w0) - (1.2f * w1);
; const float x2 = (2.1f * x1) + (-2.2f * w0) + (2.3f * w1);
; const float x3 = x2 - (-3.1f * w2) - (3.2f * w3);
; const float x4 = x3 + (-4.0f * w2) + w3;
; w1 = w0;
; w0 = x1;
; w3 = w2;
; w2 = x3;
; }
; array[i * 4 + 0] = w0;
; array[i * 4 + 1] = w1;
; array[i * 4 + 2] = w2;
; array[i * 4 + 3] = w3;
; }
; }
define void @foo(float* nocapture readonly %arg_A, i32 %arg_B, float* nocapture %array) {
; CHECK-LABEL: @foo(
; CHECK: fmul fast <4 x float>
; CHECK: shufflevector <4 x float>
%cmp1495 = icmp eq i32 %arg_B, 0
br label %for.body
for.cond.cleanup: ; preds = %for.cond.cleanup15
ret void
for.body: ; preds = %for.cond.cleanup15, %entry
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.cond.cleanup15 ]
%0 = shl i64 %indvars.iv, 2
%arrayidx = getelementptr inbounds float, float* %array, i64 %0
%1 = load float, float* %arrayidx, align 4
%2 = or i64 %0, 1
%arrayidx4 = getelementptr inbounds float, float* %array, i64 %2
%3 = load float, float* %arrayidx4, align 4
%4 = or i64 %0, 2
%arrayidx8 = getelementptr inbounds float, float* %array, i64 %4
%5 = load float, float* %arrayidx8, align 4
%6 = or i64 %0, 3
%arrayidx12 = getelementptr inbounds float, float* %array, i64 %6
%7 = load float, float* %arrayidx12, align 4
br i1 %cmp1495, label %for.cond.cleanup15, label %for.body16.lr.ph
for.body16.lr.ph: ; preds = %for.body
%add.ptr = getelementptr inbounds float, float* %arg_A, i64 %indvars.iv
%8 = load float, float* %add.ptr, align 4
br label %for.body16
for.cond.cleanup15: ; preds = %for.body16, %for.body
%w2.0.lcssa = phi float [ %5, %for.body ], [ %sub28, %for.body16 ]
%w3.0.lcssa = phi float [ %7, %for.body ], [ %w2.096, %for.body16 ]
%w1.0.lcssa = phi float [ %3, %for.body ], [ %w0.0100, %for.body16 ]
%w0.0.lcssa = phi float [ %1, %for.body ], [ %sub19, %for.body16 ]
store float %w0.0.lcssa, float* %arrayidx, align 4
store float %w1.0.lcssa, float* %arrayidx4, align 4
store float %w2.0.lcssa, float* %arrayidx8, align 4
store float %w3.0.lcssa, float* %arrayidx12, align 4
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond109 = icmp eq i64 %indvars.iv.next, 6
br i1 %exitcond109, label %for.cond.cleanup, label %for.body
for.body16: ; preds = %for.body16, %for.body16.lr.ph
%w0.0100 = phi float [ %1, %for.body16.lr.ph ], [ %sub19, %for.body16 ]
%w1.099 = phi float [ %3, %for.body16.lr.ph ], [ %w0.0100, %for.body16 ]
%j.098 = phi i32 [ 0, %for.body16.lr.ph ], [ %inc, %for.body16 ]
%w3.097 = phi float [ %7, %for.body16.lr.ph ], [ %w2.096, %for.body16 ]
%w2.096 = phi float [ %5, %for.body16.lr.ph ], [ %sub28, %for.body16 ]
%mul17 = fmul fast float %w0.0100, 0x3FF19999A0000000
%mul18.neg = fmul fast float %w1.099, 0xBFF3333340000000
%sub92 = fadd fast float %mul17, %mul18.neg
%sub19 = fadd fast float %sub92, %8
%mul20 = fmul fast float %sub19, 0x4000CCCCC0000000
%mul21.neg = fmul fast float %w0.0100, 0xC0019999A0000000
%mul23 = fmul fast float %w1.099, 0x4002666660000000
%mul25 = fmul fast float %w2.096, 0x4008CCCCC0000000
%mul27.neg = fmul fast float %w3.097, 0xC0099999A0000000
%add2293 = fadd fast float %mul27.neg, %mul25
%add24 = fadd fast float %add2293, %mul23
%sub2694 = fadd fast float %add24, %mul21.neg
%sub28 = fadd fast float %sub2694, %mul20
%inc = add nuw i32 %j.098, 1
%exitcond = icmp eq i32 %inc, %arg_B
br i1 %exitcond, label %for.cond.cleanup15, label %for.body16
; RUN: opt -slp-vectorizer -slp-vectorize-hor -slp-vectorize-hor-store -S < %s -mtriple=x86_64-apple-macosx -mcpu=corei7-avx | FileCheck %s --check-prefix=STORE
; void foo(double * restrict A, double * restrict B, double * restrict C,
; int n) {
; for (intptr_t i=0; i < n; ++i) {
; C[i] = B[0] *A[i*4 ] + B[1] *A[i*4+1];
; }
; }
; STORE-LABEL: store_red_double
; STORE: fmul fast <2 x double>
; STORE: extractelement <2 x double>
; STORE: extractelement <2 x double>
define void @store_red_double(double* noalias %A, double* noalias %B, double* noalias %C, i32 %n) {
%cmp17 = icmp sgt i32 %n, 0
br i1 %cmp17, label %for.body.lr.ph, label %for.end
%0 = load double, double* %B, align 8
%arrayidx4 = getelementptr inbounds double, double* %B, i64 1
%1 = load double, double* %arrayidx4, align 8
%2 = sext i32 %n to i64
br label %for.body
%i.018 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
%mul = shl nsw i64 %i.018, 2
%arrayidx2 = getelementptr inbounds double, double* %A, i64 %mul
%3 = load double, double* %arrayidx2, align 8
%mul3 = fmul fast double %0, %3
%add16 = or i64 %mul, 1
%arrayidx6 = getelementptr inbounds double, double* %A, i64 %add16
%4 = load double, double* %arrayidx6, align 8
%mul7 = fmul fast double %1, %4
%add8 = fadd fast double %mul3, %mul7
%arrayidx9 = getelementptr inbounds double, double* %C, i64 %i.018
store double %add8, double* %arrayidx9, align 8
%inc = add nsw i64 %i.018, 1
%exitcond = icmp eq i64 %inc, %2
br i1 %exitcond, label %for.end, label %for.body
ret void
; int foo(float * restrict A, float * restrict B, float * restrict C, int n) {
; float sum = 0;
; for (intptr_t i=0; i < n; ++i) {
@ -316,9 +458,9 @@ for.end:
; return sum;
; }
; CHECK-LABEL: store_red
; CHECK: fmul fast <4 x float>
; CHECK: shufflevector <4 x float>
; STORE-LABEL: store_red
; STORE: fmul fast <4 x float>
; STORE: shufflevector <4 x float>
define i32 @store_red(float* noalias %A, float* noalias %B, float* noalias %C, i32 %n) {
@ -368,50 +510,3 @@ for.end:
ret i32 0
; RUN: opt -slp-vectorizer -slp-vectorize-hor -slp-vectorize-hor-store -S < %s -mtriple=x86_64-apple-macosx -mcpu=corei7-avx | FileCheck %s --check-prefix=STORE
; void foo(double * restrict A, double * restrict B, double * restrict C,
; int n) {
; for (intptr_t i=0; i < n; ++i) {
; C[i] = B[0] *A[i*4 ] + B[1] *A[i*4+1];
; }
; }
; STORE-LABEL: store_red_double
; STORE: fmul fast <2 x double>
; STORE: extractelement <2 x double>
; STORE: extractelement <2 x double>
define void @store_red_double(double* noalias %A, double* noalias %B, double* noalias %C, i32 %n) {
%cmp17 = icmp sgt i32 %n, 0
br i1 %cmp17, label %for.body.lr.ph, label %for.end
%0 = load double, double* %B, align 8
%arrayidx4 = getelementptr inbounds double, double* %B, i64 1
%1 = load double, double* %arrayidx4, align 8
%2 = sext i32 %n to i64
br label %for.body
%i.018 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
%mul = shl nsw i64 %i.018, 2
%arrayidx2 = getelementptr inbounds double, double* %A, i64 %mul
%3 = load double, double* %arrayidx2, align 8
%mul3 = fmul fast double %0, %3
%add16 = or i64 %mul, 1
%arrayidx6 = getelementptr inbounds double, double* %A, i64 %add16
%4 = load double, double* %arrayidx6, align 8
%mul7 = fmul fast double %1, %4
%add8 = fadd fast double %mul3, %mul7
%arrayidx9 = getelementptr inbounds double, double* %C, i64 %i.018
store double %add8, double* %arrayidx9, align 8
%inc = add nsw i64 %i.018, 1
%exitcond = icmp eq i64 %inc, %2
br i1 %exitcond, label %for.end, label %for.body
ret void