Revert "[SLP] Fix for PR6246: vectorization for scalar ops on vector elements."

This reverts commit r288497, as it broke the AArch64 build of Compiler-RT's
builtins (twice: once in r288412 and once in r288497). We should investigate
this offline.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@288508 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Renato Golin 2016-12-02 16:56:26 +00:00
parent 42285f5cb7
commit 66e793d0de
3 changed files with 836 additions and 436 deletions

View File

@ -3870,11 +3870,10 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
unsigned Opcode0 = I0->getOpcode();
// FIXME: Register size should be a parameter to this function, so we can
// try different vectorization factors.
unsigned Sz = R.getVectorElementSize(I0);
unsigned MinVF = std::max(2U, R.getMinVecRegSize() / Sz);
unsigned MaxVF = std::max<unsigned>(PowerOf2Floor(VL.size()), MinVF);
if (MaxVF < 2)
return false;
unsigned VF = R.getMinVecRegSize() / Sz;
for (Value *V : VL) {
Type *Ty = V->getType();
@ -3890,83 +3889,76 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
// Keep track of values that were deleted by vectorizing in the loop below.
SmallVector<WeakVH, 8> TrackValues(VL.begin(), VL.end());
unsigned NextInst = 0, MaxInst = VL.size();
for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF;
VF /= 2) {
for (unsigned I = NextInst; I < MaxInst; ++I) {
unsigned OpsWidth = 0;
for (unsigned i = 0, e = VL.size(); i < e; ++i) {
unsigned OpsWidth = 0;
if (I + VF > MaxInst)
OpsWidth = MaxInst - I;
else
OpsWidth = VF;
if (i + VF > e)
OpsWidth = e - i;
else
OpsWidth = VF;
if (!isPowerOf2_32(OpsWidth) || OpsWidth < 2)
break;
if (!isPowerOf2_32(OpsWidth) || OpsWidth < 2)
break;
// Check that a previous iteration of this loop did not delete the Value.
if (hasValueBeenRAUWed(VL, TrackValues, I, OpsWidth))
continue;
// Check that a previous iteration of this loop did not delete the Value.
if (hasValueBeenRAUWed(VL, TrackValues, i, OpsWidth))
continue;
DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations "
<< "\n");
ArrayRef<Value *> Ops = VL.slice(I, OpsWidth);
DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations "
<< "\n");
ArrayRef<Value *> Ops = VL.slice(i, OpsWidth);
ArrayRef<Value *> BuildVectorSlice;
if (!BuildVector.empty())
BuildVectorSlice = BuildVector.slice(I, OpsWidth);
ArrayRef<Value *> BuildVectorSlice;
if (!BuildVector.empty())
BuildVectorSlice = BuildVector.slice(i, OpsWidth);
R.buildTree(Ops, BuildVectorSlice);
// TODO: check if we can allow reordering for more cases.
if (AllowReorder && R.shouldReorder()) {
// Conceptually, there is nothing actually preventing us from trying to
// reorder a larger list. In fact, we do exactly this when vectorizing
// reductions. However, at this point, we only expect to get here from
// tryToVectorizePair().
assert(Ops.size() == 2);
assert(BuildVectorSlice.empty());
Value *ReorderedOps[] = {Ops[1], Ops[0]};
R.buildTree(ReorderedOps, None);
}
if (R.isTreeTinyAndNotFullyVectorizable())
continue;
R.buildTree(Ops, BuildVectorSlice);
// TODO: check if we can allow reordering for more cases.
if (AllowReorder && R.shouldReorder()) {
// Conceptually, there is nothing actually preventing us from trying to
// reorder a larger list. In fact, we do exactly this when vectorizing
// reductions. However, at this point, we only expect to get here from
// tryToVectorizePair().
assert(Ops.size() == 2);
assert(BuildVectorSlice.empty());
Value *ReorderedOps[] = { Ops[1], Ops[0] };
R.buildTree(ReorderedOps, None);
}
if (R.isTreeTinyAndNotFullyVectorizable())
continue;
R.computeMinimumValueSizes();
int Cost = R.getTreeCost();
R.computeMinimumValueSizes();
int Cost = R.getTreeCost();
if (Cost < -SLPCostThreshold) {
DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
Value *VectorizedRoot = R.vectorizeTree();
if (Cost < -SLPCostThreshold) {
DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
Value *VectorizedRoot = R.vectorizeTree();
// Reconstruct the build vector by extracting the vectorized root. This
// way we handle the case where some elements of the vector are
// undefined.
// (return (inserelt <4 xi32> (insertelt undef (opd0) 0) (opd1) 2))
if (!BuildVectorSlice.empty()) {
// The insert point is the last build vector instruction. The
// vectorized root will precede it. This guarantees that we get an
// instruction. The vectorized tree could have been constant folded.
Instruction *InsertAfter = cast<Instruction>(BuildVectorSlice.back());
unsigned VecIdx = 0;
for (auto &V : BuildVectorSlice) {
IRBuilder<NoFolder> Builder(InsertAfter->getParent(),
++BasicBlock::iterator(InsertAfter));
Instruction *I = cast<Instruction>(V);
assert(isa<InsertElementInst>(I) || isa<InsertValueInst>(I));
Instruction *Extract =
cast<Instruction>(Builder.CreateExtractElement(
VectorizedRoot, Builder.getInt32(VecIdx++)));
I->setOperand(1, Extract);
I->removeFromParent();
I->insertAfter(Extract);
InsertAfter = I;
}
// Reconstruct the build vector by extracting the vectorized root. This
// way we handle the case where some elements of the vector are undefined.
// (return (inserelt <4 xi32> (insertelt undef (opd0) 0) (opd1) 2))
if (!BuildVectorSlice.empty()) {
// The insert point is the last build vector instruction. The vectorized
// root will precede it. This guarantees that we get an instruction. The
// vectorized tree could have been constant folded.
Instruction *InsertAfter = cast<Instruction>(BuildVectorSlice.back());
unsigned VecIdx = 0;
for (auto &V : BuildVectorSlice) {
IRBuilder<NoFolder> Builder(InsertAfter->getParent(),
++BasicBlock::iterator(InsertAfter));
Instruction *I = cast<Instruction>(V);
assert(isa<InsertElementInst>(I) || isa<InsertValueInst>(I));
Instruction *Extract = cast<Instruction>(Builder.CreateExtractElement(
VectorizedRoot, Builder.getInt32(VecIdx++)));
I->setOperand(1, Extract);
I->removeFromParent();
I->insertAfter(Extract);
InsertAfter = I;
}
// Move to the next bundle.
I += VF - 1;
NextInst = I + 1;
Changed = true;
}
// Move to the next bundle.
i += VF - 1;
Changed = true;
}
}

File diff suppressed because it is too large Load Diff

View File

@ -616,38 +616,42 @@ define <4 x float> @take_credit(<4 x float> %a, <4 x float> %b) {
define <4 x double> @multi_tree(double %w, double %x, double %y, double %z) {
; CHECK-LABEL: @multi_tree(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x double> undef, double %w, i32 0
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x double> [[TMP0]], double %x, i32 1
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double %y, i32 2
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double %z, i32 3
; CHECK-NEXT: [[TMP4:%.*]] = fadd <4 x double> [[TMP3]], <double 0.000000e+00, double 1.000000e+00, double 2.000000e+00, double 3.000000e+00>
; CHECK-NEXT: [[TMP5:%.*]] = fmul <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, [[TMP4]]
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x double> [[TMP5]], i32 0
; CHECK-NEXT: [[I1:%.*]] = insertelement <4 x double> undef, double [[TMP6]], i32 3
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x double> [[TMP5]], i32 1
; CHECK-NEXT: [[I2:%.*]] = insertelement <4 x double> [[I1]], double [[TMP7]], i32 2
; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x double> [[TMP5]], i32 2
; CHECK-NEXT: [[I3:%.*]] = insertelement <4 x double> [[I2]], double [[TMP8]], i32 1
; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x double> [[TMP5]], i32 3
; CHECK-NEXT: [[I4:%.*]] = insertelement <4 x double> [[I3]], double [[TMP9]], i32 0
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> undef, double %w, i32 0
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double %x, i32 1
; CHECK-NEXT: [[TMP2:%.*]] = fadd <2 x double> [[TMP1]], <double 0.000000e+00, double 1.000000e+00>
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> undef, double %y, i32 0
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double %z, i32 1
; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[TMP4]], <double 2.000000e+00, double 3.000000e+00>
; CHECK-NEXT: [[TMP6:%.*]] = fmul <2 x double> <double 1.000000e+00, double 1.000000e+00>, [[TMP2]]
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP6]], i32 0
; CHECK-NEXT: [[I1:%.*]] = insertelement <4 x double> undef, double [[TMP7]], i32 3
; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP6]], i32 1
; CHECK-NEXT: [[I2:%.*]] = insertelement <4 x double> [[I1]], double [[TMP8]], i32 2
; CHECK-NEXT: [[TMP9:%.*]] = fmul <2 x double> <double 1.000000e+00, double 1.000000e+00>, [[TMP5]]
; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x double> [[TMP9]], i32 0
; CHECK-NEXT: [[I3:%.*]] = insertelement <4 x double> [[I2]], double [[TMP10]], i32 1
; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x double> [[TMP9]], i32 1
; CHECK-NEXT: [[I4:%.*]] = insertelement <4 x double> [[I3]], double [[TMP11]], i32 0
; CHECK-NEXT: ret <4 x double> [[I4]]
;
; ZEROTHRESH-LABEL: @multi_tree(
; ZEROTHRESH-NEXT: entry:
; ZEROTHRESH-NEXT: [[TMP0:%.*]] = insertelement <4 x double> undef, double %w, i32 0
; ZEROTHRESH-NEXT: [[TMP1:%.*]] = insertelement <4 x double> [[TMP0]], double %x, i32 1
; ZEROTHRESH-NEXT: [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double %y, i32 2
; ZEROTHRESH-NEXT: [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double %z, i32 3
; ZEROTHRESH-NEXT: [[TMP4:%.*]] = fadd <4 x double> [[TMP3]], <double 0.000000e+00, double 1.000000e+00, double 2.000000e+00, double 3.000000e+00>
; ZEROTHRESH-NEXT: [[TMP5:%.*]] = fmul <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, [[TMP4]]
; ZEROTHRESH-NEXT: [[TMP6:%.*]] = extractelement <4 x double> [[TMP5]], i32 0
; ZEROTHRESH-NEXT: [[I1:%.*]] = insertelement <4 x double> undef, double [[TMP6]], i32 3
; ZEROTHRESH-NEXT: [[TMP7:%.*]] = extractelement <4 x double> [[TMP5]], i32 1
; ZEROTHRESH-NEXT: [[I2:%.*]] = insertelement <4 x double> [[I1]], double [[TMP7]], i32 2
; ZEROTHRESH-NEXT: [[TMP8:%.*]] = extractelement <4 x double> [[TMP5]], i32 2
; ZEROTHRESH-NEXT: [[I3:%.*]] = insertelement <4 x double> [[I2]], double [[TMP8]], i32 1
; ZEROTHRESH-NEXT: [[TMP9:%.*]] = extractelement <4 x double> [[TMP5]], i32 3
; ZEROTHRESH-NEXT: [[I4:%.*]] = insertelement <4 x double> [[I3]], double [[TMP9]], i32 0
; ZEROTHRESH-NEXT: [[TMP0:%.*]] = insertelement <2 x double> undef, double %w, i32 0
; ZEROTHRESH-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double %x, i32 1
; ZEROTHRESH-NEXT: [[TMP2:%.*]] = fadd <2 x double> [[TMP1]], <double 0.000000e+00, double 1.000000e+00>
; ZEROTHRESH-NEXT: [[TMP3:%.*]] = insertelement <2 x double> undef, double %y, i32 0
; ZEROTHRESH-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double %z, i32 1
; ZEROTHRESH-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[TMP4]], <double 2.000000e+00, double 3.000000e+00>
; ZEROTHRESH-NEXT: [[TMP6:%.*]] = fmul <2 x double> <double 1.000000e+00, double 1.000000e+00>, [[TMP2]]
; ZEROTHRESH-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP6]], i32 0
; ZEROTHRESH-NEXT: [[I1:%.*]] = insertelement <4 x double> undef, double [[TMP7]], i32 3
; ZEROTHRESH-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP6]], i32 1
; ZEROTHRESH-NEXT: [[I2:%.*]] = insertelement <4 x double> [[I1]], double [[TMP8]], i32 2
; ZEROTHRESH-NEXT: [[TMP9:%.*]] = fmul <2 x double> <double 1.000000e+00, double 1.000000e+00>, [[TMP5]]
; ZEROTHRESH-NEXT: [[TMP10:%.*]] = extractelement <2 x double> [[TMP9]], i32 0
; ZEROTHRESH-NEXT: [[I3:%.*]] = insertelement <4 x double> [[I2]], double [[TMP10]], i32 1
; ZEROTHRESH-NEXT: [[TMP11:%.*]] = extractelement <2 x double> [[TMP9]], i32 1
; ZEROTHRESH-NEXT: [[I4:%.*]] = insertelement <4 x double> [[I3]], double [[TMP11]], i32 0
; ZEROTHRESH-NEXT: ret <4 x double> [[I4]]
;
entry:
@ -669,44 +673,92 @@ entry:
define <8 x float> @_vadd256(<8 x float> %a, <8 x float> %b) local_unnamed_addr #0 {
; CHECK-LABEL: @_vadd256(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = fadd <8 x float> %a, %b
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x float> [[TMP0]], i32 0
; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x float> undef, float [[TMP1]], i32 0
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x float> [[TMP0]], i32 1
; CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x float> [[VECINIT_I]], float [[TMP2]], i32 1
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <8 x float> [[TMP0]], i32 2
; CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x float> [[VECINIT1_I]], float [[TMP3]], i32 2
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x float> [[TMP0]], i32 3
; CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x float> [[VECINIT2_I]], float [[TMP4]], i32 3
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x float> [[TMP0]], i32 4
; CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x float> [[VECINIT3_I]], float [[TMP5]], i32 4
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x float> [[TMP0]], i32 5
; CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x float> [[VECINIT4_I]], float [[TMP6]], i32 5
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x float> [[TMP0]], i32 6
; CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x float> [[VECINIT5_I]], float [[TMP7]], i32 6
; CHECK-NEXT: [[TMP8:%.*]] = extractelement <8 x float> [[TMP0]], i32 7
; CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x float> [[VECINIT6_I]], float [[TMP8]], i32 7
; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <8 x float> %a, i32 0
; CHECK-NEXT: [[VECEXT1:%.*]] = extractelement <8 x float> %b, i32 0
; CHECK-NEXT: [[VECEXT2:%.*]] = extractelement <8 x float> %a, i32 1
; CHECK-NEXT: [[VECEXT3:%.*]] = extractelement <8 x float> %b, i32 1
; CHECK-NEXT: [[VECEXT5:%.*]] = extractelement <8 x float> %a, i32 2
; CHECK-NEXT: [[VECEXT6:%.*]] = extractelement <8 x float> %b, i32 2
; CHECK-NEXT: [[VECEXT8:%.*]] = extractelement <8 x float> %a, i32 3
; CHECK-NEXT: [[VECEXT9:%.*]] = extractelement <8 x float> %b, i32 3
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x float> undef, float [[VECEXT]], i32 0
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x float> [[TMP0]], float [[VECEXT2]], i32 1
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[VECEXT5]], i32 2
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float [[VECEXT8]], i32 3
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x float> undef, float [[VECEXT1]], i32 0
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x float> [[TMP4]], float [[VECEXT3]], i32 1
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x float> [[TMP5]], float [[VECEXT6]], i32 2
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[TMP6]], float [[VECEXT9]], i32 3
; CHECK-NEXT: [[TMP8:%.*]] = fadd <4 x float> [[TMP3]], [[TMP7]]
; CHECK-NEXT: [[VECEXT11:%.*]] = extractelement <8 x float> %a, i32 4
; CHECK-NEXT: [[VECEXT12:%.*]] = extractelement <8 x float> %b, i32 4
; CHECK-NEXT: [[VECEXT14:%.*]] = extractelement <8 x float> %a, i32 5
; CHECK-NEXT: [[VECEXT15:%.*]] = extractelement <8 x float> %b, i32 5
; CHECK-NEXT: [[VECEXT17:%.*]] = extractelement <8 x float> %a, i32 6
; CHECK-NEXT: [[VECEXT18:%.*]] = extractelement <8 x float> %b, i32 6
; CHECK-NEXT: [[VECEXT20:%.*]] = extractelement <8 x float> %a, i32 7
; CHECK-NEXT: [[VECEXT21:%.*]] = extractelement <8 x float> %b, i32 7
; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x float> undef, float [[VECEXT11]], i32 0
; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x float> [[TMP9]], float [[VECEXT14]], i32 1
; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x float> [[TMP10]], float [[VECEXT17]], i32 2
; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x float> [[TMP11]], float [[VECEXT20]], i32 3
; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x float> undef, float [[VECEXT12]], i32 0
; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x float> [[TMP13]], float [[VECEXT15]], i32 1
; CHECK-NEXT: [[TMP15:%.*]] = insertelement <4 x float> [[TMP14]], float [[VECEXT18]], i32 2
; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x float> [[TMP15]], float [[VECEXT21]], i32 3
; CHECK-NEXT: [[TMP17:%.*]] = fadd <4 x float> [[TMP12]], [[TMP16]]
; CHECK-NEXT: [[TMP18:%.*]] = extractelement <4 x float> [[TMP8]], i32 0
; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x float> undef, float [[TMP18]], i32 0
; CHECK-NEXT: [[TMP19:%.*]] = extractelement <4 x float> [[TMP8]], i32 1
; CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x float> [[VECINIT_I]], float [[TMP19]], i32 1
; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x float> [[TMP8]], i32 2
; CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x float> [[VECINIT1_I]], float [[TMP20]], i32 2
; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x float> [[TMP8]], i32 3
; CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x float> [[VECINIT2_I]], float [[TMP21]], i32 3
; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x float> [[TMP17]], i32 0
; CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x float> [[VECINIT3_I]], float [[TMP22]], i32 4
; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x float> [[TMP17]], i32 1
; CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x float> [[VECINIT4_I]], float [[TMP23]], i32 5
; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x float> [[TMP17]], i32 2
; CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x float> [[VECINIT5_I]], float [[TMP24]], i32 6
; CHECK-NEXT: [[TMP25:%.*]] = extractelement <4 x float> [[TMP17]], i32 3
; CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x float> [[VECINIT6_I]], float [[TMP25]], i32 7
; CHECK-NEXT: ret <8 x float> [[VECINIT7_I]]
;
; ZEROTHRESH-LABEL: @_vadd256(
; ZEROTHRESH-NEXT: entry:
; ZEROTHRESH-NEXT: [[TMP0:%.*]] = fadd <8 x float> %a, %b
; ZEROTHRESH-NEXT: [[TMP1:%.*]] = extractelement <8 x float> [[TMP0]], i32 0
; ZEROTHRESH-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x float> undef, float [[TMP1]], i32 0
; ZEROTHRESH-NEXT: [[TMP2:%.*]] = extractelement <8 x float> [[TMP0]], i32 1
; ZEROTHRESH-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x float> [[VECINIT_I]], float [[TMP2]], i32 1
; ZEROTHRESH-NEXT: [[TMP3:%.*]] = extractelement <8 x float> [[TMP0]], i32 2
; ZEROTHRESH-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x float> [[VECINIT1_I]], float [[TMP3]], i32 2
; ZEROTHRESH-NEXT: [[TMP4:%.*]] = extractelement <8 x float> [[TMP0]], i32 3
; ZEROTHRESH-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x float> [[VECINIT2_I]], float [[TMP4]], i32 3
; ZEROTHRESH-NEXT: [[TMP5:%.*]] = extractelement <8 x float> [[TMP0]], i32 4
; ZEROTHRESH-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x float> [[VECINIT3_I]], float [[TMP5]], i32 4
; ZEROTHRESH-NEXT: [[TMP6:%.*]] = extractelement <8 x float> [[TMP0]], i32 5
; ZEROTHRESH-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x float> [[VECINIT4_I]], float [[TMP6]], i32 5
; ZEROTHRESH-NEXT: [[TMP7:%.*]] = extractelement <8 x float> [[TMP0]], i32 6
; ZEROTHRESH-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x float> [[VECINIT5_I]], float [[TMP7]], i32 6
; ZEROTHRESH-NEXT: [[TMP8:%.*]] = extractelement <8 x float> [[TMP0]], i32 7
; ZEROTHRESH-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x float> [[VECINIT6_I]], float [[TMP8]], i32 7
; ZEROTHRESH-NEXT: [[VECEXT:%.*]] = extractelement <8 x float> %a, i32 0
; ZEROTHRESH-NEXT: [[VECEXT1:%.*]] = extractelement <8 x float> %b, i32 0
; ZEROTHRESH-NEXT: [[ADD:%.*]] = fadd float [[VECEXT]], [[VECEXT1]]
; ZEROTHRESH-NEXT: [[VECEXT2:%.*]] = extractelement <8 x float> %a, i32 1
; ZEROTHRESH-NEXT: [[VECEXT3:%.*]] = extractelement <8 x float> %b, i32 1
; ZEROTHRESH-NEXT: [[ADD4:%.*]] = fadd float [[VECEXT2]], [[VECEXT3]]
; ZEROTHRESH-NEXT: [[VECEXT5:%.*]] = extractelement <8 x float> %a, i32 2
; ZEROTHRESH-NEXT: [[VECEXT6:%.*]] = extractelement <8 x float> %b, i32 2
; ZEROTHRESH-NEXT: [[ADD7:%.*]] = fadd float [[VECEXT5]], [[VECEXT6]]
; ZEROTHRESH-NEXT: [[VECEXT8:%.*]] = extractelement <8 x float> %a, i32 3
; ZEROTHRESH-NEXT: [[VECEXT9:%.*]] = extractelement <8 x float> %b, i32 3
; ZEROTHRESH-NEXT: [[ADD10:%.*]] = fadd float [[VECEXT8]], [[VECEXT9]]
; ZEROTHRESH-NEXT: [[VECEXT11:%.*]] = extractelement <8 x float> %a, i32 4
; ZEROTHRESH-NEXT: [[VECEXT12:%.*]] = extractelement <8 x float> %b, i32 4
; ZEROTHRESH-NEXT: [[ADD13:%.*]] = fadd float [[VECEXT11]], [[VECEXT12]]
; ZEROTHRESH-NEXT: [[VECEXT14:%.*]] = extractelement <8 x float> %a, i32 5
; ZEROTHRESH-NEXT: [[VECEXT15:%.*]] = extractelement <8 x float> %b, i32 5
; ZEROTHRESH-NEXT: [[ADD16:%.*]] = fadd float [[VECEXT14]], [[VECEXT15]]
; ZEROTHRESH-NEXT: [[VECEXT17:%.*]] = extractelement <8 x float> %a, i32 6
; ZEROTHRESH-NEXT: [[VECEXT18:%.*]] = extractelement <8 x float> %b, i32 6
; ZEROTHRESH-NEXT: [[ADD19:%.*]] = fadd float [[VECEXT17]], [[VECEXT18]]
; ZEROTHRESH-NEXT: [[VECEXT20:%.*]] = extractelement <8 x float> %a, i32 7
; ZEROTHRESH-NEXT: [[VECEXT21:%.*]] = extractelement <8 x float> %b, i32 7
; ZEROTHRESH-NEXT: [[ADD22:%.*]] = fadd float [[VECEXT20]], [[VECEXT21]]
; ZEROTHRESH-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x float> undef, float [[ADD]], i32 0
; ZEROTHRESH-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x float> [[VECINIT_I]], float [[ADD4]], i32 1
; ZEROTHRESH-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x float> [[VECINIT1_I]], float [[ADD7]], i32 2
; ZEROTHRESH-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x float> [[VECINIT2_I]], float [[ADD10]], i32 3
; ZEROTHRESH-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x float> [[VECINIT3_I]], float [[ADD13]], i32 4
; ZEROTHRESH-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x float> [[VECINIT4_I]], float [[ADD16]], i32 5
; ZEROTHRESH-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x float> [[VECINIT5_I]], float [[ADD19]], i32 6
; ZEROTHRESH-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x float> [[VECINIT6_I]], float [[ADD22]], i32 7
; ZEROTHRESH-NEXT: ret <8 x float> [[VECINIT7_I]]
;
entry: