mirror of
https://github.com/RPCSX/llvm.git
synced 2025-01-26 22:45:05 +00:00
[x86] avoid code explosion from LoopVectorizer for gather loop (PR27826)
By making pointer extraction from a vector more expensive in the cost model, we avoid vectorizing a loop that is very likely to be memory-bound: https://llvm.org/bugs/show_bug.cgi?id=27826 There are still related bugs, so we may need a more general solution that avoids vectorizing obviously memory-bound loops when the target has no hardware gather support. Differential Revision: http://reviews.llvm.org/D20601 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@270729 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent
c863690403
commit
cab076f44c
@ -963,6 +963,8 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
|
||||
int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
|
||||
assert(Val->isVectorTy() && "This must be a vector type");
|
||||
|
||||
Type *ScalarType = Val->getScalarType();
|
||||
|
||||
if (Index != -1U) {
|
||||
// Legalize the type.
|
||||
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Val);
|
||||
@ -976,11 +978,17 @@ int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
|
||||
Index = Index % Width;
|
||||
|
||||
// Floating point scalars are already located in index #0.
|
||||
if (Val->getScalarType()->isFloatingPointTy() && Index == 0)
|
||||
if (ScalarType->isFloatingPointTy() && Index == 0)
|
||||
return 0;
|
||||
}
|
||||
|
||||
return BaseT::getVectorInstrCost(Opcode, Val, Index);
|
||||
// Add to the base cost if we know that the extracted element of a vector is
|
||||
// destined to be moved to and used in the integer register file.
|
||||
int RegisterFileMoveCost = 0;
|
||||
if (Opcode == Instruction::ExtractElement && ScalarType->isPointerTy())
|
||||
RegisterFileMoveCost = 1;
|
||||
|
||||
return BaseT::getVectorInstrCost(Opcode, Val, Index) + RegisterFileMoveCost;
|
||||
}
|
||||
|
||||
int X86TTIImpl::getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) {
|
||||
|
@ -39,3 +39,44 @@ for.body: ; preds = %for.body, %entry
|
||||
for.end: ; preds = %for.body
|
||||
ret void
|
||||
}
|
||||
|
||||
; This function uses a stride that is generally too big to benefit from vectorization without
; really good support for a gather load. We were not computing an accurate cost for the
; vectorization and subsequent scalarization of the pointer induction variables.

define float @PR27826(float* nocapture readonly %a, float* nocapture readonly %b, i32 %n) {
; CHECK-LABEL: @PR27826(
; CHECK-NOT: <4 x float>
; CHECK-NOT: <8 x float>
; CHECK: ret float %s.0.lcssa

entry:
  %cmp = icmp sgt i32 %n, 0
  br i1 %cmp, label %preheader, label %for.end

preheader:
  %t0 = sext i32 %n to i64
  br label %for

for:
  ; Stride-8 induction: the pointer IVs would have to be scalarized if this
  ; loop were vectorized, which is what the cost-model change prevents.
  %indvars.iv = phi i64 [ 0, %preheader ], [ %indvars.iv.next, %for ]
  %s.02 = phi float [ 0.0, %preheader ], [ %add4, %for ]
  %arrayidx = getelementptr inbounds float, float* %a, i64 %indvars.iv
  %t1 = load float, float* %arrayidx, align 4
  %arrayidx3 = getelementptr inbounds float, float* %b, i64 %indvars.iv
  %t2 = load float, float* %arrayidx3, align 4
  %add = fadd fast float %t1, %s.02
  %add4 = fadd fast float %add, %t2
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 8
  %cmp1 = icmp slt i64 %indvars.iv.next, %t0
  br i1 %cmp1, label %for, label %loopexit

loopexit:
  %add4.lcssa = phi float [ %add4, %for ]
  br label %for.end

for.end:
  %s.0.lcssa = phi float [ 0.0, %entry ], [ %add4.lcssa, %loopexit ]
  ret float %s.0.lcssa
}
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user