From df1f166e4a00944685121e3f811737acef653020 Mon Sep 17 00:00:00 2001 From: Dan Gohman Date: Mon, 22 Oct 2007 20:40:42 +0000 Subject: [PATCH] Strength reduction improvements. - Avoid attempting stride-reuse in the case that there are users that aren't addresses. In that case, there will be places where the multiplications won't be folded away, so it's better to try to strength-reduce them. - Several SSE intrinsics have operands that strength-reduction can treat as addresses. The previous item makes this more visible, as any non-address use of an IV can inhibit stride-reuse. - Make ValidStride aware of whether there's likely to be a base register in the address computation. This prevents it from thinking that things like stride 9 are valid on x86 when the base register is already occupied. Also, XFAIL the 2007-08-10-LEA16Use32.ll test; the new logic to avoid stride-reuse eliminates the LEA in the loop, so the test is no longer testing what it was intended to test. llvm-svn: 43231 --- lib/Transforms/Scalar/LoopStrengthReduce.cpp | 81 ++++++++++++++----- test/CodeGen/X86/2007-08-10-LEA16Use32.ll | 4 + test/CodeGen/X86/stride-nine-with-base-reg.ll | 34 ++++++++ test/CodeGen/X86/stride-reuse.ll | 30 +++++++ 4 files changed, 129 insertions(+), 20 deletions(-) create mode 100644 test/CodeGen/X86/stride-nine-with-base-reg.ll create mode 100644 test/CodeGen/X86/stride-reuse.ll diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp index 32d22ed5402..a58356542db 100644 --- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -175,10 +175,12 @@ private: bool FindIVForUser(ICmpInst *Cond, IVStrideUse *&CondUse, const SCEVHandle *&CondStride); - unsigned CheckForIVReuse(const SCEVHandle&, IVExpr&, const Type*, + unsigned CheckForIVReuse(bool, const SCEVHandle&, + IVExpr&, const Type*, const std::vector& UsersToProcess); - bool ValidStride(int64_t, const std::vector& 
UsersToProcess); + bool ValidStride(bool, int64_t, + const std::vector& UsersToProcess); void StrengthReduceStridedIVUsers(const SCEVHandle &Stride, IVUsersOfOneStride &Uses, @@ -937,8 +939,8 @@ RemoveCommonExpressionsFromUseBases(std::vector &Uses, /// isZero - returns true if the scalar evolution expression is zero. /// -static bool isZero(SCEVHandle &V) { - if (SCEVConstant *SC = dyn_cast(V)) +static bool isZero(const SCEVHandle &V) { + if (const SCEVConstant *SC = dyn_cast(V)) return SC->getValue()->isZero(); return false; } @@ -946,7 +948,8 @@ static bool isZero(SCEVHandle &V) { /// ValidStride - Check whether the given Scale is valid for all loads and /// stores in UsersToProcess. /// -bool LoopStrengthReduce::ValidStride(int64_t Scale, +bool LoopStrengthReduce::ValidStride(bool HasBaseReg, + int64_t Scale, const std::vector& UsersToProcess) { for (unsigned i=0, e = UsersToProcess.size(); i!=e; ++i) { // If this is a load or other access, pass the type of the access in. @@ -959,6 +962,7 @@ bool LoopStrengthReduce::ValidStride(int64_t Scale, TargetLowering::AddrMode AM; if (SCEVConstant *SC = dyn_cast(UsersToProcess[i].Imm)) AM.BaseOffs = SC->getValue()->getSExtValue(); + AM.HasBaseReg = HasBaseReg || !isZero(UsersToProcess[i].Base); AM.Scale = Scale; // If load[imm+r*scale] is illegal, bail out. @@ -970,9 +974,11 @@ bool LoopStrengthReduce::ValidStride(int64_t Scale, /// CheckForIVReuse - Returns the multiple if the stride is the multiple /// of a previous stride and it is a legal value for the target addressing -/// mode scale component. This allows the users of this stride to be rewritten -/// as prev iv * factor. It returns 0 if no reuse is possible. -unsigned LoopStrengthReduce::CheckForIVReuse(const SCEVHandle &Stride, +/// mode scale component and optional base reg. This allows the users of +/// this stride to be rewritten as prev iv * factor. It returns 0 if no +/// reuse is possible. 
+unsigned LoopStrengthReduce::CheckForIVReuse(bool HasBaseReg, + const SCEVHandle &Stride, IVExpr &IV, const Type *Ty, const std::vector& UsersToProcess) { if (!TLI) return 0; @@ -992,7 +998,7 @@ unsigned LoopStrengthReduce::CheckForIVReuse(const SCEVHandle &Stride, // stores; if it can be used for some and not others, we might as well use // the original stride everywhere, since we have to create the IV for it // anyway. - if (ValidStride(Scale, UsersToProcess)) + if (ValidStride(HasBaseReg, Scale, UsersToProcess)) for (std::vector::iterator II = SI->second.IVs.begin(), IE = SI->second.IVs.end(); II != IE; ++II) // FIXME: Only handle base == 0 for now. @@ -1061,7 +1067,18 @@ void LoopStrengthReduce::StrengthReduceStridedIVUsers(const SCEVHandle &Stride, // UsersToProcess base values. SCEVHandle CommonExprs = RemoveCommonExpressionsFromUseBases(UsersToProcess, SE); + + // If we managed to find some expressions in common, we'll need to carry + // their value in a register and add it in for each use. This will take up + // a register operand, which potentially restricts what stride values are + // valid. + bool HaveCommonExprs = !isZero(CommonExprs); + // Keep track if every use in UsersToProcess is an address. If they all are, + // we may be able to rewrite the entire collection of them in terms of a + // smaller-stride IV. + bool AllUsesAreAddresses = true; + // Next, figure out what we can represent in the immediate fields of // instructions. If we can represent anything there, move it to the imm // fields of the BasedUsers. We do this so that it increases the commonality @@ -1085,29 +1102,53 @@ void LoopStrengthReduce::StrengthReduceStridedIVUsers(const SCEVHandle &Stride, isAddress = true; } else if (IntrinsicInst *II = dyn_cast(UsersToProcess[i].Inst)) { - // Addressing modes can also be folded into prefetches. 
- if (II->getIntrinsicID() == Intrinsic::prefetch && - II->getOperand(1) == UsersToProcess[i].OperandValToReplace) - isAddress = true; + // Addressing modes can also be folded into prefetches and a variety + // of intrinsics. + switch (II->getIntrinsicID()) { + default: break; + case Intrinsic::prefetch: + case Intrinsic::x86_sse2_loadu_dq: + case Intrinsic::x86_sse2_loadu_pd: + case Intrinsic::x86_sse_loadu_ps: + case Intrinsic::x86_sse_storeu_ps: + case Intrinsic::x86_sse2_storeu_pd: + case Intrinsic::x86_sse2_storeu_dq: + case Intrinsic::x86_sse2_storel_dq: + if (II->getOperand(1) == UsersToProcess[i].OperandValToReplace) + isAddress = true; + break; + case Intrinsic::x86_sse2_loadh_pd: + case Intrinsic::x86_sse2_loadl_pd: + if (II->getOperand(2) == UsersToProcess[i].OperandValToReplace) + isAddress = true; + break; + } } + + // If this use isn't an address, then not all uses are addresses. + if (!isAddress) + AllUsesAreAddresses = false; MoveImmediateValues(TLI, UsersToProcess[i].Inst, UsersToProcess[i].Base, UsersToProcess[i].Imm, isAddress, L, SE); } } - // Check if it is possible to reuse a IV with stride that is factor of this - // stride. And the multiple is a number that can be encoded in the scale - // field of the target addressing mode. And we will have a valid - // instruction after this substition, including the immediate field, if any. + // If all uses are addresses, check if it is possible to reuse an IV with a + // stride that is a factor of this stride. And that the multiple is a number + // that can be encoded in the scale field of the target addressing mode. And + // that we will have a valid instruction after this substitution, including the + // immediate field, if any. 
PHINode *NewPHI = NULL; Value *IncV = NULL; IVExpr ReuseIV(SE->getIntegerSCEV(0, Type::Int32Ty), SE->getIntegerSCEV(0, Type::Int32Ty), 0, 0); - unsigned RewriteFactor = CheckForIVReuse(Stride, ReuseIV, - CommonExprs->getType(), - UsersToProcess); + unsigned RewriteFactor = 0; + if (AllUsesAreAddresses) + RewriteFactor = CheckForIVReuse(HaveCommonExprs, Stride, ReuseIV, + CommonExprs->getType(), + UsersToProcess); if (RewriteFactor != 0) { DOUT << "BASED ON IV of STRIDE " << *ReuseIV.Stride << " and BASE " << *ReuseIV.Base << " :\n"; diff --git a/test/CodeGen/X86/2007-08-10-LEA16Use32.ll b/test/CodeGen/X86/2007-08-10-LEA16Use32.ll index f6a848255fd..1a0bcf94e5b 100644 --- a/test/CodeGen/X86/2007-08-10-LEA16Use32.ll +++ b/test/CodeGen/X86/2007-08-10-LEA16Use32.ll @@ -1,4 +1,8 @@ ; RUN: llvm-as < %s | llc -march=x86 | grep {leal} +; XFAIL: * +; This test is XFAIL'd because strength-reduction was improved to +; avoid emitting the lea, so it no longer tests whether the 16-bit +; lea is avoided. @X = global i16 0 ; [#uses=1] @Y = global i16 0 ; [#uses=1] diff --git a/test/CodeGen/X86/stride-nine-with-base-reg.ll b/test/CodeGen/X86/stride-nine-with-base-reg.ll new file mode 100644 index 00000000000..f443c76015d --- /dev/null +++ b/test/CodeGen/X86/stride-nine-with-base-reg.ll @@ -0,0 +1,34 @@ +; RUN: llvm-as < %s | llc -march=x86 | grep lea | count 1 +; RUN: llvm-as < %s | llc -march=x86-64 | not grep lea + +; For x86 there's an lea above the loop. In both cases, there shouldn't +; be any lea instructions inside the loop. 
+ +@B = external global [1000 x i8], align 32 +@A = external global [1000 x i8], align 32 +@P = external global [1000 x i8], align 32 + +define void @foo(i32 %m, i32 %p) { +entry: + %tmp1 = icmp sgt i32 %m, 0 + br i1 %tmp1, label %bb, label %return + +bb: + %i.019.0 = phi i32 [ %indvar.next, %bb ], [ 0, %entry ] + %tmp2 = getelementptr [1000 x i8]* @B, i32 0, i32 %i.019.0 + %tmp3 = load i8* %tmp2, align 4 + %tmp4 = mul i8 %tmp3, 2 + %tmp5 = getelementptr [1000 x i8]* @A, i32 0, i32 %i.019.0 + store i8 %tmp4, i8* %tmp5, align 4 + %tmp8 = mul i32 %i.019.0, 9 + %tmp0 = add i32 %tmp8, %p + %tmp10 = getelementptr [1000 x i8]* @P, i32 0, i32 %tmp0 + store i8 17, i8* %tmp10, align 4 + %indvar.next = add i32 %i.019.0, 1 + %exitcond = icmp eq i32 %indvar.next, %m + br i1 %exitcond, label %return, label %bb + +return: + ret void +} + diff --git a/test/CodeGen/X86/stride-reuse.ll b/test/CodeGen/X86/stride-reuse.ll new file mode 100644 index 00000000000..97f33d8adbc --- /dev/null +++ b/test/CodeGen/X86/stride-reuse.ll @@ -0,0 +1,30 @@ +; RUN: llvm-as < %s | llc -march=x86 | not grep lea +; RUN: llvm-as < %s | llc -march=x86-64 | not grep lea + +@B = external global [1000 x float], align 32 +@A = external global [1000 x float], align 32 +@P = external global [1000 x i32], align 32 + +define void @foo(i32 %m) { +entry: + %tmp1 = icmp sgt i32 %m, 0 + br i1 %tmp1, label %bb, label %return + +bb: + %i.019.0 = phi i32 [ %indvar.next, %bb ], [ 0, %entry ] + %tmp2 = getelementptr [1000 x float]* @B, i32 0, i32 %i.019.0 + %tmp3 = load float* %tmp2, align 4 + %tmp4 = mul float %tmp3, 2.000000e+00 + %tmp5 = getelementptr [1000 x float]* @A, i32 0, i32 %i.019.0 + store float %tmp4, float* %tmp5, align 4 + %tmp8 = shl i32 %i.019.0, 1 + %tmp9 = add i32 %tmp8, 64 + %tmp10 = getelementptr [1000 x i32]* @P, i32 0, i32 %i.019.0 + store i32 %tmp9, i32* %tmp10, align 4 + %indvar.next = add i32 %i.019.0, 1 + %exitcond = icmp eq i32 %indvar.next, %m + br i1 %exitcond, label %return, label %bb + 
+return: + ret void +}