diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 32d22ed5402..a58356542db 100644
--- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -175,10 +175,12 @@ private:
     bool FindIVForUser(ICmpInst *Cond, IVStrideUse *&CondUse,
                        const SCEVHandle *&CondStride);
-    unsigned CheckForIVReuse(const SCEVHandle&, IVExpr&, const Type*,
+    unsigned CheckForIVReuse(bool, const SCEVHandle&,
+                             IVExpr&, const Type*,
                              const std::vector<BasedUser>& UsersToProcess);
-    bool ValidStride(int64_t, const std::vector<BasedUser>& UsersToProcess);
+    bool ValidStride(bool, int64_t,
+                     const std::vector<BasedUser>& UsersToProcess);
     void StrengthReduceStridedIVUsers(const SCEVHandle &Stride,
                                       IVUsersOfOneStride &Uses,
@@ -937,8 +939,8 @@ RemoveCommonExpressionsFromUseBases(std::vector<BasedUser> &Uses,
 /// isZero - returns true if the scalar evolution expression is zero.
 ///
-static bool isZero(SCEVHandle &V) {
-  if (SCEVConstant *SC = dyn_cast<SCEVConstant>(V))
+static bool isZero(const SCEVHandle &V) {
+  if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(V))
     return SC->getValue()->isZero();
   return false;
 }
@@ -946,7 +948,8 @@ static bool isZero(SCEVHandle &V) {
 /// ValidStride - Check whether the given Scale is valid for all loads and
 /// stores in UsersToProcess.
 ///
-bool LoopStrengthReduce::ValidStride(int64_t Scale,
+bool LoopStrengthReduce::ValidStride(bool HasBaseReg,
+                                     int64_t Scale,
                                const std::vector<BasedUser>& UsersToProcess) {
   for (unsigned i=0, e = UsersToProcess.size(); i!=e; ++i) {
     // If this is a load or other access, pass the type of the access in.
@@ -959,6 +962,7 @@ bool LoopStrengthReduce::ValidStride(int64_t Scale,
     TargetLowering::AddrMode AM;
     if (SCEVConstant *SC = dyn_cast<SCEVConstant>(UsersToProcess[i].Imm))
       AM.BaseOffs = SC->getValue()->getSExtValue();
+    AM.HasBaseReg = HasBaseReg || !isZero(UsersToProcess[i].Base);
     AM.Scale = Scale;

     // If load[imm+r*scale] is illegal, bail out.
@@ -970,9 +974,11 @@ bool LoopStrengthReduce::ValidStride(int64_t Scale,
 /// CheckForIVReuse - Returns the multiple if the stride is the multiple
 /// of a previous stride and it is a legal value for the target addressing
-/// mode scale component. This allows the users of this stride to be rewritten
-/// as prev iv * factor. It returns 0 if no reuse is possible.
-unsigned LoopStrengthReduce::CheckForIVReuse(const SCEVHandle &Stride,
+/// mode scale component and optional base reg. This allows the users of
+/// this stride to be rewritten as prev iv * factor. It returns 0 if no
+/// reuse is possible.
+unsigned LoopStrengthReduce::CheckForIVReuse(bool HasBaseReg,
+                                             const SCEVHandle &Stride,
                                              IVExpr &IV, const Type *Ty,
                                 const std::vector<BasedUser>& UsersToProcess) {
   if (!TLI) return 0;
@@ -992,7 +998,7 @@ unsigned LoopStrengthReduce::CheckForIVReuse(const SCEVHandle &Stride,
       // stores; if it can be used for some and not others, we might as well use
       // the original stride everywhere, since we have to create the IV for it
       // anyway.
-      if (ValidStride(Scale, UsersToProcess))
+      if (ValidStride(HasBaseReg, Scale, UsersToProcess))
         for (std::vector<IVExpr>::iterator II = SI->second.IVs.begin(),
                IE = SI->second.IVs.end(); II != IE; ++II)
           // FIXME: Only handle base == 0 for now.
@@ -1061,7 +1067,18 @@ void LoopStrengthReduce::StrengthReduceStridedIVUsers(const SCEVHandle &Stride,
   // UsersToProcess base values.
   SCEVHandle CommonExprs =
     RemoveCommonExpressionsFromUseBases(UsersToProcess, SE);
+
+  // If we managed to find some expressions in common, we'll need to carry
+  // their value in a register and add it in for each use. This will take up
+  // a register operand, which potentially restricts what stride values are
+  // valid.
+  bool HaveCommonExprs = !isZero(CommonExprs);
+
+  // Keep track if every use in UsersToProcess is an address. If they all are,
+  // we may be able to rewrite the entire collection of them in terms of a
+  // smaller-stride IV.
+  bool AllUsesAreAddresses = true;
+
   // Next, figure out what we can represent in the immediate fields of
   // instructions. If we can represent anything there, move it to the imm
   // fields of the BasedUsers. We do this so that it increases the commonality
@@ -1085,29 +1102,53 @@ void LoopStrengthReduce::StrengthReduceStridedIVUsers(const SCEVHandle &Stride,
         isAddress = true;
       } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(UsersToProcess[i].Inst)) {
-        // Addressing modes can also be folded into prefetches.
-        if (II->getIntrinsicID() == Intrinsic::prefetch &&
-            II->getOperand(1) == UsersToProcess[i].OperandValToReplace)
-          isAddress = true;
+        // Addressing modes can also be folded into prefetches and a variety
+        // of intrinsics.
+        switch (II->getIntrinsicID()) {
+        default: break;
+        case Intrinsic::prefetch:
+        case Intrinsic::x86_sse2_loadu_dq:
+        case Intrinsic::x86_sse2_loadu_pd:
+        case Intrinsic::x86_sse_loadu_ps:
+        case Intrinsic::x86_sse_storeu_ps:
+        case Intrinsic::x86_sse2_storeu_pd:
+        case Intrinsic::x86_sse2_storeu_dq:
+        case Intrinsic::x86_sse2_storel_dq:
+          if (II->getOperand(1) == UsersToProcess[i].OperandValToReplace)
+            isAddress = true;
+          break;
+        case Intrinsic::x86_sse2_loadh_pd:
+        case Intrinsic::x86_sse2_loadl_pd:
+          if (II->getOperand(2) == UsersToProcess[i].OperandValToReplace)
+            isAddress = true;
+          break;
+        }
       }
+
+      // If this use isn't an address, then not all uses are addresses.
+      if (!isAddress)
+        AllUsesAreAddresses = false;

       MoveImmediateValues(TLI, UsersToProcess[i].Inst, UsersToProcess[i].Base,
                           UsersToProcess[i].Imm, isAddress, L, SE);
     }
   }

-  // Check if it is possible to reuse a IV with stride that is factor of this
-  // stride. And the multiple is a number that can be encoded in the scale
-  // field of the target addressing mode. And we will have a valid
-  // instruction after this substition, including the immediate field, if any.
+  // If all uses are addresses, check if it is possible to reuse an IV with a
+  // stride that is a factor of this stride. And that the multiple is a number
+  // that can be encoded in the scale field of the target addressing mode. And
+  // that we will have a valid instruction after this substitution, including
+  // the immediate field, if any.
   PHINode *NewPHI = NULL;
   Value *IncV = NULL;
   IVExpr ReuseIV(SE->getIntegerSCEV(0, Type::Int32Ty),
                  SE->getIntegerSCEV(0, Type::Int32Ty), 0, 0);
-  unsigned RewriteFactor = CheckForIVReuse(Stride, ReuseIV,
-                                           CommonExprs->getType(),
-                                           UsersToProcess);
+  unsigned RewriteFactor = 0;
+  if (AllUsesAreAddresses)
+    RewriteFactor = CheckForIVReuse(HaveCommonExprs, Stride, ReuseIV,
+                                    CommonExprs->getType(),
+                                    UsersToProcess);
   if (RewriteFactor != 0) {
     DOUT << "BASED ON IV of STRIDE " << *ReuseIV.Stride
          << " and BASE " << *ReuseIV.Base << " :\n";
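To make the role of the new HasBaseReg flag concrete, here is a minimal standalone sketch. It is not the actual TargetLowering interface and every name in it is illustrative; it only models the x86 rule that scales 2, 4, and 8 are directly encodable, while 3, 5, and 9 are reachable only as index + index*{2,4,8} via lea, which itself consumes the base-register slot:

    // Minimal sketch, assuming an x86-like target; not LLVM's API.
    #include <cstdint>
    #include <iostream>

    struct AddrMode {
      int64_t BaseOffs = 0;    // immediate displacement, if any
      bool HasBaseReg = false; // true if something already occupies the base reg
      int64_t Scale = 0;       // multiplier on the index register
    };

    // An x86-flavored answer to "is base + scale*index + offset legal?".
    static bool isLegalX86AddrMode(const AddrMode &AM) {
      switch (AM.Scale) {
      case 0: case 1: case 2: case 4: case 8:
        return true;           // directly encodable in a SIB byte
      case 3: case 5: case 9:
        // Only reachable as lea r, [index + index*{2,4,8}], which uses
        // the base-register slot -- so that slot must still be free.
        return !AM.HasBaseReg;
      default:
        return false;
      }
    }

    int main() {
      AddrMode AM;
      AM.Scale = 9;
      std::cout << isLegalX86AddrMode(AM) << "\n"; // 1: lea can fake scale 9
      AM.HasBaseReg = true; // e.g. a common base expression needs a register
      std::cout << isLegalX86AddrMode(AM) << "\n"; // 0: scale 9 no longer fits
    }

This is why ValidStride now threads HasBaseReg through to the addressing-mode query: once a common expression occupies the base register, a stride-9 reuse stops being legal even though stride 9 alone would have been fine.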
diff --git a/test/CodeGen/X86/2007-08-10-LEA16Use32.ll b/test/CodeGen/X86/2007-08-10-LEA16Use32.ll
index f6a848255fd..1a0bcf94e5b 100644
--- a/test/CodeGen/X86/2007-08-10-LEA16Use32.ll
+++ b/test/CodeGen/X86/2007-08-10-LEA16Use32.ll
@@ -1,4 +1,8 @@
 ; RUN: llvm-as < %s | llc -march=x86 | grep {leal}
+; XFAIL: *
+; This test is XFAIL'd because strength-reduction was improved to
+; avoid emitting the lea, so it no longer tests whether the 16-bit
+; lea is avoided.

 @X = global i16 0		; <i16*> [#uses=1]
 @Y = global i16 0		; <i16*> [#uses=1]
diff --git a/test/CodeGen/X86/stride-nine-with-base-reg.ll b/test/CodeGen/X86/stride-nine-with-base-reg.ll
new file mode 100644
index 00000000000..f443c76015d
--- /dev/null
+++ b/test/CodeGen/X86/stride-nine-with-base-reg.ll
@@ -0,0 +1,34 @@
+; RUN: llvm-as < %s | llc -march=x86 | grep lea | count 1
+; RUN: llvm-as < %s | llc -march=x86-64 | not grep lea
+
+; For x86 there's an lea above the loop. In both cases, there shouldn't
+; be any lea instructions inside the loop.
+
+@B = external global [1000 x i8], align 32
+@A = external global [1000 x i8], align 32
+@P = external global [1000 x i8], align 32
+
+define void @foo(i32 %m, i32 %p) {
+entry:
+	%tmp1 = icmp sgt i32 %m, 0
+	br i1 %tmp1, label %bb, label %return
+
+bb:
+	%i.019.0 = phi i32 [ %indvar.next, %bb ], [ 0, %entry ]
+	%tmp2 = getelementptr [1000 x i8]* @B, i32 0, i32 %i.019.0
+	%tmp3 = load i8* %tmp2, align 4
+	%tmp4 = mul i8 %tmp3, 2
+	%tmp5 = getelementptr [1000 x i8]* @A, i32 0, i32 %i.019.0
+	store i8 %tmp4, i8* %tmp5, align 4
+	%tmp8 = mul i32 %i.019.0, 9
+	%tmp0 = add i32 %tmp8, %p
+	%tmp10 = getelementptr [1000 x i8]* @P, i32 0, i32 %tmp0
+	store i8 17, i8* %tmp10, align 4
+	%indvar.next = add i32 %i.019.0, 1
+	%exitcond = icmp eq i32 %indvar.next, %m
+	br i1 %exitcond, label %return, label %bb
+
+return:
+	ret void
+}
+
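As a reading aid, the loop in the new test corresponds roughly to the following C++; this is hypothetical source, not part of the commit. The stride-9 store needs %p in a register, so the base-register slot is occupied, the lea trick for scale 9 is unavailable, and LSR should keep a separate IV stepping by 9 rather than scaling the stride-1 IV:

    // Rough C++ equivalent of stride-nine-with-base-reg.ll (illustrative only).
    extern char A[1000], B[1000], P[1000];

    void foo(int m, int p) {
      for (int i = 0; i < m; ++i) {
        A[i] = B[i] * 2;    // stride-1 address uses of i
        P[9 * i + p] = 17;  // stride-9 address use; p occupies the base register
      }
    }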
diff --git a/test/CodeGen/X86/stride-reuse.ll b/test/CodeGen/X86/stride-reuse.ll
new file mode 100644
index 00000000000..97f33d8adbc
--- /dev/null
+++ b/test/CodeGen/X86/stride-reuse.ll
@@ -0,0 +1,30 @@
+; RUN: llvm-as < %s | llc -march=x86 | not grep lea
+; RUN: llvm-as < %s | llc -march=x86-64 | not grep lea
+
+@B = external global [1000 x float], align 32
+@A = external global [1000 x float], align 32
+@P = external global [1000 x i32], align 32
+
+define void @foo(i32 %m) {
+entry:
+	%tmp1 = icmp sgt i32 %m, 0
+	br i1 %tmp1, label %bb, label %return
+
+bb:
+	%i.019.0 = phi i32 [ %indvar.next, %bb ], [ 0, %entry ]
+	%tmp2 = getelementptr [1000 x float]* @B, i32 0, i32 %i.019.0
+	%tmp3 = load float* %tmp2, align 4
+	%tmp4 = mul float %tmp3, 2.000000e+00
+	%tmp5 = getelementptr [1000 x float]* @A, i32 0, i32 %i.019.0
+	store float %tmp4, float* %tmp5, align 4
+	%tmp8 = shl i32 %i.019.0, 1
+	%tmp9 = add i32 %tmp8, 64
+	%tmp10 = getelementptr [1000 x i32]* @P, i32 0, i32 %i.019.0
+	store i32 %tmp9, i32* %tmp10, align 4
+	%indvar.next = add i32 %i.019.0, 1
+	%exitcond = icmp eq i32 %indvar.next, %m
+	br i1 %exitcond, label %return, label %bb
+
+return:
+	ret void
+}
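The stride-reuse test corresponds roughly to this C++, again hypothetical. Here the doubled value stored into P is a plain value, not an address, so AllUsesAreAddresses is false for the stride-2 users; with this patch LSR no longer rewrites them as 2 * (the stride-1 IV), which previously materialized an lea in the loop body, and a separate IV stepping by 2 is used instead:

    // Rough C++ equivalent of stride-reuse.ll (illustrative only).
    extern float A[1000], B[1000];
    extern int P[1000];

    void foo(int m) {
      for (int i = 0; i < m; ++i) {
        A[i] = B[i] * 2.0f;  // stride-1 address uses of i
        P[i] = 2 * i + 64;   // stride-2 use that is a stored value, not an address
      }
    }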