diff --git a/lib/Target/AArch64/AArch64FastISel.cpp b/lib/Target/AArch64/AArch64FastISel.cpp
index f4d6665e835..33d17ef704f 100644
--- a/lib/Target/AArch64/AArch64FastISel.cpp
+++ b/lib/Target/AArch64/AArch64FastISel.cpp
@@ -3749,15 +3749,54 @@ bool AArch64FastISel::selectRem(const Instruction *I, unsigned ISDOpcode) {
 }
 
 bool AArch64FastISel::selectMul(const Instruction *I) {
-  EVT SrcEVT = TLI.getValueType(I->getOperand(0)->getType(), true);
-  if (!SrcEVT.isSimple())
+  MVT VT;
+  if (!isTypeSupported(I->getType(), VT, /*IsVectorAllowed=*/true))
     return false;
-  MVT SrcVT = SrcEVT.getSimpleVT();
 
-  // Must be simple value type.  Don't handle vectors.
-  if (SrcVT != MVT::i64 && SrcVT != MVT::i32 && SrcVT != MVT::i16 &&
-      SrcVT != MVT::i8)
-    return false;
+  if (VT.isVector())
+    return selectBinaryOp(I, ISD::MUL);
+
+  const Value *Src0 = I->getOperand(0);
+  const Value *Src1 = I->getOperand(1);
+  if (const auto *C = dyn_cast<ConstantInt>(Src0))
+    if (C->getValue().isPowerOf2())
+      std::swap(Src0, Src1);
+
+  // Try to simplify to a shift instruction.
+  if (const auto *C = dyn_cast<ConstantInt>(Src1))
+    if (C->getValue().isPowerOf2()) {
+      uint64_t ShiftVal = C->getValue().logBase2();
+      MVT SrcVT = VT;
+      bool IsZExt = true;
+      if (const auto *ZExt = dyn_cast<ZExtInst>(Src0)) {
+        MVT VT;
+        if (isValueAvailable(ZExt) && isTypeSupported(ZExt->getSrcTy(), VT)) {
+          SrcVT = VT;
+          IsZExt = true;
+          Src0 = ZExt->getOperand(0);
+        }
+      } else if (const auto *SExt = dyn_cast<SExtInst>(Src0)) {
+        MVT VT;
+        if (isValueAvailable(SExt) && isTypeSupported(SExt->getSrcTy(), VT)) {
+          SrcVT = VT;
+          IsZExt = false;
+          Src0 = SExt->getOperand(0);
+        }
+      }
+
+      unsigned Src0Reg = getRegForValue(Src0);
+      if (!Src0Reg)
+        return false;
+      bool Src0IsKill = hasTrivialKill(Src0);
+
+      unsigned ResultReg =
+          emitLSL_ri(VT, SrcVT, Src0Reg, Src0IsKill, ShiftVal, IsZExt);
+
+      if (ResultReg) {
+        updateValueMap(I, ResultReg);
+        return true;
+      }
+    }
 
   unsigned Src0Reg = getRegForValue(I->getOperand(0));
   if (!Src0Reg)
@@ -3769,8 +3808,7 @@ bool AArch64FastISel::selectMul(const Instruction *I) {
     return false;
   bool Src1IsKill = hasTrivialKill(I->getOperand(1));
 
-  unsigned ResultReg =
-      emitMul_rr(SrcVT, Src0Reg, Src0IsKill, Src1Reg, Src1IsKill);
+  unsigned ResultReg = emitMul_rr(VT, Src0Reg, Src0IsKill, Src1Reg, Src1IsKill);
 
   if (!ResultReg)
     return false;
@@ -3950,9 +3988,7 @@ bool AArch64FastISel::fastSelectInstruction(const Instruction *I) {
   case Instruction::Sub:
     return selectAddSub(I);
   case Instruction::Mul:
-    if (!selectBinaryOp(I, ISD::MUL))
-      return selectMul(I);
-    return true;
+    return selectMul(I);
   case Instruction::SRem:
     if (!selectBinaryOp(I, ISD::SREM))
       return selectRem(I, ISD::SREM);
diff --git a/test/CodeGen/AArch64/fast-isel-mul.ll b/test/CodeGen/AArch64/fast-isel-mul.ll
index 88a3ee8db3d..f2fda27c2f7 100644
--- a/test/CodeGen/AArch64/fast-isel-mul.ll
+++ b/test/CodeGen/AArch64/fast-isel-mul.ll
@@ -1,40 +1,44 @@
-; RUN: llc -fast-isel -fast-isel-abort -verify-machineinstrs -mtriple=aarch64 < %s | FileCheck %s
+; RUN: llc -fast-isel -fast-isel-abort -verify-machineinstrs -mtriple=aarch64-apple-darwin < %s | FileCheck %s
 
-@var8 = global i8 0
-@var16 = global i16 0
-@var32 = global i32 0
-@var64 = global i64 0
-
-define void @test_mul8(i8 %lhs, i8 %rhs) {
+define zeroext i8 @test_mul8(i8 %lhs, i8 %rhs) {
 ; CHECK-LABEL: test_mul8:
-; CHECK: mul {{w[0-9]+}}, w0, w1
-;  %lhs = load i8* @var8
-;  %rhs = load i8* @var8
-  %prod = mul i8 %lhs, %rhs
-  store i8 %prod, i8* @var8
-  ret void
+; CHECK:       mul {{w[0-9]+}}, w0, w1
+  %1 = mul i8 %lhs, %rhs
+  ret i8 %1
 }
 
-define void @test_mul16(i16 %lhs, i16 %rhs) {
+define zeroext i16 @test_mul16(i16 %lhs, i16 %rhs) {
 ; CHECK-LABEL: test_mul16:
-; CHECK: mul {{w[0-9]+}}, w0, w1
-  %prod = mul i16 %lhs, %rhs
-  store i16 %prod, i16* @var16
-  ret void
+; CHECK:       mul {{w[0-9]+}}, w0, w1
+  %1 = mul i16 %lhs, %rhs
+  ret i16 %1
 }
 
-define void @test_mul32(i32 %lhs, i32 %rhs) {
+define i32 @test_mul32(i32 %lhs, i32 %rhs) {
 ; CHECK-LABEL: test_mul32:
-; CHECK: mul {{w[0-9]+}}, w0, w1
-  %prod = mul i32 %lhs, %rhs
-  store i32 %prod, i32* @var32
-  ret void
+; CHECK:       mul {{w[0-9]+}}, w0, w1
+  %1 = mul i32 %lhs, %rhs
+  ret i32 %1
 }
 
-define void @test_mul64(i64 %lhs, i64 %rhs) {
+define i64 @test_mul64(i64 %lhs, i64 %rhs) {
 ; CHECK-LABEL: test_mul64:
-; CHECK: mul {{x[0-9]+}}, x0, x1
-  %prod = mul i64 %lhs, %rhs
-  store i64 %prod, i64* @var64
-  ret void
+; CHECK:       mul {{x[0-9]+}}, x0, x1
+  %1 = mul i64 %lhs, %rhs
+  ret i64 %1
 }
+
+define i32 @test_mul2shift_i32(i32 %a) {
+; CHECK-LABEL: test_mul2shift_i32:
+; CHECK:       lsl {{w[0-9]+}}, w0, #2
+  %1 = mul i32 %a, 4
+  ret i32 %1
+}
+
+define i64 @test_mul2shift_i64(i64 %a) {
+; CHECK-LABEL: test_mul2shift_i64:
+; CHECK:       lsl {{x[0-9]+}}, x0, #3
+  %1 = mul i64 %a, 8
+  ret i64 %1
+}
+