Extend memcpy expansion in Transforms/Utils to handle wider operand types.

Adds loop expansions for known-size and unknown-size memcpy calls, allowing the
target to provide the operand types through TTI callbacks. The default
implementations of the TTI callbacks use int8 operand types, matching the
existing behaviour when a target does not override them.

Differential revision: https://reviews.llvm.org/D32536

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@307346 91177308-0d34-0410-b5e6-96231b3b80d8
Sean Fertile 2017-07-07 02:00:06 +00:00
parent 104fd8eec7
commit 471398ffea
9 changed files with 489 additions and 24 deletions
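
For illustration only (not part of the patch), a target might widen the copy by
overriding the two new hooks. The MyTTIImpl class name and the
i32-when-4-byte-aligned policy below are assumptions for this sketch; a
residual override must return types whose sizes sum to RemainingBytes:

Type *MyTTIImpl::getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length,
                                           unsigned SrcAlign,
                                           unsigned DestAlign) const {
  // Use i32 copies when both operands are at least 4-byte aligned.
  if (SrcAlign >= 4 && DestAlign >= 4)
    return Type::getInt32Ty(Context);
  return Type::getInt8Ty(Context);
}

void MyTTIImpl::getMemcpyLoopResidualLoweringType(
    SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
    unsigned RemainingBytes, unsigned SrcAlign, unsigned DestAlign) const {
  // Cover the residual bytes with progressively narrower integer types.
  for (unsigned Bytes : {4u, 2u, 1u})
    while (RemainingBytes >= Bytes) {
      OpsOut.push_back(Type::getIntNTy(Context, Bytes * 8));
      RemainingBytes -= Bytes;
    }
}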

@@ -753,6 +753,28 @@ public:
Value *getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
Type *ExpectedType) const;
/// \returns The type to use in a loop expansion of a memcpy call.
Type *getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length,
unsigned SrcAlign, unsigned DestAlign) const;
/// \param[out] OpsOut The operand types to copy RemainingBytes of memory.
/// \param RemainingBytes The number of bytes to copy.
///
/// Calculates the operand types to use when copying \p RemainingBytes of
/// memory, where source and destination alignments are \p SrcAlign and
/// \p DestAlign respectively.
void getMemcpyLoopResidualLoweringType(SmallVectorImpl<Type *> &OpsOut,
LLVMContext &Context,
unsigned RemainingBytes,
unsigned SrcAlign,
unsigned DestAlign) const;
/// \returns True if we want to test the new memcpy lowering functionality in
/// Transforms/Utils.
/// Temporary. Will be removed once we move to the new functionality and
/// remove the old.
bool useWideIRMemcpyLoopLowering() const;
/// \returns True if the two functions have compatible attributes for inlining
/// purposes.
bool areInlineCompatible(const Function *Caller,
@@ -953,6 +975,12 @@ public:
virtual unsigned getAtomicMemIntrinsicMaxElementSize() const = 0;
virtual Value *getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
Type *ExpectedType) = 0;
virtual Type *getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length,
unsigned SrcAlign,
unsigned DestAlign) const = 0;
virtual void getMemcpyLoopResidualLoweringType(
SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
unsigned RemainingBytes, unsigned SrcAlign, unsigned DestAlign) const = 0;
virtual bool areInlineCompatible(const Function *Caller,
const Function *Callee) const = 0;
virtual unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const = 0;
@@ -1266,6 +1294,19 @@ public:
Type *ExpectedType) override {
return Impl.getOrCreateResultFromMemIntrinsic(Inst, ExpectedType);
}
Type *getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length,
unsigned SrcAlign,
unsigned DestAlign) const override {
return Impl.getMemcpyLoopLoweringType(Context, Length, SrcAlign, DestAlign);
}
void getMemcpyLoopResidualLoweringType(SmallVectorImpl<Type *> &OpsOut,
LLVMContext &Context,
unsigned RemainingBytes,
unsigned SrcAlign,
unsigned DestAlign) const override {
Impl.getMemcpyLoopResidualLoweringType(OpsOut, Context, RemainingBytes,
SrcAlign, DestAlign);
}
bool areInlineCompatible(const Function *Caller,
const Function *Callee) const override {
return Impl.areInlineCompatible(Caller, Callee);

@@ -444,6 +444,20 @@ public:
return nullptr;
}
Type *getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length,
unsigned SrcAlign, unsigned DestAlign) const {
return Type::getInt8Ty(Context);
}
void getMemcpyLoopResidualLoweringType(SmallVectorImpl<Type *> &OpsOut,
LLVMContext &Context,
unsigned RemainingBytes,
unsigned SrcAlign,
unsigned DestAlign) const {
for (unsigned i = 0; i != RemainingBytes; ++i)
OpsOut.push_back(Type::getInt8Ty(Context));
}
bool areInlineCompatible(const Function *Caller,
const Function *Callee) const {
return (Caller->getFnAttribute("target-cpu") ==

@@ -17,21 +17,39 @@
namespace llvm {
class ConstantInt;
class Instruction;
class MemCpyInst;
class MemMoveInst;
class MemSetInst;
class TargetTransformInfo;
class Value;
/// Emit a loop implementing the semantics of llvm.memcpy with the equivalent
/// arguments at \p InsertBefore.
void createMemCpyLoop(Instruction *InsertBefore,
Value *SrcAddr, Value *DstAddr, Value *CopyLen,
unsigned SrcAlign, unsigned DestAlign,
void createMemCpyLoop(Instruction *InsertBefore, Value *SrcAddr, Value *DstAddr,
Value *CopyLen, unsigned SrcAlign, unsigned DestAlign,
bool SrcIsVolatile, bool DstIsVolatile);
/// Emit a loop implementing the semantics of llvm.memcpy where the size is not
/// a compile-time constant. Loop will be inserted at \p InsertBefore.
void createMemCpyLoopUnknownSize(Instruction *InsertBefore, Value *SrcAddr,
Value *DstAddr, Value *CopyLen,
unsigned SrcAlign, unsigned DestAlign,
bool SrcIsVolatile, bool DstIsVolatile,
const TargetTransformInfo &TTI);
/// Emit a loop implementing the semantics of an llvm.memcpy whose size is a
/// compile-time constant. Loop is inserted at \p InsertBefore.
void createMemCpyLoopKnownSize(Instruction *InsertBefore, Value *SrcAddr,
Value *DstAddr, ConstantInt *CopyLen,
unsigned SrcAlign, unsigned DestAlign,
bool SrcIsVolatile, bool DstIsVolatile,
const TargetTransformInfo &TTI);
/// Expand \p MemCpy as a loop. \p MemCpy is not deleted.
void expandMemCpyAsLoop(MemCpyInst *MemCpy);
void expandMemCpyAsLoop(MemCpyInst *MemCpy, const TargetTransformInfo &TTI);
/// Expand \p MemMove as a loop. \p MemMove is not deleted.
void expandMemMoveAsLoop(MemMoveInst *MemMove);

@@ -16,6 +16,7 @@
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include <utility>
@@ -23,6 +24,11 @@ using namespace llvm;
#define DEBUG_TYPE "tti"
static cl::opt<bool> UseWideMemcpyLoopLowering(
"use-wide-memcpy-loop-lowering", cl::init(false),
cl::desc("Enables the new wide memcpy loop lowering in Transforms/Utils."),
cl::Hidden);
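// The flag defaults to false, so the wide lowering is opt-in; the updated RUN
// lines in the tests exercise it with -use-wide-memcpy-loop-lowering=true.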
namespace {
/// \brief No-op implementation of the TTI interface using the utility base
/// classes.
@@ -482,6 +488,25 @@ Value *TargetTransformInfo::getOrCreateResultFromMemIntrinsic(
return TTIImpl->getOrCreateResultFromMemIntrinsic(Inst, ExpectedType);
}
Type *TargetTransformInfo::getMemcpyLoopLoweringType(LLVMContext &Context,
Value *Length,
unsigned SrcAlign,
unsigned DestAlign) const {
return TTIImpl->getMemcpyLoopLoweringType(Context, Length, SrcAlign,
DestAlign);
}
void TargetTransformInfo::getMemcpyLoopResidualLoweringType(
SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
unsigned RemainingBytes, unsigned SrcAlign, unsigned DestAlign) const {
TTIImpl->getMemcpyLoopResidualLoweringType(OpsOut, Context, RemainingBytes,
SrcAlign, DestAlign);
}
bool TargetTransformInfo::useWideIRMemcpyLoopLowering() const {
return UseWideMemcpyLoopLowering;
}
bool TargetTransformInfo::areInlineCompatible(const Function *Caller,
const Function *Callee) const {
return TTIImpl->areInlineCompatible(Caller, Callee);

@@ -10,6 +10,7 @@
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
@@ -34,9 +35,14 @@ public:
AMDGPULowerIntrinsics() : ModulePass(ID) {}
bool runOnModule(Module &M) override;
bool expandMemIntrinsicUses(Function &F);
StringRef getPassName() const override {
return "AMDGPU Lower Intrinsics";
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<TargetTransformInfoWrapperPass>();
}
};
}
@@ -55,7 +61,7 @@ static bool shouldExpandOperationWithSize(Value *Size) {
return !CI || (CI->getZExtValue() > MaxStaticSize);
}
static bool expandMemIntrinsicUses(Function &F) {
bool AMDGPULowerIntrinsics::expandMemIntrinsicUses(Function &F) {
Intrinsic::ID ID = F.getIntrinsicID();
bool Changed = false;
@@ -67,7 +73,10 @@ static bool expandMemIntrinsicUses(Function &F) {
case Intrinsic::memcpy: {
auto *Memcpy = cast<MemCpyInst>(Inst);
if (shouldExpandOperationWithSize(Memcpy->getLength())) {
expandMemCpyAsLoop(Memcpy);
Function *ParentFunc = Memcpy->getParent()->getParent();
const TargetTransformInfo &TTI =
getAnalysis<TargetTransformInfoWrapperPass>().getTTI(*ParentFunc);
expandMemCpyAsLoop(Memcpy, TTI);
Changed = true;
Memcpy->eraseFromParent();
}

@@ -14,6 +14,7 @@
//===----------------------------------------------------------------------===//
#include "NVPTXLowerAggrCopies.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/StackProtector.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
@@ -42,6 +43,7 @@ struct NVPTXLowerAggrCopies : public FunctionPass {
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addPreserved<StackProtector>();
AU.addRequired<TargetTransformInfoWrapperPass>();
}
bool runOnFunction(Function &F) override;
@@ -61,6 +63,8 @@ bool NVPTXLowerAggrCopies::runOnFunction(Function &F) {
const DataLayout &DL = F.getParent()->getDataLayout();
LLVMContext &Context = F.getParent()->getContext();
const TargetTransformInfo &TTI =
getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
// Collect all aggregate loads and mem* calls.
for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) {
@@ -104,15 +108,26 @@ bool NVPTXLowerAggrCopies::runOnFunction(Function &F) {
Value *SrcAddr = LI->getOperand(0);
Value *DstAddr = SI->getOperand(1);
unsigned NumLoads = DL.getTypeStoreSize(LI->getType());
Value *CopyLen = ConstantInt::get(Type::getInt32Ty(Context), NumLoads);
ConstantInt *CopyLen =
ConstantInt::get(Type::getInt32Ty(Context), NumLoads);
createMemCpyLoop(/* ConvertedInst */ SI,
/* SrcAddr */ SrcAddr, /* DstAddr */ DstAddr,
/* CopyLen */ CopyLen,
/* SrcAlign */ LI->getAlignment(),
/* DestAlign */ SI->getAlignment(),
/* SrcIsVolatile */ LI->isVolatile(),
/* DstIsVolatile */ SI->isVolatile());
if (!TTI.useWideIRMemcpyLoopLowering()) {
createMemCpyLoop(/* ConvertedInst */ SI,
/* SrcAddr */ SrcAddr, /* DstAddr */ DstAddr,
/* CopyLen */ CopyLen,
/* SrcAlign */ LI->getAlignment(),
/* DestAlign */ SI->getAlignment(),
/* SrcIsVolatile */ LI->isVolatile(),
/* DstIsVolatile */ SI->isVolatile());
} else {
createMemCpyLoopKnownSize(/* ConvertedInst */ SI,
/* SrcAddr */ SrcAddr, /* DstAddr */ DstAddr,
/* CopyLen */ CopyLen,
/* SrcAlign */ LI->getAlignment(),
/* DestAlign */ SI->getAlignment(),
/* SrcIsVolatile */ LI->isVolatile(),
/* DstIsVolatile */ SI->isVolatile(), TTI);
}
SI->eraseFromParent();
LI->eraseFromParent();
@@ -121,7 +136,7 @@ bool NVPTXLowerAggrCopies::runOnFunction(Function &F) {
// Transform mem* intrinsic calls.
for (MemIntrinsic *MemCall : MemCalls) {
if (MemCpyInst *Memcpy = dyn_cast<MemCpyInst>(MemCall)) {
expandMemCpyAsLoop(Memcpy);
expandMemCpyAsLoop(Memcpy, TTI);
} else if (MemMoveInst *Memmove = dyn_cast<MemMoveInst>(MemCall)) {
expandMemMoveAsLoop(Memmove);
} else if (MemSetInst *Memset = dyn_cast<MemSetInst>(MemCall)) {

@@ -8,12 +8,256 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Utils/LowerMemIntrinsics.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
using namespace llvm;
static unsigned getLoopOperandSizeInBytes(Type *Type) {
if (VectorType *VTy = dyn_cast<VectorType>(Type)) {
return VTy->getBitWidth() / 8;
}
return Type->getPrimitiveSizeInBits() / 8;
}
void llvm::createMemCpyLoopKnownSize(Instruction *InsertBefore, Value *SrcAddr,
Value *DstAddr, ConstantInt *CopyLen,
unsigned SrcAlign, unsigned DestAlign,
bool SrcIsVolatile, bool DstIsVolatile,
const TargetTransformInfo &TTI) {
// No need to expand zero length copies.
if (CopyLen->isZero())
return;
BasicBlock *PreLoopBB = InsertBefore->getParent();
BasicBlock *PostLoopBB = nullptr;
Function *ParentFunc = PreLoopBB->getParent();
LLVMContext &Ctx = PreLoopBB->getContext();
Type *TypeOfCopyLen = CopyLen->getType();
Type *LoopOpType =
TTI.getMemcpyLoopLoweringType(Ctx, CopyLen, SrcAlign, DestAlign);
unsigned LoopOpSize = getLoopOperandSizeInBytes(LoopOpType);
uint64_t LoopEndCount = CopyLen->getZExtValue() / LoopOpSize;
unsigned SrcAS = cast<PointerType>(SrcAddr->getType())->getAddressSpace();
unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();
if (LoopEndCount != 0) {
// Split the basic block at InsertBefore so the copy loop can be inserted
// between the two halves.
PostLoopBB = PreLoopBB->splitBasicBlock(InsertBefore, "memcpy-split");
BasicBlock *LoopBB =
BasicBlock::Create(Ctx, "load-store-loop", ParentFunc, PostLoopBB);
PreLoopBB->getTerminator()->setSuccessor(0, LoopBB);
IRBuilder<> PLBuilder(PreLoopBB->getTerminator());
// Cast the Src and Dst pointers to pointers to the loop operand type (if
// needed).
PointerType *SrcOpType = PointerType::get(LoopOpType, SrcAS);
PointerType *DstOpType = PointerType::get(LoopOpType, DstAS);
if (SrcAddr->getType() != SrcOpType) {
SrcAddr = PLBuilder.CreateBitCast(SrcAddr, SrcOpType);
}
if (DstAddr->getType() != DstOpType) {
DstAddr = PLBuilder.CreateBitCast(DstAddr, DstOpType);
}
IRBuilder<> LoopBuilder(LoopBB);
PHINode *LoopIndex = LoopBuilder.CreatePHI(TypeOfCopyLen, 2, "loop-index");
LoopIndex->addIncoming(ConstantInt::get(TypeOfCopyLen, 0U), PreLoopBB);
// Loop Body
Value *SrcGEP =
LoopBuilder.CreateInBoundsGEP(LoopOpType, SrcAddr, LoopIndex);
Value *Load = LoopBuilder.CreateLoad(SrcGEP, SrcIsVolatile);
Value *DstGEP =
LoopBuilder.CreateInBoundsGEP(LoopOpType, DstAddr, LoopIndex);
LoopBuilder.CreateStore(Load, DstGEP, DstIsVolatile);
Value *NewIndex =
LoopBuilder.CreateAdd(LoopIndex, ConstantInt::get(TypeOfCopyLen, 1U));
LoopIndex->addIncoming(NewIndex, LoopBB);
// Create the loop branch condition.
Constant *LoopEndCI = ConstantInt::get(TypeOfCopyLen, LoopEndCount);
LoopBuilder.CreateCondBr(LoopBuilder.CreateICmpULT(NewIndex, LoopEndCI),
LoopBB, PostLoopBB);
}
uint64_t BytesCopied = LoopEndCount * LoopOpSize;
uint64_t RemainingBytes = CopyLen->getZExtValue() - BytesCopied;
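// Worked example (illustration, not part of the patch), assuming a target
// that selects an i32 loop type and a residual sequence of {i16, i8}:
// CopyLen == 7 gives LoopOpSize == 4 and LoopEndCount == 1, so the loop
// copies bytes 0-3; RemainingBytes == 3 is then covered by an i16 at GEP
// index 2 (bytes 4-5) and an i8 at index 6 (byte 6).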
if (RemainingBytes) {
IRBuilder<> RBuilder(PostLoopBB ? PostLoopBB->getFirstNonPHI()
: InsertBefore);
// Update the alignment based on the copy size used in the loop body.
SrcAlign = std::min(SrcAlign, LoopOpSize);
DestAlign = std::min(DestAlign, LoopOpSize);
SmallVector<Type *, 5> RemainingOps;
TTI.getMemcpyLoopResidualLoweringType(RemainingOps, Ctx, RemainingBytes,
SrcAlign, DestAlign);
for (auto OpTy : RemainingOps) {
// Calculate the new index
unsigned OperandSize = getLoopOperandSizeInBytes(OpTy);
uint64_t GepIndex = BytesCopied / OperandSize;
assert(GepIndex * OperandSize == BytesCopied &&
"Division should have no Remainder!");
// Cast source to operand type and load
PointerType *SrcPtrType = PointerType::get(OpTy, SrcAS);
Value *CastedSrc = SrcAddr->getType() == SrcPtrType
? SrcAddr
: RBuilder.CreateBitCast(SrcAddr, SrcPtrType);
Value *SrcGEP = RBuilder.CreateInBoundsGEP(
OpTy, CastedSrc, ConstantInt::get(TypeOfCopyLen, GepIndex));
Value *Load = RBuilder.CreateLoad(SrcGEP, SrcIsVolatile);
// Cast destination to operand type and store.
PointerType *DstPtrType = PointerType::get(OpTy, DstAS);
Value *CastedDst = DstAddr->getType() == DstPtrType
? DstAddr
: RBuilder.CreateBitCast(DstAddr, DstPtrType);
Value *DstGEP = RBuilder.CreateInBoundsGEP(
OpTy, CastedDst, ConstantInt::get(TypeOfCopyLen, GepIndex));
RBuilder.CreateStore(Load, DstGEP, DstIsVolatile);
BytesCopied += OperandSize;
}
}
assert(BytesCopied == CopyLen->getZExtValue() &&
"Bytes copied should match size in the call!");
}
void llvm::createMemCpyLoopUnknownSize(Instruction *InsertBefore,
Value *SrcAddr, Value *DstAddr,
Value *CopyLen, unsigned SrcAlign,
unsigned DestAlign, bool SrcIsVolatile,
bool DstIsVolatile,
const TargetTransformInfo &TTI) {
BasicBlock *PreLoopBB = InsertBefore->getParent();
BasicBlock *PostLoopBB =
PreLoopBB->splitBasicBlock(InsertBefore, "post-loop-memcpy-expansion");
Function *ParentFunc = PreLoopBB->getParent();
LLVMContext &Ctx = PreLoopBB->getContext();
Type *LoopOpType =
TTI.getMemcpyLoopLoweringType(Ctx, CopyLen, SrcAlign, DestAlign);
unsigned LoopOpSize = getLoopOperandSizeInBytes(LoopOpType);
IRBuilder<> PLBuilder(PreLoopBB->getTerminator());
unsigned SrcAS = cast<PointerType>(SrcAddr->getType())->getAddressSpace();
unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();
PointerType *SrcOpType = PointerType::get(LoopOpType, SrcAS);
PointerType *DstOpType = PointerType::get(LoopOpType, DstAS);
if (SrcAddr->getType() != SrcOpType) {
SrcAddr = PLBuilder.CreateBitCast(SrcAddr, SrcOpType);
}
if (DstAddr->getType() != DstOpType) {
DstAddr = PLBuilder.CreateBitCast(DstAddr, DstOpType);
}
// Calculate the loop trip count, and remaining bytes to copy after the loop.
Type *CopyLenType = CopyLen->getType();
IntegerType *ILengthType = dyn_cast<IntegerType>(CopyLenType);
assert(ILengthType &&
"expected size argument to memcpy to be an integer type!");
ConstantInt *CILoopOpSize = ConstantInt::get(ILengthType, LoopOpSize);
Value *RuntimeLoopCount = PLBuilder.CreateUDiv(CopyLen, CILoopOpSize);
Value *RuntimeResidual = PLBuilder.CreateURem(CopyLen, CILoopOpSize);
Value *RuntimeBytesCopied = PLBuilder.CreateSub(CopyLen, RuntimeResidual);
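// For example, with an i32 loop operand type and n == 10 at runtime:
// RuntimeLoopCount == 2, RuntimeResidual == 2 and RuntimeBytesCopied == 8,
// so the main loop copies two i32 elements and the residual loop copies
// bytes 8 and 9. With the default i8 type, RuntimeResidual is always zero.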
BasicBlock *LoopBB =
BasicBlock::Create(Ctx, "loop-memcpy-expansion", ParentFunc, nullptr);
IRBuilder<> LoopBuilder(LoopBB);
PHINode *LoopIndex = LoopBuilder.CreatePHI(CopyLenType, 2, "loop-index");
LoopIndex->addIncoming(ConstantInt::get(CopyLenType, 0U), PreLoopBB);
Value *SrcGEP = LoopBuilder.CreateInBoundsGEP(LoopOpType, SrcAddr, LoopIndex);
Value *Load = LoopBuilder.CreateLoad(SrcGEP, SrcIsVolatile);
Value *DstGEP = LoopBuilder.CreateInBoundsGEP(LoopOpType, DstAddr, LoopIndex);
LoopBuilder.CreateStore(Load, DstGEP, DstIsVolatile);
Value *NewIndex =
LoopBuilder.CreateAdd(LoopIndex, ConstantInt::get(CopyLenType, 1U));
LoopIndex->addIncoming(NewIndex, LoopBB);
Type *Int8Type = Type::getInt8Ty(Ctx);
if (LoopOpType != Int8Type) {
// Loop body for the residual copy.
BasicBlock *ResLoopBB = BasicBlock::Create(Ctx, "loop-memcpy-residual",
PreLoopBB->getParent(), nullptr);
// Residual loop header.
BasicBlock *ResHeaderBB = BasicBlock::Create(
Ctx, "loop-memcpy-residual-header", PreLoopBB->getParent(), nullptr);
// Need to update the pre-loop basic block to branch to the correct place:
// branch to the main loop if the count is non-zero, to the residual loop if
// the copy size is smaller than one iteration of the main loop but non-zero,
// and past the residual loop if the memcpy size is zero.
ConstantInt *Zero = ConstantInt::get(ILengthType, 0U);
PLBuilder.CreateCondBr(PLBuilder.CreateICmpNE(RuntimeLoopCount, Zero),
LoopBB, ResHeaderBB);
PreLoopBB->getTerminator()->eraseFromParent();
LoopBuilder.CreateCondBr(
LoopBuilder.CreateICmpULT(NewIndex, RuntimeLoopCount), LoopBB,
ResHeaderBB);
// Determine if we need to branch to the residual loop or bypass it.
IRBuilder<> RHBuilder(ResHeaderBB);
RHBuilder.CreateCondBr(RHBuilder.CreateICmpNE(RuntimeResidual, Zero),
ResLoopBB, PostLoopBB);
// Copy the residual with a single-byte load/store loop.
IRBuilder<> ResBuilder(ResLoopBB);
PHINode *ResidualIndex =
ResBuilder.CreatePHI(CopyLenType, 2, "residual-loop-index");
ResidualIndex->addIncoming(Zero, ResHeaderBB);
Value *SrcAsInt8 =
ResBuilder.CreateBitCast(SrcAddr, PointerType::get(Int8Type, SrcAS));
Value *DstAsInt8 =
ResBuilder.CreateBitCast(DstAddr, PointerType::get(Int8Type, DstAS));
Value *FullOffset = ResBuilder.CreateAdd(RuntimeBytesCopied, ResidualIndex);
Value *SrcGEP =
ResBuilder.CreateInBoundsGEP(Int8Type, SrcAsInt8, FullOffset);
Value *Load = ResBuilder.CreateLoad(SrcGEP, SrcIsVolatile);
Value *DstGEP =
ResBuilder.CreateInBoundsGEP(Int8Type, DstAsInt8, FullOffset);
ResBuilder.CreateStore(Load, DstGEP, DstIsVolatile);
Value *ResNewIndex =
ResBuilder.CreateAdd(ResidualIndex, ConstantInt::get(CopyLenType, 1U));
ResidualIndex->addIncoming(ResNewIndex, ResLoopBB);
// Create the loop branch condition.
ResBuilder.CreateCondBr(
ResBuilder.CreateICmpULT(ResNewIndex, RuntimeResidual), ResLoopBB,
PostLoopBB);
} else {
// In this case the loop operand type was a byte, and there is no need for a
// residual loop to copy the remaining memory after the main loop.
// We do however need to patch up the control flow by creating the
// terminators for the preloop block and the memcpy loop.
ConstantInt *Zero = ConstantInt::get(ILengthType, 0U);
PLBuilder.CreateCondBr(PLBuilder.CreateICmpNE(RuntimeLoopCount, Zero),
LoopBB, PostLoopBB);
PreLoopBB->getTerminator()->eraseFromParent();
LoopBuilder.CreateCondBr(
LoopBuilder.CreateICmpULT(NewIndex, RuntimeLoopCount), LoopBB,
PostLoopBB);
}
}
void llvm::createMemCpyLoop(Instruction *InsertBefore,
Value *SrcAddr, Value *DstAddr, Value *CopyLen,
unsigned SrcAlign, unsigned DestAlign,
@@ -208,15 +452,41 @@ static void createMemSetLoop(Instruction *InsertBefore,
NewBB);
}
void llvm::expandMemCpyAsLoop(MemCpyInst *Memcpy) {
createMemCpyLoop(/* InsertBefore */ Memcpy,
/* SrcAddr */ Memcpy->getRawSource(),
/* DstAddr */ Memcpy->getRawDest(),
/* CopyLen */ Memcpy->getLength(),
/* SrcAlign */ Memcpy->getAlignment(),
/* DestAlign */ Memcpy->getAlignment(),
/* SrcIsVolatile */ Memcpy->isVolatile(),
/* DstIsVolatile */ Memcpy->isVolatile());
void llvm::expandMemCpyAsLoop(MemCpyInst *Memcpy,
const TargetTransformInfo &TTI) {
// Original implementation
if (!TTI.useWideIRMemcpyLoopLowering()) {
createMemCpyLoop(/* InsertBefore */ Memcpy,
/* SrcAddr */ Memcpy->getRawSource(),
/* DstAddr */ Memcpy->getRawDest(),
/* CopyLen */ Memcpy->getLength(),
/* SrcAlign */ Memcpy->getAlignment(),
/* DestAlign */ Memcpy->getAlignment(),
/* SrcIsVolatile */ Memcpy->isVolatile(),
/* DstIsVolatile */ Memcpy->isVolatile());
} else {
if (ConstantInt *CI = dyn_cast<ConstantInt>(Memcpy->getLength())) {
createMemCpyLoopKnownSize(/* InsertBefore */ Memcpy,
/* SrcAddr */ Memcpy->getRawSource(),
/* DstAddr */ Memcpy->getRawDest(),
/* CopyLen */ CI,
/* SrcAlign */ Memcpy->getAlignment(),
/* DestAlign */ Memcpy->getAlignment(),
/* SrcIsVolatile */ Memcpy->isVolatile(),
/* DstIsVolatile */ Memcpy->isVolatile(),
/* TargetTransformInfo */ TTI);
} else {
createMemCpyLoopUnknownSize(/* InsertBefore */ Memcpy,
/* SrcAddr */ Memcpy->getRawSource(),
/* DstAddr */ Memcpy->getRawDest(),
/* CopyLen */ Memcpy->getLength(),
/* SrcAlign */ Memcpy->getAlignment(),
/* DestAlign */ Memcpy->getAlignment(),
/* SrcIsVolatile */ Memcpy->isVolatile(),
/* DstIsVolatile */ Memcpy->isVolatile(),
/* TargetTransformInfo */ TTI);
}
}
}
void llvm::expandMemMoveAsLoop(MemMoveInst *Memmove) {

@@ -1,4 +1,5 @@
; RUN: opt -S -amdgpu-lower-intrinsics %s | FileCheck -check-prefix=OPT %s
; RUN: opt -S -amdgpu-lower-intrinsics -use-wide-memcpy-loop-lowering=true %s | FileCheck -check-prefix=WOPT %s
declare void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* nocapture, i8 addrspace(1)* nocapture readonly, i64, i32, i1) #1
declare void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* nocapture, i8 addrspace(3)* nocapture readonly, i32, i32, i1) #1
@@ -21,6 +22,17 @@ define amdgpu_kernel void @max_size_small_static_memcpy_caller0(i8 addrspace(1)*
; OPT-NEXT: load i8
; OPT: getelementptr
; OPT-NEXT: store i8
; WOPT-LABEL: @min_size_large_static_memcpy_caller0(
; WOPT-NOT: call
; WOPT: br label %load-store-loop
; WOPT: [[T1:%[0-9]+]] = getelementptr inbounds i8, i8 addrspace(1)* %src, i64 %loop-index
; WOPT-NEXT: [[T2:%[0-9]+]] = load i8, i8 addrspace(1)* [[T1]]
; WOPT-NEXT: [[T3:%[0-9]+]] = getelementptr inbounds i8, i8 addrspace(1)* %dst, i64 %loop-index
; WOPT-NEXT: store i8 [[T2]], i8 addrspace(1)* [[T3]]
; WOPT-NEXT: [[T4:%[0-9]+]] = add i64 %loop-index, 1
; WOPT-NEXT: [[T5:%[0-9]+]] = icmp ult i64 [[T4]], 1025
; WOPT-NEXT: br i1 [[T5]], label %load-store-loop, label %memcpy-split
define amdgpu_kernel void @min_size_large_static_memcpy_caller0(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 1025, i32 1, i1 false)
ret void

@@ -1,5 +1,6 @@
; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 -O0 | FileCheck %s --check-prefix PTX
; RUN: opt < %s -S -nvptx-lower-aggr-copies | FileCheck %s --check-prefix IR
; RUN: opt < %s -S -nvptx-lower-aggr-copies -use-wide-memcpy-loop-lowering=true | FileCheck %s --check-prefix WIR
; Verify that the NVPTXLowerAggrCopies pass works as expected - calls to
; llvm.mem* intrinsics get lowered to loops.
@@ -32,6 +33,23 @@ entry:
; PTX: add.s64 %rd[[COUNTER:[0-9]+]], %rd{{[0-9]+}}, 1
; PTX: setp.lt.u64 %p[[PRED:[0-9]+]], %rd[[COUNTER]], %rd
; PTX: @%p[[PRED]] bra LBB[[LABEL]]
; WIR-LABEL: @memcpy_caller
; WIR: entry:
; WIR: [[LoopCount:%[0-9]+]] = udiv i64 %n, 1
; WIR: [[ResidualSize:%[0-9]+]] = urem i64 %n, 1
; WIR: [[Cond:%[0-9]+]] = icmp ne i64 [[LoopCount]], 0
; WIR: br i1 [[Cond]], label %loop-memcpy-expansion, label %post-loop-memcpy-expansion
; WIR: loop-memcpy-expansion:
; WIR: %loop-index = phi i64 [ 0, %entry ], [ [[IndexInc:%[0-9]+]], %loop-memcpy-expansion ]
; WIR: [[SrcGep:%[0-9]+]] = getelementptr inbounds i8, i8* %src, i64 %loop-index
; WIR: [[Load:%[0-9]+]] = load i8, i8* [[SrcGep]]
; WIR: [[DstGep:%[0-9]+]] = getelementptr inbounds i8, i8* %dst, i64 %loop-index
; WIR: store i8 [[Load]], i8* [[DstGep]]
; WIR: [[IndexInc]] = add i64 %loop-index, 1
; WIR: [[Cond2:%[0-9]+]] = icmp ult i64 [[IndexInc]], [[LoopCount]]
; WIR: br i1 [[Cond2]], label %loop-memcpy-expansion, label %post-loop-memcpy-expansion
}
define i8* @memcpy_volatile_caller(i8* %dst, i8* %src, i64 %n) #0 {
@@ -50,6 +68,23 @@ entry:
; PTX: add.s64 %rd[[COUNTER:[0-9]+]], %rd{{[0-9]+}}, 1
; PTX: setp.lt.u64 %p[[PRED:[0-9]+]], %rd[[COUNTER]], %rd
; PTX: @%p[[PRED]] bra LBB[[LABEL]]
; WIR-LABEL: @memcpy_volatile_caller
; WIR: entry:
; WIR: [[LoopCount:%[0-9]+]] = udiv i64 %n, 1
; WIR: [[ResidualSize:%[0-9]+]] = urem i64 %n, 1
; WIR: [[Cond:%[0-9]+]] = icmp ne i64 [[LoopCount]], 0
; WIR: br i1 [[Cond]], label %loop-memcpy-expansion, label %post-loop-memcpy-expansion
; WIR: loop-memcpy-expansion:
; WIR: %loop-index = phi i64 [ 0, %entry ], [ [[IndexInc:%[0-9]+]], %loop-memcpy-expansion ]
; WIR: [[SrcGep:%[0-9]+]] = getelementptr inbounds i8, i8* %src, i64 %loop-index
; WIR: [[Load:%[0-9]+]] = load volatile i8, i8* [[SrcGep]]
; WIR: [[DstGep:%[0-9]+]] = getelementptr inbounds i8, i8* %dst, i64 %loop-index
; WIR: store volatile i8 [[Load]], i8* [[DstGep]]
; WIR: [[IndexInc]] = add i64 %loop-index, 1
; WIR: [[Cond2:%[0-9]+]] = icmp ult i64 [[IndexInc]], [[LoopCount]]
; WIR: br i1 [[Cond2]], label %loop-memcpy-expansion, label %post-loop-memcpy-expansion
}
define i8* @memcpy_casting_caller(i32* %dst, i32* %src, i64 %n) #0 {
@@ -65,6 +100,32 @@ entry:
; IR: [[SRCCAST:%[0-9]+]] = bitcast i32* %src to i8*
; IR: getelementptr inbounds i8, i8* [[SRCCAST]]
; IR: getelementptr inbounds i8, i8* [[DSTCAST]]
; WIR-LABEL: @memcpy_casting_caller
; WIR: [[DSTCAST:%[0-9]+]] = bitcast i32* %dst to i8*
; WIR: [[SRCCAST:%[0-9]+]] = bitcast i32* %src to i8*
; WIR: getelementptr inbounds i8, i8* [[SRCCAST]]
; WIR: getelementptr inbounds i8, i8* [[DSTCAST]]
}
define i8* @memcpy_known_size(i8* %dst, i8* %src) {
entry:
tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 144, i32 1, i1 false)
ret i8* %dst
; Check that calls with compile-time constant size are handled correctly
; WIR-LABEL: @memcpy_known_size
; WIR: entry:
; WIR: br label %load-store-loop
; WIR: load-store-loop:
; WIR: %loop-index = phi i64 [ 0, %entry ], [ [[IndexInc:%[0-9]+]], %load-store-loop ]
; WIR: [[SrcGep:%[0-9]+]] = getelementptr inbounds i8, i8* %src, i64 %loop-index
; WIR: [[Load:%[0-9]+]] = load i8, i8* [[SrcGep]]
; WIR: [[DstGep:%[0-9]+]] = getelementptr inbounds i8, i8* %dst, i64 %loop-index
; WIR: store i8 [[Load]], i8* [[DstGep]]
; WIR: [[IndexInc]] = add i64 %loop-index, 1
; WIR: [[Cond:%[0-9]+]] = icmp ult i64 %3, 144
; WIR: br i1 [[Cond]], label %load-store-loop, label %memcpy-split
}
define i8* @memset_caller(i8* %dst, i32 %c, i64 %n) #0 {