CodeGen: Expand memory intrinsics in PreISelIntrinsicLowering
Expand large or unknown size memory intrinsics into loops in the default lowering pipeline if the target doesn't have the corresponding libfunc. Previously AMDGPU had a custom pass which existed to call the expansion utilities.

With a default no-libcall option, we can remove the libfunc checks in LoopIdiomRecognize for these, which never made any sense. This also provides a path to lifting the immarg restriction on llvm.memcpy.inline.

There seems to be a bug where TLI reports functions as available if you use -march and not -mtriple.
This commit is contained in:
parent 2e16df352c
commit 3c848194f2
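The heart of the change, distilled: a mem intrinsic call is turned into an explicit IR loop when its size is unknown or above a TTI-provided cutoff, unless the target reports the matching libfunc. Below is a minimal C++ sketch of that decision for the memcpy case only, using APIs that appear in this diff; `maybeExpandMemCpy` is an illustrative name, and the `-mem-intrinsic-expand-size` command-line override the patch adds is omitted here.

```cpp
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Transforms/Utils/LowerMemIntrinsics.h"

using namespace llvm;

// Sketch of the per-call decision this commit installs in the default
// lowering pipeline (memcpy only; memmove/memset are analogous).
static bool maybeExpandMemCpy(MemCpyInst *MC, const TargetTransformInfo &TTI,
                              const TargetLibraryInfo &TLI) {
  uint64_t Threshold = TTI.getMaxMemIntrinsicInlineSizeThreshold();
  if (auto *Len = dyn_cast<ConstantInt>(MC->getLength())) {
    // Known sizes at or below the cutoff stay as intrinsics; a cutoff of 0
    // forces expansion of every fixed-size call.
    if (Threshold != 0 && Len->getZExtValue() <= Threshold)
      return false;
  }
  // Prefer the library call when the target actually provides memcpy.
  if (TLI.has(LibFunc_memcpy))
    return false;
  expandMemCpyAsLoop(MC, TTI); // emit an explicit load/store loop
  MC->eraseFromParent();
  return true;
}
```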
@@ -268,14 +268,12 @@ Changes to the C API

* ``LLVMConstSelect``

Changes to the FastISel infrastructure
--------------------------------------

* ...

Changes to the DAG infrastructure
---------------------------------

Changes to the CodeGen infrastructure
-------------------------------------

* ``llvm.memcpy``, ``llvm.memmove`` and ``llvm.memset`` are now
  expanded into loops by default for targets which do not report that the
  corresponding library function is available.

Changes to the Metadata Info
---------------------------------
@@ -356,6 +356,10 @@ public:
  /// source/destination type and alignment and the number of bytes copied.
  InstructionCost getMemcpyCost(const Instruction *I) const;

  /// Returns the maximum memset / memcpy size in bytes that still makes it
  /// profitable to inline the call.
  uint64_t getMaxMemIntrinsicInlineSizeThreshold() const;

  /// \return The estimated number of case clusters when lowering \p 'SI'.
  /// \p JTSize Set a jump table size only when \p SI is suitable for a jump
  /// table.
@@ -1673,6 +1677,7 @@ public:
  virtual unsigned adjustInliningThreshold(const CallBase *CB) = 0;
  virtual int getInlinerVectorBonusPercent() const = 0;
  virtual InstructionCost getMemcpyCost(const Instruction *I) = 0;
  virtual uint64_t getMaxMemIntrinsicInlineSizeThreshold() const = 0;
  virtual unsigned
  getEstimatedNumberOfCaseClusters(const SwitchInst &SI, unsigned &JTSize,
                                   ProfileSummaryInfo *PSI,
@@ -2044,6 +2049,11 @@ public:
  InstructionCost getMemcpyCost(const Instruction *I) override {
    return Impl.getMemcpyCost(I);
  }

  uint64_t getMaxMemIntrinsicInlineSizeThreshold() const override {
    return Impl.getMaxMemIntrinsicInlineSizeThreshold();
  }

  InstructionCost getInstructionCost(const User *U,
                                     ArrayRef<const Value *> Operands,
                                     TargetCostKind CostKind) override {

@@ -77,6 +77,10 @@ public:
    return TTI::TCC_Expensive;
  }

  uint64_t getMaxMemIntrinsicInlineSizeThreshold() const {
    return 64;
  }

  // Although this default value is arbitrary, it is not random. It is assumed
  // that a condition that evaluates the same way by a higher percentage than
  // this is best represented as control flow. Therefore, the default value N

@@ -1035,6 +1035,10 @@ InstructionCost TargetTransformInfo::getMemcpyCost(const Instruction *I) const {
  return Cost;
}

uint64_t TargetTransformInfo::getMaxMemIntrinsicInlineSizeThreshold() const {
  return TTIImpl->getMaxMemIntrinsicInlineSizeThreshold();
}

InstructionCost TargetTransformInfo::getArithmeticReductionCost(
    unsigned Opcode, VectorType *Ty, std::optional<FastMathFlags> FMF,
    TTI::TargetCostKind CostKind) const {
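The default of 64 bytes above, combined with the sentinel handling added in PreISelIntrinsicLowering.cpp below, yields a simple rule: unknown sizes always expand, a threshold of 0 forces expansion, and otherwise only sizes strictly above the threshold expand. A standalone sketch of just that arithmetic (no LLVM dependency; names are illustrative):

```cpp
#include <cstdint>
#include <cstdio>
#include <optional>

// Mirrors shouldExpandMemIntrinsicWithSize below; nullopt models a
// non-constant length operand.
static bool shouldExpand(std::optional<uint64_t> Size, uint64_t Threshold) {
  if (!Size)
    return true; // dynamic length: always expand in IR
  return *Size > Threshold || Threshold == 0;
}

int main() {
  // With the default cutoff of 64: a 4-byte call stays, a 128-byte call
  // expands, and a cutoff of 0 expands even a size-0 call.
  std::printf("%d %d %d\n", shouldExpand(4, 64), shouldExpand(128, 64),
              shouldExpand(0, 0)); // prints: 0 1 1
}
```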
@@ -6,14 +6,16 @@
//
//===----------------------------------------------------------------------===//
//
// This pass implements IR lowering for the llvm.load.relative and llvm.objc.*
// intrinsics.
// This pass implements IR lowering for the llvm.memcpy, llvm.memmove,
// llvm.memset, llvm.load.relative and llvm.objc.* intrinsics.
//
//===----------------------------------------------------------------------===//

#include "llvm/CodeGen/PreISelIntrinsicLowering.h"
#include "llvm/Analysis/ObjCARCInstKind.h"
#include "llvm/Analysis/ObjCARCUtil.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
@@ -24,9 +26,44 @@
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Transforms/Utils/LowerMemIntrinsics.h"

using namespace llvm;

/// Threshold to leave statically sized memory intrinsic calls. Calls of known
/// size larger than this (and calls of unknown size) will be expanded by the
/// pass. Calls of lower known size will be left for expansion in codegen.
static cl::opt<int64_t> MemIntrinsicExpandSizeThresholdOpt(
    "mem-intrinsic-expand-size",
    cl::desc("Set minimum mem intrinsic size to expand in IR"), cl::init(-1),
    cl::Hidden);

namespace {

struct PreISelIntrinsicLowering {
  const function_ref<TargetTransformInfo &(Function &)> LookupTTI;
  const function_ref<TargetLibraryInfo &(Function &)> LookupLibInfo;

  /// If this is true, assume it's preferable to leave memory intrinsic calls
  /// for replacement with a library call later. Otherwise this depends on
  /// TargetLibraryInfo availability of the corresponding function.
  const bool UseMemIntrinsicLibFunc;

  explicit PreISelIntrinsicLowering(
      function_ref<TargetTransformInfo &(Function &)> LookupTTI_,
      function_ref<TargetLibraryInfo &(Function &)> LookupLibInfo_,
      bool UseMemIntrinsicLibFunc_ = true)
      : LookupTTI(LookupTTI_), LookupLibInfo(LookupLibInfo_),
        UseMemIntrinsicLibFunc(UseMemIntrinsicLibFunc_) {}

  static bool shouldExpandMemIntrinsicWithSize(Value *Size,
                                               const TargetTransformInfo &TTI);
  bool expandMemIntrinsicUses(Function &F) const;
  bool lowerIntrinsics(Module &M) const;
};

} // namespace

static bool lowerLoadRelative(Function &F) {
  if (F.use_empty())
    return false;
@@ -133,12 +170,100 @@ static bool lowerObjCCall(Function &F, const char *NewFn,
  return true;
}

static bool lowerIntrinsics(Module &M) {
// TODO: Should refine based on estimated number of accesses (e.g. does it
// require splitting based on alignment)
bool PreISelIntrinsicLowering::shouldExpandMemIntrinsicWithSize(
    Value *Size, const TargetTransformInfo &TTI) {
  ConstantInt *CI = dyn_cast<ConstantInt>(Size);
  if (!CI)
    return true;
  uint64_t Threshold = MemIntrinsicExpandSizeThresholdOpt.getNumOccurrences()
                           ? MemIntrinsicExpandSizeThresholdOpt
                           : TTI.getMaxMemIntrinsicInlineSizeThreshold();
  uint64_t SizeVal = CI->getZExtValue();

  // Treat a threshold of 0 as a special case to force expansion of all
  // intrinsics, including size 0.
  return SizeVal > Threshold || Threshold == 0;
}

// TODO: Handle atomic memcpy and memcpy.inline
// TODO: Pass ScalarEvolution
bool PreISelIntrinsicLowering::expandMemIntrinsicUses(Function &F) const {
  Intrinsic::ID ID = F.getIntrinsicID();
  bool Changed = false;

  for (User *U : llvm::make_early_inc_range(F.users())) {
    Instruction *Inst = cast<Instruction>(U);

    switch (ID) {
    case Intrinsic::memcpy: {
      auto *Memcpy = cast<MemCpyInst>(Inst);
      Function *ParentFunc = Memcpy->getFunction();
      const TargetTransformInfo &TTI = LookupTTI(*ParentFunc);
      if (shouldExpandMemIntrinsicWithSize(Memcpy->getLength(), TTI)) {
        if (UseMemIntrinsicLibFunc &&
            LookupLibInfo(*ParentFunc).has(LibFunc_memcpy))
          break;

        expandMemCpyAsLoop(Memcpy, TTI);
        Changed = true;
        Memcpy->eraseFromParent();
      }

      break;
    }
    case Intrinsic::memmove: {
      auto *Memmove = cast<MemMoveInst>(Inst);
      Function *ParentFunc = Memmove->getFunction();
      const TargetTransformInfo &TTI = LookupTTI(*ParentFunc);
      if (shouldExpandMemIntrinsicWithSize(Memmove->getLength(), TTI)) {
        if (UseMemIntrinsicLibFunc &&
            LookupLibInfo(*ParentFunc).has(LibFunc_memmove))
          break;

        expandMemMoveAsLoop(Memmove);
        Changed = true;
        Memmove->eraseFromParent();
      }

      break;
    }
    case Intrinsic::memset: {
      auto *Memset = cast<MemSetInst>(Inst);
      Function *ParentFunc = Memset->getFunction();
      const TargetTransformInfo &TTI = LookupTTI(*ParentFunc);
      if (shouldExpandMemIntrinsicWithSize(Memset->getLength(), TTI)) {
        if (UseMemIntrinsicLibFunc &&
            LookupLibInfo(*Memset->getFunction()).has(LibFunc_memset))
          break;

        expandMemSetAsLoop(Memset);
        Changed = true;
        Memset->eraseFromParent();
      }

      break;
    }
    default:
      llvm_unreachable("unhandled intrinsic");
    }
  }

  return Changed;
}

bool PreISelIntrinsicLowering::lowerIntrinsics(Module &M) const {
  bool Changed = false;
  for (Function &F : M) {
    switch (F.getIntrinsicID()) {
    default:
      break;
    case Intrinsic::memcpy:
    case Intrinsic::memmove:
    case Intrinsic::memset:
      Changed |= expandMemIntrinsicUses(F);
      break;
    case Intrinsic::load_relative:
      Changed |= lowerLoadRelative(F);
      break;
@@ -230,7 +355,23 @@ public:

  PreISelIntrinsicLoweringLegacyPass() : ModulePass(ID) {}

  bool runOnModule(Module &M) override { return lowerIntrinsics(M); }
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<TargetLibraryInfoWrapperPass>();
    AU.addRequired<TargetTransformInfoWrapperPass>();
  }

  bool runOnModule(Module &M) override {
    auto LookupTTI = [this](Function &F) -> TargetTransformInfo & {
      return this->getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
    };

    auto LookupTLI = [this](Function &F) -> TargetLibraryInfo & {
      return this->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
    };

    PreISelIntrinsicLowering Lowering(LookupTTI, LookupTLI);
    return Lowering.lowerIntrinsics(M);
  }
};

} // end anonymous namespace
@@ -247,7 +388,18 @@ ModulePass *llvm::createPreISelIntrinsicLoweringPass() {

PreservedAnalyses PreISelIntrinsicLoweringPass::run(Module &M,
                                                    ModuleAnalysisManager &AM) {
  if (!lowerIntrinsics(M))
  auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();

  auto LookupTLI = [&FAM](Function &F) -> TargetLibraryInfo & {
    return FAM.getResult<TargetLibraryAnalysis>(F);
  };

  auto LookupTTI = [&FAM](Function &F) -> TargetTransformInfo & {
    return FAM.getResult<TargetIRAnalysis>(F);
  };

  PreISelIntrinsicLowering Lowering(LookupTTI, LookupTLI);
  if (!Lowering.lowerIntrinsics(M))
    return PreservedAnalyses::all();
  else
    return PreservedAnalyses::none();
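Before the pass-manager wiring below, a hedged sketch of what the reworked new-PM entry point now expects from its caller: the module proxy must be able to hand back per-function TargetIRAnalysis and TargetLibraryAnalysis results. Standard PassBuilder boilerplate satisfies that; this is not part of the patch, and `runLowering` is an illustrative name.

```cpp
#include "llvm/CodeGen/PreISelIntrinsicLowering.h"
#include "llvm/IR/Module.h"
#include "llvm/Passes/PassBuilder.h"

using namespace llvm;

// Run the module pass standalone; the cross-registered proxies let its FAM
// lookups (TargetIRAnalysis, TargetLibraryAnalysis) succeed.
static void runLowering(Module &M) {
  PassBuilder PB;
  LoopAnalysisManager LAM;
  FunctionAnalysisManager FAM;
  CGSCCAnalysisManager CGAM;
  ModuleAnalysisManager MAM;
  PB.registerModuleAnalyses(MAM);
  PB.registerCGSCCAnalyses(CGAM);
  PB.registerFunctionAnalyses(FAM);
  PB.registerLoopAnalyses(LAM);
  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

  ModulePassManager MPM;
  MPM.addPass(PreISelIntrinsicLoweringPass());
  MPM.run(M, MAM);
}
```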
@@ -1088,8 +1088,8 @@ bool TargetPassConfig::addISelPasses() {
  if (TM->useEmulatedTLS())
    addPass(createLowerEmuTLSPass());

  addPass(createPreISelIntrinsicLoweringPass());
  PM->add(createTargetTransformInfoWrapperPass(TM->getTargetIRAnalysis()));
  addPass(createPreISelIntrinsicLoweringPass());
  addPass(createExpandLargeDivRemPass());
  addPass(createExpandLargeFpConvertPass());
  addIRPasses();
@@ -90,10 +90,6 @@ FunctionPass *createAMDGPUAtomicOptimizerPass(ScanOptions ScanStrategy);
void initializeAMDGPUAtomicOptimizerPass(PassRegistry &);
extern char &AMDGPUAtomicOptimizerID;

ModulePass *createAMDGPULowerIntrinsicsPass();
void initializeAMDGPULowerIntrinsicsPass(PassRegistry &);
extern char &AMDGPULowerIntrinsicsID;

ModulePass *createAMDGPUCtorDtorLoweringLegacyPass();
void initializeAMDGPUCtorDtorLoweringLegacyPass(PassRegistry &);
extern char &AMDGPUCtorDtorLoweringLegacyPassID;
@@ -1,144 +0,0 @@
//===-- AMDGPULowerIntrinsics.cpp -----------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Utils/LowerMemIntrinsics.h"

#define DEBUG_TYPE "amdgpu-lower-intrinsics"

using namespace llvm;

namespace {

static int MaxStaticSize;

static cl::opt<int, true> MemIntrinsicExpandSizeThresholdOpt(
    "amdgpu-mem-intrinsic-expand-size",
    cl::desc("Set minimum mem intrinsic size to expand in IR"),
    cl::location(MaxStaticSize),
    cl::init(1024),
    cl::Hidden);

class AMDGPULowerIntrinsics : public ModulePass {
public:
  static char ID;

  AMDGPULowerIntrinsics() : ModulePass(ID) {}

  bool runOnModule(Module &M) override;
  bool expandMemIntrinsicUses(Function &F);
  StringRef getPassName() const override {
    return "AMDGPU Lower Intrinsics";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<TargetTransformInfoWrapperPass>();
  }
};

}

char AMDGPULowerIntrinsics::ID = 0;

char &llvm::AMDGPULowerIntrinsicsID = AMDGPULowerIntrinsics::ID;

INITIALIZE_PASS(AMDGPULowerIntrinsics, DEBUG_TYPE, "Lower intrinsics", false,
                false)

// TODO: Should refine based on estimated number of accesses (e.g. does it
// require splitting based on alignment)
static bool shouldExpandOperationWithSize(Value *Size) {
  ConstantInt *CI = dyn_cast<ConstantInt>(Size);
  return !CI || (CI->getSExtValue() > MaxStaticSize);
}

bool AMDGPULowerIntrinsics::expandMemIntrinsicUses(Function &F) {
  Intrinsic::ID ID = F.getIntrinsicID();
  bool Changed = false;

  for (User *U : llvm::make_early_inc_range(F.users())) {
    Instruction *Inst = cast<Instruction>(U);

    switch (ID) {
    case Intrinsic::memcpy: {
      auto *Memcpy = cast<MemCpyInst>(Inst);
      if (shouldExpandOperationWithSize(Memcpy->getLength())) {
        Function *ParentFunc = Memcpy->getParent()->getParent();
        const TargetTransformInfo &TTI =
            getAnalysis<TargetTransformInfoWrapperPass>().getTTI(*ParentFunc);
        expandMemCpyAsLoop(Memcpy, TTI);
        Changed = true;
        Memcpy->eraseFromParent();
      }

      break;
    }
    case Intrinsic::memmove: {
      auto *Memmove = cast<MemMoveInst>(Inst);
      if (shouldExpandOperationWithSize(Memmove->getLength())) {
        expandMemMoveAsLoop(Memmove);
        Changed = true;
        Memmove->eraseFromParent();
      }

      break;
    }
    case Intrinsic::memset: {
      auto *Memset = cast<MemSetInst>(Inst);
      if (shouldExpandOperationWithSize(Memset->getLength())) {
        expandMemSetAsLoop(Memset);
        Changed = true;
        Memset->eraseFromParent();
      }

      break;
    }
    default:
      break;
    }
  }

  return Changed;
}

bool AMDGPULowerIntrinsics::runOnModule(Module &M) {
  bool Changed = false;

  for (Function &F : M) {
    if (!F.isDeclaration())
      continue;

    switch (F.getIntrinsicID()) {
    case Intrinsic::memcpy:
    case Intrinsic::memmove:
    case Intrinsic::memset:
      if (expandMemIntrinsicUses(F))
        Changed = true;
      break;
    default:
      break;
    }
  }

  return Changed;
}

ModulePass *llvm::createAMDGPULowerIntrinsicsPass() {
  return new AMDGPULowerIntrinsics();
}
@@ -388,7 +388,6 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
  initializeAMDGPULowerKernelArgumentsPass(*PR);
  initializeAMDGPUPromoteKernelArgumentsPass(*PR);
  initializeAMDGPULowerKernelAttributesPass(*PR);
  initializeAMDGPULowerIntrinsicsPass(*PR);
  initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR);
  initializeAMDGPUPostLegalizerCombinerPass(*PR);
  initializeAMDGPUPreLegalizerCombinerPass(*PR);
@@ -998,8 +997,6 @@ void AMDGPUPassConfig::addIRPasses() {
  // A call to propagate attributes pass in the backend in case opt was not run.
  addPass(createAMDGPUPropagateAttributesEarlyPass(&TM));

  addPass(createAMDGPULowerIntrinsicsPass());

  // Function calls are not supported, so make sure we inline everything.
  addPass(createAMDGPUAlwaysInlinePass());
  addPass(createAlwaysInlinerLegacyPass());
@@ -267,6 +267,10 @@ void AMDGPUTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
  BaseT::getPeelingPreferences(L, SE, PP);
}

int64_t AMDGPUTTIImpl::getMaxInlineSizeThreshold() const {
  return 1024;
}

const FeatureBitset GCNTTIImpl::InlineFeatureIgnoreList = {
    // Codegen control options which don't matter.
    AMDGPU::FeatureEnableLoadStoreOpt, AMDGPU::FeatureEnableSIScheduler,
@@ -395,6 +399,10 @@ bool GCNTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}

int64_t GCNTTIImpl::getMaxInlineSizeThreshold() const {
  return 1024;
}

// FIXME: Really we would like to issue multiple 128-bit loads and stores per
// iteration. Should we report a larger size and let it legalize?
//
@@ -55,6 +55,8 @@ public:

  void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                             TTI::PeelingPreferences &PP);

  int64_t getMaxInlineSizeThreshold() const;
};

class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
@@ -132,6 +134,8 @@ public:
                                   unsigned AddrSpace) const;
  bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment,
                                    unsigned AddrSpace) const;

  int64_t getMaxInlineSizeThreshold() const;
  Type *getMemcpyLoopLoweringType(
      LLVMContext &Context, Value *Length, unsigned SrcAddrSpace,
      unsigned DestAddrSpace, unsigned SrcAlign, unsigned DestAlign,
@@ -67,7 +67,6 @@ add_llvm_target(AMDGPUCodeGen
  AMDGPULegalizerInfo.cpp
  AMDGPULibCalls.cpp
  AMDGPULibFunc.cpp
  AMDGPULowerIntrinsics.cpp
  AMDGPULowerKernelArguments.cpp
  AMDGPULowerKernelAttributes.cpp
  AMDGPULowerModuleLDSPass.cpp
@@ -210,6 +210,10 @@ public:

  InstructionCost getMemcpyCost(const Instruction *I);

  uint64_t getMaxMemIntrinsicInlineSizeThreshold() const {
    return ST->getMaxInlineSizeThreshold();
  }

  int getNumMemOps(const IntrinsicInst *I) const;

  InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
@@ -273,6 +273,11 @@ public:
                          const Function *Callee) const;
  bool areTypesABICompatible(const Function *Caller, const Function *Callee,
                             const ArrayRef<Type *> &Type) const;

  uint64_t getMaxMemIntrinsicInlineSizeThreshold() const {
    return ST->getMaxInlineSizeThreshold();
  }

  TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize,
                                                    bool IsZeroCmp) const;
  bool prefersVectorizedAddressing() const;
@@ -59,62 +59,40 @@ define amdgpu_kernel void @kernel_caller_stack() {

define amdgpu_kernel void @kernel_caller_byval() {
; MUBUF-LABEL: kernel_caller_byval:
; MUBUF: ; %bb.0:
; MUBUF: ; %bb.0: ; %loadstoreloop.preheader
; MUBUF-NEXT: s_add_u32 flat_scratch_lo, s4, s7
; MUBUF-NEXT: s_addc_u32 flat_scratch_hi, s5, 0
; MUBUF-NEXT: s_add_u32 s0, s0, s7
; MUBUF-NEXT: s_mov_b32 s5, 0
; MUBUF-NEXT: s_addc_u32 s1, s1, 0
; MUBUF-NEXT: s_movk_i32 s4, 0x80
; MUBUF-NEXT: v_mov_b32_e32 v0, 0
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:12
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:16
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:20
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:24
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:28
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:32
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:36
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:40
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:44
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:48
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:52
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:56
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:60
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:64
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:68
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:72
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:76
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:80
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:84
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:88
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:92
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:96
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:100
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:104
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:108
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:112
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:116
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:120
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:124
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:128
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:132
; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8
; MUBUF-NEXT: s_nop 0
; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:12
; MUBUF-NEXT: buffer_load_dword v2, off, s[0:3], 0 offset:16
; MUBUF-NEXT: buffer_load_dword v3, off, s[0:3], 0 offset:20
; MUBUF-NEXT: buffer_load_dword v4, off, s[0:3], 0 offset:24
; MUBUF-NEXT: buffer_load_dword v5, off, s[0:3], 0 offset:28
; MUBUF-NEXT: buffer_load_dword v6, off, s[0:3], 0 offset:32
; MUBUF-NEXT: buffer_load_dword v7, off, s[0:3], 0 offset:36
; MUBUF-NEXT: buffer_load_dword v8, off, s[0:3], 0 offset:40
; MUBUF-NEXT: buffer_load_dword v9, off, s[0:3], 0 offset:44
; MUBUF-NEXT: buffer_load_dword v10, off, s[0:3], 0 offset:48
; MUBUF-NEXT: buffer_load_dword v11, off, s[0:3], 0 offset:52
; MUBUF-NEXT: buffer_load_dword v12, off, s[0:3], 0 offset:56
; MUBUF-NEXT: buffer_load_dword v13, off, s[0:3], 0 offset:60
; MUBUF-NEXT: buffer_load_dword v14, off, s[0:3], 0 offset:64
; MUBUF-NEXT: buffer_load_dword v15, off, s[0:3], 0 offset:68
; MUBUF-NEXT: v_mov_b32_e32 v1, s5
; MUBUF-NEXT: s_movk_i32 s32, 0x1400
; MUBUF-NEXT: .LBB1_1: ; %loadstoreloop
; MUBUF-NEXT: ; =>This Inner Loop Header: Depth=1
; MUBUF-NEXT: v_add_u32_e32 v2, 4, v1
; MUBUF-NEXT: v_add_u32_e32 v1, 1, v1
; MUBUF-NEXT: v_cmp_gt_u32_e32 vcc, s4, v1
; MUBUF-NEXT: buffer_store_byte v0, v2, s[0:3], 0 offen
; MUBUF-NEXT: s_cbranch_vccnz .LBB1_1
; MUBUF-NEXT: ; %bb.2: ; %split
; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4
; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:8
; MUBUF-NEXT: buffer_load_dword v2, off, s[0:3], 0 offset:12
; MUBUF-NEXT: buffer_load_dword v3, off, s[0:3], 0 offset:16
; MUBUF-NEXT: buffer_load_dword v4, off, s[0:3], 0 offset:20
; MUBUF-NEXT: buffer_load_dword v5, off, s[0:3], 0 offset:24
; MUBUF-NEXT: buffer_load_dword v6, off, s[0:3], 0 offset:28
; MUBUF-NEXT: buffer_load_dword v7, off, s[0:3], 0 offset:32
; MUBUF-NEXT: buffer_load_dword v8, off, s[0:3], 0 offset:36
; MUBUF-NEXT: buffer_load_dword v9, off, s[0:3], 0 offset:40
; MUBUF-NEXT: buffer_load_dword v10, off, s[0:3], 0 offset:44
; MUBUF-NEXT: buffer_load_dword v11, off, s[0:3], 0 offset:48
; MUBUF-NEXT: buffer_load_dword v12, off, s[0:3], 0 offset:52
; MUBUF-NEXT: buffer_load_dword v13, off, s[0:3], 0 offset:56
; MUBUF-NEXT: buffer_load_dword v14, off, s[0:3], 0 offset:60
; MUBUF-NEXT: buffer_load_dword v15, off, s[0:3], 0 offset:64
; MUBUF-NEXT: s_getpc_b64 s[4:5]
; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_byval@rel32@lo+4
; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_byval@rel32@hi+12
@@ -154,38 +132,31 @@ define amdgpu_kernel void @kernel_caller_byval() {
; MUBUF-NEXT: s_endpgm
;
; FLATSCR-LABEL: kernel_caller_byval:
; FLATSCR: ; %bb.0:
; FLATSCR: ; %bb.0: ; %loadstoreloop.preheader
; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s3
; FLATSCR-NEXT: v_mov_b32_e32 v0, 0
; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
; FLATSCR-NEXT: v_mov_b32_e32 v1, 0
; FLATSCR-NEXT: s_mov_b32 s0, 0
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:8
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:24
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:32
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:40
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:48
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:56
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:64
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:72
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:80
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:88
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:96
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:104
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:112
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:120
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:128
; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s0 offset:8
; FLATSCR-NEXT: s_nop 0
; FLATSCR-NEXT: scratch_load_dwordx2 v[2:3], off, s0 offset:16
; FLATSCR-NEXT: scratch_load_dwordx2 v[4:5], off, s0 offset:24
; FLATSCR-NEXT: scratch_load_dwordx2 v[6:7], off, s0 offset:32
; FLATSCR-NEXT: scratch_load_dwordx2 v[8:9], off, s0 offset:40
; FLATSCR-NEXT: scratch_load_dwordx2 v[10:11], off, s0 offset:48
; FLATSCR-NEXT: scratch_load_dwordx2 v[12:13], off, s0 offset:56
; FLATSCR-NEXT: scratch_load_dwordx2 v[14:15], off, s0 offset:64
; FLATSCR-NEXT: s_mov_b32 s1, 0
; FLATSCR-NEXT: s_movk_i32 s0, 0x80
; FLATSCR-NEXT: v_mov_b32_e32 v0, 0
; FLATSCR-NEXT: v_mov_b32_e32 v1, s1
; FLATSCR-NEXT: s_movk_i32 s32, 0x50
; FLATSCR-NEXT: .LBB1_1: ; %loadstoreloop
; FLATSCR-NEXT: ; =>This Inner Loop Header: Depth=1
; FLATSCR-NEXT: v_add_u32_e32 v2, 4, v1
; FLATSCR-NEXT: v_add_u32_e32 v1, 1, v1
; FLATSCR-NEXT: v_cmp_gt_u32_e32 vcc, s0, v1
; FLATSCR-NEXT: scratch_store_byte v2, v0, off
; FLATSCR-NEXT: s_cbranch_vccnz .LBB1_1
; FLATSCR-NEXT: ; %bb.2: ; %split
; FLATSCR-NEXT: s_mov_b32 s0, 0
; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s0 offset:4
; FLATSCR-NEXT: scratch_load_dwordx2 v[2:3], off, s0 offset:12
; FLATSCR-NEXT: scratch_load_dwordx2 v[4:5], off, s0 offset:20
; FLATSCR-NEXT: scratch_load_dwordx2 v[6:7], off, s0 offset:28
; FLATSCR-NEXT: scratch_load_dwordx2 v[8:9], off, s0 offset:36
; FLATSCR-NEXT: scratch_load_dwordx2 v[10:11], off, s0 offset:44
; FLATSCR-NEXT: scratch_load_dwordx2 v[12:13], off, s0 offset:52
; FLATSCR-NEXT: scratch_load_dwordx2 v[14:15], off, s0 offset:60
; FLATSCR-NEXT: s_getpc_b64 s[0:1]
; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_byval@rel32@lo+4
; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_byval@rel32@hi+12
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -march=amdgcn -verify-machineinstrs -amdgpu-mem-intrinsic-expand-size=3 %s -o - | FileCheck -check-prefix=GCN %s
; RUN: llc -global-isel -march=amdgcn -verify-machineinstrs -amdgpu-mem-intrinsic-expand-size=5 %s -o - | FileCheck -check-prefix=GCN %s
; RUN: llc -global-isel -march=amdgcn -verify-machineinstrs -mem-intrinsic-expand-size=3 %s -o - | FileCheck -check-prefix=GCN %s
; RUN: llc -global-isel -march=amdgcn -verify-machineinstrs -mem-intrinsic-expand-size=5 %s -o - | FileCheck -check-prefix=GCN %s

declare void @llvm.memcpy.inline.p1.p1.i32(ptr addrspace(1), ptr addrspace(1), i32, i1 immarg)

@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -march=amdgcn -verify-machineinstrs -amdgpu-mem-intrinsic-expand-size=19 %s -o - | FileCheck -check-prefix=LOOP %s
; RUN: llc -global-isel -march=amdgcn -verify-machineinstrs -amdgpu-mem-intrinsic-expand-size=21 %s -o - | FileCheck -check-prefix=UNROLL %s
; RUN: llc -global-isel -mtriple=amdgcn-- -verify-machineinstrs -mem-intrinsic-expand-size=19 %s -o - | FileCheck -check-prefix=LOOP %s
; RUN: llc -global-isel -mtriple=amdgcn-- -verify-machineinstrs -mem-intrinsic-expand-size=21 %s -o - | FileCheck -check-prefix=UNROLL %s

declare void @llvm.memcpy.p1.p1.i32(ptr addrspace(1), ptr addrspace(1), i32, i1 immarg)

@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -march=amdgcn -verify-machineinstrs -amdgpu-mem-intrinsic-expand-size=3 %s -o - | FileCheck -check-prefix=LOOP %s
; RUN: llc -global-isel -march=amdgcn -verify-machineinstrs -amdgpu-mem-intrinsic-expand-size=5 %s -o - | FileCheck -check-prefix=UNROLL %s
; RUN: llc -global-isel -mtriple=amdgcn-- -verify-machineinstrs -mem-intrinsic-expand-size=3 %s -o - | FileCheck -check-prefix=LOOP %s
; RUN: llc -global-isel -mtriple=amdgcn-- -verify-machineinstrs -mem-intrinsic-expand-size=5 %s -o - | FileCheck -check-prefix=UNROLL %s

declare void @llvm.memmove.p1.p1.i32(ptr addrspace(1), ptr addrspace(1), i32, i1)

@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -march=amdgcn -verify-machineinstrs -amdgpu-mem-intrinsic-expand-size=3 %s -o - | FileCheck -check-prefix=LOOP %s
; RUN: llc -global-isel -march=amdgcn -verify-machineinstrs -amdgpu-mem-intrinsic-expand-size=5 %s -o - | FileCheck -check-prefix=UNROLL %s
; RUN: llc -global-isel -mtriple=amdgcn-- -verify-machineinstrs -mem-intrinsic-expand-size=3 %s -o - | FileCheck -check-prefix=LOOP %s
; RUN: llc -global-isel -mtriple=amdgcn-- -verify-machineinstrs -mem-intrinsic-expand-size=5 %s -o - | FileCheck -check-prefix=UNROLL %s

declare void @llvm.memset.p1.i32(ptr addrspace(1), i8, i32, i1)
@@ -34,7 +34,6 @@
; GCN-O0-NEXT: Lower ctors and dtors for AMDGPU
; GCN-O0-NEXT: FunctionPass Manager
; GCN-O0-NEXT: Early propagate attributes from kernels to functions
; GCN-O0-NEXT: AMDGPU Lower Intrinsics
; GCN-O0-NEXT: AMDGPU Inline All Functions
; GCN-O0-NEXT: Inliner for always_inline functions
; GCN-O0-NEXT: FunctionPass Manager
@@ -182,7 +181,6 @@
; GCN-O1-NEXT: Lower ctors and dtors for AMDGPU
; GCN-O1-NEXT: FunctionPass Manager
; GCN-O1-NEXT: Early propagate attributes from kernels to functions
; GCN-O1-NEXT: AMDGPU Lower Intrinsics
; GCN-O1-NEXT: AMDGPU Inline All Functions
; GCN-O1-NEXT: Inliner for always_inline functions
; GCN-O1-NEXT: FunctionPass Manager
@@ -458,7 +456,6 @@
; GCN-O1-OPTS-NEXT: Lower ctors and dtors for AMDGPU
; GCN-O1-OPTS-NEXT: FunctionPass Manager
; GCN-O1-OPTS-NEXT: Early propagate attributes from kernels to functions
; GCN-O1-OPTS-NEXT: AMDGPU Lower Intrinsics
; GCN-O1-OPTS-NEXT: AMDGPU Inline All Functions
; GCN-O1-OPTS-NEXT: Inliner for always_inline functions
; GCN-O1-OPTS-NEXT: FunctionPass Manager
@@ -766,7 +763,6 @@
; GCN-O2-NEXT: Lower ctors and dtors for AMDGPU
; GCN-O2-NEXT: FunctionPass Manager
; GCN-O2-NEXT: Early propagate attributes from kernels to functions
; GCN-O2-NEXT: AMDGPU Lower Intrinsics
; GCN-O2-NEXT: AMDGPU Inline All Functions
; GCN-O2-NEXT: Inliner for always_inline functions
; GCN-O2-NEXT: FunctionPass Manager
@@ -1077,7 +1073,6 @@
; GCN-O3-NEXT: Lower ctors and dtors for AMDGPU
; GCN-O3-NEXT: FunctionPass Manager
; GCN-O3-NEXT: Early propagate attributes from kernels to functions
; GCN-O3-NEXT: AMDGPU Lower Intrinsics
; GCN-O3-NEXT: AMDGPU Inline All Functions
; GCN-O3-NEXT: Inliner for always_inline functions
; GCN-O3-NEXT: FunctionPass Manager
@@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -amdgpu-lower-intrinsics -amdgpu-mem-intrinsic-expand-size=8 %s | FileCheck -check-prefix=OPT8 %s
; RUN: opt -S -amdgpu-lower-intrinsics -amdgpu-mem-intrinsic-expand-size=4 %s | FileCheck -check-prefix=OPT4 %s
; RUN: opt -S -amdgpu-lower-intrinsics -amdgpu-mem-intrinsic-expand-size=0 %s | FileCheck -check-prefix=OPT0 %s
; RUN: opt -S -amdgpu-lower-intrinsics -amdgpu-mem-intrinsic-expand-size=-1 %s | FileCheck -check-prefix=OPT_NEG %s
; RUN: opt -S -mtriple=amdgcn-- -pre-isel-intrinsic-lowering -mem-intrinsic-expand-size=8 %s | FileCheck -check-prefix=OPT8 %s
; RUN: opt -S -mtriple=amdgcn-- -pre-isel-intrinsic-lowering -mem-intrinsic-expand-size=4 %s | FileCheck -check-prefix=OPT4 %s
; RUN: opt -S -mtriple=amdgcn-- -pre-isel-intrinsic-lowering -mem-intrinsic-expand-size=0 %s | FileCheck -check-prefix=OPT0 %s
; RUN: opt -S -mtriple=amdgcn-- -pre-isel-intrinsic-lowering -mem-intrinsic-expand-size=-1 %s | FileCheck -check-prefix=OPT_NEG %s

; Test the -amdgpu-mem-intrinsic-expand-size flag works.
; Test the -mem-intrinsic-expand-size flag works.

; Make sure we can always eliminate the intrinsic, even at 0.
define amdgpu_kernel void @memset_size_0(ptr addrspace(1) %dst, i8 %val) {
@@ -17,19 +17,19 @@ define amdgpu_kernel void @memset_size_0(ptr addrspace(1) %dst, i8 %val) {
; OPT4-NEXT: ret void
;
; OPT0-LABEL: @memset_size_0(
; OPT0-NEXT: call void @llvm.memset.p1.i64(ptr addrspace(1) [[DST:%.*]], i8 [[VAL:%.*]], i64 0, i1 false)
; OPT0-NEXT: br i1 true, label [[SPLIT:%.*]], label [[LOADSTORELOOP:%.*]]
; OPT0: loadstoreloop:
; OPT0-NEXT: [[TMP1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], [[LOADSTORELOOP]] ]
; OPT0-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[TMP1]]
; OPT0-NEXT: store i8 [[VAL:%.*]], ptr addrspace(1) [[TMP2]], align 1
; OPT0-NEXT: [[TMP3]] = add i64 [[TMP1]], 1
; OPT0-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 0
; OPT0-NEXT: br i1 [[TMP4]], label [[LOADSTORELOOP]], label [[SPLIT]]
; OPT0: split:
; OPT0-NEXT: ret void
;
; OPT_NEG-LABEL: @memset_size_0(
; OPT_NEG-NEXT: br i1 true, label [[SPLIT:%.*]], label [[LOADSTORELOOP:%.*]]
; OPT_NEG: loadstoreloop:
; OPT_NEG-NEXT: [[TMP1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], [[LOADSTORELOOP]] ]
; OPT_NEG-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[TMP1]]
; OPT_NEG-NEXT: store i8 [[VAL:%.*]], ptr addrspace(1) [[TMP2]], align 1
; OPT_NEG-NEXT: [[TMP3]] = add i64 [[TMP1]], 1
; OPT_NEG-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 0
; OPT_NEG-NEXT: br i1 [[TMP4]], label [[LOADSTORELOOP]], label [[SPLIT]]
; OPT_NEG: split:
; OPT_NEG-NEXT: call void @llvm.memset.p1.i64(ptr addrspace(1) [[DST:%.*]], i8 [[VAL:%.*]], i64 0, i1 false)
; OPT_NEG-NEXT: ret void
;
call void @llvm.memset.p1.i64(ptr addrspace(1) %dst, i8 %val, i64 0, i1 false)
@@ -58,15 +58,7 @@ define amdgpu_kernel void @memset_size_4(ptr addrspace(1) %dst, i8 %val) {
; OPT0-NEXT: ret void
;
; OPT_NEG-LABEL: @memset_size_4(
; OPT_NEG-NEXT: br i1 false, label [[SPLIT:%.*]], label [[LOADSTORELOOP:%.*]]
; OPT_NEG: loadstoreloop:
; OPT_NEG-NEXT: [[TMP1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], [[LOADSTORELOOP]] ]
; OPT_NEG-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[TMP1]]
; OPT_NEG-NEXT: store i8 [[VAL:%.*]], ptr addrspace(1) [[TMP2]], align 1
; OPT_NEG-NEXT: [[TMP3]] = add i64 [[TMP1]], 1
; OPT_NEG-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 4
; OPT_NEG-NEXT: br i1 [[TMP4]], label [[LOADSTORELOOP]], label [[SPLIT]]
; OPT_NEG: split:
; OPT_NEG-NEXT: call void @llvm.memset.p1.i64(ptr addrspace(1) [[DST:%.*]], i8 [[VAL:%.*]], i64 4, i1 false)
; OPT_NEG-NEXT: ret void
;
call void @llvm.memset.p1.i64(ptr addrspace(1) %dst, i8 %val, i64 4, i1 false)
@@ -103,15 +95,7 @@ define amdgpu_kernel void @memset_size_8(ptr addrspace(1) %dst, i8 %val) {
; OPT0-NEXT: ret void
;
; OPT_NEG-LABEL: @memset_size_8(
; OPT_NEG-NEXT: br i1 false, label [[SPLIT:%.*]], label [[LOADSTORELOOP:%.*]]
; OPT_NEG: loadstoreloop:
; OPT_NEG-NEXT: [[TMP1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], [[LOADSTORELOOP]] ]
; OPT_NEG-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[TMP1]]
; OPT_NEG-NEXT: store i8 [[VAL:%.*]], ptr addrspace(1) [[TMP2]], align 1
; OPT_NEG-NEXT: [[TMP3]] = add i64 [[TMP1]], 1
; OPT_NEG-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 8
; OPT_NEG-NEXT: br i1 [[TMP4]], label [[LOADSTORELOOP]], label [[SPLIT]]
; OPT_NEG: split:
; OPT_NEG-NEXT: call void @llvm.memset.p1.i64(ptr addrspace(1) [[DST:%.*]], i8 [[VAL:%.*]], i64 8, i1 false)
; OPT_NEG-NEXT: ret void
;
call void @llvm.memset.p1.i64(ptr addrspace(1) %dst, i8 %val, i64 8, i1 false)
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-lower-intrinsics -amdgpu-mem-intrinsic-expand-size=1024 %s | FileCheck -check-prefixes=OPT,MAX1024 %s
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-lower-intrinsics -amdgpu-mem-intrinsic-expand-size=-1 %s | FileCheck -check-prefixes=OPT,ALL %s
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -pre-isel-intrinsic-lowering -mem-intrinsic-expand-size=1024 %s | FileCheck -check-prefixes=OPT,MAX1024 %s
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -pre-isel-intrinsic-lowering -mem-intrinsic-expand-size=0 %s | FileCheck -check-prefixes=OPT,ALL %s

declare void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) nocapture, ptr addrspace(1) nocapture readonly, i64, i1) #1
declare void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) nocapture, ptr addrspace(3) nocapture readonly, i32, i1) #1

@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx1102 -mattr=+wavefrontsize32 -misched=gcn-max-occupancy -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK
; RUN: opt -S -passes=always-inline -o %t.bc %s
; RUN: llc -march=amdgcn -mcpu=gfx1102 -mattr=+wavefrontsize32 -misched=gcn-max-occupancy -verify-machineinstrs < %t.bc | FileCheck %s --check-prefixes=CHECK

; Provide a long sequence of 32 vec4 load/store pairs that ought to be fully
; overlapped for latency hiding. Doing so requires using (at least) 128 VGPRs,

@@ -1,5 +1,5 @@
; RUN: not llc -march=amdgcn < %s 2>&1 | FileCheck -check-prefix=ERROR %s
; RUN: not llc -march=amdgcn < %s | FileCheck -check-prefix=GCN %s
; RUN: not llc -mtriple=amdgcn-- < %s 2>&1 | FileCheck -check-prefix=ERROR %s
; RUN: not llc -mtriple=amdgcn-- < %s | FileCheck -check-prefix=GCN %s

declare void @llvm.memset.p5.i32(ptr addrspace(5) nocapture, i8, i32, i32, i1) #1
@@ -153,7 +153,6 @@ static_library("LLVMAMDGPUCodeGen") {
    "AMDGPULegalizerInfo.cpp",
    "AMDGPULibCalls.cpp",
    "AMDGPULibFunc.cpp",
    "AMDGPULowerIntrinsics.cpp",
    "AMDGPULowerKernelArguments.cpp",
    "AMDGPULowerKernelAttributes.cpp",
    "AMDGPULowerModuleLDSPass.cpp",