From 3c848194f28decca41b7362f9dd35d4939797724 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Wed, 7 Jun 2023 09:03:17 -0400 Subject: [PATCH] CodeGen: Expand memory intrinsics in PreISelIntrinsicLowering Expand large or unknown size memory intrinsics into loops in the default lowering pipeline if the target doesn't have the corresponding libfunc. Previously AMDGPU had a custom pass which existed to call the expansion utilities. With a default no-libcall option, we can remove the libfunc checks in LoopIdiomRecognize for these, which never made any sense. This also provides a path to lifting the immarg restriction on llvm.memcpy.inline. There seems to be a bug where TLI reports functions as available if you use -march and not -mtriple. --- llvm/docs/ReleaseNotes.rst | 12 +- .../llvm/Analysis/TargetTransformInfo.h | 10 ++ .../llvm/Analysis/TargetTransformInfoImpl.h | 4 + llvm/lib/Analysis/TargetTransformInfo.cpp | 4 + llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp | 162 +++++++++++++++++- llvm/lib/CodeGen/TargetPassConfig.cpp | 2 +- llvm/lib/Target/AMDGPU/AMDGPU.h | 4 - .../Target/AMDGPU/AMDGPULowerIntrinsics.cpp | 144 ---------------- .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 3 - .../AMDGPU/AMDGPUTargetTransformInfo.cpp | 8 + .../Target/AMDGPU/AMDGPUTargetTransformInfo.h | 4 + llvm/lib/Target/AMDGPU/CMakeLists.txt | 1 - llvm/lib/Target/ARM/ARMTargetTransformInfo.h | 4 + llvm/lib/Target/X86/X86TargetTransformInfo.h | 5 + .../GlobalISel/call-outgoing-stack-args.ll | 129 ++++++-------- .../AMDGPU/GlobalISel/llvm.memcpy.inline.ll | 4 +- .../CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll | 4 +- .../CodeGen/AMDGPU/GlobalISel/llvm.memmove.ll | 4 +- .../CodeGen/AMDGPU/GlobalISel/llvm.memset.ll | 4 +- llvm/test/CodeGen/AMDGPU/llc-pipeline.ll | 5 - .../AMDGPU/lower-mem-intrinsics-threshold.ll | 50 ++---- .../CodeGen/AMDGPU/lower-mem-intrinsics.ll | 4 +- .../AMDGPU/schedule-regpressure-lds.ll | 3 +- .../CodeGen/AMDGPU/stack-size-overflow.ll | 4 +- 
.../secondary/llvm/lib/Target/AMDGPU/BUILD.gn | 1 - 25 files changed, 283 insertions(+), 296 deletions(-) delete mode 100644 llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst index 6176439deb16..4dd483f68544 100644 --- a/llvm/docs/ReleaseNotes.rst +++ b/llvm/docs/ReleaseNotes.rst @@ -268,14 +268,12 @@ Changes to the C API * ``LLVMConstSelect`` -Changes to the FastISel infrastructure --------------------------------------- - -* ... - -Changes to the DAG infrastructure ---------------------------------- +Changes to the CodeGen infrastructure +------------------------------------- +* ``llvm.memcpy``, ``llvm.memmove`` and ``llvm.memset`` are now + expanded into loops by default for targets which do not report the + corresponding library function is available. Changes to the Metadata Info --------------------------------- diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index 0e10162a6435..2a5953f3e0b1 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -356,6 +356,10 @@ public: /// source/destination type and alignment and the number of bytes copied. InstructionCost getMemcpyCost(const Instruction *I) const; + /// Returns the maximum memset / memcpy size in bytes that still makes it + /// profitable to inline the call. + uint64_t getMaxMemIntrinsicInlineSizeThreshold() const; + /// \return The estimated number of case clusters when lowering \p 'SI'. /// \p JTSize Set a jump table size only when \p SI is suitable for a jump /// table. 
@@ -1673,6 +1677,7 @@ public: virtual unsigned adjustInliningThreshold(const CallBase *CB) = 0; virtual int getInlinerVectorBonusPercent() const = 0; virtual InstructionCost getMemcpyCost(const Instruction *I) = 0; + virtual uint64_t getMaxMemIntrinsicInlineSizeThreshold() const = 0; virtual unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI, unsigned &JTSize, ProfileSummaryInfo *PSI, @@ -2044,6 +2049,11 @@ public: InstructionCost getMemcpyCost(const Instruction *I) override { return Impl.getMemcpyCost(I); } + + uint64_t getMaxMemIntrinsicInlineSizeThreshold() const override { + return Impl.getMaxMemIntrinsicInlineSizeThreshold(); + } + InstructionCost getInstructionCost(const User *U, ArrayRef<const Value *> Operands, TargetCostKind CostKind) override { diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index d7b1538d640e..de94e33f1ad0 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -77,6 +77,10 @@ public: return TTI::TCC_Expensive; } + uint64_t getMaxMemIntrinsicInlineSizeThreshold() const { + return 64; + } + // Although this default value is arbitrary, it is not random. It is assumed // that a condition that evaluates the same way by a higher percentage than // this is best represented as control flow. 
Therefore, the default value N diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index a7b89b2df452..e1bb963fd465 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -1035,6 +1035,10 @@ InstructionCost TargetTransformInfo::getMemcpyCost(const Instruction *I) const { return Cost; } +uint64_t TargetTransformInfo::getMaxMemIntrinsicInlineSizeThreshold() const { + return TTIImpl->getMaxMemIntrinsicInlineSizeThreshold(); +} + InstructionCost TargetTransformInfo::getArithmeticReductionCost( unsigned Opcode, VectorType *Ty, std::optional<FastMathFlags> FMF, TTI::TargetCostKind CostKind) const { diff --git a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp index 228557b7a74c..0bdb6b59d3ac 100644 --- a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp +++ b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp @@ -6,14 +6,16 @@ // //===----------------------------------------------------------------------===// // -// This pass implements IR lowering for the llvm.load.relative and llvm.objc.* -// intrinsics. +// This pass implements IR lowering for the llvm.memcpy, llvm.memmove, +// llvm.memset, llvm.load.relative and llvm.objc.* intrinsics. // //===----------------------------------------------------------------------===// #include "llvm/CodeGen/PreISelIntrinsicLowering.h" #include "llvm/Analysis/ObjCARCInstKind.h" #include "llvm/Analysis/ObjCARCUtil.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" @@ -24,9 +26,44 @@ #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" +#include "llvm/Transforms/Utils/LowerMemIntrinsics.h" using namespace llvm; +/// Threshold to leave statically sized memory intrinsic calls. 
Calls of known +/// size larger than this, or of unknown size, are expanded by the pass. Calls +/// of known size at or below it are left for expansion in codegen. +static cl::opt<int64_t> MemIntrinsicExpandSizeThresholdOpt( + "mem-intrinsic-expand-size", + cl::desc("Set minimum mem intrinsic size to expand in IR"), cl::init(-1), + cl::Hidden); + +namespace { + +struct PreISelIntrinsicLowering { + const function_ref<TargetTransformInfo &(Function &)> LookupTTI; + const function_ref<TargetLibraryInfo &(Function &)> LookupLibInfo; + + /// If this is true, leave a memory intrinsic call for later replacement with + /// a library call when TargetLibraryInfo reports the corresponding function + /// is available. If false, expand regardless of library availability. + const bool UseMemIntrinsicLibFunc; + + explicit PreISelIntrinsicLowering( + function_ref<TargetTransformInfo &(Function &)> LookupTTI_, + function_ref<TargetLibraryInfo &(Function &)> LookupLibInfo_, + bool UseMemIntrinsicLibFunc_ = true) + : LookupTTI(LookupTTI_), LookupLibInfo(LookupLibInfo_), + UseMemIntrinsicLibFunc(UseMemIntrinsicLibFunc_) {} + + static bool shouldExpandMemIntrinsicWithSize(Value *Size, + const TargetTransformInfo &TTI); + bool expandMemIntrinsicUses(Function &F) const; + bool lowerIntrinsics(Module &M) const; +}; + +} // namespace + static bool lowerLoadRelative(Function &F) { if (F.use_empty()) return false; @@ -133,12 +170,100 @@ static bool lowerObjCCall(Function &F, const char *NewFn, return true; } -static bool lowerIntrinsics(Module &M) { +// TODO: Should refine based on estimated number of accesses (e.g. does it +// require splitting based on alignment) +bool PreISelIntrinsicLowering::shouldExpandMemIntrinsicWithSize( + Value *Size, const TargetTransformInfo &TTI) { + ConstantInt *CI = dyn_cast<ConstantInt>(Size); + if (!CI) + return true; + uint64_t Threshold = MemIntrinsicExpandSizeThresholdOpt.getNumOccurrences() + ? 
MemIntrinsicExpandSizeThresholdOpt + : TTI.getMaxMemIntrinsicInlineSizeThreshold(); + uint64_t SizeVal = CI->getZExtValue(); + + // Treat a threshold of 0 as a special case to force expansion of all + // intrinsics, including size 0. 
+ return SizeVal > Threshold || Threshold == 0; +} + +// TODO: Handle atomic memcpy and memcpy.inline +// TODO: Pass ScalarEvolution +bool PreISelIntrinsicLowering::expandMemIntrinsicUses(Function &F) const { + Intrinsic::ID ID = F.getIntrinsicID(); + bool Changed = false; + + for (User *U : llvm::make_early_inc_range(F.users())) { + Instruction *Inst = cast<Instruction>(U); + + switch (ID) { + case Intrinsic::memcpy: { + auto *Memcpy = cast<MemCpyInst>(Inst); + Function *ParentFunc = Memcpy->getFunction(); + const TargetTransformInfo &TTI = LookupTTI(*ParentFunc); + if (shouldExpandMemIntrinsicWithSize(Memcpy->getLength(), TTI)) { + if (UseMemIntrinsicLibFunc && + LookupLibInfo(*ParentFunc).has(LibFunc_memcpy)) + break; + + expandMemCpyAsLoop(Memcpy, TTI); + Changed = true; + Memcpy->eraseFromParent(); + } + + break; + } + case Intrinsic::memmove: { + auto *Memmove = cast<MemMoveInst>(Inst); + Function *ParentFunc = Memmove->getFunction(); + const TargetTransformInfo &TTI = LookupTTI(*ParentFunc); + if (shouldExpandMemIntrinsicWithSize(Memmove->getLength(), TTI)) { + if (UseMemIntrinsicLibFunc && + LookupLibInfo(*ParentFunc).has(LibFunc_memmove)) + break; + + expandMemMoveAsLoop(Memmove); + Changed = true; + Memmove->eraseFromParent(); + } + + break; + } + case Intrinsic::memset: { + auto *Memset = cast<MemSetInst>(Inst); + Function *ParentFunc = Memset->getFunction(); + const TargetTransformInfo &TTI = LookupTTI(*ParentFunc); + if (shouldExpandMemIntrinsicWithSize(Memset->getLength(), TTI)) { + if (UseMemIntrinsicLibFunc && + LookupLibInfo(*ParentFunc).has(LibFunc_memset)) + break; + + expandMemSetAsLoop(Memset); + Changed = true; + Memset->eraseFromParent(); + } + + break; + } + default: + llvm_unreachable("unhandled intrinsic"); + } + } + + return Changed; +} + +bool PreISelIntrinsicLowering::lowerIntrinsics(Module &M) const { bool Changed = false; for (Function &F : M) { switch (F.getIntrinsicID()) { default: break; + case Intrinsic::memcpy: + case Intrinsic::memmove: + case Intrinsic::memset: + 
Changed |= expandMemIntrinsicUses(F); + break; case Intrinsic::load_relative: Changed |= lowerLoadRelative(F); break; @@ -230,7 +355,23 @@ public: PreISelIntrinsicLoweringLegacyPass() : ModulePass(ID) {} - bool runOnModule(Module &M) override { return lowerIntrinsics(M); } + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<TargetLibraryInfoWrapperPass>(); + AU.addRequired<TargetTransformInfoWrapperPass>(); + } + + bool runOnModule(Module &M) override { + auto LookupTTI = [this](Function &F) -> TargetTransformInfo & { + return this->getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); + }; + + auto LookupTLI = [this](Function &F) -> TargetLibraryInfo & { + return this->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F); + }; + + PreISelIntrinsicLowering Lowering(LookupTTI, LookupTLI); + return Lowering.lowerIntrinsics(M); + } }; } // end anonymous namespace @@ -247,7 +388,18 @@ ModulePass *llvm::createPreISelIntrinsicLoweringPass() { PreservedAnalyses PreISelIntrinsicLoweringPass::run(Module &M, ModuleAnalysisManager &AM) { - if (!lowerIntrinsics(M)) + auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager(); + + auto LookupTLI = [&FAM](Function &F) -> TargetLibraryInfo & { + return FAM.getResult<TargetLibraryAnalysis>(F); + }; + + auto LookupTTI = [&FAM](Function &F) -> TargetTransformInfo & { + return FAM.getResult<TargetIRAnalysis>(F); + }; + + PreISelIntrinsicLowering Lowering(LookupTTI, LookupTLI); + if (!Lowering.lowerIntrinsics(M)) return PreservedAnalyses::all(); else return PreservedAnalyses::none(); diff --git a/llvm/lib/CodeGen/TargetPassConfig.cpp b/llvm/lib/CodeGen/TargetPassConfig.cpp index 86490c0d6417..8ece4c764f61 100644 --- a/llvm/lib/CodeGen/TargetPassConfig.cpp +++ b/llvm/lib/CodeGen/TargetPassConfig.cpp @@ -1088,8 +1088,8 @@ bool TargetPassConfig::addISelPasses() { if (TM->useEmulatedTLS()) addPass(createLowerEmuTLSPass()); - addPass(createPreISelIntrinsicLoweringPass()); PM->add(createTargetTransformInfoWrapperPass(TM->getTargetIRAnalysis())); + addPass(createPreISelIntrinsicLoweringPass()); addPass(createExpandLargeDivRemPass()); addPass(createExpandLargeFpConvertPass()); addIRPasses(); 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index 887aca9c2c09..3e15fc07c71d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -90,10 +90,6 @@ FunctionPass *createAMDGPUAtomicOptimizerPass(ScanOptions ScanStrategy); void initializeAMDGPUAtomicOptimizerPass(PassRegistry &); extern char &AMDGPUAtomicOptimizerID; -ModulePass *createAMDGPULowerIntrinsicsPass(); -void initializeAMDGPULowerIntrinsicsPass(PassRegistry &); -extern char &AMDGPULowerIntrinsicsID; - ModulePass *createAMDGPUCtorDtorLoweringLegacyPass(); void initializeAMDGPUCtorDtorLoweringLegacyPass(PassRegistry &); extern char &AMDGPUCtorDtorLoweringLegacyPassID; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp deleted file mode 100644 index f9b21e07ed6a..000000000000 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp +++ /dev/null @@ -1,144 +0,0 @@ -//===-- AMDGPULowerIntrinsics.cpp -----------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "AMDGPUSubtarget.h" -#include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/CodeGen/TargetPassConfig.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/IntrinsicsR600.h" -#include "llvm/IR/Module.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Target/TargetMachine.h" -#include "llvm/Transforms/Utils/LowerMemIntrinsics.h" - -#define DEBUG_TYPE "amdgpu-lower-intrinsics" - -using namespace llvm; - -namespace { - -static int MaxStaticSize; - -static cl::opt MemIntrinsicExpandSizeThresholdOpt( - "amdgpu-mem-intrinsic-expand-size", - cl::desc("Set minimum mem intrinsic size to expand in IR"), - cl::location(MaxStaticSize), - cl::init(1024), - cl::Hidden); - - -class AMDGPULowerIntrinsics : public ModulePass { -public: - static char ID; - - AMDGPULowerIntrinsics() : ModulePass(ID) {} - - bool runOnModule(Module &M) override; - bool expandMemIntrinsicUses(Function &F); - StringRef getPassName() const override { - return "AMDGPU Lower Intrinsics"; - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired(); - } -}; - -} - -char AMDGPULowerIntrinsics::ID = 0; - -char &llvm::AMDGPULowerIntrinsicsID = AMDGPULowerIntrinsics::ID; - -INITIALIZE_PASS(AMDGPULowerIntrinsics, DEBUG_TYPE, "Lower intrinsics", false, - false) - -// TODO: Should refine based on estimated number of accesses (e.g. 
does it -// require splitting based on alignment) -static bool shouldExpandOperationWithSize(Value *Size) { - ConstantInt *CI = dyn_cast(Size); - return !CI || (CI->getSExtValue() > MaxStaticSize); -} - -bool AMDGPULowerIntrinsics::expandMemIntrinsicUses(Function &F) { - Intrinsic::ID ID = F.getIntrinsicID(); - bool Changed = false; - - for (User *U : llvm::make_early_inc_range(F.users())) { - Instruction *Inst = cast(U); - - switch (ID) { - case Intrinsic::memcpy: { - auto *Memcpy = cast(Inst); - if (shouldExpandOperationWithSize(Memcpy->getLength())) { - Function *ParentFunc = Memcpy->getParent()->getParent(); - const TargetTransformInfo &TTI = - getAnalysis().getTTI(*ParentFunc); - expandMemCpyAsLoop(Memcpy, TTI); - Changed = true; - Memcpy->eraseFromParent(); - } - - break; - } - case Intrinsic::memmove: { - auto *Memmove = cast(Inst); - if (shouldExpandOperationWithSize(Memmove->getLength())) { - expandMemMoveAsLoop(Memmove); - Changed = true; - Memmove->eraseFromParent(); - } - - break; - } - case Intrinsic::memset: { - auto *Memset = cast(Inst); - if (shouldExpandOperationWithSize(Memset->getLength())) { - expandMemSetAsLoop(Memset); - Changed = true; - Memset->eraseFromParent(); - } - - break; - } - default: - break; - } - } - - return Changed; -} - -bool AMDGPULowerIntrinsics::runOnModule(Module &M) { - bool Changed = false; - - for (Function &F : M) { - if (!F.isDeclaration()) - continue; - - switch (F.getIntrinsicID()) { - case Intrinsic::memcpy: - case Intrinsic::memmove: - case Intrinsic::memset: - if (expandMemIntrinsicUses(F)) - Changed = true; - break; - default: - break; - } - } - - return Changed; -} - -ModulePass *llvm::createAMDGPULowerIntrinsicsPass() { - return new AMDGPULowerIntrinsics(); -} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 26ab00ab2983..15373d0d2b58 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp 
@@ -388,7 +388,6 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeAMDGPULowerKernelArgumentsPass(*PR); initializeAMDGPUPromoteKernelArgumentsPass(*PR); initializeAMDGPULowerKernelAttributesPass(*PR); - initializeAMDGPULowerIntrinsicsPass(*PR); initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR); initializeAMDGPUPostLegalizerCombinerPass(*PR); initializeAMDGPUPreLegalizerCombinerPass(*PR); @@ -998,8 +997,6 @@ void AMDGPUPassConfig::addIRPasses() { // A call to propagate attributes pass in the backend in case opt was not run. addPass(createAMDGPUPropagateAttributesEarlyPass(&TM)); - addPass(createAMDGPULowerIntrinsicsPass()); - // Function calls are not supported, so make sure we inline everything. addPass(createAMDGPUAlwaysInlinePass()); addPass(createAlwaysInlinerLegacyPass()); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index c2fd67790d9b..5a9e87deecc1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -267,6 +267,10 @@ void AMDGPUTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE, BaseT::getPeelingPreferences(L, SE, PP); } +uint64_t AMDGPUTTIImpl::getMaxMemIntrinsicInlineSizeThreshold() const { + return 1024; +} + const FeatureBitset GCNTTIImpl::InlineFeatureIgnoreList = { // Codegen control options which don't matter. AMDGPU::FeatureEnableLoadStoreOpt, AMDGPU::FeatureEnableSIScheduler, @@ -395,6 +399,10 @@ bool GCNTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace); } +uint64_t GCNTTIImpl::getMaxMemIntrinsicInlineSizeThreshold() const { + return 1024; +} + // FIXME: Really we would like to issue multiple 128-bit loads and stores per // iteration. Should we report a larger size and let it legalize? 
// diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h index 630804f169bf..27fb65154fc6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -55,6 +55,8 @@ public: void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP); + + uint64_t getMaxMemIntrinsicInlineSizeThreshold() const; }; class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> { @@ -132,6 +134,8 @@ public: unsigned AddrSpace) const; bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const; + + uint64_t getMaxMemIntrinsicInlineSizeThreshold() const; Type *getMemcpyLoopLoweringType( LLVMContext & Context, Value * Length, unsigned SrcAddrSpace, unsigned DestAddrSpace, unsigned SrcAlign, unsigned DestAlign, diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt index 02a6d1f01333..8df156f24dcb 100644 --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -67,7 +67,6 @@ add_llvm_target(AMDGPUCodeGen AMDGPULegalizerInfo.cpp AMDGPULibCalls.cpp AMDGPULibFunc.cpp - AMDGPULowerIntrinsics.cpp AMDGPULowerKernelArguments.cpp AMDGPULowerKernelAttributes.cpp AMDGPULowerModuleLDSPass.cpp diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h index 1453450656ad..f8dae8e5041a 100644 --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h @@ -210,6 +210,10 @@ public: InstructionCost getMemcpyCost(const Instruction *I); + uint64_t getMaxMemIntrinsicInlineSizeThreshold() const { + return ST->getMaxInlineSizeThreshold(); + } + int getNumMemOps(const IntrinsicInst *I) const; InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h index 2c5105d2f03f..857d95eb6583 100644 
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h @@ -273,6 +273,11 @@ public: const Function *Callee) const; bool areTypesABICompatible(const Function *Caller, const Function *Callee, const ArrayRef &Type) const; + + uint64_t getMaxMemIntrinsicInlineSizeThreshold() const { + return ST->getMaxInlineSizeThreshold(); + } + TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const; bool prefersVectorizedAddressing() const; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll index 48c1ef46c197..c7d82551530f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll @@ -59,62 +59,40 @@ define amdgpu_kernel void @kernel_caller_stack() { define amdgpu_kernel void @kernel_caller_byval() { ; MUBUF-LABEL: kernel_caller_byval: -; MUBUF: ; %bb.0: +; MUBUF: ; %bb.0: ; %loadstoreloop.preheader ; MUBUF-NEXT: s_add_u32 flat_scratch_lo, s4, s7 ; MUBUF-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 ; MUBUF-NEXT: s_add_u32 s0, s0, s7 +; MUBUF-NEXT: s_mov_b32 s5, 0 ; MUBUF-NEXT: s_addc_u32 s1, s1, 0 +; MUBUF-NEXT: s_movk_i32 s4, 0x80 ; MUBUF-NEXT: v_mov_b32_e32 v0, 0 -; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8 -; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:12 -; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:16 -; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:20 -; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:24 -; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:28 -; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:32 -; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:36 -; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:40 -; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:44 -; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 
offset:48 -; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:52 -; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:56 -; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:60 -; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:64 -; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:68 -; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:72 -; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:76 -; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:80 -; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:84 -; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:88 -; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:92 -; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:96 -; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:100 -; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:104 -; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:108 -; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:112 -; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:116 -; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:120 -; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:124 -; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:128 -; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:132 -; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 -; MUBUF-NEXT: s_nop 0 -; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:12 -; MUBUF-NEXT: buffer_load_dword v2, off, s[0:3], 0 offset:16 -; MUBUF-NEXT: buffer_load_dword v3, off, s[0:3], 0 offset:20 -; MUBUF-NEXT: buffer_load_dword v4, off, s[0:3], 0 offset:24 -; MUBUF-NEXT: buffer_load_dword v5, off, s[0:3], 0 offset:28 -; MUBUF-NEXT: buffer_load_dword v6, off, s[0:3], 0 offset:32 -; MUBUF-NEXT: buffer_load_dword v7, off, s[0:3], 0 offset:36 -; MUBUF-NEXT: buffer_load_dword v8, off, s[0:3], 0 offset:40 -; MUBUF-NEXT: buffer_load_dword v9, off, s[0:3], 0 offset:44 -; MUBUF-NEXT: 
buffer_load_dword v10, off, s[0:3], 0 offset:48 -; MUBUF-NEXT: buffer_load_dword v11, off, s[0:3], 0 offset:52 -; MUBUF-NEXT: buffer_load_dword v12, off, s[0:3], 0 offset:56 -; MUBUF-NEXT: buffer_load_dword v13, off, s[0:3], 0 offset:60 -; MUBUF-NEXT: buffer_load_dword v14, off, s[0:3], 0 offset:64 -; MUBUF-NEXT: buffer_load_dword v15, off, s[0:3], 0 offset:68 +; MUBUF-NEXT: v_mov_b32_e32 v1, s5 ; MUBUF-NEXT: s_movk_i32 s32, 0x1400 +; MUBUF-NEXT: .LBB1_1: ; %loadstoreloop +; MUBUF-NEXT: ; =>This Inner Loop Header: Depth=1 +; MUBUF-NEXT: v_add_u32_e32 v2, 4, v1 +; MUBUF-NEXT: v_add_u32_e32 v1, 1, v1 +; MUBUF-NEXT: v_cmp_gt_u32_e32 vcc, s4, v1 +; MUBUF-NEXT: buffer_store_byte v0, v2, s[0:3], 0 offen +; MUBUF-NEXT: s_cbranch_vccnz .LBB1_1 +; MUBUF-NEXT: ; %bb.2: ; %split +; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 +; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:8 +; MUBUF-NEXT: buffer_load_dword v2, off, s[0:3], 0 offset:12 +; MUBUF-NEXT: buffer_load_dword v3, off, s[0:3], 0 offset:16 +; MUBUF-NEXT: buffer_load_dword v4, off, s[0:3], 0 offset:20 +; MUBUF-NEXT: buffer_load_dword v5, off, s[0:3], 0 offset:24 +; MUBUF-NEXT: buffer_load_dword v6, off, s[0:3], 0 offset:28 +; MUBUF-NEXT: buffer_load_dword v7, off, s[0:3], 0 offset:32 +; MUBUF-NEXT: buffer_load_dword v8, off, s[0:3], 0 offset:36 +; MUBUF-NEXT: buffer_load_dword v9, off, s[0:3], 0 offset:40 +; MUBUF-NEXT: buffer_load_dword v10, off, s[0:3], 0 offset:44 +; MUBUF-NEXT: buffer_load_dword v11, off, s[0:3], 0 offset:48 +; MUBUF-NEXT: buffer_load_dword v12, off, s[0:3], 0 offset:52 +; MUBUF-NEXT: buffer_load_dword v13, off, s[0:3], 0 offset:56 +; MUBUF-NEXT: buffer_load_dword v14, off, s[0:3], 0 offset:60 +; MUBUF-NEXT: buffer_load_dword v15, off, s[0:3], 0 offset:64 ; MUBUF-NEXT: s_getpc_b64 s[4:5] ; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_byval@rel32@lo+4 ; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_byval@rel32@hi+12 @@ -154,38 +132,31 @@ define amdgpu_kernel void 
@kernel_caller_byval() { ; MUBUF-NEXT: s_endpgm ; ; FLATSCR-LABEL: kernel_caller_byval: -; FLATSCR: ; %bb.0: +; FLATSCR: ; %bb.0: ; %loadstoreloop.preheader ; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s3 -; FLATSCR-NEXT: v_mov_b32_e32 v0, 0 ; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 -; FLATSCR-NEXT: v_mov_b32_e32 v1, 0 -; FLATSCR-NEXT: s_mov_b32 s0, 0 -; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:8 -; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 -; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:24 -; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:32 -; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:40 -; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:48 -; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:56 -; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:64 -; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:72 -; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:80 -; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:88 -; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:96 -; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:104 -; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:112 -; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:120 -; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:128 -; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s0 offset:8 -; FLATSCR-NEXT: s_nop 0 -; FLATSCR-NEXT: scratch_load_dwordx2 v[2:3], off, s0 offset:16 -; FLATSCR-NEXT: scratch_load_dwordx2 v[4:5], off, s0 offset:24 -; FLATSCR-NEXT: scratch_load_dwordx2 v[6:7], off, s0 offset:32 -; FLATSCR-NEXT: scratch_load_dwordx2 v[8:9], off, s0 offset:40 -; FLATSCR-NEXT: scratch_load_dwordx2 v[10:11], off, s0 offset:48 -; FLATSCR-NEXT: scratch_load_dwordx2 v[12:13], off, s0 offset:56 -; FLATSCR-NEXT: scratch_load_dwordx2 v[14:15], off, s0 offset:64 +; FLATSCR-NEXT: s_mov_b32 s1, 0 +; FLATSCR-NEXT: 
s_movk_i32 s0, 0x80 +; FLATSCR-NEXT: v_mov_b32_e32 v0, 0 +; FLATSCR-NEXT: v_mov_b32_e32 v1, s1 ; FLATSCR-NEXT: s_movk_i32 s32, 0x50 +; FLATSCR-NEXT: .LBB1_1: ; %loadstoreloop +; FLATSCR-NEXT: ; =>This Inner Loop Header: Depth=1 +; FLATSCR-NEXT: v_add_u32_e32 v2, 4, v1 +; FLATSCR-NEXT: v_add_u32_e32 v1, 1, v1 +; FLATSCR-NEXT: v_cmp_gt_u32_e32 vcc, s0, v1 +; FLATSCR-NEXT: scratch_store_byte v2, v0, off +; FLATSCR-NEXT: s_cbranch_vccnz .LBB1_1 +; FLATSCR-NEXT: ; %bb.2: ; %split +; FLATSCR-NEXT: s_mov_b32 s0, 0 +; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s0 offset:4 +; FLATSCR-NEXT: scratch_load_dwordx2 v[2:3], off, s0 offset:12 +; FLATSCR-NEXT: scratch_load_dwordx2 v[4:5], off, s0 offset:20 +; FLATSCR-NEXT: scratch_load_dwordx2 v[6:7], off, s0 offset:28 +; FLATSCR-NEXT: scratch_load_dwordx2 v[8:9], off, s0 offset:36 +; FLATSCR-NEXT: scratch_load_dwordx2 v[10:11], off, s0 offset:44 +; FLATSCR-NEXT: scratch_load_dwordx2 v[12:13], off, s0 offset:52 +; FLATSCR-NEXT: scratch_load_dwordx2 v[14:15], off, s0 offset:60 ; FLATSCR-NEXT: s_getpc_b64 s[0:1] ; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_byval@rel32@lo+4 ; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_byval@rel32@hi+12 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.inline.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.inline.ll index 49502b343560..bf956c3ca823 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.inline.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.inline.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel -march=amdgcn -verify-machineinstrs -amdgpu-mem-intrinsic-expand-size=3 %s -o - | FileCheck -check-prefix=GCN %s -; RUN: llc -global-isel -march=amdgcn -verify-machineinstrs -amdgpu-mem-intrinsic-expand-size=5 %s -o - | FileCheck -check-prefix=GCN %s +; RUN: llc -global-isel -march=amdgcn -verify-machineinstrs -mem-intrinsic-expand-size=3 %s -o - | FileCheck 
-check-prefix=GCN %s +; RUN: llc -global-isel -march=amdgcn -verify-machineinstrs -mem-intrinsic-expand-size=5 %s -o - | FileCheck -check-prefix=GCN %s declare void @llvm.memcpy.inline.p1.p1.i32(ptr addrspace(1), ptr addrspace(1), i32, i1 immarg) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll index bb7770701a63..6b0545561351 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel -march=amdgcn -verify-machineinstrs -amdgpu-mem-intrinsic-expand-size=19 %s -o - | FileCheck -check-prefix=LOOP %s -; RUN: llc -global-isel -march=amdgcn -verify-machineinstrs -amdgpu-mem-intrinsic-expand-size=21 %s -o - | FileCheck -check-prefix=UNROLL %s +; RUN: llc -global-isel -mtriple=amdgcn-- -verify-machineinstrs -mem-intrinsic-expand-size=19 %s -o - | FileCheck -check-prefix=LOOP %s +; RUN: llc -global-isel -mtriple=amdgcn-- -verify-machineinstrs -mem-intrinsic-expand-size=21 %s -o - | FileCheck -check-prefix=UNROLL %s declare void @llvm.memcpy.p1.p1.i32(ptr addrspace(1), ptr addrspace(1), i32, i1 immarg) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memmove.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memmove.ll index c7a1e163c04a..466147cac343 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memmove.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memmove.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel -march=amdgcn -verify-machineinstrs -amdgpu-mem-intrinsic-expand-size=3 %s -o - | FileCheck -check-prefix=LOOP %s -; RUN: llc -global-isel -march=amdgcn -verify-machineinstrs -amdgpu-mem-intrinsic-expand-size=5 %s -o - | FileCheck -check-prefix=UNROLL %s +; RUN: llc -global-isel -mtriple=amdgcn-- -verify-machineinstrs 
-mem-intrinsic-expand-size=3 %s -o - | FileCheck -check-prefix=LOOP %s +; RUN: llc -global-isel -mtriple=amdgcn-- -verify-machineinstrs -mem-intrinsic-expand-size=5 %s -o - | FileCheck -check-prefix=UNROLL %s declare void @llvm.memmove.p1.p1.i32(ptr addrspace(1), ptr addrspace(1), i32, i1) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memset.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memset.ll index dd3f9f12111d..7cd3babc7090 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memset.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memset.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel -march=amdgcn -verify-machineinstrs -amdgpu-mem-intrinsic-expand-size=3 %s -o - | FileCheck -check-prefix=LOOP %s -; RUN: llc -global-isel -march=amdgcn -verify-machineinstrs -amdgpu-mem-intrinsic-expand-size=5 %s -o - | FileCheck -check-prefix=UNROLL %s +; RUN: llc -global-isel -mtriple=amdgcn-- -verify-machineinstrs -mem-intrinsic-expand-size=3 %s -o - | FileCheck -check-prefix=LOOP %s +; RUN: llc -global-isel -mtriple=amdgcn-- -verify-machineinstrs -mem-intrinsic-expand-size=5 %s -o - | FileCheck -check-prefix=UNROLL %s declare void @llvm.memset.p1.i32(ptr addrspace(1), i8, i32, i1) diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll index 20282ff2992b..36e1476f7de8 100644 --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll @@ -34,7 +34,6 @@ ; GCN-O0-NEXT: Lower ctors and dtors for AMDGPU ; GCN-O0-NEXT: FunctionPass Manager ; GCN-O0-NEXT: Early propagate attributes from kernels to functions -; GCN-O0-NEXT: AMDGPU Lower Intrinsics ; GCN-O0-NEXT: AMDGPU Inline All Functions ; GCN-O0-NEXT: Inliner for always_inline functions ; GCN-O0-NEXT: FunctionPass Manager @@ -182,7 +181,6 @@ ; GCN-O1-NEXT: Lower ctors and dtors for AMDGPU ; GCN-O1-NEXT: FunctionPass Manager ; GCN-O1-NEXT: Early propagate attributes from 
kernels to functions -; GCN-O1-NEXT: AMDGPU Lower Intrinsics ; GCN-O1-NEXT: AMDGPU Inline All Functions ; GCN-O1-NEXT: Inliner for always_inline functions ; GCN-O1-NEXT: FunctionPass Manager @@ -458,7 +456,6 @@ ; GCN-O1-OPTS-NEXT: Lower ctors and dtors for AMDGPU ; GCN-O1-OPTS-NEXT: FunctionPass Manager ; GCN-O1-OPTS-NEXT: Early propagate attributes from kernels to functions -; GCN-O1-OPTS-NEXT: AMDGPU Lower Intrinsics ; GCN-O1-OPTS-NEXT: AMDGPU Inline All Functions ; GCN-O1-OPTS-NEXT: Inliner for always_inline functions ; GCN-O1-OPTS-NEXT: FunctionPass Manager @@ -766,7 +763,6 @@ ; GCN-O2-NEXT: Lower ctors and dtors for AMDGPU ; GCN-O2-NEXT: FunctionPass Manager ; GCN-O2-NEXT: Early propagate attributes from kernels to functions -; GCN-O2-NEXT: AMDGPU Lower Intrinsics ; GCN-O2-NEXT: AMDGPU Inline All Functions ; GCN-O2-NEXT: Inliner for always_inline functions ; GCN-O2-NEXT: FunctionPass Manager @@ -1077,7 +1073,6 @@ ; GCN-O3-NEXT: Lower ctors and dtors for AMDGPU ; GCN-O3-NEXT: FunctionPass Manager ; GCN-O3-NEXT: Early propagate attributes from kernels to functions -; GCN-O3-NEXT: AMDGPU Lower Intrinsics ; GCN-O3-NEXT: AMDGPU Inline All Functions ; GCN-O3-NEXT: Inliner for always_inline functions ; GCN-O3-NEXT: FunctionPass Manager diff --git a/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics-threshold.ll b/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics-threshold.ll index cd720e93a48f..e9d42dc70cbb 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics-threshold.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics-threshold.ll @@ -1,10 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -amdgpu-lower-intrinsics -amdgpu-mem-intrinsic-expand-size=8 %s | FileCheck -check-prefix=OPT8 %s -; RUN: opt -S -amdgpu-lower-intrinsics -amdgpu-mem-intrinsic-expand-size=4 %s | FileCheck -check-prefix=OPT4 %s -; RUN: opt -S -amdgpu-lower-intrinsics -amdgpu-mem-intrinsic-expand-size=0 %s | FileCheck -check-prefix=OPT0 %s -; RUN: opt 
-S -amdgpu-lower-intrinsics -amdgpu-mem-intrinsic-expand-size=-1 %s | FileCheck -check-prefix=OPT_NEG %s +; RUN: opt -S -mtriple=amdgcn-- -pre-isel-intrinsic-lowering -mem-intrinsic-expand-size=8 %s | FileCheck -check-prefix=OPT8 %s +; RUN: opt -S -mtriple=amdgcn-- -pre-isel-intrinsic-lowering -mem-intrinsic-expand-size=4 %s | FileCheck -check-prefix=OPT4 %s +; RUN: opt -S -mtriple=amdgcn-- -pre-isel-intrinsic-lowering -mem-intrinsic-expand-size=0 %s | FileCheck -check-prefix=OPT0 %s +; RUN: opt -S -mtriple=amdgcn-- -pre-isel-intrinsic-lowering -mem-intrinsic-expand-size=-1 %s | FileCheck -check-prefix=OPT_NEG %s -; Test the -amdgpu-mem-intrinsic-expand-size flag works. +; Test the -mem-intrinsic-expand-size flag works. ; Make sure we can always eliminate the intrinsic, even at 0. define amdgpu_kernel void @memset_size_0(ptr addrspace(1) %dst, i8 %val) { @@ -17,19 +17,19 @@ define amdgpu_kernel void @memset_size_0(ptr addrspace(1) %dst, i8 %val) { ; OPT4-NEXT: ret void ; ; OPT0-LABEL: @memset_size_0( -; OPT0-NEXT: call void @llvm.memset.p1.i64(ptr addrspace(1) [[DST:%.*]], i8 [[VAL:%.*]], i64 0, i1 false) +; OPT0-NEXT: br i1 true, label [[SPLIT:%.*]], label [[LOADSTORELOOP:%.*]] +; OPT0: loadstoreloop: +; OPT0-NEXT: [[TMP1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], [[LOADSTORELOOP]] ] +; OPT0-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[TMP1]] +; OPT0-NEXT: store i8 [[VAL:%.*]], ptr addrspace(1) [[TMP2]], align 1 +; OPT0-NEXT: [[TMP3]] = add i64 [[TMP1]], 1 +; OPT0-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 0 +; OPT0-NEXT: br i1 [[TMP4]], label [[LOADSTORELOOP]], label [[SPLIT]] +; OPT0: split: ; OPT0-NEXT: ret void ; ; OPT_NEG-LABEL: @memset_size_0( -; OPT_NEG-NEXT: br i1 true, label [[SPLIT:%.*]], label [[LOADSTORELOOP:%.*]] -; OPT_NEG: loadstoreloop: -; OPT_NEG-NEXT: [[TMP1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], [[LOADSTORELOOP]] ] -; OPT_NEG-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr 
addrspace(1) [[DST:%.*]], i64 [[TMP1]] -; OPT_NEG-NEXT: store i8 [[VAL:%.*]], ptr addrspace(1) [[TMP2]], align 1 -; OPT_NEG-NEXT: [[TMP3]] = add i64 [[TMP1]], 1 -; OPT_NEG-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 0 -; OPT_NEG-NEXT: br i1 [[TMP4]], label [[LOADSTORELOOP]], label [[SPLIT]] -; OPT_NEG: split: +; OPT_NEG-NEXT: call void @llvm.memset.p1.i64(ptr addrspace(1) [[DST:%.*]], i8 [[VAL:%.*]], i64 0, i1 false) ; OPT_NEG-NEXT: ret void ; call void @llvm.memset.p1.i64(ptr addrspace(1) %dst, i8 %val, i64 0, i1 false) @@ -58,15 +58,7 @@ define amdgpu_kernel void @memset_size_4(ptr addrspace(1) %dst, i8 %val) { ; OPT0-NEXT: ret void ; ; OPT_NEG-LABEL: @memset_size_4( -; OPT_NEG-NEXT: br i1 false, label [[SPLIT:%.*]], label [[LOADSTORELOOP:%.*]] -; OPT_NEG: loadstoreloop: -; OPT_NEG-NEXT: [[TMP1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], [[LOADSTORELOOP]] ] -; OPT_NEG-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[TMP1]] -; OPT_NEG-NEXT: store i8 [[VAL:%.*]], ptr addrspace(1) [[TMP2]], align 1 -; OPT_NEG-NEXT: [[TMP3]] = add i64 [[TMP1]], 1 -; OPT_NEG-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 4 -; OPT_NEG-NEXT: br i1 [[TMP4]], label [[LOADSTORELOOP]], label [[SPLIT]] -; OPT_NEG: split: +; OPT_NEG-NEXT: call void @llvm.memset.p1.i64(ptr addrspace(1) [[DST:%.*]], i8 [[VAL:%.*]], i64 4, i1 false) ; OPT_NEG-NEXT: ret void ; call void @llvm.memset.p1.i64(ptr addrspace(1) %dst, i8 %val, i64 4, i1 false) @@ -103,15 +95,7 @@ define amdgpu_kernel void @memset_size_8(ptr addrspace(1) %dst, i8 %val) { ; OPT0-NEXT: ret void ; ; OPT_NEG-LABEL: @memset_size_8( -; OPT_NEG-NEXT: br i1 false, label [[SPLIT:%.*]], label [[LOADSTORELOOP:%.*]] -; OPT_NEG: loadstoreloop: -; OPT_NEG-NEXT: [[TMP1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], [[LOADSTORELOOP]] ] -; OPT_NEG-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[TMP1]] -; OPT_NEG-NEXT: store i8 [[VAL:%.*]], ptr addrspace(1) [[TMP2]], align 
1 -; OPT_NEG-NEXT: [[TMP3]] = add i64 [[TMP1]], 1 -; OPT_NEG-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 8 -; OPT_NEG-NEXT: br i1 [[TMP4]], label [[LOADSTORELOOP]], label [[SPLIT]] -; OPT_NEG: split: +; OPT_NEG-NEXT: call void @llvm.memset.p1.i64(ptr addrspace(1) [[DST:%.*]], i8 [[VAL:%.*]], i64 8, i1 false) ; OPT_NEG-NEXT: ret void ; call void @llvm.memset.p1.i64(ptr addrspace(1) %dst, i8 %val, i64 8, i1 false) diff --git a/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll index d9891228e6e2..78280a971c35 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-lower-intrinsics -amdgpu-mem-intrinsic-expand-size=1024 %s | FileCheck -check-prefixes=OPT,MAX1024 %s -; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-lower-intrinsics -amdgpu-mem-intrinsic-expand-size=-1 %s | FileCheck -check-prefixes=OPT,ALL %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -pre-isel-intrinsic-lowering -mem-intrinsic-expand-size=1024 %s | FileCheck -check-prefixes=OPT,MAX1024 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -pre-isel-intrinsic-lowering -mem-intrinsic-expand-size=0 %s | FileCheck -check-prefixes=OPT,ALL %s declare void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) nocapture, ptr addrspace(1) nocapture readonly, i64, i1) #1 declare void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) nocapture, ptr addrspace(3) nocapture readonly, i32, i1) #1 diff --git a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-lds.ll b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-lds.ll index 1ca3e8f67eab..48115f9e405c 100644 --- a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-lds.ll +++ b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-lds.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -march=amdgcn -mcpu=gfx1102 
-mattr=+wavefrontsize32 -misched=gcn-max-occupancy -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK +; RUN: opt -S -passes=always-inline -o %t.bc %s +; RUN: llc -march=amdgcn -mcpu=gfx1102 -mattr=+wavefrontsize32 -misched=gcn-max-occupancy -verify-machineinstrs < %t.bc | FileCheck %s --check-prefixes=CHECK ; Provide a long sequence of 32 vec4 load/store pairs that ought to be fully ; overlapped for latency hiding. Doing so requires using (at least) 128 VGPRs, diff --git a/llvm/test/CodeGen/AMDGPU/stack-size-overflow.ll b/llvm/test/CodeGen/AMDGPU/stack-size-overflow.ll index 6969811f672d..b1a939d7aa99 100644 --- a/llvm/test/CodeGen/AMDGPU/stack-size-overflow.ll +++ b/llvm/test/CodeGen/AMDGPU/stack-size-overflow.ll @@ -1,5 +1,5 @@ -; RUN: not llc -march=amdgcn < %s 2>&1 | FileCheck -check-prefix=ERROR %s -; RUN: not llc -march=amdgcn < %s | FileCheck -check-prefix=GCN %s +; RUN: not llc -mtriple=amdgcn-- < %s 2>&1 | FileCheck -check-prefix=ERROR %s +; RUN: not llc -mtriple=amdgcn-- < %s | FileCheck -check-prefix=GCN %s declare void @llvm.memset.p5.i32(ptr addrspace(5) nocapture, i8, i32, i32, i1) #1 diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn index 14cd3a5880e7..36af931b713a 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn @@ -153,7 +153,6 @@ static_library("LLVMAMDGPUCodeGen") { "AMDGPULegalizerInfo.cpp", "AMDGPULibCalls.cpp", "AMDGPULibFunc.cpp", - "AMDGPULowerIntrinsics.cpp", "AMDGPULowerKernelArguments.cpp", "AMDGPULowerKernelAttributes.cpp", "AMDGPULowerModuleLDSPass.cpp",