CodeGen: Expand memory intrinsics in PreISelIntrinsicLowering
Expand large or unknown size memory intrinsics into loops in the default lowering pipeline if the target doesn't have the corresponding libfunc. Previously AMDGPU had a custom pass which existed to call the expansion utilities.

With a default no-libcall option, we can remove the libfunc checks in LoopIdiomRecognize for these, which never made any sense. This also provides a path to lifting the immarg restriction on llvm.memcpy.inline.

There seems to be a bug where TLI reports functions as available if you use -march and not -mtriple.
This commit is contained in:
parent 2e16df352c
commit 3c848194f2
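The heart of the change, distilled: a mem intrinsic call is turned into an explicit IR loop when its size is unknown or above a TTI-provided cutoff, unless the target reports the matching libfunc. Below is a minimal C++ sketch of that decision for the memcpy case only, using APIs that appear in this diff; `maybeExpandMemCpy` is an illustrative name, and the `-mem-intrinsic-expand-size` command-line override the patch adds is omitted here.

```cpp
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Transforms/Utils/LowerMemIntrinsics.h"

using namespace llvm;

// Sketch of the per-call decision this commit installs in the default
// lowering pipeline (memcpy only; memmove/memset are analogous).
static bool maybeExpandMemCpy(MemCpyInst *MC, const TargetTransformInfo &TTI,
                              const TargetLibraryInfo &TLI) {
  uint64_t Threshold = TTI.getMaxMemIntrinsicInlineSizeThreshold();
  if (auto *Len = dyn_cast<ConstantInt>(MC->getLength())) {
    // Known sizes at or below the cutoff stay as intrinsics; a cutoff of 0
    // forces expansion of every fixed-size call.
    if (Threshold != 0 && Len->getZExtValue() <= Threshold)
      return false;
  }
  // Prefer the library call when the target actually provides memcpy.
  if (TLI.has(LibFunc_memcpy))
    return false;
  expandMemCpyAsLoop(MC, TTI); // emit an explicit load/store loop
  MC->eraseFromParent();
  return true;
}
```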
@@ -268,14 +268,12 @@ Changes to the C API

* ``LLVMConstSelect``

Changes to the FastISel infrastructure
--------------------------------------

* ...

Changes to the DAG infrastructure
---------------------------------

Changes to the CodeGen infrastructure
-------------------------------------

* ``llvm.memcpy``, ``llvm.memmove`` and ``llvm.memset`` are now
  expanded into loops by default for targets which do not report that the
  corresponding library function is available.

Changes to the Metadata Info
---------------------------------
@@ -356,6 +356,10 @@ public:
  /// source/destination type and alignment and the number of bytes copied.
  InstructionCost getMemcpyCost(const Instruction *I) const;

  /// Returns the maximum memset / memcpy size in bytes that still makes it
  /// profitable to inline the call.
  uint64_t getMaxMemIntrinsicInlineSizeThreshold() const;

  /// \return The estimated number of case clusters when lowering \p 'SI'.
  /// \p JTSize Set a jump table size only when \p SI is suitable for a jump
  /// table.
@@ -1673,6 +1677,7 @@ public:
  virtual unsigned adjustInliningThreshold(const CallBase *CB) = 0;
  virtual int getInlinerVectorBonusPercent() const = 0;
  virtual InstructionCost getMemcpyCost(const Instruction *I) = 0;
  virtual uint64_t getMaxMemIntrinsicInlineSizeThreshold() const = 0;
  virtual unsigned
  getEstimatedNumberOfCaseClusters(const SwitchInst &SI, unsigned &JTSize,
                                   ProfileSummaryInfo *PSI,
@@ -2044,6 +2049,11 @@ public:
  InstructionCost getMemcpyCost(const Instruction *I) override {
    return Impl.getMemcpyCost(I);
  }

  uint64_t getMaxMemIntrinsicInlineSizeThreshold() const override {
    return Impl.getMaxMemIntrinsicInlineSizeThreshold();
  }

  InstructionCost getInstructionCost(const User *U,
                                     ArrayRef<const Value *> Operands,
                                     TargetCostKind CostKind) override {

@@ -77,6 +77,10 @@ public:
    return TTI::TCC_Expensive;
  }

  uint64_t getMaxMemIntrinsicInlineSizeThreshold() const {
    return 64;
  }

  // Although this default value is arbitrary, it is not random. It is assumed
  // that a condition that evaluates the same way by a higher percentage than
  // this is best represented as control flow. Therefore, the default value N

@@ -1035,6 +1035,10 @@ InstructionCost TargetTransformInfo::getMemcpyCost(const Instruction *I) const {
  return Cost;
}

uint64_t TargetTransformInfo::getMaxMemIntrinsicInlineSizeThreshold() const {
  return TTIImpl->getMaxMemIntrinsicInlineSizeThreshold();
}

InstructionCost TargetTransformInfo::getArithmeticReductionCost(
    unsigned Opcode, VectorType *Ty, std::optional<FastMathFlags> FMF,
    TTI::TargetCostKind CostKind) const {
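The default of 64 bytes above, combined with the sentinel handling added in PreISelIntrinsicLowering.cpp below, yields a simple rule: unknown sizes always expand, a threshold of 0 forces expansion, and otherwise only sizes strictly above the threshold expand. A standalone sketch of just that arithmetic (no LLVM dependency; names are illustrative):

```cpp
#include <cstdint>
#include <cstdio>
#include <optional>

// Mirrors shouldExpandMemIntrinsicWithSize below; nullopt models a
// non-constant length operand.
static bool shouldExpand(std::optional<uint64_t> Size, uint64_t Threshold) {
  if (!Size)
    return true; // dynamic length: always expand in IR
  return *Size > Threshold || Threshold == 0;
}

int main() {
  // With the default cutoff of 64: a 4-byte call stays, a 128-byte call
  // expands, and a cutoff of 0 expands even a size-0 call.
  std::printf("%d %d %d\n", shouldExpand(4, 64), shouldExpand(128, 64),
              shouldExpand(0, 0)); // prints: 0 1 1
}
```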
@@ -6,14 +6,16 @@
//
//===----------------------------------------------------------------------===//
//
// This pass implements IR lowering for the llvm.load.relative and llvm.objc.*
// intrinsics.
// This pass implements IR lowering for the llvm.memcpy, llvm.memmove,
// llvm.memset, llvm.load.relative and llvm.objc.* intrinsics.
//
//===----------------------------------------------------------------------===//

#include "llvm/CodeGen/PreISelIntrinsicLowering.h"
#include "llvm/Analysis/ObjCARCInstKind.h"
#include "llvm/Analysis/ObjCARCUtil.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
@@ -24,9 +26,44 @@
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Transforms/Utils/LowerMemIntrinsics.h"

using namespace llvm;

/// Threshold to leave statically sized memory intrinsic calls. Calls of known
/// size larger than this (and calls of unknown size) will be expanded by the
/// pass. Calls of lower known size will be left for expansion in codegen.
static cl::opt<int64_t> MemIntrinsicExpandSizeThresholdOpt(
    "mem-intrinsic-expand-size",
    cl::desc("Set minimum mem intrinsic size to expand in IR"), cl::init(-1),
    cl::Hidden);

namespace {

struct PreISelIntrinsicLowering {
  const function_ref<TargetTransformInfo &(Function &)> LookupTTI;
  const function_ref<TargetLibraryInfo &(Function &)> LookupLibInfo;

  /// If this is true, assume it's preferable to leave memory intrinsic calls
  /// for replacement with a library call later. Otherwise this depends on
  /// TargetLibraryInfo availability of the corresponding function.
  const bool UseMemIntrinsicLibFunc;

  explicit PreISelIntrinsicLowering(
      function_ref<TargetTransformInfo &(Function &)> LookupTTI_,
      function_ref<TargetLibraryInfo &(Function &)> LookupLibInfo_,
      bool UseMemIntrinsicLibFunc_ = true)
      : LookupTTI(LookupTTI_), LookupLibInfo(LookupLibInfo_),
        UseMemIntrinsicLibFunc(UseMemIntrinsicLibFunc_) {}

  static bool shouldExpandMemIntrinsicWithSize(Value *Size,
                                               const TargetTransformInfo &TTI);
  bool expandMemIntrinsicUses(Function &F) const;
  bool lowerIntrinsics(Module &M) const;
};

} // namespace

static bool lowerLoadRelative(Function &F) {
  if (F.use_empty())
    return false;
@@ -133,12 +170,100 @@ static bool lowerObjCCall(Function &F, const char *NewFn,
  return true;
}

static bool lowerIntrinsics(Module &M) {
// TODO: Should refine based on estimated number of accesses (e.g. does it
// require splitting based on alignment)
bool PreISelIntrinsicLowering::shouldExpandMemIntrinsicWithSize(
    Value *Size, const TargetTransformInfo &TTI) {
  ConstantInt *CI = dyn_cast<ConstantInt>(Size);
  if (!CI)
    return true;
  uint64_t Threshold = MemIntrinsicExpandSizeThresholdOpt.getNumOccurrences()
                           ? MemIntrinsicExpandSizeThresholdOpt
                           : TTI.getMaxMemIntrinsicInlineSizeThreshold();
  uint64_t SizeVal = CI->getZExtValue();

  // Treat a threshold of 0 as a special case to force expansion of all
  // intrinsics, including size 0.
  return SizeVal > Threshold || Threshold == 0;
}

// TODO: Handle atomic memcpy and memcpy.inline
// TODO: Pass ScalarEvolution
bool PreISelIntrinsicLowering::expandMemIntrinsicUses(Function &F) const {
  Intrinsic::ID ID = F.getIntrinsicID();
  bool Changed = false;

  for (User *U : llvm::make_early_inc_range(F.users())) {
    Instruction *Inst = cast<Instruction>(U);

    switch (ID) {
    case Intrinsic::memcpy: {
      auto *Memcpy = cast<MemCpyInst>(Inst);
      Function *ParentFunc = Memcpy->getFunction();
      const TargetTransformInfo &TTI = LookupTTI(*ParentFunc);
      if (shouldExpandMemIntrinsicWithSize(Memcpy->getLength(), TTI)) {
        if (UseMemIntrinsicLibFunc &&
            LookupLibInfo(*ParentFunc).has(LibFunc_memcpy))
          break;

        expandMemCpyAsLoop(Memcpy, TTI);
        Changed = true;
        Memcpy->eraseFromParent();
      }

      break;
    }
    case Intrinsic::memmove: {
      auto *Memmove = cast<MemMoveInst>(Inst);
      Function *ParentFunc = Memmove->getFunction();
      const TargetTransformInfo &TTI = LookupTTI(*ParentFunc);
      if (shouldExpandMemIntrinsicWithSize(Memmove->getLength(), TTI)) {
        if (UseMemIntrinsicLibFunc &&
            LookupLibInfo(*ParentFunc).has(LibFunc_memmove))
          break;

        expandMemMoveAsLoop(Memmove);
        Changed = true;
        Memmove->eraseFromParent();
      }

      break;
    }
    case Intrinsic::memset: {
      auto *Memset = cast<MemSetInst>(Inst);
      Function *ParentFunc = Memset->getFunction();
      const TargetTransformInfo &TTI = LookupTTI(*ParentFunc);
      if (shouldExpandMemIntrinsicWithSize(Memset->getLength(), TTI)) {
        if (UseMemIntrinsicLibFunc &&
            LookupLibInfo(*Memset->getFunction()).has(LibFunc_memset))
          break;

        expandMemSetAsLoop(Memset);
        Changed = true;
        Memset->eraseFromParent();
      }

      break;
    }
    default:
      llvm_unreachable("unhandled intrinsic");
    }
  }

  return Changed;
}

bool PreISelIntrinsicLowering::lowerIntrinsics(Module &M) const {
  bool Changed = false;
  for (Function &F : M) {
    switch (F.getIntrinsicID()) {
    default:
      break;
    case Intrinsic::memcpy:
    case Intrinsic::memmove:
    case Intrinsic::memset:
      Changed |= expandMemIntrinsicUses(F);
      break;
    case Intrinsic::load_relative:
      Changed |= lowerLoadRelative(F);
      break;
@@ -230,7 +355,23 @@ public:

  PreISelIntrinsicLoweringLegacyPass() : ModulePass(ID) {}

  bool runOnModule(Module &M) override { return lowerIntrinsics(M); }
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<TargetLibraryInfoWrapperPass>();
    AU.addRequired<TargetTransformInfoWrapperPass>();
  }

  bool runOnModule(Module &M) override {
    auto LookupTTI = [this](Function &F) -> TargetTransformInfo & {
      return this->getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
    };

    auto LookupTLI = [this](Function &F) -> TargetLibraryInfo & {
      return this->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
    };

    PreISelIntrinsicLowering Lowering(LookupTTI, LookupTLI);
    return Lowering.lowerIntrinsics(M);
  }
};

} // end anonymous namespace
@@ -247,7 +388,18 @@ ModulePass *llvm::createPreISelIntrinsicLoweringPass() {

PreservedAnalyses PreISelIntrinsicLoweringPass::run(Module &M,
                                                    ModuleAnalysisManager &AM) {
  if (!lowerIntrinsics(M))
  auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();

  auto LookupTLI = [&FAM](Function &F) -> TargetLibraryInfo & {
    return FAM.getResult<TargetLibraryAnalysis>(F);
  };

  auto LookupTTI = [&FAM](Function &F) -> TargetTransformInfo & {
    return FAM.getResult<TargetIRAnalysis>(F);
  };

  PreISelIntrinsicLowering Lowering(LookupTTI, LookupTLI);
  if (!Lowering.lowerIntrinsics(M))
    return PreservedAnalyses::all();
  else
    return PreservedAnalyses::none();
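Before the pass-manager wiring below, a hedged sketch of what the reworked new-PM entry point now expects from its caller: the module proxy must be able to hand back per-function TargetIRAnalysis and TargetLibraryAnalysis results. Standard PassBuilder boilerplate satisfies that; this is not part of the patch, and `runLowering` is an illustrative name.

```cpp
#include "llvm/CodeGen/PreISelIntrinsicLowering.h"
#include "llvm/IR/Module.h"
#include "llvm/Passes/PassBuilder.h"

using namespace llvm;

// Run the module pass standalone; the cross-registered proxies let its FAM
// lookups (TargetIRAnalysis, TargetLibraryAnalysis) succeed.
static void runLowering(Module &M) {
  PassBuilder PB;
  LoopAnalysisManager LAM;
  FunctionAnalysisManager FAM;
  CGSCCAnalysisManager CGAM;
  ModuleAnalysisManager MAM;
  PB.registerModuleAnalyses(MAM);
  PB.registerCGSCCAnalyses(CGAM);
  PB.registerFunctionAnalyses(FAM);
  PB.registerLoopAnalyses(LAM);
  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

  ModulePassManager MPM;
  MPM.addPass(PreISelIntrinsicLoweringPass());
  MPM.run(M, MAM);
}
```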
@@ -1088,8 +1088,8 @@ bool TargetPassConfig::addISelPasses() {
  if (TM->useEmulatedTLS())
    addPass(createLowerEmuTLSPass());

  addPass(createPreISelIntrinsicLoweringPass());
  PM->add(createTargetTransformInfoWrapperPass(TM->getTargetIRAnalysis()));
  addPass(createPreISelIntrinsicLoweringPass());
  addPass(createExpandLargeDivRemPass());
  addPass(createExpandLargeFpConvertPass());
  addIRPasses();
@@ -90,10 +90,6 @@ FunctionPass *createAMDGPUAtomicOptimizerPass(ScanOptions ScanStrategy);
void initializeAMDGPUAtomicOptimizerPass(PassRegistry &);
extern char &AMDGPUAtomicOptimizerID;

ModulePass *createAMDGPULowerIntrinsicsPass();
void initializeAMDGPULowerIntrinsicsPass(PassRegistry &);
extern char &AMDGPULowerIntrinsicsID;

ModulePass *createAMDGPUCtorDtorLoweringLegacyPass();
void initializeAMDGPUCtorDtorLoweringLegacyPass(PassRegistry &);
extern char &AMDGPUCtorDtorLoweringLegacyPassID;
@@ -1,144 +0,0 @@
//===-- AMDGPULowerIntrinsics.cpp -----------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Utils/LowerMemIntrinsics.h"

#define DEBUG_TYPE "amdgpu-lower-intrinsics"

using namespace llvm;

namespace {

static int MaxStaticSize;

static cl::opt<int, true> MemIntrinsicExpandSizeThresholdOpt(
    "amdgpu-mem-intrinsic-expand-size",
    cl::desc("Set minimum mem intrinsic size to expand in IR"),
    cl::location(MaxStaticSize),
    cl::init(1024),
    cl::Hidden);

class AMDGPULowerIntrinsics : public ModulePass {
public:
  static char ID;

  AMDGPULowerIntrinsics() : ModulePass(ID) {}

  bool runOnModule(Module &M) override;
  bool expandMemIntrinsicUses(Function &F);
  StringRef getPassName() const override {
    return "AMDGPU Lower Intrinsics";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<TargetTransformInfoWrapperPass>();
  }
};

}

char AMDGPULowerIntrinsics::ID = 0;

char &llvm::AMDGPULowerIntrinsicsID = AMDGPULowerIntrinsics::ID;

INITIALIZE_PASS(AMDGPULowerIntrinsics, DEBUG_TYPE, "Lower intrinsics", false,
                false)

// TODO: Should refine based on estimated number of accesses (e.g. does it
// require splitting based on alignment)
static bool shouldExpandOperationWithSize(Value *Size) {
  ConstantInt *CI = dyn_cast<ConstantInt>(Size);
  return !CI || (CI->getSExtValue() > MaxStaticSize);
}

bool AMDGPULowerIntrinsics::expandMemIntrinsicUses(Function &F) {
  Intrinsic::ID ID = F.getIntrinsicID();
  bool Changed = false;

  for (User *U : llvm::make_early_inc_range(F.users())) {
    Instruction *Inst = cast<Instruction>(U);

    switch (ID) {
    case Intrinsic::memcpy: {
      auto *Memcpy = cast<MemCpyInst>(Inst);
      if (shouldExpandOperationWithSize(Memcpy->getLength())) {
        Function *ParentFunc = Memcpy->getParent()->getParent();
        const TargetTransformInfo &TTI =
            getAnalysis<TargetTransformInfoWrapperPass>().getTTI(*ParentFunc);
        expandMemCpyAsLoop(Memcpy, TTI);
        Changed = true;
        Memcpy->eraseFromParent();
      }

      break;
    }
    case Intrinsic::memmove: {
      auto *Memmove = cast<MemMoveInst>(Inst);
      if (shouldExpandOperationWithSize(Memmove->getLength())) {
        expandMemMoveAsLoop(Memmove);
        Changed = true;
        Memmove->eraseFromParent();
      }

      break;
    }
    case Intrinsic::memset: {
      auto *Memset = cast<MemSetInst>(Inst);
      if (shouldExpandOperationWithSize(Memset->getLength())) {
        expandMemSetAsLoop(Memset);
        Changed = true;
        Memset->eraseFromParent();
      }

      break;
    }
    default:
      break;
    }
  }

  return Changed;
}

bool AMDGPULowerIntrinsics::runOnModule(Module &M) {
  bool Changed = false;

  for (Function &F : M) {
    if (!F.isDeclaration())
      continue;

    switch (F.getIntrinsicID()) {
    case Intrinsic::memcpy:
    case Intrinsic::memmove:
    case Intrinsic::memset:
      if (expandMemIntrinsicUses(F))
        Changed = true;
      break;
    default:
      break;
    }
  }

  return Changed;
}

ModulePass *llvm::createAMDGPULowerIntrinsicsPass() {
  return new AMDGPULowerIntrinsics();
}
@@ -388,7 +388,6 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
  initializeAMDGPULowerKernelArgumentsPass(*PR);
  initializeAMDGPUPromoteKernelArgumentsPass(*PR);
  initializeAMDGPULowerKernelAttributesPass(*PR);
  initializeAMDGPULowerIntrinsicsPass(*PR);
  initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR);
  initializeAMDGPUPostLegalizerCombinerPass(*PR);
  initializeAMDGPUPreLegalizerCombinerPass(*PR);
@@ -998,8 +997,6 @@ void AMDGPUPassConfig::addIRPasses() {
  // A call to propagate attributes pass in the backend in case opt was not run.
  addPass(createAMDGPUPropagateAttributesEarlyPass(&TM));

  addPass(createAMDGPULowerIntrinsicsPass());

  // Function calls are not supported, so make sure we inline everything.
  addPass(createAMDGPUAlwaysInlinePass());
  addPass(createAlwaysInlinerLegacyPass());
@@ -267,6 +267,10 @@ void AMDGPUTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
  BaseT::getPeelingPreferences(L, SE, PP);
}

int64_t AMDGPUTTIImpl::getMaxInlineSizeThreshold() const {
  return 1024;
}

const FeatureBitset GCNTTIImpl::InlineFeatureIgnoreList = {
    // Codegen control options which don't matter.
    AMDGPU::FeatureEnableLoadStoreOpt, AMDGPU::FeatureEnableSIScheduler,
@@ -395,6 +399,10 @@ bool GCNTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}

int64_t GCNTTIImpl::getMaxInlineSizeThreshold() const {
  return 1024;
}

// FIXME: Really we would like to issue multiple 128-bit loads and stores per
// iteration. Should we report a larger size and let it legalize?
//
@@ -55,6 +55,8 @@ public:

  void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                             TTI::PeelingPreferences &PP);

  int64_t getMaxInlineSizeThreshold() const;
};

class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
@@ -132,6 +134,8 @@ public:
                                   unsigned AddrSpace) const;
  bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment,
                                    unsigned AddrSpace) const;

  int64_t getMaxInlineSizeThreshold() const;
  Type *getMemcpyLoopLoweringType(
      LLVMContext &Context, Value *Length, unsigned SrcAddrSpace,
      unsigned DestAddrSpace, unsigned SrcAlign, unsigned DestAlign,
@@ -67,7 +67,6 @@ add_llvm_target(AMDGPUCodeGen
  AMDGPULegalizerInfo.cpp
  AMDGPULibCalls.cpp
  AMDGPULibFunc.cpp
  AMDGPULowerIntrinsics.cpp
  AMDGPULowerKernelArguments.cpp
  AMDGPULowerKernelAttributes.cpp
  AMDGPULowerModuleLDSPass.cpp
@@ -210,6 +210,10 @@ public:

  InstructionCost getMemcpyCost(const Instruction *I);

  uint64_t getMaxMemIntrinsicInlineSizeThreshold() const {
    return ST->getMaxInlineSizeThreshold();
  }

  int getNumMemOps(const IntrinsicInst *I) const;

  InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
@@ -273,6 +273,11 @@ public:
                          const Function *Callee) const;
  bool areTypesABICompatible(const Function *Caller, const Function *Callee,
                             const ArrayRef<Type *> &Type) const;

  uint64_t getMaxMemIntrinsicInlineSizeThreshold() const {
    return ST->getMaxInlineSizeThreshold();
  }

  TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize,
                                                    bool IsZeroCmp) const;
  bool prefersVectorizedAddressing() const;
@@ -59,62 +59,40 @@ define amdgpu_kernel void @kernel_caller_stack() {

define amdgpu_kernel void @kernel_caller_byval() {
; MUBUF-LABEL: kernel_caller_byval:
; MUBUF: ; %bb.0:
; MUBUF: ; %bb.0: ; %loadstoreloop.preheader
; MUBUF-NEXT: s_add_u32 flat_scratch_lo, s4, s7
; MUBUF-NEXT: s_addc_u32 flat_scratch_hi, s5, 0
; MUBUF-NEXT: s_add_u32 s0, s0, s7
; MUBUF-NEXT: s_mov_b32 s5, 0
; MUBUF-NEXT: s_addc_u32 s1, s1, 0
; MUBUF-NEXT: s_movk_i32 s4, 0x80
; MUBUF-NEXT: v_mov_b32_e32 v0, 0
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:12
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:16
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:20
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:24
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:28
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:32
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:36
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:40
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:44
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:48
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:52
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:56
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:60
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:64
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:68
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:72
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:76
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:80
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:84
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:88
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:92
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:96
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:100
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:104
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:108
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:112
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:116
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:120
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:124
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:128
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:132
; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8
; MUBUF-NEXT: s_nop 0
; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:12
; MUBUF-NEXT: buffer_load_dword v2, off, s[0:3], 0 offset:16
; MUBUF-NEXT: buffer_load_dword v3, off, s[0:3], 0 offset:20
; MUBUF-NEXT: buffer_load_dword v4, off, s[0:3], 0 offset:24
; MUBUF-NEXT: buffer_load_dword v5, off, s[0:3], 0 offset:28
; MUBUF-NEXT: buffer_load_dword v6, off, s[0:3], 0 offset:32
; MUBUF-NEXT: buffer_load_dword v7, off, s[0:3], 0 offset:36
; MUBUF-NEXT: buffer_load_dword v8, off, s[0:3], 0 offset:40
; MUBUF-NEXT: buffer_load_dword v9, off, s[0:3], 0 offset:44
; MUBUF-NEXT: buffer_load_dword v10, off, s[0:3], 0 offset:48
; MUBUF-NEXT: buffer_load_dword v11, off, s[0:3], 0 offset:52
; MUBUF-NEXT: buffer_load_dword v12, off, s[0:3], 0 offset:56
; MUBUF-NEXT: buffer_load_dword v13, off, s[0:3], 0 offset:60
; MUBUF-NEXT: buffer_load_dword v14, off, s[0:3], 0 offset:64
; MUBUF-NEXT: buffer_load_dword v15, off, s[0:3], 0 offset:68
; MUBUF-NEXT: v_mov_b32_e32 v1, s5
; MUBUF-NEXT: s_movk_i32 s32, 0x1400
; MUBUF-NEXT: .LBB1_1: ; %loadstoreloop
; MUBUF-NEXT: ; =>This Inner Loop Header: Depth=1
; MUBUF-NEXT: v_add_u32_e32 v2, 4, v1
; MUBUF-NEXT: v_add_u32_e32 v1, 1, v1
; MUBUF-NEXT: v_cmp_gt_u32_e32 vcc, s4, v1
; MUBUF-NEXT: buffer_store_byte v0, v2, s[0:3], 0 offen
; MUBUF-NEXT: s_cbranch_vccnz .LBB1_1
; MUBUF-NEXT: ; %bb.2: ; %split
; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4
; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:8
; MUBUF-NEXT: buffer_load_dword v2, off, s[0:3], 0 offset:12
; MUBUF-NEXT: buffer_load_dword v3, off, s[0:3], 0 offset:16
; MUBUF-NEXT: buffer_load_dword v4, off, s[0:3], 0 offset:20
; MUBUF-NEXT: buffer_load_dword v5, off, s[0:3], 0 offset:24
; MUBUF-NEXT: buffer_load_dword v6, off, s[0:3], 0 offset:28
; MUBUF-NEXT: buffer_load_dword v7, off, s[0:3], 0 offset:32
; MUBUF-NEXT: buffer_load_dword v8, off, s[0:3], 0 offset:36
; MUBUF-NEXT: buffer_load_dword v9, off, s[0:3], 0 offset:40
; MUBUF-NEXT: buffer_load_dword v10, off, s[0:3], 0 offset:44
; MUBUF-NEXT: buffer_load_dword v11, off, s[0:3], 0 offset:48
; MUBUF-NEXT: buffer_load_dword v12, off, s[0:3], 0 offset:52
; MUBUF-NEXT: buffer_load_dword v13, off, s[0:3], 0 offset:56
; MUBUF-NEXT: buffer_load_dword v14, off, s[0:3], 0 offset:60
; MUBUF-NEXT: buffer_load_dword v15, off, s[0:3], 0 offset:64
; MUBUF-NEXT: s_getpc_b64 s[4:5]
; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_byval@rel32@lo+4
; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_byval@rel32@hi+12
@@ -154,38 +132,31 @@ define amdgpu_kernel void @kernel_caller_byval() {
; MUBUF-NEXT: s_endpgm
;
; FLATSCR-LABEL: kernel_caller_byval:
; FLATSCR: ; %bb.0:
; FLATSCR: ; %bb.0: ; %loadstoreloop.preheader
; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s3
; FLATSCR-NEXT: v_mov_b32_e32 v0, 0
; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
; FLATSCR-NEXT: v_mov_b32_e32 v1, 0
; FLATSCR-NEXT: s_mov_b32 s0, 0
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:8
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:24
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:32
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:40
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:48
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:56
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:64
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:72
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:80
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:88
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:96
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:104
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:112
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:120
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:128
; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s0 offset:8
; FLATSCR-NEXT: s_nop 0
; FLATSCR-NEXT: scratch_load_dwordx2 v[2:3], off, s0 offset:16
; FLATSCR-NEXT: scratch_load_dwordx2 v[4:5], off, s0 offset:24
; FLATSCR-NEXT: scratch_load_dwordx2 v[6:7], off, s0 offset:32
; FLATSCR-NEXT: scratch_load_dwordx2 v[8:9], off, s0 offset:40
; FLATSCR-NEXT: scratch_load_dwordx2 v[10:11], off, s0 offset:48
; FLATSCR-NEXT: scratch_load_dwordx2 v[12:13], off, s0 offset:56
; FLATSCR-NEXT: scratch_load_dwordx2 v[14:15], off, s0 offset:64
; FLATSCR-NEXT: s_mov_b32 s1, 0
; FLATSCR-NEXT: s_movk_i32 s0, 0x80
; FLATSCR-NEXT: v_mov_b32_e32 v0, 0
; FLATSCR-NEXT: v_mov_b32_e32 v1, s1
; FLATSCR-NEXT: s_movk_i32 s32, 0x50
; FLATSCR-NEXT: .LBB1_1: ; %loadstoreloop
; FLATSCR-NEXT: ; =>This Inner Loop Header: Depth=1
; FLATSCR-NEXT: v_add_u32_e32 v2, 4, v1
; FLATSCR-NEXT: v_add_u32_e32 v1, 1, v1
; FLATSCR-NEXT: v_cmp_gt_u32_e32 vcc, s0, v1
; FLATSCR-NEXT: scratch_store_byte v2, v0, off
; FLATSCR-NEXT: s_cbranch_vccnz .LBB1_1
; FLATSCR-NEXT: ; %bb.2: ; %split
; FLATSCR-NEXT: s_mov_b32 s0, 0
; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s0 offset:4
; FLATSCR-NEXT: scratch_load_dwordx2 v[2:3], off, s0 offset:12
; FLATSCR-NEXT: scratch_load_dwordx2 v[4:5], off, s0 offset:20
; FLATSCR-NEXT: scratch_load_dwordx2 v[6:7], off, s0 offset:28
; FLATSCR-NEXT: scratch_load_dwordx2 v[8:9], off, s0 offset:36
; FLATSCR-NEXT: scratch_load_dwordx2 v[10:11], off, s0 offset:44
; FLATSCR-NEXT: scratch_load_dwordx2 v[12:13], off, s0 offset:52
; FLATSCR-NEXT: scratch_load_dwordx2 v[14:15], off, s0 offset:60
; FLATSCR-NEXT: s_getpc_b64 s[0:1]
; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_byval@rel32@lo+4
; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_byval@rel32@hi+12
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -march=amdgcn -verify-machineinstrs -amdgpu-mem-intrinsic-expand-size=3 %s -o - | FileCheck -check-prefix=GCN %s
; RUN: llc -global-isel -march=amdgcn -verify-machineinstrs -amdgpu-mem-intrinsic-expand-size=5 %s -o - | FileCheck -check-prefix=GCN %s
; RUN: llc -global-isel -march=amdgcn -verify-machineinstrs -mem-intrinsic-expand-size=3 %s -o - | FileCheck -check-prefix=GCN %s
; RUN: llc -global-isel -march=amdgcn -verify-machineinstrs -mem-intrinsic-expand-size=5 %s -o - | FileCheck -check-prefix=GCN %s

declare void @llvm.memcpy.inline.p1.p1.i32(ptr addrspace(1), ptr addrspace(1), i32, i1 immarg)

@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -march=amdgcn -verify-machineinstrs -amdgpu-mem-intrinsic-expand-size=19 %s -o - | FileCheck -check-prefix=LOOP %s
; RUN: llc -global-isel -march=amdgcn -verify-machineinstrs -amdgpu-mem-intrinsic-expand-size=21 %s -o - | FileCheck -check-prefix=UNROLL %s
; RUN: llc -global-isel -mtriple=amdgcn-- -verify-machineinstrs -mem-intrinsic-expand-size=19 %s -o - | FileCheck -check-prefix=LOOP %s
; RUN: llc -global-isel -mtriple=amdgcn-- -verify-machineinstrs -mem-intrinsic-expand-size=21 %s -o - | FileCheck -check-prefix=UNROLL %s

declare void @llvm.memcpy.p1.p1.i32(ptr addrspace(1), ptr addrspace(1), i32, i1 immarg)

@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -march=amdgcn -verify-machineinstrs -amdgpu-mem-intrinsic-expand-size=3 %s -o - | FileCheck -check-prefix=LOOP %s
; RUN: llc -global-isel -march=amdgcn -verify-machineinstrs -amdgpu-mem-intrinsic-expand-size=5 %s -o - | FileCheck -check-prefix=UNROLL %s
; RUN: llc -global-isel -mtriple=amdgcn-- -verify-machineinstrs -mem-intrinsic-expand-size=3 %s -o - | FileCheck -check-prefix=LOOP %s
; RUN: llc -global-isel -mtriple=amdgcn-- -verify-machineinstrs -mem-intrinsic-expand-size=5 %s -o - | FileCheck -check-prefix=UNROLL %s

declare void @llvm.memmove.p1.p1.i32(ptr addrspace(1), ptr addrspace(1), i32, i1)

@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -march=amdgcn -verify-machineinstrs -amdgpu-mem-intrinsic-expand-size=3 %s -o - | FileCheck -check-prefix=LOOP %s
; RUN: llc -global-isel -march=amdgcn -verify-machineinstrs -amdgpu-mem-intrinsic-expand-size=5 %s -o - | FileCheck -check-prefix=UNROLL %s
; RUN: llc -global-isel -mtriple=amdgcn-- -verify-machineinstrs -mem-intrinsic-expand-size=3 %s -o - | FileCheck -check-prefix=LOOP %s
; RUN: llc -global-isel -mtriple=amdgcn-- -verify-machineinstrs -mem-intrinsic-expand-size=5 %s -o - | FileCheck -check-prefix=UNROLL %s

declare void @llvm.memset.p1.i32(ptr addrspace(1), i8, i32, i1)
@@ -34,7 +34,6 @@
; GCN-O0-NEXT: Lower ctors and dtors for AMDGPU
; GCN-O0-NEXT: FunctionPass Manager
; GCN-O0-NEXT: Early propagate attributes from kernels to functions
; GCN-O0-NEXT: AMDGPU Lower Intrinsics
; GCN-O0-NEXT: AMDGPU Inline All Functions
; GCN-O0-NEXT: Inliner for always_inline functions
; GCN-O0-NEXT: FunctionPass Manager
@@ -182,7 +181,6 @@
; GCN-O1-NEXT: Lower ctors and dtors for AMDGPU
; GCN-O1-NEXT: FunctionPass Manager
; GCN-O1-NEXT: Early propagate attributes from kernels to functions
; GCN-O1-NEXT: AMDGPU Lower Intrinsics
; GCN-O1-NEXT: AMDGPU Inline All Functions
; GCN-O1-NEXT: Inliner for always_inline functions
; GCN-O1-NEXT: FunctionPass Manager
@@ -458,7 +456,6 @@
; GCN-O1-OPTS-NEXT: Lower ctors and dtors for AMDGPU
; GCN-O1-OPTS-NEXT: FunctionPass Manager
; GCN-O1-OPTS-NEXT: Early propagate attributes from kernels to functions
; GCN-O1-OPTS-NEXT: AMDGPU Lower Intrinsics
; GCN-O1-OPTS-NEXT: AMDGPU Inline All Functions
; GCN-O1-OPTS-NEXT: Inliner for always_inline functions
; GCN-O1-OPTS-NEXT: FunctionPass Manager
@@ -766,7 +763,6 @@
; GCN-O2-NEXT: Lower ctors and dtors for AMDGPU
; GCN-O2-NEXT: FunctionPass Manager
; GCN-O2-NEXT: Early propagate attributes from kernels to functions
; GCN-O2-NEXT: AMDGPU Lower Intrinsics
; GCN-O2-NEXT: AMDGPU Inline All Functions
; GCN-O2-NEXT: Inliner for always_inline functions
; GCN-O2-NEXT: FunctionPass Manager
@@ -1077,7 +1073,6 @@
; GCN-O3-NEXT: Lower ctors and dtors for AMDGPU
; GCN-O3-NEXT: FunctionPass Manager
; GCN-O3-NEXT: Early propagate attributes from kernels to functions
; GCN-O3-NEXT: AMDGPU Lower Intrinsics
; GCN-O3-NEXT: AMDGPU Inline All Functions
; GCN-O3-NEXT: Inliner for always_inline functions
; GCN-O3-NEXT: FunctionPass Manager
@@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -amdgpu-lower-intrinsics -amdgpu-mem-intrinsic-expand-size=8 %s | FileCheck -check-prefix=OPT8 %s
; RUN: opt -S -amdgpu-lower-intrinsics -amdgpu-mem-intrinsic-expand-size=4 %s | FileCheck -check-prefix=OPT4 %s
; RUN: opt -S -amdgpu-lower-intrinsics -amdgpu-mem-intrinsic-expand-size=0 %s | FileCheck -check-prefix=OPT0 %s
; RUN: opt -S -amdgpu-lower-intrinsics -amdgpu-mem-intrinsic-expand-size=-1 %s | FileCheck -check-prefix=OPT_NEG %s
; RUN: opt -S -mtriple=amdgcn-- -pre-isel-intrinsic-lowering -mem-intrinsic-expand-size=8 %s | FileCheck -check-prefix=OPT8 %s
; RUN: opt -S -mtriple=amdgcn-- -pre-isel-intrinsic-lowering -mem-intrinsic-expand-size=4 %s | FileCheck -check-prefix=OPT4 %s
; RUN: opt -S -mtriple=amdgcn-- -pre-isel-intrinsic-lowering -mem-intrinsic-expand-size=0 %s | FileCheck -check-prefix=OPT0 %s
; RUN: opt -S -mtriple=amdgcn-- -pre-isel-intrinsic-lowering -mem-intrinsic-expand-size=-1 %s | FileCheck -check-prefix=OPT_NEG %s

; Test the -amdgpu-mem-intrinsic-expand-size flag works.
; Test the -mem-intrinsic-expand-size flag works.

; Make sure we can always eliminate the intrinsic, even at 0.
define amdgpu_kernel void @memset_size_0(ptr addrspace(1) %dst, i8 %val) {
@@ -17,19 +17,19 @@ define amdgpu_kernel void @memset_size_0(ptr addrspace(1) %dst, i8 %val) {
; OPT4-NEXT: ret void
;
; OPT0-LABEL: @memset_size_0(
; OPT0-NEXT: call void @llvm.memset.p1.i64(ptr addrspace(1) [[DST:%.*]], i8 [[VAL:%.*]], i64 0, i1 false)
; OPT0-NEXT: br i1 true, label [[SPLIT:%.*]], label [[LOADSTORELOOP:%.*]]
; OPT0: loadstoreloop:
; OPT0-NEXT: [[TMP1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], [[LOADSTORELOOP]] ]
; OPT0-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[TMP1]]
; OPT0-NEXT: store i8 [[VAL:%.*]], ptr addrspace(1) [[TMP2]], align 1
; OPT0-NEXT: [[TMP3]] = add i64 [[TMP1]], 1
; OPT0-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 0
; OPT0-NEXT: br i1 [[TMP4]], label [[LOADSTORELOOP]], label [[SPLIT]]
; OPT0: split:
; OPT0-NEXT: ret void
;
; OPT_NEG-LABEL: @memset_size_0(
; OPT_NEG-NEXT: br i1 true, label [[SPLIT:%.*]], label [[LOADSTORELOOP:%.*]]
; OPT_NEG: loadstoreloop:
; OPT_NEG-NEXT: [[TMP1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], [[LOADSTORELOOP]] ]
; OPT_NEG-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[TMP1]]
; OPT_NEG-NEXT: store i8 [[VAL:%.*]], ptr addrspace(1) [[TMP2]], align 1
; OPT_NEG-NEXT: [[TMP3]] = add i64 [[TMP1]], 1
; OPT_NEG-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 0
; OPT_NEG-NEXT: br i1 [[TMP4]], label [[LOADSTORELOOP]], label [[SPLIT]]
; OPT_NEG: split:
; OPT_NEG-NEXT: call void @llvm.memset.p1.i64(ptr addrspace(1) [[DST:%.*]], i8 [[VAL:%.*]], i64 0, i1 false)
; OPT_NEG-NEXT: ret void
;
call void @llvm.memset.p1.i64(ptr addrspace(1) %dst, i8 %val, i64 0, i1 false)
@@ -58,15 +58,7 @@ define amdgpu_kernel void @memset_size_4(ptr addrspace(1) %dst, i8 %val) {
; OPT0-NEXT: ret void
;
; OPT_NEG-LABEL: @memset_size_4(
; OPT_NEG-NEXT: br i1 false, label [[SPLIT:%.*]], label [[LOADSTORELOOP:%.*]]
; OPT_NEG: loadstoreloop:
; OPT_NEG-NEXT: [[TMP1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], [[LOADSTORELOOP]] ]
; OPT_NEG-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[TMP1]]
; OPT_NEG-NEXT: store i8 [[VAL:%.*]], ptr addrspace(1) [[TMP2]], align 1
; OPT_NEG-NEXT: [[TMP3]] = add i64 [[TMP1]], 1
; OPT_NEG-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 4
; OPT_NEG-NEXT: br i1 [[TMP4]], label [[LOADSTORELOOP]], label [[SPLIT]]
; OPT_NEG: split:
; OPT_NEG-NEXT: call void @llvm.memset.p1.i64(ptr addrspace(1) [[DST:%.*]], i8 [[VAL:%.*]], i64 4, i1 false)
; OPT_NEG-NEXT: ret void
;
call void @llvm.memset.p1.i64(ptr addrspace(1) %dst, i8 %val, i64 4, i1 false)
@@ -103,15 +95,7 @@ define amdgpu_kernel void @memset_size_8(ptr addrspace(1) %dst, i8 %val) {
; OPT0-NEXT: ret void
;
; OPT_NEG-LABEL: @memset_size_8(
; OPT_NEG-NEXT: br i1 false, label [[SPLIT:%.*]], label [[LOADSTORELOOP:%.*]]
; OPT_NEG: loadstoreloop:
; OPT_NEG-NEXT: [[TMP1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], [[LOADSTORELOOP]] ]
; OPT_NEG-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[TMP1]]
; OPT_NEG-NEXT: store i8 [[VAL:%.*]], ptr addrspace(1) [[TMP2]], align 1
; OPT_NEG-NEXT: [[TMP3]] = add i64 [[TMP1]], 1
; OPT_NEG-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 8
; OPT_NEG-NEXT: br i1 [[TMP4]], label [[LOADSTORELOOP]], label [[SPLIT]]
; OPT_NEG: split:
; OPT_NEG-NEXT: call void @llvm.memset.p1.i64(ptr addrspace(1) [[DST:%.*]], i8 [[VAL:%.*]], i64 8, i1 false)
; OPT_NEG-NEXT: ret void
;
call void @llvm.memset.p1.i64(ptr addrspace(1) %dst, i8 %val, i64 8, i1 false)
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-lower-intrinsics -amdgpu-mem-intrinsic-expand-size=1024 %s | FileCheck -check-prefixes=OPT,MAX1024 %s
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-lower-intrinsics -amdgpu-mem-intrinsic-expand-size=-1 %s | FileCheck -check-prefixes=OPT,ALL %s
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -pre-isel-intrinsic-lowering -mem-intrinsic-expand-size=1024 %s | FileCheck -check-prefixes=OPT,MAX1024 %s
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -pre-isel-intrinsic-lowering -mem-intrinsic-expand-size=0 %s | FileCheck -check-prefixes=OPT,ALL %s

declare void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) nocapture, ptr addrspace(1) nocapture readonly, i64, i1) #1
declare void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) nocapture, ptr addrspace(3) nocapture readonly, i32, i1) #1

@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx1102 -mattr=+wavefrontsize32 -misched=gcn-max-occupancy -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK
; RUN: opt -S -passes=always-inline -o %t.bc %s
; RUN: llc -march=amdgcn -mcpu=gfx1102 -mattr=+wavefrontsize32 -misched=gcn-max-occupancy -verify-machineinstrs < %t.bc | FileCheck %s --check-prefixes=CHECK

; Provide a long sequence of 32 vec4 load/store pairs that ought to be fully
; overlapped for latency hiding. Doing so requires using (at least) 128 VGPRs,

@@ -1,5 +1,5 @@
; RUN: not llc -march=amdgcn < %s 2>&1 | FileCheck -check-prefix=ERROR %s
; RUN: not llc -march=amdgcn < %s | FileCheck -check-prefix=GCN %s
; RUN: not llc -mtriple=amdgcn-- < %s 2>&1 | FileCheck -check-prefix=ERROR %s
; RUN: not llc -mtriple=amdgcn-- < %s | FileCheck -check-prefix=GCN %s

declare void @llvm.memset.p5.i32(ptr addrspace(5) nocapture, i8, i32, i32, i1) #1
@@ -153,7 +153,6 @@ static_library("LLVMAMDGPUCodeGen") {
    "AMDGPULegalizerInfo.cpp",
    "AMDGPULibCalls.cpp",
    "AMDGPULibFunc.cpp",
    "AMDGPULowerIntrinsics.cpp",
    "AMDGPULowerKernelArguments.cpp",
    "AMDGPULowerKernelAttributes.cpp",
    "AMDGPULowerModuleLDSPass.cpp",