CodeGen: Expand memory intrinsics in PreISelIntrinsicLowering

Expand large or unknown-size memory intrinsics into loops in the
default lowering pipeline if the target doesn't have the corresponding
libfunc. Previously, AMDGPU had a custom pass that existed only to call
the expansion utilities.

With a default no-libcall option, we can remove the libfunc checks in
LoopIdiomRecognize for these, which never made any sense. This also
provides a path to lifting the immarg restriction on
llvm.memcpy.inline.

There seems to be a bug where TLI reports functions as available if
you use -march and not -mtriple.
Matt Arsenault 2023-06-07 09:03:17 -04:00
parent 2e16df352c
commit 3c848194f2
25 changed files with 283 additions and 296 deletions
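
For orientation, the decision the new default lowering makes can be condensed roughly as below. This is an illustrative sketch distilled from the pass changes in this diff, not the verbatim implementation; the helper name lowerOneMemCpy is made up for the example, and the -mem-intrinsic-expand-size override of the TTI threshold is omitted for brevity.

#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Transforms/Utils/LowerMemIntrinsics.h"
using namespace llvm;

// Illustrative helper (not part of the patch): expand one memcpy call as an
// explicit load/store loop unless it is small enough to leave for codegen or
// a real memcpy library function is available.
static bool lowerOneMemCpy(MemCpyInst *Memcpy, const TargetTransformInfo &TTI,
                           const TargetLibraryInfo &TLI) {
  auto *Len = dyn_cast<ConstantInt>(Memcpy->getLength());
  bool ShouldExpand =
      !Len || Len->getZExtValue() > TTI.getMaxMemIntrinsicInlineSizeThreshold();
  if (!ShouldExpand)
    return false; // small known size: leave the intrinsic for SelectionDAG
  if (TLI.has(LibFunc_memcpy))
    return false; // target has memcpy(); keep the call for libcall lowering
  expandMemCpyAsLoop(Memcpy, TTI); // emit an explicit load/store loop
  Memcpy->eraseFromParent();
  return true;
}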


@ -268,14 +268,12 @@ Changes to the C API
* ``LLVMConstSelect``
Changes to the FastISel infrastructure
--------------------------------------
* ...
Changes to the DAG infrastructure
---------------------------------
Changes to the CodeGen infrastructure
-------------------------------------
* ``llvm.memcpy``, ``llvm.memmove`` and ``llvm.memset`` are now
expanded into loops by default for targets which do not report the
corresponding library function as available.
Changes to the Metadata Info
---------------------------------


@ -356,6 +356,10 @@ public:
/// source/destination type and alignment and the number of bytes copied.
InstructionCost getMemcpyCost(const Instruction *I) const;
/// Returns the maximum memset / memcpy size in bytes that still makes it
/// profitable to inline the call.
uint64_t getMaxMemIntrinsicInlineSizeThreshold() const;
/// \return The estimated number of case clusters when lowering \p 'SI'.
/// \p JTSize Set a jump table size only when \p SI is suitable for a jump
/// table.
@ -1673,6 +1677,7 @@ public:
virtual unsigned adjustInliningThreshold(const CallBase *CB) = 0;
virtual int getInlinerVectorBonusPercent() const = 0;
virtual InstructionCost getMemcpyCost(const Instruction *I) = 0;
virtual uint64_t getMaxMemIntrinsicInlineSizeThreshold() const = 0;
virtual unsigned
getEstimatedNumberOfCaseClusters(const SwitchInst &SI, unsigned &JTSize,
ProfileSummaryInfo *PSI,
@ -2044,6 +2049,11 @@ public:
InstructionCost getMemcpyCost(const Instruction *I) override {
return Impl.getMemcpyCost(I);
}
uint64_t getMaxMemIntrinsicInlineSizeThreshold() const override {
return Impl.getMaxMemIntrinsicInlineSizeThreshold();
}
InstructionCost getInstructionCost(const User *U,
ArrayRef<const Value *> Operands,
TargetCostKind CostKind) override {


@ -77,6 +77,10 @@ public:
return TTI::TCC_Expensive;
}
uint64_t getMaxMemIntrinsicInlineSizeThreshold() const {
return 64;
}
// Although this default value is arbitrary, it is not random. It is assumed
// that a condition that evaluates the same way by a higher percentage than
// this is best represented as control flow. Therefore, the default value N
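The 64-byte value added in this hunk is the conservative in-tree default. A target can override the new hook in its TTI implementation to change when calls are left for codegen, as the AMDGPU, ARM and X86 hunks later in this diff do by forwarding to a subtarget query. A minimal, hypothetical sketch follows; the class name, the 1024 constant, and the elided boilerplate are illustrative only.

// Hypothetical target override; the in-tree targets in this patch return
// ST->getMaxInlineSizeThreshold() instead of a literal constant.
class MyTargetTTIImpl : public BasicTTIImplBase<MyTargetTTIImpl> {
  // ... usual target TTI constructor and getST()/getTLI() boilerplate ...
public:
  uint64_t getMaxMemIntrinsicInlineSizeThreshold() const {
    // Known sizes up to 1 KiB stay as intrinsics for inline lowering in
    // codegen; larger or unknown sizes are expanded or become libcalls.
    return 1024;
  }
};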


@ -1035,6 +1035,10 @@ InstructionCost TargetTransformInfo::getMemcpyCost(const Instruction *I) const {
return Cost;
}
uint64_t TargetTransformInfo::getMaxMemIntrinsicInlineSizeThreshold() const {
return TTIImpl->getMaxMemIntrinsicInlineSizeThreshold();
}
InstructionCost TargetTransformInfo::getArithmeticReductionCost(
unsigned Opcode, VectorType *Ty, std::optional<FastMathFlags> FMF,
TTI::TargetCostKind CostKind) const {


@ -6,14 +6,16 @@
//
//===----------------------------------------------------------------------===//
//
// This pass implements IR lowering for the llvm.load.relative and llvm.objc.*
// intrinsics.
// This pass implements IR lowering for the llvm.memcpy, llvm.memmove,
// llvm.memset, llvm.load.relative and llvm.objc.* intrinsics.
//
//===----------------------------------------------------------------------===//
#include "llvm/CodeGen/PreISelIntrinsicLowering.h"
#include "llvm/Analysis/ObjCARCInstKind.h"
#include "llvm/Analysis/ObjCARCUtil.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
@ -24,9 +26,44 @@
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Transforms/Utils/LowerMemIntrinsics.h"
using namespace llvm;
/// Threshold to leave statically sized memory intrinsic calls. Calls of known
/// size larger than this (and calls of unknown size) will be expanded by the
/// pass. Calls of a smaller known size will be left for expansion in codegen.
static cl::opt<int64_t> MemIntrinsicExpandSizeThresholdOpt(
"mem-intrinsic-expand-size",
cl::desc("Set minimum mem intrinsic size to expand in IR"), cl::init(-1),
cl::Hidden);
namespace {
struct PreISelIntrinsicLowering {
const function_ref<TargetTransformInfo &(Function &)> LookupTTI;
const function_ref<TargetLibraryInfo &(Function &)> LookupLibInfo;
/// If this is true, prefer to leave memory intrinsic calls for replacement
/// with a library call later when TargetLibraryInfo reports the corresponding
/// function as available; otherwise, large or unknown-size calls are always
/// expanded.
const bool UseMemIntrinsicLibFunc;
explicit PreISelIntrinsicLowering(
function_ref<TargetTransformInfo &(Function &)> LookupTTI_,
function_ref<TargetLibraryInfo &(Function &)> LookupLibInfo_,
bool UseMemIntrinsicLibFunc_ = true)
: LookupTTI(LookupTTI_), LookupLibInfo(LookupLibInfo_),
UseMemIntrinsicLibFunc(UseMemIntrinsicLibFunc_) {}
static bool shouldExpandMemIntrinsicWithSize(Value *Size,
const TargetTransformInfo &TTI);
bool expandMemIntrinsicUses(Function &F) const;
bool lowerIntrinsics(Module &M) const;
};
} // namespace
static bool lowerLoadRelative(Function &F) {
if (F.use_empty())
return false;
@ -133,12 +170,100 @@ static bool lowerObjCCall(Function &F, const char *NewFn,
return true;
}
static bool lowerIntrinsics(Module &M) {
// TODO: Should refine based on estimated number of accesses (e.g. does it
// require splitting based on alignment)
bool PreISelIntrinsicLowering::shouldExpandMemIntrinsicWithSize(
Value *Size, const TargetTransformInfo &TTI) {
ConstantInt *CI = dyn_cast<ConstantInt>(Size);
if (!CI)
return true;
uint64_t Threshold = MemIntrinsicExpandSizeThresholdOpt.getNumOccurrences()
? MemIntrinsicExpandSizeThresholdOpt
: TTI.getMaxMemIntrinsicInlineSizeThreshold();
uint64_t SizeVal = CI->getZExtValue();
// Treat a threshold of 0 as a special case to force expansion of all
// intrinsics, including size 0.
return SizeVal > Threshold || Threshold == 0;
}
// TODO: Handle atomic memcpy and memcpy.inline
// TODO: Pass ScalarEvolution
bool PreISelIntrinsicLowering::expandMemIntrinsicUses(Function &F) const {
Intrinsic::ID ID = F.getIntrinsicID();
bool Changed = false;
for (User *U : llvm::make_early_inc_range(F.users())) {
Instruction *Inst = cast<Instruction>(U);
switch (ID) {
case Intrinsic::memcpy: {
auto *Memcpy = cast<MemCpyInst>(Inst);
Function *ParentFunc = Memcpy->getFunction();
const TargetTransformInfo &TTI = LookupTTI(*ParentFunc);
if (shouldExpandMemIntrinsicWithSize(Memcpy->getLength(), TTI)) {
if (UseMemIntrinsicLibFunc &&
LookupLibInfo(*ParentFunc).has(LibFunc_memcpy))
break;
expandMemCpyAsLoop(Memcpy, TTI);
Changed = true;
Memcpy->eraseFromParent();
}
break;
}
case Intrinsic::memmove: {
auto *Memmove = cast<MemMoveInst>(Inst);
Function *ParentFunc = Memmove->getFunction();
const TargetTransformInfo &TTI = LookupTTI(*ParentFunc);
if (shouldExpandMemIntrinsicWithSize(Memmove->getLength(), TTI)) {
if (UseMemIntrinsicLibFunc &&
LookupLibInfo(*ParentFunc).has(LibFunc_memmove))
break;
expandMemMoveAsLoop(Memmove);
Changed = true;
Memmove->eraseFromParent();
}
break;
}
case Intrinsic::memset: {
auto *Memset = cast<MemSetInst>(Inst);
Function *ParentFunc = Memset->getFunction();
const TargetTransformInfo &TTI = LookupTTI(*ParentFunc);
if (shouldExpandMemIntrinsicWithSize(Memset->getLength(), TTI)) {
if (UseMemIntrinsicLibFunc &&
LookupLibInfo(*Memset->getFunction()).has(LibFunc_memset))
break;
expandMemSetAsLoop(Memset);
Changed = true;
Memset->eraseFromParent();
}
break;
}
default:
llvm_unreachable("unhandled intrinsic");
}
}
return Changed;
}
bool PreISelIntrinsicLowering::lowerIntrinsics(Module &M) const {
bool Changed = false;
for (Function &F : M) {
switch (F.getIntrinsicID()) {
default:
break;
case Intrinsic::memcpy:
case Intrinsic::memmove:
case Intrinsic::memset:
Changed |= expandMemIntrinsicUses(F);
break;
case Intrinsic::load_relative:
Changed |= lowerLoadRelative(F);
break;
@ -230,7 +355,23 @@ public:
PreISelIntrinsicLoweringLegacyPass() : ModulePass(ID) {}
bool runOnModule(Module &M) override { return lowerIntrinsics(M); }
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<TargetLibraryInfoWrapperPass>();
AU.addRequired<TargetTransformInfoWrapperPass>();
}
bool runOnModule(Module &M) override {
auto LookupTTI = [this](Function &F) -> TargetTransformInfo & {
return this->getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
};
auto LookupTLI = [this](Function &F) -> TargetLibraryInfo & {
return this->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
};
PreISelIntrinsicLowering Lowering(LookupTTI, LookupTLI);
return Lowering.lowerIntrinsics(M);
}
};
} // end anonymous namespace
@ -247,7 +388,18 @@ ModulePass *llvm::createPreISelIntrinsicLoweringPass() {
PreservedAnalyses PreISelIntrinsicLoweringPass::run(Module &M,
ModuleAnalysisManager &AM) {
if (!lowerIntrinsics(M))
auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
auto LookupTLI = [&FAM](Function &F) -> TargetLibraryInfo & {
return FAM.getResult<TargetLibraryAnalysis>(F);
};
auto LookupTTI = [&FAM](Function &F) -> TargetTransformInfo & {
return FAM.getResult<TargetIRAnalysis>(F);
};
PreISelIntrinsicLowering Lowering(LookupTTI, LookupTLI);
if (!Lowering.lowerIntrinsics(M))
return PreservedAnalyses::all();
else
return PreservedAnalyses::none();


@ -1088,8 +1088,8 @@ bool TargetPassConfig::addISelPasses() {
if (TM->useEmulatedTLS())
addPass(createLowerEmuTLSPass());
addPass(createPreISelIntrinsicLoweringPass());
PM->add(createTargetTransformInfoWrapperPass(TM->getTargetIRAnalysis()));
addPass(createPreISelIntrinsicLoweringPass());
addPass(createExpandLargeDivRemPass());
addPass(createExpandLargeFpConvertPass());
addIRPasses();


@ -90,10 +90,6 @@ FunctionPass *createAMDGPUAtomicOptimizerPass(ScanOptions ScanStrategy);
void initializeAMDGPUAtomicOptimizerPass(PassRegistry &);
extern char &AMDGPUAtomicOptimizerID;
ModulePass *createAMDGPULowerIntrinsicsPass();
void initializeAMDGPULowerIntrinsicsPass(PassRegistry &);
extern char &AMDGPULowerIntrinsicsID;
ModulePass *createAMDGPUCtorDtorLoweringLegacyPass();
void initializeAMDGPUCtorDtorLoweringLegacyPass(PassRegistry &);
extern char &AMDGPUCtorDtorLoweringLegacyPassID;


@ -1,144 +0,0 @@
//===-- AMDGPULowerIntrinsics.cpp -----------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Utils/LowerMemIntrinsics.h"
#define DEBUG_TYPE "amdgpu-lower-intrinsics"
using namespace llvm;
namespace {
static int MaxStaticSize;
static cl::opt<int, true> MemIntrinsicExpandSizeThresholdOpt(
"amdgpu-mem-intrinsic-expand-size",
cl::desc("Set minimum mem intrinsic size to expand in IR"),
cl::location(MaxStaticSize),
cl::init(1024),
cl::Hidden);
class AMDGPULowerIntrinsics : public ModulePass {
public:
static char ID;
AMDGPULowerIntrinsics() : ModulePass(ID) {}
bool runOnModule(Module &M) override;
bool expandMemIntrinsicUses(Function &F);
StringRef getPassName() const override {
return "AMDGPU Lower Intrinsics";
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<TargetTransformInfoWrapperPass>();
}
};
}
char AMDGPULowerIntrinsics::ID = 0;
char &llvm::AMDGPULowerIntrinsicsID = AMDGPULowerIntrinsics::ID;
INITIALIZE_PASS(AMDGPULowerIntrinsics, DEBUG_TYPE, "Lower intrinsics", false,
false)
// TODO: Should refine based on estimated number of accesses (e.g. does it
// require splitting based on alignment)
static bool shouldExpandOperationWithSize(Value *Size) {
ConstantInt *CI = dyn_cast<ConstantInt>(Size);
return !CI || (CI->getSExtValue() > MaxStaticSize);
}
bool AMDGPULowerIntrinsics::expandMemIntrinsicUses(Function &F) {
Intrinsic::ID ID = F.getIntrinsicID();
bool Changed = false;
for (User *U : llvm::make_early_inc_range(F.users())) {
Instruction *Inst = cast<Instruction>(U);
switch (ID) {
case Intrinsic::memcpy: {
auto *Memcpy = cast<MemCpyInst>(Inst);
if (shouldExpandOperationWithSize(Memcpy->getLength())) {
Function *ParentFunc = Memcpy->getParent()->getParent();
const TargetTransformInfo &TTI =
getAnalysis<TargetTransformInfoWrapperPass>().getTTI(*ParentFunc);
expandMemCpyAsLoop(Memcpy, TTI);
Changed = true;
Memcpy->eraseFromParent();
}
break;
}
case Intrinsic::memmove: {
auto *Memmove = cast<MemMoveInst>(Inst);
if (shouldExpandOperationWithSize(Memmove->getLength())) {
expandMemMoveAsLoop(Memmove);
Changed = true;
Memmove->eraseFromParent();
}
break;
}
case Intrinsic::memset: {
auto *Memset = cast<MemSetInst>(Inst);
if (shouldExpandOperationWithSize(Memset->getLength())) {
expandMemSetAsLoop(Memset);
Changed = true;
Memset->eraseFromParent();
}
break;
}
default:
break;
}
}
return Changed;
}
bool AMDGPULowerIntrinsics::runOnModule(Module &M) {
bool Changed = false;
for (Function &F : M) {
if (!F.isDeclaration())
continue;
switch (F.getIntrinsicID()) {
case Intrinsic::memcpy:
case Intrinsic::memmove:
case Intrinsic::memset:
if (expandMemIntrinsicUses(F))
Changed = true;
break;
default:
break;
}
}
return Changed;
}
ModulePass *llvm::createAMDGPULowerIntrinsicsPass() {
return new AMDGPULowerIntrinsics();
}


@ -388,7 +388,6 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeAMDGPULowerKernelArgumentsPass(*PR);
initializeAMDGPUPromoteKernelArgumentsPass(*PR);
initializeAMDGPULowerKernelAttributesPass(*PR);
initializeAMDGPULowerIntrinsicsPass(*PR);
initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR);
initializeAMDGPUPostLegalizerCombinerPass(*PR);
initializeAMDGPUPreLegalizerCombinerPass(*PR);
@ -998,8 +997,6 @@ void AMDGPUPassConfig::addIRPasses() {
// A call to propagate attributes pass in the backend in case opt was not run.
addPass(createAMDGPUPropagateAttributesEarlyPass(&TM));
addPass(createAMDGPULowerIntrinsicsPass());
// Function calls are not supported, so make sure we inline everything.
addPass(createAMDGPUAlwaysInlinePass());
addPass(createAlwaysInlinerLegacyPass());


@ -267,6 +267,10 @@ void AMDGPUTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
BaseT::getPeelingPreferences(L, SE, PP);
}
int64_t AMDGPUTTIImpl::getMaxInlineSizeThreshold() const {
return 1024;
}
const FeatureBitset GCNTTIImpl::InlineFeatureIgnoreList = {
// Codegen control options which don't matter.
AMDGPU::FeatureEnableLoadStoreOpt, AMDGPU::FeatureEnableSIScheduler,
@ -395,6 +399,10 @@ bool GCNTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}
int64_t GCNTTIImpl::getMaxInlineSizeThreshold() const {
return 1024;
}
// FIXME: Really we would like to issue multiple 128-bit loads and stores per
// iteration. Should we report a larger size and let it legalize?
//


@ -55,6 +55,8 @@ public:
void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
TTI::PeelingPreferences &PP);
int64_t getMaxInlineSizeThreshold() const;
};
class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
@ -132,6 +134,8 @@ public:
unsigned AddrSpace) const;
bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment,
unsigned AddrSpace) const;
int64_t getMaxInlineSizeThreshold() const;
Type *getMemcpyLoopLoweringType(
LLVMContext & Context, Value * Length, unsigned SrcAddrSpace,
unsigned DestAddrSpace, unsigned SrcAlign, unsigned DestAlign,


@ -67,7 +67,6 @@ add_llvm_target(AMDGPUCodeGen
AMDGPULegalizerInfo.cpp
AMDGPULibCalls.cpp
AMDGPULibFunc.cpp
AMDGPULowerIntrinsics.cpp
AMDGPULowerKernelArguments.cpp
AMDGPULowerKernelAttributes.cpp
AMDGPULowerModuleLDSPass.cpp


@ -210,6 +210,10 @@ public:
InstructionCost getMemcpyCost(const Instruction *I);
uint64_t getMaxMemIntrinsicInlineSizeThreshold() const {
return ST->getMaxInlineSizeThreshold();
}
int getNumMemOps(const IntrinsicInst *I) const;
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,


@ -273,6 +273,11 @@ public:
const Function *Callee) const;
bool areTypesABICompatible(const Function *Caller, const Function *Callee,
const ArrayRef<Type *> &Type) const;
uint64_t getMaxMemIntrinsicInlineSizeThreshold() const {
return ST->getMaxInlineSizeThreshold();
}
TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize,
bool IsZeroCmp) const;
bool prefersVectorizedAddressing() const;


@ -59,62 +59,40 @@ define amdgpu_kernel void @kernel_caller_stack() {
define amdgpu_kernel void @kernel_caller_byval() {
; MUBUF-LABEL: kernel_caller_byval:
; MUBUF: ; %bb.0:
; MUBUF: ; %bb.0: ; %loadstoreloop.preheader
; MUBUF-NEXT: s_add_u32 flat_scratch_lo, s4, s7
; MUBUF-NEXT: s_addc_u32 flat_scratch_hi, s5, 0
; MUBUF-NEXT: s_add_u32 s0, s0, s7
; MUBUF-NEXT: s_mov_b32 s5, 0
; MUBUF-NEXT: s_addc_u32 s1, s1, 0
; MUBUF-NEXT: s_movk_i32 s4, 0x80
; MUBUF-NEXT: v_mov_b32_e32 v0, 0
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:12
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:16
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:20
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:24
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:28
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:32
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:36
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:40
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:44
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:48
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:52
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:56
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:60
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:64
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:68
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:72
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:76
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:80
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:84
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:88
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:92
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:96
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:100
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:104
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:108
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:112
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:116
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:120
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:124
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:128
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:132
; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8
; MUBUF-NEXT: s_nop 0
; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:12
; MUBUF-NEXT: buffer_load_dword v2, off, s[0:3], 0 offset:16
; MUBUF-NEXT: buffer_load_dword v3, off, s[0:3], 0 offset:20
; MUBUF-NEXT: buffer_load_dword v4, off, s[0:3], 0 offset:24
; MUBUF-NEXT: buffer_load_dword v5, off, s[0:3], 0 offset:28
; MUBUF-NEXT: buffer_load_dword v6, off, s[0:3], 0 offset:32
; MUBUF-NEXT: buffer_load_dword v7, off, s[0:3], 0 offset:36
; MUBUF-NEXT: buffer_load_dword v8, off, s[0:3], 0 offset:40
; MUBUF-NEXT: buffer_load_dword v9, off, s[0:3], 0 offset:44
; MUBUF-NEXT: buffer_load_dword v10, off, s[0:3], 0 offset:48
; MUBUF-NEXT: buffer_load_dword v11, off, s[0:3], 0 offset:52
; MUBUF-NEXT: buffer_load_dword v12, off, s[0:3], 0 offset:56
; MUBUF-NEXT: buffer_load_dword v13, off, s[0:3], 0 offset:60
; MUBUF-NEXT: buffer_load_dword v14, off, s[0:3], 0 offset:64
; MUBUF-NEXT: buffer_load_dword v15, off, s[0:3], 0 offset:68
; MUBUF-NEXT: v_mov_b32_e32 v1, s5
; MUBUF-NEXT: s_movk_i32 s32, 0x1400
; MUBUF-NEXT: .LBB1_1: ; %loadstoreloop
; MUBUF-NEXT: ; =>This Inner Loop Header: Depth=1
; MUBUF-NEXT: v_add_u32_e32 v2, 4, v1
; MUBUF-NEXT: v_add_u32_e32 v1, 1, v1
; MUBUF-NEXT: v_cmp_gt_u32_e32 vcc, s4, v1
; MUBUF-NEXT: buffer_store_byte v0, v2, s[0:3], 0 offen
; MUBUF-NEXT: s_cbranch_vccnz .LBB1_1
; MUBUF-NEXT: ; %bb.2: ; %split
; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4
; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:8
; MUBUF-NEXT: buffer_load_dword v2, off, s[0:3], 0 offset:12
; MUBUF-NEXT: buffer_load_dword v3, off, s[0:3], 0 offset:16
; MUBUF-NEXT: buffer_load_dword v4, off, s[0:3], 0 offset:20
; MUBUF-NEXT: buffer_load_dword v5, off, s[0:3], 0 offset:24
; MUBUF-NEXT: buffer_load_dword v6, off, s[0:3], 0 offset:28
; MUBUF-NEXT: buffer_load_dword v7, off, s[0:3], 0 offset:32
; MUBUF-NEXT: buffer_load_dword v8, off, s[0:3], 0 offset:36
; MUBUF-NEXT: buffer_load_dword v9, off, s[0:3], 0 offset:40
; MUBUF-NEXT: buffer_load_dword v10, off, s[0:3], 0 offset:44
; MUBUF-NEXT: buffer_load_dword v11, off, s[0:3], 0 offset:48
; MUBUF-NEXT: buffer_load_dword v12, off, s[0:3], 0 offset:52
; MUBUF-NEXT: buffer_load_dword v13, off, s[0:3], 0 offset:56
; MUBUF-NEXT: buffer_load_dword v14, off, s[0:3], 0 offset:60
; MUBUF-NEXT: buffer_load_dword v15, off, s[0:3], 0 offset:64
; MUBUF-NEXT: s_getpc_b64 s[4:5]
; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_byval@rel32@lo+4
; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_byval@rel32@hi+12
@ -154,38 +132,31 @@ define amdgpu_kernel void @kernel_caller_byval() {
; MUBUF-NEXT: s_endpgm
;
; FLATSCR-LABEL: kernel_caller_byval:
; FLATSCR: ; %bb.0:
; FLATSCR: ; %bb.0: ; %loadstoreloop.preheader
; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s3
; FLATSCR-NEXT: v_mov_b32_e32 v0, 0
; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
; FLATSCR-NEXT: v_mov_b32_e32 v1, 0
; FLATSCR-NEXT: s_mov_b32 s0, 0
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:8
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:24
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:32
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:40
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:48
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:56
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:64
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:72
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:80
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:88
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:96
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:104
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:112
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:120
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:128
; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s0 offset:8
; FLATSCR-NEXT: s_nop 0
; FLATSCR-NEXT: scratch_load_dwordx2 v[2:3], off, s0 offset:16
; FLATSCR-NEXT: scratch_load_dwordx2 v[4:5], off, s0 offset:24
; FLATSCR-NEXT: scratch_load_dwordx2 v[6:7], off, s0 offset:32
; FLATSCR-NEXT: scratch_load_dwordx2 v[8:9], off, s0 offset:40
; FLATSCR-NEXT: scratch_load_dwordx2 v[10:11], off, s0 offset:48
; FLATSCR-NEXT: scratch_load_dwordx2 v[12:13], off, s0 offset:56
; FLATSCR-NEXT: scratch_load_dwordx2 v[14:15], off, s0 offset:64
; FLATSCR-NEXT: s_mov_b32 s1, 0
; FLATSCR-NEXT: s_movk_i32 s0, 0x80
; FLATSCR-NEXT: v_mov_b32_e32 v0, 0
; FLATSCR-NEXT: v_mov_b32_e32 v1, s1
; FLATSCR-NEXT: s_movk_i32 s32, 0x50
; FLATSCR-NEXT: .LBB1_1: ; %loadstoreloop
; FLATSCR-NEXT: ; =>This Inner Loop Header: Depth=1
; FLATSCR-NEXT: v_add_u32_e32 v2, 4, v1
; FLATSCR-NEXT: v_add_u32_e32 v1, 1, v1
; FLATSCR-NEXT: v_cmp_gt_u32_e32 vcc, s0, v1
; FLATSCR-NEXT: scratch_store_byte v2, v0, off
; FLATSCR-NEXT: s_cbranch_vccnz .LBB1_1
; FLATSCR-NEXT: ; %bb.2: ; %split
; FLATSCR-NEXT: s_mov_b32 s0, 0
; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s0 offset:4
; FLATSCR-NEXT: scratch_load_dwordx2 v[2:3], off, s0 offset:12
; FLATSCR-NEXT: scratch_load_dwordx2 v[4:5], off, s0 offset:20
; FLATSCR-NEXT: scratch_load_dwordx2 v[6:7], off, s0 offset:28
; FLATSCR-NEXT: scratch_load_dwordx2 v[8:9], off, s0 offset:36
; FLATSCR-NEXT: scratch_load_dwordx2 v[10:11], off, s0 offset:44
; FLATSCR-NEXT: scratch_load_dwordx2 v[12:13], off, s0 offset:52
; FLATSCR-NEXT: scratch_load_dwordx2 v[14:15], off, s0 offset:60
; FLATSCR-NEXT: s_getpc_b64 s[0:1]
; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_byval@rel32@lo+4
; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_byval@rel32@hi+12


@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -march=amdgcn -verify-machineinstrs -amdgpu-mem-intrinsic-expand-size=3 %s -o - | FileCheck -check-prefix=GCN %s
; RUN: llc -global-isel -march=amdgcn -verify-machineinstrs -amdgpu-mem-intrinsic-expand-size=5 %s -o - | FileCheck -check-prefix=GCN %s
; RUN: llc -global-isel -march=amdgcn -verify-machineinstrs -mem-intrinsic-expand-size=3 %s -o - | FileCheck -check-prefix=GCN %s
; RUN: llc -global-isel -march=amdgcn -verify-machineinstrs -mem-intrinsic-expand-size=5 %s -o - | FileCheck -check-prefix=GCN %s
declare void @llvm.memcpy.inline.p1.p1.i32(ptr addrspace(1), ptr addrspace(1), i32, i1 immarg)


@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -march=amdgcn -verify-machineinstrs -amdgpu-mem-intrinsic-expand-size=19 %s -o - | FileCheck -check-prefix=LOOP %s
; RUN: llc -global-isel -march=amdgcn -verify-machineinstrs -amdgpu-mem-intrinsic-expand-size=21 %s -o - | FileCheck -check-prefix=UNROLL %s
; RUN: llc -global-isel -mtriple=amdgcn-- -verify-machineinstrs -mem-intrinsic-expand-size=19 %s -o - | FileCheck -check-prefix=LOOP %s
; RUN: llc -global-isel -mtriple=amdgcn-- -verify-machineinstrs -mem-intrinsic-expand-size=21 %s -o - | FileCheck -check-prefix=UNROLL %s
declare void @llvm.memcpy.p1.p1.i32(ptr addrspace(1), ptr addrspace(1), i32, i1 immarg)


@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -march=amdgcn -verify-machineinstrs -amdgpu-mem-intrinsic-expand-size=3 %s -o - | FileCheck -check-prefix=LOOP %s
; RUN: llc -global-isel -march=amdgcn -verify-machineinstrs -amdgpu-mem-intrinsic-expand-size=5 %s -o - | FileCheck -check-prefix=UNROLL %s
; RUN: llc -global-isel -mtriple=amdgcn-- -verify-machineinstrs -mem-intrinsic-expand-size=3 %s -o - | FileCheck -check-prefix=LOOP %s
; RUN: llc -global-isel -mtriple=amdgcn-- -verify-machineinstrs -mem-intrinsic-expand-size=5 %s -o - | FileCheck -check-prefix=UNROLL %s
declare void @llvm.memmove.p1.p1.i32(ptr addrspace(1), ptr addrspace(1), i32, i1)


@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -march=amdgcn -verify-machineinstrs -amdgpu-mem-intrinsic-expand-size=3 %s -o - | FileCheck -check-prefix=LOOP %s
; RUN: llc -global-isel -march=amdgcn -verify-machineinstrs -amdgpu-mem-intrinsic-expand-size=5 %s -o - | FileCheck -check-prefix=UNROLL %s
; RUN: llc -global-isel -mtriple=amdgcn-- -verify-machineinstrs -mem-intrinsic-expand-size=3 %s -o - | FileCheck -check-prefix=LOOP %s
; RUN: llc -global-isel -mtriple=amdgcn-- -verify-machineinstrs -mem-intrinsic-expand-size=5 %s -o - | FileCheck -check-prefix=UNROLL %s
declare void @llvm.memset.p1.i32(ptr addrspace(1), i8, i32, i1)


@ -34,7 +34,6 @@
; GCN-O0-NEXT: Lower ctors and dtors for AMDGPU
; GCN-O0-NEXT: FunctionPass Manager
; GCN-O0-NEXT: Early propagate attributes from kernels to functions
; GCN-O0-NEXT: AMDGPU Lower Intrinsics
; GCN-O0-NEXT: AMDGPU Inline All Functions
; GCN-O0-NEXT: Inliner for always_inline functions
; GCN-O0-NEXT: FunctionPass Manager
@ -182,7 +181,6 @@
; GCN-O1-NEXT: Lower ctors and dtors for AMDGPU
; GCN-O1-NEXT: FunctionPass Manager
; GCN-O1-NEXT: Early propagate attributes from kernels to functions
; GCN-O1-NEXT: AMDGPU Lower Intrinsics
; GCN-O1-NEXT: AMDGPU Inline All Functions
; GCN-O1-NEXT: Inliner for always_inline functions
; GCN-O1-NEXT: FunctionPass Manager
@ -458,7 +456,6 @@
; GCN-O1-OPTS-NEXT: Lower ctors and dtors for AMDGPU
; GCN-O1-OPTS-NEXT: FunctionPass Manager
; GCN-O1-OPTS-NEXT: Early propagate attributes from kernels to functions
; GCN-O1-OPTS-NEXT: AMDGPU Lower Intrinsics
; GCN-O1-OPTS-NEXT: AMDGPU Inline All Functions
; GCN-O1-OPTS-NEXT: Inliner for always_inline functions
; GCN-O1-OPTS-NEXT: FunctionPass Manager
@ -766,7 +763,6 @@
; GCN-O2-NEXT: Lower ctors and dtors for AMDGPU
; GCN-O2-NEXT: FunctionPass Manager
; GCN-O2-NEXT: Early propagate attributes from kernels to functions
; GCN-O2-NEXT: AMDGPU Lower Intrinsics
; GCN-O2-NEXT: AMDGPU Inline All Functions
; GCN-O2-NEXT: Inliner for always_inline functions
; GCN-O2-NEXT: FunctionPass Manager
@ -1077,7 +1073,6 @@
; GCN-O3-NEXT: Lower ctors and dtors for AMDGPU
; GCN-O3-NEXT: FunctionPass Manager
; GCN-O3-NEXT: Early propagate attributes from kernels to functions
; GCN-O3-NEXT: AMDGPU Lower Intrinsics
; GCN-O3-NEXT: AMDGPU Inline All Functions
; GCN-O3-NEXT: Inliner for always_inline functions
; GCN-O3-NEXT: FunctionPass Manager


@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -amdgpu-lower-intrinsics -amdgpu-mem-intrinsic-expand-size=8 %s | FileCheck -check-prefix=OPT8 %s
; RUN: opt -S -amdgpu-lower-intrinsics -amdgpu-mem-intrinsic-expand-size=4 %s | FileCheck -check-prefix=OPT4 %s
; RUN: opt -S -amdgpu-lower-intrinsics -amdgpu-mem-intrinsic-expand-size=0 %s | FileCheck -check-prefix=OPT0 %s
; RUN: opt -S -amdgpu-lower-intrinsics -amdgpu-mem-intrinsic-expand-size=-1 %s | FileCheck -check-prefix=OPT_NEG %s
; RUN: opt -S -mtriple=amdgcn-- -pre-isel-intrinsic-lowering -mem-intrinsic-expand-size=8 %s | FileCheck -check-prefix=OPT8 %s
; RUN: opt -S -mtriple=amdgcn-- -pre-isel-intrinsic-lowering -mem-intrinsic-expand-size=4 %s | FileCheck -check-prefix=OPT4 %s
; RUN: opt -S -mtriple=amdgcn-- -pre-isel-intrinsic-lowering -mem-intrinsic-expand-size=0 %s | FileCheck -check-prefix=OPT0 %s
; RUN: opt -S -mtriple=amdgcn-- -pre-isel-intrinsic-lowering -mem-intrinsic-expand-size=-1 %s | FileCheck -check-prefix=OPT_NEG %s
; Test the -amdgpu-mem-intrinsic-expand-size flag works.
; Test the -mem-intrinsic-expand-size flag works.
; Make sure we can always eliminate the intrinsic, even at 0.
define amdgpu_kernel void @memset_size_0(ptr addrspace(1) %dst, i8 %val) {
@ -17,19 +17,19 @@ define amdgpu_kernel void @memset_size_0(ptr addrspace(1) %dst, i8 %val) {
; OPT4-NEXT: ret void
;
; OPT0-LABEL: @memset_size_0(
; OPT0-NEXT: call void @llvm.memset.p1.i64(ptr addrspace(1) [[DST:%.*]], i8 [[VAL:%.*]], i64 0, i1 false)
; OPT0-NEXT: br i1 true, label [[SPLIT:%.*]], label [[LOADSTORELOOP:%.*]]
; OPT0: loadstoreloop:
; OPT0-NEXT: [[TMP1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], [[LOADSTORELOOP]] ]
; OPT0-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[TMP1]]
; OPT0-NEXT: store i8 [[VAL:%.*]], ptr addrspace(1) [[TMP2]], align 1
; OPT0-NEXT: [[TMP3]] = add i64 [[TMP1]], 1
; OPT0-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 0
; OPT0-NEXT: br i1 [[TMP4]], label [[LOADSTORELOOP]], label [[SPLIT]]
; OPT0: split:
; OPT0-NEXT: ret void
;
; OPT_NEG-LABEL: @memset_size_0(
; OPT_NEG-NEXT: br i1 true, label [[SPLIT:%.*]], label [[LOADSTORELOOP:%.*]]
; OPT_NEG: loadstoreloop:
; OPT_NEG-NEXT: [[TMP1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], [[LOADSTORELOOP]] ]
; OPT_NEG-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[TMP1]]
; OPT_NEG-NEXT: store i8 [[VAL:%.*]], ptr addrspace(1) [[TMP2]], align 1
; OPT_NEG-NEXT: [[TMP3]] = add i64 [[TMP1]], 1
; OPT_NEG-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 0
; OPT_NEG-NEXT: br i1 [[TMP4]], label [[LOADSTORELOOP]], label [[SPLIT]]
; OPT_NEG: split:
; OPT_NEG-NEXT: call void @llvm.memset.p1.i64(ptr addrspace(1) [[DST:%.*]], i8 [[VAL:%.*]], i64 0, i1 false)
; OPT_NEG-NEXT: ret void
;
call void @llvm.memset.p1.i64(ptr addrspace(1) %dst, i8 %val, i64 0, i1 false)
@ -58,15 +58,7 @@ define amdgpu_kernel void @memset_size_4(ptr addrspace(1) %dst, i8 %val) {
; OPT0-NEXT: ret void
;
; OPT_NEG-LABEL: @memset_size_4(
; OPT_NEG-NEXT: br i1 false, label [[SPLIT:%.*]], label [[LOADSTORELOOP:%.*]]
; OPT_NEG: loadstoreloop:
; OPT_NEG-NEXT: [[TMP1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], [[LOADSTORELOOP]] ]
; OPT_NEG-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[TMP1]]
; OPT_NEG-NEXT: store i8 [[VAL:%.*]], ptr addrspace(1) [[TMP2]], align 1
; OPT_NEG-NEXT: [[TMP3]] = add i64 [[TMP1]], 1
; OPT_NEG-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 4
; OPT_NEG-NEXT: br i1 [[TMP4]], label [[LOADSTORELOOP]], label [[SPLIT]]
; OPT_NEG: split:
; OPT_NEG-NEXT: call void @llvm.memset.p1.i64(ptr addrspace(1) [[DST:%.*]], i8 [[VAL:%.*]], i64 4, i1 false)
; OPT_NEG-NEXT: ret void
;
call void @llvm.memset.p1.i64(ptr addrspace(1) %dst, i8 %val, i64 4, i1 false)
@ -103,15 +95,7 @@ define amdgpu_kernel void @memset_size_8(ptr addrspace(1) %dst, i8 %val) {
; OPT0-NEXT: ret void
;
; OPT_NEG-LABEL: @memset_size_8(
; OPT_NEG-NEXT: br i1 false, label [[SPLIT:%.*]], label [[LOADSTORELOOP:%.*]]
; OPT_NEG: loadstoreloop:
; OPT_NEG-NEXT: [[TMP1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], [[LOADSTORELOOP]] ]
; OPT_NEG-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[TMP1]]
; OPT_NEG-NEXT: store i8 [[VAL:%.*]], ptr addrspace(1) [[TMP2]], align 1
; OPT_NEG-NEXT: [[TMP3]] = add i64 [[TMP1]], 1
; OPT_NEG-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 8
; OPT_NEG-NEXT: br i1 [[TMP4]], label [[LOADSTORELOOP]], label [[SPLIT]]
; OPT_NEG: split:
; OPT_NEG-NEXT: call void @llvm.memset.p1.i64(ptr addrspace(1) [[DST:%.*]], i8 [[VAL:%.*]], i64 8, i1 false)
; OPT_NEG-NEXT: ret void
;
call void @llvm.memset.p1.i64(ptr addrspace(1) %dst, i8 %val, i64 8, i1 false)


@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-lower-intrinsics -amdgpu-mem-intrinsic-expand-size=1024 %s | FileCheck -check-prefixes=OPT,MAX1024 %s
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-lower-intrinsics -amdgpu-mem-intrinsic-expand-size=-1 %s | FileCheck -check-prefixes=OPT,ALL %s
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -pre-isel-intrinsic-lowering -mem-intrinsic-expand-size=1024 %s | FileCheck -check-prefixes=OPT,MAX1024 %s
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -pre-isel-intrinsic-lowering -mem-intrinsic-expand-size=0 %s | FileCheck -check-prefixes=OPT,ALL %s
declare void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) nocapture, ptr addrspace(1) nocapture readonly, i64, i1) #1
declare void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) nocapture, ptr addrspace(3) nocapture readonly, i32, i1) #1


@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx1102 -mattr=+wavefrontsize32 -misched=gcn-max-occupancy -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK
; RUN: opt -S -passes=always-inline -o %t.bc %s
; RUN: llc -march=amdgcn -mcpu=gfx1102 -mattr=+wavefrontsize32 -misched=gcn-max-occupancy -verify-machineinstrs < %t.bc | FileCheck %s --check-prefixes=CHECK
; Provide a long sequence of 32 vec4 load/store pairs that ought to be fully
; overlapped for latency hiding. Doing so requires using (at least) 128 VGPRs,


@ -1,5 +1,5 @@
; RUN: not llc -march=amdgcn < %s 2>&1 | FileCheck -check-prefix=ERROR %s
; RUN: not llc -march=amdgcn < %s | FileCheck -check-prefix=GCN %s
; RUN: not llc -mtriple=amdgcn-- < %s 2>&1 | FileCheck -check-prefix=ERROR %s
; RUN: not llc -mtriple=amdgcn-- < %s | FileCheck -check-prefix=GCN %s
declare void @llvm.memset.p5.i32(ptr addrspace(5) nocapture, i8, i32, i32, i1) #1


@ -153,7 +153,6 @@ static_library("LLVMAMDGPUCodeGen") {
"AMDGPULegalizerInfo.cpp",
"AMDGPULibCalls.cpp",
"AMDGPULibFunc.cpp",
"AMDGPULowerIntrinsics.cpp",
"AMDGPULowerKernelArguments.cpp",
"AMDGPULowerKernelAttributes.cpp",
"AMDGPULowerModuleLDSPass.cpp",