llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
Matt Arsenault d5a79b9727 AMDGPU: Consolidate some getGeneration checks
This is incomplete, and ideally these would all be removed, but it's
better to localize them to the subtarget first with comments about
what they're for.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@363902 91177308-0d34-0410-b5e6-96231b3b80d8
2019-06-19 23:54:58 +00:00

239 lines
8.4 KiB
C++

//===-- AMDGPULowerKernelArguments.cpp ------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass replaces accesses to kernel arguments with loads from
/// offsets from the kernarg base pointer.
//
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#define DEBUG_TYPE "amdgpu-lower-kernel-arguments"
using namespace llvm;
namespace {
class AMDGPULowerKernelArguments : public FunctionPass{
public:
static char ID;
AMDGPULowerKernelArguments() : FunctionPass(ID) {}
bool runOnFunction(Function &F) override;
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<TargetPassConfig>();
AU.setPreservesAll();
}
};
} // end anonymous namespace
bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
CallingConv::ID CC = F.getCallingConv();
if (CC != CallingConv::AMDGPU_KERNEL || F.arg_empty())
return false;
auto &TPC = getAnalysis<TargetPassConfig>();
const TargetMachine &TM = TPC.getTM<TargetMachine>();
const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
LLVMContext &Ctx = F.getParent()->getContext();
const DataLayout &DL = F.getParent()->getDataLayout();
BasicBlock &EntryBlock = *F.begin();
IRBuilder<> Builder(&*EntryBlock.begin());
const unsigned KernArgBaseAlign = 16; // FIXME: Increase if necessary
const uint64_t BaseOffset = ST.getExplicitKernelArgOffset(F);
unsigned MaxAlign;
// FIXME: Alignment is broken broken with explicit arg offset.;
const uint64_t TotalKernArgSize = ST.getKernArgSegmentSize(F, MaxAlign);
if (TotalKernArgSize == 0)
return false;
CallInst *KernArgSegment =
Builder.CreateIntrinsic(Intrinsic::amdgcn_kernarg_segment_ptr, {}, {},
nullptr, F.getName() + ".kernarg.segment");
KernArgSegment->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull);
KernArgSegment->addAttribute(AttributeList::ReturnIndex,
Attribute::getWithDereferenceableBytes(Ctx, TotalKernArgSize));
unsigned AS = KernArgSegment->getType()->getPointerAddressSpace();
uint64_t ExplicitArgOffset = 0;
for (Argument &Arg : F.args()) {
Type *ArgTy = Arg.getType();
unsigned Align = DL.getABITypeAlignment(ArgTy);
unsigned Size = DL.getTypeSizeInBits(ArgTy);
unsigned AllocSize = DL.getTypeAllocSize(ArgTy);
uint64_t EltOffset = alignTo(ExplicitArgOffset, Align) + BaseOffset;
ExplicitArgOffset = alignTo(ExplicitArgOffset, Align) + AllocSize;
if (Arg.use_empty())
continue;
if (PointerType *PT = dyn_cast<PointerType>(ArgTy)) {
// FIXME: Hack. We rely on AssertZext to be able to fold DS addressing
// modes on SI to know the high bits are 0 so pointer adds don't wrap. We
// can't represent this with range metadata because it's only allowed for
// integer types.
if ((PT->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
PT->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) &&
!ST.hasUsableDSOffset())
continue;
// FIXME: We can replace this with equivalent alias.scope/noalias
// metadata, but this appears to be a lot of work.
if (Arg.hasNoAliasAttr())
continue;
}
VectorType *VT = dyn_cast<VectorType>(ArgTy);
bool IsV3 = VT && VT->getNumElements() == 3;
bool DoShiftOpt = Size < 32 && !ArgTy->isAggregateType();
VectorType *V4Ty = nullptr;
int64_t AlignDownOffset = alignDown(EltOffset, 4);
int64_t OffsetDiff = EltOffset - AlignDownOffset;
unsigned AdjustedAlign = MinAlign(DoShiftOpt ? AlignDownOffset : EltOffset,
KernArgBaseAlign);
Value *ArgPtr;
Type *AdjustedArgTy;
if (DoShiftOpt) { // FIXME: Handle aggregate types
// Since we don't have sub-dword scalar loads, avoid doing an extload by
// loading earlier than the argument address, and extracting the relevant
// bits.
//
// Additionally widen any sub-dword load to i32 even if suitably aligned,
// so that CSE between different argument loads works easily.
ArgPtr = Builder.CreateConstInBoundsGEP1_64(
Builder.getInt8Ty(), KernArgSegment, AlignDownOffset,
Arg.getName() + ".kernarg.offset.align.down");
AdjustedArgTy = Builder.getInt32Ty();
} else {
ArgPtr = Builder.CreateConstInBoundsGEP1_64(
Builder.getInt8Ty(), KernArgSegment, EltOffset,
Arg.getName() + ".kernarg.offset");
AdjustedArgTy = ArgTy;
}
if (IsV3 && Size >= 32) {
V4Ty = VectorType::get(VT->getVectorElementType(), 4);
// Use the hack that clang uses to avoid SelectionDAG ruining v3 loads
AdjustedArgTy = V4Ty;
}
ArgPtr = Builder.CreateBitCast(ArgPtr, AdjustedArgTy->getPointerTo(AS),
ArgPtr->getName() + ".cast");
LoadInst *Load =
Builder.CreateAlignedLoad(AdjustedArgTy, ArgPtr, AdjustedAlign);
Load->setMetadata(LLVMContext::MD_invariant_load, MDNode::get(Ctx, {}));
MDBuilder MDB(Ctx);
if (isa<PointerType>(ArgTy)) {
if (Arg.hasNonNullAttr())
Load->setMetadata(LLVMContext::MD_nonnull, MDNode::get(Ctx, {}));
uint64_t DerefBytes = Arg.getDereferenceableBytes();
if (DerefBytes != 0) {
Load->setMetadata(
LLVMContext::MD_dereferenceable,
MDNode::get(Ctx,
MDB.createConstant(
ConstantInt::get(Builder.getInt64Ty(), DerefBytes))));
}
uint64_t DerefOrNullBytes = Arg.getDereferenceableOrNullBytes();
if (DerefOrNullBytes != 0) {
Load->setMetadata(
LLVMContext::MD_dereferenceable_or_null,
MDNode::get(Ctx,
MDB.createConstant(ConstantInt::get(Builder.getInt64Ty(),
DerefOrNullBytes))));
}
unsigned ParamAlign = Arg.getParamAlignment();
if (ParamAlign != 0) {
Load->setMetadata(
LLVMContext::MD_align,
MDNode::get(Ctx,
MDB.createConstant(ConstantInt::get(Builder.getInt64Ty(),
ParamAlign))));
}
}
// TODO: Convert noalias arg to !noalias
if (DoShiftOpt) {
Value *ExtractBits = OffsetDiff == 0 ?
Load : Builder.CreateLShr(Load, OffsetDiff * 8);
IntegerType *ArgIntTy = Builder.getIntNTy(Size);
Value *Trunc = Builder.CreateTrunc(ExtractBits, ArgIntTy);
Value *NewVal = Builder.CreateBitCast(Trunc, ArgTy,
Arg.getName() + ".load");
Arg.replaceAllUsesWith(NewVal);
} else if (IsV3) {
Value *Shuf = Builder.CreateShuffleVector(Load, UndefValue::get(V4Ty),
{0, 1, 2},
Arg.getName() + ".load");
Arg.replaceAllUsesWith(Shuf);
} else {
Load->setName(Arg.getName() + ".load");
Arg.replaceAllUsesWith(Load);
}
}
KernArgSegment->addAttribute(
AttributeList::ReturnIndex,
Attribute::getWithAlignment(Ctx, std::max(KernArgBaseAlign, MaxAlign)));
return true;
}
INITIALIZE_PASS_BEGIN(AMDGPULowerKernelArguments, DEBUG_TYPE,
"AMDGPU Lower Kernel Arguments", false, false)
INITIALIZE_PASS_END(AMDGPULowerKernelArguments, DEBUG_TYPE, "AMDGPU Lower Kernel Arguments",
false, false)
char AMDGPULowerKernelArguments::ID = 0;
FunctionPass *llvm::createAMDGPULowerKernelArgumentsPass() {
return new AMDGPULowerKernelArguments();
}