mirror of
https://github.com/RPCS3/llvm.git
synced 2026-01-31 01:25:19 +01:00
Summary: For GL_ARB_compute_shader we need to support workgroup sizes of at least 1024. However, if we want to allow large workgroup sizes, we may need to use less registers, as we have to run more waves per SIMD. This patch adds an attribute to specify the maximum work group size the compiled program needs to support. It defaults, to 256, as that has no wave restrictions. Reducing the number of registers available is done similarly to how the registers were reserved for chips with the sgpr init bug. Reviewers: mareko, arsenm, tstellarAMD, nhaehnle Subscribers: FireBurn, kerberizer, llvm-commits, arsenm Differential Revision: http://reviews.llvm.org/D18340 Patch By: Bas Nieuwenhuizen git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@266337 91177308-0d34-0410-b5e6-96231b3b80d8
660 lines
20 KiB
C++
660 lines
20 KiB
C++
//===-- AMDGPUPromoteAlloca.cpp - Promote Allocas -------------------------===//
|
|
//
|
|
// The LLVM Compiler Infrastructure
|
|
//
|
|
// This file is distributed under the University of Illinois Open Source
|
|
// License. See LICENSE.TXT for details.
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
//
|
|
// This pass eliminates allocas by either converting them into vectors or
|
|
// by migrating them to local address space.
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
#include "AMDGPU.h"
|
|
#include "AMDGPUSubtarget.h"
|
|
#include "llvm/Analysis/ValueTracking.h"
|
|
#include "llvm/IR/IRBuilder.h"
|
|
#include "llvm/IR/IntrinsicInst.h"
|
|
#include "llvm/IR/MDBuilder.h"
|
|
#include "llvm/Support/Debug.h"
|
|
#include "llvm/Support/raw_ostream.h"
|
|
|
|
#define DEBUG_TYPE "amdgpu-promote-alloca"
|
|
|
|
using namespace llvm;
|
|
|
|
namespace {
|
|
|
|
// FIXME: This can create globals so should be a module pass.
|
|
class AMDGPUPromoteAlloca : public FunctionPass {
|
|
private:
|
|
const TargetMachine *TM;
|
|
Module *Mod;
|
|
MDNode *MaxWorkGroupSizeRange;
|
|
|
|
// FIXME: This should be per-kernel.
|
|
int LocalMemAvailable;
|
|
|
|
bool IsAMDGCN;
|
|
bool IsAMDHSA;
|
|
|
|
std::pair<Value *, Value *> getLocalSizeYZ(IRBuilder<> &Builder);
|
|
Value *getWorkitemID(IRBuilder<> &Builder, unsigned N);
|
|
|
|
public:
|
|
static char ID;
|
|
|
|
AMDGPUPromoteAlloca(const TargetMachine *TM_ = nullptr) :
|
|
FunctionPass(ID),
|
|
TM(TM_),
|
|
Mod(nullptr),
|
|
MaxWorkGroupSizeRange(nullptr),
|
|
LocalMemAvailable(0),
|
|
IsAMDGCN(false),
|
|
IsAMDHSA(false) { }
|
|
|
|
bool doInitialization(Module &M) override;
|
|
bool runOnFunction(Function &F) override;
|
|
|
|
const char *getPassName() const override {
|
|
return "AMDGPU Promote Alloca";
|
|
}
|
|
|
|
void handleAlloca(AllocaInst &I);
|
|
};
|
|
|
|
} // End anonymous namespace
|
|
|
|
char AMDGPUPromoteAlloca::ID = 0;
|
|
|
|
INITIALIZE_TM_PASS(AMDGPUPromoteAlloca, DEBUG_TYPE,
|
|
"AMDGPU promote alloca to vector or LDS", false, false)
|
|
|
|
char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID;
|
|
|
|
|
|
bool AMDGPUPromoteAlloca::doInitialization(Module &M) {
|
|
if (!TM)
|
|
return false;
|
|
|
|
Mod = &M;
|
|
|
|
// The maximum workitem id.
|
|
//
|
|
// FIXME: Should get as subtarget property. Usually runtime enforced max is
|
|
// 256.
|
|
MDBuilder MDB(Mod->getContext());
|
|
MaxWorkGroupSizeRange = MDB.createRange(APInt(32, 0), APInt(32, 2048));
|
|
|
|
const Triple &TT = TM->getTargetTriple();
|
|
|
|
IsAMDGCN = TT.getArch() == Triple::amdgcn;
|
|
IsAMDHSA = TT.getOS() == Triple::AMDHSA;
|
|
|
|
return false;
|
|
}
|
|
|
|
bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
|
|
if (!TM || F.hasFnAttribute(Attribute::OptimizeNone))
|
|
return false;
|
|
|
|
FunctionType *FTy = F.getFunctionType();
|
|
|
|
// If the function has any arguments in the local address space, then it's
|
|
// possible these arguments require the entire local memory space, so
|
|
// we cannot use local memory in the pass.
|
|
for (Type *ParamTy : FTy->params()) {
|
|
PointerType *PtrTy = dyn_cast<PointerType>(ParamTy);
|
|
if (PtrTy && PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
|
|
LocalMemAvailable = 0;
|
|
DEBUG(dbgs() << "Function has local memory argument. Promoting to "
|
|
"local memory disabled.\n");
|
|
return false;
|
|
}
|
|
}
|
|
|
|
const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F);
|
|
LocalMemAvailable = ST.getLocalMemorySize();
|
|
if (LocalMemAvailable == 0)
|
|
return false;
|
|
|
|
// Check how much local memory is being used by global objects
|
|
for (GlobalVariable &GV : Mod->globals()) {
|
|
if (GV.getType()->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
|
|
continue;
|
|
|
|
for (Use &U : GV.uses()) {
|
|
Instruction *Use = dyn_cast<Instruction>(U);
|
|
if (!Use)
|
|
continue;
|
|
|
|
if (Use->getParent()->getParent() == &F)
|
|
LocalMemAvailable -=
|
|
Mod->getDataLayout().getTypeAllocSize(GV.getValueType());
|
|
}
|
|
}
|
|
|
|
LocalMemAvailable = std::max(0, LocalMemAvailable);
|
|
DEBUG(dbgs() << LocalMemAvailable << " bytes free in local memory.\n");
|
|
|
|
BasicBlock &EntryBB = *F.begin();
|
|
for (auto I = EntryBB.begin(), E = EntryBB.end(); I != E; ) {
|
|
AllocaInst *AI = dyn_cast<AllocaInst>(I);
|
|
|
|
++I;
|
|
if (AI)
|
|
handleAlloca(*AI);
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
std::pair<Value *, Value *>
|
|
AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) {
|
|
if (!IsAMDHSA) {
|
|
Function *LocalSizeYFn
|
|
= Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_y);
|
|
Function *LocalSizeZFn
|
|
= Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_z);
|
|
|
|
CallInst *LocalSizeY = Builder.CreateCall(LocalSizeYFn, {});
|
|
CallInst *LocalSizeZ = Builder.CreateCall(LocalSizeZFn, {});
|
|
|
|
LocalSizeY->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
|
|
LocalSizeZ->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
|
|
|
|
return std::make_pair(LocalSizeY, LocalSizeZ);
|
|
}
|
|
|
|
// We must read the size out of the dispatch pointer.
|
|
assert(IsAMDGCN);
|
|
|
|
// We are indexing into this struct, and want to extract the workgroup_size_*
|
|
// fields.
|
|
//
|
|
// typedef struct hsa_kernel_dispatch_packet_s {
|
|
// uint16_t header;
|
|
// uint16_t setup;
|
|
// uint16_t workgroup_size_x ;
|
|
// uint16_t workgroup_size_y;
|
|
// uint16_t workgroup_size_z;
|
|
// uint16_t reserved0;
|
|
// uint32_t grid_size_x ;
|
|
// uint32_t grid_size_y ;
|
|
// uint32_t grid_size_z;
|
|
//
|
|
// uint32_t private_segment_size;
|
|
// uint32_t group_segment_size;
|
|
// uint64_t kernel_object;
|
|
//
|
|
// #ifdef HSA_LARGE_MODEL
|
|
// void *kernarg_address;
|
|
// #elif defined HSA_LITTLE_ENDIAN
|
|
// void *kernarg_address;
|
|
// uint32_t reserved1;
|
|
// #else
|
|
// uint32_t reserved1;
|
|
// void *kernarg_address;
|
|
// #endif
|
|
// uint64_t reserved2;
|
|
// hsa_signal_t completion_signal; // uint64_t wrapper
|
|
// } hsa_kernel_dispatch_packet_t
|
|
//
|
|
Function *DispatchPtrFn
|
|
= Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_dispatch_ptr);
|
|
|
|
CallInst *DispatchPtr = Builder.CreateCall(DispatchPtrFn, {});
|
|
DispatchPtr->addAttribute(AttributeSet::ReturnIndex, Attribute::NoAlias);
|
|
DispatchPtr->addAttribute(AttributeSet::ReturnIndex, Attribute::NonNull);
|
|
|
|
// Size of the dispatch packet struct.
|
|
DispatchPtr->addDereferenceableAttr(AttributeSet::ReturnIndex, 64);
|
|
|
|
Type *I32Ty = Type::getInt32Ty(Mod->getContext());
|
|
Value *CastDispatchPtr = Builder.CreateBitCast(
|
|
DispatchPtr, PointerType::get(I32Ty, AMDGPUAS::CONSTANT_ADDRESS));
|
|
|
|
// We could do a single 64-bit load here, but it's likely that the basic
|
|
// 32-bit and extract sequence is already present, and it is probably easier
|
|
// to CSE this. The loads should be mergable later anyway.
|
|
Value *GEPXY = Builder.CreateConstInBoundsGEP1_64(CastDispatchPtr, 1);
|
|
LoadInst *LoadXY = Builder.CreateAlignedLoad(GEPXY, 4);
|
|
|
|
Value *GEPZU = Builder.CreateConstInBoundsGEP1_64(CastDispatchPtr, 2);
|
|
LoadInst *LoadZU = Builder.CreateAlignedLoad(GEPZU, 4);
|
|
|
|
MDNode *MD = llvm::MDNode::get(Mod->getContext(), None);
|
|
LoadXY->setMetadata(LLVMContext::MD_invariant_load, MD);
|
|
LoadZU->setMetadata(LLVMContext::MD_invariant_load, MD);
|
|
LoadZU->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
|
|
|
|
// Extract y component. Upper half of LoadZU should be zero already.
|
|
Value *Y = Builder.CreateLShr(LoadXY, 16);
|
|
|
|
return std::make_pair(Y, LoadZU);
|
|
}
|
|
|
|
Value *AMDGPUPromoteAlloca::getWorkitemID(IRBuilder<> &Builder, unsigned N) {
|
|
Intrinsic::ID IntrID = Intrinsic::ID::not_intrinsic;
|
|
|
|
switch (N) {
|
|
case 0:
|
|
IntrID = IsAMDGCN ? Intrinsic::amdgcn_workitem_id_x
|
|
: Intrinsic::r600_read_tidig_x;
|
|
break;
|
|
case 1:
|
|
IntrID = IsAMDGCN ? Intrinsic::amdgcn_workitem_id_y
|
|
: Intrinsic::r600_read_tidig_y;
|
|
break;
|
|
|
|
case 2:
|
|
IntrID = IsAMDGCN ? Intrinsic::amdgcn_workitem_id_z
|
|
: Intrinsic::r600_read_tidig_z;
|
|
break;
|
|
default:
|
|
llvm_unreachable("invalid dimension");
|
|
}
|
|
|
|
Function *WorkitemIdFn = Intrinsic::getDeclaration(Mod, IntrID);
|
|
CallInst *CI = Builder.CreateCall(WorkitemIdFn);
|
|
CI->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
|
|
|
|
return CI;
|
|
}
|
|
|
|
static VectorType *arrayTypeToVecType(Type *ArrayTy) {
|
|
return VectorType::get(ArrayTy->getArrayElementType(),
|
|
ArrayTy->getArrayNumElements());
|
|
}
|
|
|
|
static Value *
|
|
calculateVectorIndex(Value *Ptr,
|
|
const std::map<GetElementPtrInst *, Value *> &GEPIdx) {
|
|
if (isa<AllocaInst>(Ptr))
|
|
return Constant::getNullValue(Type::getInt32Ty(Ptr->getContext()));
|
|
|
|
GetElementPtrInst *GEP = cast<GetElementPtrInst>(Ptr);
|
|
|
|
auto I = GEPIdx.find(GEP);
|
|
return I == GEPIdx.end() ? nullptr : I->second;
|
|
}
|
|
|
|
static Value* GEPToVectorIndex(GetElementPtrInst *GEP) {
|
|
// FIXME we only support simple cases
|
|
if (GEP->getNumOperands() != 3)
|
|
return NULL;
|
|
|
|
ConstantInt *I0 = dyn_cast<ConstantInt>(GEP->getOperand(1));
|
|
if (!I0 || !I0->isZero())
|
|
return NULL;
|
|
|
|
return GEP->getOperand(2);
|
|
}
|
|
|
|
// Not an instruction handled below to turn into a vector.
|
|
//
|
|
// TODO: Check isTriviallyVectorizable for calls and handle other
|
|
// instructions.
|
|
static bool canVectorizeInst(Instruction *Inst, User *User) {
|
|
switch (Inst->getOpcode()) {
|
|
case Instruction::Load:
|
|
case Instruction::BitCast:
|
|
case Instruction::AddrSpaceCast:
|
|
return true;
|
|
case Instruction::Store: {
|
|
// Must be the stored pointer operand, not a stored value.
|
|
StoreInst *SI = cast<StoreInst>(Inst);
|
|
return SI->getPointerOperand() == User;
|
|
}
|
|
default:
|
|
return false;
|
|
}
|
|
}
|
|
|
|
static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
|
|
ArrayType *AllocaTy = dyn_cast<ArrayType>(Alloca->getAllocatedType());
|
|
|
|
DEBUG(dbgs() << "Alloca candidate for vectorization\n");
|
|
|
|
// FIXME: There is no reason why we can't support larger arrays, we
|
|
// are just being conservative for now.
|
|
if (!AllocaTy ||
|
|
AllocaTy->getElementType()->isVectorTy() ||
|
|
AllocaTy->getNumElements() > 4) {
|
|
DEBUG(dbgs() << " Cannot convert type to vector\n");
|
|
return false;
|
|
}
|
|
|
|
std::map<GetElementPtrInst*, Value*> GEPVectorIdx;
|
|
std::vector<Value*> WorkList;
|
|
for (User *AllocaUser : Alloca->users()) {
|
|
GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(AllocaUser);
|
|
if (!GEP) {
|
|
if (!canVectorizeInst(cast<Instruction>(AllocaUser), Alloca))
|
|
return false;
|
|
|
|
WorkList.push_back(AllocaUser);
|
|
continue;
|
|
}
|
|
|
|
Value *Index = GEPToVectorIndex(GEP);
|
|
|
|
// If we can't compute a vector index from this GEP, then we can't
|
|
// promote this alloca to vector.
|
|
if (!Index) {
|
|
DEBUG(dbgs() << " Cannot compute vector index for GEP " << *GEP << '\n');
|
|
return false;
|
|
}
|
|
|
|
GEPVectorIdx[GEP] = Index;
|
|
for (User *GEPUser : AllocaUser->users()) {
|
|
if (!canVectorizeInst(cast<Instruction>(GEPUser), AllocaUser))
|
|
return false;
|
|
|
|
WorkList.push_back(GEPUser);
|
|
}
|
|
}
|
|
|
|
VectorType *VectorTy = arrayTypeToVecType(AllocaTy);
|
|
|
|
DEBUG(dbgs() << " Converting alloca to vector "
|
|
<< *AllocaTy << " -> " << *VectorTy << '\n');
|
|
|
|
for (Value *V : WorkList) {
|
|
Instruction *Inst = cast<Instruction>(V);
|
|
IRBuilder<> Builder(Inst);
|
|
switch (Inst->getOpcode()) {
|
|
case Instruction::Load: {
|
|
Value *Ptr = Inst->getOperand(0);
|
|
Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
|
|
Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0));
|
|
Value *VecValue = Builder.CreateLoad(BitCast);
|
|
Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index);
|
|
Inst->replaceAllUsesWith(ExtractElement);
|
|
Inst->eraseFromParent();
|
|
break;
|
|
}
|
|
case Instruction::Store: {
|
|
Value *Ptr = Inst->getOperand(1);
|
|
Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
|
|
Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0));
|
|
Value *VecValue = Builder.CreateLoad(BitCast);
|
|
Value *NewVecValue = Builder.CreateInsertElement(VecValue,
|
|
Inst->getOperand(0),
|
|
Index);
|
|
Builder.CreateStore(NewVecValue, BitCast);
|
|
Inst->eraseFromParent();
|
|
break;
|
|
}
|
|
case Instruction::BitCast:
|
|
case Instruction::AddrSpaceCast:
|
|
break;
|
|
|
|
default:
|
|
Inst->dump();
|
|
llvm_unreachable("Inconsistency in instructions promotable to vector");
|
|
}
|
|
}
|
|
return true;
|
|
}
|
|
|
|
static bool isCallPromotable(CallInst *CI) {
|
|
// TODO: We might be able to handle some cases where the callee is a
|
|
// constantexpr bitcast of a function.
|
|
if (!CI->getCalledFunction())
|
|
return false;
|
|
|
|
IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
|
|
if (!II)
|
|
return false;
|
|
|
|
switch (II->getIntrinsicID()) {
|
|
case Intrinsic::memcpy:
|
|
case Intrinsic::memmove:
|
|
case Intrinsic::memset:
|
|
case Intrinsic::lifetime_start:
|
|
case Intrinsic::lifetime_end:
|
|
case Intrinsic::invariant_start:
|
|
case Intrinsic::invariant_end:
|
|
case Intrinsic::invariant_group_barrier:
|
|
case Intrinsic::objectsize:
|
|
return true;
|
|
default:
|
|
return false;
|
|
}
|
|
}
|
|
|
|
static bool collectUsesWithPtrTypes(Value *Val, std::vector<Value*> &WorkList) {
|
|
for (User *User : Val->users()) {
|
|
if (std::find(WorkList.begin(), WorkList.end(), User) != WorkList.end())
|
|
continue;
|
|
|
|
if (CallInst *CI = dyn_cast<CallInst>(User)) {
|
|
if (!isCallPromotable(CI))
|
|
return false;
|
|
|
|
WorkList.push_back(User);
|
|
continue;
|
|
}
|
|
|
|
Instruction *UseInst = dyn_cast<Instruction>(User);
|
|
if (UseInst && UseInst->getOpcode() == Instruction::PtrToInt)
|
|
return false;
|
|
|
|
if (StoreInst *SI = dyn_cast_or_null<StoreInst>(UseInst)) {
|
|
if (SI->isVolatile())
|
|
return false;
|
|
|
|
// Reject if the stored value is not the pointer operand.
|
|
if (SI->getPointerOperand() != Val)
|
|
return false;
|
|
} else if (LoadInst *LI = dyn_cast_or_null<LoadInst>(UseInst)) {
|
|
if (LI->isVolatile())
|
|
return false;
|
|
} else if (AtomicRMWInst *RMW = dyn_cast_or_null<AtomicRMWInst>(UseInst)) {
|
|
if (RMW->isVolatile())
|
|
return false;
|
|
} else if (AtomicCmpXchgInst *CAS
|
|
= dyn_cast_or_null<AtomicCmpXchgInst>(UseInst)) {
|
|
if (CAS->isVolatile())
|
|
return false;
|
|
}
|
|
|
|
if (!User->getType()->isPointerTy())
|
|
continue;
|
|
|
|
if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(UseInst)) {
|
|
// Be conservative if an address could be computed outside the bounds of
|
|
// the alloca.
|
|
if (!GEP->isInBounds())
|
|
return false;
|
|
}
|
|
|
|
WorkList.push_back(User);
|
|
if (!collectUsesWithPtrTypes(User, WorkList))
|
|
return false;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
|
|
if (!I.isStaticAlloca())
|
|
return;
|
|
|
|
IRBuilder<> Builder(&I);
|
|
|
|
// First try to replace the alloca with a vector
|
|
Type *AllocaTy = I.getAllocatedType();
|
|
|
|
DEBUG(dbgs() << "Trying to promote " << I << '\n');
|
|
|
|
if (tryPromoteAllocaToVector(&I))
|
|
return;
|
|
|
|
DEBUG(dbgs() << " alloca is not a candidate for vectorization.\n");
|
|
|
|
const Function &ContainingFunction = *I.getParent()->getParent();
|
|
|
|
// FIXME: We should also try to get this value from the reqd_work_group_size
|
|
// function attribute if it is available.
|
|
unsigned WorkGroupSize = AMDGPU::getMaximumWorkGroupSize(ContainingFunction);
|
|
|
|
int AllocaSize =
|
|
WorkGroupSize * Mod->getDataLayout().getTypeAllocSize(AllocaTy);
|
|
|
|
if (AllocaSize > LocalMemAvailable) {
|
|
DEBUG(dbgs() << " Not enough local memory to promote alloca.\n");
|
|
return;
|
|
}
|
|
|
|
std::vector<Value*> WorkList;
|
|
|
|
if (!collectUsesWithPtrTypes(&I, WorkList)) {
|
|
DEBUG(dbgs() << " Do not know how to convert all uses\n");
|
|
return;
|
|
}
|
|
|
|
DEBUG(dbgs() << "Promoting alloca to local memory\n");
|
|
LocalMemAvailable -= AllocaSize;
|
|
|
|
Function *F = I.getParent()->getParent();
|
|
|
|
Type *GVTy = ArrayType::get(I.getAllocatedType(), WorkGroupSize);
|
|
GlobalVariable *GV = new GlobalVariable(
|
|
*Mod, GVTy, false, GlobalValue::InternalLinkage,
|
|
UndefValue::get(GVTy),
|
|
Twine(F->getName()) + Twine('.') + I.getName(),
|
|
nullptr,
|
|
GlobalVariable::NotThreadLocal,
|
|
AMDGPUAS::LOCAL_ADDRESS);
|
|
GV->setUnnamedAddr(true);
|
|
GV->setAlignment(I.getAlignment());
|
|
|
|
Value *TCntY, *TCntZ;
|
|
|
|
std::tie(TCntY, TCntZ) = getLocalSizeYZ(Builder);
|
|
Value *TIdX = getWorkitemID(Builder, 0);
|
|
Value *TIdY = getWorkitemID(Builder, 1);
|
|
Value *TIdZ = getWorkitemID(Builder, 2);
|
|
|
|
Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ, "", true, true);
|
|
Tmp0 = Builder.CreateMul(Tmp0, TIdX);
|
|
Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ, "", true, true);
|
|
Value *TID = Builder.CreateAdd(Tmp0, Tmp1);
|
|
TID = Builder.CreateAdd(TID, TIdZ);
|
|
|
|
Value *Indices[] = {
|
|
Constant::getNullValue(Type::getInt32Ty(Mod->getContext())),
|
|
TID
|
|
};
|
|
|
|
Value *Offset = Builder.CreateInBoundsGEP(GVTy, GV, Indices);
|
|
I.mutateType(Offset->getType());
|
|
I.replaceAllUsesWith(Offset);
|
|
I.eraseFromParent();
|
|
|
|
for (Value *V : WorkList) {
|
|
CallInst *Call = dyn_cast<CallInst>(V);
|
|
if (!Call) {
|
|
Type *EltTy = V->getType()->getPointerElementType();
|
|
PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS);
|
|
|
|
// The operand's value should be corrected on its own.
|
|
if (isa<AddrSpaceCastInst>(V))
|
|
continue;
|
|
|
|
// FIXME: It doesn't really make sense to try to do this for all
|
|
// instructions.
|
|
V->mutateType(NewTy);
|
|
continue;
|
|
}
|
|
|
|
IntrinsicInst *Intr = dyn_cast<IntrinsicInst>(Call);
|
|
if (!Intr) {
|
|
// FIXME: What is this for? It doesn't make sense to promote arbitrary
|
|
// function calls. If the call is to a defined function that can also be
|
|
// promoted, we should be able to do this once that function is also
|
|
// rewritten.
|
|
|
|
std::vector<Type*> ArgTypes;
|
|
for (unsigned ArgIdx = 0, ArgEnd = Call->getNumArgOperands();
|
|
ArgIdx != ArgEnd; ++ArgIdx) {
|
|
ArgTypes.push_back(Call->getArgOperand(ArgIdx)->getType());
|
|
}
|
|
Function *F = Call->getCalledFunction();
|
|
FunctionType *NewType = FunctionType::get(Call->getType(), ArgTypes,
|
|
F->isVarArg());
|
|
Constant *C = Mod->getOrInsertFunction((F->getName() + ".local").str(),
|
|
NewType, F->getAttributes());
|
|
Function *NewF = cast<Function>(C);
|
|
Call->setCalledFunction(NewF);
|
|
continue;
|
|
}
|
|
|
|
Builder.SetInsertPoint(Intr);
|
|
switch (Intr->getIntrinsicID()) {
|
|
case Intrinsic::lifetime_start:
|
|
case Intrinsic::lifetime_end:
|
|
// These intrinsics are for address space 0 only
|
|
Intr->eraseFromParent();
|
|
continue;
|
|
case Intrinsic::memcpy: {
|
|
MemCpyInst *MemCpy = cast<MemCpyInst>(Intr);
|
|
Builder.CreateMemCpy(MemCpy->getRawDest(), MemCpy->getRawSource(),
|
|
MemCpy->getLength(), MemCpy->getAlignment(),
|
|
MemCpy->isVolatile());
|
|
Intr->eraseFromParent();
|
|
continue;
|
|
}
|
|
case Intrinsic::memmove: {
|
|
MemMoveInst *MemMove = cast<MemMoveInst>(Intr);
|
|
Builder.CreateMemMove(MemMove->getRawDest(), MemMove->getRawSource(),
|
|
MemMove->getLength(), MemMove->getAlignment(),
|
|
MemMove->isVolatile());
|
|
Intr->eraseFromParent();
|
|
continue;
|
|
}
|
|
case Intrinsic::memset: {
|
|
MemSetInst *MemSet = cast<MemSetInst>(Intr);
|
|
Builder.CreateMemSet(MemSet->getRawDest(), MemSet->getValue(),
|
|
MemSet->getLength(), MemSet->getAlignment(),
|
|
MemSet->isVolatile());
|
|
Intr->eraseFromParent();
|
|
continue;
|
|
}
|
|
case Intrinsic::invariant_start:
|
|
case Intrinsic::invariant_end:
|
|
case Intrinsic::invariant_group_barrier:
|
|
Intr->eraseFromParent();
|
|
// FIXME: I think the invariant marker should still theoretically apply,
|
|
// but the intrinsics need to be changed to accept pointers with any
|
|
// address space.
|
|
continue;
|
|
case Intrinsic::objectsize: {
|
|
Value *Src = Intr->getOperand(0);
|
|
Type *SrcTy = Src->getType()->getPointerElementType();
|
|
Function *ObjectSize = Intrinsic::getDeclaration(Mod,
|
|
Intrinsic::objectsize,
|
|
{ Intr->getType(), PointerType::get(SrcTy, AMDGPUAS::LOCAL_ADDRESS) }
|
|
);
|
|
|
|
CallInst *NewCall
|
|
= Builder.CreateCall(ObjectSize, { Src, Intr->getOperand(1) });
|
|
Intr->replaceAllUsesWith(NewCall);
|
|
Intr->eraseFromParent();
|
|
continue;
|
|
}
|
|
default:
|
|
Intr->dump();
|
|
llvm_unreachable("Don't know how to promote alloca intrinsic use.");
|
|
}
|
|
}
|
|
}
|
|
|
|
FunctionPass *llvm::createAMDGPUPromoteAlloca(const TargetMachine *TM) {
|
|
return new AMDGPUPromoteAlloca(TM);
|
|
}
|