mirror of
https://github.com/capstone-engine/llvm-capstone.git
synced 2024-12-27 02:09:54 +00:00
GPGPU: Load GPU kernels
We embed the PTX code into the host IR as a global variable and compile it at run-time into a GPU kernel. llvm-svn: 276645
This commit is contained in:
parent
97de034e18
commit
5779359624
@ -281,7 +281,9 @@ private:
|
||||
///
|
||||
/// Free the LLVM-IR module corresponding to the kernel and -- if requested --
|
||||
/// dump its IR to stderr.
|
||||
void finalizeKernelFunction();
|
||||
///
|
||||
/// @returns The Assembly string of the kernel.
|
||||
std::string finalizeKernelFunction();
|
||||
|
||||
/// Create code that allocates memory to store arrays on device.
|
||||
void allocateDeviceArrays();
|
||||
@ -324,6 +326,19 @@ private:
|
||||
/// @param HostPtr A host pointer specifying the location to copy to.
|
||||
void createCallCopyFromDeviceToHost(Value *DevicePtr, Value *HostPtr,
|
||||
Value *Size);
|
||||
|
||||
/// Create a call to get a kernel from an assembly string.
|
||||
///
|
||||
/// @param Buffer The string describing the kernel.
|
||||
/// @param Entry The name of the kernel function to call.
|
||||
///
|
||||
/// @returns A pointer to a kernel object
|
||||
Value *createCallGetKernel(Value *Buffer, Value *Entry);
|
||||
|
||||
/// Create a call to free a GPU kernel.
|
||||
///
|
||||
/// @param GPUKernel The kernel to free.
|
||||
void createCallFreeKernel(Value *GPUKernel);
|
||||
};
|
||||
|
||||
void GPUNodeBuilder::initializeAfterRTH() {
|
||||
@ -360,6 +375,41 @@ void GPUNodeBuilder::freeDeviceArrays() {
|
||||
createCallFreeDeviceMemory(Array.second);
|
||||
}
|
||||
|
||||
/// Emit a call to the runtime function "polly_getKernel".
///
/// The callee is looked up by name in the module that owns the builder's
/// current insert block. If no function of that name exists there yet, it is
/// declared with external linkage and the signature `i8* (i8*, i8*)`.
///
/// @param Buffer The string describing the kernel.
/// @param Entry  The name of the kernel function to call.
///
/// @returns The value produced by the emitted call.
Value *GPUNodeBuilder::createCallGetKernel(Value *Buffer, Value *Entry) {
  const char *Name = "polly_getKernel";
  Module *HostModule = Builder.GetInsertBlock()->getParent()->getParent();

  Function *Callee = HostModule->getFunction(Name);
  if (!Callee) {
    // First use in this module: add the declaration.
    Type *BytePtrTy = Builder.getInt8PtrTy();
    Type *ParamTys[] = {BytePtrTy, BytePtrTy};
    FunctionType *CalleeTy = FunctionType::get(BytePtrTy, ParamTys, false);
    Callee = Function::Create(CalleeTy, Function::ExternalLinkage, Name,
                              HostModule);
  }

  return Builder.CreateCall(Callee, {Buffer, Entry});
}
|
||||
|
||||
/// Emit a call to the runtime function "polly_freeKernel".
///
/// The callee is looked up by name in the module that owns the builder's
/// current insert block. If no function of that name exists there yet, it is
/// declared with external linkage and the signature `void (i8*)`.
///
/// @param GPUKernel The kernel to free.
void GPUNodeBuilder::createCallFreeKernel(Value *GPUKernel) {
  const char *Name = "polly_freeKernel";
  Module *HostModule = Builder.GetInsertBlock()->getParent()->getParent();

  Function *Callee = HostModule->getFunction(Name);
  if (!Callee) {
    // First use in this module: add the declaration.
    Type *ParamTys[] = {Builder.getInt8PtrTy()};
    FunctionType *CalleeTy =
        FunctionType::get(Builder.getVoidTy(), ParamTys, false);
    Callee = Function::Create(CalleeTy, Function::ExternalLinkage, Name,
                              HostModule);
  }

  Builder.CreateCall(Callee, {GPUKernel});
}
|
||||
|
||||
void GPUNodeBuilder::createCallFreeDeviceMemory(Value *Array) {
|
||||
const char *Name = "polly_freeDeviceMemory";
|
||||
Module *M = Builder.GetInsertBlock()->getParent()->getParent();
|
||||
@ -755,7 +805,12 @@ void GPUNodeBuilder::createKernel(__isl_take isl_ast_node *KernelStmt) {
|
||||
S.invalidateScopArrayInfo(BasePtr, ScopArrayInfo::MK_Array);
|
||||
LocalArrays.clear();
|
||||
|
||||
finalizeKernelFunction();
|
||||
std::string ASMString = finalizeKernelFunction();
|
||||
std::string Name = "kernel_" + std::to_string(Kernel->id);
|
||||
Value *KernelString = Builder.CreateGlobalStringPtr(ASMString, Name);
|
||||
Value *NameString = Builder.CreateGlobalStringPtr(Name, Name + "_name");
|
||||
Value *GPUKernel = createCallGetKernel(KernelString, NameString);
|
||||
createCallFreeKernel(GPUKernel);
|
||||
}
|
||||
|
||||
/// Compute the DataLayout string for the NVPTX backend.
|
||||
@ -943,7 +998,7 @@ std::string GPUNodeBuilder::createKernelASM() {
|
||||
return ASMStream.str();
|
||||
}
|
||||
|
||||
void GPUNodeBuilder::finalizeKernelFunction() {
|
||||
std::string GPUNodeBuilder::finalizeKernelFunction() {
|
||||
// Verify module.
|
||||
llvm::legacy::PassManager Passes;
|
||||
Passes.add(createVerifierPass());
|
||||
@ -967,6 +1022,8 @@ void GPUNodeBuilder::finalizeKernelFunction() {
|
||||
|
||||
GPUModule.release();
|
||||
KernelIDs.clear();
|
||||
|
||||
return Assembly;
|
||||
}
|
||||
|
||||
namespace {
|
||||
|
@ -96,6 +96,8 @@
|
||||
; IR-NEXT: %p_dev_array_MemRef_A = call i8* @polly_allocateMemoryForDevice(i64 4194304)
|
||||
; IR-NEXT: [[HostPtr:%.*]] = bitcast [1024 x float]* %A to i8*
|
||||
; IR-NEXT: call void @polly_copyFromHostToDevice(i8* [[HostPtr]], i8* %p_dev_array_MemRef_A, i64 4194304)
|
||||
; IR-NEXT: call i8* @polly_getKernel
|
||||
; IR-NEXT: call void @polly_freeKernel
|
||||
; IR-NEXT: [[HostPtr2:%.*]] = bitcast [1024 x float]* %A to i8*
|
||||
; IR-NEXT: call void @polly_copyFromDeviceToHost(i8* %p_dev_array_MemRef_A, i8* [[HostPtr2]], i64 4194304)
|
||||
; IR-NEXT: call void @polly_freeDeviceMemory(i8* %p_dev_array_MemRef_A)
|
||||
|
@ -30,6 +30,8 @@
|
||||
|
||||
; IR-LABEL: polly.loop_header: ; preds = %polly.loop_header, %polly.loop_preheader
|
||||
; IR-NEXT: %polly.indvar = phi i64 [ 0, %polly.loop_preheader ], [ %polly.indvar_next, %polly.loop_header ]
|
||||
; IR-NEXT: call i8* @polly_getKernel
|
||||
; IR-NEXT: call void @polly_freeKernel
|
||||
; IR-NEXT: %polly.indvar_next = add nsw i64 %polly.indvar, 1
|
||||
; IR-NEXT: %polly.loop_cond = icmp sle i64 %polly.indvar, 98
|
||||
; IR-NEXT: br i1 %polly.loop_cond, label %polly.loop_header, label %polly.loop_exit
|
||||
|
@ -17,6 +17,7 @@
|
||||
#include <dlfcn.h>
|
||||
#include <stdarg.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
|
||||
static int DebugMode;
|
||||
|
||||
@ -36,12 +37,9 @@ struct PollyGPUContextT {
|
||||
CUcontext Cuda;
|
||||
};
|
||||
|
||||
struct PollyGPUModuleT {
|
||||
CUmodule Cuda;
|
||||
};
|
||||
|
||||
/* A GPU kernel handle: the CUDA function (Cuda) together with the CUDA module
   (CudaModule) it is looked up in. polly_getKernel fills in both fields and
   polly_freeKernel unloads CudaModule before freeing the struct. */
struct PollyGPUFunctionT {
|
||||
CUfunction Cuda;
|
||||
CUmodule CudaModule;
|
||||
};
|
||||
|
||||
struct PollyGPUDevicePtrT {
|
||||
@ -101,6 +99,10 @@ typedef CUresult CUDAAPI CuModuleLoadDataExFcnTy(CUmodule *, const void *,
|
||||
void **);
|
||||
static CuModuleLoadDataExFcnTy *CuModuleLoadDataExFcnPtr;
|
||||
|
||||
typedef CUresult CUDAAPI CuModuleLoadDataFcnTy(CUmodule *module,
|
||||
const void *image);
|
||||
static CuModuleLoadDataFcnTy *CuModuleLoadDataFcnPtr;
|
||||
|
||||
typedef CUresult CUDAAPI CuModuleGetFunctionFcnTy(CUfunction *, CUmodule,
|
||||
const char *);
|
||||
static CuModuleGetFunctionFcnTy *CuModuleGetFunctionFcnPtr;
|
||||
@ -111,6 +113,27 @@ static CuDeviceComputeCapabilityFcnTy *CuDeviceComputeCapabilityFcnPtr;
|
||||
typedef CUresult CUDAAPI CuDeviceGetNameFcnTy(char *, int, CUdevice);
|
||||
static CuDeviceGetNameFcnTy *CuDeviceGetNameFcnPtr;
|
||||
|
||||
typedef CUresult CUDAAPI CuLinkAddDataFcnTy(CUlinkState state,
|
||||
CUjitInputType type, void *data,
|
||||
size_t size, const char *name,
|
||||
unsigned int numOptions,
|
||||
CUjit_option *options,
|
||||
void **optionValues);
|
||||
static CuLinkAddDataFcnTy *CuLinkAddDataFcnPtr;
|
||||
|
||||
typedef CUresult CUDAAPI CuLinkCreateFcnTy(unsigned int numOptions,
|
||||
CUjit_option *options,
|
||||
void **optionValues,
|
||||
CUlinkState *stateOut);
|
||||
static CuLinkCreateFcnTy *CuLinkCreateFcnPtr;
|
||||
|
||||
typedef CUresult CUDAAPI CuLinkCompleteFcnTy(CUlinkState state, void **cubinOut,
|
||||
size_t *sizeOut);
|
||||
static CuLinkCompleteFcnTy *CuLinkCompleteFcnPtr;
|
||||
|
||||
typedef CUresult CUDAAPI CuLinkDestroyFcnTy(CUlinkState state);
|
||||
static CuLinkDestroyFcnTy *CuLinkDestroyFcnPtr;
|
||||
|
||||
/* Type-defines of function pointers to CUDA runtime APIs. */
|
||||
typedef cudaError_t CUDARTAPI CudaThreadSynchronizeFcnTy(void);
|
||||
static CudaThreadSynchronizeFcnTy *CudaThreadSynchronizeFcnPtr;
|
||||
@ -198,6 +221,9 @@ static int initialDeviceAPIs() {
|
||||
CuModuleLoadDataExFcnPtr =
|
||||
(CuModuleLoadDataExFcnTy *)getAPIHandle(HandleCuda, "cuModuleLoadDataEx");
|
||||
|
||||
CuModuleLoadDataFcnPtr =
|
||||
(CuModuleLoadDataFcnTy *)getAPIHandle(HandleCuda, "cuModuleLoadData");
|
||||
|
||||
CuModuleGetFunctionFcnPtr = (CuModuleGetFunctionFcnTy *)getAPIHandle(
|
||||
HandleCuda, "cuModuleGetFunction");
|
||||
|
||||
@ -208,6 +234,18 @@ static int initialDeviceAPIs() {
|
||||
CuDeviceGetNameFcnPtr =
|
||||
(CuDeviceGetNameFcnTy *)getAPIHandle(HandleCuda, "cuDeviceGetName");
|
||||
|
||||
CuLinkAddDataFcnPtr =
|
||||
(CuLinkAddDataFcnTy *)getAPIHandle(HandleCuda, "cuLinkAddData");
|
||||
|
||||
CuLinkCreateFcnPtr =
|
||||
(CuLinkCreateFcnTy *)getAPIHandle(HandleCuda, "cuLinkCreate");
|
||||
|
||||
CuLinkCompleteFcnPtr =
|
||||
(CuLinkCompleteFcnTy *)getAPIHandle(HandleCuda, "cuLinkComplete");
|
||||
|
||||
CuLinkDestroyFcnPtr =
|
||||
(CuLinkDestroyFcnTy *)getAPIHandle(HandleCuda, "cuLinkDestroy");
|
||||
|
||||
/* Get function pointer to CUDA Runtime APIs. */
|
||||
CudaThreadSynchronizeFcnPtr = (CudaThreadSynchronizeFcnTy *)getAPIHandle(
|
||||
HandleCudaRT, "cudaThreadSynchronize");
|
||||
@ -262,38 +300,93 @@ PollyGPUContext *polly_initContext() {
|
||||
return Context;
|
||||
}
|
||||
|
||||
void polly_getPTXModule(void *PTXBuffer, PollyGPUModule **Module) {
|
||||
PollyGPUFunction *polly_getKernel(const char *PTXBuffer,
|
||||
const char *KernelName) {
|
||||
dump_function();
|
||||
|
||||
*Module = malloc(sizeof(PollyGPUModule));
|
||||
if (*Module == 0) {
|
||||
fprintf(stdout, "Allocate memory for Polly GPU module failed.\n");
|
||||
PollyGPUFunction *Function = malloc(sizeof(PollyGPUFunction));
|
||||
|
||||
if (Function == 0) {
|
||||
fprintf(stdout, "Allocate memory for Polly GPU function failed.\n");
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
if (CuModuleLoadDataExFcnPtr(&((*Module)->Cuda), PTXBuffer, 0, 0, 0) !=
|
||||
CUDA_SUCCESS) {
|
||||
CUresult Res;
|
||||
CUlinkState LState;
|
||||
CUjit_option Options[6];
|
||||
void *OptionVals[6];
|
||||
float Walltime = 0;
|
||||
unsigned long LogSize = 8192;
|
||||
char ErrorLog[8192], InfoLog[8192];
|
||||
void *CuOut;
|
||||
size_t OutSize;
|
||||
|
||||
// Setup linker options
|
||||
// Return walltime from JIT compilation
|
||||
Options[0] = CU_JIT_WALL_TIME;
|
||||
OptionVals[0] = (void *)&Walltime;
|
||||
// Pass a buffer for info messages
|
||||
Options[1] = CU_JIT_INFO_LOG_BUFFER;
|
||||
OptionVals[1] = (void *)InfoLog;
|
||||
// Pass the size of the info buffer
|
||||
Options[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
|
||||
OptionVals[2] = (void *)LogSize;
|
||||
// Pass a buffer for error message
|
||||
Options[3] = CU_JIT_ERROR_LOG_BUFFER;
|
||||
OptionVals[3] = (void *)ErrorLog;
|
||||
// Pass the size of the error buffer
|
||||
Options[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
|
||||
OptionVals[4] = (void *)LogSize;
|
||||
// Make the linker verbose
|
||||
Options[5] = CU_JIT_LOG_VERBOSE;
|
||||
OptionVals[5] = (void *)1;
|
||||
|
||||
memset(ErrorLog, 0, sizeof(ErrorLog));
|
||||
|
||||
CuLinkCreateFcnPtr(6, Options, OptionVals, &LState);
|
||||
Res = CuLinkAddDataFcnPtr(LState, CU_JIT_INPUT_PTX, (void *)PTXBuffer,
|
||||
strlen(PTXBuffer) + 1, 0, 0, 0, 0);
|
||||
if (Res != CUDA_SUCCESS) {
|
||||
fprintf(stdout, "PTX Linker Error:\n%s\n%s", ErrorLog, InfoLog);
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
Res = CuLinkCompleteFcnPtr(LState, &CuOut, &OutSize);
|
||||
if (Res != CUDA_SUCCESS) {
|
||||
fprintf(stdout, "Complete ptx linker step failed.\n");
|
||||
fprintf(stdout, "\n%s\n", ErrorLog);
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
debug_print("CUDA Link Completed in %fms. Linker Output:\n%s\n", Walltime,
|
||||
InfoLog);
|
||||
|
||||
Res = CuModuleLoadDataFcnPtr(&(Function->CudaModule), CuOut);
|
||||
if (Res != CUDA_SUCCESS) {
|
||||
fprintf(stdout, "Loading ptx assembly text failed.\n");
|
||||
exit(-1);
|
||||
}
|
||||
}
|
||||
|
||||
void polly_getPTXKernelEntry(const char *KernelName, PollyGPUModule *Module,
|
||||
PollyGPUFunction **Kernel) {
|
||||
dump_function();
|
||||
|
||||
*Kernel = malloc(sizeof(PollyGPUFunction));
|
||||
if (*Kernel == 0) {
|
||||
fprintf(stdout, "Allocate memory for Polly GPU kernel failed.\n");
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
/* Locate the kernel entry point. */
|
||||
if (CuModuleGetFunctionFcnPtr(&((*Kernel)->Cuda), Module->Cuda, KernelName) !=
|
||||
CUDA_SUCCESS) {
|
||||
Res = CuModuleGetFunctionFcnPtr(&(Function->Cuda), Function->CudaModule,
|
||||
KernelName);
|
||||
if (Res != CUDA_SUCCESS) {
|
||||
fprintf(stdout, "Loading kernel function failed.\n");
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
CuLinkDestroyFcnPtr(LState);
|
||||
|
||||
return Function;
|
||||
}
|
||||
|
||||
/* Release a kernel obtained from polly_getKernel.
 *
 * Unloads the CUDA module held by the kernel (if any) and then frees the
 * kernel object itself. Passing a NULL kernel is a no-op.
 *
 * @param Kernel The kernel to free; may be NULL.
 */
void polly_freeKernel(PollyGPUFunction *Kernel) {
  dump_function();

  /* Check the pointer before touching any of its fields; the module test
     below dereferences Kernel. */
  if (!Kernel)
    return;

  if (Kernel->CudaModule)
    CuModuleUnloadFcnPtr(Kernel->CudaModule);

  free(Kernel);
}
|
||||
|
||||
void polly_copyFromHostToDevice(void *HostData, PollyGPUDevicePtr *DevData,
|
||||
|
@ -44,7 +44,6 @@
|
||||
* const char *Entry = "_Z8myKernelPi";
|
||||
*
|
||||
* int main() {
|
||||
* PollyGPUModule *Module;
|
||||
* PollyGPUFunction *Kernel;
|
||||
* PollyGPUContext *Context;
|
||||
* PollyGPUDevicePtr *DevArray;
|
||||
@ -58,11 +57,11 @@
|
||||
* MemSize = 256*64*sizeof(int);
|
||||
* Context = polly_initContext();
|
||||
* DevArray = polly_allocateMemoryForDevice(MemSize);
|
||||
* polly_getPTXModule(KernelString, &Module);
|
||||
* polly_getPTXKernelEntry(Entry, Module, &Kernel);
|
||||
* Kernel = polly_getKernel(KernelString, Entry);
|
||||
* polly_setKernelParameters(Kernel, BlockWidth, BlockHeight, DevData);
|
||||
* polly_launchKernel(Kernel, GridWidth, GridHeight);
|
||||
* polly_copyFromDeviceToHost(HostData, DevData, MemSize);
|
||||
* polly_freeKernel(Kernel);
|
||||
* polly_freeDeviceMemory(DevArray);
|
||||
* polly_freeContext(Context);
|
||||
* }
|
||||
@ -70,14 +69,13 @@
|
||||
*/
|
||||
|
||||
typedef struct PollyGPUContextT PollyGPUContext;
|
||||
typedef struct PollyGPUModuleT PollyGPUModule;
|
||||
typedef struct PollyGPUFunctionT PollyGPUFunction;
|
||||
typedef struct PollyGPUDevicePtrT PollyGPUDevicePtr;
|
||||
|
||||
PollyGPUContext *polly_initContext();
|
||||
void polly_getPTXModule(void *PTXBuffer, PollyGPUModule **Module);
|
||||
void polly_getPTXKernelEntry(const char *KernelName, PollyGPUModule *Module,
|
||||
PollyGPUFunction **Kernel);
|
||||
PollyGPUFunction *polly_getKernel(const char *PTXBuffer,
|
||||
const char *KernelName);
|
||||
void polly_freeKernel(PollyGPUFunction *Kernel);
|
||||
void polly_copyFromHostToDevice(void *HostData, PollyGPUDevicePtr *DevData,
|
||||
long MemSize);
|
||||
void polly_copyFromDeviceToHost(PollyGPUDevicePtr *DevData, void *HostData,
|
||||
|
Loading…
Reference in New Issue
Block a user