GPGPU: Load GPU kernels

We embed the PTX code into the host IR as a global variable and compile it
at run-time into a GPU kernel.

llvm-svn: 276645
This commit is contained in:
Tobias Grosser 2016-07-25 16:31:21 +00:00
parent 97de034e18
commit 5779359624
5 changed files with 186 additions and 34 deletions

View File

@ -281,7 +281,9 @@ private:
/// ///
/// Free the LLVM-IR module corresponding to the kernel and -- if requested -- /// Free the LLVM-IR module corresponding to the kernel and -- if requested --
/// dump its IR to stderr. /// dump its IR to stderr.
void finalizeKernelFunction(); ///
/// @returns The Assembly string of the kernel.
std::string finalizeKernelFunction();
/// Create code that allocates memory to store arrays on device. /// Create code that allocates memory to store arrays on device.
void allocateDeviceArrays(); void allocateDeviceArrays();
@ -324,6 +326,19 @@ private:
/// @param HostPtr A host pointer specifying the location to copy to. /// @param HostPtr A host pointer specifying the location to copy to.
void createCallCopyFromDeviceToHost(Value *DevicePtr, Value *HostPtr, void createCallCopyFromDeviceToHost(Value *DevicePtr, Value *HostPtr,
Value *Size); Value *Size);
/// Create a call to get a kernel from an assembly string.
///
/// @param Buffer The string describing the kernel.
/// @param Entry The name of the kernel function to call.
///
/// @returns A pointer to a kernel object
Value *createCallGetKernel(Value *Buffer, Value *Entry);
/// Create a call to free a GPU kernel.
///
/// @param GPUKernel The kernel to free.
void createCallFreeKernel(Value *GPUKernel);
}; };
void GPUNodeBuilder::initializeAfterRTH() { void GPUNodeBuilder::initializeAfterRTH() {
@ -360,6 +375,41 @@ void GPUNodeBuilder::freeDeviceArrays() {
createCallFreeDeviceMemory(Array.second); createCallFreeDeviceMemory(Array.second);
} }
/// Emit a call to the runtime function "polly_getKernel".
///
/// The callee is looked up in the module that owns the current insert block.
/// If the module has no function of that name, a declaration with external
/// linkage and the signature i8* (i8*, i8*) is created first.
///
/// @param Buffer Forwarded as the first call argument.
/// @param Entry  Forwarded as the second call argument.
///
/// @returns The value produced by the emitted call instruction.
Value *GPUNodeBuilder::createCallGetKernel(Value *Buffer, Value *Entry) {
  const char *FnName = "polly_getKernel";
  Module *HostModule = Builder.GetInsertBlock()->getParent()->getParent();

  Function *Callee = HostModule->getFunction(FnName);
  if (!Callee) {
    // Not declared in this module yet, so add the declaration now.
    Type *BytePtrTy = Builder.getInt8PtrTy();
    std::vector<Type *> ParamTys = {BytePtrTy, BytePtrTy};
    FunctionType *FnTy =
        FunctionType::get(BytePtrTy, ParamTys, /*isVarArg=*/false);
    Callee =
        Function::Create(FnTy, Function::ExternalLinkage, FnName, HostModule);
  }

  return Builder.CreateCall(Callee, {Buffer, Entry});
}
/// Emit a call to the runtime function "polly_freeKernel".
///
/// The callee is looked up in the module that owns the current insert block.
/// If the module has no function of that name, a declaration with external
/// linkage and the signature void (i8*) is created first.
///
/// @param GPUKernel Forwarded as the only call argument.
void GPUNodeBuilder::createCallFreeKernel(Value *GPUKernel) {
  const char *FnName = "polly_freeKernel";
  Module *HostModule = Builder.GetInsertBlock()->getParent()->getParent();

  Function *Callee = HostModule->getFunction(FnName);
  if (!Callee) {
    // Not declared in this module yet, so add the declaration now.
    std::vector<Type *> ParamTys = {Builder.getInt8PtrTy()};
    FunctionType *FnTy =
        FunctionType::get(Builder.getVoidTy(), ParamTys, /*isVarArg=*/false);
    Callee =
        Function::Create(FnTy, Function::ExternalLinkage, FnName, HostModule);
  }

  Builder.CreateCall(Callee, {GPUKernel});
}
void GPUNodeBuilder::createCallFreeDeviceMemory(Value *Array) { void GPUNodeBuilder::createCallFreeDeviceMemory(Value *Array) {
const char *Name = "polly_freeDeviceMemory"; const char *Name = "polly_freeDeviceMemory";
Module *M = Builder.GetInsertBlock()->getParent()->getParent(); Module *M = Builder.GetInsertBlock()->getParent()->getParent();
@ -755,7 +805,12 @@ void GPUNodeBuilder::createKernel(__isl_take isl_ast_node *KernelStmt) {
S.invalidateScopArrayInfo(BasePtr, ScopArrayInfo::MK_Array); S.invalidateScopArrayInfo(BasePtr, ScopArrayInfo::MK_Array);
LocalArrays.clear(); LocalArrays.clear();
finalizeKernelFunction(); std::string ASMString = finalizeKernelFunction();
std::string Name = "kernel_" + std::to_string(Kernel->id);
Value *KernelString = Builder.CreateGlobalStringPtr(ASMString, Name);
Value *NameString = Builder.CreateGlobalStringPtr(Name, Name + "_name");
Value *GPUKernel = createCallGetKernel(KernelString, NameString);
createCallFreeKernel(GPUKernel);
} }
/// Compute the DataLayout string for the NVPTX backend. /// Compute the DataLayout string for the NVPTX backend.
@ -943,7 +998,7 @@ std::string GPUNodeBuilder::createKernelASM() {
return ASMStream.str(); return ASMStream.str();
} }
void GPUNodeBuilder::finalizeKernelFunction() { std::string GPUNodeBuilder::finalizeKernelFunction() {
// Verify module. // Verify module.
llvm::legacy::PassManager Passes; llvm::legacy::PassManager Passes;
Passes.add(createVerifierPass()); Passes.add(createVerifierPass());
@ -967,6 +1022,8 @@ void GPUNodeBuilder::finalizeKernelFunction() {
GPUModule.release(); GPUModule.release();
KernelIDs.clear(); KernelIDs.clear();
return Assembly;
} }
namespace { namespace {

View File

@ -96,6 +96,8 @@
; IR-NEXT: %p_dev_array_MemRef_A = call i8* @polly_allocateMemoryForDevice(i64 4194304) ; IR-NEXT: %p_dev_array_MemRef_A = call i8* @polly_allocateMemoryForDevice(i64 4194304)
; IR-NEXT: [[HostPtr:%.*]] = bitcast [1024 x float]* %A to i8* ; IR-NEXT: [[HostPtr:%.*]] = bitcast [1024 x float]* %A to i8*
; IR-NEXT: call void @polly_copyFromHostToDevice(i8* [[HostPtr]], i8* %p_dev_array_MemRef_A, i64 4194304) ; IR-NEXT: call void @polly_copyFromHostToDevice(i8* [[HostPtr]], i8* %p_dev_array_MemRef_A, i64 4194304)
; IR-NEXT: call i8* @polly_getKernel
; IR-NEXT: call void @polly_freeKernel
; IR-NEXT: [[HostPtr2:%.*]] = bitcast [1024 x float]* %A to i8* ; IR-NEXT: [[HostPtr2:%.*]] = bitcast [1024 x float]* %A to i8*
; IR-NEXT: call void @polly_copyFromDeviceToHost(i8* %p_dev_array_MemRef_A, i8* [[HostPtr2]], i64 4194304) ; IR-NEXT: call void @polly_copyFromDeviceToHost(i8* %p_dev_array_MemRef_A, i8* [[HostPtr2]], i64 4194304)
; IR-NEXT: call void @polly_freeDeviceMemory(i8* %p_dev_array_MemRef_A) ; IR-NEXT: call void @polly_freeDeviceMemory(i8* %p_dev_array_MemRef_A)

View File

@ -30,6 +30,8 @@
; IR-LABEL: polly.loop_header: ; preds = %polly.loop_header, %polly.loop_preheader ; IR-LABEL: polly.loop_header: ; preds = %polly.loop_header, %polly.loop_preheader
; IR-NEXT: %polly.indvar = phi i64 [ 0, %polly.loop_preheader ], [ %polly.indvar_next, %polly.loop_header ] ; IR-NEXT: %polly.indvar = phi i64 [ 0, %polly.loop_preheader ], [ %polly.indvar_next, %polly.loop_header ]
; IR-NEXT: call i8* @polly_getKernel
; IR-NEXT: call void @polly_freeKernel
; IR-NEXT: %polly.indvar_next = add nsw i64 %polly.indvar, 1 ; IR-NEXT: %polly.indvar_next = add nsw i64 %polly.indvar, 1
; IR-NEXT: %polly.loop_cond = icmp sle i64 %polly.indvar, 98 ; IR-NEXT: %polly.loop_cond = icmp sle i64 %polly.indvar, 98
; IR-NEXT: br i1 %polly.loop_cond, label %polly.loop_header, label %polly.loop_exit ; IR-NEXT: br i1 %polly.loop_cond, label %polly.loop_header, label %polly.loop_exit

View File

@ -17,6 +17,7 @@
#include <dlfcn.h> #include <dlfcn.h>
#include <stdarg.h> #include <stdarg.h>
#include <stdio.h> #include <stdio.h>
#include <string.h>
static int DebugMode; static int DebugMode;
@ -36,12 +37,9 @@ struct PollyGPUContextT {
CUcontext Cuda; CUcontext Cuda;
}; };
struct PollyGPUModuleT {
CUmodule Cuda;
};
struct PollyGPUFunctionT { struct PollyGPUFunctionT {
CUfunction Cuda; CUfunction Cuda;
CUmodule CudaModule;
}; };
struct PollyGPUDevicePtrT { struct PollyGPUDevicePtrT {
@ -101,6 +99,10 @@ typedef CUresult CUDAAPI CuModuleLoadDataExFcnTy(CUmodule *, const void *,
void **); void **);
static CuModuleLoadDataExFcnTy *CuModuleLoadDataExFcnPtr; static CuModuleLoadDataExFcnTy *CuModuleLoadDataExFcnPtr;
typedef CUresult CUDAAPI CuModuleLoadDataFcnTy(CUmodule *module,
const void *image);
static CuModuleLoadDataFcnTy *CuModuleLoadDataFcnPtr;
typedef CUresult CUDAAPI CuModuleGetFunctionFcnTy(CUfunction *, CUmodule, typedef CUresult CUDAAPI CuModuleGetFunctionFcnTy(CUfunction *, CUmodule,
const char *); const char *);
static CuModuleGetFunctionFcnTy *CuModuleGetFunctionFcnPtr; static CuModuleGetFunctionFcnTy *CuModuleGetFunctionFcnPtr;
@ -111,6 +113,27 @@ static CuDeviceComputeCapabilityFcnTy *CuDeviceComputeCapabilityFcnPtr;
typedef CUresult CUDAAPI CuDeviceGetNameFcnTy(char *, int, CUdevice); typedef CUresult CUDAAPI CuDeviceGetNameFcnTy(char *, int, CUdevice);
static CuDeviceGetNameFcnTy *CuDeviceGetNameFcnPtr; static CuDeviceGetNameFcnTy *CuDeviceGetNameFcnPtr;
typedef CUresult CUDAAPI CuLinkAddDataFcnTy(CUlinkState state,
CUjitInputType type, void *data,
size_t size, const char *name,
unsigned int numOptions,
CUjit_option *options,
void **optionValues);
static CuLinkAddDataFcnTy *CuLinkAddDataFcnPtr;
typedef CUresult CUDAAPI CuLinkCreateFcnTy(unsigned int numOptions,
CUjit_option *options,
void **optionValues,
CUlinkState *stateOut);
static CuLinkCreateFcnTy *CuLinkCreateFcnPtr;
typedef CUresult CUDAAPI CuLinkCompleteFcnTy(CUlinkState state, void **cubinOut,
size_t *sizeOut);
static CuLinkCompleteFcnTy *CuLinkCompleteFcnPtr;
typedef CUresult CUDAAPI CuLinkDestroyFcnTy(CUlinkState state);
static CuLinkDestroyFcnTy *CuLinkDestroyFcnPtr;
/* Type-defines of function pointer to CUDA runtime APIs. */ /* Type-defines of function pointer to CUDA runtime APIs. */
typedef cudaError_t CUDARTAPI CudaThreadSynchronizeFcnTy(void); typedef cudaError_t CUDARTAPI CudaThreadSynchronizeFcnTy(void);
static CudaThreadSynchronizeFcnTy *CudaThreadSynchronizeFcnPtr; static CudaThreadSynchronizeFcnTy *CudaThreadSynchronizeFcnPtr;
@ -198,6 +221,9 @@ static int initialDeviceAPIs() {
CuModuleLoadDataExFcnPtr = CuModuleLoadDataExFcnPtr =
(CuModuleLoadDataExFcnTy *)getAPIHandle(HandleCuda, "cuModuleLoadDataEx"); (CuModuleLoadDataExFcnTy *)getAPIHandle(HandleCuda, "cuModuleLoadDataEx");
CuModuleLoadDataFcnPtr =
(CuModuleLoadDataFcnTy *)getAPIHandle(HandleCuda, "cuModuleLoadData");
CuModuleGetFunctionFcnPtr = (CuModuleGetFunctionFcnTy *)getAPIHandle( CuModuleGetFunctionFcnPtr = (CuModuleGetFunctionFcnTy *)getAPIHandle(
HandleCuda, "cuModuleGetFunction"); HandleCuda, "cuModuleGetFunction");
@ -208,6 +234,18 @@ static int initialDeviceAPIs() {
CuDeviceGetNameFcnPtr = CuDeviceGetNameFcnPtr =
(CuDeviceGetNameFcnTy *)getAPIHandle(HandleCuda, "cuDeviceGetName"); (CuDeviceGetNameFcnTy *)getAPIHandle(HandleCuda, "cuDeviceGetName");
CuLinkAddDataFcnPtr =
(CuLinkAddDataFcnTy *)getAPIHandle(HandleCuda, "cuLinkAddData");
CuLinkCreateFcnPtr =
(CuLinkCreateFcnTy *)getAPIHandle(HandleCuda, "cuLinkCreate");
CuLinkCompleteFcnPtr =
(CuLinkCompleteFcnTy *)getAPIHandle(HandleCuda, "cuLinkComplete");
CuLinkDestroyFcnPtr =
(CuLinkDestroyFcnTy *)getAPIHandle(HandleCuda, "cuLinkDestroy");
/* Get function pointer to CUDA Runtime APIs. */ /* Get function pointer to CUDA Runtime APIs. */
CudaThreadSynchronizeFcnPtr = (CudaThreadSynchronizeFcnTy *)getAPIHandle( CudaThreadSynchronizeFcnPtr = (CudaThreadSynchronizeFcnTy *)getAPIHandle(
HandleCudaRT, "cudaThreadSynchronize"); HandleCudaRT, "cudaThreadSynchronize");
@ -262,38 +300,93 @@ PollyGPUContext *polly_initContext() {
return Context; return Context;
} }
void polly_getPTXModule(void *PTXBuffer, PollyGPUModule **Module) { PollyGPUFunction *polly_getKernel(const char *PTXBuffer,
const char *KernelName) {
dump_function(); dump_function();
*Module = malloc(sizeof(PollyGPUModule)); PollyGPUFunction *Function = malloc(sizeof(PollyGPUFunction));
if (*Module == 0) {
fprintf(stdout, "Allocate memory for Polly GPU module failed.\n"); if (Function == 0) {
fprintf(stdout, "Allocate memory for Polly GPU function failed.\n");
exit(-1); exit(-1);
} }
if (CuModuleLoadDataExFcnPtr(&((*Module)->Cuda), PTXBuffer, 0, 0, 0) != CUresult Res;
CUDA_SUCCESS) { CUlinkState LState;
CUjit_option Options[6];
void *OptionVals[6];
float Walltime = 0;
unsigned long LogSize = 8192;
char ErrorLog[8192], InfoLog[8192];
void *CuOut;
size_t OutSize;
// Setup linker options
// Return walltime from JIT compilation
Options[0] = CU_JIT_WALL_TIME;
OptionVals[0] = (void *)&Walltime;
// Pass a buffer for info messages
Options[1] = CU_JIT_INFO_LOG_BUFFER;
OptionVals[1] = (void *)InfoLog;
// Pass the size of the info buffer
Options[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
OptionVals[2] = (void *)LogSize;
// Pass a buffer for error message
Options[3] = CU_JIT_ERROR_LOG_BUFFER;
OptionVals[3] = (void *)ErrorLog;
// Pass the size of the error buffer
Options[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
OptionVals[4] = (void *)LogSize;
// Make the linker verbose
Options[5] = CU_JIT_LOG_VERBOSE;
OptionVals[5] = (void *)1;
memset(ErrorLog, 0, sizeof(ErrorLog));
CuLinkCreateFcnPtr(6, Options, OptionVals, &LState);
Res = CuLinkAddDataFcnPtr(LState, CU_JIT_INPUT_PTX, (void *)PTXBuffer,
strlen(PTXBuffer) + 1, 0, 0, 0, 0);
if (Res != CUDA_SUCCESS) {
fprintf(stdout, "PTX Linker Error:\n%s\n%s", ErrorLog, InfoLog);
exit(-1);
}
Res = CuLinkCompleteFcnPtr(LState, &CuOut, &OutSize);
if (Res != CUDA_SUCCESS) {
fprintf(stdout, "Complete ptx linker step failed.\n");
fprintf(stdout, "\n%s\n", ErrorLog);
exit(-1);
}
debug_print("CUDA Link Completed in %fms. Linker Output:\n%s\n", Walltime,
InfoLog);
Res = CuModuleLoadDataFcnPtr(&(Function->CudaModule), CuOut);
if (Res != CUDA_SUCCESS) {
fprintf(stdout, "Loading ptx assembly text failed.\n"); fprintf(stdout, "Loading ptx assembly text failed.\n");
exit(-1); exit(-1);
} }
}
void polly_getPTXKernelEntry(const char *KernelName, PollyGPUModule *Module, Res = CuModuleGetFunctionFcnPtr(&(Function->Cuda), Function->CudaModule,
PollyGPUFunction **Kernel) { KernelName);
dump_function(); if (Res != CUDA_SUCCESS) {
*Kernel = malloc(sizeof(PollyGPUFunction));
if (*Kernel == 0) {
fprintf(stdout, "Allocate memory for Polly GPU kernel failed.\n");
exit(-1);
}
/* Locate the kernel entry point. */
if (CuModuleGetFunctionFcnPtr(&((*Kernel)->Cuda), Module->Cuda, KernelName) !=
CUDA_SUCCESS) {
fprintf(stdout, "Loading kernel function failed.\n"); fprintf(stdout, "Loading kernel function failed.\n");
exit(-1); exit(-1);
} }
CuLinkDestroyFcnPtr(LState);
return Function;
}
/*
 * Release a kernel object.
 *
 * If the kernel carries a CUDA module handle, the module is unloaded through
 * CuModuleUnloadFcnPtr; afterwards the PollyGPUFunction structure itself is
 * freed. Passing a null pointer is a no-op.
 *
 * Kernel: the kernel object to release (may be null).
 */
void polly_freeKernel(PollyGPUFunction *Kernel) {
  dump_function();

  /* Validate the pointer before reading any of its fields; checking it only
   * after accessing Kernel->CudaModule would dereference a null pointer. */
  if (!Kernel)
    return;

  if (Kernel->CudaModule)
    CuModuleUnloadFcnPtr(Kernel->CudaModule);

  free(Kernel);
}
void polly_copyFromHostToDevice(void *HostData, PollyGPUDevicePtr *DevData, void polly_copyFromHostToDevice(void *HostData, PollyGPUDevicePtr *DevData,

View File

@ -44,7 +44,6 @@
* const char *Entry = "_Z8myKernelPi"; * const char *Entry = "_Z8myKernelPi";
* *
* int main() { * int main() {
* PollyGPUModule *Module;
* PollyGPUFunction *Kernel; * PollyGPUFunction *Kernel;
* PollyGPUContext *Context; * PollyGPUContext *Context;
* PollyGPUDevicePtr *DevArray; * PollyGPUDevicePtr *DevArray;
@ -58,11 +57,11 @@
* MemSize = 256*64*sizeof(int); * MemSize = 256*64*sizeof(int);
* Context = polly_initContext(); * Context = polly_initContext();
* DevArray = polly_allocateMemoryForDevice(MemSize); * DevArray = polly_allocateMemoryForDevice(MemSize);
* polly_getPTXModule(KernelString, &Module); * Kernel = polly_getKernel(KernelString, Entry);
* polly_getPTXKernelEntry(Entry, Module, &Kernel);
* polly_setKernelParameters(Kernel, BlockWidth, BlockHeight, DevData); * polly_setKernelParameters(Kernel, BlockWidth, BlockHeight, DevData);
* polly_launchKernel(Kernel, GridWidth, GridHeight); * polly_launchKernel(Kernel, GridWidth, GridHeight);
* polly_copyFromDeviceToHost(HostData, DevData, MemSize); * polly_copyFromDeviceToHost(HostData, DevData, MemSize);
* polly_freeKernel(Kernel);
* polly_freeDeviceMemory(DevArray); * polly_freeDeviceMemory(DevArray);
* polly_freeContext(Context); * polly_freeContext(Context);
* } * }
@ -70,14 +69,13 @@
*/ */
typedef struct PollyGPUContextT PollyGPUContext; typedef struct PollyGPUContextT PollyGPUContext;
typedef struct PollyGPUModuleT PollyGPUModule;
typedef struct PollyGPUFunctionT PollyGPUFunction; typedef struct PollyGPUFunctionT PollyGPUFunction;
typedef struct PollyGPUDevicePtrT PollyGPUDevicePtr; typedef struct PollyGPUDevicePtrT PollyGPUDevicePtr;
PollyGPUContext *polly_initContext(); PollyGPUContext *polly_initContext();
void polly_getPTXModule(void *PTXBuffer, PollyGPUModule **Module); PollyGPUFunction *polly_getKernel(const char *PTXBuffer,
void polly_getPTXKernelEntry(const char *KernelName, PollyGPUModule *Module, const char *KernelName);
PollyGPUFunction **Kernel); void polly_freeKernel(PollyGPUFunction *Kernel);
void polly_copyFromHostToDevice(void *HostData, PollyGPUDevicePtr *DevData, void polly_copyFromHostToDevice(void *HostData, PollyGPUDevicePtr *DevData,
long MemSize); long MemSize);
void polly_copyFromDeviceToHost(PollyGPUDevicePtr *DevData, void *HostData, void polly_copyFromDeviceToHost(PollyGPUDevicePtr *DevData, void *HostData,