GPGPU: Load GPU kernels

We embed the PTX code into the host IR as a global variable and compile it at run-time into a GPU kernel. llvm-svn: 276645
2024-12-27 02:09:54 +00:00 · 2016-07-25 16:31:21 +00:00 · 2016-07-25 16:31:21 +00:00 · 5779359624
commit 5779359624
parent 97de034e18
5 changed files with 186 additions and 34 deletions
--- a/polly/lib/CodeGen/PPCGCodeGeneration.cpp
+++ b/polly/lib/CodeGen/PPCGCodeGeneration.cpp
@ -281,7 +281,9 @@ private:
  ///
  /// Free the LLVM-IR module corresponding to the kernel and -- if requested --
  /// dump its IR to stderr.
-  void finalizeKernelFunction();
+  ///
+  /// @returns The Assembly string of the kernel.
+  std::string finalizeKernelFunction();

  /// Create code that allocates memory to store arrays on device.
  void allocateDeviceArrays();
@ -324,6 +326,19 @@ private:
  /// @param HostPtr A host pointer specifying the location to copy to.
  void createCallCopyFromDeviceToHost(Value *DevicePtr, Value *HostPtr,
                                      Value *Size);
+
+  /// Create a call to get a kernel from an assembly string.
+  ///
+  /// @param Buffer The string describing the kernel.
+  /// @param Entry  The name of the kernel function to call.
+  ///
+  /// @returns A pointer to a kernel object
+  Value *createCallGetKernel(Value *Buffer, Value *Entry);
+
+  /// Create a call to free a GPU kernel.
+  ///
+  /// @param GPUKernel THe kernel to free.
+  void createCallFreeKernel(Value *GPUKernel);
 };

 void GPUNodeBuilder::initializeAfterRTH() {
@ -360,6 +375,41 @@ void GPUNodeBuilder::freeDeviceArrays() {
    createCallFreeDeviceMemory(Array.second);
 }

+Value *GPUNodeBuilder::createCallGetKernel(Value *Buffer, Value *Entry) {
+  const char *Name = "polly_getKernel";
+  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
+  Function *F = M->getFunction(Name);
+
+  // If F is not available, declare it.
+  if (!F) {
+    GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
+    std::vector<Type *> Args;
+    Args.push_back(Builder.getInt8PtrTy());
+    Args.push_back(Builder.getInt8PtrTy());
+    FunctionType *Ty = FunctionType::get(Builder.getInt8PtrTy(), Args, false);
+    F = Function::Create(Ty, Linkage, Name, M);
+  }
+
+  return Builder.CreateCall(F, {Buffer, Entry});
+}
+
+void GPUNodeBuilder::createCallFreeKernel(Value *GPUKernel) {
+  const char *Name = "polly_freeKernel";
+  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
+  Function *F = M->getFunction(Name);
+
+  // If F is not available, declare it.
+  if (!F) {
+    GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
+    std::vector<Type *> Args;
+    Args.push_back(Builder.getInt8PtrTy());
+    FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false);
+    F = Function::Create(Ty, Linkage, Name, M);
+  }
+
+  Builder.CreateCall(F, {GPUKernel});
+}
+
 void GPUNodeBuilder::createCallFreeDeviceMemory(Value *Array) {
  const char *Name = "polly_freeDeviceMemory";
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
@ -755,7 +805,12 @@ void GPUNodeBuilder::createKernel(__isl_take isl_ast_node *KernelStmt) {
    S.invalidateScopArrayInfo(BasePtr, ScopArrayInfo::MK_Array);
  LocalArrays.clear();

-  finalizeKernelFunction();
+  std::string ASMString = finalizeKernelFunction();
+  std::string Name = "kernel_" + std::to_string(Kernel->id);
+  Value *KernelString = Builder.CreateGlobalStringPtr(ASMString, Name);
+  Value *NameString = Builder.CreateGlobalStringPtr(Name, Name + "_name");
+  Value *GPUKernel = createCallGetKernel(KernelString, NameString);
+  createCallFreeKernel(GPUKernel);
 }

 /// Compute the DataLayout string for the NVPTX backend.
@ -943,7 +998,7 @@ std::string GPUNodeBuilder::createKernelASM() {
  return ASMStream.str();
 }

-void GPUNodeBuilder::finalizeKernelFunction() {
+std::string GPUNodeBuilder::finalizeKernelFunction() {
  // Verify module.
  llvm::legacy::PassManager Passes;
  Passes.add(createVerifierPass());
@ -967,6 +1022,8 @@ void GPUNodeBuilder::finalizeKernelFunction() {

  GPUModule.release();
  KernelIDs.clear();
+
+  return Assembly;
 }

 namespace {
--- a/polly/test/GPGPU/double-parallel-loop.ll
+++ b/polly/test/GPGPU/double-parallel-loop.ll
@ -96,6 +96,8 @@
 ; IR-NEXT:    %p_dev_array_MemRef_A = call i8* @polly_allocateMemoryForDevice(i64 4194304)
 ; IR-NEXT:    [[HostPtr:%.*]] = bitcast [1024 x float]* %A to i8*
 ; IR-NEXT:    call void @polly_copyFromHostToDevice(i8* [[HostPtr]], i8* %p_dev_array_MemRef_A, i64 4194304)
+; IR-NEXT:    call i8* @polly_getKernel
+; IR-NEXT:    call void @polly_freeKernel
 ; IR-NEXT:    [[HostPtr2:%.*]] = bitcast [1024 x float]* %A to i8*
 ; IR-NEXT:    call void @polly_copyFromDeviceToHost(i8* %p_dev_array_MemRef_A, i8* [[HostPtr2]], i64 4194304)
 ; IR-NEXT:    call void @polly_freeDeviceMemory(i8* %p_dev_array_MemRef_A)
--- a/polly/test/GPGPU/host-control-flow.ll
+++ b/polly/test/GPGPU/host-control-flow.ll
@ -30,6 +30,8 @@

 ; IR-LABEL: polly.loop_header:                                ; preds = %polly.loop_header, %polly.loop_preheader
 ; IR-NEXT:   %polly.indvar = phi i64 [ 0, %polly.loop_preheader ], [ %polly.indvar_next, %polly.loop_header ]
+; IR-NEXT: call i8* @polly_getKernel
+; IR-NEXT: call void @polly_freeKernel
 ; IR-NEXT:   %polly.indvar_next = add nsw i64 %polly.indvar, 1
 ; IR-NEXT:   %polly.loop_cond = icmp sle i64 %polly.indvar, 98
 ; IR-NEXT:   br i1 %polly.loop_cond, label %polly.loop_header, label %polly.loop_exit
--- a/polly/tools/GPURuntime/GPUJIT.c
+++ b/polly/tools/GPURuntime/GPUJIT.c
@ -17,6 +17,7 @@
 #include <dlfcn.h>
 #include <stdarg.h>
 #include <stdio.h>
+#include <string.h>

 static int DebugMode;

@ -36,12 +37,9 @@ struct PollyGPUContextT {
  CUcontext Cuda;
 };

-struct PollyGPUModuleT {
-  CUmodule Cuda;
-};
-
 struct PollyGPUFunctionT {
  CUfunction Cuda;
+  CUmodule CudaModule;
 };

 struct PollyGPUDevicePtrT {
@ -101,6 +99,10 @@ typedef CUresult CUDAAPI CuModuleLoadDataExFcnTy(CUmodule *, const void *,
                                                 void **);
 static CuModuleLoadDataExFcnTy *CuModuleLoadDataExFcnPtr;

+typedef CUresult CUDAAPI CuModuleLoadDataFcnTy(CUmodule *module,
+                                               const void *image);
+static CuModuleLoadDataFcnTy *CuModuleLoadDataFcnPtr;
+
 typedef CUresult CUDAAPI CuModuleGetFunctionFcnTy(CUfunction *, CUmodule,
                                                  const char *);
 static CuModuleGetFunctionFcnTy *CuModuleGetFunctionFcnPtr;
@ -111,6 +113,27 @@ static CuDeviceComputeCapabilityFcnTy *CuDeviceComputeCapabilityFcnPtr;
 typedef CUresult CUDAAPI CuDeviceGetNameFcnTy(char *, int, CUdevice);
 static CuDeviceGetNameFcnTy *CuDeviceGetNameFcnPtr;

+typedef CUresult CUDAAPI CuLinkAddDataFcnTy(CUlinkState state,
+                                            CUjitInputType type, void *data,
+                                            size_t size, const char *name,
+                                            unsigned int numOptions,
+                                            CUjit_option *options,
+                                            void **optionValues);
+static CuLinkAddDataFcnTy *CuLinkAddDataFcnPtr;
+
+typedef CUresult CUDAAPI CuLinkCreateFcnTy(unsigned int numOptions,
+                                           CUjit_option *options,
+                                           void **optionValues,
+                                           CUlinkState *stateOut);
+static CuLinkCreateFcnTy *CuLinkCreateFcnPtr;
+
+typedef CUresult CUDAAPI CuLinkCompleteFcnTy(CUlinkState state, void **cubinOut,
+                                             size_t *sizeOut);
+static CuLinkCompleteFcnTy *CuLinkCompleteFcnPtr;
+
+typedef CUresult CUDAAPI CuLinkDestroyFcnTy(CUlinkState state);
+static CuLinkDestroyFcnTy *CuLinkDestroyFcnPtr;
+
 /* Type-defines of function pointer ot CUDA runtime APIs. */
 typedef cudaError_t CUDARTAPI CudaThreadSynchronizeFcnTy(void);
 static CudaThreadSynchronizeFcnTy *CudaThreadSynchronizeFcnPtr;
@ -198,6 +221,9 @@ static int initialDeviceAPIs() {
  CuModuleLoadDataExFcnPtr =
      (CuModuleLoadDataExFcnTy *)getAPIHandle(HandleCuda, "cuModuleLoadDataEx");

+  CuModuleLoadDataFcnPtr =
+      (CuModuleLoadDataFcnTy *)getAPIHandle(HandleCuda, "cuModuleLoadData");
+
  CuModuleGetFunctionFcnPtr = (CuModuleGetFunctionFcnTy *)getAPIHandle(
      HandleCuda, "cuModuleGetFunction");

@ -208,6 +234,18 @@ static int initialDeviceAPIs() {
  CuDeviceGetNameFcnPtr =
      (CuDeviceGetNameFcnTy *)getAPIHandle(HandleCuda, "cuDeviceGetName");

+  CuLinkAddDataFcnPtr =
+      (CuLinkAddDataFcnTy *)getAPIHandle(HandleCuda, "cuLinkAddData");
+
+  CuLinkCreateFcnPtr =
+      (CuLinkCreateFcnTy *)getAPIHandle(HandleCuda, "cuLinkCreate");
+
+  CuLinkCompleteFcnPtr =
+      (CuLinkCompleteFcnTy *)getAPIHandle(HandleCuda, "cuLinkComplete");
+
+  CuLinkDestroyFcnPtr =
+      (CuLinkDestroyFcnTy *)getAPIHandle(HandleCuda, "cuLinkDestroy");
+
  /* Get function pointer to CUDA Runtime APIs. */
  CudaThreadSynchronizeFcnPtr = (CudaThreadSynchronizeFcnTy *)getAPIHandle(
      HandleCudaRT, "cudaThreadSynchronize");
@ -262,38 +300,93 @@ PollyGPUContext *polly_initContext() {
  return Context;
 }

-void polly_getPTXModule(void *PTXBuffer, PollyGPUModule **Module) {
+PollyGPUFunction *polly_getKernel(const char *PTXBuffer,
+                                  const char *KernelName) {
  dump_function();

-  *Module = malloc(sizeof(PollyGPUModule));
-  if (*Module == 0) {
-    fprintf(stdout, "Allocate memory for Polly GPU module failed.\n");
+  PollyGPUFunction *Function = malloc(sizeof(PollyGPUFunction));
+
+  if (Function == 0) {
+    fprintf(stdout, "Allocate memory for Polly GPU function failed.\n");
    exit(-1);
  }

-  if (CuModuleLoadDataExFcnPtr(&((*Module)->Cuda), PTXBuffer, 0, 0, 0) !=
-      CUDA_SUCCESS) {
+  CUresult Res;
+  CUlinkState LState;
+  CUjit_option Options[6];
+  void *OptionVals[6];
+  float Walltime = 0;
+  unsigned long LogSize = 8192;
+  char ErrorLog[8192], InfoLog[8192];
+  void *CuOut;
+  size_t OutSize;
+
+  // Setup linker options
+  // Return walltime from JIT compilation
+  Options[0] = CU_JIT_WALL_TIME;
+  OptionVals[0] = (void *)&Walltime;
+  // Pass a buffer for info messages
+  Options[1] = CU_JIT_INFO_LOG_BUFFER;
+  OptionVals[1] = (void *)InfoLog;
+  // Pass the size of the info buffer
+  Options[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
+  OptionVals[2] = (void *)LogSize;
+  // Pass a buffer for error message
+  Options[3] = CU_JIT_ERROR_LOG_BUFFER;
+  OptionVals[3] = (void *)ErrorLog;
+  // Pass the size of the error buffer
+  Options[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
+  OptionVals[4] = (void *)LogSize;
+  // Make the linker verbose
+  Options[5] = CU_JIT_LOG_VERBOSE;
+  OptionVals[5] = (void *)1;
+
+  memset(ErrorLog, 0, sizeof(ErrorLog));
+
+  CuLinkCreateFcnPtr(6, Options, OptionVals, &LState);
+  Res = CuLinkAddDataFcnPtr(LState, CU_JIT_INPUT_PTX, (void *)PTXBuffer,
+                            strlen(PTXBuffer) + 1, 0, 0, 0, 0);
+  if (Res != CUDA_SUCCESS) {
+    fprintf(stdout, "PTX Linker Error:\n%s\n%s", ErrorLog, InfoLog);
+    exit(-1);
+  }
+
+  Res = CuLinkCompleteFcnPtr(LState, &CuOut, &OutSize);
+  if (Res != CUDA_SUCCESS) {
+    fprintf(stdout, "Complete ptx linker step failed.\n");
+    fprintf(stdout, "\n%s\n", ErrorLog);
+    exit(-1);
+  }
+
+  debug_print("CUDA Link Completed in %fms. Linker Output:\n%s\n", Walltime,
+              InfoLog);
+
+  Res = CuModuleLoadDataFcnPtr(&(Function->CudaModule), CuOut);
+  if (Res != CUDA_SUCCESS) {
    fprintf(stdout, "Loading ptx assembly text failed.\n");
    exit(-1);
  }
-}

-void polly_getPTXKernelEntry(const char *KernelName, PollyGPUModule *Module,
-                             PollyGPUFunction **Kernel) {
-  dump_function();
-
-  *Kernel = malloc(sizeof(PollyGPUFunction));
-  if (*Kernel == 0) {
-    fprintf(stdout, "Allocate memory for Polly GPU kernel failed.\n");
-    exit(-1);
-  }
-
-  /* Locate the kernel entry point. */
-  if (CuModuleGetFunctionFcnPtr(&((*Kernel)->Cuda), Module->Cuda, KernelName) !=
-      CUDA_SUCCESS) {
+  Res = CuModuleGetFunctionFcnPtr(&(Function->Cuda), Function->CudaModule,
+                                  KernelName);
+  if (Res != CUDA_SUCCESS) {
    fprintf(stdout, "Loading kernel function failed.\n");
    exit(-1);
  }
+
+  CuLinkDestroyFcnPtr(LState);
+
+  return Function;
+}
+
+void polly_freeKernel(PollyGPUFunction *Kernel) {
+  dump_function();
+
+  if (Kernel->CudaModule)
+    CuModuleUnloadFcnPtr(Kernel->CudaModule);
+
+  if (Kernel)
+    free(Kernel);
 }

 void polly_copyFromHostToDevice(void *HostData, PollyGPUDevicePtr *DevData,
--- a/polly/tools/GPURuntime/GPUJIT.h
+++ b/polly/tools/GPURuntime/GPUJIT.h
@ -44,7 +44,6 @@
 * const char *Entry = "_Z8myKernelPi";
 *
 * int main() {
- *   PollyGPUModule *Module;
 *   PollyGPUFunction *Kernel;
 *   PollyGPUContext *Context;
 *   PollyGPUDevicePtr *DevArray;
@ -58,11 +57,11 @@
 *   MemSize = 256*64*sizeof(int);
 *   Context = polly_initContext();
 *   DevArray = polly_allocateMemoryForDevice(MemSize);
- *   polly_getPTXModule(KernelString, &Module);
- *   polly_getPTXKernelEntry(Entry, Module, &Kernel);
+ *   Kernel = polly_getKernel(KernelString, KernelName);
 *   polly_setKernelParameters(Kernel, BlockWidth, BlockHeight, DevData);
 *   polly_launchKernel(Kernel, GridWidth, GridHeight);
 *   polly_copyFromDeviceToHost(HostData, DevData, MemSize);
+ *   polly_freeKernel(Kernel);
 *   polly_freeDeviceMemory(DevArray);
 *   polly_freeContext(Context);
 * }
@ -70,14 +69,13 @@
 */

 typedef struct PollyGPUContextT PollyGPUContext;
-typedef struct PollyGPUModuleT PollyGPUModule;
 typedef struct PollyGPUFunctionT PollyGPUFunction;
 typedef struct PollyGPUDevicePtrT PollyGPUDevicePtr;

 PollyGPUContext *polly_initContext();
-void polly_getPTXModule(void *PTXBuffer, PollyGPUModule **Module);
-void polly_getPTXKernelEntry(const char *KernelName, PollyGPUModule *Module,
-                             PollyGPUFunction **Kernel);
+PollyGPUFunction *polly_getKernel(const char *PTXBuffer,
+                                  const char *KernelName);
+void polly_freeKernel(PollyGPUFunction *Kernel);
 void polly_copyFromHostToDevice(void *HostData, PollyGPUDevicePtr *DevData,
                                long MemSize);
 void polly_copyFromDeviceToHost(PollyGPUDevicePtr *DevData, void *HostData,