mirror of
https://github.com/capstone-engine/llvm-capstone.git
synced 2025-01-14 03:45:33 +00:00
GPGPU: Load GPU kernels
We embed the PTX code into the host IR as a global variable and compile it at run-time into a GPU kernel. llvm-svn: 276645
This commit is contained in:
parent
97de034e18
commit
5779359624
@ -281,7 +281,9 @@ private:
|
|||||||
///
|
///
|
||||||
/// Free the LLVM-IR module corresponding to the kernel and -- if requested --
|
/// Free the LLVM-IR module corresponding to the kernel and -- if requested --
|
||||||
/// dump its IR to stderr.
|
/// dump its IR to stderr.
|
||||||
void finalizeKernelFunction();
|
///
|
||||||
|
/// @returns The Assembly string of the kernel.
|
||||||
|
std::string finalizeKernelFunction();
|
||||||
|
|
||||||
/// Create code that allocates memory to store arrays on device.
|
/// Create code that allocates memory to store arrays on device.
|
||||||
void allocateDeviceArrays();
|
void allocateDeviceArrays();
|
||||||
@ -324,6 +326,19 @@ private:
|
|||||||
/// @param HostPtr A host pointer specifying the location to copy to.
|
/// @param HostPtr A host pointer specifying the location to copy to.
|
||||||
void createCallCopyFromDeviceToHost(Value *DevicePtr, Value *HostPtr,
|
void createCallCopyFromDeviceToHost(Value *DevicePtr, Value *HostPtr,
|
||||||
Value *Size);
|
Value *Size);
|
||||||
|
|
||||||
|
/// Create a call to get a kernel from an assembly string.
|
||||||
|
///
|
||||||
|
/// @param Buffer The string describing the kernel.
|
||||||
|
/// @param Entry The name of the kernel function to call.
|
||||||
|
///
|
||||||
|
/// @returns A pointer to a kernel object
|
||||||
|
Value *createCallGetKernel(Value *Buffer, Value *Entry);
|
||||||
|
|
||||||
|
/// Create a call to free a GPU kernel.
|
||||||
|
///
|
||||||
|
/// @param GPUKernel The kernel to free.
|
||||||
|
void createCallFreeKernel(Value *GPUKernel);
|
||||||
};
|
};
|
||||||
|
|
||||||
void GPUNodeBuilder::initializeAfterRTH() {
|
void GPUNodeBuilder::initializeAfterRTH() {
|
||||||
@ -360,6 +375,41 @@ void GPUNodeBuilder::freeDeviceArrays() {
|
|||||||
createCallFreeDeviceMemory(Array.second);
|
createCallFreeDeviceMemory(Array.second);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Value *GPUNodeBuilder::createCallGetKernel(Value *Buffer, Value *Entry) {
|
||||||
|
const char *Name = "polly_getKernel";
|
||||||
|
Module *M = Builder.GetInsertBlock()->getParent()->getParent();
|
||||||
|
Function *F = M->getFunction(Name);
|
||||||
|
|
||||||
|
// If F is not available, declare it.
|
||||||
|
if (!F) {
|
||||||
|
GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
|
||||||
|
std::vector<Type *> Args;
|
||||||
|
Args.push_back(Builder.getInt8PtrTy());
|
||||||
|
Args.push_back(Builder.getInt8PtrTy());
|
||||||
|
FunctionType *Ty = FunctionType::get(Builder.getInt8PtrTy(), Args, false);
|
||||||
|
F = Function::Create(Ty, Linkage, Name, M);
|
||||||
|
}
|
||||||
|
|
||||||
|
return Builder.CreateCall(F, {Buffer, Entry});
|
||||||
|
}
|
||||||
|
|
||||||
|
void GPUNodeBuilder::createCallFreeKernel(Value *GPUKernel) {
|
||||||
|
const char *Name = "polly_freeKernel";
|
||||||
|
Module *M = Builder.GetInsertBlock()->getParent()->getParent();
|
||||||
|
Function *F = M->getFunction(Name);
|
||||||
|
|
||||||
|
// If F is not available, declare it.
|
||||||
|
if (!F) {
|
||||||
|
GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
|
||||||
|
std::vector<Type *> Args;
|
||||||
|
Args.push_back(Builder.getInt8PtrTy());
|
||||||
|
FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false);
|
||||||
|
F = Function::Create(Ty, Linkage, Name, M);
|
||||||
|
}
|
||||||
|
|
||||||
|
Builder.CreateCall(F, {GPUKernel});
|
||||||
|
}
|
||||||
|
|
||||||
void GPUNodeBuilder::createCallFreeDeviceMemory(Value *Array) {
|
void GPUNodeBuilder::createCallFreeDeviceMemory(Value *Array) {
|
||||||
const char *Name = "polly_freeDeviceMemory";
|
const char *Name = "polly_freeDeviceMemory";
|
||||||
Module *M = Builder.GetInsertBlock()->getParent()->getParent();
|
Module *M = Builder.GetInsertBlock()->getParent()->getParent();
|
||||||
@ -755,7 +805,12 @@ void GPUNodeBuilder::createKernel(__isl_take isl_ast_node *KernelStmt) {
|
|||||||
S.invalidateScopArrayInfo(BasePtr, ScopArrayInfo::MK_Array);
|
S.invalidateScopArrayInfo(BasePtr, ScopArrayInfo::MK_Array);
|
||||||
LocalArrays.clear();
|
LocalArrays.clear();
|
||||||
|
|
||||||
finalizeKernelFunction();
|
std::string ASMString = finalizeKernelFunction();
|
||||||
|
std::string Name = "kernel_" + std::to_string(Kernel->id);
|
||||||
|
Value *KernelString = Builder.CreateGlobalStringPtr(ASMString, Name);
|
||||||
|
Value *NameString = Builder.CreateGlobalStringPtr(Name, Name + "_name");
|
||||||
|
Value *GPUKernel = createCallGetKernel(KernelString, NameString);
|
||||||
|
createCallFreeKernel(GPUKernel);
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Compute the DataLayout string for the NVPTX backend.
|
/// Compute the DataLayout string for the NVPTX backend.
|
||||||
@ -943,7 +998,7 @@ std::string GPUNodeBuilder::createKernelASM() {
|
|||||||
return ASMStream.str();
|
return ASMStream.str();
|
||||||
}
|
}
|
||||||
|
|
||||||
void GPUNodeBuilder::finalizeKernelFunction() {
|
std::string GPUNodeBuilder::finalizeKernelFunction() {
|
||||||
// Verify module.
|
// Verify module.
|
||||||
llvm::legacy::PassManager Passes;
|
llvm::legacy::PassManager Passes;
|
||||||
Passes.add(createVerifierPass());
|
Passes.add(createVerifierPass());
|
||||||
@ -967,6 +1022,8 @@ void GPUNodeBuilder::finalizeKernelFunction() {
|
|||||||
|
|
||||||
GPUModule.release();
|
GPUModule.release();
|
||||||
KernelIDs.clear();
|
KernelIDs.clear();
|
||||||
|
|
||||||
|
return Assembly;
|
||||||
}
|
}
|
||||||
|
|
||||||
namespace {
|
namespace {
|
||||||
|
@ -96,6 +96,8 @@
|
|||||||
; IR-NEXT: %p_dev_array_MemRef_A = call i8* @polly_allocateMemoryForDevice(i64 4194304)
|
; IR-NEXT: %p_dev_array_MemRef_A = call i8* @polly_allocateMemoryForDevice(i64 4194304)
|
||||||
; IR-NEXT: [[HostPtr:%.*]] = bitcast [1024 x float]* %A to i8*
|
; IR-NEXT: [[HostPtr:%.*]] = bitcast [1024 x float]* %A to i8*
|
||||||
; IR-NEXT: call void @polly_copyFromHostToDevice(i8* [[HostPtr]], i8* %p_dev_array_MemRef_A, i64 4194304)
|
; IR-NEXT: call void @polly_copyFromHostToDevice(i8* [[HostPtr]], i8* %p_dev_array_MemRef_A, i64 4194304)
|
||||||
|
; IR-NEXT: call i8* @polly_getKernel
|
||||||
|
; IR-NEXT: call void @polly_freeKernel
|
||||||
; IR-NEXT: [[HostPtr2:%.*]] = bitcast [1024 x float]* %A to i8*
|
; IR-NEXT: [[HostPtr2:%.*]] = bitcast [1024 x float]* %A to i8*
|
||||||
; IR-NEXT: call void @polly_copyFromDeviceToHost(i8* %p_dev_array_MemRef_A, i8* [[HostPtr2]], i64 4194304)
|
; IR-NEXT: call void @polly_copyFromDeviceToHost(i8* %p_dev_array_MemRef_A, i8* [[HostPtr2]], i64 4194304)
|
||||||
; IR-NEXT: call void @polly_freeDeviceMemory(i8* %p_dev_array_MemRef_A)
|
; IR-NEXT: call void @polly_freeDeviceMemory(i8* %p_dev_array_MemRef_A)
|
||||||
|
@ -30,6 +30,8 @@
|
|||||||
|
|
||||||
; IR-LABEL: polly.loop_header: ; preds = %polly.loop_header, %polly.loop_preheader
|
; IR-LABEL: polly.loop_header: ; preds = %polly.loop_header, %polly.loop_preheader
|
||||||
; IR-NEXT: %polly.indvar = phi i64 [ 0, %polly.loop_preheader ], [ %polly.indvar_next, %polly.loop_header ]
|
; IR-NEXT: %polly.indvar = phi i64 [ 0, %polly.loop_preheader ], [ %polly.indvar_next, %polly.loop_header ]
|
||||||
|
; IR-NEXT: call i8* @polly_getKernel
|
||||||
|
; IR-NEXT: call void @polly_freeKernel
|
||||||
; IR-NEXT: %polly.indvar_next = add nsw i64 %polly.indvar, 1
|
; IR-NEXT: %polly.indvar_next = add nsw i64 %polly.indvar, 1
|
||||||
; IR-NEXT: %polly.loop_cond = icmp sle i64 %polly.indvar, 98
|
; IR-NEXT: %polly.loop_cond = icmp sle i64 %polly.indvar, 98
|
||||||
; IR-NEXT: br i1 %polly.loop_cond, label %polly.loop_header, label %polly.loop_exit
|
; IR-NEXT: br i1 %polly.loop_cond, label %polly.loop_header, label %polly.loop_exit
|
||||||
|
@ -17,6 +17,7 @@
|
|||||||
#include <dlfcn.h>
|
#include <dlfcn.h>
|
||||||
#include <stdarg.h>
|
#include <stdarg.h>
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
|
#include <string.h>
|
||||||
|
|
||||||
static int DebugMode;
|
static int DebugMode;
|
||||||
|
|
||||||
@ -36,12 +37,9 @@ struct PollyGPUContextT {
|
|||||||
CUcontext Cuda;
|
CUcontext Cuda;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct PollyGPUModuleT {
|
|
||||||
CUmodule Cuda;
|
|
||||||
};
|
|
||||||
|
|
||||||
struct PollyGPUFunctionT {
|
struct PollyGPUFunctionT {
|
||||||
CUfunction Cuda;
|
CUfunction Cuda;
|
||||||
|
CUmodule CudaModule;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct PollyGPUDevicePtrT {
|
struct PollyGPUDevicePtrT {
|
||||||
@ -101,6 +99,10 @@ typedef CUresult CUDAAPI CuModuleLoadDataExFcnTy(CUmodule *, const void *,
|
|||||||
void **);
|
void **);
|
||||||
static CuModuleLoadDataExFcnTy *CuModuleLoadDataExFcnPtr;
|
static CuModuleLoadDataExFcnTy *CuModuleLoadDataExFcnPtr;
|
||||||
|
|
||||||
|
typedef CUresult CUDAAPI CuModuleLoadDataFcnTy(CUmodule *module,
|
||||||
|
const void *image);
|
||||||
|
static CuModuleLoadDataFcnTy *CuModuleLoadDataFcnPtr;
|
||||||
|
|
||||||
typedef CUresult CUDAAPI CuModuleGetFunctionFcnTy(CUfunction *, CUmodule,
|
typedef CUresult CUDAAPI CuModuleGetFunctionFcnTy(CUfunction *, CUmodule,
|
||||||
const char *);
|
const char *);
|
||||||
static CuModuleGetFunctionFcnTy *CuModuleGetFunctionFcnPtr;
|
static CuModuleGetFunctionFcnTy *CuModuleGetFunctionFcnPtr;
|
||||||
@ -111,6 +113,27 @@ static CuDeviceComputeCapabilityFcnTy *CuDeviceComputeCapabilityFcnPtr;
|
|||||||
typedef CUresult CUDAAPI CuDeviceGetNameFcnTy(char *, int, CUdevice);
|
typedef CUresult CUDAAPI CuDeviceGetNameFcnTy(char *, int, CUdevice);
|
||||||
static CuDeviceGetNameFcnTy *CuDeviceGetNameFcnPtr;
|
static CuDeviceGetNameFcnTy *CuDeviceGetNameFcnPtr;
|
||||||
|
|
||||||
|
typedef CUresult CUDAAPI CuLinkAddDataFcnTy(CUlinkState state,
|
||||||
|
CUjitInputType type, void *data,
|
||||||
|
size_t size, const char *name,
|
||||||
|
unsigned int numOptions,
|
||||||
|
CUjit_option *options,
|
||||||
|
void **optionValues);
|
||||||
|
static CuLinkAddDataFcnTy *CuLinkAddDataFcnPtr;
|
||||||
|
|
||||||
|
typedef CUresult CUDAAPI CuLinkCreateFcnTy(unsigned int numOptions,
|
||||||
|
CUjit_option *options,
|
||||||
|
void **optionValues,
|
||||||
|
CUlinkState *stateOut);
|
||||||
|
static CuLinkCreateFcnTy *CuLinkCreateFcnPtr;
|
||||||
|
|
||||||
|
typedef CUresult CUDAAPI CuLinkCompleteFcnTy(CUlinkState state, void **cubinOut,
|
||||||
|
size_t *sizeOut);
|
||||||
|
static CuLinkCompleteFcnTy *CuLinkCompleteFcnPtr;
|
||||||
|
|
||||||
|
typedef CUresult CUDAAPI CuLinkDestroyFcnTy(CUlinkState state);
|
||||||
|
static CuLinkDestroyFcnTy *CuLinkDestroyFcnPtr;
|
||||||
|
|
||||||
/* Type-defines of function pointer to CUDA runtime APIs. */
|
/* Type-defines of function pointer to CUDA runtime APIs. */
|
||||||
typedef cudaError_t CUDARTAPI CudaThreadSynchronizeFcnTy(void);
|
typedef cudaError_t CUDARTAPI CudaThreadSynchronizeFcnTy(void);
|
||||||
static CudaThreadSynchronizeFcnTy *CudaThreadSynchronizeFcnPtr;
|
static CudaThreadSynchronizeFcnTy *CudaThreadSynchronizeFcnPtr;
|
||||||
@ -198,6 +221,9 @@ static int initialDeviceAPIs() {
|
|||||||
CuModuleLoadDataExFcnPtr =
|
CuModuleLoadDataExFcnPtr =
|
||||||
(CuModuleLoadDataExFcnTy *)getAPIHandle(HandleCuda, "cuModuleLoadDataEx");
|
(CuModuleLoadDataExFcnTy *)getAPIHandle(HandleCuda, "cuModuleLoadDataEx");
|
||||||
|
|
||||||
|
CuModuleLoadDataFcnPtr =
|
||||||
|
(CuModuleLoadDataFcnTy *)getAPIHandle(HandleCuda, "cuModuleLoadData");
|
||||||
|
|
||||||
CuModuleGetFunctionFcnPtr = (CuModuleGetFunctionFcnTy *)getAPIHandle(
|
CuModuleGetFunctionFcnPtr = (CuModuleGetFunctionFcnTy *)getAPIHandle(
|
||||||
HandleCuda, "cuModuleGetFunction");
|
HandleCuda, "cuModuleGetFunction");
|
||||||
|
|
||||||
@ -208,6 +234,18 @@ static int initialDeviceAPIs() {
|
|||||||
CuDeviceGetNameFcnPtr =
|
CuDeviceGetNameFcnPtr =
|
||||||
(CuDeviceGetNameFcnTy *)getAPIHandle(HandleCuda, "cuDeviceGetName");
|
(CuDeviceGetNameFcnTy *)getAPIHandle(HandleCuda, "cuDeviceGetName");
|
||||||
|
|
||||||
|
CuLinkAddDataFcnPtr =
|
||||||
|
(CuLinkAddDataFcnTy *)getAPIHandle(HandleCuda, "cuLinkAddData");
|
||||||
|
|
||||||
|
CuLinkCreateFcnPtr =
|
||||||
|
(CuLinkCreateFcnTy *)getAPIHandle(HandleCuda, "cuLinkCreate");
|
||||||
|
|
||||||
|
CuLinkCompleteFcnPtr =
|
||||||
|
(CuLinkCompleteFcnTy *)getAPIHandle(HandleCuda, "cuLinkComplete");
|
||||||
|
|
||||||
|
CuLinkDestroyFcnPtr =
|
||||||
|
(CuLinkDestroyFcnTy *)getAPIHandle(HandleCuda, "cuLinkDestroy");
|
||||||
|
|
||||||
/* Get function pointer to CUDA Runtime APIs. */
|
/* Get function pointer to CUDA Runtime APIs. */
|
||||||
CudaThreadSynchronizeFcnPtr = (CudaThreadSynchronizeFcnTy *)getAPIHandle(
|
CudaThreadSynchronizeFcnPtr = (CudaThreadSynchronizeFcnTy *)getAPIHandle(
|
||||||
HandleCudaRT, "cudaThreadSynchronize");
|
HandleCudaRT, "cudaThreadSynchronize");
|
||||||
@ -262,38 +300,93 @@ PollyGPUContext *polly_initContext() {
|
|||||||
return Context;
|
return Context;
|
||||||
}
|
}
|
||||||
|
|
||||||
void polly_getPTXModule(void *PTXBuffer, PollyGPUModule **Module) {
|
PollyGPUFunction *polly_getKernel(const char *PTXBuffer,
|
||||||
|
const char *KernelName) {
|
||||||
dump_function();
|
dump_function();
|
||||||
|
|
||||||
*Module = malloc(sizeof(PollyGPUModule));
|
PollyGPUFunction *Function = malloc(sizeof(PollyGPUFunction));
|
||||||
if (*Module == 0) {
|
|
||||||
fprintf(stdout, "Allocate memory for Polly GPU module failed.\n");
|
if (Function == 0) {
|
||||||
|
fprintf(stdout, "Allocate memory for Polly GPU function failed.\n");
|
||||||
exit(-1);
|
exit(-1);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (CuModuleLoadDataExFcnPtr(&((*Module)->Cuda), PTXBuffer, 0, 0, 0) !=
|
CUresult Res;
|
||||||
CUDA_SUCCESS) {
|
CUlinkState LState;
|
||||||
|
CUjit_option Options[6];
|
||||||
|
void *OptionVals[6];
|
||||||
|
float Walltime = 0;
|
||||||
|
unsigned long LogSize = 8192;
|
||||||
|
char ErrorLog[8192], InfoLog[8192];
|
||||||
|
void *CuOut;
|
||||||
|
size_t OutSize;
|
||||||
|
|
||||||
|
// Setup linker options
|
||||||
|
// Return walltime from JIT compilation
|
||||||
|
Options[0] = CU_JIT_WALL_TIME;
|
||||||
|
OptionVals[0] = (void *)&Walltime;
|
||||||
|
// Pass a buffer for info messages
|
||||||
|
Options[1] = CU_JIT_INFO_LOG_BUFFER;
|
||||||
|
OptionVals[1] = (void *)InfoLog;
|
||||||
|
// Pass the size of the info buffer
|
||||||
|
Options[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
|
||||||
|
OptionVals[2] = (void *)LogSize;
|
||||||
|
// Pass a buffer for error message
|
||||||
|
Options[3] = CU_JIT_ERROR_LOG_BUFFER;
|
||||||
|
OptionVals[3] = (void *)ErrorLog;
|
||||||
|
// Pass the size of the error buffer
|
||||||
|
Options[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
|
||||||
|
OptionVals[4] = (void *)LogSize;
|
||||||
|
// Make the linker verbose
|
||||||
|
Options[5] = CU_JIT_LOG_VERBOSE;
|
||||||
|
OptionVals[5] = (void *)1;
|
||||||
|
|
||||||
|
memset(ErrorLog, 0, sizeof(ErrorLog));
|
||||||
|
|
||||||
|
CuLinkCreateFcnPtr(6, Options, OptionVals, &LState);
|
||||||
|
Res = CuLinkAddDataFcnPtr(LState, CU_JIT_INPUT_PTX, (void *)PTXBuffer,
|
||||||
|
strlen(PTXBuffer) + 1, 0, 0, 0, 0);
|
||||||
|
if (Res != CUDA_SUCCESS) {
|
||||||
|
fprintf(stdout, "PTX Linker Error:\n%s\n%s", ErrorLog, InfoLog);
|
||||||
|
exit(-1);
|
||||||
|
}
|
||||||
|
|
||||||
|
Res = CuLinkCompleteFcnPtr(LState, &CuOut, &OutSize);
|
||||||
|
if (Res != CUDA_SUCCESS) {
|
||||||
|
fprintf(stdout, "Complete ptx linker step failed.\n");
|
||||||
|
fprintf(stdout, "\n%s\n", ErrorLog);
|
||||||
|
exit(-1);
|
||||||
|
}
|
||||||
|
|
||||||
|
debug_print("CUDA Link Completed in %fms. Linker Output:\n%s\n", Walltime,
|
||||||
|
InfoLog);
|
||||||
|
|
||||||
|
Res = CuModuleLoadDataFcnPtr(&(Function->CudaModule), CuOut);
|
||||||
|
if (Res != CUDA_SUCCESS) {
|
||||||
fprintf(stdout, "Loading ptx assembly text failed.\n");
|
fprintf(stdout, "Loading ptx assembly text failed.\n");
|
||||||
exit(-1);
|
exit(-1);
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
void polly_getPTXKernelEntry(const char *KernelName, PollyGPUModule *Module,
|
Res = CuModuleGetFunctionFcnPtr(&(Function->Cuda), Function->CudaModule,
|
||||||
PollyGPUFunction **Kernel) {
|
KernelName);
|
||||||
dump_function();
|
if (Res != CUDA_SUCCESS) {
|
||||||
|
|
||||||
*Kernel = malloc(sizeof(PollyGPUFunction));
|
|
||||||
if (*Kernel == 0) {
|
|
||||||
fprintf(stdout, "Allocate memory for Polly GPU kernel failed.\n");
|
|
||||||
exit(-1);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Locate the kernel entry point. */
|
|
||||||
if (CuModuleGetFunctionFcnPtr(&((*Kernel)->Cuda), Module->Cuda, KernelName) !=
|
|
||||||
CUDA_SUCCESS) {
|
|
||||||
fprintf(stdout, "Loading kernel function failed.\n");
|
fprintf(stdout, "Loading kernel function failed.\n");
|
||||||
exit(-1);
|
exit(-1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
CuLinkDestroyFcnPtr(LState);
|
||||||
|
|
||||||
|
return Function;
|
||||||
|
}
|
||||||
|
|
||||||
|
void polly_freeKernel(PollyGPUFunction *Kernel) {
|
||||||
|
dump_function();
|
||||||
|
|
||||||
|
if (Kernel->CudaModule)
|
||||||
|
CuModuleUnloadFcnPtr(Kernel->CudaModule);
|
||||||
|
|
||||||
|
if (Kernel)
|
||||||
|
free(Kernel);
|
||||||
}
|
}
|
||||||
|
|
||||||
void polly_copyFromHostToDevice(void *HostData, PollyGPUDevicePtr *DevData,
|
void polly_copyFromHostToDevice(void *HostData, PollyGPUDevicePtr *DevData,
|
||||||
|
@ -44,7 +44,6 @@
|
|||||||
* const char *Entry = "_Z8myKernelPi";
|
* const char *Entry = "_Z8myKernelPi";
|
||||||
*
|
*
|
||||||
* int main() {
|
* int main() {
|
||||||
* PollyGPUModule *Module;
|
|
||||||
* PollyGPUFunction *Kernel;
|
* PollyGPUFunction *Kernel;
|
||||||
* PollyGPUContext *Context;
|
* PollyGPUContext *Context;
|
||||||
* PollyGPUDevicePtr *DevArray;
|
* PollyGPUDevicePtr *DevArray;
|
||||||
@ -58,11 +57,11 @@
|
|||||||
* MemSize = 256*64*sizeof(int);
|
* MemSize = 256*64*sizeof(int);
|
||||||
* Context = polly_initContext();
|
* Context = polly_initContext();
|
||||||
* DevArray = polly_allocateMemoryForDevice(MemSize);
|
* DevArray = polly_allocateMemoryForDevice(MemSize);
|
||||||
* polly_getPTXModule(KernelString, &Module);
|
* Kernel = polly_getKernel(KernelString, Entry);
|
||||||
* polly_getPTXKernelEntry(Entry, Module, &Kernel);
|
|
||||||
* polly_setKernelParameters(Kernel, BlockWidth, BlockHeight, DevData);
|
* polly_setKernelParameters(Kernel, BlockWidth, BlockHeight, DevData);
|
||||||
* polly_launchKernel(Kernel, GridWidth, GridHeight);
|
* polly_launchKernel(Kernel, GridWidth, GridHeight);
|
||||||
* polly_copyFromDeviceToHost(HostData, DevData, MemSize);
|
* polly_copyFromDeviceToHost(HostData, DevData, MemSize);
|
||||||
|
* polly_freeKernel(Kernel);
|
||||||
* polly_freeDeviceMemory(DevArray);
|
* polly_freeDeviceMemory(DevArray);
|
||||||
* polly_freeContext(Context);
|
* polly_freeContext(Context);
|
||||||
* }
|
* }
|
||||||
@ -70,14 +69,13 @@
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
typedef struct PollyGPUContextT PollyGPUContext;
|
typedef struct PollyGPUContextT PollyGPUContext;
|
||||||
typedef struct PollyGPUModuleT PollyGPUModule;
|
|
||||||
typedef struct PollyGPUFunctionT PollyGPUFunction;
|
typedef struct PollyGPUFunctionT PollyGPUFunction;
|
||||||
typedef struct PollyGPUDevicePtrT PollyGPUDevicePtr;
|
typedef struct PollyGPUDevicePtrT PollyGPUDevicePtr;
|
||||||
|
|
||||||
PollyGPUContext *polly_initContext();
|
PollyGPUContext *polly_initContext();
|
||||||
void polly_getPTXModule(void *PTXBuffer, PollyGPUModule **Module);
|
PollyGPUFunction *polly_getKernel(const char *PTXBuffer,
|
||||||
void polly_getPTXKernelEntry(const char *KernelName, PollyGPUModule *Module,
|
const char *KernelName);
|
||||||
PollyGPUFunction **Kernel);
|
void polly_freeKernel(PollyGPUFunction *Kernel);
|
||||||
void polly_copyFromHostToDevice(void *HostData, PollyGPUDevicePtr *DevData,
|
void polly_copyFromHostToDevice(void *HostData, PollyGPUDevicePtr *DevData,
|
||||||
long MemSize);
|
long MemSize);
|
||||||
void polly_copyFromDeviceToHost(PollyGPUDevicePtr *DevData, void *HostData,
|
void polly_copyFromDeviceToHost(PollyGPUDevicePtr *DevData, void *HostData,
|
||||||
|
Loading…
x
Reference in New Issue
Block a user