diff --git a/opencl/architecture.md b/opencl/architecture.md index bf80156..9f5125f 100644 --- a/opencl/architecture.md +++ b/opencl/architecture.md @@ -83,12 +83,14 @@ Contains private memory, which no other work item can see. TODO: can a single work item be run in parallel on the GPU? -## Local and Private memory +## Local memory -TODO: why use those at all instead of global memory? +## Private memory -- - +- + +TODO: can private memory be slower than local memory? Might be faster, and global memory is limited. diff --git a/opencl/clinfo.c b/opencl/clinfo.c index 4b1b603..b1d176a 100644 --- a/opencl/clinfo.c +++ b/opencl/clinfo.c @@ -16,11 +16,16 @@ https://www.khronos.org/registry/OpenCL/sdk/1.0/docs/man/xhtml/clGetDeviceInfo.h clGetDeviceInfo(device, CL_ ## id, sizeof(cl_uint), &(buf_cl_uint), NULL); \ printf(#id " = %ju\n", (uintmax_t)buf_cl_uint); +#define PRINT_CL_ULONG(id) \ + clGetDeviceInfo(device, CL_ ## id, sizeof(cl_ulong), &(buf_cl_ulong), NULL); \ + printf(#id " = 0x%jx\n", (uintmax_t)buf_cl_ulong); + int main(void) { - cl_platform_id platform; cl_device_id device; - size_t buf_size_t; + cl_platform_id platform; cl_uint buf_cl_uint; + cl_ulong buf_cl_ulong; + size_t buf_size_t; /* Setup. */ clGetPlatformIDs(1, &platform, NULL); @@ -33,6 +38,8 @@ int main(void) { /* TODO this is wrong, it is actually an array. * But yeah, likely the same for all dimensions. */ PRINT_SIZE_T(DEVICE_MAX_WORK_ITEM_SIZES) + PRINT_CL_ULONG(DEVICE_LOCAL_MEM_SIZE) + PRINT_CL_UINT(DEVICE_MAX_COMPUTE_UNITS) /* Cleanup. */ #ifdef CL_1_2 diff --git a/opencl/matmul.c b/opencl/matmul.c index 033ce28..7ffce8f 100644 --- a/opencl/matmul.c +++ b/opencl/matmul.c @@ -1,6 +1,9 @@ /* Matrix multiplication. +Based on the amazing: +https://github.com/HandsOnOpenCL/Exercises-Solutions/tree/a908ac3f0fadede29f2735eb1264b0db7f4311a0/Solutions/Exercise08 + The most basic / useful application where OpenCL might be faster than CPU. TODO: make a SERIOUS matrix implementation. 
Also compare with existing SERIOUS CPU and GPU implementations: @@ -9,6 +12,7 @@ TODO: make a SERIOUS matrix implementation. Also compare with existing SERIOUS C - http://stackoverflow.com/questions/12289235/simple-and-fast-matrix-vector-multiplication-in-c-c - https://www.quora.com/What-is-the-best-way-to-multiply-two-matrices-in-C++ - http://www.netlib.org/utk/papers/autoblock/node2.html +- http://stackoverflow.com/questions/25900312/optimizing-batched-matrix-multiplication-opencl-code */ #include "common.h" @@ -31,7 +35,7 @@ void mat_mul_cpu(const F *A, const F *B, F *C, size_t n) { } } -/* Simplest possible implementation. */ +/* Simplest possible CL implementation. No speedup. */ void mat_mul_cl(const F *A, const F *B, F *C, size_t n) { cl_mem buf_a, buf_b, buf_c; Common common; @@ -66,6 +70,41 @@ void mat_mul_cl(const F *A, const F *B, F *C, size_t n) { common_deinit(&common); } +/* Cache rows in private memory. Drastic speedups expected over naive CPU. */ +void mat_mul_cl_row(const F *A, const F *B, F *C, size_t n) { + cl_mem buf_a, buf_b, buf_c; + Common common; + cl_uint ncl; + size_t global_work_size[2], mat_sizeof, n2; + + /* Setup variables. */ + global_work_size[0] = n; + global_work_size[1] = n; + n2 = n * n; + mat_sizeof = n2 * sizeof(F); + ncl = n; + + /* Run kernel. 
*/ + common_init_file(&common, "matmul_row.cl"); + buf_a = clCreateBuffer(common.context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, mat_sizeof, (F*)A, NULL); + buf_b = clCreateBuffer(common.context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, mat_sizeof, (F*)B, NULL); + buf_c = clCreateBuffer(common.context, CL_MEM_WRITE_ONLY, mat_sizeof, C, NULL); + clSetKernelArg(common.kernel, 0, sizeof(buf_a), &buf_a); + clSetKernelArg(common.kernel, 1, sizeof(buf_b), &buf_b); + clSetKernelArg(common.kernel, 2, sizeof(buf_c), &buf_c); + clSetKernelArg(common.kernel, 3, sizeof(ncl), &ncl); + clEnqueueNDRangeKernel(common.command_queue, common.kernel, 1, NULL, global_work_size, NULL, 0, NULL, NULL); + clFlush(common.command_queue); + clFinish(common.command_queue); + clEnqueueReadBuffer(common.command_queue, buf_c, CL_TRUE, 0, mat_sizeof, C, 0, NULL, NULL); + + /* Cleanup. */ + clReleaseMemObject(buf_a); + clReleaseMemObject(buf_b); + clReleaseMemObject(buf_c); + common_deinit(&common); +} + /* Check if two matrices are equal with given mean squared error. 
*/ int mat_eq(const F *A, const F *B, size_t n) { const F err_max = 10e-3; @@ -148,7 +187,7 @@ int main(void) { size_t n = 1, n2, a_sizeof; puts("#matmul"); - puts("n mat_mul_cpu mat_mul_cl"); + puts("n mat_mul_cpu mat_mul_cl mat_mul_cl_row"); while(1) { printf("%zu ", n); n2 = n * n; @@ -169,8 +208,12 @@ int main(void) { dt = common_get_nanos() - time; printf("%f ", dt); + /*time = common_get_nanos();*/ + /*mat_mul_cl(A, B, C, n);*/ + /*printf("%f ", common_get_nanos() - time);*/ + time = common_get_nanos(); - mat_mul_cl(A, B, C, n); + mat_mul_cl_row(A, B, C, n); printf("%f", common_get_nanos() - time); assert(mat_eq(C, C_ref, n)); diff --git a/opencl/matmul.cl b/opencl/matmul.cl index f3dda62..0b08996 100644 --- a/opencl/matmul.cl +++ b/opencl/matmul.cl @@ -4,9 +4,9 @@ __kernel void main( __global float *C, const uint N ) { - uint k; uint i = get_global_id(0); uint j = get_global_id(1); + uint k; float tmp; tmp = 0.0; diff --git a/opencl/matmul_row.cl b/opencl/matmul_row.cl new file mode 100644 index 0000000..0db3f6e --- /dev/null +++ b/opencl/matmul_row.cl @@ -0,0 +1,26 @@ +__kernel void main( + __global float* A, + __global float* B, + __global float* C, + const uint N +) +{ + uint i = get_global_id(0); + uint j, k; + /* TODO remove hardcoded constant? + * https://github.com/HandsOnOpenCL/Exercises-Solutions/issues/63 + * If I keep increasing, first: + * - asserts on CPU fail as in: http://stackoverflow.com/questions/22083507/is-there-a-maximum-limit-to-private-memory-in-opencl + * - CL complains at compile time that this is too big and aborts + **/ + float Ap[0x100000]; + float tmp; + for (k = 0; k < N; k++) + Ap[k] = A[i*N+k]; + for (j = 0; j < N; j++) { + tmp = 0.0; + for (k = 0; k < N; k++) + tmp += Ap[k] * B[k*N+j]; + C[i*N+j] = tmp; + } +}