matmul row private

This commit is contained in:
Ciro Santilli 2017-03-14 10:18:51 +00:00
parent 15b6cb6e38
commit 3c68f19000
5 changed files with 87 additions and 9 deletions

View File

@ -83,12 +83,14 @@ Contains private memory, which no other work item can see.
TODO: can a single work item be run in parallel on the GPU?
## Local and Private memory
## Local memory
TODO: why use those at all instead of global memory?
## Private memory
- <http://stackoverflow.com/questions/21872810/whats-the-advantage-of-the-local-memory-in-opencl>
- <http://stackoverflow.com/questions/9885880/effect-of-private-memory-in-opencl>
- <http://stackoverflow.com/questions/9885880/is-private-memory-slower-than-local-memory>
TODO: can private memory be slower than local memory? <http://stackoverflow.com/questions/21872810/whats-the-advantage-of-the-local-memory-in-opencl>
Might be faster, and global memory is limited.

View File

@ -16,11 +16,16 @@ https://www.khronos.org/registry/OpenCL/sdk/1.0/docs/man/xhtml/clGetDeviceInfo.h
clGetDeviceInfo(device, CL_ ## id, sizeof(cl_uint), &(buf_cl_uint), NULL); \
printf(#id " = %ju\n", (uintmax_t)buf_cl_uint);
/* Query a cl_ulong device property and print it in hex.
 * %jx matches the (uintmax_t) cast below; the previous %lx was undefined
 * behavior on platforms where unsigned long is not uintmax_t
 * (the sibling PRINT_CL_UINT macro already uses the matching %ju). */
#define PRINT_CL_ULONG(id) \
clGetDeviceInfo(device, CL_ ## id, sizeof(cl_ulong), &(buf_cl_ulong), NULL); \
printf(#id " = 0x%jx\n", (uintmax_t)buf_cl_ulong);
int main(void) {
cl_platform_id platform;
cl_device_id device;
size_t buf_size_t;
cl_platform_id platform;
cl_uint buf_cl_uint;
cl_ulong buf_cl_ulong;
size_t buf_size_t;
/* Setup. */
clGetPlatformIDs(1, &platform, NULL);
@ -33,6 +38,8 @@ int main(void) {
/* TODO this is wrong, it is actually an array.
* But yeah, likely the same for all dimensions. */
PRINT_SIZE_T(DEVICE_MAX_WORK_ITEM_SIZES)
PRINT_CL_ULONG(DEVICE_LOCAL_MEM_SIZE)
PRINT_CL_UINT(DEVICE_MAX_COMPUTE_UNITS)
/* Cleanup. */
#ifdef CL_1_2

View File

@ -1,6 +1,9 @@
/*
Matrix multiplication.
Based on the amazing:
https://github.com/HandsOnOpenCL/Exercises-Solutions/tree/a908ac3f0fadede29f2735eb1264b0db7f4311a0/Solutions/Exercise08
The most basic / useful application where OpenCL might be faster than CPU.
TODO: make a SERIOUS matrix implementation. Also compare with existing SERIOUS CPU and GPU implementations:
@ -9,6 +12,7 @@ TODO: make a SERIOUS matrix implementation. Also compare with existing SERIOUS C
- http://stackoverflow.com/questions/12289235/simple-and-fast-matrix-vector-multiplication-in-c-c
- https://www.quora.com/What-is-the-best-way-to-multiply-two-matrices-in-C++
- http://www.netlib.org/utk/papers/autoblock/node2.html
- http://stackoverflow.com/questions/25900312/optimizing-batched-matrix-multiplication-opencl-code
*/
#include "common.h"
@ -31,7 +35,7 @@ void mat_mul_cpu(const F *A, const F *B, F *C, size_t n) {
}
}
/* Simplest possible implementation. */
/* Simplest possible CL implementation. No speedup. */
void mat_mul_cl(const F *A, const F *B, F *C, size_t n) {
cl_mem buf_a, buf_b, buf_c;
Common common;
@ -66,6 +70,41 @@ void mat_mul_cl(const F *A, const F *B, F *C, size_t n) {
common_deinit(&common);
}
/* Cache rows in private memory. Drastic speedups expected over naive CPU. */
/* Cache rows in private memory. Drastic speedups expected over naive CPU.
 *
 * Multiplies the n x n row-major matrices A and B into C with the
 * "matmul_row.cl" kernel. That kernel reads only get_global_id(0)
 * (one work item computes one full output row), so a 1-D NDRange of n
 * work items is enqueued; the previous 2-element global_work_size was
 * misleading since work_dim is 1. */
void mat_mul_cl_row(const F *A, const F *B, F *C, size_t n) {
    cl_mem buf_a, buf_b, buf_c;
    Common common;
    cl_uint ncl;
    size_t global_work_size[1], mat_sizeof, n2;

    /* Setup variables. One work item per output row. */
    global_work_size[0] = n;
    n2 = n * n;
    mat_sizeof = n2 * sizeof(F);
    /* The kernel takes the matrix dimension as cl_uint, not size_t. */
    ncl = n;

    /* Run kernel. */
    common_init_file(&common, "matmul_row.cl");
    /* Casts drop const: CL_MEM_COPY_HOST_PTR only reads the host pointer,
     * but clCreateBuffer's host_ptr parameter is not const-qualified. */
    buf_a = clCreateBuffer(common.context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, mat_sizeof, (F*)A, NULL);
    buf_b = clCreateBuffer(common.context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, mat_sizeof, (F*)B, NULL);
    buf_c = clCreateBuffer(common.context, CL_MEM_WRITE_ONLY, mat_sizeof, C, NULL);
    clSetKernelArg(common.kernel, 0, sizeof(buf_a), &buf_a);
    clSetKernelArg(common.kernel, 1, sizeof(buf_b), &buf_b);
    clSetKernelArg(common.kernel, 2, sizeof(buf_c), &buf_c);
    clSetKernelArg(common.kernel, 3, sizeof(ncl), &ncl);
    clEnqueueNDRangeKernel(common.command_queue, common.kernel, 1, NULL, global_work_size, NULL, 0, NULL, NULL);
    /* clFinish implies a flush, so no separate clFlush is needed. */
    clFinish(common.command_queue);
    /* Blocking read (CL_TRUE): returns only once C is fully written. */
    clEnqueueReadBuffer(common.command_queue, buf_c, CL_TRUE, 0, mat_sizeof, C, 0, NULL, NULL);

    /* Cleanup. */
    clReleaseMemObject(buf_a);
    clReleaseMemObject(buf_b);
    clReleaseMemObject(buf_c);
    common_deinit(&common);
}
/* Check if two matrices are equal with given mean squared error. */
int mat_eq(const F *A, const F *B, size_t n) {
const F err_max = 10e-3;
@ -148,7 +187,7 @@ int main(void) {
size_t n = 1, n2, a_sizeof;
puts("#matmul");
puts("n mat_mul_cpu mat_mul_cl");
puts("n mat_mul_cpu mat_mul_cl mat_mul_cl_row");
while(1) {
printf("%zu ", n);
n2 = n * n;
@ -169,8 +208,12 @@ int main(void) {
dt = common_get_nanos() - time;
printf("%f ", dt);
/*time = common_get_nanos();*/
/*mat_mul_cl(A, B, C, n);*/
/*printf("%f ", common_get_nanos() - time);*/
time = common_get_nanos();
mat_mul_cl(A, B, C, n);
mat_mul_cl_row(A, B, C, n);
printf("%f", common_get_nanos() - time);
assert(mat_eq(C, C_ref, n));

View File

@ -4,9 +4,9 @@ __kernel void main(
__global float *C,
const uint N
) {
uint k;
uint i = get_global_id(0);
uint j = get_global_id(1);
uint k;
float tmp;
tmp = 0.0;

26
opencl/matmul_row.cl Normal file
View File

@ -0,0 +1,26 @@
/* One work item per output row i: row A[i][*] is first copied into the
 * private array Ap, then reused for every output column j, saving N
 * repeated global-memory reads of A per column.
 *
 * TODO remove hardcoded AP_N capacity (rows longer than AP_N overflow Ap):
 * https://github.com/HandsOnOpenCL/Exercises-Solutions/issues/63
 * If the constant keeps increasing, first:
 * - asserts on CPU fail as in: http://stackoverflow.com/questions/22083507/is-there-a-maximum-limit-to-private-memory-in-opencl
 * - CL complains at compile time that this is too big and aborts
 */
#define AP_N 0x100000
__kernel void main(
    __global float* A,
    __global float* B,
    __global float* C,
    const uint N
)
{
    uint i = get_global_id(0);
    uint j, k;
    float Ap[AP_N];
    float tmp;
    /* Cache this work item's row of A in private memory. */
    for (k = 0; k < N; k++)
        Ap[k] = A[i*N+k];
    /* C[i][j] = dot(row i of A, column j of B). */
    for (j = 0; j < N; j++) {
        tmp = 0.0;
        for (k = 0; k < N; k++)
            tmp += Ap[k] * B[k*N+j];
        C[i*N+j] = tmp;
    }
}