mirror of
https://github.com/libretro/cpp-cheat.git
synced 2025-04-03 03:12:24 +00:00
matmul row private
This commit is contained in:
parent
15b6cb6e38
commit
3c68f19000
@ -83,12 +83,14 @@ Contains private memory, which no other work item can see.
|
||||
|
||||
TODO: can a single work item be run in parallel on the GPU?
|
||||
|
||||
## Local and Private memory
|
||||
## Local memory
|
||||
|
||||
TODO: why use those at all instead of global memory?
|
||||
## Private memory
|
||||
|
||||
- <http://stackoverflow.com/questions/21872810/whats-the-advantage-of-the-local-memory-in-opencl>
|
||||
- <http://stackoverflow.com/questions/9885880/effect-of-private-memory-in-opencl>
|
||||
- <http://stackoverflow.com/questions/9885880/is-private-memory-slower-than-local-memory>
|
||||
|
||||
TODO: can private memory be slower than local memory? <http://stackoverflow.com/questions/21872810/whats-the-advantage-of-the-local-memory-in-opencl>
|
||||
|
||||
Might be faster, and global memory is limited.
|
||||
|
||||
|
@ -16,11 +16,16 @@ https://www.khronos.org/registry/OpenCL/sdk/1.0/docs/man/xhtml/clGetDeviceInfo.h
|
||||
clGetDeviceInfo(device, CL_ ## id, sizeof(cl_uint), &(buf_cl_uint), NULL); \
|
||||
printf(#id " = %ju\n", (uintmax_t)buf_cl_uint);
|
||||
|
||||
/* Query the cl_ulong device parameter CL_<id> and print it as "<id> = 0x<hex>".
 *
 * Expects `device` and `buf_cl_ulong` to be in scope at the expansion site.
 * The expansion already ends with a semicolon, so call sites use it bare.
 *
 * The value is cast to uintmax_t, so it must be printed with the `j` length
 * modifier: `%lx` only matches where unsigned long is as wide as uintmax_t. */
#define PRINT_CL_ULONG(id) \
    clGetDeviceInfo(device, CL_ ## id, sizeof(cl_ulong), &(buf_cl_ulong), NULL); \
    printf(#id " = 0x%jx\n", (uintmax_t)buf_cl_ulong);
|
||||
|
||||
int main(void) {
|
||||
cl_platform_id platform;
|
||||
cl_device_id device;
|
||||
size_t buf_size_t;
|
||||
cl_platform_id platform;
|
||||
cl_uint buf_cl_uint;
|
||||
cl_ulong buf_cl_ulong;
|
||||
size_t buf_size_t;
|
||||
|
||||
/* Setup. */
|
||||
clGetPlatformIDs(1, &platform, NULL);
|
||||
@ -33,6 +38,8 @@ int main(void) {
|
||||
/* TODO this is wrong, it is actually an array.
|
||||
* But yeah, likely the same for all dimensions. */
|
||||
PRINT_SIZE_T(DEVICE_MAX_WORK_ITEM_SIZES)
|
||||
PRINT_CL_ULONG(DEVICE_LOCAL_MEM_SIZE)
|
||||
PRINT_CL_UINT(DEVICE_MAX_COMPUTE_UNITS)
|
||||
|
||||
/* Cleanup. */
|
||||
#ifdef CL_1_2
|
||||
|
@ -1,6 +1,9 @@
|
||||
/*
|
||||
Matrix multiplication.
|
||||
|
||||
Based on the amazing:
|
||||
https://github.com/HandsOnOpenCL/Exercises-Solutions/tree/a908ac3f0fadede29f2735eb1264b0db7f4311a0/Solutions/Exercise08
|
||||
|
||||
The most basic / useful application where OpenCL might be faster than CPU.
|
||||
|
||||
TODO: make a SERIOUS matrix implementation. Also compare with existing SERIOUS CPU and GPU implementations:
|
||||
@ -9,6 +12,7 @@ TODO: make a SERIOUS matrix implementation. Also compare with existing SERIOUS C
|
||||
- http://stackoverflow.com/questions/12289235/simple-and-fast-matrix-vector-multiplication-in-c-c
|
||||
- https://www.quora.com/What-is-the-best-way-to-multiply-two-matrices-in-C++
|
||||
- http://www.netlib.org/utk/papers/autoblock/node2.html
|
||||
- http://stackoverflow.com/questions/25900312/optimizing-batched-matrix-multiplication-opencl-code
|
||||
*/
|
||||
|
||||
#include "common.h"
|
||||
@ -31,7 +35,7 @@ void mat_mul_cpu(const F *A, const F *B, F *C, size_t n) {
|
||||
}
|
||||
}
|
||||
|
||||
/* Simplest possible implementation. */
|
||||
/* Simplest possible CL implementation. No speedup. */
|
||||
void mat_mul_cl(const F *A, const F *B, F *C, size_t n) {
|
||||
cl_mem buf_a, buf_b, buf_c;
|
||||
Common common;
|
||||
@ -66,6 +70,41 @@ void mat_mul_cl(const F *A, const F *B, F *C, size_t n) {
|
||||
common_deinit(&common);
|
||||
}
|
||||
|
||||
/* Cache rows in private memory. Drastic speedups expected over naive CPU. */
|
||||
void mat_mul_cl_row(const F *A, const F *B, F *C, size_t n) {
|
||||
cl_mem buf_a, buf_b, buf_c;
|
||||
Common common;
|
||||
cl_uint ncl;
|
||||
size_t global_work_size[2], mat_sizeof, n2;
|
||||
|
||||
/* Setup variables. */
|
||||
global_work_size[0] = n;
|
||||
global_work_size[1] = n;
|
||||
n2 = n * n;
|
||||
mat_sizeof = n2 * sizeof(F);
|
||||
ncl = n;
|
||||
|
||||
/* Run kernel. */
|
||||
common_init_file(&common, "matmul_row.cl");
|
||||
buf_a = clCreateBuffer(common.context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, mat_sizeof, (F*)A, NULL);
|
||||
buf_b = clCreateBuffer(common.context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, mat_sizeof, (F*)B, NULL);
|
||||
buf_c = clCreateBuffer(common.context, CL_MEM_WRITE_ONLY, mat_sizeof, C, NULL);
|
||||
clSetKernelArg(common.kernel, 0, sizeof(buf_a), &buf_a);
|
||||
clSetKernelArg(common.kernel, 1, sizeof(buf_b), &buf_b);
|
||||
clSetKernelArg(common.kernel, 2, sizeof(buf_c), &buf_c);
|
||||
clSetKernelArg(common.kernel, 3, sizeof(ncl), &ncl);
|
||||
clEnqueueNDRangeKernel(common.command_queue, common.kernel, 1, NULL, global_work_size, NULL, 0, NULL, NULL);
|
||||
clFlush(common.command_queue);
|
||||
clFinish(common.command_queue);
|
||||
clEnqueueReadBuffer(common.command_queue, buf_c, CL_TRUE, 0, mat_sizeof, C, 0, NULL, NULL);
|
||||
|
||||
/* Cleanup. */
|
||||
clReleaseMemObject(buf_a);
|
||||
clReleaseMemObject(buf_b);
|
||||
clReleaseMemObject(buf_c);
|
||||
common_deinit(&common);
|
||||
}
|
||||
|
||||
/* Check if two matrices are equal with given mean squared error. */
|
||||
int mat_eq(const F *A, const F *B, size_t n) {
|
||||
const F err_max = 10e-3;
|
||||
@ -148,7 +187,7 @@ int main(void) {
|
||||
size_t n = 1, n2, a_sizeof;
|
||||
|
||||
puts("#matmul");
|
||||
puts("n mat_mul_cpu mat_mul_cl");
|
||||
puts("n mat_mul_cpu mat_mul_cl mat_mul_cl_row");
|
||||
while(1) {
|
||||
printf("%zu ", n);
|
||||
n2 = n * n;
|
||||
@ -169,8 +208,12 @@ int main(void) {
|
||||
dt = common_get_nanos() - time;
|
||||
printf("%f ", dt);
|
||||
|
||||
/*time = common_get_nanos();*/
|
||||
/*mat_mul_cl(A, B, C, n);*/
|
||||
/*printf("%f ", common_get_nanos() - time);*/
|
||||
|
||||
time = common_get_nanos();
|
||||
mat_mul_cl(A, B, C, n);
|
||||
mat_mul_cl_row(A, B, C, n);
|
||||
printf("%f", common_get_nanos() - time);
|
||||
|
||||
assert(mat_eq(C, C_ref, n));
|
||||
|
@ -4,9 +4,9 @@ __kernel void main(
|
||||
__global float *C,
|
||||
const uint N
|
||||
) {
|
||||
uint k;
|
||||
uint i = get_global_id(0);
|
||||
uint j = get_global_id(1);
|
||||
uint k;
|
||||
float tmp;
|
||||
|
||||
tmp = 0.0;
|
||||
|
26
opencl/matmul_row.cl
Normal file
26
opencl/matmul_row.cl
Normal file
@ -0,0 +1,26 @@
|
||||
__kernel void main(
|
||||
__global float* A,
|
||||
__global float* B,
|
||||
__global float* C,
|
||||
const uint N
|
||||
)
|
||||
{
|
||||
uint i = get_global_id(0);
|
||||
uint j, k;
|
||||
/* TODO remove hardcoded constant?
|
||||
* https://github.com/HandsOnOpenCL/Exercises-Solutions/issues/63
|
||||
* If I keep increasing, first:
|
||||
* - asserts on CPU fail as in: http://stackoverflow.com/questions/22083507/is-there-a-maximum-limit-to-private-memory-in-opencl
|
||||
* - CL complaing at compile time that this is too big and aborts
|
||||
**/
|
||||
float Ap[0x100000];
|
||||
float tmp;
|
||||
for (k = 0; k < N; k++)
|
||||
Ap[k] = A[i*N+k];
|
||||
for (j = 0; j < N; j++) {
|
||||
tmp = 0.0;
|
||||
for (k = 0; k < N; k++)
|
||||
tmp += Ap[k] * B[k*N+j];
|
||||
C[i*N+j] = tmp;
|
||||
}
|
||||
}
|
Loading…
x
Reference in New Issue
Block a user