/* Matrix multiplication. Based on the amazing: https://github.com/HandsOnOpenCL/Exercises-Solutions/tree/a908ac3f0fadede29f2735eb1264b0db7f4311a0/Solutions/Exercise08 The most basic / useful application where OpenCL might be faster than CPU. TODO: make a SERIOUS matrix implementation. Also compare with existing SERIOUS CPU and GPU implementations: - http://stackoverflow.com/questions/1907557/optimized-matrix-multiplication-in-c - http://stackoverflow.com/questions/12289235/simple-and-fast-matrix-vector-multiplication-in-c-c - https://www.quora.com/What-is-the-best-way-to-multiply-two-matrices-in-C++ - http://stackoverflow.com/questions/25900312/optimizing-batched-matrix-multiplication-opencl-code Serious CPU implementation means it considers: - caching - SIMD Articles: - http://www.netlib.org/utk/papers/autoblock/node2.html - http://codereview.stackexchange.com/questions/101144/simd-matrix-multiplication */ #include "common.h" #include #include #define NELEMS(a) sizeof(a)/sizeof(a[0]) #define UNUSED(x) (void)(x) /* TODO how to auto tune this? * GCC 6 was not smart enough to use widest type automatically: * anything larger than the widest type just dropped perf drastically. */ #define VECTOR_NELEMS 4 #define VECTOR_SIZEOF (VECTOR_NELEMS * sizeof(F)) typedef cl_float F; typedef F Vec __attribute__ ((vector_size(VECTOR_SIZEOF))); /* Stuff reused across OpenCL calls. * It would not be fair to consider their alocation time for each call, * as the functions will be called millions of times. * * TODO: also factor out the kernel compile time. * Currently recompiled every time. * Implementations already cache compiled kernels, but we would need * to call each function multiple times. * */ typedef struct { Common common; /* We only factor out the allocation of those buffers. * Data transfer time with clEnqueueWriteBuffer must still be considered. */ cl_mem buf_a, buf_b, buf_c; } Cache; typedef void (*MatMul)(const F *A, const F *B, F *C, size_t n, Cache *cache); void cl_buf_init(Cache *cache, size_t mat_sizeof) { cache->buf_a = clCreateBuffer(cache->common.context, CL_MEM_READ_ONLY, mat_sizeof, NULL, NULL); cache->buf_b = clCreateBuffer(cache->common.context, CL_MEM_READ_ONLY, mat_sizeof, NULL, NULL); cache->buf_c = clCreateBuffer(cache->common.context, CL_MEM_WRITE_ONLY, mat_sizeof, NULL, NULL); } void cl_buf_deinit(Cache *cache) { clReleaseMemObject(cache->buf_a); clReleaseMemObject(cache->buf_b); clReleaseMemObject(cache->buf_c); } /* No, this was not created for debugging, my code is flawless from the first try. */ void mat_print(const F *A, size_t n) { size_t i, j; for (i = 0; i < n; ++i) { for (j = 0; j < n; ++j) { printf("%f ", A[i*n+j]); } puts(""); } } /* Zero a matrix. */ void mat_zero(F *A, size_t n) { size_t i, n2; n2 = n*n; for (i = 0; i < n2; ++i) { A[i] = 0.0; } } /* Initialize a random matrix. */ void mat_rand(F *A, size_t n) { size_t i, n2; n2 = n*n; for (i = 0; i < n2; ++i) { A[i] = ((float)rand()) / ((float)RAND_MAX); } } /* Initialize a random matrix. */ void mat_trans(F *A, size_t n) { size_t i, j, i1, i2; F tmp; for (i = 0; i < n; ++i) { for (j = 0; j < i; ++j) { i1 = i*n+j; i2 = j*n+i; tmp = A[i1]; A[i1] = A[i2]; A[i2] = tmp; } } } /* Check if two matrices are equal with given mean squared err_maxor. */ int mat_eq(const F *A, const F *B, size_t n) { const F err_max = 10e-3; F err, diff, a, b; size_t i, i_max; err = 0.0; i_max = n*n; for (i = 0; i < i_max; ++i) { a = A[i]; b = B[i]; diff = a - b; err += diff * diff; } return (sqrt(err) / i_max) < err_max; } void mat_assert_eq(const F *A, const F *B, size_t n) { if (!mat_eq(A, B, n)) { puts(""); mat_print(A, n); puts(""); mat_print(B, n); assert(0); } } size_t zmin(size_t x, size_t y) { return (x < y) ? x : y; } /* C = A*B, width n, naive. */ void mat_mul_cpu(const F *A, const F *B, F *C, size_t n, Cache *cache) { F tmp; size_t i, j, k; UNUSED(cache); for (i = 0; i < n; ++i) { for (j = 0; j < n; ++j) { tmp = 0.0; for (k = 0; k < n; ++k) { tmp += A[i*n+k] * B[k*n+j]; } C[i*n+j] = tmp; } } } /* Transpose matrix B to increase cache hits. */ void mat_mul_cpu_trans(const F *A, const F *B, F *C, size_t n, Cache *cache) { F tmp; size_t i, j, k; UNUSED(cache); mat_trans((F*)B, n); for (i = 0; i < n; ++i) { for (j = 0; j < n; ++j) { tmp = 0.0; for (k = 0; k < n; ++k) { tmp += A[i*n+k] * B[j*n+k]; } C[i*n+j] = tmp; } } mat_trans((F*)B, n); } /* Zero vector. */ void vec_zero(Vec *vec, size_t vec_n) { size_t i; for (i = 0; i < vec_n; ++i) { (*vec)[i] = 0.0; } } /* Load vector from array. */ void vec_load(Vec *vec, size_t vec_n, const F* A, size_t A_base) { size_t i; for (i = 0; i < vec_n; ++i) { (*vec)[i] = A[A_base + i]; } } /* Sum elements of the vector. */ F vec_sum(Vec vec, size_t vec_n) { size_t i; F sum; sum = 0.0; for (i = 0; i < vec_n; ++i) { sum += vec[i]; } return sum; } /* Transpose matrix B to both: * * - increase cache hits * - simd GCC vector extensions which is made possible. * by the transposition, to increase likelyhood of SIMDs. * * Note that GCC 6 O=3 is smart enough to use SIMD * even for the naive CPU method. However this was still way faster. * */ void mat_mul_cpu_trans_vec(const F *A, const F *B, F *C, size_t n, Cache *cache) { F tmpf; size_t i, j, k, k_max, ai, bi; Vec tmp, a, b; UNUSED(cache); mat_trans((F*)B, n); k_max = (n / VECTOR_NELEMS) * VECTOR_NELEMS; for (i = 0; i < n; ++i) { for (j = 0; j < n; ++j) { vec_zero(&tmp, VECTOR_NELEMS); for (k = 0; k < k_max; k += VECTOR_NELEMS) { ai = i * n + k; bi = j * n + k; vec_load(&a, VECTOR_NELEMS, A, ai); vec_load(&b, VECTOR_NELEMS, B, bi); tmp += a * b; } tmpf = 0.0; for (; k < n; ++k) { tmpf += A[i*n+k] * B[j*n+k]; } C[i*n+j] = vec_sum(tmp, VECTOR_NELEMS) + tmpf; } } mat_trans((F*)B, n); } /* Blocked matrix multiplication. * TODO slower than transpose, sometimes similar timing to naive. * Why do they say that this is better? * */ void mat_mul_cpu_block(const F *A, const F *B, F *C, size_t n, Cache *cache) { size_t ib, jb, kb, i, j, k, i_max, j_max, k_max, nb ; F tmp; UNUSED(cache); nb = lround(sqrt(n)); for (ib = 0; ib < n; ib += nb) { i_max = zmin(ib + nb, n); for (jb = 0; jb < n; jb += nb) { k_max = zmin(jb + nb, n); for (kb = 0; kb < n; kb += nb) { j_max = zmin(kb + nb, n); /* TODO would be cool to recurse here, but would require more offset parameters, * likely similar to tose of CBLAS. */ for (i = ib; i < i_max; ++i) { for (j = kb; j < j_max; ++j) { tmp = 0.0; for (k = jb; k < k_max; ++k) { tmp += A[i*n+k] * B[k*n+j]; } C[i*n+j] += tmp; } } } } } } /* The golden single thread CPU standard. */ void mat_mul_cpu_cblas(const F *A, const F *B, F *C, size_t n, Cache *cache) { UNUSED(cache); cblas_sgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, n, n, n, 1.0, A, n, B, n, 0.0, C, n ); } /* Simplest possible CL implementation. No speedup. */ void mat_mul_cl(const F *A, const F *B, F *C, size_t n, Cache *cache) { cl_uint ncl; size_t global_work_size[2], mat_sizeof; /* Setup variables. */ global_work_size[0] = n; global_work_size[1] = n; mat_sizeof = n * n * sizeof(F); ncl = n; /* Run kernel. */ common_create_kernel_file(&cache->common, "matmul.cl", NULL); clEnqueueWriteBuffer(cache->common.command_queue, cache->buf_a, CL_TRUE, 0, mat_sizeof, (F*)A, 0, NULL, NULL); clEnqueueWriteBuffer(cache->common.command_queue, cache->buf_b, CL_TRUE, 0, mat_sizeof, (F*)B, 0, NULL, NULL); clSetKernelArg(cache->common.kernel, 0, sizeof(cache->buf_a), &cache->buf_a); clSetKernelArg(cache->common.kernel, 1, sizeof(cache->buf_b), &cache->buf_b); clSetKernelArg(cache->common.kernel, 2, sizeof(cache->buf_c), &cache->buf_c); clSetKernelArg(cache->common.kernel, 3, sizeof(ncl), &ncl); clEnqueueNDRangeKernel(cache->common.command_queue, cache->common.kernel, 2, NULL, global_work_size, NULL, 0, NULL, NULL); clFlush(cache->common.command_queue); clFinish(cache->common.command_queue); clEnqueueReadBuffer(cache->common.command_queue, cache->buf_c, CL_TRUE, 0, mat_sizeof, C, 0, NULL, NULL); } /* Cache rows in private memory. Drastic speedups expected over naive CPU. */ void mat_mul_cl_row_priv(const F *A, const F *B, F *C, size_t n, Cache *cache) { char options[256]; cl_uint ncl; size_t global_work_size, mat_sizeof; /* Setup variables. */ global_work_size = n; mat_sizeof = n * n * sizeof(F); ncl = n; /* Run kernel. */ snprintf(options, sizeof(options), "-DPRIV_ROW_SIZE=%ju", n); common_create_kernel_file(&cache->common, "matmul_row_priv.cl", options); clEnqueueWriteBuffer(cache->common.command_queue, cache->buf_a, CL_TRUE, 0, mat_sizeof, (F*)A, 0, NULL, NULL); clEnqueueWriteBuffer(cache->common.command_queue, cache->buf_b, CL_TRUE, 0, mat_sizeof, (F*)B, 0, NULL, NULL); clSetKernelArg(cache->common.kernel, 0, sizeof(cache->buf_a), &cache->buf_a); clSetKernelArg(cache->common.kernel, 1, sizeof(cache->buf_b), &cache->buf_b); clSetKernelArg(cache->common.kernel, 2, sizeof(cache->buf_c), &cache->buf_c); clSetKernelArg(cache->common.kernel, 3, sizeof(ncl), &ncl); clEnqueueNDRangeKernel(cache->common.command_queue, cache->common.kernel, 1, NULL, &global_work_size, NULL, 0, NULL, NULL); clFlush(cache->common.command_queue); clFinish(cache->common.command_queue); clEnqueueReadBuffer(cache->common.command_queue, cache->buf_c, CL_TRUE, 0, mat_sizeof, C, 0, NULL, NULL); } /* Let's see if this is any different from local memory. * Outcome: much slower than private memory, slower than naive method. */ void mat_mul_cl_row_local(const F *A, const F *B, F *C, size_t n, Cache *cache) { cl_uint ncl; size_t global_work_size, local_work_size, mat_sizeof; /* Setup variables. */ /* Cannot be larger than 1 on this example, otherwise memory conflicts * will happen between work items. */ local_work_size = 1; global_work_size = n; mat_sizeof = n * n * sizeof(F); ncl = n; /* Run kernel. */ common_create_kernel_file(&cache->common, "matmul_row_local.cl", NULL); clEnqueueWriteBuffer(cache->common.command_queue, cache->buf_a, CL_TRUE, 0, mat_sizeof, (F*)A, 0, NULL, NULL); clEnqueueWriteBuffer(cache->common.command_queue, cache->buf_b, CL_TRUE, 0, mat_sizeof, (F*)B, 0, NULL, NULL); clSetKernelArg(cache->common.kernel, 0, sizeof(cache->buf_a), &cache->buf_a); clSetKernelArg(cache->common.kernel, 1, sizeof(cache->buf_b), &cache->buf_b); clSetKernelArg(cache->common.kernel, 2, sizeof(cache->buf_c), &cache->buf_c); clSetKernelArg(cache->common.kernel, 3, n * sizeof(F), NULL); clSetKernelArg(cache->common.kernel, 4, sizeof(ncl), &ncl); clEnqueueNDRangeKernel(cache->common.command_queue, cache->common.kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL); clFlush(cache->common.command_queue); clFinish(cache->common.command_queue); clEnqueueReadBuffer(cache->common.command_queue, cache->buf_c, CL_TRUE, 0, mat_sizeof, C, 0, NULL, NULL); } /* Like row private, but to reduce global memory accesses, * we copy only once per work group to local memory. * * Each work group contains a few rows of A. * * We load the first column, multiply all rows by that column, synrhconize, * load another column, and so on. * * This leads to a thread blockage / memory access tradeoff. * * We make work groups as large as possible to reload memory less times. */ void mat_mul_cl_row_priv_col_local(const F *A, const F *B, F *C, size_t n, Cache *cache) { char options[256]; cl_uint ncl; size_t global_work_size, local_work_size, mat_sizeof; /* Setup variables. */ global_work_size = n; mat_sizeof = n * n * sizeof(F); ncl = n; /* Run kernel. */ snprintf(options, sizeof(options), "-DPRIV_ROW_SIZE=%ju", n); common_create_kernel_file(&cache->common, "matmul_row_priv_col_local.cl", options); local_work_size = 0; clGetDeviceInfo(cache->common.device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(local_work_size), &local_work_size, NULL); local_work_size = zmin(local_work_size, n); clEnqueueWriteBuffer(cache->common.command_queue, cache->buf_a, CL_TRUE, 0, mat_sizeof, (F*)A, 0, NULL, NULL); clEnqueueWriteBuffer(cache->common.command_queue, cache->buf_b, CL_TRUE, 0, mat_sizeof, (F*)B, 0, NULL, NULL); clSetKernelArg(cache->common.kernel, 0, sizeof(cache->buf_a), &cache->buf_a); clSetKernelArg(cache->common.kernel, 1, sizeof(cache->buf_b), &cache->buf_b); clSetKernelArg(cache->common.kernel, 2, sizeof(cache->buf_c), &cache->buf_c); clSetKernelArg(cache->common.kernel, 3, n * sizeof(F), NULL); clSetKernelArg(cache->common.kernel, 4, sizeof(ncl), &ncl); clEnqueueNDRangeKernel(cache->common.command_queue, cache->common.kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL); clFlush(cache->common.command_queue); clFinish(cache->common.command_queue); clEnqueueReadBuffer(cache->common.command_queue, cache->buf_c, CL_TRUE, 0, mat_sizeof, C, 0, NULL, NULL); } /* Copy as many cols from B as possibl to the local memory, only then start multiplying. * This leads to less memory barrier hits. * How many rows we copy is limited by the local memory size, ideally the entire matrix will fit. */ void mat_mul_cl_row_priv_cols_local(const F *A, const F *B, F *C, size_t n, Cache *cache) { char options[256]; cl_uint ncl, n_local_cols; cl_ulong local_mem_size; size_t col_size, global_work_size, local_work_size, mat_sizeof; /* Setup variables. */ col_size = n * sizeof(F); global_work_size = n; mat_sizeof = n * n * sizeof(F); ncl = n; /* Run kernel. */ snprintf(options, sizeof(options), "-DPRIV_ROW_SIZE=%ju", n); common_create_kernel_file(&cache->common, "matmul_row_priv_cols_local.cl", options); local_work_size = 0; clGetDeviceInfo(cache->common.device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(local_work_size), &local_work_size, NULL); local_work_size = zmin(local_work_size, n); local_mem_size = 0; clGetDeviceInfo(cache->common.device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(local_mem_size), &local_mem_size, NULL); /* TODO: can blow up without that - 1. Why? * It only reaches the max without it, not crosses, right? * So bug in the kernel? */ n_local_cols = zmin(local_mem_size / col_size, n) - 1; /*puts("");*/ /*printf("max memory %llu\n", (unsigned long long)local_mem_size);*/ /*printf("n_local_cols %llu\n", (unsigned long long)n_local_cols);*/ /*printf("memory %llu\n", (unsigned long long)n_local_cols * n * sizeof(F));*/ clEnqueueWriteBuffer(cache->common.command_queue, cache->buf_a, CL_TRUE, 0, mat_sizeof, (F*)A, 0, NULL, NULL); clEnqueueWriteBuffer(cache->common.command_queue, cache->buf_b, CL_TRUE, 0, mat_sizeof, (F*)B, 0, NULL, NULL); clSetKernelArg(cache->common.kernel, 0, sizeof(cache->buf_a), &cache->buf_a); clSetKernelArg(cache->common.kernel, 1, sizeof(cache->buf_b), &cache->buf_b); clSetKernelArg(cache->common.kernel, 2, sizeof(cache->buf_c), &cache->buf_c); clSetKernelArg(cache->common.kernel, 3, n_local_cols * col_size, NULL); clSetKernelArg(cache->common.kernel, 4, sizeof(ncl), &ncl); clSetKernelArg(cache->common.kernel, 5, sizeof(n_local_cols), &n_local_cols); clEnqueueNDRangeKernel(cache->common.command_queue, cache->common.kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL); clFlush(cache->common.command_queue); clFinish(cache->common.command_queue); clEnqueueReadBuffer(cache->common.command_queue, cache->buf_c, CL_TRUE, 0, mat_sizeof, C, 0, NULL, NULL); } void mat_mul_cl_block(const F *A, const F *B, F *C, size_t n, Cache *cache) { cl_uint ncl, nblkcl; size_t global_work_size[2], local_work_size[2], mat_sizeof, nblk; /* Setup variables. */ global_work_size[0] = n; global_work_size[1] = n; mat_sizeof = n * n * sizeof(F); ncl = n; /* Run kernel. */ common_create_kernel_file(&cache->common, "matmul_block.cl", NULL); clGetDeviceInfo(cache->common.device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(nblk), &nblk, NULL); nblk = sqrt(zmin(nblk, n)); nblk = zmin(nblk, 3); nblkcl = nblk; local_work_size[0] = nblk; local_work_size[1] = nblk; clEnqueueWriteBuffer(cache->common.command_queue, cache->buf_a, CL_TRUE, 0, mat_sizeof, (F*)A, 0, NULL, NULL); clEnqueueWriteBuffer(cache->common.command_queue, cache->buf_b, CL_TRUE, 0, mat_sizeof, (F*)B, 0, NULL, NULL); clSetKernelArg(cache->common.kernel, 0, sizeof(cache->buf_a), &cache->buf_a); clSetKernelArg(cache->common.kernel, 1, sizeof(cache->buf_b), &cache->buf_b); clSetKernelArg(cache->common.kernel, 2, sizeof(cache->buf_c), &cache->buf_c); clSetKernelArg(cache->common.kernel, 3, nblk * nblk * sizeof(F), NULL); printf("nblk = %llu\n", (unsigned long long)nblk); printf("local memory = %llu\n", (unsigned long long)2 * nblk * nblk * sizeof(F)); clSetKernelArg(cache->common.kernel, 4, nblk * nblk * sizeof(F), NULL); clSetKernelArg(cache->common.kernel, 5, sizeof(ncl), &ncl); clSetKernelArg(cache->common.kernel, 6, sizeof(nblkcl), &nblkcl); clEnqueueNDRangeKernel( cache->common.command_queue, cache->common.kernel, 2, NULL, global_work_size, local_work_size, 0, NULL, NULL ); clFlush(cache->common.command_queue); clFinish(cache->common.command_queue); clEnqueueReadBuffer(cache->common.command_queue, cache->buf_c, CL_TRUE, 0, mat_sizeof, C, 0, NULL, NULL); } void mat_mul_cl_clblas(const F *A, const F *B, F *C, size_t n, Cache *cache) { cl_event event; size_t mat_sizeof; mat_sizeof = n * n * sizeof(F); clEnqueueWriteBuffer(cache->common.command_queue, cache->buf_a, CL_TRUE, 0, mat_sizeof, (F*)A, 0, NULL, NULL); clEnqueueWriteBuffer(cache->common.command_queue, cache->buf_b, CL_TRUE, 0, mat_sizeof, (F*)B, 0, NULL, NULL); clblasSgemm( clblasRowMajor, clblasNoTrans, clblasNoTrans, n, n, n, 1.0, cache->buf_a, 0, n, cache->buf_b, 0, n, 0.0, cache->buf_c, 0, n, 1, &(cache->common.command_queue), 0, NULL, &event ); clWaitForEvents(1, &event); clEnqueueReadBuffer(cache->common.command_queue, cache->buf_c, CL_TRUE, 0, mat_sizeof, C, 0, NULL, NULL); } double bench(MatMul f, const F *A, const F *B, F *C, F *C_ref, size_t n, Cache *cache) { double dt, time; mat_zero(C, n); time = common_get_nanos(); f(A, B, C, n, cache); dt = common_get_nanos() - time; if (C_ref != NULL) mat_assert_eq(C, C_ref, n); printf("%.3e ", dt); fflush(stdout); return dt; } int main(int argc, char **argv) { srand(time(NULL)); Cache cache; double max_runtime; /* Overly slow ones commented out by default. */ MatMul mat_mul_funcs[] = { /*mat_mul_cpu,*/ mat_mul_cpu_trans, mat_mul_cpu_trans_vec, mat_mul_cpu_block, mat_mul_cpu_cblas, /*mat_mul_cl,*/ mat_mul_cl_row_priv, mat_mul_cl_row_local, mat_mul_cl_row_priv_col_local, mat_mul_cl_row_priv_cols_local, /* TODO broken for larger matrics, some cells contain trash. * Likey some memory overflow problem. */ /*mat_mul_cl_block,*/ mat_mul_cl_clblas, }; int first, func_done[NELEMS(mat_mul_funcs)] = {0}; size_t f, i; size_t mat_sizeof; /* CLI args. */ if (argc > 1) { max_runtime = strtod(argv[1], NULL); } else { max_runtime = 1.0; } common_init(&(cache.common), NULL); /* Unit test 2x2. */ { const F A[] = { 1.0, 2.0, 3.0, 4.0 }; const F B[] = { 5.0, 6.0, 7.0, 8.0 }; enum N { n = 2 }; F C[n*n]; const F C_ref[] = { 19.0, 22.0, 43.0, 50.0 }; cl_buf_init(&cache, n * n * sizeof(F)); for (f = 0; f < sizeof(mat_mul_funcs)/sizeof(mat_mul_funcs[0]); ++f) { mat_zero(C, n); mat_mul_funcs[f](A, B, C, n, &cache); mat_assert_eq(C, C_ref, n); } cl_buf_deinit(&cache); } /* Unit test 4x4. */ { const F A[] = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, }; const F B[] = { 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0, }; const F C_ref[] = { 250.0, 260.0, 270.0, 280.0, 618.0, 644.0, 670.0, 696.0, 986.0, 1028.0, 1070.0, 1112.0, 1354.0, 1412.0, 1470.0, 1528.0, }; enum N { n = 4 }; F C[n*n]; cl_buf_init(&cache, n * n * sizeof(F)); for (f = 0; f < NELEMS(mat_mul_funcs); ++f) { mat_zero(C, n); mat_mul_funcs[f](A, B, C, n, &cache); mat_assert_eq(C, C_ref, n); } cl_buf_deinit(&cache); } /* Benchmarks. */ { double dt; F *A = NULL, *B = NULL, *C = NULL, *C_ref = NULL, *dst = NULL, *ref = NULL; int done; size_t n = 2; puts("#matmul"); done = 0; while(1) { printf("%zu ", (size_t)log2(n)); mat_sizeof = n * n * sizeof(F); /* CPU setup. */ A = aligned_alloc(VECTOR_SIZEOF, mat_sizeof); B = aligned_alloc(VECTOR_SIZEOF, mat_sizeof); C = aligned_alloc(VECTOR_SIZEOF, mat_sizeof); C_ref = aligned_alloc(VECTOR_SIZEOF, mat_sizeof); if (NULL == A || NULL == B || NULL == C) { printf("error: could not allocate memory for n = %zu", n); break; } mat_rand(A, n); mat_rand(B, n); cl_buf_init(&cache, mat_sizeof); first = 1; for (f = 0; f < NELEMS(mat_mul_funcs); ++f) { if (func_done[f]) { printf("%*s", 10, ""); } else { if (first) { dst = C_ref; ref = NULL; first = 0; } else { dst = C; ref = C_ref; } dt = bench(mat_mul_funcs[f], A, B, dst, ref, n, &cache); if (dt > max_runtime) func_done[f] = 1; } } puts(""); done = 1; for (i = 0; i < NELEMS(mat_mul_funcs); ++i) { if (!func_done[i]) { done = 0; break; } } if (done) break; n *= 2; /* CPU deinit. */ free(A); free(B); free(C); free(C_ref); cl_buf_deinit(&cache); } common_deinit(&cache.common); } return EXIT_SUCCESS; }