/*
Matrix multiplication.

Based on the amazing:
https://github.com/HandsOnOpenCL/Exercises-Solutions/tree/a908ac3f0fadede29f2735eb1264b0db7f4311a0/Solutions/Exercise08

The most basic / useful application where OpenCL might be faster than CPU.

TODO: make a SERIOUS matrix implementation. Also compare with existing SERIOUS CPU and GPU implementations:

- http://stackoverflow.com/questions/1907557/optimized-matrix-multiplication-in-c
- http://stackoverflow.com/questions/12289235/simple-and-fast-matrix-vector-multiplication-in-c-c
- https://www.quora.com/What-is-the-best-way-to-multiply-two-matrices-in-C++
- http://stackoverflow.com/questions/25900312/optimizing-batched-matrix-multiplication-opencl-code

Serious CPU implementation means it considers:

- caching
- SIMD

Articles:

- http://www.netlib.org/utk/papers/autoblock/node2.html
- http://codereview.stackexchange.com/questions/101144/simd-matrix-multiplication
*/

#include "common.h"

#include <cblas.h>
#include <clBLAS.h>

#define NELEMS(a) sizeof(a)/sizeof(a[0])
#define UNUSED(x) (void)(x)

/* TODO how to auto tune this?
 * GCC 6 was not smart enough to use widest type automatically:
 * anything larger than the widest type just dropped perf drastically. */
#define VECTOR_NELEMS 4
#define VECTOR_SIZEOF (VECTOR_NELEMS * sizeof(F))

typedef cl_float F;
typedef F Vec __attribute__ ((vector_size(VECTOR_SIZEOF)));
/* Stuff reused across OpenCL calls.
 * It would not be fair to consider their alocation time for each call,
 * as the functions will be called millions of times.
 *
 * TODO: also factor out the kernel compile time.
 * Currently recompiled every time.
 * Implementations already cache compiled kernels, but we would need
 * to call each function multiple times.
 * */
typedef struct {
    Common common;
    /* We only factor out the allocation of those buffers.
     * Data transfer time with clEnqueueWriteBuffer must still be considered. */
    cl_mem buf_a, buf_b, buf_c;
} Cache;
typedef void (*MatMul)(const F *A, const F *B, F *C, size_t n, Cache *cache);

void cl_buf_init(Cache *cache, size_t mat_sizeof) {
    cache->buf_a = clCreateBuffer(cache->common.context, CL_MEM_READ_ONLY, mat_sizeof, NULL, NULL);
    cache->buf_b = clCreateBuffer(cache->common.context, CL_MEM_READ_ONLY, mat_sizeof, NULL, NULL);
    cache->buf_c = clCreateBuffer(cache->common.context, CL_MEM_WRITE_ONLY, mat_sizeof, NULL, NULL);
}

void cl_buf_deinit(Cache *cache) {
    clReleaseMemObject(cache->buf_a);
    clReleaseMemObject(cache->buf_b);
    clReleaseMemObject(cache->buf_c);
}

/* No, this was not created for debugging, my code is flawless from the first try. */
void mat_print(const F *A, size_t n) {
    size_t i, j;
    for (i = 0; i < n; ++i) {
        for (j = 0; j < n; ++j) {
            printf("%f ", A[i*n+j]);
        }
        puts("");
    }
}

/* Zero a matrix. */
void mat_zero(F *A, size_t n) {
    size_t i, n2;
    n2 = n*n;
    for (i = 0; i < n2; ++i) {
        A[i] = 0.0;
    }
}

/* Initialize a random matrix. */
void mat_rand(F *A, size_t n) {
    size_t i, n2;
    n2 = n*n;
    for (i = 0; i < n2; ++i) {
        A[i] = ((float)rand()) / ((float)RAND_MAX);
    }
}

/* Initialize a random matrix. */
void mat_trans(F *A, size_t n) {
    size_t i, j, i1, i2;
    F tmp;
    for (i = 0; i < n; ++i) {
        for (j = 0; j < i; ++j) {
            i1 = i*n+j;
            i2 = j*n+i;
            tmp = A[i1];
            A[i1] = A[i2];
            A[i2] = tmp;
        }
    }
}

/* Check if two matrices are equal with given mean squared err_maxor. */
int mat_eq(const F *A, const F *B, size_t n) {
    const F err_max = 10e-3;
    F err, diff, a, b;
    size_t i, i_max;

    err = 0.0;
    i_max = n*n;
    for (i = 0; i < i_max; ++i) {
        a = A[i];
        b = B[i];
        diff = a - b;
        err += diff * diff;
    }
    return (sqrt(err) / i_max) < err_max;
}

void mat_assert_eq(const F *A, const F *B, size_t n) {
    if (!mat_eq(A, B, n)) {
        puts("");
        mat_print(A, n);
        puts("");
        mat_print(B, n);
        assert(0);
    }
}

size_t zmin(size_t x, size_t y) {
    return (x < y) ? x : y;
}

/* C = A*B, width n, naive. */
void mat_mul_cpu(const F *A, const F *B, F *C, size_t n, Cache *cache) {
    F tmp;
    size_t i, j, k;
    UNUSED(cache);

    for (i = 0; i < n; ++i) {
        for (j = 0; j < n; ++j) {
            tmp = 0.0;
            for (k = 0; k < n; ++k) {
                tmp += A[i*n+k] * B[k*n+j];
            }
            C[i*n+j] = tmp;
        }
    }
}

/* Transpose matrix B to increase cache hits. */
void mat_mul_cpu_trans(const F *A, const F *B, F *C, size_t n, Cache *cache) {
    F tmp;
    size_t i, j, k;
    UNUSED(cache);

    mat_trans((F*)B, n);
    for (i = 0; i < n; ++i) {
        for (j = 0; j < n; ++j) {
            tmp = 0.0;
            for (k = 0; k < n; ++k) {
                tmp += A[i*n+k] * B[j*n+k];
            }
            C[i*n+j] = tmp;
        }
    }
    mat_trans((F*)B, n);
}

/* Zero vector. */
void vec_zero(Vec *vec, size_t vec_n) {
    size_t i;
    for (i = 0; i < vec_n; ++i) {
        (*vec)[i] = 0.0;
    }
}

/* Load vector from array. */
void vec_load(Vec *vec, size_t vec_n, const F* A, size_t A_base) {
    size_t i;
    for (i = 0; i < vec_n; ++i) {
        (*vec)[i] = A[A_base + i];
    }
}

/* Sum elements of the vector. */
F vec_sum(Vec vec, size_t vec_n) {
    size_t i;
    F sum;
    sum = 0.0;
    for (i = 0; i < vec_n; ++i) {
        sum += vec[i];
    }
    return sum;
}

/* Transpose matrix B to both:
 *
 * - increase cache hits
 * - simd GCC vector extensions which is made possible.
 *   by the transposition, to increase likelyhood of SIMDs.
 *
 * Note that GCC 6 O=3 is smart enough to use SIMD
 * even for the naive CPU method. However this was still way faster.
 * */
void mat_mul_cpu_trans_vec(const F *A, const F *B, F *C, size_t n, Cache *cache) {
    F tmpf;
    size_t i, j, k, k_max, ai, bi;
    Vec tmp, a, b;
    UNUSED(cache);

    mat_trans((F*)B, n);
    k_max = (n / VECTOR_NELEMS) * VECTOR_NELEMS;
    for (i = 0; i < n; ++i) {
        for (j = 0; j < n; ++j) {
            vec_zero(&tmp, VECTOR_NELEMS);
            for (k = 0; k < k_max; k += VECTOR_NELEMS) {
                ai = i * n + k;
                bi = j * n + k;
                vec_load(&a, VECTOR_NELEMS, A, ai);
                vec_load(&b, VECTOR_NELEMS, B, bi);
                tmp += a * b;
            }
            tmpf = 0.0;
            for (; k < n; ++k) {
                tmpf += A[i*n+k] * B[j*n+k];
            }
            C[i*n+j] = vec_sum(tmp, VECTOR_NELEMS) + tmpf;
        }
    }
    mat_trans((F*)B, n);
}

/* Blocked matrix multiplication.
 * TODO slower than transpose, sometimes similar timing to naive.
 * Why do they say that this is better?
 * */
void mat_mul_cpu_block(const F *A, const F *B, F *C, size_t n, Cache *cache) {
    size_t ib, jb, kb, i, j, k, i_max, j_max, k_max, nb ;
    F tmp;
    UNUSED(cache);

    nb = lround(sqrt(n));
    for (ib = 0; ib < n; ib += nb) {
        i_max = zmin(ib + nb, n);
        for (jb = 0; jb < n; jb += nb) {
            k_max = zmin(jb + nb, n);
            for (kb = 0; kb < n; kb += nb) {
                j_max = zmin(kb + nb, n);

                /* TODO would be cool to recurse here, but would require more offset parameters,
                 * likely similar to tose of CBLAS. */
                for (i = ib; i < i_max; ++i) {
                    for (j = kb; j < j_max; ++j) {
                        tmp = 0.0;
                        for (k = jb; k < k_max; ++k) {
                            tmp += A[i*n+k] * B[k*n+j];
                        }
                        C[i*n+j] += tmp;
                    }
                }
            }
        }
    }
}

/* The golden single thread CPU standard. */
void mat_mul_cpu_cblas(const F *A, const F *B, F *C, size_t n, Cache *cache) {
    UNUSED(cache);
    cblas_sgemm(
        CblasRowMajor,
        CblasNoTrans,
        CblasNoTrans,
        n,
        n,
        n,
        1.0,
        A,
        n,
        B,
        n,
        0.0,
        C,
        n
    );
}

/* Simplest possible CL implementation. No speedup. */
void mat_mul_cl(const F *A, const F *B, F *C, size_t n, Cache *cache) {
    cl_uint ncl;
    size_t global_work_size[2], mat_sizeof;

    /* Setup variables. */
    global_work_size[0] = n;
    global_work_size[1] = n;
    mat_sizeof = n * n * sizeof(F);
    ncl = n;

    /* Run kernel. */
    common_create_kernel_file(&cache->common, "matmul.cl", NULL);
    clEnqueueWriteBuffer(cache->common.command_queue, cache->buf_a, CL_TRUE, 0, mat_sizeof, (F*)A, 0, NULL, NULL);
    clEnqueueWriteBuffer(cache->common.command_queue, cache->buf_b, CL_TRUE, 0, mat_sizeof, (F*)B, 0, NULL, NULL);
    clSetKernelArg(cache->common.kernel, 0, sizeof(cache->buf_a), &cache->buf_a);
    clSetKernelArg(cache->common.kernel, 1, sizeof(cache->buf_b), &cache->buf_b);
    clSetKernelArg(cache->common.kernel, 2, sizeof(cache->buf_c), &cache->buf_c);
    clSetKernelArg(cache->common.kernel, 3, sizeof(ncl), &ncl);
    clEnqueueNDRangeKernel(cache->common.command_queue, cache->common.kernel, 2, NULL, global_work_size, NULL, 0, NULL, NULL);
    clFlush(cache->common.command_queue);
    clFinish(cache->common.command_queue);
    clEnqueueReadBuffer(cache->common.command_queue, cache->buf_c, CL_TRUE, 0, mat_sizeof, C, 0, NULL, NULL);
}

/* Cache rows in private memory. Drastic speedups expected over naive CPU. */
void mat_mul_cl_row_priv(const F *A, const F *B, F *C, size_t n, Cache *cache) {
    char options[256];
    cl_uint ncl;
    size_t global_work_size, mat_sizeof;

    /* Setup variables. */
    global_work_size = n;
    mat_sizeof = n * n * sizeof(F);
    ncl = n;

    /* Run kernel. */
    snprintf(options, sizeof(options), "-DPRIV_ROW_SIZE=%ju", n);
    common_create_kernel_file(&cache->common, "matmul_row_priv.cl", options);
    clEnqueueWriteBuffer(cache->common.command_queue, cache->buf_a, CL_TRUE, 0, mat_sizeof, (F*)A, 0, NULL, NULL);
    clEnqueueWriteBuffer(cache->common.command_queue, cache->buf_b, CL_TRUE, 0, mat_sizeof, (F*)B, 0, NULL, NULL);
    clSetKernelArg(cache->common.kernel, 0, sizeof(cache->buf_a), &cache->buf_a);
    clSetKernelArg(cache->common.kernel, 1, sizeof(cache->buf_b), &cache->buf_b);
    clSetKernelArg(cache->common.kernel, 2, sizeof(cache->buf_c), &cache->buf_c);
    clSetKernelArg(cache->common.kernel, 3, sizeof(ncl), &ncl);
    clEnqueueNDRangeKernel(cache->common.command_queue, cache->common.kernel, 1, NULL, &global_work_size, NULL, 0, NULL, NULL);
    clFlush(cache->common.command_queue);
    clFinish(cache->common.command_queue);
    clEnqueueReadBuffer(cache->common.command_queue, cache->buf_c, CL_TRUE, 0, mat_sizeof, C, 0, NULL, NULL);
}

/* Let's see if this is any different from local memory.
 * Outcome: much slower than private memory, slower than naive method. */
void mat_mul_cl_row_local(const F *A, const F *B, F *C, size_t n, Cache *cache) {
    cl_uint ncl;
    size_t global_work_size, local_work_size, mat_sizeof;

    /* Setup variables. */
    /* Cannot be larger than 1 on this example, otherwise memory conflicts
     * will happen between work items. */
    local_work_size = 1;
    global_work_size = n;
    mat_sizeof = n * n * sizeof(F);
    ncl = n;

    /* Run kernel. */
    common_create_kernel_file(&cache->common, "matmul_row_local.cl", NULL);
    clEnqueueWriteBuffer(cache->common.command_queue, cache->buf_a, CL_TRUE, 0, mat_sizeof, (F*)A, 0, NULL, NULL);
    clEnqueueWriteBuffer(cache->common.command_queue, cache->buf_b, CL_TRUE, 0, mat_sizeof, (F*)B, 0, NULL, NULL);
    clSetKernelArg(cache->common.kernel, 0, sizeof(cache->buf_a), &cache->buf_a);
    clSetKernelArg(cache->common.kernel, 1, sizeof(cache->buf_b), &cache->buf_b);
    clSetKernelArg(cache->common.kernel, 2, sizeof(cache->buf_c), &cache->buf_c);
    clSetKernelArg(cache->common.kernel, 3, n * sizeof(F), NULL);
    clSetKernelArg(cache->common.kernel, 4, sizeof(ncl), &ncl);
    clEnqueueNDRangeKernel(cache->common.command_queue, cache->common.kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL);
    clFlush(cache->common.command_queue);
    clFinish(cache->common.command_queue);
    clEnqueueReadBuffer(cache->common.command_queue, cache->buf_c, CL_TRUE, 0, mat_sizeof, C, 0, NULL, NULL);
}

/* Like row private, but to reduce global memory accesses,
 * we copy only once per work group to local memory.
 *
 * Each work group contains a few rows of A.
 *
 * We load the first column, multiply all rows by that column, synrhconize,
 * load another column, and so on.
 *
 * This leads to a thread blockage / memory access tradeoff.
 *
 * We make work groups as large as possible to reload memory less times. */
void mat_mul_cl_row_priv_col_local(const F *A, const F *B, F *C, size_t n, Cache *cache) {
    char options[256];
    cl_uint ncl;
    size_t global_work_size, local_work_size, mat_sizeof;

    /* Setup variables. */
    global_work_size = n;
    mat_sizeof = n * n * sizeof(F);
    ncl = n;

    /* Run kernel. */
    snprintf(options, sizeof(options), "-DPRIV_ROW_SIZE=%ju", n);
    common_create_kernel_file(&cache->common, "matmul_row_priv_col_local.cl", options);
    local_work_size = 0;
    clGetDeviceInfo(cache->common.device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(local_work_size), &local_work_size, NULL);
    local_work_size = zmin(local_work_size, n);
    clEnqueueWriteBuffer(cache->common.command_queue, cache->buf_a, CL_TRUE, 0, mat_sizeof, (F*)A, 0, NULL, NULL);
    clEnqueueWriteBuffer(cache->common.command_queue, cache->buf_b, CL_TRUE, 0, mat_sizeof, (F*)B, 0, NULL, NULL);
    clSetKernelArg(cache->common.kernel, 0, sizeof(cache->buf_a), &cache->buf_a);
    clSetKernelArg(cache->common.kernel, 1, sizeof(cache->buf_b), &cache->buf_b);
    clSetKernelArg(cache->common.kernel, 2, sizeof(cache->buf_c), &cache->buf_c);
    clSetKernelArg(cache->common.kernel, 3, n * sizeof(F), NULL);
    clSetKernelArg(cache->common.kernel, 4, sizeof(ncl), &ncl);
    clEnqueueNDRangeKernel(cache->common.command_queue, cache->common.kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL);
    clFlush(cache->common.command_queue);
    clFinish(cache->common.command_queue);
    clEnqueueReadBuffer(cache->common.command_queue, cache->buf_c, CL_TRUE, 0, mat_sizeof, C, 0, NULL, NULL);
}

/* Copy as many cols from B as possibl to the local memory, only then start multiplying.
 * This leads to less memory barrier hits.
 * How many rows we copy is limited by the local memory size, ideally the entire matrix will fit. */
void mat_mul_cl_row_priv_cols_local(const F *A, const F *B, F *C, size_t n, Cache *cache) {
    char options[256];
    cl_uint ncl, n_local_cols;
    cl_ulong local_mem_size;
    size_t col_size, global_work_size, local_work_size, mat_sizeof;

    /* Setup variables. */
    col_size = n * sizeof(F);
    global_work_size = n;
    mat_sizeof = n * n * sizeof(F);
    ncl = n;

    /* Run kernel. */
    snprintf(options, sizeof(options), "-DPRIV_ROW_SIZE=%ju", n);
    common_create_kernel_file(&cache->common, "matmul_row_priv_cols_local.cl", options);
    local_work_size = 0;
    clGetDeviceInfo(cache->common.device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(local_work_size), &local_work_size, NULL);
    local_work_size = zmin(local_work_size, n);
    local_mem_size = 0;
    clGetDeviceInfo(cache->common.device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(local_mem_size), &local_mem_size, NULL);
    /* TODO: can blow up without that - 1. Why?
     * It only reaches the max without it, not crosses, right?
     * So bug in the kernel? */
    n_local_cols = zmin(local_mem_size / col_size, n) - 1;
    /*puts("");*/
    /*printf("max memory %llu\n", (unsigned long long)local_mem_size);*/
    /*printf("n_local_cols %llu\n", (unsigned long long)n_local_cols);*/
    /*printf("memory %llu\n", (unsigned long long)n_local_cols * n * sizeof(F));*/
    clEnqueueWriteBuffer(cache->common.command_queue, cache->buf_a, CL_TRUE, 0, mat_sizeof, (F*)A, 0, NULL, NULL);
    clEnqueueWriteBuffer(cache->common.command_queue, cache->buf_b, CL_TRUE, 0, mat_sizeof, (F*)B, 0, NULL, NULL);
    clSetKernelArg(cache->common.kernel, 0, sizeof(cache->buf_a), &cache->buf_a);
    clSetKernelArg(cache->common.kernel, 1, sizeof(cache->buf_b), &cache->buf_b);
    clSetKernelArg(cache->common.kernel, 2, sizeof(cache->buf_c), &cache->buf_c);
    clSetKernelArg(cache->common.kernel, 3, n_local_cols * col_size, NULL);
    clSetKernelArg(cache->common.kernel, 4, sizeof(ncl), &ncl);
    clSetKernelArg(cache->common.kernel, 5, sizeof(n_local_cols), &n_local_cols);
    clEnqueueNDRangeKernel(cache->common.command_queue, cache->common.kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL);
    clFlush(cache->common.command_queue);
    clFinish(cache->common.command_queue);
    clEnqueueReadBuffer(cache->common.command_queue, cache->buf_c, CL_TRUE, 0, mat_sizeof, C, 0, NULL, NULL);
}

void mat_mul_cl_block(const F *A, const F *B, F *C, size_t n, Cache *cache) {
    cl_uint ncl, nblkcl;
    size_t global_work_size[2], local_work_size[2], mat_sizeof, nblk;

    /* Setup variables. */
    global_work_size[0] = n;
    global_work_size[1] = n;
    mat_sizeof = n * n * sizeof(F);
    ncl = n;

    /* Run kernel. */
    common_create_kernel_file(&cache->common, "matmul_block.cl", NULL);
    clGetDeviceInfo(cache->common.device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(nblk), &nblk, NULL);
    nblk = sqrt(zmin(nblk, n));
    nblk = zmin(nblk, 3);
    nblkcl = nblk;
    local_work_size[0] = nblk;
    local_work_size[1] = nblk;
    clEnqueueWriteBuffer(cache->common.command_queue, cache->buf_a, CL_TRUE, 0, mat_sizeof, (F*)A, 0, NULL, NULL);
    clEnqueueWriteBuffer(cache->common.command_queue, cache->buf_b, CL_TRUE, 0, mat_sizeof, (F*)B, 0, NULL, NULL);
    clSetKernelArg(cache->common.kernel, 0, sizeof(cache->buf_a), &cache->buf_a);
    clSetKernelArg(cache->common.kernel, 1, sizeof(cache->buf_b), &cache->buf_b);
    clSetKernelArg(cache->common.kernel, 2, sizeof(cache->buf_c), &cache->buf_c);
    clSetKernelArg(cache->common.kernel, 3, nblk * nblk * sizeof(F), NULL);
    printf("nblk = %llu\n", (unsigned long long)nblk);
    printf("local memory = %llu\n", (unsigned long long)2 * nblk * nblk * sizeof(F));
    clSetKernelArg(cache->common.kernel, 4, nblk * nblk * sizeof(F), NULL);
    clSetKernelArg(cache->common.kernel, 5, sizeof(ncl), &ncl);
    clSetKernelArg(cache->common.kernel, 6, sizeof(nblkcl), &nblkcl);
    clEnqueueNDRangeKernel(
        cache->common.command_queue, cache->common.kernel, 2, NULL,
        global_work_size, local_work_size, 0, NULL, NULL
    );
    clFlush(cache->common.command_queue);
    clFinish(cache->common.command_queue);
    clEnqueueReadBuffer(cache->common.command_queue, cache->buf_c, CL_TRUE, 0, mat_sizeof, C, 0, NULL, NULL);
}

void mat_mul_cl_clblas(const F *A, const F *B, F *C, size_t n, Cache *cache) {
    cl_event event;
    size_t mat_sizeof;

    mat_sizeof = n * n * sizeof(F);
    clEnqueueWriteBuffer(cache->common.command_queue, cache->buf_a, CL_TRUE, 0, mat_sizeof, (F*)A, 0, NULL, NULL);
    clEnqueueWriteBuffer(cache->common.command_queue, cache->buf_b, CL_TRUE, 0, mat_sizeof, (F*)B, 0, NULL, NULL);
    clblasSgemm(
        clblasRowMajor,
        clblasNoTrans,
        clblasNoTrans,

        n,
        n,
        n,
        1.0,

        cache->buf_a,
        0,
        n,

        cache->buf_b,
        0,
        n,

        0.0,
        cache->buf_c,
        0,
        n,

        1,
        &(cache->common.command_queue),
        0,
        NULL,
        &event
    );
    clWaitForEvents(1, &event);
    clEnqueueReadBuffer(cache->common.command_queue, cache->buf_c, CL_TRUE, 0, mat_sizeof, C, 0, NULL, NULL);
}

double bench(MatMul f, const F *A, const F *B, F *C, F *C_ref, size_t n, Cache *cache) {
    double dt, time;
    mat_zero(C, n);
    time = common_get_nanos();
    f(A, B, C, n, cache);
    dt = common_get_nanos() - time;
    if (C_ref != NULL)
        mat_assert_eq(C, C_ref, n);
    printf("%.3e ", dt);
    fflush(stdout);
    return dt;
}

int main(int argc, char **argv) {
    srand(time(NULL));
    Cache cache;
    double max_runtime;
    /* Overly slow ones commented out by default. */
    MatMul mat_mul_funcs[] = {
        /*mat_mul_cpu,*/
        mat_mul_cpu_trans,
        mat_mul_cpu_trans_vec,
        mat_mul_cpu_block,
        mat_mul_cpu_cblas,
        /*mat_mul_cl,*/
        mat_mul_cl_row_priv,
        mat_mul_cl_row_local,
        mat_mul_cl_row_priv_col_local,
        mat_mul_cl_row_priv_cols_local,
        /* TODO broken for larger matrics, some cells contain trash.
         * Likey some memory overflow problem. */
        /*mat_mul_cl_block,*/
        mat_mul_cl_clblas,
    };
    int first, func_done[NELEMS(mat_mul_funcs)] = {0};
    size_t f, i;
    size_t mat_sizeof;

    /* CLI args. */
    if (argc > 1) {
        max_runtime = strtod(argv[1], NULL);
    } else {
        max_runtime = 1.0;
    }

    common_init(&(cache.common), NULL);

    /* Unit test 2x2. */
    {
        const F A[] = {
            1.0, 2.0,
            3.0, 4.0
        };
        const F B[] = {
            5.0, 6.0,
            7.0, 8.0
        };
        enum N { n = 2 };
        F C[n*n];
        const F C_ref[] = {
            19.0, 22.0,
            43.0, 50.0
        };
        cl_buf_init(&cache, n * n * sizeof(F));
        for (f = 0; f < sizeof(mat_mul_funcs)/sizeof(mat_mul_funcs[0]); ++f) {
            mat_zero(C, n);
            mat_mul_funcs[f](A, B, C, n, &cache);
            mat_assert_eq(C, C_ref, n);
        }
        cl_buf_deinit(&cache);
    }

    /* Unit test 4x4. */
    {
        const F A[] = {
             1.0,  2.0,  3.0,  4.0,
             5.0,  6.0,  7.0,  8.0,
             9.0, 10.0, 11.0, 12.0,
            13.0, 14.0, 15.0, 16.0,
        };
        const F B[] = {
            17.0, 18.0, 19.0, 20.0,
            21.0, 22.0, 23.0, 24.0,
            25.0, 26.0, 27.0, 28.0,
            29.0, 30.0, 31.0, 32.0,
        };
        const F C_ref[] = {
             250.0,  260.0,  270.0,  280.0,
             618.0,  644.0,  670.0,  696.0,
             986.0, 1028.0, 1070.0, 1112.0,
            1354.0, 1412.0, 1470.0, 1528.0,
        };
        enum N { n = 4 };
        F C[n*n];
        cl_buf_init(&cache, n * n * sizeof(F));
        for (f = 0; f < NELEMS(mat_mul_funcs); ++f) {
            mat_zero(C, n);
            mat_mul_funcs[f](A, B, C, n, &cache);
            mat_assert_eq(C, C_ref, n);
        }
        cl_buf_deinit(&cache);
    }

    /* Benchmarks. */
    {
        double dt;
        F *A = NULL, *B = NULL, *C = NULL, *C_ref = NULL, *dst = NULL, *ref = NULL;
        int done;
        size_t n = 2;

        puts("#matmul");
        done = 0;
        while(1) {
            printf("%zu ", (size_t)log2(n));
            mat_sizeof = n * n * sizeof(F);

            /* CPU setup. */
            A = aligned_alloc(VECTOR_SIZEOF, mat_sizeof);
            B = aligned_alloc(VECTOR_SIZEOF, mat_sizeof);
            C = aligned_alloc(VECTOR_SIZEOF, mat_sizeof);
            C_ref = aligned_alloc(VECTOR_SIZEOF, mat_sizeof);
            if (NULL == A || NULL == B || NULL == C) {
                printf("error: could not allocate memory for n = %zu", n);
                break;
            }
            mat_rand(A, n);
            mat_rand(B, n);

            cl_buf_init(&cache, mat_sizeof);
            first = 1;
            for (f = 0; f < NELEMS(mat_mul_funcs); ++f) {
                if (func_done[f]) {
                    printf("%*s", 10, "");
                } else {
                    if (first) {
                        dst = C_ref;
                        ref = NULL;
                        first = 0;
                    } else {
                        dst = C;
                        ref = C_ref;
                    }
                    dt = bench(mat_mul_funcs[f], A, B, dst, ref, n, &cache);
                    if (dt > max_runtime)
                        func_done[f] = 1;
                }
            }
            puts("");
            done = 1;
            for (i = 0; i < NELEMS(mat_mul_funcs); ++i) {
                if (!func_done[i]) {
                    done = 0;
                    break;
                }
            }
            if (done)
                break;
            n *= 2;

            /* CPU deinit. */
            free(A);
            free(B);
            free(C);
            free(C_ref);

            cl_buf_deinit(&cache);
        }
        common_deinit(&cache.common);
    }

    return EXIT_SUCCESS;
}