diff --git a/opencl/architecture.md b/opencl/architecture.md index bf80156..9f5125f 100644 --- a/opencl/architecture.md +++ b/opencl/architecture.md @@ -83,12 +83,14 @@ Contains private memory, which no other work item can see. TODO: can a single work item be run in parallel on the GPU? -## Local and Private memory +## Local memory -TODO: why use those at all instead of global memory? +## Private memory -- - +- + +TODO: can private memory be slower than local memory? Might be faster, and global memory is limited. diff --git a/opencl/clinfo.c b/opencl/clinfo.c index 4b1b603..b1d176a 100644 --- a/opencl/clinfo.c +++ b/opencl/clinfo.c @@ -16,11 +16,16 @@ https://www.khronos.org/registry/OpenCL/sdk/1.0/docs/man/xhtml/clGetDeviceInfo.h clGetDeviceInfo(device, CL_ ## id, sizeof(cl_uint), &(buf_cl_uint), NULL); \ printf(#id " = %ju\n", (uintmax_t)buf_cl_uint); +#define PRINT_CL_ULONG(id) \ + clGetDeviceInfo(device, CL_ ## id, sizeof(cl_ulong), &(buf_cl_ulong), NULL); \ + printf(#id " = 0x%jx\n", (uintmax_t)buf_cl_ulong); + int main(void) { - cl_platform_id platform; cl_device_id device; - size_t buf_size_t; + cl_platform_id platform; cl_uint buf_cl_uint; + cl_ulong buf_cl_ulong; + size_t buf_size_t; /* Setup. */ clGetPlatformIDs(1, &platform, NULL); @@ -33,6 +38,8 @@ int main(void) { /* TODO this is wrong, it is actually an array. * But yeah, likely the same for all dimensions. */ PRINT_SIZE_T(DEVICE_MAX_WORK_ITEM_SIZES) + PRINT_CL_ULONG(DEVICE_LOCAL_MEM_SIZE) + PRINT_CL_UINT(DEVICE_MAX_COMPUTE_UNITS) /* Cleanup. */ #ifdef CL_1_2 diff --git a/opencl/matmul.c b/opencl/matmul.c index 033ce28..7ffce8f 100644 --- a/opencl/matmul.c +++ b/opencl/matmul.c @@ -1,6 +1,9 @@ /* Matrix multiplication. +Based on the amazing: +https://github.com/HandsOnOpenCL/Exercises-Solutions/tree/a908ac3f0fadede29f2735eb1264b0db7f4311a0/Solutions/Exercise08 + The most basic / useful application where OpenCL might be faster than CPU. TODO: make a SERIOUS matrix implementation. 
Also compare with existing SERIOUS CPU and GPU implementations: @@ -9,6 +12,7 @@ TODO: make a SERIOUS matrix implementation. Also compare with existing SERIOUS C - http://stackoverflow.com/questions/12289235/simple-and-fast-matrix-vector-multiplication-in-c-c - https://www.quora.com/What-is-the-best-way-to-multiply-two-matrices-in-C++ - http://www.netlib.org/utk/papers/autoblock/node2.html +- http://stackoverflow.com/questions/25900312/optimizing-batched-matrix-multiplication-opencl-code */ #include "common.h" @@ -31,7 +35,7 @@ void mat_mul_cpu(const F *A, const F *B, F *C, size_t n) { } } -/* Simplest possible implementation. */ +/* Simplest possible CL implementation. No speedup. */ void mat_mul_cl(const F *A, const F *B, F *C, size_t n) { cl_mem buf_a, buf_b, buf_c; Common common; @@ -66,6 +70,41 @@ void mat_mul_cl(const F *A, const F *B, F *C, size_t n) { common_deinit(&common); } +/* Cache rows in private memory. Drastic speedups expected over naive CPU. */ +void mat_mul_cl_row(const F *A, const F *B, F *C, size_t n) { + cl_mem buf_a, buf_b, buf_c; + Common common; + cl_uint ncl; + size_t global_work_size[2], mat_sizeof, n2; + + /* Setup variables. */ + global_work_size[0] = n; + global_work_size[1] = n; + n2 = n * n; + mat_sizeof = n2 * sizeof(F); + ncl = n; + + /* Run kernel. 
*/ + common_init_file(&common, "matmul_row.cl"); + buf_a = clCreateBuffer(common.context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, mat_sizeof, (F*)A, NULL); + buf_b = clCreateBuffer(common.context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, mat_sizeof, (F*)B, NULL); + buf_c = clCreateBuffer(common.context, CL_MEM_WRITE_ONLY, mat_sizeof, C, NULL); + clSetKernelArg(common.kernel, 0, sizeof(buf_a), &buf_a); + clSetKernelArg(common.kernel, 1, sizeof(buf_b), &buf_b); + clSetKernelArg(common.kernel, 2, sizeof(buf_c), &buf_c); + clSetKernelArg(common.kernel, 3, sizeof(ncl), &ncl); + clEnqueueNDRangeKernel(common.command_queue, common.kernel, 1, NULL, global_work_size, NULL, 0, NULL, NULL); + clFlush(common.command_queue); + clFinish(common.command_queue); + clEnqueueReadBuffer(common.command_queue, buf_c, CL_TRUE, 0, mat_sizeof, C, 0, NULL, NULL); + + /* Cleanup. */ + clReleaseMemObject(buf_a); + clReleaseMemObject(buf_b); + clReleaseMemObject(buf_c); + common_deinit(&common); +} + /* Check if two matrices are equal with given mean squared error. 
*/ int mat_eq(const F *A, const F *B, size_t n) { const F err_max = 10e-3; @@ -148,7 +187,7 @@ int main(void) { size_t n = 1, n2, a_sizeof; puts("#matmul"); - puts("n mat_mul_cpu mat_mul_cl"); + puts("n mat_mul_cpu mat_mul_cl mat_mul_cl_row"); while(1) { printf("%zu ", n); n2 = n * n; @@ -169,8 +208,12 @@ int main(void) { dt = common_get_nanos() - time; printf("%f ", dt); + /*time = common_get_nanos();*/ + /*mat_mul_cl(A, B, C, n);*/ + /*printf("%f ", common_get_nanos() - time);*/ + time = common_get_nanos(); - mat_mul_cl(A, B, C, n); + mat_mul_cl_row(A, B, C, n); printf("%f", common_get_nanos() - time); assert(mat_eq(C, C_ref, n)); diff --git a/opencl/matmul.cl b/opencl/matmul.cl index f3dda62..0b08996 100644 --- a/opencl/matmul.cl +++ b/opencl/matmul.cl @@ -4,9 +4,9 @@ __kernel void main( __global float *C, const uint N ) { - uint k; uint i = get_global_id(0); uint j = get_global_id(1); + uint k; float tmp; tmp = 0.0; diff --git a/opencl/matmul_row.cl b/opencl/matmul_row.cl new file mode 100644 index 0000000..0db3f6e --- /dev/null +++ b/opencl/matmul_row.cl @@ -0,0 +1,26 @@ +__kernel void main( + __global float* A, + __global float* B, + __global float* C, + const uint N +) +{ + uint i = get_global_id(0); + uint j, k; + /* TODO remove hardcoded constant? + * https://github.com/HandsOnOpenCL/Exercises-Solutions/issues/63 + * If I keep increasing, first: + * - asserts on CPU fail as in: http://stackoverflow.com/questions/22083507/is-there-a-maximum-limit-to-private-memory-in-opencl + * - CL complains at compile time that this is too big and aborts + **/ + float Ap[0x100000]; + float tmp; + for (k = 0; k < N; k++) + Ap[k] = A[i*N+k]; + for (j = 0; j < N; j++) { + tmp = 0.0; + for (k = 0; k < N; k++) + tmp += Ap[k] * B[k*N+j]; + C[i*N+j] = tmp; + } +}