Matrix multiplication with cblas, block works now, reformat lapack cheat a bit

2025-04-13 00:30:27 +00:00 · 2017-03-18 10:13:02 +00:00 · 2017-03-18 10:13:02 +00:00 · b57712c0af
commit b57712c0af
parent e4c287ac6d
6 changed files with 177 additions and 153 deletions
--- a/lapack/README.md
+++ b/lapack/README.md
@ -8,7 +8,7 @@ BLAS and LAPACK are:

 -   de-facto standards

-   non-parallel 
+-   non-parallel

 -   originally written in Fortran

@ -30,8 +30,6 @@ BLAS contains low level functions such as:
 - vector matrix multiplication
 - matrix matrix multiplication

-The BLAS project provides `cblas.h`, which contains a C interface for BLAS.
-
 ### LAPACK

 LAPACK contains higher level functions such as:
@ -61,21 +59,19 @@ Implements full BLAS, but only part of LAPACK.

 Has C interface.

-## Installation on Ubuntu
+## C interface

-### Fortran
+The BLAS project provides `cblas.h`, which contains a C interface for BLAS (TODO but also an implementation?)

-    sudo aptitude install liblapack-dev liblapack-doc libblas-doc
+Via atlas:

-### C interface
+    sudo aptitude install libatlas-dev
+    gcc -lcblas

-via atlas:
-
-    sudo aptitude install 
-
-via LAPACKE (`libblas-dev` already contains `cblas.h`):
+Via LAPACKE (`libblas-dev` already contains `cblas.h`):

    sudo aptitude install liblapacke-dev
+    gcc -lblas

 ## Levels

@ -85,6 +81,8 @@ via LAPACKE (`libblas-dev` already contains `cblas.h`):

 ## Function naming conventions

+<http://www.netlib.org/lapack/lug/node24.html>
+
 The functions are named according to the pattern:

    XYYZZZ
@ -108,6 +106,8 @@ Where:
 -   `ZZ`: computation to be done:

    - `SV`: SolVe linear system
+    - `MM`: Matrix Multiply
+    - `LS`: Least Squares (overdetermined system)

 ## Sources

@ -118,3 +118,7 @@ Where:
 - user's guide. algorithm info <http://www.netlib.org/lapack/lug/>

 - <http://www.tat.physik.uni-tuebingen.de/~kley/lehre/ftn77/tutorial/blas.html>
+
+## LAPACKE
+
+<http://stackoverflow.com/questions/26875415/difference-between-lapacke-and-lapack>
--- a/lapack/c.c
+++ b/lapack/c.c
@ -14,103 +14,97 @@ interfaces provided by their respective projects, repectively through
 #include <lapacke.h>

 /**
-* assert two integers are equal
-* if not, print them to stderr and assert false
-*/
-void assert_eqi( int i1, int i2 )
-{
-    if( i1 != i2 )
-    {
-        fprintf( stderr, "%d\n%d\n\n\n", i1, i2 );
-        assert( false );
+ * assert two integers are equal
+ * if not, print them to stderr and assert false
+ */
+void assert_eqi(int i1, int i2) {
+    if (i1 != i2) {
+        fprintf(stderr, "%d\n%d\n\n\n", i1, i2);
+        assert(false);
    }
 }

-/*
-* assert two doubles are equal within err precision
-* if not, print them to stderr
-*/
-void assert_eqd( double d1, double d2, double err )
-{
-    if( fabs( d1 - d2 ) > err )
-    {
-        fprintf( stderr, "%f\n%f\n\n\n", d1, d2 );
-        assert( false );
+/**
+ * assert two doubles are equal within err precision
+ * if not, print them to stderr
+ */
+void assert_eqd(double d1, double d2, double err) {
+    if (fabs(d1 - d2) > err) {
+        fprintf(stderr, "%f\n%f\n\n\n", d1, d2);
+        assert(false);
    }
 }

-/*
-* print an array of doubles to stderr
-*/
-void print_vecd( int n, double * v )
-{
+/** print an array of doubles to stderr */
+void print_vecd(int n, double * v) {
    int i;
-    for(i=0; i<n; i++)
-    {
-        fprintf( stderr, "%.2f ", v[i] );
+    for (i=0; i<n; i++) {
+        fprintf(stderr, "%.2f ", v[i]);
    }
    printf("\n");
 }

 /*
-bool eq_vecd2( int n, double * v1, double * v2, double err)
+bool eq_vecd2(int n, double * v1, double * v2, double err)
 {
-    if( fabs( d1 - d2 ) > err )
+    if(fabs(d1 - d2) > err)
        return false
    return true;
 }

-void assert_eqvd( int n, double * v1, double * v2, double err){
-    if( eq_vecd( n, v1, v2, err ) != true ){
-        print_vecd( n, v1 );
-        print_vecd( n, v2 );
+void assert_eqvd(int n, double * v1, double * v2, double err){
+    if(eq_vecd(n, v1, v2, err) != true){
+        print_vecd(n, v1);
+        print_vecd(n, v2);
    }
 }
 */

-int main(void)
-{
+int main(void) {
    int info, ipiv2[2];
    float err = 1e-6;
    float x2[2], b2[2], c2[2];
    float a2x2[2][2];

-    //#cblas
-
+	/* cblas */
+	{
        x2[0] =  1.0;
        x2[1] = -2.0;
-        assert_eqd( cblas_snrm2(2, x2, 1), sqrt(5.0), err );
+        assert_eqd(cblas_snrm2(2, x2, 1), sqrt(5.0), err);
+    }

-    //#lapacke
+	/* lapacke */
+	{
+		/*
+		sgesv

-        //1 2 x = 5
-        //3 4 y   11
+		Matrix vector multiply.
+		*/
+		{
+			a2x2[0][0] = 1.0;
+			a2x2[1][0] = 2.0;
+			a2x2[0][1] = 3.0;
+			a2x2[1][1] = 4.0;
+			b2[0] = 5.;
+			b2[1] = 11.;

-        //x = 1
-        //y = 1
-
-        a2x2[0][0] = 1.0;
-        a2x2[1][0] = 2.0;
-        a2x2[0][1] = 3.0;
-        a2x2[1][1] = 4.0;
-        b2[0] = 5.;
-        b2[1] = 11.;
-
-        info = LAPACKE_sgesv(
-            LAPACK_COL_MAJOR, //COL or ROW
-            2,                //n
-            1,                //nrhs
-            &a2x2[0][0],
-            2,                //lda
-            &ipiv2[0],
-            &b2[0],
-            2                 //ldb
-        );
-        c2[0] = 1.0;
-        c2[1] = 2.0;
-        assert_eqi( info, 0 );
-        assert_eqd( b2[0], c2[0], err );
-        assert_eqd( b2[1], c2[1], err );
+			info = LAPACKE_sgesv(
+				LAPACK_COL_MAJOR,
+				2,
+				1,
+				&a2x2[0][0],
+				2,
+				&ipiv2[0],
+				&b2[0],
+				2
+			);
+			c2[0] = 1.0;
+			c2[1] = 2.0;
+			assert_eqi(info, 0);
+			assert_eqd(b2[0], c2[0], err);
+			assert_eqd(b2[1], c2[1], err);
+		}
+    }

    return EXIT_SUCCESS;
 }
--- a/lapack/configure
+++ b/lapack/configure
@ -1,7 +1,7 @@
 #!/usr/bin/env bash
 sudo aptitude update
 # BLAS C/Fotran and LAPACK Fortran:
-sudo aptitude install -y liblapack-dev
+sudo aptitude install -y liblapack-dev liblapacke-dev
 # Lapack C via LAPACKE:
 # TODO: how to install on Ubuntu 12.04?
 # The following works only on later Ubuntu
--- a/lapack/main.f
+++ b/lapack/main.f
@ -171,7 +171,7 @@
            !return status returned on `info`:
            !pivots returned on `pivots`:
            call sgesv( n, nrhs, a2x2, lda, pivots, b2, ldb, info )
-            
+
            c2(1) = 1.0
            c2(2) = 2.0
            call assert_eqi( info, 0 )
--- a/opencl/Makefile_params
+++ b/opencl/Makefile_params
@ -1 +1,3 @@
-LIBS := -lm -pthread -lOpenCL
+LIBS := -lm -pthread -lOpenCL -lblas
+# blas from libatlas-base-dev
+# cblas from libblas-dev
--- a/opencl/matmul.c
+++ b/opencl/matmul.c
@ -26,10 +26,12 @@ Articles:

 #include "common.h"

+#include <cblas.h>
+
 #define VECTOR_SIZE (4 * sizeof(F))

 typedef cl_float F;
-typedef cl_float F4 __attribute__ ((vector_size(VECTOR_SIZE)));
+typedef F F4 __attribute__ ((vector_size(VECTOR_SIZE)));
 typedef void (*MatMul)(const F *A, const F *B, F *C, size_t n);

 /* No, this was not created for debugging, my code is flawless from the first try. */
@ -166,31 +168,61 @@ void mat_mul_cpu_trans_vec(const F *A, const F *B, F *C, size_t n) {
    mat_trans((F*)B, n);
 }

+size_t zmin(size_t x, size_t y) {
+	return (x < y) ? x : y;
+}
+
 /* Blocked matrix multiplication.
- * Assumes n is power of 2.
- * TODO slower than transpose, sometimes similar timing to naive.*/
+ * TODO slower than transpose, sometimes similar timing to naive.
+ * Why do they say that this is better? */
 void mat_mul_cpu_block(const F *A, const F *B, F *C, size_t n) {
-    size_t nb = sqrt(n);
-    size_t ib, jb, i, j, i2, j2, k;
+    size_t ib, jb, kb, i, j, k, i_max, j_max, k_max, nb ;
    F tmp;

-    for (ib = 0; ib < nb; ++ib) {
-        for (jb = 0; jb < nb; ++jb) {
-            for (i = 0; i < nb; ++i) {
-                for (j = 0; j < nb; ++j) {
-                    i2 = nb * ib + i;
-                    j2 = nb * jb + j;
-                    tmp = 0.0;
-                    for (k = 0; k < n; ++k) {
-                        tmp += A[i2*n+k] * B[k*n+j2];
-                    }
-                    C[i2*n+j2] = tmp;
-                }
+	nb = lround(sqrt(n));
+    for (ib = 0; ib < n; ib += nb) {
+		i_max = zmin(ib + nb, n);
+        for (jb = 0; jb < n; jb += nb) {
+			k_max = zmin(jb + nb, n);
+			for (kb = 0; kb < n; kb += nb) {
+				j_max = zmin(kb + nb, n);
+
+				/* TODO would be cool to recurse here, but would require more offset parameters,
+				 * likely similar to tose of CBLAS. */
+				for (i = ib; i < i_max; ++i) {
+					for (j = kb; j < j_max; ++j) {
+						tmp = 0.0;
+						for (k = jb; k < k_max; ++k) {
+							tmp += A[i*n+k] * B[k*n+j];
+						}
+						C[i*n+j] += tmp;
+					}
+				}
            }
        }
    }
 }

+/* The golden single thread CPU standard. */
+void mat_mul_cpu_cblas(const F *A, const F *B, F *C, size_t n) {
+    cblas_sgemm(
+        CblasRowMajor,
+        CblasNoTrans,
+        CblasNoTrans,
+        n,
+        n,
+        n,
+        1.0,
+        A,
+        n,
+        B,
+        n,
+        1.0,
+        C,
+        n
+    );
+}
+
 /* Simplest possible CL implementation. No speedup. */
 void mat_mul_cl(const F *A, const F *B, F *C, size_t n) {
    cl_mem buf_a, buf_b, buf_c;
@ -261,18 +293,34 @@ void mat_mul_cl_row(const F *A, const F *B, F *C, size_t n) {
    common_deinit(&common);
 }

+
+double bench(MatMul f, const F *A, const F *B, F *C, F *C_ref, size_t n) {
+	double dt, time;
+	mat_zero(C, n);
+	time = common_get_nanos();
+	f(A, B, C, n);
+	dt = common_get_nanos() - time;
+	if (C_ref != NULL)
+		assert(mat_eq(C, C_ref, n));
+	printf("%f ", dt);
+	return dt;
+}
+
 int main(int argc, char **argv) {
    srand(time(NULL));
    double max_cpu_runtime;
-    /* TODO stop being lazy and use this. */
-    /*MatMul mat_mul[] = {*/
-        /*mat_mul_cpu,*/
-        /*mat_mul_cpu_trans,*/
-        /*mat_mul_cpu_trans_vec,*/
-        /*mat_mul_cl,*/
-        /*mat_mul_cl_row,*/
-    /*};*/
+	MatMul mat_mul_funcs[] = {
+		mat_mul_cpu_trans,
+		mat_mul_cpu_trans_vec,
+		mat_mul_cpu_block,
+		mat_mul_cpu_cblas,
+		/* Comment out, because this overflows GPU memory and blocks computer
+		 * before the others get to meaningful times. */
+		/*mat_mul_cl,*/
+		mat_mul_cl_row,
+	};

+	/* CLI args. */
    if (argc > 1) {
        max_cpu_runtime = strtod(argv[1], NULL);
    } else {
@ -291,26 +339,34 @@ int main(int argc, char **argv) {
        };
        enum N { n = 2 };
        F C[n*n];
-        const F C_expect[] = {
+        const F C_ref[] = {
            19.0, 22.0,
            43.0, 50.0
        };

        mat_zero(C, n);
        mat_mul_cpu(A, B, C, n);
-        assert(mat_eq(C, C_expect, n));
+        assert(mat_eq(C, C_ref, n));

        mat_zero(C, n);
        mat_mul_cpu_trans(A, B, C, n);
-        assert(mat_eq(C, C_expect, n));
+        assert(mat_eq(C, C_ref, n));
+
+        mat_zero(C, n);
+        mat_mul_cpu_block(A, B, C, n);
+        assert(mat_eq(C, C_ref, n));
+
+        mat_zero(C, n);
+        mat_mul_cpu_cblas(A, B, C, n);
+        assert(mat_eq(C, C_ref, n));

        mat_zero(C, n);
        mat_mul_cl(A, B, C, n);
-        assert(mat_eq(C, C_expect, n));
+        assert(mat_eq(C, C_ref, n));

        mat_zero(C, n);
        mat_mul_cl_row(A, B, C, n);
-        assert(mat_eq(C, C_expect, n));
+        assert(mat_eq(C, C_ref, n));
    }

    /* Unit test for vector implementation, which requires 4x4 matrices. */
@ -327,7 +383,7 @@ int main(int argc, char **argv) {
           25.0, 26.0, 27.0, 28.0,
           29.0, 30.0, 31.0, 32.0,
        };
-        const F C_expect[] = {
+        const F C_ref[] = {
            250.000000, 260.000000, 270.000000, 280.000000,
            618.000000, 644.000000, 670.000000, 696.000000,
            986.000000, 1028.000000, 1070.000000, 1112.000000,
@ -336,23 +392,19 @@ int main(int argc, char **argv) {
        enum N { n = 4 };
        F C[n*n];

-        mat_zero(C, n);
-        mat_mul_cpu_block(A, B, C, n);
-        assert(mat_eq(C, C_expect, n));
-
        mat_zero(C, n);
        mat_mul_cpu_trans_vec(A, B, C, n);
-        assert(mat_eq(C, C_expect, n));
+        assert(mat_eq(C, C_ref, n));
    }

    /* Benchmarks. */
    {
+		double dt;
        F *A = NULL, *B = NULL, *C = NULL, *C_ref = NULL;
-        double dt, time;
-        size_t n = 4, a_sizeof;
+        size_t f, n = 4, a_sizeof;

        puts("#matmul");
-        puts("n cpu cpu_trans cpu_trans_vec cpu_block cl_row");
+        puts("n cpu cpu_trans cpu_trans_vec cpu_block cpu_cblas cl_row");
        a_sizeof = n * n * sizeof(F);
        A = aligned_alloc(VECTOR_SIZE, a_sizeof);
        B = aligned_alloc(VECTOR_SIZE, a_sizeof);
@ -361,43 +413,15 @@ int main(int argc, char **argv) {
        while(1) {
            printf("%zu ", n);
            if (A == NULL || B == NULL || C == NULL) {
-                printf("Could not allocate memory for n = %zu", n);
+                printf("Could not allocate memory for n = %zu. Aborting.", n);
                break;
            }
            mat_rand(A, n);
            mat_rand(B, n);
-
-            time = common_get_nanos();
-            mat_mul_cpu(A, B, C_ref, n);
-            dt = common_get_nanos() - time;
-            printf("%f ", dt);
-
-            time = common_get_nanos();
-            mat_mul_cpu_trans(A, B, C, n);
-            assert(mat_eq(C, C_ref, n));
-            printf("%f ", common_get_nanos() - time);
-
-            time = common_get_nanos();
-            mat_mul_cpu_trans_vec(A, B, C, n);
-            assert(mat_eq(C, C_ref, n));
-            printf("%f ", common_get_nanos() - time);
-
-            time = common_get_nanos();
-            mat_mul_cpu_block(A, B, C, n);
-            assert(mat_eq(C, C_ref, n));
-            printf("%f ", common_get_nanos() - time);
-
-            /* Comment out, because this overflows GPU memory and blocks computer
-             * before the others get to meaningful times. */
-            /*time = common_get_nanos();*/
-            /*mat_mul_cl(A, B, C, n);*/
-            /*printf("%f ", common_get_nanos() - time);*/
-
-            time = common_get_nanos();
-            mat_mul_cl_row(A, B, C, n);
-            printf("%f ", common_get_nanos() - time);
-            assert(mat_eq(C, C_ref, n));
-
+			dt = bench(mat_mul_cpu, A, B, C_ref, NULL, n);
+			for (f = 0; f < sizeof(mat_mul_funcs)/sizeof(mat_mul_funcs[0]); ++f) {
+				bench(mat_mul_funcs[f], A, B, C, C_ref, n);
+			}
            puts("");
            if (dt > max_cpu_runtime)
                break;