Matrix multiplication with cblas, block works now, reformat lapack cheat a bit

This commit is contained in:
Ciro Santilli 2017-03-18 10:13:02 +00:00
parent e4c287ac6d
commit b57712c0af
6 changed files with 177 additions and 153 deletions

View File

@ -8,7 +8,7 @@ BLAS and LAPACK are:
- de-facto standards
- non-parallel
- non-parallel
- originally written in Fortran
@ -30,8 +30,6 @@ BLAS contains low level functions such as:
- vector matrix multiplication
- matrix matrix multiplication
The BLAS project provides `cblas.h`, which contains a C interface for BLAS.
### LAPACK
LAPACK contains higher level functions such as:
@ -61,21 +59,19 @@ Implements full BLAS, but only part of LAPACK.
Has C interface.
## Installation on Ubuntu
## C interface
### Fortran
The BLAS project provides `cblas.h`, which contains a C interface for BLAS (TODO but also an implementation?)
sudo aptitude install liblapack-dev liblapack-doc libblas-doc
Via atlas:
### C interface
sudo aptitude install libatlas-dev
gcc -lcblas
via atlas:
sudo aptitude install
via LAPACKE (`libblas-dev` already contains `cblas.h`):
Via LAPACKE (`libblas-dev` already contains `cblas.h`):
sudo aptitude install liblapacke-dev
gcc -lblas
## Levels
@ -85,6 +81,8 @@ via LAPACKE (`libblas-dev` already contains `cblas.h`):
## Function naming conventions
<http://www.netlib.org/lapack/lug/node24.html>
The functions are named according to the pattern:
XYYZZZ
@ -108,6 +106,8 @@ Where:
- `ZZ`: computation to be done:
- `SV`: SolVe linear system
- `MM`: Matrix Multiply
- `LS`: Least Squares (overdetermined system)
## Sources
@ -118,3 +118,7 @@ Where:
- user's guide. algorithm info <http://www.netlib.org/lapack/lug/>
- <http://www.tat.physik.uni-tuebingen.de/~kley/lehre/ftn77/tutorial/blas.html>
## LAPACKE
<http://stackoverflow.com/questions/26875415/difference-between-lapacke-and-lapack>

View File

@ -14,103 +14,97 @@ interfaces provided by their respective projects, repectively through
#include <lapacke.h>
/**
* assert two integers are equal
* if not, print them to stderr and assert false
*/
void assert_eqi( int i1, int i2 )
{
if( i1 != i2 )
{
fprintf( stderr, "%d\n%d\n\n\n", i1, i2 );
assert( false );
* assert two integers are equal
* if not, print them to stderr and assert false
*/
void assert_eqi(int i1, int i2) {
if (i1 != i2) {
fprintf(stderr, "%d\n%d\n\n\n", i1, i2);
assert(false);
}
}
/*
* assert two doubles are equal within err precision
* if not, print them to stderr
*/
void assert_eqd( double d1, double d2, double err )
{
if( fabs( d1 - d2 ) > err )
{
fprintf( stderr, "%f\n%f\n\n\n", d1, d2 );
assert( false );
/**
* assert two doubles are equal within err precision
* if not, print them to stderr
*/
void assert_eqd(double d1, double d2, double err) {
if (fabs(d1 - d2) > err) {
fprintf(stderr, "%f\n%f\n\n\n", d1, d2);
assert(false);
}
}
/*
* print an array of doubles to stderr
*/
void print_vecd( int n, double * v )
{
/** print an array of doubles to stderr */
void print_vecd(int n, double * v) {
int i;
for(i=0; i<n; i++)
{
fprintf( stderr, "%.2f ", v[i] );
for (i=0; i<n; i++) {
fprintf(stderr, "%.2f ", v[i]);
}
printf("\n");
}
/*
bool eq_vecd2( int n, double * v1, double * v2, double err)
bool eq_vecd2(int n, double * v1, double * v2, double err)
{
if( fabs( d1 - d2 ) > err )
if(fabs(d1 - d2) > err)
return false
return true;
}
void assert_eqvd( int n, double * v1, double * v2, double err){
if( eq_vecd( n, v1, v2, err ) != true ){
print_vecd( n, v1 );
print_vecd( n, v2 );
void assert_eqvd(int n, double * v1, double * v2, double err){
if(eq_vecd(n, v1, v2, err) != true){
print_vecd(n, v1);
print_vecd(n, v2);
}
}
*/
int main(void)
{
int main(void) {
int info, ipiv2[2];
float err = 1e-6;
float x2[2], b2[2], c2[2];
float a2x2[2][2];
//#cblas
/* cblas */
{
x2[0] = 1.0;
x2[1] = -2.0;
assert_eqd( cblas_snrm2(2, x2, 1), sqrt(5.0), err );
assert_eqd(cblas_snrm2(2, x2, 1), sqrt(5.0), err);
}
//#lapacke
/* lapacke */
{
/*
sgesv
//1 2 x = 5
//3 4 y 11
Matrix vector multiply.
*/
{
a2x2[0][0] = 1.0;
a2x2[1][0] = 2.0;
a2x2[0][1] = 3.0;
a2x2[1][1] = 4.0;
b2[0] = 5.;
b2[1] = 11.;
//x = 1
//y = 1
a2x2[0][0] = 1.0;
a2x2[1][0] = 2.0;
a2x2[0][1] = 3.0;
a2x2[1][1] = 4.0;
b2[0] = 5.;
b2[1] = 11.;
info = LAPACKE_sgesv(
LAPACK_COL_MAJOR, //COL or ROW
2, //n
1, //nrhs
&a2x2[0][0],
2, //lda
&ipiv2[0],
&b2[0],
2 //ldb
);
c2[0] = 1.0;
c2[1] = 2.0;
assert_eqi( info, 0 );
assert_eqd( b2[0], c2[0], err );
assert_eqd( b2[1], c2[1], err );
info = LAPACKE_sgesv(
LAPACK_COL_MAJOR,
2,
1,
&a2x2[0][0],
2,
&ipiv2[0],
&b2[0],
2
);
c2[0] = 1.0;
c2[1] = 2.0;
assert_eqi(info, 0);
assert_eqd(b2[0], c2[0], err);
assert_eqd(b2[1], c2[1], err);
}
}
return EXIT_SUCCESS;
}

2
lapack/configure vendored
View File

@ -1,7 +1,7 @@
#!/usr/bin/env bash
sudo aptitude update
# BLAS C/Fotran and LAPACK Fortran:
sudo aptitude install -y liblapack-dev
sudo aptitude install -y liblapack-dev liblapacke-dev
# Lapack C via LAPACKE:
# TODO: how to install on Ubuntu 12.04?
# The following works only on later Ubuntu

View File

@ -171,7 +171,7 @@
!return status returned on `info`:
!pivots returned on `pivots`:
call sgesv( n, nrhs, a2x2, lda, pivots, b2, ldb, info )
c2(1) = 1.0
c2(2) = 2.0
call assert_eqi( info, 0 )

View File

@ -1 +1,3 @@
LIBS := -lm -pthread -lOpenCL
LIBS := -lm -pthread -lOpenCL -lblas
# blas from libatlas-base-dev
# cblas from libblas-dev

View File

@ -26,10 +26,12 @@ Articles:
#include "common.h"
#include <cblas.h>
#define VECTOR_SIZE (4 * sizeof(F))
typedef cl_float F;
typedef cl_float F4 __attribute__ ((vector_size(VECTOR_SIZE)));
typedef F F4 __attribute__ ((vector_size(VECTOR_SIZE)));
typedef void (*MatMul)(const F *A, const F *B, F *C, size_t n);
/* No, this was not created for debugging, my code is flawless from the first try. */
@ -166,31 +168,61 @@ void mat_mul_cpu_trans_vec(const F *A, const F *B, F *C, size_t n) {
mat_trans((F*)B, n);
}
size_t zmin(size_t x, size_t y) {
return (x < y) ? x : y;
}
/* Blocked matrix multiplication.
* Assumes n is power of 2.
* TODO slower than transpose, sometimes similar timing to naive.*/
* TODO slower than transpose, sometimes similar timing to naive.
* Why do they say that this is better? */
void mat_mul_cpu_block(const F *A, const F *B, F *C, size_t n) {
size_t nb = sqrt(n);
size_t ib, jb, i, j, i2, j2, k;
size_t ib, jb, kb, i, j, k, i_max, j_max, k_max, nb ;
F tmp;
for (ib = 0; ib < nb; ++ib) {
for (jb = 0; jb < nb; ++jb) {
for (i = 0; i < nb; ++i) {
for (j = 0; j < nb; ++j) {
i2 = nb * ib + i;
j2 = nb * jb + j;
tmp = 0.0;
for (k = 0; k < n; ++k) {
tmp += A[i2*n+k] * B[k*n+j2];
}
C[i2*n+j2] = tmp;
}
nb = lround(sqrt(n));
for (ib = 0; ib < n; ib += nb) {
i_max = zmin(ib + nb, n);
for (jb = 0; jb < n; jb += nb) {
k_max = zmin(jb + nb, n);
for (kb = 0; kb < n; kb += nb) {
j_max = zmin(kb + nb, n);
/* TODO would be cool to recurse here, but would require more offset parameters,
* likely similar to tose of CBLAS. */
for (i = ib; i < i_max; ++i) {
for (j = kb; j < j_max; ++j) {
tmp = 0.0;
for (k = jb; k < k_max; ++k) {
tmp += A[i*n+k] * B[k*n+j];
}
C[i*n+j] += tmp;
}
}
}
}
}
}
/* The golden single thread CPU standard. */
void mat_mul_cpu_cblas(const F *A, const F *B, F *C, size_t n) {
cblas_sgemm(
CblasRowMajor,
CblasNoTrans,
CblasNoTrans,
n,
n,
n,
1.0,
A,
n,
B,
n,
1.0,
C,
n
);
}
/* Simplest possible CL implementation. No speedup. */
void mat_mul_cl(const F *A, const F *B, F *C, size_t n) {
cl_mem buf_a, buf_b, buf_c;
@ -261,18 +293,34 @@ void mat_mul_cl_row(const F *A, const F *B, F *C, size_t n) {
common_deinit(&common);
}
double bench(MatMul f, const F *A, const F *B, F *C, F *C_ref, size_t n) {
double dt, time;
mat_zero(C, n);
time = common_get_nanos();
f(A, B, C, n);
dt = common_get_nanos() - time;
if (C_ref != NULL)
assert(mat_eq(C, C_ref, n));
printf("%f ", dt);
return dt;
}
int main(int argc, char **argv) {
srand(time(NULL));
double max_cpu_runtime;
/* TODO stop being lazy and use this. */
/*MatMul mat_mul[] = {*/
/*mat_mul_cpu,*/
/*mat_mul_cpu_trans,*/
/*mat_mul_cpu_trans_vec,*/
/*mat_mul_cl,*/
/*mat_mul_cl_row,*/
/*};*/
MatMul mat_mul_funcs[] = {
mat_mul_cpu_trans,
mat_mul_cpu_trans_vec,
mat_mul_cpu_block,
mat_mul_cpu_cblas,
/* Comment out, because this overflows GPU memory and blocks computer
* before the others get to meaningful times. */
/*mat_mul_cl,*/
mat_mul_cl_row,
};
/* CLI args. */
if (argc > 1) {
max_cpu_runtime = strtod(argv[1], NULL);
} else {
@ -291,26 +339,34 @@ int main(int argc, char **argv) {
};
enum N { n = 2 };
F C[n*n];
const F C_expect[] = {
const F C_ref[] = {
19.0, 22.0,
43.0, 50.0
};
mat_zero(C, n);
mat_mul_cpu(A, B, C, n);
assert(mat_eq(C, C_expect, n));
assert(mat_eq(C, C_ref, n));
mat_zero(C, n);
mat_mul_cpu_trans(A, B, C, n);
assert(mat_eq(C, C_expect, n));
assert(mat_eq(C, C_ref, n));
mat_zero(C, n);
mat_mul_cpu_block(A, B, C, n);
assert(mat_eq(C, C_ref, n));
mat_zero(C, n);
mat_mul_cpu_cblas(A, B, C, n);
assert(mat_eq(C, C_ref, n));
mat_zero(C, n);
mat_mul_cl(A, B, C, n);
assert(mat_eq(C, C_expect, n));
assert(mat_eq(C, C_ref, n));
mat_zero(C, n);
mat_mul_cl_row(A, B, C, n);
assert(mat_eq(C, C_expect, n));
assert(mat_eq(C, C_ref, n));
}
/* Unit test for vector implementation, which requires 4x4 matrices. */
@ -327,7 +383,7 @@ int main(int argc, char **argv) {
25.0, 26.0, 27.0, 28.0,
29.0, 30.0, 31.0, 32.0,
};
const F C_expect[] = {
const F C_ref[] = {
250.000000, 260.000000, 270.000000, 280.000000,
618.000000, 644.000000, 670.000000, 696.000000,
986.000000, 1028.000000, 1070.000000, 1112.000000,
@ -336,23 +392,19 @@ int main(int argc, char **argv) {
enum N { n = 4 };
F C[n*n];
mat_zero(C, n);
mat_mul_cpu_block(A, B, C, n);
assert(mat_eq(C, C_expect, n));
mat_zero(C, n);
mat_mul_cpu_trans_vec(A, B, C, n);
assert(mat_eq(C, C_expect, n));
assert(mat_eq(C, C_ref, n));
}
/* Benchmarks. */
{
double dt;
F *A = NULL, *B = NULL, *C = NULL, *C_ref = NULL;
double dt, time;
size_t n = 4, a_sizeof;
size_t f, n = 4, a_sizeof;
puts("#matmul");
puts("n cpu cpu_trans cpu_trans_vec cpu_block cl_row");
puts("n cpu cpu_trans cpu_trans_vec cpu_block cpu_cblas cl_row");
a_sizeof = n * n * sizeof(F);
A = aligned_alloc(VECTOR_SIZE, a_sizeof);
B = aligned_alloc(VECTOR_SIZE, a_sizeof);
@ -361,43 +413,15 @@ int main(int argc, char **argv) {
while(1) {
printf("%zu ", n);
if (A == NULL || B == NULL || C == NULL) {
printf("Could not allocate memory for n = %zu", n);
printf("Could not allocate memory for n = %zu. Aborting.", n);
break;
}
mat_rand(A, n);
mat_rand(B, n);
time = common_get_nanos();
mat_mul_cpu(A, B, C_ref, n);
dt = common_get_nanos() - time;
printf("%f ", dt);
time = common_get_nanos();
mat_mul_cpu_trans(A, B, C, n);
assert(mat_eq(C, C_ref, n));
printf("%f ", common_get_nanos() - time);
time = common_get_nanos();
mat_mul_cpu_trans_vec(A, B, C, n);
assert(mat_eq(C, C_ref, n));
printf("%f ", common_get_nanos() - time);
time = common_get_nanos();
mat_mul_cpu_block(A, B, C, n);
assert(mat_eq(C, C_ref, n));
printf("%f ", common_get_nanos() - time);
/* Comment out, because this overflows GPU memory and blocks computer
* before the others get to meaningful times. */
/*time = common_get_nanos();*/
/*mat_mul_cl(A, B, C, n);*/
/*printf("%f ", common_get_nanos() - time);*/
time = common_get_nanos();
mat_mul_cl_row(A, B, C, n);
printf("%f ", common_get_nanos() - time);
assert(mat_eq(C, C_ref, n));
dt = bench(mat_mul_cpu, A, B, C_ref, NULL, n);
for (f = 0; f < sizeof(mat_mul_funcs)/sizeof(mat_mul_funcs[0]); ++f) {
bench(mat_mul_funcs[f], A, B, C, C_ref, n);
}
puts("");
if (dt > max_cpu_runtime)
break;