Mirror of https://github.com/libretro/cpp-cheat.git, synced 2025-04-13 00:30:27 +00:00
Matrix multiplication with cblas, block works now, reformat lapack cheat a bit
This commit is contained in:
parent e4c287ac6d
commit b57712c0af
@@ -8,7 +8,7 @@ BLAS and LAPACK are:

- de-facto standards
- non-parallel
- non-parallel
- originally written in Fortran
@@ -30,8 +30,6 @@ BLAS contains low level functions such as:

- vector matrix multiplication
- matrix matrix multiplication

The BLAS project provides `cblas.h`, which contains a C interface for BLAS.
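As a minimal sketch of that C interface (not part of this commit; the file name and the `-lblas` vs `-lcblas` link flag are assumptions that depend on which package provides the library, see the installation notes below), here is a 2x2 matrix matrix multiplication with `cblas_sgemm`:

    /* cblas_sgemm_example.c (hypothetical file name).
     * Build, e.g.: gcc cblas_sgemm_example.c -o sgemm_example -lblas -lm */
    #include <assert.h>
    #include <math.h>

    #include <cblas.h>

    int main(void) {
        /* Row-major 2x2 matrices, C = alpha*A*B + beta*C with alpha = 1, beta = 0. */
        float a[] = {1.0f, 2.0f,
                     3.0f, 4.0f};
        float b[] = {5.0f, 6.0f,
                     7.0f, 8.0f};
        float c[4] = {0.0f};
        const float c_expect[] = {19.0f, 22.0f,
                                  43.0f, 50.0f};
        int i;
        cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                    2, 2, 2,     /* m, n, k */
                    1.0f, a, 2,  /* alpha, A, lda */
                    b, 2,        /* B, ldb */
                    0.0f, c, 2); /* beta, C, ldc */
        for (i = 0; i < 4; ++i)
            assert(fabsf(c[i] - c_expect[i]) < 1e-6f);
        return 0;
    }

Note that `beta` selects between overwriting and accumulating: the `mat_mul_cpu_cblas` wrapper added to `opencl/matmul.c` below passes `beta = 1.0`, so its callers zero `C` first.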
### LAPACK

LAPACK contains higher level functions such as:
@@ -61,21 +59,19 @@ Implements full BLAS, but only part of LAPACK.

Has C interface.

## Installation on Ubuntu
## C interface

### Fortran
The BLAS project provides `cblas.h`, which contains a C interface for BLAS (TODO but also an implementation?)

sudo aptitude install liblapack-dev liblapack-doc libblas-doc
Via atlas:

### C interface
sudo aptitude install libatlas-dev
gcc -lcblas

via atlas:

sudo aptitude install

via LAPACKE (`libblas-dev` already contains `cblas.h`):
Via LAPACKE (`libblas-dev` already contains `cblas.h`):

sudo aptitude install liblapacke-dev
gcc -lblas
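To sanity check the LAPACKE route, a minimal sketch along the lines of the `lapack/c.c` test further down (not part of this commit; the file name and link flags are assumptions, and on Ubuntu `liblapacke-dev` typically needs `-llapacke`, often together with `-llapack -lblas`):

    /* lapacke_sgesv_example.c (hypothetical file name).
     * Build, e.g.: gcc lapacke_sgesv_example.c -o sgesv_example -llapacke -llapack -lblas -lm */
    #include <assert.h>
    #include <math.h>

    #include <lapacke.h>

    int main(void) {
        /* Solve the same 2x2 system as the lapack/c.c test below:
         *     1*x + 2*y = 5
         *     3*x + 4*y = 11   =>  x = 1, y = 2 */
        float a[] = {1.0f, 2.0f,
                     3.0f, 4.0f};  /* row-major */
        float b[] = {5.0f, 11.0f}; /* right hand side, overwritten with the solution */
        lapack_int ipiv[2];
        lapack_int info = LAPACKE_sgesv(LAPACK_ROW_MAJOR, 2 /* n */, 1 /* nrhs */,
                                        a, 2 /* lda */, ipiv, b, 1 /* ldb */);
        assert(info == 0);
        assert(fabsf(b[0] - 1.0f) < 1e-5f);
        assert(fabsf(b[1] - 2.0f) < 1e-5f);
        return 0;
    }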
## Levels
@@ -85,6 +81,8 @@ via LAPACKE (`libblas-dev` already contains `cblas.h`):

## Function naming conventions

<http://www.netlib.org/lapack/lug/node24.html>

The functions are named according to the pattern:

XYYZZZ
@@ -108,6 +106,8 @@ Where:

- `ZZ`: computation to be done:

    - `SV`: SolVe linear system
    - `MM`: Matrix Multiply
    - `LS`: Least Squares (overdetermined system)
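For example, the two routines touched by this commit decompose as follows (`GE` is the standard code for a general, i.e. unsymmetric, matrix):

    sgesv = S (single precision real) + GE (general matrix) + SV (solve linear system)
    sgemm = S (single precision real) + GE (general matrix) + MM (matrix multiply)

The C wrappers keep the same core name, e.g. `LAPACKE_sgesv` and `cblas_sgemm` below.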
## Sources

@@ -118,3 +118,7 @@ Where:

- user's guide. algorithm info <http://www.netlib.org/lapack/lug/>
- <http://www.tat.physik.uni-tuebingen.de/~kley/lehre/ftn77/tutorial/blas.html>

## LAPACKE

<http://stackoverflow.com/questions/26875415/difference-between-lapacke-and-lapack>
126 lapack/c.c
@@ -14,103 +14,97 @@ interfaces provided by their respective projects, respectively through

#include <lapacke.h>

/**
* assert two integers are equal
* if not, print them to stderr and assert false
*/
void assert_eqi( int i1, int i2 )
{
if( i1 != i2 )
{
fprintf( stderr, "%d\n%d\n\n\n", i1, i2 );
assert( false );
* assert two integers are equal
* if not, print them to stderr and assert false
*/
void assert_eqi(int i1, int i2) {
if (i1 != i2) {
fprintf(stderr, "%d\n%d\n\n\n", i1, i2);
assert(false);
}
}

/*
* assert two doubles are equal within err precision
* if not, print them to stderr
*/
void assert_eqd( double d1, double d2, double err )
{
if( fabs( d1 - d2 ) > err )
{
fprintf( stderr, "%f\n%f\n\n\n", d1, d2 );
assert( false );
/**
* assert two doubles are equal within err precision
* if not, print them to stderr
*/
void assert_eqd(double d1, double d2, double err) {
if (fabs(d1 - d2) > err) {
fprintf(stderr, "%f\n%f\n\n\n", d1, d2);
assert(false);
}
}

/*
* print an array of doubles to stderr
*/
void print_vecd( int n, double * v )
{
/** print an array of doubles to stderr */
void print_vecd(int n, double * v) {
int i;
for(i=0; i<n; i++)
{
fprintf( stderr, "%.2f ", v[i] );
for (i=0; i<n; i++) {
fprintf(stderr, "%.2f ", v[i]);
}
printf("\n");
}

/*
bool eq_vecd2( int n, double * v1, double * v2, double err)
bool eq_vecd2(int n, double * v1, double * v2, double err)
{
if( fabs( d1 - d2 ) > err )
if(fabs(d1 - d2) > err)
return false
return true;
}

void assert_eqvd( int n, double * v1, double * v2, double err){
if( eq_vecd( n, v1, v2, err ) != true ){
print_vecd( n, v1 );
print_vecd( n, v2 );
void assert_eqvd(int n, double * v1, double * v2, double err){
if(eq_vecd(n, v1, v2, err) != true){
print_vecd(n, v1);
print_vecd(n, v2);
}
}
*/
int main(void)
{
int main(void) {
int info, ipiv2[2];
float err = 1e-6;
float x2[2], b2[2], c2[2];
float a2x2[2][2];

//#cblas
/* cblas */
{
x2[0] = 1.0;
x2[1] = -2.0;
assert_eqd( cblas_snrm2(2, x2, 1), sqrt(5.0), err );
assert_eqd(cblas_snrm2(2, x2, 1), sqrt(5.0), err);
}

//#lapacke
/* lapacke */
{
/*
sgesv

//1 2 x = 5
//3 4 y 11

Matrix vector multiply.
*/
{
a2x2[0][0] = 1.0;
a2x2[1][0] = 2.0;
a2x2[0][1] = 3.0;
a2x2[1][1] = 4.0;
b2[0] = 5.;
b2[1] = 11.;

//x = 1
//y = 1

a2x2[0][0] = 1.0;
a2x2[1][0] = 2.0;
a2x2[0][1] = 3.0;
a2x2[1][1] = 4.0;
b2[0] = 5.;
b2[1] = 11.;

info = LAPACKE_sgesv(
LAPACK_COL_MAJOR, //COL or ROW
2, //n
1, //nrhs
&a2x2[0][0],
2, //lda
&ipiv2[0],
&b2[0],
2 //ldb
);
c2[0] = 1.0;
c2[1] = 2.0;
assert_eqi( info, 0 );
assert_eqd( b2[0], c2[0], err );
assert_eqd( b2[1], c2[1], err );
info = LAPACKE_sgesv(
LAPACK_COL_MAJOR,
2,
1,
&a2x2[0][0],
2,
&ipiv2[0],
&b2[0],
2
);
c2[0] = 1.0;
c2[1] = 2.0;
assert_eqi(info, 0);
assert_eqd(b2[0], c2[0], err);
assert_eqd(b2[1], c2[1], err);
}
}

return EXIT_SUCCESS;
}
2 lapack/configure (vendored)
@@ -1,7 +1,7 @@
#!/usr/bin/env bash
sudo aptitude update
# BLAS C/Fortran and LAPACK Fortran:
sudo aptitude install -y liblapack-dev
sudo aptitude install -y liblapack-dev liblapacke-dev
# Lapack C via LAPACKE:
# TODO: how to install on Ubuntu 12.04?
# The following works only on later Ubuntu
@@ -171,7 +171,7 @@
!return status returned on `info`:
!pivots returned on `pivots`:
call sgesv( n, nrhs, a2x2, lda, pivots, b2, ldb, info )

c2(1) = 1.0
c2(2) = 2.0
call assert_eqi( info, 0 )
@@ -1 +1,3 @@
LIBS := -lm -pthread -lOpenCL
LIBS := -lm -pthread -lOpenCL -lblas
# blas from libatlas-base-dev
# cblas from libblas-dev
168 opencl/matmul.c
@@ -26,10 +26,12 @@ Articles:

#include "common.h"

#include <cblas.h>

#define VECTOR_SIZE (4 * sizeof(F))

typedef cl_float F;
typedef cl_float F4 __attribute__ ((vector_size(VECTOR_SIZE)));
typedef F F4 __attribute__ ((vector_size(VECTOR_SIZE)));
typedef void (*MatMul)(const F *A, const F *B, F *C, size_t n);

/* No, this was not created for debugging, my code is flawless from the first try. */
@@ -166,31 +168,61 @@ void mat_mul_cpu_trans_vec(const F *A, const F *B, F *C, size_t n) {
mat_trans((F*)B, n);
}

size_t zmin(size_t x, size_t y) {
return (x < y) ? x : y;
}

/* Blocked matrix multiplication.
* Assumes n is power of 2.
* TODO slower than transpose, sometimes similar timing to naive.*/
* TODO slower than transpose, sometimes similar timing to naive.
* Why do they say that this is better? */
void mat_mul_cpu_block(const F *A, const F *B, F *C, size_t n) {
size_t nb = sqrt(n);
size_t ib, jb, i, j, i2, j2, k;
size_t ib, jb, kb, i, j, k, i_max, j_max, k_max, nb ;
F tmp;

for (ib = 0; ib < nb; ++ib) {
for (jb = 0; jb < nb; ++jb) {
for (i = 0; i < nb; ++i) {
for (j = 0; j < nb; ++j) {
i2 = nb * ib + i;
j2 = nb * jb + j;
tmp = 0.0;
for (k = 0; k < n; ++k) {
tmp += A[i2*n+k] * B[k*n+j2];
}
C[i2*n+j2] = tmp;
}
nb = lround(sqrt(n));
for (ib = 0; ib < n; ib += nb) {
i_max = zmin(ib + nb, n);
for (jb = 0; jb < n; jb += nb) {
k_max = zmin(jb + nb, n);
for (kb = 0; kb < n; kb += nb) {
j_max = zmin(kb + nb, n);

/* TODO would be cool to recurse here, but would require more offset parameters,
* likely similar to those of CBLAS. */
for (i = ib; i < i_max; ++i) {
for (j = kb; j < j_max; ++j) {
tmp = 0.0;
for (k = jb; k < k_max; ++k) {
tmp += A[i*n+k] * B[k*n+j];
}
C[i*n+j] += tmp;
}
}
}
}
}
}

/* The golden single thread CPU standard. */
void mat_mul_cpu_cblas(const F *A, const F *B, F *C, size_t n) {
cblas_sgemm(
CblasRowMajor,
CblasNoTrans,
CblasNoTrans,
n,
n,
n,
1.0,
A,
n,
B,
n,
1.0,
C,
n
);
}

/* Simplest possible CL implementation. No speedup. */
void mat_mul_cl(const F *A, const F *B, F *C, size_t n) {
cl_mem buf_a, buf_b, buf_c;
@@ -261,18 +293,34 @@ void mat_mul_cl_row(const F *A, const F *B, F *C, size_t n) {
common_deinit(&common);
}

double bench(MatMul f, const F *A, const F *B, F *C, F *C_ref, size_t n) {
double dt, time;
mat_zero(C, n);
time = common_get_nanos();
f(A, B, C, n);
dt = common_get_nanos() - time;
if (C_ref != NULL)
assert(mat_eq(C, C_ref, n));
printf("%f ", dt);
return dt;
}

int main(int argc, char **argv) {
srand(time(NULL));
double max_cpu_runtime;
/* TODO stop being lazy and use this. */
/*MatMul mat_mul[] = {*/
/*mat_mul_cpu,*/
/*mat_mul_cpu_trans,*/
/*mat_mul_cpu_trans_vec,*/
/*mat_mul_cl,*/
/*mat_mul_cl_row,*/
/*};*/
MatMul mat_mul_funcs[] = {
mat_mul_cpu_trans,
mat_mul_cpu_trans_vec,
mat_mul_cpu_block,
mat_mul_cpu_cblas,
/* Comment out, because this overflows GPU memory and blocks computer
* before the others get to meaningful times. */
/*mat_mul_cl,*/
mat_mul_cl_row,
};

/* CLI args. */
if (argc > 1) {
max_cpu_runtime = strtod(argv[1], NULL);
} else {
@@ -291,26 +339,34 @@ int main(int argc, char **argv) {
};
enum N { n = 2 };
F C[n*n];
const F C_expect[] = {
const F C_ref[] = {
19.0, 22.0,
43.0, 50.0
};

mat_zero(C, n);
mat_mul_cpu(A, B, C, n);
assert(mat_eq(C, C_expect, n));
assert(mat_eq(C, C_ref, n));

mat_zero(C, n);
mat_mul_cpu_trans(A, B, C, n);
assert(mat_eq(C, C_expect, n));
assert(mat_eq(C, C_ref, n));

mat_zero(C, n);
mat_mul_cpu_block(A, B, C, n);
assert(mat_eq(C, C_ref, n));

mat_zero(C, n);
mat_mul_cpu_cblas(A, B, C, n);
assert(mat_eq(C, C_ref, n));

mat_zero(C, n);
mat_mul_cl(A, B, C, n);
assert(mat_eq(C, C_expect, n));
assert(mat_eq(C, C_ref, n));

mat_zero(C, n);
mat_mul_cl_row(A, B, C, n);
assert(mat_eq(C, C_expect, n));
assert(mat_eq(C, C_ref, n));
}

/* Unit test for vector implementation, which requires 4x4 matrices. */
@@ -327,7 +383,7 @@ int main(int argc, char **argv) {
25.0, 26.0, 27.0, 28.0,
29.0, 30.0, 31.0, 32.0,
};
const F C_expect[] = {
const F C_ref[] = {
250.000000, 260.000000, 270.000000, 280.000000,
618.000000, 644.000000, 670.000000, 696.000000,
986.000000, 1028.000000, 1070.000000, 1112.000000,
@@ -336,23 +392,19 @@ int main(int argc, char **argv) {
enum N { n = 4 };
F C[n*n];

mat_zero(C, n);
mat_mul_cpu_block(A, B, C, n);
assert(mat_eq(C, C_expect, n));

mat_zero(C, n);
mat_mul_cpu_trans_vec(A, B, C, n);
assert(mat_eq(C, C_expect, n));
assert(mat_eq(C, C_ref, n));
}

/* Benchmarks. */
{
double dt;
F *A = NULL, *B = NULL, *C = NULL, *C_ref = NULL;
double dt, time;
size_t n = 4, a_sizeof;
size_t f, n = 4, a_sizeof;

puts("#matmul");
puts("n cpu cpu_trans cpu_trans_vec cpu_block cl_row");
puts("n cpu cpu_trans cpu_trans_vec cpu_block cpu_cblas cl_row");
a_sizeof = n * n * sizeof(F);
A = aligned_alloc(VECTOR_SIZE, a_sizeof);
B = aligned_alloc(VECTOR_SIZE, a_sizeof);
@@ -361,43 +413,15 @@ int main(int argc, char **argv) {
while(1) {
printf("%zu ", n);
if (A == NULL || B == NULL || C == NULL) {
printf("Could not allocate memory for n = %zu", n);
printf("Could not allocate memory for n = %zu. Aborting.", n);
break;
}
mat_rand(A, n);
mat_rand(B, n);

time = common_get_nanos();
mat_mul_cpu(A, B, C_ref, n);
dt = common_get_nanos() - time;
printf("%f ", dt);

time = common_get_nanos();
mat_mul_cpu_trans(A, B, C, n);
assert(mat_eq(C, C_ref, n));
printf("%f ", common_get_nanos() - time);

time = common_get_nanos();
mat_mul_cpu_trans_vec(A, B, C, n);
assert(mat_eq(C, C_ref, n));
printf("%f ", common_get_nanos() - time);

time = common_get_nanos();
mat_mul_cpu_block(A, B, C, n);
assert(mat_eq(C, C_ref, n));
printf("%f ", common_get_nanos() - time);

/* Comment out, because this overflows GPU memory and blocks computer
* before the others get to meaningful times. */
/*time = common_get_nanos();*/
/*mat_mul_cl(A, B, C, n);*/
/*printf("%f ", common_get_nanos() - time);*/

time = common_get_nanos();
mat_mul_cl_row(A, B, C, n);
printf("%f ", common_get_nanos() - time);
assert(mat_eq(C, C_ref, n));

dt = bench(mat_mul_cpu, A, B, C_ref, NULL, n);
for (f = 0; f < sizeof(mat_mul_funcs)/sizeof(mat_mul_funcs[0]); ++f) {
bench(mat_mul_funcs[f], A, B, C, C_ref, n);
}
puts("");
if (dt > max_cpu_runtime)
break;