--- /dev/null
+// run with OPENBLAS_NUM_THREADS=1 and OMP_NUM_THREADS=n
+#include <math.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <time.h>
+#include <cblas.h>
+#include <omp.h>
+#define MIN_SIZE 5
+#define MAX_SIZE 60
+#define NB_SIZE 10
+
+// number of loop for a 1x1 matrix. Lower it if the test is
+// too slow on you computer.
+#define NLOOP 2e7
+
+typedef struct {
+ int matrix_size;
+ int n_loop;
+ void (* bench_func)();
+ void (* blas_func)();
+ void * (* create_matrix)(int size);
+} BenchParam;
+
+void * s_create_matrix(int size) {
+ float * r = malloc(size * sizeof(double));
+ for(int i = 0; i < size; i++)
+ r[i] = 1e3 * i / size;
+ return r;
+}
+
+void * c_create_matrix(int size) {
+ float * r = malloc(size * 2 * sizeof(double));
+ for(int i = 0; i < 2 * size; i++)
+ r[i] = 1e3 * i / size;
+ return r;
+}
+
+void * z_create_matrix(int size) {
+ double * r = malloc(size * 2 * sizeof(double));
+ for(int i = 0; i < 2 * size; i++)
+ r[i] = 1e3 * i / size;
+ return r;
+}
+
+void * d_create_matrix(int size) {
+ double * r = malloc(size * sizeof(double));
+ for(int i = 0; i < size; i++)
+ r[i] = 1e3 * i / size;
+ return r;
+}
+
+void trmv_bench(BenchParam * param)
+{
+ int i, n;
+ int size = param->matrix_size;
+ n = param->n_loop / size;
+ int one = 1;
+ void * A = param->create_matrix(size * size);
+ void * y = param->create_matrix(size);
+ for(i = 0; i < n; i++) {
+ param->blas_func("U", "N", "N", &size, A, &size, y, &one);
+ }
+ free(A);
+ free(y);
+}
+
+void gemv_bench(BenchParam * param)
+{
+ int i, n;
+ int size = param->matrix_size;
+ n = param->n_loop / size;
+ double v = 1.01;
+ int one = 1;
+ void * A = param->create_matrix(size * size);
+ void * y = param->create_matrix(size);
+ for(i = 0; i < n; i++) {
+ param->blas_func("N", &size, &size, &v, A, &size, y, &one, &v, y, &one);
+ }
+ free(A);
+ free(y);
+}
+
+void ger_bench(BenchParam * param) {
+ int i, n;
+ int size = param->matrix_size;
+ n = param->n_loop / size;
+ double v = 1.01;
+ int one = 1;
+ void * A = param->create_matrix(size * size);
+ void * y = param->create_matrix(size);
+ for(i = 0; i < n; i++) {
+ param->blas_func(&size, &size, &v, y, &one, y, &one, A, &size);
+ }
+ free(A);
+ free(y);
+}
+
+#ifndef _WIN32
+void * pthread_func_wrapper(void * param) {
+ ((BenchParam *)param)->bench_func(param);
+ pthread_exit(NULL);
+}
+#endif
+
+#define NB_TESTS 5
+void * TESTS[4 * NB_TESTS] = {
+ trmv_bench, ztrmv_, z_create_matrix, "ztrmv",
+ gemv_bench, dgemv_, d_create_matrix, "dgemv",
+ gemv_bench, zgemv_, z_create_matrix, "zgemv",
+ ger_bench, dger_, d_create_matrix, "dger",
+ ger_bench, zgerc_, z_create_matrix, "zgerc",
+};
+
+inline static double delta_time(struct timespec tick) {
+ struct timespec tock;
+ clock_gettime(CLOCK_MONOTONIC, &tock);
+ return (tock.tv_sec - tick.tv_sec) + (tock.tv_nsec - tick.tv_nsec) / 1e9;
+}
+
+double pthread_bench(BenchParam * param, int nb_threads)
+{
+#ifdef _WIN32
+ return 0;
+#else
+ BenchParam threaded_param = *param;
+ pthread_t threads[nb_threads];
+ int t, rc;
+ struct timespec tick;
+ threaded_param.n_loop /= nb_threads;
+ clock_gettime(CLOCK_MONOTONIC, &tick);
+ for(t=0; t<nb_threads; t++){
+ rc = pthread_create(&threads[t], NULL, pthread_func_wrapper, &threaded_param);
+ if (rc){
+ printf("ERROR; return code from pthread_create() is %d\n", rc);
+ exit(-1);
+ }
+ }
+ for(t=0; t<nb_threads; t++){
+ pthread_join(threads[t], NULL);
+ }
+ return delta_time(tick);
+#endif
+}
+
+double seq_bench(BenchParam * param) {
+ struct timespec tick;
+ clock_gettime(CLOCK_MONOTONIC, &tick);
+ param->bench_func(param);
+ return delta_time(tick);
+}
+
+double omp_bench(BenchParam * param) {
+ BenchParam threaded_param = *param;
+ struct timespec tick;
+ int t;
+ int nb_threads = omp_get_max_threads();
+ threaded_param.n_loop /= nb_threads;
+ clock_gettime(CLOCK_MONOTONIC, &tick);
+ #pragma omp parallel for
+ for(t = 0; t < nb_threads; t ++){
+ param->bench_func(&threaded_param);
+ }
+ return delta_time(tick);
+}
+
+int main(int argc, char * argv[]) {
+ double inc_factor = exp(log((double)MAX_SIZE / MIN_SIZE) / NB_SIZE);
+ BenchParam param;
+ int test_id;
+ printf ("Running on %d threads\n", omp_get_max_threads());
+ for(test_id = 0; test_id < NB_TESTS; test_id ++) {
+ double size = MIN_SIZE;
+ param.bench_func = TESTS[test_id * 4];
+ param.blas_func = TESTS[test_id * 4 + 1];
+ param.create_matrix = TESTS[test_id * 4 + 2];
+ printf("\nBenchmark of %s\n", (char*)TESTS[test_id * 4 + 3]);
+ param.n_loop = NLOOP;
+ while(size <= MAX_SIZE) {
+ param.matrix_size = (int)(size + 0.5);
+ double seq_time = seq_bench(¶m);
+ double omp_time = omp_bench(¶m);
+ double pthread_time = pthread_bench(¶m, omp_get_max_threads());
+ printf("matrix size %d, sequential %gs, openmp %gs, speedup %g, "
+ "pthread %gs, speedup %g\n",
+ param.matrix_size, seq_time,
+ omp_time, seq_time / omp_time,
+ pthread_time, seq_time / pthread_time);
+ size *= inc_factor;
+ }
+ }
+}