// Run with OPENBLAS_NUM_THREADS=1 and OMP_NUM_THREADS=n.
// Number of loops for a 1x1 matrix. Lower it if the test is
// too slow on your computer.
// Parameters of one benchmark run.
// NOTE(review): only the bench_func and create_matrix members were present
// at this spot; the other members and the typedef wrapper are reconstructed
// from how the benchmark functions in this file use `BenchParam` -- confirm
// against the original definition.
typedef struct {
    int matrix_size;                     // dimension passed to the BLAS routine
    int n_loop;                          // loop budget; benchmarks divide it by matrix_size
    void (* bench_func)();               // benchmark driver, called with a BenchParam *
    void (* blas_func)();                // routine under test; unprototyped because the
                                         // argument list differs from routine to routine
    void * (* create_matrix)(int size);  // allocates and fills a buffer of `size` elements
} BenchParam;
// Allocate a single-precision real buffer of `size` elements and set
// element i to 1e3 * i / size.
//
// Returns the buffer as void * (the caller releases it with free()),
// or NULL when the allocation fails.
void * s_create_matrix(int size) {
    // Size the allocation from the element type actually stored (float).
    float * r = malloc(size * sizeof *r);
    int i;

    if (r == NULL) {
        return NULL;
    }
    for (i = 0; i < size; i++) {
        r[i] = 1e3 * i / size;
    }
    return r;
}
// Allocate a single-precision buffer holding 2 * size floats (two floats
// per element, as used for complex data) and set float i to 1e3 * i / size.
//
// Returns the buffer as void * (the caller releases it with free()),
// or NULL when the allocation fails.
void * c_create_matrix(int size) {
    // Size the allocation from the element type actually stored (float).
    float * r = malloc(size * 2 * sizeof *r);
    int i;

    if (r == NULL) {
        return NULL;
    }
    for (i = 0; i < 2 * size; i++) {
        r[i] = 1e3 * i / size;
    }
    return r;
}
// Allocate a double-precision buffer holding 2 * size doubles (two doubles
// per element, as used for complex data) and set double i to 1e3 * i / size.
//
// Returns the buffer as void * (the caller releases it with free()),
// or NULL when the allocation fails.
void * z_create_matrix(int size) {
    double * r = malloc(size * 2 * sizeof *r);
    int i;

    if (r == NULL) {
        return NULL;
    }
    for (i = 0; i < 2 * size; i++) {
        r[i] = 1e3 * i / size;
    }
    return r;
}
// Allocate a double-precision real buffer of `size` elements and set
// element i to 1e3 * i / size.
//
// Returns the buffer as void * (the caller releases it with free()),
// or NULL when the allocation fails.
void * d_create_matrix(int size) {
    double * r = malloc(size * sizeof *r);
    int i;

    if (r == NULL) {
        return NULL;
    }
    for (i = 0; i < size; i++) {
        r[i] = 1e3 * i / size;
    }
    return r;
}
56 void trmv_bench(BenchParam * param)
59 int size = param->matrix_size;
60 n = param->n_loop / size;
62 void * A = param->create_matrix(size * size);
63 void * y = param->create_matrix(size);
64 for(i = 0; i < n; i++) {
65 param->blas_func("U", "N", "N", &size, A, &size, y, &one);
71 void gemv_bench(BenchParam * param)
74 int size = param->matrix_size;
75 n = param->n_loop / size;
78 void * A = param->create_matrix(size * size);
79 void * y = param->create_matrix(size);
80 for(i = 0; i < n; i++) {
81 param->blas_func("N", &size, &size, &v, A, &size, y, &one, &v, y, &one);
87 void ger_bench(BenchParam * param) {
89 int size = param->matrix_size;
90 n = param->n_loop / size;
93 void * A = param->create_matrix(size * size);
94 void * y = param->create_matrix(size);
95 for(i = 0; i < n; i++) {
96 param->blas_func(&size, &size, &v, y, &one, y, &one, A, &size);
103 void * pthread_func_wrapper(void * param) {
104 ((BenchParam *)param)->bench_func(param);
110 void * TESTS[4 * NB_TESTS] = {
111 trmv_bench, ztrmv_, z_create_matrix, "ztrmv",
112 gemv_bench, dgemv_, d_create_matrix, "dgemv",
113 gemv_bench, zgemv_, z_create_matrix, "zgemv",
114 ger_bench, dger_, d_create_matrix, "dger",
115 ger_bench, zgerc_, z_create_matrix, "zgerc",
// Seconds elapsed between `tick` (a CLOCK_MONOTONIC reading taken earlier
// by the caller) and now, as a double with nanosecond resolution.
static inline double delta_time(struct timespec tick) {
    struct timespec tock;

    clock_gettime(CLOCK_MONOTONIC, &tock);
    // The nanosecond difference may be negative; adding it to the whole
    // second difference still yields the correct elapsed time.
    return (double)(tock.tv_sec - tick.tv_sec)
         + (double)(tock.tv_nsec - tick.tv_nsec) / 1e9;
}
124 double pthread_bench(BenchParam * param, int nb_threads)
129 BenchParam threaded_param = *param;
130 pthread_t threads[nb_threads];
132 struct timespec tick;
133 threaded_param.n_loop /= nb_threads;
134 clock_gettime(CLOCK_MONOTONIC, &tick);
135 for(t=0; t<nb_threads; t++){
136 rc = pthread_create(&threads[t], NULL, pthread_func_wrapper, &threaded_param);
138 printf("ERROR; return code from pthread_create() is %d\n", rc);
142 for(t=0; t<nb_threads; t++){
143 pthread_join(threads[t], NULL);
145 return delta_time(tick);
149 double seq_bench(BenchParam * param) {
150 struct timespec tick;
151 clock_gettime(CLOCK_MONOTONIC, &tick);
152 param->bench_func(param);
153 return delta_time(tick);
156 double omp_bench(BenchParam * param) {
157 BenchParam threaded_param = *param;
158 struct timespec tick;
160 int nb_threads = omp_get_max_threads();
161 threaded_param.n_loop /= nb_threads;
162 clock_gettime(CLOCK_MONOTONIC, &tick);
163 #pragma omp parallel for
164 for(t = 0; t < nb_threads; t ++){
165 param->bench_func(&threaded_param);
167 return delta_time(tick);
170 int main(int argc, char * argv[]) {
171 double inc_factor = exp(log((double)MAX_SIZE / MIN_SIZE) / NB_SIZE);
174 printf ("Running on %d threads\n", omp_get_max_threads());
175 for(test_id = 0; test_id < NB_TESTS; test_id ++) {
176 double size = MIN_SIZE;
177 param.bench_func = TESTS[test_id * 4];
178 param.blas_func = TESTS[test_id * 4 + 1];
179 param.create_matrix = TESTS[test_id * 4 + 2];
180 printf("\nBenchmark of %s\n", (char*)TESTS[test_id * 4 + 3]);
181 param.n_loop = NLOOP;
182 while(size <= MAX_SIZE) {
183 param.matrix_size = (int)(size + 0.5);
184 double seq_time = seq_bench(¶m);
185 double omp_time = omp_bench(¶m);
186 double pthread_time = pthread_bench(¶m, omp_get_max_threads());
187 printf("matrix size %d, sequential %gs, openmp %gs, speedup %g, "
188 "pthread %gs, speedup %g\n",
189 param.matrix_size, seq_time,
190 omp_time, seq_time / omp_time,
191 pthread_time, seq_time / pthread_time);