2 Copyright (c) 2005-2019 Intel Corporation
4 Licensed under the Apache License, Version 2.0 (the "License");
5 you may not use this file except in compliance with the License.
6 You may obtain a copy of the License at
8 http://www.apache.org/licenses/LICENSE-2.0
10 Unless required by applicable law or agreed to in writing, software
11 distributed under the License is distributed on an "AS IS" BASIS,
12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 See the License for the specific language governing permissions and
14 limitations under the License.
24 #include "mkl_lapack.h"
27 #include "tbb/tbb_config.h"
28 #include "tbb/flow_graph.h"
29 #include "tbb/tick_count.h"
30 #include "tbb/task_scheduler_init.h"
32 // Application command line arguments parsing
33 #include "../../common/utility/utility.h"
35 /************************************************************
37 ************************************************************/
39 /**********************************************
40 Read or generate a positive-definite matrix
41 -- reads from file if fname != NULL
42 -- sets n to matrix size
43 -- allocates and reads values in to A
44 -- otherwise generates a matrix
45 -- uses n to determine size
46 -- allocates and generates values in to A
47 **********************************************/
48 void matrix_init( double * &A, int &n, const char *fname );
50 /**********************************************
51 Writes a lower triangular matrix to a file
52 -- first line of file is n
53 -- subsequently 1 row per line
54 **********************************************/
55 void matrix_write ( double *A, int n, const char *fname, bool is_triangular = false );
57 /************************************************************
59 ************************************************************/
60 bool g_benchmark_run = false;
61 int g_num_tbb_threads = tbb::task_scheduler_init::default_num_threads();
62 int g_n = -1, g_b = -1, g_num_trials = 1;
63 char *g_input_file_name = NULL;
64 char *g_output_prefix = NULL;
65 std::string g_alg_name;
67 // Creates tiled array
68 static double ***create_tile_array( double *A, int n, int b ) {
70 double ***tile = (double ***)calloc( sizeof( double ** ), p );
72 for ( int j = 0; j < p; ++j ) {
73 tile[j] = (double **)calloc( sizeof( double * ), p );
76 for ( int j = 0; j < p; ++j ) {
77 for ( int i = 0; i < p; ++i ) {
78 double *temp_block = (double *)calloc( sizeof( double ), b*b );
80 for ( int A_j = j*b, T_j = 0; T_j < b; ++A_j, ++T_j ) {
81 for ( int A_i = i*b, T_i = 0; T_i < b; ++A_i, ++T_i ) {
82 temp_block[T_j*b+T_i] = A[A_j*n+A_i];
86 tile[j][i] = temp_block;
92 static void collapse_tile_array( double ***tile, double *A, int n, int b ) {
95 for ( int j = 0; j < p; ++j ) {
96 for ( int i = 0; i < p; ++i ) {
97 double *temp_block = tile[j][i];
99 for ( int A_j = j*b, T_j = 0; T_j < b; ++A_j, ++T_j ) {
100 for ( int A_i = i*b, T_i = 0; T_i < b; ++A_i, ++T_i ) {
101 A[A_j*n+A_i] = temp_block[T_j*b+T_i];
115 /************************************************************
116 Helper base class: algorithm
117 ************************************************************/
123 bool check_if_valid( double *A0, double *C, double *A, int n ) {
124 char transa = 'n', transb = 't';
128 for ( int i = 0; i < n; ++i ) {
129 for ( int j = i+1; j < n; ++j ) {
134 dgemm ( &transa, &transb, &n, &n, &n, &alpha, A0, &n, A0, &n, &beta, C, &n );
136 for ( int j = 0; j < n; ++j ) {
137 for ( int i = 0; i < n; ++i ) {
138 const double epsilon = std::abs( A[j*n+i]*0.1 );
140 if ( std::abs( C[j*n+i] - A[j*n+i] ) > epsilon ) {
141 printf( "ERROR: %s did not validate at C(%d,%d) = %lf != A(%d,%d) = %lf\n",
142 name.c_str(), i, j, C[j*n+i], i, j, A[j*n+i] );
143 printf( "ERROR: %g; %g < %g < %g\n", epsilon, A[j*n+i] - epsilon, C[j*n+i], A[j*n+i] + epsilon );
152 algorithm( const std::string& alg_name, bool t ) : name(alg_name), is_tiled(t) {}
154 double operator() ( double *A, int n, int b, int trials ) {
155 tbb::tick_count t0, t1;
156 double elapsed_time = 0.0;
157 double *A0 = (double *)calloc( sizeof( double ), n*n );
158 double *C = (double *)calloc( sizeof( double ), n*n );
160 for ( int t = 0; t < trials+1; ++t ) {
162 double ***tile = create_tile_array( A, n, b );
163 t0 = tbb::tick_count::now();
165 t1 = tbb::tick_count::now();
167 collapse_tile_array( tile, A0, n, b );
170 memcpy( A0, A, sizeof( double )*n*n );
171 t0 = tbb::tick_count::now();
173 t1 = tbb::tick_count::now();
176 if ( t ) elapsed_time += (t1-t0).seconds();
178 if( !g_benchmark_run && !check_if_valid( A0, C, A, n ) ) {
179 if ( g_output_prefix ) {
180 std::string s( g_output_prefix );
181 s += "_" + name + ".txt";
182 matrix_write( A0, g_n, s.c_str(), true );
190 if ( g_output_prefix ) {
191 std::string s( g_output_prefix );
192 s += "_" + name + ".txt";
193 matrix_write( A0, g_n, s.c_str(), true );
196 printf( "%s %d %d %d %d %lf %lf\n", name.c_str(), g_num_tbb_threads, trials, n, b, elapsed_time, elapsed_time/trials );
203 // Main algorithm body function must be defined in any direved class
204 virtual void func( void * ptr, int n, int b ) = 0;
207 /***********************************************************/
209 static void call_dpotf2( double ***tile, int b, int k ) {
210 double *A_block = tile[k][k];
213 dpotf2( &uplo, &b, A_block, &b, &info );
217 static void call_dtrsm( double ***tile, int b, int k, int j ) {
218 double *A_block = tile[k][j];
219 double *L_block = tile[k][k];
220 char uplo = 'l', side = 'r', transa = 't', diag = 'n';
222 dtrsm( &side, &uplo, &transa, &diag, &b, &b, &alpha, L_block, &b, A_block, &b );
226 static void call_dsyr2k( double ***tile, int b, int k, int j, int i ) {
227 double *A_block = tile[i][j];
228 char transa = 'n', transb = 't';
233 if ( i == j ) { // Diagonal block
234 double *L_block = tile[k][i];
235 dsyrk( &uplo, &transa, &b, &b, &alpha, L_block, &b, &beta, A_block, &b );
236 } else { // Non-diagonal block
237 double *L2_block = tile[k][i];
238 double *L1_block = tile[k][j];
239 dgemm( &transa, &transb, &b, &b, &b, &alpha, L1_block, &b, L2_block, &b, &beta, A_block, &b );
244 class algorithm_crout : public algorithm
247 algorithm_crout() : algorithm("crout_cholesky", true) {}
250 virtual void func( void * ptr, int n, int b ) {
251 double ***tile = (double ***)ptr;
254 for ( int k = 0; k < p; ++k ) {
255 call_dpotf2( tile, b, k );
257 for ( int j = k+1; j < p; ++j ) {
258 call_dtrsm( tile, b, k, j );
260 for ( int i = k+1; i <= j; ++i ) {
261 call_dsyr2k( tile, b, k, j, i );
268 class algorithm_dpotrf : public algorithm
271 algorithm_dpotrf() : algorithm("dpotrf_cholesky", false) {}
274 virtual void func( void * ptr, int n, int /* b */ ) {
275 double *A = (double *)ptr;
279 dpotrf( &uplo, &n, A, &lda, &info );
283 /************************************************************
284 Begin data join graph based version of cholesky
285 ************************************************************/
292 typedef double * tile_t;
294 typedef std::pair< tag_t, tile_t > tagged_tile_t;
295 typedef tbb::flow::tuple< tagged_tile_t > t1_t;
296 typedef tbb::flow::tuple< tagged_tile_t, tagged_tile_t > t2_t;
297 typedef tbb::flow::tuple< tagged_tile_t, tagged_tile_t, tagged_tile_t > t3_t;
299 typedef tbb::flow::multifunction_node< tagged_tile_t, t1_t > dpotf2_node_t;
300 typedef tbb::flow::multifunction_node< t2_t, t2_t > dtrsm_node_t;
301 typedef tbb::flow::multifunction_node< t3_t, t3_t > dsyr2k_node_t;
303 typedef tbb::flow::join_node< t2_t, tbb::flow::tag_matching > dtrsm_join_t;
304 typedef tbb::flow::join_node< t3_t, tbb::flow::tag_matching > dsyr2k_join_t;
310 dpotf2_body( int p_, int b_ ) : p(p_), b(b_) {}
312 void operator()( const tagged_tile_t &in, dpotf2_node_t::output_ports_type &ports ) {
313 int k = in.first.a[0];
314 tile_t A_block = in.second;
320 dpotf2( &uplo, &b, A_block, &b, &info );
322 // Send to dtrsms in same column
325 for ( int j = k+1; j < p; ++j ) {
327 tbb::flow::get<0>( ports ).try_put( std::make_pair( t, A_block ) );
336 dtrsm_body( int p_, int b_ ) : p(p_), b(b_) {}
338 void operator()( const t2_t &in, dtrsm_node_t::output_ports_type &ports ) {
339 using tbb::flow::get;
341 tagged_tile_t in0 = get<0>( in );
342 tagged_tile_t in1 = get<1>( in );
343 int k = in0.first.a[0];
344 int j = in0.first.a[1];
345 tile_t L_block = in0.second;
346 tile_t A_block = in1.second;
350 char uplo = 'l', side = 'r', transa = 't', diag = 'n';
352 dtrsm( &side, &uplo, &transa, &diag, &b, &b, &alpha, L_block, &b, A_block, &b);
354 // Send to rest of my row
356 for ( int i = k+1; i <= j; ++i ) {
358 get<0>( ports ).try_put( std::make_pair( t, A_block ) );
361 // Send to transposed row
363 for ( int i = j; i < p; ++i ) {
365 get<1>( ports ).try_put( std::make_pair( t, A_block ) );
374 dsyr2k_body( int p_, int b_ ) : p(p_), b(b_) {}
376 void operator()( const t3_t &in, dsyr2k_node_t::output_ports_type &ports ) {
377 using tbb::flow::get;
381 char transa = 'n', transb = 't';
386 tagged_tile_t in0 = get<0>( in );
387 tagged_tile_t in1 = get<1>( in );
388 tagged_tile_t in2 = get<2>( in );
389 int k = in2.first.a[0];
390 int j = in2.first.a[1];
391 int i = in2.first.a[2];
393 tile_t A_block = in2.second;
394 if ( i == j ) { // Diagonal block
395 tile_t L_block = in0.second;
396 dsyrk( &uplo, &transa, &b, &b, &alpha, L_block, &b, &beta, A_block, &b );
397 } else { // Non-diagonal block
398 tile_t L1_block = in0.second;
399 tile_t L2_block = in1.second;
400 dgemm( &transa, &transb, &b, &b, &b, &alpha, L1_block, &b, L2_block, &b, &beta, A_block, &b );
403 // All outputs flow to next step
407 if ( k != p-1 && j == k+1 && i == k+1 ) {
408 get<0>( ports ).try_put( std::make_pair( t, A_block ) );
412 if ( i == k+1 && j > i ) {
415 get<1>( ports ).try_put( std::make_pair( t, A_block ) );
418 if ( j != k+1 && i != k+1 ) {
422 get<2>( ports ).try_put( std::make_pair( t, A_block ) );
428 struct tagged_tile_to_size_t {
429 size_t operator()( const tagged_tile_t &t ) {
434 class algorithm_join : public algorithm
437 algorithm_join() : algorithm("data_join_cholesky", true) {}
440 virtual void func( void * ptr, int n, int b ) {
441 using tbb::flow::unlimited;
442 using tbb::flow::output_port;
443 using tbb::flow::input_port;
445 double ***tile = (double ***)ptr;
449 dpotf2_node_t dpotf2_node( g, unlimited, dpotf2_body(p, b) );
450 dtrsm_node_t dtrsm_node( g, unlimited, dtrsm_body(p, b) );
451 dsyr2k_node_t dsyr2k_node( g, unlimited, dsyr2k_body(p, b) );
452 dtrsm_join_t dtrsm_join( g, tagged_tile_to_size_t(), tagged_tile_to_size_t() );
453 dsyr2k_join_t dsyr2k_join( g, tagged_tile_to_size_t(), tagged_tile_to_size_t(), tagged_tile_to_size_t() );
455 make_edge( output_port<0>( dsyr2k_node ), dpotf2_node );
457 make_edge( output_port<0>( dpotf2_node ), input_port<0>( dtrsm_join ) );
458 make_edge( output_port<1>( dsyr2k_node ), input_port<1>( dtrsm_join ) );
459 make_edge( dtrsm_join, dtrsm_node );
461 make_edge( output_port<0>( dtrsm_node ), input_port<0>( dsyr2k_join ) );
462 make_edge( output_port<1>( dtrsm_node ), input_port<1>( dsyr2k_join ) );
463 make_edge( output_port<2>( dsyr2k_node ), input_port<2>( dsyr2k_join ) );
464 make_edge( dsyr2k_join, dsyr2k_node );
466 // Now we need to send out the tiles to their first nodes
473 // Send to feedback input of first dpotf2
474 // k == 0, j == 0, i == 0
475 dpotf2_node.try_put( std::make_pair( t, tile[0][0] ) );
477 // Send to feedback input (port 1) of each dtrsm
478 // k == 0, j == 1..p-1
479 for ( int j = 1; j < p; ++j ) {
481 input_port<1>( dtrsm_join ).try_put( std::make_pair( t, tile[0][j] ) );
484 // Send to feedback input (port 2) of each dsyr2k
486 for ( int i = 1; i < p; ++i ) {
489 for ( int j = i; j < p; ++j ) {
491 input_port<2>( dsyr2k_join ).try_put( std::make_pair( t, tile[i][j] ) );
499 /************************************************************
500 End data join graph based version of cholesky
501 ************************************************************/
503 /************************************************************
504 Begin dependence graph based version of cholesky
505 ************************************************************/
507 typedef tbb::flow::continue_node< tbb::flow::continue_msg > continue_type;
508 typedef continue_type * continue_ptr_type;
510 #if !__TBB_CPP11_LAMBDAS_PRESENT
511 // Using helper functor classes (instead of built-in C++ 11 lambda functions)
512 class call_dpotf2_functor
517 call_dpotf2_functor( double ***tile_, int b_, int k_ )
518 : tile(tile_), b(b_), k(k_) {}
520 void operator()( const tbb::flow::continue_msg & ) { call_dpotf2( tile, b, k ); }
523 class call_dtrsm_functor
528 call_dtrsm_functor( double ***tile_, int b_, int k_, int j_ )
529 : tile(tile_), b(b_), k(k_), j(j_) {}
531 void operator()( const tbb::flow::continue_msg & ) { call_dtrsm( tile, b, k, j ); }
534 class call_dsyr2k_functor
539 call_dsyr2k_functor( double ***tile_, int b_, int k_, int j_, int i_ )
540 : tile(tile_), b(b_), k(k_), j(j_), i(i_) {}
542 void operator()( const tbb::flow::continue_msg & ) { call_dsyr2k( tile, b, k, j, i ); }
545 #endif // !__TBB_CPP11_LAMBDAS_PRESENT
547 class algorithm_depend : public algorithm
550 algorithm_depend() : algorithm("depend_cholesky", true) {}
553 virtual void func( void * ptr, int n, int b ) {
554 double ***tile = (double ***)ptr;
557 continue_ptr_type *c = new continue_ptr_type[p];
558 continue_ptr_type **t = new continue_ptr_type *[p];
559 continue_ptr_type ***u = new continue_ptr_type **[p];
562 for ( int k = p-1; k >= 0; --k ) {
563 c[k] = new continue_type( g,
564 #if __TBB_CPP11_LAMBDAS_PRESENT
565 [=]( const tbb::flow::continue_msg & ) { call_dpotf2( tile, b, k ); } );
567 call_dpotf2_functor( tile, b, k ) );
568 #endif // __TBB_CPP11_LAMBDAS_PRESENT
569 t[k] = new continue_ptr_type[p];
570 u[k] = new continue_ptr_type *[p];
572 for ( int j = k+1; j < p; ++j ) {
573 t[k][j] = new continue_type( g,
574 #if __TBB_CPP11_LAMBDAS_PRESENT
575 [=]( const tbb::flow::continue_msg & ) { call_dtrsm( tile, b, k, j ); } );
577 call_dtrsm_functor( tile, b, k, j ) );
578 #endif // __TBB_CPP11_LAMBDAS_PRESENT
579 make_edge( *c[k], *t[k][j] );
580 u[k][j] = new continue_ptr_type[p];
582 for ( int i = k+1; i <= j; ++i ) {
583 u[k][j][i] = new continue_type( g,
584 #if __TBB_CPP11_LAMBDAS_PRESENT
585 [=]( const tbb::flow::continue_msg & ) { call_dsyr2k( tile, b, k, j, i ); } );
587 call_dsyr2k_functor( tile, b, k, j, i ) );
588 #endif // __TBB_CPP11_LAMBDAS_PRESENT
590 if ( k < p-2 && k+1 != j && k+1 != i ) {
591 make_edge( *u[k][j][i], *u[k+1][j][i] );
594 make_edge( *t[k][j], *u[k][j][i] );
597 make_edge( *t[k][i], *u[k][j][i] );
600 if ( k < p-2 && j > i && i == k+1 ) {
601 make_edge( *u[k][j][i], *t[i][j] );
607 make_edge( *u[k][k+1][k+1], *c[k+1] );
611 c[0]->try_put( tbb::flow::continue_msg() );
614 }; // class algorithm_depend
616 /************************************************************
617 End dependence graph based version of cholesky
618 ************************************************************/
620 bool process_args( int argc, char *argv[] ) {
621 utility::parse_cli_arguments( argc, argv,
622 utility::cli_argument_pack()
623 //"-h" option for displaying help is present implicitly
624 .positional_arg( g_n, "size", "the row/column size of NxN matrix (size <= 46000)" )
625 .positional_arg( g_b, "blocksize", "the block size; size must be a multiple of the blocksize" )
626 .positional_arg( g_num_trials, "num_trials", "the number of times to run each algorithm" )
627 .positional_arg( g_output_prefix, "output_prefix",
628 "if provided the prefix will be preappended to output files:\n"
629 " output_prefix_posdef.txt\n"
630 " output_prefix_X.txt; where X is the algorithm used\n"
631 " if output_prefix is not provided, no output will be written" )
632 .positional_arg( g_alg_name, "algorithm", "name of the used algorithm - can be dpotrf, crout, depend or join" )
633 .positional_arg( g_num_tbb_threads, "num_tbb_threads", "number of started TBB threads" )
635 .arg( g_input_file_name, "input_file", "if provided it will be read to get the input matrix" )
636 .arg( g_benchmark_run, "-x", "skips all validation" )
640 printf( "ERROR: invalid 'size' value (must be less or equal 46000): %d\n", g_n );
644 if ( g_n%g_b != 0 ) {
645 printf( "ERROR: size %d must be a multiple of the blocksize %d\n", g_n, g_b );
649 if ( g_n/g_b > 256 ) {
650 // Because tile index size is 1 byte only in tag_t type
651 printf( "ERROR: size / blocksize must be less or equal 256, but %d / %d = %d\n", g_n, g_b, g_n/g_b );
655 if ( g_b == -1 || (g_n == -1 && g_input_file_name == NULL) ) {
662 int main(int argc, char *argv[]) {
663 typedef std::map< std::string, algorithm * > algmap_t;
667 algmap.insert(std::pair<std::string, algorithm *>("dpotrf", new algorithm_dpotrf));
668 algmap.insert(std::pair<std::string, algorithm *>("crout", new algorithm_crout));
669 algmap.insert(std::pair<std::string, algorithm *>("depend", new algorithm_depend));
670 algmap.insert(std::pair<std::string, algorithm *>("join", new algorithm_join));
672 if ( !process_args( argc, argv ) ) {
673 printf( "ERROR: Invalid arguments. Run: %s -h\n", argv[0] );
677 tbb::task_scheduler_init init( g_num_tbb_threads );
681 matrix_init( A, g_n, g_input_file_name );
683 // Write input matrix if output_prefix is set and we didn't read from a file
684 if ( !g_input_file_name && g_output_prefix ) {
685 std::string s( g_output_prefix );
687 matrix_write( A, g_n, s.c_str() );
690 if ( g_alg_name.empty() ) {
691 for ( algmap_t::iterator i = algmap.begin(); i != algmap.end(); ++i ) {
692 algorithm* const alg = i->second;
693 (*alg)( A, g_n, g_b, g_num_trials );
697 algmap_t::iterator alg_iter = algmap.find(g_alg_name);
699 if ( alg_iter != algmap.end() ) {
700 algorithm* const alg = alg_iter->second;
701 (*alg)( A, g_n, g_b, g_num_trials );
704 printf( "ERROR: Invalid algorithm name: %s\n", g_alg_name.c_str() );