From be18cd47f6223cbe3d7fc45fcecec1035dd4d1db Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sun, 1 Dec 2013 13:52:48 +0100 Subject: [PATCH] changed level3.c --- driver/level3/level3.c | 26 ++-------------- driver/level3/level3_thread.c | 70 +------------------------------------------ 2 files changed, 3 insertions(+), 93 deletions(-) diff --git a/driver/level3/level3.c b/driver/level3/level3.c index d87c5f5..5f74664 100644 --- a/driver/level3/level3.c +++ b/driver/level3/level3.c @@ -36,8 +36,6 @@ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ -// #define TIMING 1 - /* This file is a template for level 3 operation */ #ifndef BETA_OPERATION @@ -335,24 +333,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; -#if defined(BULLDOZER) && defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX) - if (min_jj >= 12*GEMM_UNROLL_N) min_jj = 12*GEMM_UNROLL_N; - else +#if ( defined(BULLDOZER) || defined(PILEDRIVER) || defined(HASWELL) ) && defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX) if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; else if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N; else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; -#elif defined(ARMV7) - if (min_jj >= 32) min_jj = 32; - else - if (min_jj >= 16) min_jj = 16; - else - if (min_jj >= 8) min_jj = 8; - else - if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; - #else + if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; #endif @@ -412,15 +400,6 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, #ifdef TIMING total = (double)outercost + (double)innercost + (double)kernelcost; -#ifdef ARMV7 - - printf( "Copy A : %5.2f Copy B: %5.2f Kernel : %5.2f\n", - innercost / total * 100., outercost / total * 100., - kernelcost / total * 100.); - - -#else - printf( "Copy A : %5.2f Copy B: %5.2f Kernel : %5.2f kernel Effi. : %5.2f Total Effi. : %5.2f\n", innercost / total * 100., outercost / total * 100., kernelcost / total * 100., @@ -428,7 +407,6 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, (double)(m_to - m_from) * (double)(n_to - n_from) * (double)k / total * 100. * (double)COMPSIZE / 2.); #endif -#endif return 0; } diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index 56c4d6e..ee1a8db 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -36,8 +36,6 @@ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ -// #define TIMING 1 - #ifndef CACHE_LINE_SIZE #define CACHE_LINE_SIZE 8 #endif @@ -235,21 +233,6 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, BLASLONG l1stride, l2size; #ifdef TIMING - -#ifdef ARMV7 - - unsigned long long rpcc_counter; - unsigned long long copy_A = 0; - unsigned long long copy_B = 0; - unsigned long long kernel = 0; - unsigned long long waiting1 = 0; - unsigned long long waiting2 = 0; - unsigned long long waiting3 = 0; - unsigned long long waiting6[MAX_CPU_NUMBER]; - unsigned long long ops = 0; - -#else - BLASULONG rpcc_counter; BLASULONG copy_A = 0; BLASULONG copy_B = 0; @@ -260,8 +243,6 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, BLASULONG waiting6[MAX_CPU_NUMBER]; BLASULONG ops = 0; -#endif - for (i = 0; i < args -> nthreads; i++) waiting6[i] = 0; #endif @@ -339,35 +320,15 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, min_l = k - ls; -#ifdef ARMV7_1 - if (min_l >= GEMM_Q / 4 * 2) { - min_l = GEMM_Q / 4; - } else { - if (min_l > GEMM_Q / 4) min_l = (min_l + 1) / 2; - } - -#else if (min_l >= GEMM_Q * 2) { min_l = GEMM_Q; } else { if (min_l > GEMM_Q) min_l = (min_l + 1) / 2; } -#endif l1stride = 1; min_i = m_to - m_from; -#ifdef ARMV7_1 - if (min_i >= GEMM_P / 4 * 2) { - min_i = GEMM_P / 4; - } else { - if (min_i > GEMM_P / 4) { - min_i = (min_i / 2 + GEMM_UNROLL_M - 1) & ~(GEMM_UNROLL_M - 1); - } else { - if (args -> nthreads == 1) l1stride = 0; - } - } -#else if (min_i >= GEMM_P * 2) { min_i = GEMM_P; } else { @@ -378,8 +339,6 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, } } -#endif - START_RPCC(); ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_from, sa); @@ -408,22 +367,12 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){ min_jj = MIN(n_to, xxx + div_n) - jjs; -#if defined(BULLDOZER) && defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX) - if (min_jj >= 12*GEMM_UNROLL_N) min_jj = 12*GEMM_UNROLL_N; - else +#if ( defined(BULLDOZER) || defined(PILEDRIVER) || defined(HASWELL) ) && defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX) if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; else if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N; else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; -#elif defined(ARMV7) - if (min_jj >= 16) min_jj = 16; - else - if (min_jj >= 8) min_jj = 8; - else - if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; - - #else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; @@ -555,21 +504,6 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, STOP_RPCC(waiting3); #ifdef TIMING - -#ifdef ARMV7 - - unsigned long long waiting = waiting1 + waiting2 + waiting3; - unsigned long long total = copy_A + copy_B + kernel + waiting; - - fprintf(stderr, "GEMM [%2ld] Copy_A : %6.2f Copy_B : %6.2f Wait1 : %6.2f Wait2 : %6.2f Wait3 : %6.2f Kernel : %6.2f", - mypos, (double)copy_A /(double)total * 100., (double)copy_B /(double)total * 100., - (double)waiting1 /(double)total * 100., - (double)waiting2 /(double)total * 100., - (double)waiting3 /(double)total * 100., - (double)kernel /(double)total * 100.); - -#else - BLASLONG waiting = waiting1 + waiting2 + waiting3; BLASLONG total = copy_A + copy_B + kernel + waiting; @@ -580,8 +514,6 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, (double)waiting3 /(double)total * 100., (double)ops/(double)kernel / 4. * 100.); -#endif - #if 0 fprintf(stderr, "GEMM [%2ld] Copy_A : %6.2ld Copy_B : %6.2ld Wait : %6.2ld\n", mypos, copy_A, copy_B, waiting); -- 2.7.4