From 5400a9f4e4c30d7a961983f733b81b607d09ba2d Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sun, 3 Nov 2013 10:34:04 +0100 Subject: [PATCH] redefined functions for TIMING and YIELDING for ARMV7 processor --- common.h | 8 ++++++ common_arm.h | 8 ++++-- driver/level3/level3.c | 22 ++++++++++++++- driver/level3/level3_thread.c | 66 +++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 100 insertions(+), 4 deletions(-) diff --git a/common.h b/common.h index 418ed25..a277552 100644 --- a/common.h +++ b/common.h @@ -310,10 +310,18 @@ typedef int blasint; #define YIELDING SwitchToThread() #endif + +#ifdef ARMV7 +#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop; \n") /* eight nops; no trailing ';' in the macro */ +#endif + + #ifndef YIELDING #define YIELDING sched_yield() #endif + + /*** To alloc job_t on heap or statck. please https://github.com/xianyi/OpenBLAS/issues/246 diff --git a/common_arm.h b/common_arm.h index e3d1d40..8c9752d 100644 --- a/common_arm.h +++ b/common_arm.h @@ -104,11 +104,13 @@ static void __inline blas_lock(volatile BLASULONG *address){ } -static inline BLASULONG rpcc(void){ - BLASULONG ret=0; +static inline unsigned long long rpcc(void){ + unsigned long long ret=0; + double v; struct timeval tv; gettimeofday(&tv,NULL); - ret=1000000* tv.tv_sec + tv.tv_usec; + v=(double) tv.tv_sec + (double) tv.tv_usec * 1e-6; + ret = (unsigned long long) ( v * 1000.0 ); /* seconds -> milliseconds, truncated */ return ret; } diff --git a/driver/level3/level3.c b/driver/level3/level3.c index 2fe8895..d87c5f5 100644 --- a/driver/level3/level3.c +++ b/driver/level3/level3.c @@ -36,6 +36,8 @@ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ +// #define TIMING 1 + /* This file is a template for level 3 operation */ #ifndef BETA_OPERATION @@ -341,8 +343,16 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N; else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; -#else +#elif defined(ARMV7) + if (min_jj >= 32) min_jj = 32; + else + if (min_jj >= 16) min_jj = 16; + else + if (min_jj >= 8) min_jj = 8; + else + if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; +#else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; #endif @@ -402,6 +412,15 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, #ifdef TIMING total = (double)outercost + (double)innercost + (double)kernelcost; +#ifdef ARMV7 + + printf( "Copy A : %5.2f Copy B: %5.2f Kernel : %5.2f\n", + innercost / total * 100., outercost / total * 100., + kernelcost / total * 100.); + + +#else + printf( "Copy A : %5.2f Copy B: %5.2f Kernel : %5.2f kernel Effi. : %5.2f Total Effi. : %5.2f\n", innercost / total * 100., outercost / total * 100., kernelcost / total * 100., @@ -409,6 +428,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, (double)(m_to - m_from) * (double)(n_to - n_from) * (double)k / total * 100. * (double)COMPSIZE / 2.); #endif +#endif return 0; } diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index 3242790..56c4d6e 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -36,6 +36,8 @@ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ +// #define TIMING 1 + #ifndef CACHE_LINE_SIZE #define CACHE_LINE_SIZE 8 #endif @@ -233,6 +235,21 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, BLASLONG l1stride, l2size; #ifdef TIMING + +#ifdef ARMV7 + + unsigned long long rpcc_counter; + unsigned long long copy_A = 0; + unsigned long long copy_B = 0; + unsigned long long kernel = 0; + unsigned long long waiting1 = 0; + unsigned long long waiting2 = 0; + unsigned long long waiting3 = 0; + unsigned long long waiting6[MAX_CPU_NUMBER]; + unsigned long long ops = 0; + +#else + BLASULONG rpcc_counter; BLASULONG copy_A = 0; BLASULONG copy_B = 0; @@ -243,6 +260,8 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, BLASULONG waiting6[MAX_CPU_NUMBER]; BLASULONG ops = 0; +#endif + for (i = 0; i < args -> nthreads; i++) waiting6[i] = 0; #endif @@ -320,15 +339,35 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, min_l = k - ls; +#ifdef ARMV7_1 + if (min_l >= GEMM_Q / 4 * 2) { + min_l = GEMM_Q / 4; + } else { + if (min_l > GEMM_Q / 4) min_l = (min_l + 1) / 2; + } + +#else if (min_l >= GEMM_Q * 2) { min_l = GEMM_Q; } else { if (min_l > GEMM_Q) min_l = (min_l + 1) / 2; } +#endif l1stride = 1; min_i = m_to - m_from; +#ifdef ARMV7_1 + if (min_i >= GEMM_P / 4 * 2) { + min_i = GEMM_P / 4; + } else { + if (min_i > GEMM_P / 4) { + min_i = (min_i / 2 + GEMM_UNROLL_M - 1) & ~(GEMM_UNROLL_M - 1); + } else { + if (args -> nthreads == 1) l1stride = 0; + } + } +#else if (min_i >= GEMM_P * 2) { min_i = GEMM_P; } else { @@ -339,6 +378,8 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, } } +#endif + START_RPCC(); ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_from, sa); @@ -375,6 +416,14 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N; else if (min_jj 
> GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; +#elif defined(ARMV7) + if (min_jj >= 16) min_jj = 16; + else + if (min_jj >= 8) min_jj = 8; + else + if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; + + #else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; @@ -506,6 +555,21 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, STOP_RPCC(waiting3); #ifdef TIMING + +#ifdef ARMV7 + + unsigned long long waiting = waiting1 + waiting2 + waiting3; + unsigned long long total = copy_A + copy_B + kernel + waiting; + + fprintf(stderr, "GEMM [%2ld] Copy_A : %6.2f Copy_B : %6.2f Wait1 : %6.2f Wait2 : %6.2f Wait3 : %6.2f Kernel : %6.2f", + mypos, (double)copy_A /(double)total * 100., (double)copy_B /(double)total * 100., + (double)waiting1 /(double)total * 100., + (double)waiting2 /(double)total * 100., + (double)waiting3 /(double)total * 100., + (double)kernel /(double)total * 100.); + +#else + BLASLONG waiting = waiting1 + waiting2 + waiting3; BLASLONG total = copy_A + copy_B + kernel + waiting; @@ -516,6 +580,8 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, (double)waiting3 /(double)total * 100., (double)ops/(double)kernel / 4. * 100.); +#endif + #if 0 fprintf(stderr, "GEMM [%2ld] Copy_A : %6.2ld Copy_B : %6.2ld Wait : %6.2ld\n", mypos, copy_A, copy_B, waiting); -- 2.7.4