From 84b8170bfbc954154d3cb34af473801b59c5e28a Mon Sep 17 00:00:00 2001
From: jiahaipeng
Date: Sun, 11 Dec 2016 09:09:50 +0000
Subject: [PATCH] Adding multi-threading for copy, dot, rot, and asum functions

---
 interface/asum.c | 134 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 interface/copy.c |  62 +++++++++++++++++++++++++
 interface/dot.c  | 120 +++++++++++++++++++++++++++++++++++++++++++++++++
 interface/rot.c  |  62 +++++++++++++++++++++++++
 4 files changed, 378 insertions(+)

diff --git a/interface/asum.c b/interface/asum.c
index 1393989..e6fcf4d 100644
--- a/interface/asum.c
+++ b/interface/asum.c
@@ -42,6 +42,34 @@
 #include "functable.h"
 #endif
 
+#ifdef SMP
+/*
+ * Per-thread worker for the multi-threaded ASUM path.
+ *
+ * Calls ASUM_K(m, x, incx) and stores the result, widened to double, at z.
+ * n, k, alpha, y, incy and incz are not used here.
+ *
+ * NOTE(review): alpha and the vectors are declared float / float * although
+ * this file also selects DOUBLE and COMPLEX modes -- confirm that this matches
+ * the way the level-1 thread driver calls its workers.
+ */
+static int asum_threads (BLASLONG m, BLASLONG n, BLASLONG k, float alpha,
+  float* x, BLASLONG incx, float* y, BLASLONG incy, float* z, BLASLONG incz)
+{
+#ifndef CBLAS
+  FLOATRET ret;
+  ret = (FLOATRET)ASUM_K(m, x, incx);
+  *((double *)z) = (double)ret;
+#else
+  FLOAT ret;
+  ret = ASUM_K(m, x, incx);
+  *((double *)z) = (double)ret;
+#endif
+
+  return 0;
+}
+#endif
+
 #ifndef CBLAS
 
 FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX){
@@ -52,14 +80,67 @@ FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX){
 
   PRINT_DEBUG_NAME;
 
+#ifdef SMP
+  int i;
+  int mode, nthreads;
+  double mid_result = 0.0;
+  FLOAT dummyalpha[2] = {ZERO, ZERO};
+  double *buffer;
+#endif
+
   if (n <= 0) return 0;
 
   IDEBUG_START;
 
   FUNCTION_PROFILE_START();
 
+#ifdef SMP
+  nthreads = num_cpu_avail(1);
+
+  /* Temporary work-around for the poor performance of the multi-threaded */
+  /* path on small inputs: stay single-threaded up to 100000 elements.    */
+  if (n <= 100000)
+    nthreads = 1;
+
+  if (nthreads == 1) {
+#endif
+
   ret = (FLOATRET)ASUM_K(n, x, incx);
 
+#ifdef SMP
+  } else {
+
+#ifndef DOUBLE
+#ifndef COMPLEX
+    mode = BLAS_SINGLE | BLAS_REAL;
+#else
+    mode = BLAS_SINGLE | BLAS_COMPLEX;
+#endif
+#else
+#ifndef COMPLEX
+    mode = BLAS_DOUBLE | BLAS_REAL;
+#else
+    mode = BLAS_DOUBLE | BLAS_COMPLEX;
+#endif
+#endif
+
+    /* Take the scratch buffer only on the threaded path so that the early */
+    /* return for n <= 0 above cannot leave it unreleased.                 */
+    buffer = (double *)blas_memory_alloc(0);
+
+    blas_level1_thread_with_return_value(mode, n, 0, 0, dummyalpha,
+        x, incx, NULL, 0, buffer, 0, (void *)asum_threads, nthreads);
+
+    /* Partial sums are read back at a stride of two doubles per thread. */
+    for (i = 0; i < nthreads; i++)
+      mid_result += buffer[2*i];
+
+    blas_memory_free(buffer);
+
+    ret = (FLOATRET)mid_result;
+  }
+#endif
+
   FUNCTION_PROFILE_END(COMPSIZE, n, n);
 
   IDEBUG_END;
@@ -75,14 +156,67 @@ FLOAT CNAME(blasint n, FLOAT *x, blasint incx){
 
   PRINT_DEBUG_CNAME;
 
+#ifdef SMP
+  int i;
+  int mode, nthreads;
+  double mid_result = 0.0;
+  FLOAT dummyalpha[2] = {ZERO, ZERO};
+  double *buffer;
+#endif
+
   if (n <= 0) return 0;
 
   IDEBUG_START;
 
   FUNCTION_PROFILE_START();
 
+#ifdef SMP
+  nthreads = num_cpu_avail(1);
+
+  /* Temporary work-around for the poor performance of the multi-threaded */
+  /* path on small inputs: stay single-threaded up to 100000 elements.    */
+  if (n <= 100000)
+    nthreads = 1;
+
+  if (nthreads == 1) {
+#endif
+
   ret = ASUM_K(n, x, incx);
 
+#ifdef SMP
+  } else {
+
+#ifndef DOUBLE
+#ifndef COMPLEX
+    mode = BLAS_SINGLE | BLAS_REAL;
+#else
+    mode = BLAS_SINGLE | BLAS_COMPLEX;
+#endif
+#else
+#ifndef COMPLEX
+    mode = BLAS_DOUBLE | BLAS_REAL;
+#else
+    mode = BLAS_DOUBLE | BLAS_COMPLEX;
+#endif
+#endif
+
+    /* Take the scratch buffer only on the threaded path so that the early */
+    /* return for n <= 0 above cannot leave it unreleased.                 */
+    buffer = (double *)blas_memory_alloc(0);
+
+    blas_level1_thread_with_return_value(mode, n, 0, 0, dummyalpha,
+        x, incx, NULL, 0, buffer, 0, (void *)asum_threads, nthreads);
+
+    /* Partial sums are read back at a stride of two doubles per thread. */
+    for (i = 0; i < nthreads; i++)
+      mid_result += buffer[2*i];
+
+    blas_memory_free(buffer);
+
+    ret = (FLOAT)mid_result;
+  }
+#endif
+
   FUNCTION_PROFILE_END(COMPSIZE, n, n);
 
   IDEBUG_END;
diff --git a/interface/copy.c b/interface/copy.c
index 3fb2182..7452c58 100644
--- a/interface/copy.c
+++ b/interface/copy.c
@@ -42,6 +42,27 @@
 #include "functable.h"
 #endif
 
+#ifdef SMP
+
+/*
+ * Per-thread worker for the multi-threaded COPY path.
+ *
+ * Calls COPY_K(m, x, incx, y, incy).  n, k, alpha, z and incz are not used
+ * here.
+ *
+ * NOTE(review): alpha and the vectors are declared float / float * although
+ * this file also selects DOUBLE and COMPLEX modes -- confirm that this matches
+ * the way the level-1 thread driver calls its workers.
+ */
+static int copy_threads (BLASLONG m, BLASLONG n, BLASLONG k, float alpha,
+  float* x, BLASLONG incx, float* y, BLASLONG incy, float* z, BLASLONG incz)
+{
+  COPY_K(m, x, incx, y, incy);
+  return 0;
+}
+
+#endif
+
 #ifndef CBLAS
 
 void NAME(blasint *N, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY){
@@ -60,6 +81,11 @@ void CNAME(blasint n, FLOAT *x, blasint incx, FLOAT *y, blasint incy){
 
 #endif
 
+#ifdef SMP
+  int mode, nthreads;
+  FLOAT dummyalpha[2] = {ZERO, ZERO};
+#endif
+
   if (n <= 0) return;
 
   IDEBUG_START;
@@ -69,8 +95,44 @@ void CNAME(blasint n, FLOAT *x, blasint incx, FLOAT *y, blasint incy){
   if (incx < 0) x -= (n - 1) * incx * COMPSIZE;
   if (incy < 0) y -= (n - 1) * incy * COMPSIZE;
 
+#ifdef SMP
+  nthreads = num_cpu_avail(1);
+
+  /* Temporary work-around for the poor performance of the multi-threaded */
+  /* path on small inputs: stay single-threaded up to 100000 elements.    */
+  /* With incy == 0 all stores land on one element of y; keep that case   */
+  /* single-threaded too so the stores stay ordered.                      */
+  if (n <= 100000 || incy == 0)
+    nthreads = 1;
+
+  if (nthreads == 1) {
+#endif
+
   COPY_K(n, x, incx, y, incy);
 
+#ifdef SMP
+  } else {
+
+#ifndef DOUBLE
+#ifndef COMPLEX
+    mode = BLAS_SINGLE | BLAS_REAL;
+#else
+    mode = BLAS_SINGLE | BLAS_COMPLEX;
+#endif
+#else
+#ifndef COMPLEX
+    mode = BLAS_DOUBLE | BLAS_REAL;
+#else
+    mode = BLAS_DOUBLE | BLAS_COMPLEX;
+#endif
+#endif
+
+    blas_level1_thread(mode, n, 0, 0, dummyalpha,
+        x, incx, y, incy, NULL, 0, (void *)copy_threads, nthreads);
+
+  }
+#endif
+
   FUNCTION_PROFILE_END(COMPSIZE, COMPSIZE * n, 0);
 
   IDEBUG_END;
diff --git a/interface/dot.c b/interface/dot.c
index 3a91840..1ef9b34 100644
--- a/interface/dot.c
+++ b/interface/dot.c
@@ -42,6 +42,36 @@
 #include "functable.h"
 #endif
 
+#ifdef SMP
+/*
+ * Per-thread worker for the multi-threaded DOT path.
+ *
+ * Calls DOTU_K(m, x, incx, y, incy) and stores the result, widened to double,
+ * at z.  Both the CBLAS and the non-CBLAS build take the length from m: the
+ * callers below pass the vector length in the m slot and 0 for n and k.
+ * n, k, alpha and incz are not used here.
+ *
+ * NOTE(review): alpha and the vectors are declared float / float * although
+ * this file also selects the DOUBLE mode -- confirm that this matches the way
+ * the level-1 thread driver calls its workers.
+ */
+static int dot_threads (BLASLONG m, BLASLONG n, BLASLONG k, float alpha,
+  float* x, BLASLONG incx, float* y, BLASLONG incy, float* z, BLASLONG incz)
+{
+#ifndef CBLAS
+  FLOATRET ret;
+  ret = (FLOATRET)DOTU_K(m, x, incx, y, incy);
+  *((double *)z) = (double)ret;
+#else
+  FLOAT ret;
+  ret = DOTU_K(m, x, incx, y, incy);
+  *((double *)z) = (double)ret;
+#endif
+
+  return 0;
+}
+#endif
+
 #ifndef CBLAS
 
 FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY){
@@ -53,6 +83,14 @@ FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY){
 
   PRINT_DEBUG_NAME;
 
+#ifdef SMP
+  int i;
+  int mode, nthreads;
+  double mid_result = 0.0;
+  FLOAT dummyalpha[2] = {ZERO, ZERO};
+  double *buffer;
+#endif
+
   if (n <= 0) return 0.;
 
   IDEBUG_START;
@@ -62,8 +100,45 @@ FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY){
   if (incx < 0) x -= (n - 1) * incx;
   if (incy < 0) y -= (n - 1) * incy;
 
+#ifdef SMP
+  nthreads = num_cpu_avail(1);
+
+  /* Temporary work-around for the poor performance of the multi-threaded */
+  /* path on small inputs: stay single-threaded up to 100000 elements.    */
+  if (n <= 100000)
+    nthreads = 1;
+
+  if (nthreads == 1) {
+#endif
+
   ret = (FLOATRET)DOTU_K(n, x, incx, y, incy);
 
+#ifdef SMP
+  } else {
+
+#ifndef DOUBLE
+    mode = BLAS_SINGLE | BLAS_REAL;
+#else
+    mode = BLAS_DOUBLE | BLAS_REAL;
+#endif
+
+    /* Take the scratch buffer only on the threaded path so that the early */
+    /* return for n <= 0 above cannot leave it unreleased.                 */
+    buffer = (double *)blas_memory_alloc(0);
+
+    blas_level1_thread_with_return_value(mode, n, 0, 0, dummyalpha,
+        x, incx, y, incy, buffer, 0, (void *)dot_threads, nthreads);
+
+    /* Partial sums are read back at a stride of two doubles per thread. */
+    for (i = 0; i < nthreads; i++)
+      mid_result += buffer[2*i];
+
+    blas_memory_free(buffer);
+
+    ret = (FLOATRET)mid_result;
+  }
+#endif
+
   FUNCTION_PROFILE_END(1, 2 * n, 2 * n);
 
   IDEBUG_END;
@@ -79,6 +154,14 @@ FLOAT CNAME(blasint n, FLOAT *x, blasint incx, FLOAT *y, blasint incy){
 
   PRINT_DEBUG_CNAME;
 
+#ifdef SMP
+  int i;
+  int mode, nthreads;
+  double mid_result = 0.0;
+  FLOAT dummyalpha[2] = {ZERO, ZERO};
+  double *buffer;
+#endif
+
   if (n <= 0) return 0.;
 
   IDEBUG_START;
@@ -88,8 +171,45 @@ FLOAT CNAME(blasint n, FLOAT *x, blasint incx, FLOAT *y, blasint incy){
   if (incx < 0) x -= (n - 1) * incx;
   if (incy < 0) y -= (n - 1) * incy;
 
+#ifdef SMP
+  nthreads = num_cpu_avail(1);
+
+  /* Temporary work-around for the poor performance of the multi-threaded */
+  /* path on small inputs: stay single-threaded up to 100000 elements.    */
+  if (n <= 100000)
+    nthreads = 1;
+
+  if (nthreads == 1) {
+#endif
+
   ret = DOTU_K(n, x, incx, y, incy);
 
+#ifdef SMP
+  } else {
+
+#ifndef DOUBLE
+    mode = BLAS_SINGLE | BLAS_REAL;
+#else
+    mode = BLAS_DOUBLE | BLAS_REAL;
+#endif
+
+    /* Take the scratch buffer only on the threaded path so that the early */
+    /* return for n <= 0 above cannot leave it unreleased.                 */
+    buffer = (double *)blas_memory_alloc(0);
+
+    blas_level1_thread_with_return_value(mode, n, 0, 0, dummyalpha,
+        x, incx, y, incy, buffer, 0, (void *)dot_threads, nthreads);
+
+    /* Partial sums are read back at a stride of two doubles per thread. */
+    for (i = 0; i < nthreads; i++)
+      mid_result += buffer[2*i];
+
+    blas_memory_free(buffer);
+
+    ret = (FLOAT)mid_result;
+  }
+#endif
+
   FUNCTION_PROFILE_END(1, 2 * n, 2 * n);
 
   IDEBUG_END;
diff --git a/interface/rot.c b/interface/rot.c
index 125275a..ae6b49c 100644
--- a/interface/rot.c
+++ b/interface/rot.c
@@ -42,6 +42,29 @@
 #include "functable.h"
 #endif
 
+#ifdef SMP
+/*
+ * Per-thread worker for the multi-threaded ROT path.
+ *
+ * Calls ROT_K(m, x, incx, y, incy, c, s) with c = z[0] and s = z[1].  The
+ * coefficients travel through z because the BLASLONG slots n and k are
+ * integers and cannot hold FLOAT values.  n, k, alpha and incz are not used
+ * here.
+ *
+ * NOTE(review): assumes blas_level1_thread() hands the caller's z pointer to
+ * every worker unchanged -- confirm against the thread driver.
+ */
+static int rot_threads (BLASLONG m, BLASLONG n, BLASLONG k, float alpha,
+  float* x, BLASLONG incx, float* y, BLASLONG incy, float* z, BLASLONG incz)
+{
+  FLOAT *cs = (FLOAT *)z;
+
+  ROT_K(m, x, incx, y, incy, cs[0], cs[1]);
+  return 0;
+}
+
+#endif
+
 #ifndef CBLAS
 
 void NAME(blasint *N, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY, FLOAT *C, FLOAT *S){
@@ -62,6 +85,12 @@ void CNAME(blasint n, FLOAT *x, blasint incx, FLOAT *y, blasint incy, FLOAT c, F
 
 #endif
 
+#ifdef SMP
+  int mode, nthreads;
+  FLOAT dummyalpha[2] = {ZERO, ZERO};
+  FLOAT cs[2];
+#endif
+
   if (n <= 0) return;
 
   IDEBUG_START;
@@ -71,8 +100,41 @@ void CNAME(blasint n, FLOAT *x, blasint incx, FLOAT *y, blasint incy, FLOAT c, F
   if (incx < 0) x -= (n - 1) * incx;
   if (incy < 0) y -= (n - 1) * incy;
 
+#ifdef SMP
+  nthreads = num_cpu_avail(1);
+
+  /* Temporary work-around for the poor performance of the multi-threaded */
+  /* path on small inputs: stay single-threaded up to 100000 elements.    */
+  /* A zero stride makes every update hit one element of x or y; keep     */
+  /* that case single-threaded too so the updates stay ordered.           */
+  if (n <= 100000 || incx == 0 || incy == 0)
+    nthreads = 1;
+
+  if (nthreads == 1) {
+#endif
+
   ROT_K(n, x, incx, y, incy, c, s);
 
+#ifdef SMP
+  } else {
+
+#ifndef DOUBLE
+    mode = BLAS_SINGLE | BLAS_REAL;
+#else
+    mode = BLAS_DOUBLE | BLAS_REAL;
+#endif
+
+    /* c and s are FLOAT values: hand them to the workers through the z   */
+    /* pointer slot instead of converting them to the integer n/k slots.  */
+    cs[0] = c;
+    cs[1] = s;
+
+    blas_level1_thread(mode, n, 0, 0, dummyalpha,
+        x, incx, y, incy, cs, 0, (void *)rot_threads, nthreads);
+
+  }
+#endif
+
   FUNCTION_PROFILE_END(1, n, n);
 
   IDEBUG_END;
-- 
2.7.4