1 /*****************************************************************************
2 Copyright (c) 2011-2014, The OpenBLAS Project
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions are
9 1. Redistributions of source code must retain the above copyright
10 notice, this list of conditions and the following disclaimer.
12 2. Redistributions in binary form must reproduce the above copyright
13 notice, this list of conditions and the following disclaimer in
14 the documentation and/or other materials provided with the
16 3. Neither the name of the OpenBLAS project nor the names of
17 its contributors may be used to endorse or promote products
18 derived from this software without specific prior written
21 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
25 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
27 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
28 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
29 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
30 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 **********************************************************************************/
34 /*********************************************************************/
35 /* Copyright 2009, 2010 The University of Texas at Austin. */
36 /* All rights reserved. */
38 /* Redistribution and use in source and binary forms, with or */
39 /* without modification, are permitted provided that the following */
40 /* conditions are met: */
42 /* 1. Redistributions of source code must retain the above */
43 /* copyright notice, this list of conditions and the following */
46 /* 2. Redistributions in binary form must reproduce the above */
47 /* copyright notice, this list of conditions and the following */
48 /* disclaimer in the documentation and/or other materials */
49 /* provided with the distribution. */
51 /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
52 /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
53 /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
54 /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
55 /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
56 /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
57 /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
58 /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
59 /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
60 /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
61 /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
62 /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
63 /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
64 /* POSSIBILITY OF SUCH DAMAGE. */
66 /* The views and conclusions contained in the software and */
67 /* documentation are those of the authors and should not be */
68 /* interpreted as representing official policies, either expressed */
69 /* or implied, of The University of Texas at Austin. */
70 /*********************************************************************/
/* Platform guard for the include below. NOTE(review): the directive that
   closes this #if is not visible in this listing. */
73 #if defined(OS_LINUX) || defined(OS_NETBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_SUNOS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_HAIKU)
/* <sys/resource.h> declares getrlimit() and RLIMIT_NPROC, which
   blas_thread_init() uses when it reports a pthread_create failure. */
76 #include <sys/resource.h>
/* Branch-prediction hints: x is normalised to 0 or 1 with !! and handed to
   __builtin_expect together with the expected value. */
82 #define likely(x) __builtin_expect(!!(x), 1)
89 #define unlikely(x) __builtin_expect(!!(x), 0)
/* Alternative definition that simply evaluates x. NOTE(review): the
   preprocessor condition that selects between the two definitions is not
   visible here. */
91 #define unlikely(x) (x)
/* Defined elsewhere. blas_thread_init() treats a positive return value as
   the power-of-two exponent of the worker idle timeout. */
95 extern unsigned int openblas_thread_timeout();
/* NEED_STACKATTR is explicitly undefined here, so the
   "#ifdef NEED_STACKATTR" sections later in the file are inactive unless it
   is defined again in lines that are not shown. */
102 #undef NEED_STACKATTR
/* Alignment, in bytes, requested through __attribute__((aligned)) for the
   shared state declared below. */
104 #define ATTRIBUTE_SIZE 128
106 /* This is a thread server model implementation. The threads are */
107 /* spawned at the first access to the BLAS library and remain alive */
108 /* until the destruction routine is called. The number of threads */
109 /* is equal to "OMP_NUM_THREADS - 1", and a thread only wakes up */
110 /* when a job is queued. */
112 /* We need this global for checking whether initialization is finished. */
/* Non-zero once the worker pool exists: set to 1 at the end of
   blas_thread_init() and reset to 0 by blas_thread_shutdown(). */
113 int blas_server_avail __attribute__((aligned(ATTRIBUTE_SIZE))) = 0;
115 /* Local Variables */
/* server_lock is taken with LOCK_COMMAND/UNLOCK_COMMAND around pool
   creation, pool growth and shutdown. Its type depends on the locking
   primitive selected at build time; the last variant is presumably the
   "#else" case (that directive is missing from this listing). */
116 #if defined(USE_PTHREAD_LOCK)
117 static pthread_mutex_t server_lock = PTHREAD_MUTEX_INITIALIZER;
118 #elif defined(USE_PTHREAD_SPINLOCK)
119 static pthread_spinlock_t server_lock = 0;
121 static unsigned long server_lock = 0;
/* Values stored in the per-thread "status" field. */
124 #define THREAD_STATUS_SLEEP 2
125 #define THREAD_STATUS_WAKEUP 4
/* pthread handles of the workers; filled by pthread_create() and joined in
   blas_thread_shutdown(). */
127 static pthread_t blas_threads [MAX_CPU_NUMBER];
/* The following lines are the members of the per-thread record
   (thread_status_t). NOTE(review): the lines opening and closing the struct,
   and the member declared under the "#if" below, are not visible here.

   queue  - job slot of the worker. Values used in this file:
              NULL / 0        : idle, no job assigned
              (blas_queue_t*)1: a job has been picked up and is running
              (blas_queue_t*)-1: shutdown request
              anything else   : pointer to the job to execute
   status - THREAD_STATUS_SLEEP or THREAD_STATUS_WAKEUP
   lock   - mutex held around accesses to queue/status
   wakeup - condition variable a sleeping worker waits on */
130 blas_queue_t * volatile queue __attribute__((aligned(ATTRIBUTE_SIZE)));
132 #if defined(OS_LINUX) && !defined(NO_AFFINITY)
136 volatile long status;
138 pthread_mutex_t lock;
139 pthread_cond_t wakeup;
/* One record per worker, indexed by the worker id passed to
   blas_thread_server(). */
143 static thread_status_t thread_status[MAX_CPU_NUMBER] __attribute__((aligned(ATTRIBUTE_SIZE)));
/* Default exponent of the idle timeout; can be overridden at build time. */
145 #ifndef THREAD_TIMEOUT
146 #define THREAD_TIMEOUT 28
/* Idle timeout, compared against differences of rpcc() readings in
   blas_thread_server(); blas_thread_init() may replace it at run time. */
149 static unsigned int thread_timeout = (1U << (THREAD_TIMEOUT));
153 /* The monitor is a function that reports every thread's status each second. */
154 /* It is normally turned off and exists only for debugging. */
/* Handle of the debugging monitor thread created in blas_thread_init(). */
156 static pthread_t monitor_thread;
/* Last progress code recorded by each worker; written by
   blas_thread_server() and printed by blas_monitor(). */
157 static int main_status[MAX_CPU_NUMBER];
/* Progress codes stored in main_status[]. */
158 #define MAIN_ENTER 0x01
159 #define MAIN_EXIT 0x02
160 #define MAIN_TRYLOCK 0x03
161 #define MAIN_LOCKSUCCESS 0x04
162 #define MAIN_QUEUING 0x05
163 #define MAIN_RECEIVING 0x06
164 #define MAIN_RUNNING1 0x07
165 #define MAIN_RUNNING2 0x08
166 #define MAIN_RUNNING3 0x09
167 #define MAIN_WAITING 0x0a
168 #define MAIN_SLEEPING 0x0b
169 #define MAIN_FINISH 0x0c
170 #define MAIN_DONE 0x0d
/* NOTE(review): these two constants are not referenced in any line visible
   in this listing. */
173 #define BLAS_QUEUE_FINISHED 3
174 #define BLAS_QUEUE_RUNNING 4
/* Per-worker rpcc() reading taken in blas_thread_server(); presumably only
   compiled for timing builds (the guarding directive is not visible). */
177 BLASLONG exit_time[MAX_CPU_NUMBER];
/*
 * legacy_exec - call `func` with the argument list used for BLAS_LEGACY
 * jobs: (m, n, k, alpha..., a, lda, b, ldb, c, ldc, sb).
 *
 *   func  routine pointer; converted to a function-pointer type whose
 *         element type is chosen from `mode`.
 *   mode  BLAS_COMPLEX selects the two-scalar form (alpha[0], alpha[1]);
 *         BLAS_XDOUBLE / BLAS_DOUBLE select xdouble / double elements and
 *         the remaining case uses float.
 *   args  supplies m, n, k, alpha, a/lda, b/ldb and c/ldc.
 *   sb    forwarded unchanged as the final argument.
 *
 * NOTE(review): the "else" and closing-brace lines separating the six
 * branches are missing from this listing; the layout described here is
 * inferred from the visible tests and comments.
 */
180 static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
/* Real (non-complex) routines take a single alpha scalar. */
182 if (!(mode & BLAS_COMPLEX)){
184 if (mode & BLAS_XDOUBLE){
185 /* REAL / Extended Double */
186 void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble,
187 xdouble *, BLASLONG, xdouble *, BLASLONG,
188 xdouble *, BLASLONG, void *) = func;
190 afunc(args -> m, args -> n, args -> k,
191 ((xdouble *)args -> alpha)[0],
192 args -> a, args -> lda,
193 args -> b, args -> ldb,
194 args -> c, args -> ldc, sb);
/* REAL / Double */
197 if (mode & BLAS_DOUBLE){
199 void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double,
200 double *, BLASLONG, double *, BLASLONG,
201 double *, BLASLONG, void *) = func;
203 afunc(args -> m, args -> n, args -> k,
204 ((double *)args -> alpha)[0],
205 args -> a, args -> lda,
206 args -> b, args -> ldb,
207 args -> c, args -> ldc, sb);
/* REAL / Single */
210 void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float,
211 float *, BLASLONG, float *, BLASLONG,
212 float *, BLASLONG, void *) = func;
214 afunc(args -> m, args -> n, args -> k,
215 ((float *)args -> alpha)[0],
216 args -> a, args -> lda,
217 args -> b, args -> ldb,
218 args -> c, args -> ldc, sb);
/* Complex routines take alpha as two scalars: alpha[0] and alpha[1]. */
222 if (mode & BLAS_XDOUBLE){
223 /* COMPLEX / Extended Double */
224 void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble,
225 xdouble *, BLASLONG, xdouble *, BLASLONG,
226 xdouble *, BLASLONG, void *) = func;
228 afunc(args -> m, args -> n, args -> k,
229 ((xdouble *)args -> alpha)[0],
230 ((xdouble *)args -> alpha)[1],
231 args -> a, args -> lda,
232 args -> b, args -> ldb,
233 args -> c, args -> ldc, sb);
236 if (mode & BLAS_DOUBLE){
237 /* COMPLEX / Double */
238 void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, double,
239 double *, BLASLONG, double *, BLASLONG,
240 double *, BLASLONG, void *) = func;
242 afunc(args -> m, args -> n, args -> k,
243 ((double *)args -> alpha)[0],
244 ((double *)args -> alpha)[1],
245 args -> a, args -> lda,
246 args -> b, args -> ldb,
247 args -> c, args -> ldc, sb);
249 /* COMPLEX / Single */
250 void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, float,
251 float *, BLASLONG, float *, BLASLONG,
252 float *, BLASLONG, void *) = func;
254 afunc(args -> m, args -> n, args -> k,
255 ((float *)args -> alpha)[0],
256 ((float *)args -> alpha)[1],
257 args -> a, args -> lda,
258 args -> b, args -> ldb,
259 args -> c, args -> ldc, sb);
/* Affinity helpers defined elsewhere. blas_thread_server() stores the
   return value of gotoblas_set_affinity() in thread_status[cpu].node. */
264 #if defined(OS_LINUX) && !defined(NO_AFFINITY)
265 int gotoblas_set_affinity(int);
266 int gotoblas_set_affinity2(int);
/* Set to 1 by goto_set_num_threads() when it grows the pool; workers read
   it at start-up to decide which argument to pass to
   gotoblas_set_affinity(). */
270 static int increased_threads = 0;
/*
 * blas_thread_server - start routine of every worker thread (it is the
 * function handed to pthread_create()).
 *
 *   arg  worker index smuggled through the void* argument; used to index
 *        thread_status[], main_status[] and exit_time[].
 *
 * Visible flow: optionally set CPU affinity, allocate a scratch buffer with
 * blas_memory_alloc(2), then repeatedly read this worker's job slot under
 * its mutex. When the slot has stayed empty for more than thread_timeout
 * ticks of rpcc(), the worker marks itself THREAD_STATUS_SLEEP and blocks on
 * its condition variable. When a job is found it is marked as taken
 * (slot = 1), the sa/sb work areas are derived, the routine is dispatched
 * according to the job mode, and the slot is cleared to 0. A slot value of
 * -1 leaves the loop, after which the buffer is freed.
 *
 * NOTE(review): loop headers, "else" lines, closing braces, several local
 * declarations and most #ifdef/#endif directives are missing from this
 * listing; comments below describe only what the visible lines do.
 */
272 static void* blas_thread_server(void *arg){
274 /* Thread identifier */
275 BLASLONG cpu = (BLASLONG)arg;
276 unsigned int last_tick;
277 void *buffer, *sa, *sb;
283 unsigned long start, stop;
/* Workers created by the initial pool get cpu + 1; once the pool has been
   grown (increased_threads != 0) the second call with -1 is used instead,
   presumably as the "else" branch (that line is not visible). */
286 #if defined(OS_LINUX) && !defined(NO_AFFINITY)
287 if (!increased_threads)
288 thread_status[cpu].node = gotoblas_set_affinity(cpu + 1);
290 thread_status[cpu].node = gotoblas_set_affinity(-1);
294 main_status[cpu] = MAIN_ENTER;
/* Per-thread scratch memory; sa/sb default to offsets inside it. */
297 buffer = blas_memory_alloc(2);
300 fprintf(STDERR, "Server[%2ld] Thread has just been spawned!\n", cpu);
306 main_status[cpu] = MAIN_QUEUING;
310 exit_time[cpu] = rpcc();
/* Start of the idle period that is measured against thread_timeout. */
313 last_tick = (unsigned int)rpcc();
/* Snapshot the job slot under the per-thread mutex. */
315 pthread_mutex_lock (&thread_status[cpu].lock);
316 tscq=thread_status[cpu].queue;
317 pthread_mutex_unlock (&thread_status[cpu].lock);
/* Idle for longer than the timeout: go to sleep unless a job has arrived in
   the meantime. The slot is re-checked with the mutex held. */
322 if ((unsigned int)rpcc() - last_tick > thread_timeout) {
324 pthread_mutex_lock (&thread_status[cpu].lock);
326 if (!thread_status[cpu].queue) {
327 thread_status[cpu].status = THREAD_STATUS_SLEEP;
/* Loop so that a wake-up is only accepted once status has been changed
   away from THREAD_STATUS_SLEEP by the waker. */
328 while (thread_status[cpu].status == THREAD_STATUS_SLEEP) {
331 main_status[cpu] = MAIN_SLEEPING;
334 pthread_cond_wait(&thread_status[cpu].wakeup, &thread_status[cpu].lock);
338 pthread_mutex_unlock(&thread_status[cpu].lock);
/* Restart the idle timer after waking up. */
340 last_tick = (unsigned int)rpcc();
342 pthread_mutex_lock (&thread_status[cpu].lock);
343 tscq=thread_status[cpu].queue;
344 pthread_mutex_unlock (&thread_status[cpu].lock);
348 queue = thread_status[cpu].queue;
/* -1 is the shutdown request written by blas_thread_shutdown(). */
350 if ((long)queue == -1) break;
353 main_status[cpu] = MAIN_RECEIVING;
361 int (*routine)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = queue -> routine;
/* Mark the slot as "job taken": non-zero, but not a real job pointer. */
363 pthread_mutex_lock (&thread_status[cpu].lock);
364 thread_status[cpu].queue = (blas_queue_t *)1;
365 pthread_mutex_unlock (&thread_status[cpu].lock);
372 fprintf(STDERR, "Server[%2ld] Calculation started. Mode = 0x%03x M = %3ld N=%3ld K=%3ld\n",
373 cpu, queue->mode, queue-> args ->m, queue->args->n, queue->args->k);
/* Load the SSE control/status register and the x87 control word that
   exec_blas_async() saved into the job. */
377 #ifdef CONSISTENT_FPCSR
378 __asm__ __volatile__ ("ldmxcsr %0" : : "m" (queue -> sse_mode));
379 __asm__ __volatile__ ("fldcw %0" : : "m" (queue -> x87_mode));
383 main_status[cpu] = MAIN_RUNNING1;
/* Fall back to this thread's own buffer when no sa is set. NOTE(review):
   the line that first assigns sa (presumably from the job) is not visible. */
386 if (sa == NULL) sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A);
/* sb is placed behind sa: the size of a P x Q panel of the job's element
   type, rounded with GEMM_ALIGN, plus GEMM_OFFSET_B. Complex types use twice
   the element size. NOTE(review): the condition guarding this computation is
   not visible. */
389 if (!(queue -> mode & BLAS_COMPLEX)){
391 if (queue -> mode & BLAS_XDOUBLE){
392 sb = (void *)(((BLASLONG)sa + ((QGEMM_P * QGEMM_Q * sizeof(xdouble)
393 + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
396 if (queue -> mode & BLAS_DOUBLE){
397 sb = (void *)(((BLASLONG)sa + ((DGEMM_P * DGEMM_Q * sizeof(double)
398 + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
401 sb = (void *)(((BLASLONG)sa + ((SGEMM_P * SGEMM_Q * sizeof(float)
402 + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
406 if (queue -> mode & BLAS_XDOUBLE){
407 sb = (void *)(((BLASLONG)sa + ((XGEMM_P * XGEMM_Q * 2 * sizeof(xdouble)
408 + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
411 if (queue -> mode & BLAS_DOUBLE){
412 sb = (void *)(((BLASLONG)sa + ((ZGEMM_P * ZGEMM_Q * 2 * sizeof(double)
413 + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
415 sb = (void *)(((BLASLONG)sa + ((CGEMM_P * CGEMM_Q * 2 * sizeof(float)
416 + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
423 main_status[cpu] = MAIN_RUNNING2;
/* Dispatch by job mode: legacy argument list, pthread-style single-argument
   function, or the regular (args, range_m, range_n, sa, sb, position) form. */
426 if (queue -> mode & BLAS_LEGACY) {
427 legacy_exec(routine, queue -> mode, queue -> args, sb);
429 if (queue -> mode & BLAS_PTHREAD) {
430 void (*pthreadcompat)(void *) = queue -> routine;
431 (pthreadcompat)(queue -> args);
433 (routine)(queue -> args, queue -> range_m, queue -> range_n, sa, sb, queue -> position);
436 fprintf(STDERR, "Server[%2ld] Calculation finished!\n", cpu);
440 main_status[cpu] = MAIN_FINISH;
443 // arm: make sure all results are written out _before_
444 // thread is marked as done and other threads use them
/* "x & 0" is always 0, so this stores a null pointer: the slot becomes
   empty, which is what exec_blas_async_wait() and the next assignment look
   for. */
447 pthread_mutex_lock (&thread_status[cpu].lock);
448 thread_status[cpu].queue = (blas_queue_t * volatile) ((long)thread_status[cpu].queue & 0); /* Need a trick */
449 pthread_mutex_unlock (&thread_status[cpu].lock);
456 main_status[cpu] = MAIN_DONE;
462 fprintf(STDERR, "Thread[%ld] : %16lu %16lu (%8lu cycles)\n", cpu + 1,
469 /* Shutdown procedure */
472 fprintf(STDERR, "Server[%2ld] Shutdown!\n", cpu);
/* Release the scratch buffer obtained at start-up. */
475 blas_memory_free(buffer);
477 //pthread_exit(NULL);
/* Counter printed by blas_monitor(). NOTE(review): no line visible in this
   listing ever modifies it. */
484 static BLASLONG num_suspend = 0;
/*
 * blas_monitor - debugging aid that prints, for every worker, a text
 * describing the progress code last stored in main_status[].
 *
 *   arg  not used by any visible line.
 *
 * NOTE(review): most "case" labels, the "break" statements, the declaration
 * of i, the enclosing loop/sleep and the return statement are missing from
 * this listing. The final message is written to lowercase "stderr" while
 * every other diagnostic in the file uses the STDERR macro - confirm whether
 * that difference is intended.
 */
486 static int blas_monitor(void *arg){
/* One report line per worker (the pool holds blas_num_threads - 1 of them). */
490 for (i = 0; i < blas_num_threads - 1; i++){
491 switch (main_status[i]) {
493 fprintf(STDERR, "THREAD[%2d] : Entering.\n", i);
496 fprintf(STDERR, "THREAD[%2d] : Exiting.\n", i);
499 fprintf(STDERR, "THREAD[%2d] : Trying lock operation.\n", i);
502 fprintf(STDERR, "THREAD[%2d] : Queuing.\n", i);
504 case MAIN_RECEIVING :
505 fprintf(STDERR, "THREAD[%2d] : Receiving.\n", i);
508 fprintf(STDERR, "THREAD[%2d] : Running1.\n", i);
511 fprintf(STDERR, "THREAD[%2d] : Running2.\n", i);
514 fprintf(STDERR, "THREAD[%2d] : Running3.\n", i);
517 fprintf(STDERR, "THREAD[%2d] : Waiting.\n", i);
520 fprintf(STDERR, "THREAD[%2d] : Sleeping.\n", i);
523 fprintf(STDERR, "THREAD[%2d] : Finishing.\n", i);
526 fprintf(STDERR, "THREAD[%2d] : Job is done.\n", i);
530 fprintf(stderr, "Total number of suspended ... %ld\n", num_suspend);
539 /* Initializing routine */
/*
 * blas_thread_init - create the worker pool (blas_num_threads - 1 threads
 * running blas_thread_server) and mark the server as available.
 *
 * Returns 0 immediately when blas_server_avail is already set. Otherwise the
 * work is done with server_lock held, and blas_server_avail is tested again
 * after the lock has been taken so that only one caller builds the pool.
 *
 * NOTE(review): the declarations of i, ret, rlim and attr, the failure test
 * after pthread_create, the exit call, the closing braces and the final
 * return statement are missing from this listing.
 */
540 int blas_thread_init(void){
543 int thread_timeout_env;
544 #ifdef NEED_STACKATTR
/* Fast path without taking the lock. */
548 if (blas_server_avail) return 0;
/* Only compiled when NEED_STACKATTR is defined: guard size and stack size
   are both set to 0x1000 bytes. */
550 #ifdef NEED_STACKATTR
551 pthread_attr_init(&attr);
552 pthread_attr_setguardsize(&attr, 0x1000U);
553 pthread_attr_setstacksize( &attr, 0x1000U);
556 LOCK_COMMAND(&server_lock);
/* Re-check under the lock. */
558 if (!blas_server_avail){
/* A positive setting is clamped to the range 4..30 and used as the exponent
   of the idle timeout (timeout = 2^value). */
560 thread_timeout_env=openblas_thread_timeout();
561 if (thread_timeout_env>0) {
562 if (thread_timeout_env < 4) thread_timeout_env = 4;
563 if (thread_timeout_env > 30) thread_timeout_env = 30;
564 thread_timeout = (1 << thread_timeout_env);
567 for(i = 0; i < blas_num_threads - 1; i++){
/* Start every worker with an empty job slot and in the awake state. */
569 thread_status[i].queue = (blas_queue_t *)NULL;
570 thread_status[i].status = THREAD_STATUS_WAKEUP;
572 pthread_mutex_init(&thread_status[i].lock, NULL);
573 pthread_cond_init (&thread_status[i].wakeup, NULL);
/* The worker index is passed to the thread through the void* argument. */
575 #ifdef NEED_STACKATTR
576 ret=pthread_create(&blas_threads[i], &attr,
577 &blas_thread_server, (void *)i);
579 ret=pthread_create(&blas_threads[i], NULL,
580 &blas_thread_server, (void *)i);
/* Failure report: the pthread_create error text, then the process limit
   (RLIMIT_NPROC) if it can be read, then SIGINT is raised; the last message
   is printed only when raise() itself fails. */
584 const char *msg = strerror(ret);
585 fprintf(STDERR, "OpenBLAS blas_thread_init: pthread_create: %s\n", msg);
587 if(0 == getrlimit(RLIMIT_NPROC, &rlim)) {
588 fprintf(STDERR, "OpenBLAS blas_thread_init: RLIMIT_NPROC "
589 "%ld current, %ld max\n", (long)(rlim.rlim_cur), (long)(rlim.rlim_max));
592 if(0 != raise(SIGINT)) {
593 fprintf(STDERR, "OpenBLAS blas_thread_init: calling exit(3)\n");
/* Starts the status monitor; presumably only in monitor builds (the guarding
   directive is not visible). The return value is not checked. */
600 pthread_create(&monitor_thread, NULL,
601 (void *)&blas_monitor, (void *)NULL);
604 blas_server_avail = 1;
607 UNLOCK_COMMAND(&server_lock);
613 The user can call one of two routines.
615 exec_blas_async ... returns immediately after the jobs are queued.
617 exec_blas ... returns after the jobs are finished.
/* Lock word passed to blas_lock()/blas_unlock() while jobs are being
   assigned to worker slots. */
620 static BLASULONG exec_queue_lock = 0;
/*
 * exec_blas_async - hand every job of the linked list `queue` to a worker
 * slot and wake the workers that are asleep; it does not wait for the jobs.
 *
 *   pos    value stored into the job's `position` field.
 *   queue  head of the job list (linked through ->next).
 *
 * Phase 1 (under exec_queue_lock): for each job, look for a worker whose
 * slot is empty, record its index in queue->assigned and store the job
 * pointer in thread_status[i].queue.
 * Phase 2: walk the list again from `current` and, for every assigned worker
 * whose slot value is greater than 1, signal its condition variable if it is
 * in THREAD_STATUS_SLEEP.
 *
 * Change in this revision: the debug message printed `pos`, a BLASLONG, with
 * "%d"; it now uses "%ld", the specifier used for BLASLONG values elsewhere
 * in this file.
 *
 * NOTE(review): the loop headers, the declaration/initialisation of i, the
 * increments of pos and i, the closing braces and the return statement are
 * missing from this listing. In the node-mapping loop the slot is read
 * without the mutex and before the index bound is tested - confirm that this
 * is acceptable.
 */
622 int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
625 // Handle lazy re-init of the thread-pool after a POSIX fork
626 if (unlikely(blas_server_avail == 0)) blas_thread_init();
/* Remember the list head for the wake-up pass. */
629 blas_queue_t *current = queue;
630 blas_queue_t *tsiq,*tspq;
631 #if defined(OS_LINUX) && !defined(NO_AFFINITY) && !defined(PARAMTEST)
632 int node = get_node();
633 int nodes = get_num_nodes();
638 fprintf(STDERR, "Exec_blas_async is called. Position = %ld\n", pos);
641 blas_lock(&exec_queue_lock);
644 queue -> position = pos;
/* Save the caller's x87 control word and SSE control/status register in the
   job; the worker loads them before running it. */
646 #ifdef CONSISTENT_FPCSR
647 __asm__ __volatile__ ("fnstcw %0" : "=m" (queue -> x87_mode));
648 __asm__ __volatile__ ("stmxcsr %0" : "=m" (queue -> sse_mode));
651 #if defined(OS_LINUX) && !defined(NO_AFFINITY) && !defined(PARAMTEST)
653 /* Node Mapping Mode */
655 if (queue -> mode & BLAS_NODE) {
/* Skip workers that are on another node or already have a job. */
658 while((thread_status[i].node != node || thread_status[i].queue) && (i < blas_num_threads - 1)) i ++;
660 if (i < blas_num_threads - 1) break;
/* No free worker on this node: wrap the node number and search again. */
663 if (i >= blas_num_threads - 1) {
666 if (node >= nodes) node = 0;
/* Read the candidate slot under its mutex; the index wraps to 0 at the end
   of the pool. */
672 pthread_mutex_lock (&thread_status[i].lock);
673 tsiq = thread_status[i].queue;
674 pthread_mutex_unlock (&thread_status[i].lock);
677 if (i >= blas_num_threads - 1) i = 0;
678 pthread_mutex_lock (&thread_status[i].lock);
679 tsiq = thread_status[i].queue;
680 pthread_mutex_unlock (&thread_status[i].lock);
/* Same search, presumably for builds without node mapping (the separating
   directive is not visible). */
684 pthread_mutex_lock (&thread_status[i].lock);
685 tsiq=thread_status[i].queue ;
686 pthread_mutex_unlock (&thread_status[i].lock);
689 if (i >= blas_num_threads - 1) i = 0;
690 pthread_mutex_lock (&thread_status[i].lock);
691 tsiq=thread_status[i].queue ;
692 pthread_mutex_unlock (&thread_status[i].lock);
/* Record which worker got the job, then publish the job in its slot. */
696 queue -> assigned = i;
698 pthread_mutex_lock (&thread_status[i].lock);
699 thread_status[i].queue = queue;
700 pthread_mutex_unlock (&thread_status[i].lock);
703 queue = queue -> next;
711 blas_unlock(&exec_queue_lock);
714 fprintf(STDERR, "Done(Number of threads = %2ld).\n", exec_count);
/* Wake-up pass: pos is reused as the index of the assigned worker. */
719 pos = current -> assigned;
721 pthread_mutex_lock (&thread_status[pos].lock);
722 tspq=thread_status[pos].queue;
723 pthread_mutex_unlock (&thread_status[pos].lock);
/* Greater than 1 means the slot is neither empty (0) nor marked as taken (1). */
725 if ((BLASULONG)tspq > 1) {
726 pthread_mutex_lock (&thread_status[pos].lock);
728 if (thread_status[pos].status == THREAD_STATUS_SLEEP) {
/* The state is tested a second time before signalling; lines between the
   two tests are not visible in this listing. */
735 if (thread_status[pos].status == THREAD_STATUS_SLEEP) {
736 thread_status[pos].status = THREAD_STATUS_WAKEUP;
737 pthread_cond_signal(&thread_status[pos].wakeup);
741 pthread_mutex_unlock(&thread_status[pos].lock);
744 current = current -> next;
/*
 * exec_blas_async_wait - visit up to `num` jobs of the list `queue` and read
 * the slot of the worker each job was assigned to (queue->assigned), always
 * with that worker's mutex held.
 *
 *   num    maximum number of jobs to look at.
 *   queue  head of the job list (linked through ->next).
 *
 * NOTE(review): the declaration of tsqq, the loop that separates the two
 * slot reads (presumably a wait until the slot is cleared by the worker),
 * the update of num, the closing braces and the return statement are
 * missing from this listing.
 */
750 int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue){
753 while ((num > 0) && queue) {
/* First look at the assigned worker's slot. */
755 pthread_mutex_lock(&thread_status[queue->assigned].lock);
756 tsqq=thread_status[queue -> assigned].queue;
757 pthread_mutex_unlock(&thread_status[queue->assigned].lock);
/* Refresh the snapshot of the slot. */
762 pthread_mutex_lock(&thread_status[queue->assigned].lock);
763 tsqq=thread_status[queue -> assigned].queue;
764 pthread_mutex_unlock(&thread_status[queue->assigned].lock);
769 queue = queue -> next;
774 fprintf(STDERR, "Done.\n\n");
780 /* Execute Threads */
/*
 * exec_blas - run a list of jobs and return when they are finished.
 *
 *   num    number of jobs in the list.
 *   queue  head of the job list (linked through ->next).
 *
 * Returns 0 without doing anything when num <= 0 or queue is NULL. Otherwise
 * the jobs after the first are handed to the workers with
 * exec_blas_async(1, queue->next), the first job is executed on the calling
 * thread (position argument 0), and exec_blas_async_wait(num - 1,
 * queue->next) is then called for the remaining jobs.
 *
 * NOTE(review): several #ifdef directives, the start of the warning fprintf,
 * the closing braces, the tail of the timing fprintf and the final return
 * statement are missing from this listing.
 */
781 int exec_blas(BLASLONG num, blas_queue_t *queue){
784 // Handle lazy re-init of the thread-pool after a POSIX fork
785 if (unlikely(blas_server_avail == 0)) blas_thread_init();
787 int (*routine)(blas_arg_t *, void *, void *, double *, double *, BLASLONG);
790 BLASULONG start, stop;
793 if ((num <= 0) || (queue == NULL)) return 0;
796 fprintf(STDERR, "Exec_blas is called. Number of executing threads : %ld\n", num);
/* The symbol itself is tested before it is called, so it may be absent at
   run time (presumably a weak reference - its declaration is not visible). */
800 if (omp_in_parallel && (num > 1)) {
801 if (omp_in_parallel() > 0) {
803 "OpenBLAS Warning : Detect OpenMP Loop and this application may hang. "
804 "Please rebuild the library with USE_OPENMP=1 option.\n");
/* Jobs 2..num go to the worker pool. */
809 if ((num > 1) && queue -> next) exec_blas_async(1, queue -> next);
814 fprintf(STDERR, "\n");
/* The first job runs on the calling thread, dispatched by mode in the same
   way as in the worker loop. */
817 routine = queue -> routine;
819 if (queue -> mode & BLAS_LEGACY) {
820 legacy_exec(routine, queue -> mode, queue -> args, queue -> sb);
822 if (queue -> mode & BLAS_PTHREAD) {
823 void (*pthreadcompat)(void *) = queue -> routine;
824 (pthreadcompat)(queue -> args);
826 (routine)(queue -> args, queue -> range_m, queue -> range_n,
827 queue -> sa, queue -> sb, 0);
/* Wait for the jobs that were handed to the workers. */
833 if ((num > 1) && queue -> next) {
834 exec_blas_async_wait(num - 1, queue -> next);
836 // arm: make sure results from other threads are visible
841 fprintf(STDERR, "Thread[0] : %16lu %16lu (%8lu cycles)\n",
/*
 * goto_set_num_threads - change the number of threads used by the library.
 *
 *   num_threads  requested count. Values below 1 select the current
 *                blas_num_threads; values above MAX_CPU_NUMBER are clamped.
 *
 * When the request exceeds blas_num_threads, the missing workers are created
 * under server_lock and blas_num_threads is raised. blas_cpu_number is set
 * to the resulting value in every case visible here.
 *
 * NOTE(review): the declaration of i, "else" lines, closing braces and the
 * #if/#endif directives around the affinity calls are missing from this
 * listing. The return value of pthread_create is not checked here, unlike in
 * blas_thread_init(). The NEED_STACKATTR variant refers to `attr`, which is
 * not declared in any visible line of this function.
 */
849 void goto_set_num_threads(int num_threads) {
853 if (num_threads < 1) num_threads = blas_num_threads;
856 if (num_threads == 1) {
857 if (blas_cpu_number == 1){
858 //OpenBLAS is already single thread.
861 //From multi-threads to single thread
862 //Restore the original affinity mask
863 gotoblas_set_affinity(-1);
868 if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER;
870 if (num_threads > blas_num_threads) {
872 LOCK_COMMAND(&server_lock);
/* Tells newly started workers that they belong to a grown pool; see the
   affinity selection at the top of blas_thread_server(). */
874 increased_threads = 1;
/* Existing workers occupy indices 0 .. blas_num_threads - 2, so creation
   starts at blas_num_threads - 1. */
876 for(i = blas_num_threads - 1; i < num_threads - 1; i++){
878 thread_status[i].queue = (blas_queue_t *)NULL;
879 thread_status[i].status = THREAD_STATUS_WAKEUP;
881 pthread_mutex_init(&thread_status[i].lock, NULL);
882 pthread_cond_init (&thread_status[i].wakeup, NULL);
884 #ifdef NEED_STACKATTR
885 pthread_create(&blas_threads[i], &attr,
886 &blas_thread_server, (void *)i);
888 pthread_create(&blas_threads[i], NULL,
889 &blas_thread_server, (void *)i);
893 blas_num_threads = num_threads;
895 UNLOCK_COMMAND(&server_lock);
899 if(blas_cpu_number == 1 && num_threads > 1){
900 //Restore the thread 0 affinity.
901 gotoblas_set_affinity(0);
905 blas_cpu_number = num_threads;
907 #if defined(ARCH_MIPS64)
908 //set parameters for different number of threads.
909 blas_set_parameter();
/* openblas_set_num_threads - alternative name for goto_set_num_threads();
   forwards num_threads unchanged. */
914 void openblas_set_num_threads(int num_threads) {
915 goto_set_num_threads(num_threads);
919 /* Compatible function with pthread_create / join */
/*
 * gotoblas_pthread - run `function` numthreads times through the BLAS
 * worker pool, each call receiving a single void* argument.
 *
 *   numthreads  number of jobs to build; values <= 0 return 0 at once.
 *   function    routine executed for every job (stored as the job routine
 *               and called in BLAS_PTHREAD mode).
 *   args        argument stored in each job.
 *   stride      NOTE(review): no visible line uses this parameter; it is
 *               presumably the step applied to args between jobs - confirm
 *               against the complete source.
 *
 * The jobs are built in a local array, chained through ->next, and executed
 * with exec_blas(), which returns once they have finished.
 *
 * NOTE(review): the declaration of i, some job fields, the closing braces
 * and the return statement are missing from this listing. No visible line
 * checks numthreads against MAX_CPU_NUMBER, the size of the local job array.
 */
921 int gotoblas_pthread(int numthreads, void *function, void *args, int stride) {
923 blas_queue_t queue[MAX_CPU_NUMBER];
926 if (numthreads <= 0) return 0;
/* Lazy initialisation of the CPU count and of the worker pool. */
929 if (blas_cpu_number == 0) blas_get_cpu_number();
931 if (blas_server_avail == 0) blas_thread_init();
935 for (i = 0; i < numthreads; i ++) {
937 queue[i].mode = BLAS_PTHREAD;
938 queue[i].routine = function;
939 queue[i].args = args;
940 queue[i].range_m = NULL;
941 queue[i].range_n = NULL;
/* Chain to the following element; the last link is reset after the loop. */
944 queue[i].next = &queue[i + 1];
949 queue[numthreads - 1].next = NULL;
951 exec_blas(numthreads, queue);
956 /* Shutdown procedure; the user does not have to call this routine. The */
957 /* kernel automatically kills the threads. */
/*
 * blas_thread_shutdown - stop and join every worker thread and mark the
 * server as unavailable.
 *
 * Returns 0 immediately when the pool does not exist. Otherwise, with
 * server_lock held, it:
 *   1. writes the shutdown request (-1) into each worker's job slot, sets
 *      the worker's status to THREAD_STATUS_WAKEUP and signals its
 *      condition variable so that a sleeping worker notices the request;
 *   2. joins all workers;
 *   3. destroys the per-worker mutexes and condition variables;
 *   4. clears blas_server_avail so that the next BLAS call re-creates the
 *      pool.
 *
 * Change in this revision: the NEED_STACKATTR branch called the misspelled
 * "pthread_attr_destory"; it now calls pthread_attr_destroy.
 *
 * NOTE(review): the declaration of i, the closing braces, the directive
 * closing the #ifdef and the return statement are missing from this
 * listing. `attr` is not declared in any visible line of this function, so
 * the NEED_STACKATTR branch still needs a declaration in scope before it can
 * be enabled.
 */
959 int BLASFUNC(blas_thread_shutdown)(void){
963 if (!blas_server_avail) return 0;
965 LOCK_COMMAND(&server_lock);
967 for (i = 0; i < blas_num_threads - 1; i++) {
/* The request is published under the worker's own mutex, the same mutex the
   worker holds while it waits on the condition variable. */
969 pthread_mutex_lock (&thread_status[i].lock);
971 thread_status[i].queue = (blas_queue_t *)-1;
973 thread_status[i].status = THREAD_STATUS_WAKEUP;
975 pthread_cond_signal (&thread_status[i].wakeup);
977 pthread_mutex_unlock(&thread_status[i].lock);
/* Wait until every worker has left blas_thread_server(). */
981 for(i = 0; i < blas_num_threads - 1; i++){
982 pthread_join(blas_threads[i], NULL);
/* Safe only after the joins: no worker can use these objects any more. */
985 for(i = 0; i < blas_num_threads - 1; i++){
986 pthread_mutex_destroy(&thread_status[i].lock);
987 pthread_cond_destroy (&thread_status[i].wakeup);
990 #ifdef NEED_STACKATTR
991 pthread_attr_destroy(&attr);
994 blas_server_avail = 0;
996 UNLOCK_COMMAND(&server_lock);