1 /*********************************************************************/
2 /* Copyright 2009, 2010 The University of Texas at Austin. */
3 /* All rights reserved. */
5 /* Redistribution and use in source and binary forms, with or */
6 /* without modification, are permitted provided that the following */
7 /* conditions are met: */
9 /* 1. Redistributions of source code must retain the above */
10 /* copyright notice, this list of conditions and the following */
13 /* 2. Redistributions in binary form must reproduce the above */
14 /* copyright notice, this list of conditions and the following */
15 /* disclaimer in the documentation and/or other materials */
16 /* provided with the distribution. */
18 /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
19 /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
20 /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
21 /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
22 /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
23 /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
24 /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
25 /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
26 /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
27 /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
28 /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
29 /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
30 /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
31 /* POSSIBILITY OF SUCH DAMAGE. */
33 /* The views and conclusions contained in the software and */
34 /* documentation are those of the authors and should not be */
35 /* interpreted as representing official policies, either expressed */
36 /* or implied, of The University of Texas at Austin. */
37 /*********************************************************************/
44 /* Assume C declarations for C++ */
45 #endif /* __cplusplus */
60 #include "config_kernel.h"
65 #undef ENABLE_SSE_EXCEPTION
67 #if defined(SMP_SERVER) || defined(SMP_ONDEMAND)
71 #if defined(OS_WINNT) || defined(OS_CYGWIN_NT) || defined(OS_INTERIX)
76 #define DOUBLE_DEFINED DOUBLE
81 #if !defined(NOINCLUDE) && !defined(ASSEMBLER)
86 #if !defined(_MSC_VER)
89 #define snprintf _snprintf
98 #if defined(OS_DARWIN) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_ANDROID)
104 //Android NDK only supports complex.h since Android 5.0
105 #if __ANDROID_API__ < 21
106 #define FORCE_OPENBLAS_COMPLEX_STRUCT
116 #define GOTO_ATOM ATOM
122 #define ATOM GOTO_ATOM
125 #elif !defined(OS_EMBEDDED)
126 #include <sys/mman.h>
130 #include <sys/time.h>
134 #if defined(SMP) || defined(USE_LOCKING)
142 #if defined(OS_SUNOS)
148 #include <machine/builtins.h>
151 #if defined(ARCH_IA64) && defined(ENABLE_SSE_EXCEPTION)
157 #if defined(OS_WINDOWS) && defined(DOUBLE_DEFINED)
158 #define DOUBLE DOUBLE_DEFINED
159 #undef DOUBLE_DEFINED
165 #undef SMP_ALLOC_DEBUG
/* Extract a bit field: shift `a` right by `b` bits, then AND with mask `c`. */
#define BITMASK(a, b, c) (((a) >> (b)) & (c))
189 #define ALLOCA_ALIGN 63UL
191 #define NUM_BUFFERS MAX(50,(MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER))
193 #ifdef NEEDBUNDERSCORE
194 #define BLASFUNC(FUNC) FUNC##_
196 #define BLASFUNC(FUNC) FUNC
199 #undef USE_PTHREAD_LOCK
200 #undef USE_PTHREAD_SPINLOCK
202 #if defined(USE_PTHREAD_LOCK) && defined(USE_PTHREAD_SPINLOCK)
203 #error "You can't specify both LOCK operation!"
206 #if defined(SMP) || defined(USE_LOCKING)
207 #define USE_PTHREAD_LOCK
208 #undef USE_PTHREAD_SPINLOCK
212 #undef USE_PTHREAD_LOCK
213 #undef USE_PTHREAD_SPINLOCK
216 #if defined(USE_PTHREAD_LOCK)
217 #define LOCK_COMMAND(x) pthread_mutex_lock(x)
218 #define UNLOCK_COMMAND(x) pthread_mutex_unlock(x)
219 #elif defined(USE_PTHREAD_SPINLOCK)
221 typedef volatile int pthread_spinlock_t;
222 int pthread_spin_lock (pthread_spinlock_t *__lock);
223 int pthread_spin_unlock (pthread_spinlock_t *__lock);
225 #define LOCK_COMMAND(x) pthread_spin_lock(x)
226 #define UNLOCK_COMMAND(x) pthread_spin_unlock(x)
228 #define LOCK_COMMAND(x) blas_lock(x)
229 #define UNLOCK_COMMAND(x) blas_unlock(x)
232 #define GOTO_SHMID 0x510510
245 #ifdef QUAD_PRECISION
249 #elif defined EXPRECISION
250 #define xdouble long double
252 #define xdouble double
255 #if defined(OS_WINDOWS) && defined(__64BIT__)
256 typedef long long BLASLONG;
257 typedef unsigned long long BLASULONG;
259 typedef long BLASLONG;
260 typedef unsigned long BLASULONG;
265 typedef uint16_t bfloat16;
266 #define BFLOAT16CONVERSION 1
270 typedef BLASLONG blasint;
271 #if defined(OS_WINDOWS) && defined(__64BIT__)
272 #define blasabs(x) llabs(x)
274 #define blasabs(x) labs(x)
278 #define blasabs(x) abs(x)
291 #define FLOAT xdouble
292 #ifdef QUAD_PRECISION
293 #define XFLOAT xidouble
295 #ifdef QUAD_PRECISION
298 #define ZBASE_SHIFT 6
302 #define ZBASE_SHIFT 5
304 #elif defined(DOUBLE)
308 #define ZBASE_SHIFT 4
309 #elif defined(BFLOAT16)
310 #define IFLOAT bfloat16
311 #define XFLOAT IFLOAT
315 #define ZBASE_SHIFT 2
320 #define ZBASE_SHIFT 3
/* Split a value into a rounded upper 16-bit part and a remainder:
   Address_H(x) is (x + 0x8000) >> 16, and Address_L(x) is what is left over,
   so that (Address_H(x) << 16) + Address_L(x) == x. */
#define Address_H(x) (((x) + 0x8000) >> 16)
#define Address_L(x) ((x) - (Address_H(x) << 16))
341 #ifndef MAX_CPU_NUMBER
342 #define MAX_CPU_NUMBER 2
345 #if defined(OS_SUNOS)
346 #define YIELDING thr_yield()
349 #if defined(OS_WINDOWS)
350 #if defined(_MSC_VER) && !defined(__clang__)
351 #define YIELDING YieldProcessor()
353 #define YIELDING SwitchToThread()
357 #if defined(ARMV7) || defined(ARMV6) || defined(ARMV8) || defined(ARMV5)
/* Spin-wait filler: eight no-ops. Deliberately has no trailing ';' so that
   `YIELDING;` expands to exactly one statement and remains safe inside an
   unbraced if/else. */
#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop; \n")
/* Spin-wait filler: eight no-ops. The definition carries no trailing ';' so
   the caller's own `YIELDING;` is a single statement (safe in if/else). */
#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n")
368 #if defined(POWER8) || defined(POWER9) || defined(POWER10)
/* Spin-wait filler: eight no-ops. The definition carries no trailing ';' so
   the caller's own `YIELDING;` is a single statement (safe in if/else). */
#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n")
/* Spin-wait filler: eight no-ops. The definition carries no trailing ';' so
   the caller's own `YIELDING;` is a single statement (safe in if/else). */
#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n")
/* Spin-wait filler: eight no-ops. The definition carries no trailing ';' so
   the caller's own `YIELDING;` is a single statement (safe in if/else). */
#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n")
391 #define YIELDING sched_yield()
Whether to allocate job_t on the heap or on the stack.
Please see https://github.com/xianyi/OpenBLAS/issues/246
398 #if defined(OS_WINDOWS)
399 #define GETRF_MEM_ALLOC_THRESHOLD 32
400 #define BLAS3_MEM_ALLOC_THRESHOLD 32
403 #ifndef GETRF_MEM_ALLOC_THRESHOLD
404 #define GETRF_MEM_ALLOC_THRESHOLD 80
407 #ifndef BLAS3_MEM_ALLOC_THRESHOLD
408 #define BLAS3_MEM_ALLOC_THRESHOLD 32
411 #ifdef QUAD_PRECISION
412 #include "common_quad.h"
416 #include "common_alpha.h"
419 #if (defined(ARCH_X86) || defined(ARCH_X86_64)) && defined(__CET__) && defined(__has_include)
420 #if __has_include(<cet.h>)
429 #include "common_x86.h"
433 #include "common_x86_64.h"
437 #include "common_ia64.h"
441 #include "common_power.h"
445 #include "common_sparc.h"
449 #include "common_mips.h"
454 #include "common_riscv64.h"
458 #include "common_mips64.h"
462 #include "common_arm.h"
466 #include "common_arm64.h"
470 #include "common_zarch.h"
473 #ifdef ARCH_LOONGARCH64
474 #include "common_loongarch64.h"
478 #include "common_e2k.h"
482 #ifdef OS_WINDOWSSTORE
483 typedef char env_var_t[MAX_PATH];
484 #define readenv(p, n) 0
486 #if defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT)
487 typedef char env_var_t[MAX_PATH];
488 #define readenv(p, n) GetEnvironmentVariable((LPCTSTR)(n), (LPTSTR)(p), sizeof(p))
490 typedef char* env_var_t;
491 #define readenv(p, n) ((p)=getenv(n))
495 #if !defined(RPCC_DEFINED) && !defined(OS_WINDOWS)
496 #ifdef _POSIX_MONOTONIC_CLOCK
497 #if defined(__GLIBC_PREREQ) // cut the if condition if two lines, otherwise will fail at __GLIBC_PREREQ(2, 17)
498 #if __GLIBC_PREREQ(2, 17) // don't require -lrt
499 #define USE_MONOTONIC
501 #elif defined(OS_ANDROID)
502 #define USE_MONOTONIC
505 /* use similar scale as x86 rdtsc for timeouts to work correctly */
506 static inline unsigned long long rpcc(void){
509 clock_gettime(CLOCK_MONOTONIC, &ts);
510 return (unsigned long long)ts.tv_sec * 1000000000ull + ts.tv_nsec;
511 #elif !defined(OS_EMBEDDED)
513 gettimeofday(&tv,NULL);
514 return (unsigned long long)tv.tv_sec * 1000000000ull + tv.tv_usec * 1000;
521 #endif // !RPCC_DEFINED
523 #if !defined(BLAS_LOCK_DEFINED) && defined(__GNUC__)
524 static void __inline blas_lock(volatile BLASULONG *address){
527 while (*address) {YIELDING;};
529 } while (!__sync_bool_compare_and_swap(address, 0, 1));
531 #define BLAS_LOCK_DEFINED
535 #error "rpcc() implementation is missing for your platform"
537 #ifndef BLAS_LOCK_DEFINED
538 #error "blas_lock() implementation is missing for your platform"
543 #include "common_linux.h"
547 #define DTB_DEFAULT_ENTRIES 64
550 #define MMAP_ACCESS (PROT_READ | PROT_WRITE)
553 #define MMAP_POLICY (MAP_PRIVATE | MAP_ANON)
555 #define MMAP_POLICY (MAP_PRIVATE | MAP_ANONYMOUS)
/* C99 supports complex floating-point numbers natively, which GCC also offers
   as an extension since version 3.0. If neither is available, use a compatible
   structure as fallback (see Clause 6.2.5.13 of the C99 standard). */
562 #if ((defined(__STDC_IEC_559_COMPLEX__) || __STDC_VERSION__ >= 199901L || \
563 (__GNUC__ >= 3 && !defined(__cplusplus))) && !(defined(FORCE_OPENBLAS_COMPLEX_STRUCT))) && !defined(_MSC_VER)
564 #define OPENBLAS_COMPLEX_C99
568 typedef float _Complex openblas_complex_float;
569 typedef double _Complex openblas_complex_double;
570 typedef xdouble _Complex openblas_complex_xdouble;
571 #define openblas_make_complex_float(real, imag) ((real) + ((imag) * _Complex_I))
572 #define openblas_make_complex_double(real, imag) ((real) + ((imag) * _Complex_I))
573 #define openblas_make_complex_xdouble(real, imag) ((real) + ((imag) * _Complex_I))
575 #define OPENBLAS_COMPLEX_STRUCT
576 typedef struct { float real, imag; } openblas_complex_float;
577 typedef struct { double real, imag; } openblas_complex_double;
578 typedef struct { xdouble real, imag; } openblas_complex_xdouble;
579 #define openblas_make_complex_float(real, imag) {(real), (imag)}
580 #define openblas_make_complex_double(real, imag) {(real), (imag)}
581 #define openblas_make_complex_xdouble(real, imag) {(real), (imag)}
586 #include "common_param.h"
589 #define STDERR stderr
/* Round `a` up to the next multiple of `b`: add (b - 1), then clear the low
   bits selected by (b - 1). The result is only a multiple of `b` when `b` is a
   power of two. */
#define MASK(a, b) (~((b) - 1) & ((a) + ((b) - 1)))
596 #if defined(XDOUBLE) || defined(DOUBLE)
597 #define FLOATRET FLOAT
600 #define FLOATRET double
602 #define FLOATRET float
608 /* Inclusion of a standard header file is needed for definition of __STDC_*
609 predefined macros with some compilers (e.g. GCC 4.7 on Linux). This occurs
610 as a side effect of including either <features.h> or <stdc-predef.h>. */
615 #define OPENBLAS_COMPLEX_FLOAT openblas_complex_xdouble
616 #define OPENBLAS_MAKE_COMPLEX_FLOAT(r,i) openblas_make_complex_xdouble(r,i)
617 #elif defined(DOUBLE)
618 #define OPENBLAS_COMPLEX_FLOAT openblas_complex_double
619 #define OPENBLAS_MAKE_COMPLEX_FLOAT(r,i) openblas_make_complex_double(r,i)
621 #define OPENBLAS_COMPLEX_FLOAT openblas_complex_float
622 #define OPENBLAS_MAKE_COMPLEX_FLOAT(r,i) openblas_make_complex_float(r,i)
625 #if defined(C_PGI) || defined(C_SUN)
626 #if defined(__STDC_IEC_559_COMPLEX__)
627 #define CREAL(X) creal(X)
628 #define CIMAG(X) cimag(X)
630 #define CREAL(X) (*((FLOAT *)&X + 0))
631 #define CIMAG(X) (*((FLOAT *)&X + 1))
634 #ifdef OPENBLAS_COMPLEX_STRUCT
635 #define CREAL(Z) ((Z).real)
636 #define CIMAG(Z) ((Z).imag)
638 #define CREAL __real__
639 #define CIMAG __imag__
653 #if defined(C_GCC) && (( __GNUC__ <= 3) || ((__GNUC__ == 4) && (__GNUC_MINOR__ < 2)))
660 #define inline __inline
/* Smaller of two values (returns `a` on a tie). Arguments are fully
   parenthesized so operator expressions such as MIN(x & m, y) group as
   intended. Each argument may be evaluated twice; avoid side effects. */
#define MIN(a,b) ((a) > (b) ? (b) : (a))
/* Larger of two values (returns `a` on a tie). Arguments are fully
   parenthesized so operator expressions such as MAX(x | f, y) group as
   intended. Each argument may be evaluated twice; avoid side effects. */
#define MAX(a,b) ((a) < (b) ? (b) : (a))
/* In-place ASCII upper-casing of the lvalue `a`: only 'a'..'z' are shifted
   down by 0x20, every other value is left untouched (previously anything
   above 0x60, e.g. '{' or '~', was shifted too). `a` is evaluated more than
   once, so it must not have side effects. */
#define TOUPPER(a) {if ((a) >= 'a' && (a) <= 'z') (a) -= 0x20;}
675 #if defined(__FreeBSD__) || defined(__APPLE__)
676 #define MAP_ANONYMOUS MAP_ANON
679 /* Common Memory Management Routine */
680 void blas_set_parameter(void);
681 int blas_get_cpu_number(void);
682 void *blas_memory_alloc (int);
683 void blas_memory_free (void *);
684 void *blas_memory_alloc_nolock (int); //use malloc without blas_lock
685 void blas_memory_free_nolock (void *);
687 int get_num_procs (void);
689 #if defined(OS_LINUX) && defined(SMP) && !defined(NO_AFFINITY)
690 int get_num_nodes (void);
691 int get_num_proc (int);
692 int get_node_equal (void);
695 void goto_set_num_threads(int);
697 void gotoblas_affinity_init(void);
698 void gotoblas_affinity_quit(void);
699 void gotoblas_dynamic_init(void);
700 void gotoblas_dynamic_quit(void);
701 void gotoblas_profile_init(void);
702 void gotoblas_profile_quit(void);
704 int support_avx512(void);
709 int omp_in_parallel(void);
710 int omp_get_num_procs(void);
712 __declspec(dllimport) int __cdecl omp_in_parallel(void);
713 __declspec(dllimport) int __cdecl omp_get_num_procs(void);
717 #if defined(C_GCC) && ( __GNUC__ < 7)
718 // workaround for GCC bug 65467
720 #define _Atomic volatile
723 #include <stdatomic.h>
726 #define _Atomic volatile
732 int omp_in_parallel (void) __attribute__ ((weak));
733 int omp_get_num_procs(void) __attribute__ ((weak));
737 static __inline void blas_unlock(volatile BLASULONG *address){
742 #ifdef OS_WINDOWSSTORE
743 static __inline int readenv_atoi(char *env) {
748 static __inline int readenv_atoi(char *env) {
750 return readenv(p,env) ? 0 : atoi(p);
753 static __inline int readenv_atoi(char *env) {
755 if (( p = getenv(env) ))
763 #if !defined(XDOUBLE) || !defined(QUAD_PRECISION)
765 static __inline void compinv(FLOAT *b, FLOAT ar, FLOAT ai){
772 (fabsl(ar)) >= (fabsl(ai))
774 (fabs (ar)) >= (fabs (ai))
776 (fabsf(ar)) >= (fabsf(ai))
780 den = (FLOAT)(ONE / (ar * (ONE + ratio * ratio)));
785 den = (FLOAT)(ONE /(ai * (ONE + ratio * ratio)));
800 void *blas_debug_alloc(int);
801 void *blas_debug_free(void *);
804 #define malloc(a) blas_debug_alloc(a)
805 #define free(a) blas_debug_free (a)
809 #define GEMMRETTYPE int
819 #define GEMMRETTYPE copyoverhead_t
824 #define KNAME(A, B) A
826 #define KNAME(A, B) A##B
829 #include "common_interface.h"
831 #include "common_reference.h"
833 #include "common_macro.h"
834 #include "common_level1.h"
835 #include "common_level2.h"
836 #include "common_level3.h"
837 #include "common_lapack.h"
840 # define OPENBLAS_CONST /* see comment in cblas.h */
845 #include "common_stackalloc.h"
850 #if defined(SMP_SERVER) && defined(SMP_ONDEMAND)
851 #error Both SMP_SERVER and SMP_ONDEMAND are specified.
854 #if defined(SMP_SERVER) || defined(SMP_ONDEMAND)
855 #include "common_thread.h"
862 #ifndef DEFAULT_CPU_NUMBER
863 #define DEFAULT_CPU_NUMBER 4
874 #if !defined(ASSEMBLER) && defined(FUNCTION_PROFILE)
878 unsigned long long calls, fops, area, cycles, tcycles;
881 extern func_profile_t function_profile_table[];
882 extern int gotoblas_profile;
885 #define NUMOPT QNUMOPT
887 #define NUMOPT DNUMOPT
889 #define NUMOPT SNUMOPT
892 #define FUNCTION_PROFILE_START() { unsigned long long profile_start = rpcc(), profile_end;
894 #define FUNCTION_PROFILE_END(COMP, AREA, OPS) \
895 if (gotoblas_profile) { \
896 profile_end = rpcc(); \
897 function_profile_table[PROFILE_FUNC_NAME].calls ++; \
898 function_profile_table[PROFILE_FUNC_NAME].area += SIZE * COMPSIZE * (AREA); \
899 function_profile_table[PROFILE_FUNC_NAME].fops += (COMP) * (OPS) / NUMOPT; \
900 function_profile_table[PROFILE_FUNC_NAME].cycles += (profile_end - profile_start); \
901 function_profile_table[PROFILE_FUNC_NAME].tcycles += blas_cpu_number * (profile_end - profile_start); \
905 #define FUNCTION_PROFILE_END(COMP, AREA, OPS) \
906 if (gotoblas_profile) { \
907 profile_end = rpcc(); \
908 function_profile_table[PROFILE_FUNC_NAME].calls ++; \
909 function_profile_table[PROFILE_FUNC_NAME].area += SIZE * COMPSIZE * (AREA); \
910 function_profile_table[PROFILE_FUNC_NAME].fops += (COMP) * (OPS) / NUMOPT; \
911 function_profile_table[PROFILE_FUNC_NAME].cycles += (profile_end - profile_start); \
912 function_profile_table[PROFILE_FUNC_NAME].tcycles += (profile_end - profile_start); \
918 #define FUNCTION_PROFILE_START()
919 #define FUNCTION_PROFILE_END(COMP, AREA, OPS)
923 #define PRINT_DEBUG_CNAME
924 #define PRINT_DEBUG_NAME
/* When readenv_atoi("GOTO_DEBUG") is non-zero, print the current routine name
   (CHAR_CNAME / CHAR_NAME, supplied by the including file) to stderr.
   Wrapped in do { } while (0) so that `PRINT_DEBUG_NAME;` is a single
   statement and cannot capture a following `else`. */
#define PRINT_DEBUG_CNAME do { if (readenv_atoi("GOTO_DEBUG")) fprintf(stderr, "GotoBLAS : %s\n", CHAR_CNAME); } while (0)
#define PRINT_DEBUG_NAME do { if (readenv_atoi("GOTO_DEBUG")) fprintf(stderr, "GotoBLAS : %s\n", CHAR_NAME); } while (0)
933 #endif /* __cplusplus */