1 /*****************************************************************************
2 Copyright (c) 2011-2014, The OpenBLAS Project
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions are
9 1. Redistributions of source code must retain the above copyright
10 notice, this list of conditions and the following disclaimer.
12 2. Redistributions in binary form must reproduce the above copyright
13 notice, this list of conditions and the following disclaimer in
14 the documentation and/or other materials provided with the
16 3. Neither the name of the OpenBLAS project nor the names of
17 its contributors may be used to endorse or promote products
18 derived from this software without specific prior written
21 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
25 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
27 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
28 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
29 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
30 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 **********************************************************************************/
34 /*********************************************************************/
35 /* Copyright 2009, 2010 The University of Texas at Austin. */
36 /* All rights reserved. */
38 /* Redistribution and use in source and binary forms, with or */
39 /* without modification, are permitted provided that the following */
40 /* conditions are met: */
42 /* 1. Redistributions of source code must retain the above */
43 /* copyright notice, this list of conditions and the following */
46 /* 2. Redistributions in binary form must reproduce the above */
47 /* copyright notice, this list of conditions and the following */
48 /* disclaimer in the documentation and/or other materials */
49 /* provided with the distribution. */
51 /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
52 /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
53 /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
54 /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
55 /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
56 /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
57 /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
58 /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
59 /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
60 /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
61 /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
62 /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
63 /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
64 /* POSSIBILITY OF SUCH DAMAGE. */
66 /* The views and conclusions contained in the software and */
67 /* documentation are those of the authors and should not be */
68 /* interpreted as representing official policies, either expressed */
69 /* or implied, of The University of Texas at Austin. */
70 /*********************************************************************/
77 #if defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT)
79 #ifndef MEM_LARGE_PAGES
80 #define MEM_LARGE_PAGES 0x20000000
91 #if !defined(OS_WINDOWS) || defined(OS_CYGWIN_NT)
99 #include <sys/types.h>
102 #include <sys/sysinfo.h>
105 #include <linux/unistd.h>
106 #include <sys/syscall.h>
107 #include <sys/time.h>
108 #include <sys/resource.h>
115 #if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN)
116 #include <sys/sysctl.h>
117 #include <sys/resource.h>
120 #if defined(OS_WINDOWS) && (defined(__MINGW32__) || defined(__MINGW64__))
123 #define printf _cprintf
128 #ifndef MPOL_PREFERRED
129 #define MPOL_PREFERRED 1
134 #if (defined(PPC440) || !defined(OS_LINUX) || defined(HPL)) && !defined(NO_WARMUP)
139 #define SHM_HUGETLB 04000
142 #ifndef FIXED_PAGESIZE
143 #define FIXED_PAGESIZE 4096
146 #ifndef BUFFERS_PER_THREAD
147 #ifdef USE_OPENMP_UNUSED
148 #define BUFFERS_PER_THREAD (MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER)
150 #define BUFFERS_PER_THREAD NUM_BUFFERS
154 #define BITMASK(a, b, c) ((((a) >> (b)) & (c)))
/* CONSTRUCTOR / DESTRUCTOR are the decorations applied to gotoblas_init()
   and gotoblas_quit() further down in this file. The MSVC (non-clang) branch
   only sets the calling convention; the other branches use GCC-style
   constructor / destructor attributes, with an explicit priority of 101 when
   INIT_PRIORITY is nonzero and the compiler version is recent enough. */
156 #if defined(_MSC_VER) && !defined(__clang__)
157 #define CONSTRUCTOR __cdecl
158 #define DESTRUCTOR __cdecl
159 #elif (defined(OS_DARWIN) || defined(OS_SUNOS)) && defined(C_GCC)
160 #define CONSTRUCTOR __attribute__ ((constructor))
161 #define DESTRUCTOR __attribute__ ((destructor))
162 #elif __GNUC__ && INIT_PRIORITY && ((GCC_VERSION >= 40300) || (CLANG_VERSION >= 20900))
163 #define CONSTRUCTOR __attribute__ ((constructor(101)))
164 #define DESTRUCTOR __attribute__ ((destructor(101)))
166 #define CONSTRUCTOR __attribute__ ((constructor))
167 #define DESTRUCTOR __attribute__ ((destructor))
171 gotoblas_t *gotoblas = NULL;
173 extern void openblas_warning(int verbose, const char * msg);
177 #define blas_cpu_number 1
178 #define blas_num_threads 1
181 int goto_get_num_procs (void) { return 1;};
182 void goto_set_num_threads(int num_threads) {};
186 #if defined(OS_LINUX) || defined(OS_SUNOS) || defined(OS_NETBSD)
/* Returns the number of processors this process may use.
   The count starts from sysconf(_SC_NPROCESSORS_CONF); in the glibc-specific
   branches it is then narrowed using the scheduler affinity mask obtained
   from sched_getaffinity(). */
188 int get_num_procs(void);
190 int get_num_procs(void) {
197 if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF); /* query only while nums is still zero */
198 #if !defined(OS_LINUX)
202 #if !defined(__GLIBC_PREREQ)
205 #if !__GLIBC_PREREQ(2, 3)
209 #if !__GLIBC_PREREQ(2, 7)
210 ret = sched_getaffinity(0,sizeof(cpu_set_t), cpusetp);
211 if (ret!=0) return nums; /* affinity unavailable: keep the sysconf value */
213 #if !__GLIBC_PREREQ(2, 6)
215 if (CPU_ISSET(i,cpusetp)) n++; /* count the CPUs present in the mask by hand */
218 nums = CPU_COUNT(sizeof(cpu_set_t),cpusetp);
/* Variant using a dynamically allocated CPU set (CPU_ALLOC / CPU_COUNT_S). */
222 cpusetp = CPU_ALLOC(nums);
223 if (cpusetp == NULL) return nums;
224 size = CPU_ALLOC_SIZE(nums);
225 ret = sched_getaffinity(0,size,cpusetp);
226 if (ret!=0) return nums; /* NOTE(review): cpusetp from CPU_ALLOC is not released on this early return - confirm */
227 ret = CPU_COUNT_S(size,cpusetp);
228 if (ret > 0 && ret < nums) nums = ret; /* accept the mask count only if it is positive and smaller */
/* Alternative get_num_procs() definitions (presumably each selected by its
   own platform guard): both take the processor count from
   sysconf(_SC_NPROCESSORS_CONF), queried only while nums is still zero. */
238 int get_num_procs(void) {
240 if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
246 int get_num_procs(void) {
248 if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
/* Windows variant: takes the processor count from GetSystemInfo(). */
255 int get_num_procs(void) {
263 GetSystemInfo(&sysinfo);
265 nums = sysinfo.dwNumberOfProcessors;
/* BSD variant: reads the processor count into nums with sysctl(), using the
   two-element MIB held in m. */
273 #if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY)
275 int get_num_procs(void) {
286 sysctl(m, 2, &nums, &len, NULL, 0); /* return value is not checked */
/* Darwin variant: reads the "hw.physicalcpu" sysctl into nums. */
294 #if defined(OS_DARWIN)
295 int get_num_procs(void) {
300 sysctlbyname("hw.physicalcpu", &nums, &len, NULL, 0); /* return value is not checked */
/* Raises the soft stack-size limit (RLIMIT_STACK) of the process to limitMB
   megabytes when the current soft limit is lower. An error message with the
   result code is written to stderr (presumably when setrlimit() fails).
   limitMB: requested stack size in units of 1024*1024 bytes. */
305 void set_stack_limit(int limitMB){
310 StackSize=limitMB*1024*1024; /* NOTE(review): evaluated in int; overflows for limitMB >= 2048 when int is 32 bits */
311 result=getrlimit(RLIMIT_STACK, &rl);
313 if(rl.rlim_cur < StackSize){ /* only ever raise the soft limit, never lower it */
314 rl.rlim_cur=StackSize;
315 result=setrlimit(RLIMIT_STACK, &rl);
317 fprintf(stderr, "OpenBLAS: set stack limit error =%d\n", result);
327 OpenBLAS uses the number of CPU cores for multithreading.
328 It can be set by openblas_set_num_threads(int num_threads);
330 int blas_cpu_number = 0;
332 The number of threads in the thread pool.
333 This value is equal to or larger than blas_cpu_number, which means some threads may be sleeping.
335 int blas_num_threads = 0;
/* Returns the current value of blas_cpu_number. */
337 int goto_get_num_procs (void) {
338 return blas_cpu_number;
341 static void blas_memory_init();
/* Installs the fork handlers used by OpenBLAS. In builds where the guard
   below holds, pthread_atfork() is asked to run blas_thread_shutdown before
   fork() and blas_memory_init in the child; no parent-side handler is
   registered. A warning is emitted through openblas_warning() when the
   handler cannot be installed. */
343 void openblas_fork_handler()
345 // This handler shuts down the OpenBLAS-managed PTHREAD pool when OpenBLAS is
346 // built with "make USE_OPENMP=0".
347 // Hanging can still happen when OpenBLAS is built against the libgomp
348 // implementation of OpenMP. The problem is tracked at:
349 // http://gcc.gnu.org/bugzilla/show_bug.cgi?id=60035
350 // In the meantime, build with USE_OPENMP=0 or link against another
351 // implementation of OpenMP.
352 #if !((defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT)) || defined(OS_ANDROID)) && defined(SMP_SERVER)
354 err = pthread_atfork ((void (*)(void)) BLASFUNC(blas_thread_shutdown), NULL, blas_memory_init);
356 openblas_warning(0, "OpenBLAS Warning ... cannot install fork handler. You may meet hang after fork.\n");
360 extern int openblas_num_threads_env();
361 extern int openblas_goto_num_threads_env();
362 extern int openblas_omp_num_threads_env();
/* Selects the number of threads OpenBLAS will use.
 *
 * A previously computed, nonzero blas_num_threads is returned unchanged.
 * Otherwise the value is taken from openblas_num_threads_env(), then
 * openblas_goto_num_threads_env(), then openblas_omp_num_threads_env(), and
 * finally defaults to MAX_CPU_NUMBER. The result is capped at
 * get_num_procs() on the platforms named in the guards and at
 * MAX_CPU_NUMBER, then stored in both blas_num_threads and blas_cpu_number.
 *
 * Returns the selected thread count. */
364 int blas_get_cpu_number(void){
365 #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
368 int blas_goto_num = 0;
369 int blas_omp_num = 0;
371 if (blas_num_threads) return blas_num_threads; /* already decided earlier */
373 #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
374 max_num = get_num_procs();
377 // blas_goto_num = 0;
378 #ifndef USE_OPENMP_UNUSED
379 blas_goto_num=openblas_num_threads_env();
380 if (blas_goto_num < 0) blas_goto_num = 0; /* negative values are treated as "not set" */
382 if (blas_goto_num == 0) {
383 blas_goto_num=openblas_goto_num_threads_env();
384 if (blas_goto_num < 0) blas_goto_num = 0;
390 blas_omp_num=openblas_omp_num_threads_env();
391 if (blas_omp_num < 0) blas_omp_num = 0;
/* Precedence: OpenBLAS/GotoBLAS setting, then the OpenMP setting, then the build-time maximum. */
393 if (blas_goto_num > 0) blas_num_threads = blas_goto_num;
394 else if (blas_omp_num > 0) blas_num_threads = blas_omp_num;
395 else blas_num_threads = MAX_CPU_NUMBER;
397 #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
398 if (blas_num_threads > max_num) blas_num_threads = max_num; /* never exceed the detected processor count */
401 if (blas_num_threads > MAX_CPU_NUMBER) blas_num_threads = MAX_CPU_NUMBER;
404 printf( "Adjusted number of threads : %3d\n", blas_num_threads);
407 blas_cpu_number = blas_num_threads;
409 return blas_num_threads;
/* Public accessor for the processor count; the path shown here forwards to
   get_num_procs(). */
414 int openblas_get_num_procs(void) {
418 return get_num_procs();
/* Public accessor for the thread count: calls blas_get_cpu_number() so that
   blas_cpu_number is initialized, then returns blas_cpu_number. */
422 int openblas_get_num_threads(void) {
426 // init blas_cpu_number if needed
427 blas_get_cpu_number();
428 return blas_cpu_number;
432 int hugetlb_allocated = 0;
/* THREAD_LOCAL is the storage-class specifier used for per-thread variables
   and LIKELY_ONE(x) wraps a condition that is expected to be true. With
   OS_WINDOWS defined, __declspec(thread) is used and LIKELY_ONE is a plain
   pass-through; the other pair uses __thread and __builtin_expect. */
434 #if defined(OS_WINDOWS)
435 #define THREAD_LOCAL __declspec(thread)
436 #define LIKELY_ONE(x) (x)
438 #define THREAD_LOCAL __thread
439 #define LIKELY_ONE(x) (__builtin_expect(x, 1))
442 /* Stores information about the allocation and how to release it */
444 /* Whether this allocation is being used */
446 /* Any special attributes needed when releasing this allocation */
448 /* Function that can properly release this memory */
449 void (*release_func)(struct alloc_t *);
450 /* Pad to 64-byte alignment */
451 char pad[64 - 2 * sizeof(int) - sizeof(void(*))];
454 /* Convenience macros for storing release funcs */
455 #define STORE_RELEASE_FUNC(address, func) \
456 if (address != (void *)-1) { \
457 struct alloc_t *alloc_info = (struct alloc_t *)address; \
458 alloc_info->release_func = func; \
461 #define STORE_RELEASE_FUNC_WITH_ATTR(address, func, attr) \
462 if (address != (void *)-1) { \
463 struct alloc_t *alloc_info = (struct alloc_t *)address; \
464 alloc_info->release_func = func; \
465 alloc_info->attr = attr; \
468 /* The number of bytes that will be allocated for each buffer. When allocating
469 memory, we store an alloc_t followed by the actual buffer memory. This means
470 that each allocation always has its associated alloc_t, without the need
471 for an auxiliary tracking structure. */
472 static const int allocation_block_size = BUFFER_SIZE + sizeof(struct alloc_t);
474 /* Clang supports TLS from version 2.8 */
/* NOTE(review): && binds tighter than ||, so the __clang_minor__ tests below
   are evaluated independently of defined(__clang__) and of the major version.
   The stated intent looks like
   "major > 2 || (major == 2 && minor >= 8)" - confirm before changing. */
475 #if defined(__clang__) && __clang_major__ > 2 || \
476 (__clang_minor__ == 2 || __clang_minor__ == 8)
477 #define HAS_COMPILER_TLS
480 /* GCC supports TLS from version 4.1 */
481 #if !defined(__clang__) && defined(__GNUC__) && \
482 (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 1))
483 #define HAS_COMPILER_TLS
486 /* MSVC supports TLS from version 2005 */
487 #if defined(_MSC_VER) && _MSC_VER >= 1400
488 #define HAS_COMPILER_TLS
491 /* Versions of XCode before 8 did not properly support TLS */
492 #if defined(__apple_build_version__) && __apple_build_version__ < 8000042
493 #undef HAS_COMPILER_TLS
496 /* Android NDK's before version 12b did not support TLS */
497 #if defined(__ANDROID__) && defined(__clang__)
498 #if __has_include(<android/ndk-version.h>)
499 #include <android/ndk-version.h>
501 #if defined(__ANDROID__) && defined(__clang__) && defined(__NDK_MAJOR__) && \
502 defined(__NDK_MINOR__) && \
503 ((__NDK_MAJOR__ < 12) || ((__NDK_MAJOR__ == 12) && (__NDK_MINOR__ < 1)))
504 #undef HAS_COMPILER_TLS
508 /* Holds pointers to allocated memory */
509 #if defined(SMP) && !defined(USE_OPENMP_UNUSED)
510 /* This is the number of threads that can be spawned by the server, which is the
511 server plus the number of threads in the thread pool */
512 # define MAX_ALLOCATING_THREADS MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER * 2
513 static int next_memory_table_pos = 0;
514 # if defined(HAS_COMPILER_TLS)
515 /* Use compiler generated thread-local-storage */
516 static int THREAD_LOCAL local_memory_table_pos = 0;
518 /* Use system-dependent thread-local-storage */
519 # if defined(OS_WINDOWS)
520 static DWORD local_storage_key;
522 static pthread_key_t local_storage_key;
523 # endif /* defined(OS_WINDOWS) */
524 # endif /* defined(HAS_COMPILER_TLS) */
526 /* There is only one allocating thread when in single-threaded mode and when using OpenMP */
527 # define MAX_ALLOCATING_THREADS 1
528 #endif /* defined(SMP) && !defined(USE_OPENMP) */
529 static struct alloc_t * local_memory_table[MAX_ALLOCATING_THREADS][BUFFERS_PER_THREAD];
531 #if defined(OS_LINUX) && !defined(NO_WARMUP)
532 static int hot_alloc = 0;
535 /* Global lock for memory allocation */
537 #if defined(USE_PTHREAD_LOCK)
538 static pthread_mutex_t alloc_lock = PTHREAD_MUTEX_INITIALIZER;
539 #elif defined(USE_PTHREAD_SPINLOCK)
540 static pthread_spinlock_t alloc_lock = 0;
542 static BLASULONG alloc_lock = 0UL;
545 /* Returns a pointer to the start of the per-thread memory allocation data */
/* In SMP builds (without USE_OPENMP_UNUSED) each thread is lazily given a row
   of local_memory_table: the row index is kept in thread-local storage
   (compiler TLS, or TlsGetValue/pthread_getspecific as a fallback) and is
   handed out from next_memory_table_pos under alloc_lock. Otherwise row 0 is
   always returned. */
546 static __inline struct alloc_t ** get_memory_table() {
547 #if defined(SMP) && !defined(USE_OPENMP_UNUSED)
548 # if !defined(HAS_COMPILER_TLS)
549 # if defined(OS_WINDOWS)
550 int local_memory_table_pos = (int)::TlsGetValue(local_storage_key);
552 int local_memory_table_pos = (int)pthread_getspecific(local_storage_key);
553 # endif /* defined(OS_WINDOWS) */
554 # endif /* !defined(HAS_COMPILER_TLS) */
/* NOTE(review): 0 doubles as "no row assigned yet" and as a valid row index
   (next_memory_table_pos starts at 0), so a thread that was handed row 0
   passes this test again on its next call and is given another row. */
555 if (!local_memory_table_pos) {
556 LOCK_COMMAND(&alloc_lock);
557 local_memory_table_pos = next_memory_table_pos++;
/* NOTE(review): only a message is printed here; the index is still used below
   even when it is beyond MAX_ALLOCATING_THREADS - confirm intended handling. */
558 if (next_memory_table_pos > MAX_ALLOCATING_THREADS)
559 printf("OpenBLAS : Program will terminate because you tried to start too many threads.\n");
560 UNLOCK_COMMAND(&alloc_lock);
561 # if !defined(HAS_COMPILER_TLS)
562 # if defined(OS_WINDOWS)
563 ::TlsSetValue(local_storage_key, (void*)local_memory_table_pos);
565 pthread_setspecific(local_storage_key, (void*)local_memory_table_pos);
566 # endif /* defined(OS_WINDOWS) */
567 # endif /* !defined(HAS_COMPILER_TLS) */
569 return local_memory_table[local_memory_table_pos];
571 return local_memory_table[0]; /* single allocating thread: always row 0 */
572 #endif /* defined(SMP) && !defined(USE_OPENMP) */
/* Release function stored by alloc_mmap(): unmaps the allocation_block_size
   bytes starting at alloc_info and prints a message if munmap() fails. */
577 static void alloc_mmap_free(struct alloc_t *alloc_info){
579 if (munmap(alloc_info, allocation_block_size)) {
580 printf("OpenBLAS : munmap failed\n");
/* Maps one allocation block (allocation_block_size bytes) with mmap().
   Two calls are shown: one that forces placement at `address` with
   MAP_FIXED and one that passes `address` without MAP_FIXED. On success
   alloc_mmap_free is recorded in the block header as the release function
   and a preferred-node policy is requested through my_mbind().
   address: requested placement for the mapping. */
588 static void *alloc_mmap(void *address){
592 map_address = mmap(address,
593 allocation_block_size,
594 MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0);
596 map_address = mmap(address,
597 allocation_block_size,
598 MMAP_ACCESS, MMAP_POLICY, -1, 0);
601 STORE_RELEASE_FUNC(map_address, alloc_mmap_free); /* skipped by the macro when mmap returned (void *)-1 */
604 my_mbind(map_address, allocation_block_size, MPOL_PREFERRED, NULL, 0, 0);
612 #define BENCH_ITERATION 4
/* Times a pointer chase through the region [address, address + size).
   The region is expected to already hold a chain of pointers (the caller in
   this file writes one link per PAGESIZE step); the word in the last page is
   temporarily redirected to `address` so the chain forms a loop, the chase is
   repeated BENCH_ITERATION times and the smallest elapsed time is kept in
   min. The overwritten word is restored before returning.
   address: start of the region, as an integer.
   size:    length of the region in bytes. */
615 static inline BLASULONG run_bench(BLASULONG address, BLASULONG size) {
617 BLASULONG original, *p;
618 BLASULONG start, stop, min;
623 original = *(BLASULONG *)(address + size - PAGESIZE); /* remember the word we are about to overwrite */
625 *(BLASULONG *)(address + size - PAGESIZE) = (BLASULONG)address; /* close the chain into a loop */
627 for (iter = 0; iter < BENCH_ITERATION; iter ++ ) {
629 p = (BLASULONG *)address;
631 count = size / PAGESIZE;
635 for (i = 0; i < count; i ++) {
636 p = (BLASULONG *)(*p); /* follow one link */
641 if (min > stop - start) min = stop - start;
644 *(BLASULONG *)(address + size - PAGESIZE + 0) = original;
645 *(BLASULONG *)(address + size - PAGESIZE + 8) = (BLASULONG)p; /* presumably stores p so the chase is not optimized away - confirm */
/* Benchmarking variant of alloc_mmap().
   Besides the plain mmap() paths, this version can map a region SCALING
   times larger than one allocation block, link its pages into a pointer
   chain, time successive candidate windows with run_bench(), keep the window
   with the smallest measured time and unmap the parts before and after it.
   alloc_mmap_free is recorded in the block header as the release function.
   address: requested placement, used by the MAP_FIXED path. */
650 static void *alloc_mmap(void *address){
651 void *map_address, *best_address;
652 BLASULONG best, start, current, original;
656 /* Simple path: skip the benchmark-based placement */
657 map_address = mmap(address, allocation_block_size, MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0);
660 my_mbind(map_address, allocation_block_size, MPOL_PREFERRED, NULL, 0, 0);
664 #if defined(OS_LINUX) && !defined(NO_WARMUP)
665 if (hot_alloc == 0) {
666 map_address = mmap(NULL, allocation_block_size, MMAP_ACCESS, MMAP_POLICY, -1, 0);
669 my_mbind(map_address, allocation_block_size, MPOL_PREFERRED, NULL, 0, 0);
/* Oversized mapping that the benchmark search works on. */
675 map_address = mmap(NULL, allocation_block_size * SCALING,
676 MMAP_ACCESS, MMAP_POLICY, -1, 0);
678 if (map_address != (void *)-1) {
683 ret=my_mbind(map_address, allocation_block_size * SCALING, MPOL_PREFERRED, NULL, 0, 0);
686 perror("OpenBLAS alloc_mmap:");
687 printf("error code=%d,\tmap_address=%lx\n",errsv,map_address);
691 my_mbind(map_address, allocation_block_size * SCALING, MPOL_PREFERRED, NULL, 0, 0);
696 allocsize = DGEMM_P * DGEMM_Q * sizeof(double); /* size of each window that gets timed */
698 start = (BLASULONG)map_address;
699 current = (SCALING - 1) * allocation_block_size;
702 while(current > 0 && current <= original) {
703 *(BLASLONG *)start = (BLASLONG)start + PAGESIZE; /* each page points at the next page */
708 *(BLASLONG *)(start - PAGESIZE) = (BLASULONG)map_address; /* last link points back to the start */
710 start = (BLASULONG)map_address;
712 best = (BLASULONG)-1; /* largest possible value, so the first measurement always wins */
713 best_address = map_address;
715 while ((start + allocsize < (BLASULONG)map_address + (SCALING - 1) * allocation_block_size)) {
717 current = run_bench(start, allocsize);
719 if (best > current) {
721 best_address = (void *)start;
/* Trim the oversized mapping down to the chosen block. */
728 if ((BLASULONG)best_address > (BLASULONG)map_address)
729 munmap(map_address, (BLASULONG)best_address - (BLASULONG)map_address);
731 munmap((void *)((BLASULONG)best_address + allocation_block_size), (SCALING - 1) * allocation_block_size + (BLASULONG)map_address - (BLASULONG)best_address);
733 map_address = best_address;
735 #if defined(OS_LINUX) && !defined(NO_WARMUP)
740 #if defined(OS_LINUX) && !defined(NO_WARMUP)
744 STORE_RELEASE_FUNC(map_address, alloc_mmap_free);
756 static void alloc_malloc_free(struct alloc_t *alloc_info){
/* malloc()-based allocator: requests allocation_block_size + FIXED_PAGESIZE
   bytes, converts a NULL result into the (void *)-1 failure sentinel used by
   the allocator table, and records alloc_malloc_free as the release function
   on success. */
762 static void *alloc_malloc(void *address){
766 map_address = (void *)malloc(allocation_block_size + FIXED_PAGESIZE);
768 if (map_address == (void *)NULL) map_address = (void *)-1;
770 STORE_RELEASE_FUNC(map_address, alloc_malloc_free);
780 void *qalloc(int flags, size_t bytes);
781 void *qfree (void *address);
783 #define QNONCACHE 0x1
787 static void alloc_qalloc_free(struct alloc_t *alloc_info){
/* qalloc()-based allocator: requests allocation_block_size + FIXED_PAGESIZE
   bytes with QCOMMS | QFAST, maps a NULL result to (void *)-1, and returns
   the address rounded up to a FIXED_PAGESIZE boundary. */
793 static void *alloc_qalloc(void *address){
796 map_address = (void *)qalloc(QCOMMS | QFAST, allocation_block_size + FIXED_PAGESIZE);
798 if (map_address == (void *)NULL) map_address = (void *)-1;
/* NOTE(review): the release function is stored at map_address, but the value
   returned below is the rounded-up address, which may differ - confirm that
   callers see the header where they expect it. */
800 STORE_RELEASE_FUNC(map_address, alloc_qalloc_free);
802 return (void *)(((BLASULONG)map_address + FIXED_PAGESIZE - 1) & ~(FIXED_PAGESIZE - 1));
/* Release function stored by alloc_windows(): calls VirtualFree() with
   MEM_DECOMMIT on the allocation_block_size bytes starting at alloc_info. */
809 static void alloc_windows_free(struct alloc_t *alloc_info){
811 VirtualFree(alloc_info, allocation_block_size, MEM_DECOMMIT);
/* VirtualAlloc()-based allocator: reserves and commits allocation_block_size
   bytes at `address`, maps a NULL result to the (void *)-1 failure sentinel,
   and records alloc_windows_free as the release function on success. */
815 static void *alloc_windows(void *address){
818 map_address = VirtualAlloc(address,
819 allocation_block_size,
820 MEM_RESERVE | MEM_COMMIT,
823 if (map_address == (void *)NULL) map_address = (void *)-1;
825 STORE_RELEASE_FUNC(map_address, alloc_windows_free);
832 #ifdef ALLOC_DEVICEDRIVER
833 #ifndef DEVICEDRIVER_NAME
834 #define DEVICEDRIVER_NAME "/dev/mapper"
/* Release function stored by alloc_devicedirver(): unmaps the block and
   reports unmap / close failures. The attr field read here is the file
   descriptor that alloc_devicedirver() stored in the header. */
837 static void alloc_devicedirver_free(struct alloc_t *alloc_info){
839 int attr = alloc_info -> attr;
840 if (munmap(address, allocation_block_size)) {
841 printf("OpenBLAS : Bugphysarea unmap failed.\n");
845 printf("OpenBLAS : Bugphysarea close failed.\n");
/* Device-driver allocator: opens DEVICEDRIVER_NAME read/write with O_SYNC
   and maps allocation_block_size bytes of it as a shared, read/write file
   mapping. On success the release function and the file descriptor are
   stored in the block header. */
850 static void *alloc_devicedirver(void *address){
855 if ((fd = open(DEVICEDRIVER_NAME, O_RDWR | O_SYNC)) < 0) {
861 map_address = mmap(address, allocation_block_size,
862 PROT_READ | PROT_WRITE,
863 MAP_FILE | MAP_SHARED,
866 STORE_RELEASE_FUNC_WITH_ATTR(map_address, alloc_devicedirver_free, fd); /* fd is kept in attr for the release function */
/* Release function stored by alloc_shm(): detaches the shared-memory segment
   at alloc_info and prints a message if shmdt() fails. */
875 static void alloc_shm_free(struct alloc_t *alloc_info){
877 if (shmdt(alloc_info)) {
878 printf("OpenBLAS : Shared memory unmap failed.\n");
/* System V shared-memory allocator: creates a private segment of
   allocation_block_size bytes (mode 0600), attaches it at `address`, and on
   success requests a preferred-node policy, issues IPC_RMID on the segment,
   and stores the release function and the segment id in the block header. */
882 static void *alloc_shm(void *address){
886 shmid = shmget(IPC_PRIVATE, allocation_block_size,IPC_CREAT | 0600);
888 map_address = (void *)shmat(shmid, address, 0);
890 if (map_address != (void *)-1){
893 my_mbind(map_address, allocation_block_size, MPOL_PREFERRED, NULL, 0, 0);
896 shmctl(shmid, IPC_RMID, 0);
898 struct alloc_t *alloc_info = (struct alloc_t *)map_address; /* header lives at the start of the segment */
899 alloc_info->release_func = alloc_shm_free;
900 alloc_info->attr = shmid;
906 #if defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS
/* Release function stored by alloc_hugetlb(). Three platform paths are
   shown: shmdt() under OS_LINUX / OS_AIX, munmap() of the block, and
   VirtualFree() with MEM_LARGE_PAGES | MEM_DECOMMIT. */
908 static void alloc_hugetlb_free(struct alloc_t *alloc_info){
910 #if defined(OS_LINUX) || defined(OS_AIX)
911 if (shmdt(alloc_info)) {
912 printf("OpenBLAS : Hugepage unmap failed.\n");
918 munmap(alloc_info, allocation_block_size);
924 VirtualFree(alloc_info, allocation_block_size, MEM_LARGE_PAGES | MEM_DECOMMIT);
/* Large-page allocator. map_address starts as the (void *)-1 failure
   sentinel and is filled in by one of three platform paths; on success
   alloc_hugetlb_free is recorded in the block header as the release
   function.
   address: requested placement for the block. */
930 static void *alloc_hugetlb(void *address){
932 void *map_address = (void *)-1;
/* System V shared-memory path. */
934 #if defined(OS_LINUX) || defined(OS_AIX)
937 shmid = shmget(IPC_PRIVATE, allocation_block_size,
942 SHM_LGPAGE | SHM_PIN |
944 IPC_CREAT | SHM_R | SHM_W);
947 map_address = (void *)shmat(shmid, address, SHM_RND);
950 my_mbind(map_address, allocation_block_size, MPOL_PREFERRED, NULL, 0, 0);
953 if (map_address != (void *)-1){
954 shmctl(shmid, IPC_RMID, 0);
/* memcntl()/memalign() path: advises HUGE_PAGESIZE pages, then allocates a
   HUGE_PAGESIZE-aligned block. */
960 struct memcntl_mha mha;
962 mha.mha_cmd = MHA_MAPSIZE_BSSBRK;
964 mha.mha_pagesize = HUGE_PAGESIZE;
965 memcntl(NULL, 0, MC_HAT_ADVISE, (char *)&mha, 0, 0);
967 map_address = (BLASULONG)memalign(HUGE_PAGESIZE, allocation_block_size);
/* Windows path: enable SE_LOCK_MEMORY_NAME on the process token, allocate
   with MEM_LARGE_PAGES, then clear the privilege attribute again. */
975 if (OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES, &hToken) != TRUE) return (void *) -1;
977 tp.PrivilegeCount = 1;
978 tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED;
980 if (LookupPrivilegeValue(NULL, SE_LOCK_MEMORY_NAME, &tp.Privileges[0].Luid) != TRUE) {
985 if (AdjustTokenPrivileges(hToken, FALSE, &tp, 0, NULL, NULL) != TRUE) {
990 map_address = (void *)VirtualAlloc(address,
991 allocation_block_size,
992 MEM_LARGE_PAGES | MEM_RESERVE | MEM_COMMIT,
995 tp.Privileges[0].Attributes = 0;
996 AdjustTokenPrivileges(hToken, FALSE, &tp, 0, NULL, NULL);
998 if (map_address == (void *)NULL) map_address = (void *)-1; /* normalize failure to the -1 sentinel */
1002 STORE_RELEASE_FUNC(map_address, alloc_hugetlb_free);
1010 #ifdef ALLOC_HUGETLBFILE
1012 static int hugetlb_pid = 0;
/* Release function stored by alloc_hugetlbfile(): unmaps the block and
   reports unmap / close failures. The attr field read here is the file
   descriptor that alloc_hugetlbfile() stored in the header. */
1014 static void alloc_hugetlbfile_free(struct alloc_t *alloc_info){
1016 int attr = alloc_info -> attr;
1017 if (munmap(alloc_info, allocation_block_size)) {
1018 printf("OpenBLAS : HugeTLBfs unmap failed.\n");
1022 printf("OpenBLAS : HugeTLBfs close failed.\n");
/* File-backed large-page allocator: builds the name
   "<HUGETLB_FILE_NAME>/gotoblas.<pid>", opens (creating if needed) that file
   and maps allocation_block_size bytes of it read/write. On success the
   release function and the file descriptor are stored in the block header. */
1026 static void *alloc_hugetlbfile(void *address){
1028 void *map_address = (void *)-1;
1032 if (!hugetlb_pid) hugetlb_pid = getpid(); /* pid is looked up once and reused */
1034 sprintf(filename, "%s/gotoblas.%d", HUGETLB_FILE_NAME, hugetlb_pid); /* NOTE(review): unbounded sprintf - confirm filename is large enough */
1036 if ((fd = open(filename, O_RDWR | O_CREAT, 0700)) < 0) {
1042 map_address = mmap(address, allocation_block_size,
1043 PROT_READ | PROT_WRITE,
1047 STORE_RELEASE_FUNC_WITH_ATTR(map_address, alloc_hugetlbfile_free, fd);
1055 static BLASULONG base_address = 0UL;
1057 static BLASULONG base_address = BASE_ADDRESS;
1060 #if __STDC_VERSION__ >= 201112L
1061 static _Atomic int memory_initialized = 0;
1063 static volatile int memory_initialized = 0;
1066 /* Memory allocation routine */
1067 /* procpos ... indicates where it comes from */
1068 /* 0 : Level 3 functions */
1069 /* 1 : Level 2 functions */
/* Resets the allocator bookkeeping: in SMP builds the next free table row is
   set back to 0 and, when compiler TLS is unavailable, a TLS key is created
   (TlsAlloc / pthread_key_create); finally every entry of local_memory_table
   is cleared. This function is also passed to pthread_atfork() as the child
   handler in openblas_fork_handler(). */
1072 static void blas_memory_init(){
1073 #if defined(SMP) && !defined(USE_OPENMP_UNUSED)
1074 next_memory_table_pos = 0;
1075 # if !defined(HAS_COMPILER_TLS)
1076 # if defined(OS_WINDOWS)
1077 local_storage_key = ::TlsAlloc();
1079 pthread_key_create(&local_storage_key, NULL); /* no destructor; return value is not checked */
1080 # endif /* defined(OS_WINDOWS) */
1081 # endif /* defined(HAS_COMPILER_TLS) */
1082 #endif /* defined(SMP) && !defined(USE_OPENMP) */
1083 memset(local_memory_table, 0, sizeof(local_memory_table));
/* Hands out a work buffer for the calling thread.
 *
 * On first use the memory subsystem is initialized (under alloc_lock in SMP
 * builds without USE_OPENMP). The function then scans this thread's row of
 * the memory table for a slot that is empty or not marked used. A slot with
 * no memory behind it is populated by calling the allocators listed in
 * memoryalloc[] until one returns something other than (void *)-1. The slot
 * is marked used and the address just past its struct alloc_t header is
 * returned.
 *
 * procpos: origin of the request (see the comment preceding
 *          blas_memory_init). */
1086 void *blas_memory_alloc(int procpos){
1092 void *(*memoryalloc[])(void *address) = {
1093 #ifdef ALLOC_DEVICEDRIVER
1096 /* Hugetlb implicitly assumes ALLOC_SHM */
1100 #if ((defined ALLOC_SHM) && (defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS))
1109 #ifdef ALLOC_WINDOWS
1117 void *(**func)(void *address);
1118 struct alloc_t * alloc_info;
1119 struct alloc_t ** alloc_table;
/* One-time initialization: the flag is tested before and after taking the lock. */
1121 if (!LIKELY_ONE(memory_initialized)) {
1122 #if defined(SMP) && !defined(USE_OPENMP)
1123 /* Only allow a single thread to initialize memory system */
1124 LOCK_COMMAND(&alloc_lock);
1126 if (!memory_initialized) {
1130 gotoblas_dynamic_init();
1133 #if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY)
1134 gotoblas_affinity_init();
1138 if (!blas_num_threads) blas_cpu_number = blas_get_cpu_number();
1141 #if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64)
1142 #ifndef DYNAMIC_ARCH
1143 blas_set_parameter();
1147 memory_initialized = 1;
1149 #if defined(SMP) && !defined(USE_OPENMP)
1151 UNLOCK_COMMAND(&alloc_lock);
1156 printf("Alloc Start ...\n");
/* Look for a free slot in this thread's row. */
1160 alloc_table = get_memory_table();
1162 if (!alloc_table[position] || !alloc_table[position]->used) goto allocation;
1165 } while (position < BUFFERS_PER_THREAD);
1172 printf(" Position -> %d\n", position);
1175 alloc_info = alloc_table[position];
1179 printf("Allocation Start : %lx\n", base_address);
1182 map_address = (void *)-1;
1184 func = &memoryalloc[0];
/* NOTE(review): func points into memoryalloc[], so "func != NULL" is always
   true; "*func != NULL" may have been intended - confirm. */
1186 while ((func != NULL) && (map_address == (void *) -1)) {
1188 map_address = (*func)((void *)base_address);
1190 #ifdef ALLOC_DEVICEDRIVER
1191 if ((*func == alloc_devicedirver) && (map_address == (void *)-1)) {
1192 fprintf(stderr, "OpenBLAS Warning ... Physically contiguous allocation failed.\n");
1196 #ifdef ALLOC_HUGETLBFILE
1197 if ((*func == alloc_hugetlbfile) && (map_address == (void *)-1)) {
1199 fprintf(stderr, "OpenBLAS Warning ... HugeTLB(File) allocation failed.\n");
1204 #if (defined ALLOC_SHM) && (defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS)
1205 if ((*func == alloc_hugetlb) && (map_address != (void *)-1)) hugetlb_allocated = 1;
1212 printf(" Success -> %08lx\n", map_address);
1214 if (((BLASLONG) map_address) == -1) base_address = 0UL; /* after a failure, stop requesting a specific address */
1216 if (base_address) base_address += allocation_block_size + FIXED_PAGESIZE; /* next block goes after this one */
1218 } while ((BLASLONG)map_address == -1); /* retry until some allocator succeeds */
1220 alloc_table[position] = alloc_info = map_address;
1223 printf(" Mapping Succeeded. %p(%d)\n", (void *)alloc_info, position);
1228 printf("Mapped : %p %3d\n\n", (void *)alloc_info, position);
1231 alloc_info->used = 1;
1233 return (void *)(((char *)alloc_info) + sizeof(struct alloc_t)); /* caller gets the memory after the header */
1236 printf("OpenBLAS : Program will terminate because you tried to allocate too many memory regions.\n");
/* Returns a buffer obtained from blas_memory_alloc() to the pool. The
   struct alloc_t header located just before `buffer` is marked unused; no
   release function is called in the lines shown here. The remaining
   statements print the state of the calling thread's table (presumably
   debug or error output - confirm against the surrounding guards).
   buffer: pointer previously returned by blas_memory_alloc(). */
1241 void blas_memory_free(void *buffer){
1244 struct alloc_t ** alloc_table;
1246 /* Since we passed an offset pointer to the caller, get back to the actual allocation */
1247 struct alloc_t *alloc_info = (void *)(((char *)buffer) - sizeof(struct alloc_t));
1250 printf("Unmapped Start : %p ...\n", alloc_info);
1253 alloc_info->used = 0; /* slot becomes available for reuse */
1256 printf("Unmap Succeeded.\n\n");
1262 alloc_table = get_memory_table();
1263 for (position = 0; position < BUFFERS_PER_THREAD; position++){
1264 if (alloc_table[position]) {
1265 printf("%4ld %p : %d\n", position, alloc_table[position], alloc_table[position]->used);
/* Allocation path that bypasses the memory table: requests
   BUFFER_SIZE + FIXED_PAGESIZE bytes directly from malloc().
   unused: ignored, as the name indicates. */
1272 void *blas_memory_alloc_nolock(int unused) {
1274 map_address = (void *)malloc(BUFFER_SIZE + FIXED_PAGESIZE);
1278 void blas_memory_free_nolock(void * map_address) {
/* Tears down the allocator: shuts down the thread pool via
   blas_thread_shutdown, walks every row and slot of local_memory_table,
   releases each block through the release function stored in its header and
   clears the table entry, then resets base_address to BASE_ADDRESS. */
1282 void blas_shutdown(void){
1287 BLASFUNC(blas_thread_shutdown)();
1290 for (thread = 0; thread < MAX_ALLOCATING_THREADS; thread ++){
1291 for (pos = 0; pos < BUFFERS_PER_THREAD; pos ++){
1292 struct alloc_t *alloc_info = local_memory_table[thread][pos];
1294 alloc_info->release_func(alloc_info); /* each block knows how it must be released */
1295 local_memory_table[thread][pos] = (void *)0;
1303 base_address = BASE_ADDRESS;
1309 #if defined(OS_LINUX) && !defined(NO_WARMUP)
1312 #if defined(USE_PTHREAD_LOCK)
1313 static pthread_mutex_t init_lock = PTHREAD_MUTEX_INITIALIZER;
1314 #elif defined(USE_PTHREAD_SPINLOCK)
1315 static pthread_spinlock_t init_lock = 0;
1317 static BLASULONG init_lock = 0UL;
/* Warm-up routine with the signature of a BLAS queue routine (it is
   installed as queue[].routine in _init_thread_memory). It writes into the
   buffer that starts at sa + GEMM_OFFSET_A: first over a range of
   allocation_block_size - PAGESIZE bytes, guarded by init_lock when
   hot_alloc != 2, then over a range capped at L2_SIZE.
   sa: base of the buffer to touch; the other parameters are not referenced
   in the lines shown here. */
1321 static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n,
1322 void *sa, void *sb, BLASLONG pos) {
1324 #if !defined(ARCH_POWER) && !defined(ARCH_SPARC)
1329 size = allocation_block_size - PAGESIZE;
1330 buffer = (BLASULONG)sa + GEMM_OFFSET_A;
1332 #if defined(OS_LINUX) && !defined(NO_WARMUP)
1333 if (hot_alloc != 2) {
1337 LOCK_COMMAND(&init_lock);
1341 *(int *)buffer = size;
1347 UNLOCK_COMMAND(&init_lock);
1350 size = MIN((allocation_block_size - PAGESIZE), L2_SIZE);
1351 buffer = (BLASULONG)sa + GEMM_OFFSET_A;
1354 *(int *)buffer = size;
1359 #if defined(OS_LINUX) && !defined(NO_WARMUP)
/* Builds one queue entry per thread (blas_num_threads entries), each running
   _touch_memory, links them into a list and submits the list with
   exec_blas().
   buffer: stored as the sa argument of the first queue entry. */
1368 static void _init_thread_memory(void *buffer) {
1370 blas_queue_t queue[MAX_CPU_NUMBER];
1373 for (num_cpu = 0; num_cpu < blas_num_threads; num_cpu++) {
1375 blas_queue_init(&queue[num_cpu]);
1376 queue[num_cpu].mode = BLAS_DOUBLE | BLAS_REAL;
1377 queue[num_cpu].routine = &_touch_memory;
1378 queue[num_cpu].args = NULL;
1379 queue[num_cpu].next = &queue[num_cpu + 1];
1382 queue[num_cpu - 1].next = NULL; /* terminate the list at the last entry */
1383 queue[0].sa = buffer;
1385 exec_blas(num_cpu, queue);
/* Warm-up of the memory subsystem: takes one buffer from
   blas_memory_alloc(), makes sure the thread count and the thread server are
   set up, touches the buffer (through _init_thread_memory or directly
   through _touch_memory), and gives the buffer back. */
1390 static void gotoblas_memory_init(void) {
1396 buffer = (void *)blas_memory_alloc(0);
1399 if (blas_cpu_number == 0) blas_get_cpu_number();
1401 if (blas_server_avail == 0) blas_thread_init();
1404 _init_thread_memory((void *)((BLASULONG)buffer + GEMM_OFFSET_A));
1408 _touch_memory(NULL, NULL, NULL, (void *)((BLASULONG)buffer + GEMM_OFFSET_A), NULL, 0);
1412 blas_memory_free(buffer);
1416 /* Initialization for all functions; this function should be called before main(). */
1418 static int gotoblas_initialized = 0;
1419 extern void openblas_read_env();
/* Library initialization, decorated with CONSTRUCTOR. It does nothing when
   gotoblas_initialized is already set; otherwise it installs the fork
   handler, reads the environment settings, runs the dynamic-arch, affinity
   and memory warm-up initializers that the build enables, raises the soft
   stack limit to the hard limit, sets up the thread count and thread server,
   and finally sets gotoblas_initialized. */
1421 void CONSTRUCTOR gotoblas_init(void) {
1423 if (gotoblas_initialized) return;
1426 openblas_fork_handler();
1429 openblas_read_env();
1436 gotoblas_dynamic_init();
1439 #if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY)
1440 gotoblas_affinity_init();
1443 #if defined(OS_LINUX) && !defined(NO_WARMUP)
1444 gotoblas_memory_init();
1447 //#if defined(OS_LINUX)
1449 struct rlimit curlimit;
1450 if ( getrlimit(RLIMIT_STACK, &curlimit ) == 0 )
1452 if ( curlimit.rlim_cur != curlimit.rlim_max )
1454 curlimit.rlim_cur = curlimit.rlim_max; /* raise the soft stack limit to the hard limit */
1455 setrlimit(RLIMIT_STACK, &curlimit);
1461 if (blas_cpu_number == 0) blas_get_cpu_number();
1463 if (blas_server_avail == 0) blas_thread_init();
1467 #ifdef FUNCTION_PROFILE
1468 gotoblas_profile_init();
1471 gotoblas_initialized = 1;
/* Library teardown, decorated with DESTRUCTOR. It does nothing unless
   gotoblas_init() has set gotoblas_initialized; otherwise it runs the
   profile, affinity and dynamic-arch shutdown hooks that the build enables
   and clears gotoblas_initialized. */
1479 void DESTRUCTOR gotoblas_quit(void) {
1481 if (gotoblas_initialized == 0) return;
1489 #ifdef FUNCTION_PROFILE
1490 gotoblas_profile_quit();
1493 #if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY)
1494 gotoblas_affinity_quit();
1498 gotoblas_dynamic_quit();
1501 gotoblas_initialized = 0;
1508 #if defined(_MSC_VER) && !defined(__clang__)
/* DLL entry point for MSVC (non-clang) builds; dispatches on
   ul_reason_for_call for process/thread attach and detach notifications.
   Presumably this is where gotoblas_init()/gotoblas_quit() are invoked on
   this toolchain, since CONSTRUCTOR/DESTRUCTOR expand to __cdecl there -
   confirm against the case bodies. */
1509 BOOL APIENTRY DllMain(HMODULE hModule, DWORD ul_reason_for_call, LPVOID lpReserved)
1511 switch (ul_reason_for_call)
1513 case DLL_PROCESS_ATTACH:
1516 case DLL_THREAD_ATTACH:
1518 case DLL_THREAD_DETACH:
1520 case DLL_PROCESS_DETACH:
1530 This is to allow static linking.
1531 Code adapted from Google performance tools:
1532 https://gperftools.googlecode.com/git-history/perftools-1.0/src/windows/port.cc
1534 https://sourceware.org/ml/pthreads-win32/2008/msg00028.html
1535 http://ci.boost.org/svn-trac/browser/trunk/libs/thread/src/win32/tss_pe.cpp
1537 static int on_process_term(void)
1543 #pragma comment(linker, "/INCLUDE:_tls_used")
1545 #pragma comment(linker, "/INCLUDE:__tls_used")
1549 #pragma const_seg(".CRT$XLB")
1551 #pragma data_seg(".CRT$XLB")
1553 static void (APIENTRY *dll_callback)(HINSTANCE h, DWORD ul_reason_for_call, PVOID pv) = DllMain;
1561 #pragma const_seg(".CRT$XTU")
1563 #pragma data_seg(".CRT$XTU")
1565 static int(*p_process_term)(void) = on_process_term;
1573 #if (defined(C_PGI) || (!defined(C_SUN) && defined(F_INTERFACE_SUN))) && (defined(ARCH_X86) || defined(ARCH_X86_64))
1574 /* Don't call me; this is just work around for PGI / Sun bug */
/* Carrier for inline assembly only; it is not meant to be called. The first
   pair of statements places the addresses of gotoblas_init / gotoblas_quit in
   the .ctors / .dtors sections; the second pair emits calls to them in the
   .init / .fini sections. Each statement switches back to .text afterwards. */
1575 void gotoblas_dummy_for_PGI(void) {
1581 asm ("\t.section\t.ctors,\"aw\",@progbits; .align 8; .quad gotoblas_init; .section .text");
1582 asm ("\t.section\t.dtors,\"aw\",@progbits; .align 8; .quad gotoblas_quit; .section .text");
1584 asm (".section .init,\"ax\"; call gotoblas_init@PLT; .section .text");
1585 asm (".section .fini,\"ax\"; call gotoblas_quit@PLT; .section .text");