1 /*****************************************************************************
2 Copyright (c) 2011-2014, The OpenBLAS Project
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions are
9 1. Redistributions of source code must retain the above copyright
10 notice, this list of conditions and the following disclaimer.
12 2. Redistributions in binary form must reproduce the above copyright
13 notice, this list of conditions and the following disclaimer in
14 the documentation and/or other materials provided with the
16 3. Neither the name of the OpenBLAS project nor the names of
17 its contributors may be used to endorse or promote products
18 derived from this software without specific prior written
21 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
25 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
27 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
28 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
29 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
30 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 **********************************************************************************/
34 /*********************************************************************/
35 /* Copyright 2009, 2010 The University of Texas at Austin. */
36 /* All rights reserved. */
38 /* Redistribution and use in source and binary forms, with or */
39 /* without modification, are permitted provided that the following */
40 /* conditions are met: */
42 /* 1. Redistributions of source code must retain the above */
43 /* copyright notice, this list of conditions and the following */
46 /* 2. Redistributions in binary form must reproduce the above */
47 /* copyright notice, this list of conditions and the following */
48 /* disclaimer in the documentation and/or other materials */
49 /* provided with the distribution. */
51 /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
52 /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
53 /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
54 /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
55 /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
56 /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
57 /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
58 /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
59 /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
60 /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
61 /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
62 /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
63 /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
64 /* POSSIBILITY OF SUCH DAMAGE. */
66 /* The views and conclusions contained in the software and */
67 /* documentation are those of the authors and should not be */
68 /* interpreted as representing official policies, either expressed */
69 /* or implied, of The University of Texas at Austin. */
70 /*********************************************************************/
76 #if defined(USE_TLS) && defined(SMP)
83 #if defined(__GLIBC_PREREQ)
84 #if !__GLIBC_PREREQ(2,20)
90 #if defined(COMPILE_TLS)
94 #if defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT)
96 #ifndef MEM_LARGE_PAGES
97 #define MEM_LARGE_PAGES 0x20000000
108 #if !defined(OS_WINDOWS) || defined(OS_CYGWIN_NT)
109 #include <sys/mman.h>
116 #include <sys/types.h>
119 #include <sys/sysinfo.h>
122 #include <linux/unistd.h>
123 #include <sys/syscall.h>
124 #include <sys/time.h>
125 #include <sys/resource.h>
132 #if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN)
133 #include <sys/sysctl.h>
134 #include <sys/resource.h>
137 #if defined(OS_WINDOWS) && (defined(__MINGW32__) || defined(__MINGW64__))
140 #define printf _cprintf
145 #ifndef MPOL_PREFERRED
146 #define MPOL_PREFERRED 1
151 #if (defined(PPC440) || !defined(OS_LINUX) || defined(HPL)) && !defined(NO_WARMUP)
156 #define SHM_HUGETLB 04000
159 #ifndef FIXED_PAGESIZE
160 #define FIXED_PAGESIZE 4096
// Extracts a bit field: shifts a right by b bits, then ANDs the result with the mask c.
163 #define BITMASK(a, b, c) ((((a) >> (b)) & (c)))
165 #if defined(_MSC_VER) && !defined(__clang__)
166 #define CONSTRUCTOR __cdecl
167 #define DESTRUCTOR __cdecl
168 #elif (defined(OS_DARWIN) || defined(OS_SUNOS)) && defined(C_GCC)
169 #define CONSTRUCTOR __attribute__ ((constructor))
170 #define DESTRUCTOR __attribute__ ((destructor))
171 #elif __GNUC__ && INIT_PRIORITY && ((GCC_VERSION >= 40300) || (CLANG_VERSION >= 20900))
172 #define CONSTRUCTOR __attribute__ ((constructor(101)))
173 #define DESTRUCTOR __attribute__ ((destructor(101)))
175 #define CONSTRUCTOR __attribute__ ((constructor))
176 #define DESTRUCTOR __attribute__ ((destructor))
180 gotoblas_t *gotoblas = NULL;
182 extern void openblas_warning(int verbose, const char * msg);
186 #define blas_cpu_number 1
187 #define blas_num_threads 1
/* Dummy for this build configuration: always reports exactly one processor. */
int goto_get_num_procs(void) { return 1; }
/* Dummy for this build configuration: the requested thread count is ignored. */
void goto_set_num_threads(int num_threads) { (void)num_threads; }
195 #if defined(OS_LINUX) || defined(OS_SUNOS) || defined(OS_NETBSD)
197 int get_num_procs(void);
199 int get_num_procs(void) {
206 if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
207 #if !defined(OS_LINUX)
211 #if !defined(__GLIBC_PREREQ)
214 #if !__GLIBC_PREREQ(2, 3)
218 #if !__GLIBC_PREREQ(2, 7)
219 ret = sched_getaffinity(0,sizeof(cpu_set_t), cpusetp);
220 if (ret!=0) return nums;
222 #if !__GLIBC_PREREQ(2, 6)
224 if (CPU_ISSET(i,cpusetp)) n++;
227 nums = CPU_COUNT(sizeof(cpu_set_t),cpusetp);
231 cpusetp = CPU_ALLOC(nums);
232 if (cpusetp == NULL) return nums;
233 size = CPU_ALLOC_SIZE(nums);
234 ret = sched_getaffinity(0,size,cpusetp);
235 if (ret!=0) return nums;
236 ret = CPU_COUNT_S(size,cpusetp);
237 if (ret > 0 && ret < nums) nums = ret;
247 int get_num_procs(void) {
249 if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
255 int get_num_procs(void) {
257 if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
263 int get_num_procs(void) {
265 if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
274 int get_num_procs(void) {
282 GetSystemInfo(&sysinfo);
284 nums = sysinfo.dwNumberOfProcessors;
292 #if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY)
294 int get_num_procs(void) {
305 sysctl(m, 2, &nums, &len, NULL, 0);
313 #if defined(OS_DARWIN)
314 int get_num_procs(void) {
319 sysctlbyname("hw.physicalcpu", &nums, &len, NULL, 0);
324 void set_stack_limit(int limitMB){
329 StackSize=limitMB*1024*1024;
330 result=getrlimit(RLIMIT_STACK, &rl);
332 if(rl.rlim_cur < StackSize){
333 rl.rlim_cur=StackSize;
334 result=setrlimit(RLIMIT_STACK, &rl);
336 fprintf(stderr, "OpenBLAS: set stack limit error =%d\n", result);
346 The number of CPU cores that OpenBLAS uses for multithreading.
347 It can be set with openblas_set_num_threads(int num_threads);
349 int blas_cpu_number = 0;
351 The number of threads in the thread pool.
352 This value is equal to or larger than blas_cpu_number, which means some threads may be sleeping.
354 int blas_num_threads = 0;
// Returns the current value of the global blas_cpu_number.
356 int goto_get_num_procs (void) {
357 return blas_cpu_number;
360 static void blas_memory_init();
// Registers fork handlers via pthread_atfork(): blas_thread_shutdown as the
// "prepare" handler (runs before fork), no parent-side handler, and
// blas_memory_init as the child-side handler. Only compiled when the #if
// condition below holds (not native Windows, not Android, SMP_SERVER defined).
362 void openblas_fork_handler()
364 // This handler shuts down the OpenBLAS-managed PTHREAD pool when OpenBLAS is
365 // built with "make USE_OPENMP=0".
366 // Hanging can still happen when OpenBLAS is built against the libgomp
367 // implementation of OpenMP. The problem is tracked at:
368 // http://gcc.gnu.org/bugzilla/show_bug.cgi?id=60035
369 // In the meantime, build with USE_OPENMP=0 or link against another
370 // implementation of OpenMP.
371 #if !((defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT)) || defined(OS_ANDROID)) && defined(SMP_SERVER)
373 err = pthread_atfork ((void (*)(void)) BLASFUNC(blas_thread_shutdown), NULL, blas_memory_init);
// NOTE(review): the test on err that guards this warning is not visible here.
375 openblas_warning(0, "OpenBLAS Warning ... cannot install fork handler. You may meet hang after fork.\n");
379 extern int openblas_num_threads_env();
380 extern int openblas_goto_num_threads_env();
381 extern int openblas_omp_num_threads_env();
// Determines the number of threads to use, stores it in both blas_num_threads
// and blas_cpu_number, and returns it.
// Precedence of the sources consulted below: openblas_num_threads_env(), then
// openblas_goto_num_threads_env(), then openblas_omp_num_threads_env(); if none
// yields a positive value, MAX_CPU_NUMBER is used. The result is capped at
// get_num_procs() on the listed operating systems and at MAX_CPU_NUMBER.
383 int blas_get_cpu_number(void){
384 #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
387 int blas_goto_num = 0;
388 int blas_omp_num = 0;
// A non-zero blas_num_threads means the value was already determined; reuse it.
390 if (blas_num_threads) return blas_num_threads;
392 #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
393 max_num = get_num_procs();
396 // blas_goto_num = 0;
397 #ifndef USE_OPENMP_UNUSED
// Negative values returned by the environment helpers are treated as "not set" (0).
398 blas_goto_num=openblas_num_threads_env();
399 if (blas_goto_num < 0) blas_goto_num = 0;
// Fall back to the second helper only when the first gave no positive value.
401 if (blas_goto_num == 0) {
402 blas_goto_num=openblas_goto_num_threads_env();
403 if (blas_goto_num < 0) blas_goto_num = 0;
409 blas_omp_num=openblas_omp_num_threads_env();
410 if (blas_omp_num < 0) blas_omp_num = 0;
412 if (blas_goto_num > 0) blas_num_threads = blas_goto_num;
413 else if (blas_omp_num > 0) blas_num_threads = blas_omp_num;
414 else blas_num_threads = MAX_CPU_NUMBER;
416 #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
// Never use more threads than get_num_procs() reported.
417 if (blas_num_threads > max_num) blas_num_threads = max_num;
420 if (blas_num_threads > MAX_CPU_NUMBER) blas_num_threads = MAX_CPU_NUMBER;
// NOTE(review): the preprocessor guard around this diagnostic print is not visible here.
423 printf( "Adjusted number of threads : %3d\n", blas_num_threads);
426 blas_cpu_number = blas_num_threads;
428 return blas_num_threads;
// Returns the value of get_num_procs().
// NOTE(review): lines between the signature and this return are not visible;
// another branch may exist for other build configurations.
433 int openblas_get_num_procs(void) {
437 return get_num_procs();
// Returns blas_cpu_number after calling blas_get_cpu_number(), which assigns
// blas_cpu_number when the thread count has not been determined yet.
441 int openblas_get_num_threads(void) {
445 // init blas_cpu_number if needed
446 blas_get_cpu_number();
447 return blas_cpu_number;
451 int hugetlb_allocated = 0;
453 #if defined(OS_WINDOWS)
454 #define LIKELY_ONE(x) (x)
456 #define LIKELY_ONE(x) (__builtin_expect(x, 1))
459 /* Stores information about the allocation and how to release it */
461 /* Whether this allocation is being used */
463 /* Any special attributes needed when releasing this allocation */
465 /* Function that can properly release this memory */
466 void (*release_func)(struct alloc_t *);
467 /* Pad to 64-byte alignment */
468 char pad[64 - 2 * sizeof(int) - sizeof(void(*))];
471 /* Convenience macros for storing release funcs */
472 #define STORE_RELEASE_FUNC(address, func) \
473 if (address != (void *)-1) { \
474 struct alloc_t *alloc_info = (struct alloc_t *)address; \
475 alloc_info->release_func = func; \
478 #define STORE_RELEASE_FUNC_WITH_ATTR(address, func, attr) \
479 if (address != (void *)-1) { \
480 struct alloc_t *alloc_info = (struct alloc_t *)address; \
481 alloc_info->release_func = func; \
482 alloc_info->attr = attr; \
485 /* The number of bytes that will be allocated for each buffer. When allocating
486 memory, we store an alloc_t followed by the actual buffer memory. This means
487 that each allocation always has its associated alloc_t, without the need
488 for an auxiliary tracking structure. */
// NOTE(review): the sum involves sizeof (a size_t) but is stored in an int;
// confirm BUFFER_SIZE + sizeof(struct alloc_t) always fits in int.
489 static const int allocation_block_size = BUFFER_SIZE + sizeof(struct alloc_t);
492 # if defined(OS_WINDOWS)
493 static DWORD local_storage_key = 0;
497 static pthread_key_t local_storage_key = 0;
499 # endif /* defined(OS_WINDOWS) */
500 #endif /* defined(SMP) */
502 #if defined(OS_LINUX) && !defined(NO_WARMUP)
503 static int hot_alloc = 0;
506 /* Global lock for memory allocation */
508 #if defined(USE_PTHREAD_LOCK)
509 static pthread_mutex_t alloc_lock = PTHREAD_MUTEX_INITIALIZER;
510 #elif defined(USE_PTHREAD_SPINLOCK)
511 static pthread_spinlock_t alloc_lock = 0;
513 static BLASULONG alloc_lock = 0UL;
516 #if defined(USE_PTHREAD_LOCK)
517 static pthread_mutex_t key_lock = PTHREAD_MUTEX_INITIALIZER;
518 #elif defined(USE_PTHREAD_SPINLOCK)
519 static pthread_spinlock_t key_lock = 0;
521 static BLASULONG key_lock = 0UL;
524 /* Returns a pointer to the start of the per-thread memory allocation data */
// The table holds NUM_BUFFERS pointers to struct alloc_t. In SMP builds it is
// looked up through local_storage_key (TlsGetValue on Windows,
// pthread_getspecific otherwise); in non-SMP builds it is a single static
// pointer. It is allocated and zero-filled on first use.
525 static __inline struct alloc_t ** get_memory_table() {
// Read the key under key_lock into the local copy lsk.
527 LOCK_COMMAND(&key_lock);
528 lsk=local_storage_key;
529 UNLOCK_COMMAND(&key_lock);
533 # if defined(OS_WINDOWS)
534 struct alloc_t ** local_memory_table = (struct alloc_t **)TlsGetValue(local_storage_key);
536 struct alloc_t ** local_memory_table = (struct alloc_t **)pthread_getspecific(local_storage_key);
537 # endif /* defined(OS_WINDOWS) */
539 static struct alloc_t ** local_memory_table = NULL;
540 #endif /* defined(SMP) */
542 LOCK_COMMAND(&key_lock);
543 lsk=local_storage_key;
544 UNLOCK_COMMAND(&key_lock);
// SMP variant: create the table only when a key exists (lsk non-zero) and this
// thread has no table yet. Non-SMP variant: create it when the pointer is NULL.
545 if (lsk && !local_memory_table) {
547 if (!local_memory_table) {
548 #endif /* defined(SMP) */
// NOTE(review): the malloc result is passed to memset without a NULL check.
549 local_memory_table = (struct alloc_t **)malloc(sizeof(struct alloc_t *) * NUM_BUFFERS);
550 memset(local_memory_table, 0, sizeof(struct alloc_t *) * NUM_BUFFERS);
// Publish the new table in this thread's slot so later lookups find it.
552 # if defined(OS_WINDOWS)
553 LOCK_COMMAND(&key_lock);
554 TlsSetValue(local_storage_key, (void*)local_memory_table);
555 UNLOCK_COMMAND(&key_lock);
557 LOCK_COMMAND(&key_lock);
558 pthread_setspecific(local_storage_key, (void*)local_memory_table);
559 UNLOCK_COMMAND(&key_lock);
560 # endif /* defined(OS_WINDOWS) */
561 #endif /* defined(SMP) */
563 return local_memory_table;
// Release function stored by alloc_mmap: unmaps allocation_block_size bytes
// starting at alloc_info and prints a message when munmap returns non-zero.
568 static void alloc_mmap_free(struct alloc_t *alloc_info){
570 if (munmap(alloc_info, allocation_block_size)) {
571 printf("OpenBLAS : munmap failed\n");
579 static void *alloc_mmap(void *address){
583 map_address = mmap(address,
584 allocation_block_size,
585 MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0);
587 map_address = mmap(address,
588 allocation_block_size,
589 MMAP_ACCESS, MMAP_POLICY, -1, 0);
592 STORE_RELEASE_FUNC(map_address, alloc_mmap_free);
595 my_mbind(map_address, allocation_block_size, MPOL_PREFERRED, NULL, 0, 0);
603 #define BENCH_ITERATION 4
// Times a pointer-chasing walk over the region [address, address + size):
// each visited word is interpreted as the address of the next word to read.
// The walk is repeated BENCH_ITERATION times and the smallest (stop - start)
// is kept in min.
// NOTE(review): how start/stop are sampled, the initial value of min and the
// return statement are not visible here; presumably min is returned.
606 static inline BLASULONG run_bench(BLASULONG address, BLASULONG size) {
608 BLASULONG original, *p;
609 BLASULONG start, stop, min;
// Save the word in the last page and redirect it to the start of the region.
614 original = *(BLASULONG *)(address + size - PAGESIZE);
616 *(BLASULONG *)(address + size - PAGESIZE) = (BLASULONG)address;
618 for (iter = 0; iter < BENCH_ITERATION; iter ++ ) {
620 p = (BLASULONG *)address;
// One hop per page in the region.
622 count = size / PAGESIZE;
626 for (i = 0; i < count; i ++) {
627 p = (BLASULONG *)(*p);
632 if (min > stop - start) min = stop - start;
// Restore the saved word; the final value of p is also written 8 bytes after it.
635 *(BLASULONG *)(address + size - PAGESIZE + 0) = original;
636 *(BLASULONG *)(address + size - PAGESIZE + 8) = (BLASULONG)p;
641 static void *alloc_mmap(void *address){
642 void *map_address, *best_address;
643 BLASULONG best, start, current, original;
647 /* Just give up use advanced operation */
648 map_address = mmap(address, allocation_block_size, MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0);
651 my_mbind(map_address, allocation_block_size, MPOL_PREFERRED, NULL, 0, 0);
655 #if defined(OS_LINUX) && !defined(NO_WARMUP)
656 if (hot_alloc == 0) {
657 map_address = mmap(NULL, allocation_block_size, MMAP_ACCESS, MMAP_POLICY, -1, 0);
660 my_mbind(map_address, allocation_block_size, MPOL_PREFERRED, NULL, 0, 0);
666 map_address = mmap(NULL, allocation_block_size * SCALING,
667 MMAP_ACCESS, MMAP_POLICY, -1, 0);
669 if (map_address != (void *)-1) {
674 ret=my_mbind(map_address, allocation_block_size * SCALING, MPOL_PREFERRED, NULL, 0, 0);
677 perror("OpenBLAS alloc_mmap:");
678 printf("error code=%d,\tmap_address=%lx\n",errsv,map_address);
682 my_mbind(map_address, allocation_block_size * SCALING, MPOL_PREFERRED, NULL, 0, 0);
687 allocsize = DGEMM_P * DGEMM_Q * sizeof(double);
689 start = (BLASULONG)map_address;
690 current = (SCALING - 1) * allocation_block_size;
693 while(current > 0 && current <= original) {
694 *(BLASLONG *)start = (BLASLONG)start + PAGESIZE;
699 *(BLASLONG *)(start - PAGESIZE) = (BLASULONG)map_address;
701 start = (BLASULONG)map_address;
703 best = (BLASULONG)-1;
704 best_address = map_address;
706 while ((start + allocsize < (BLASULONG)map_address + (SCALING - 1) * allocation_block_size)) {
708 current = run_bench(start, allocsize);
710 if (best > current) {
712 best_address = (void *)start;
719 if ((BLASULONG)best_address > (BLASULONG)map_address)
720 munmap(map_address, (BLASULONG)best_address - (BLASULONG)map_address);
722 munmap((void *)((BLASULONG)best_address + allocation_block_size), (SCALING - 1) * allocation_block_size + (BLASULONG)map_address - (BLASULONG)best_address);
724 map_address = best_address;
726 #if defined(OS_LINUX) && !defined(NO_WARMUP)
731 #if defined(OS_LINUX) && !defined(NO_WARMUP)
735 STORE_RELEASE_FUNC(map_address, alloc_mmap_free);
747 static void alloc_malloc_free(struct alloc_t *alloc_info){
753 static void *alloc_malloc(void *address){
757 map_address = (void *)malloc(allocation_block_size + FIXED_PAGESIZE);
759 if (map_address == (void *)NULL) map_address = (void *)-1;
761 STORE_RELEASE_FUNC(map_address, alloc_malloc_free);
771 void *qalloc(int flags, size_t bytes);
772 void *qfree (void *address);
774 #define QNONCACHE 0x1
778 static void alloc_qalloc_free(struct alloc_t *alloc_info){
784 static void *alloc_qalloc(void *address){
787 map_address = (void *)qalloc(QCOMMS | QFAST, allocation_block_size + FIXED_PAGESIZE);
789 if (map_address == (void *)NULL) map_address = (void *)-1;
791 STORE_RELEASE_FUNC(map_address, alloc_qalloc_free);
793 return (void *)(((BLASULONG)map_address + FIXED_PAGESIZE - 1) & ~(FIXED_PAGESIZE - 1));
800 static void alloc_windows_free(struct alloc_t *alloc_info){
802 VirtualFree(alloc_info, allocation_block_size, MEM_DECOMMIT);
806 static void *alloc_windows(void *address){
809 map_address = VirtualAlloc(address,
810 allocation_block_size,
811 MEM_RESERVE | MEM_COMMIT,
814 if (map_address == (void *)NULL) map_address = (void *)-1;
816 STORE_RELEASE_FUNC(map_address, alloc_windows_free);
823 #ifdef ALLOC_DEVICEDRIVER
824 #ifndef DEVICEDRIVER_NAME
825 #define DEVICEDRIVER_NAME "/dev/mapper"
828 static void alloc_devicedirver_free(struct alloc_t *alloc_info){
830 int attr = alloc_info -> attr;
831 if (munmap(address, allocation_block_size)) {
832 printf("OpenBLAS : Bugphysarea unmap failed.\n");
836 printf("OpenBLAS : Bugphysarea close failed.\n");
841 static void *alloc_devicedirver(void *address){
846 if ((fd = open(DEVICEDRIVER_NAME, O_RDWR | O_SYNC)) < 0) {
852 map_address = mmap(address, allocation_block_size,
853 PROT_READ | PROT_WRITE,
854 MAP_FILE | MAP_SHARED,
857 STORE_RELEASE_FUNC_WITH_ATTR(map_address, alloc_devicedirver_free, fd);
866 static void alloc_shm_free(struct alloc_t *alloc_info){
868 if (shmdt(alloc_info)) {
869 printf("OpenBLAS : Shared memory unmap failed.\n");
873 static void *alloc_shm(void *address){
877 shmid = shmget(IPC_PRIVATE, allocation_block_size,IPC_CREAT | 0600);
879 map_address = (void *)shmat(shmid, address, 0);
881 if (map_address != (void *)-1){
884 my_mbind(map_address, allocation_block_size, MPOL_PREFERRED, NULL, 0, 0);
887 shmctl(shmid, IPC_RMID, 0);
889 struct alloc_t *alloc_info = (struct alloc_t *)map_address;
890 alloc_info->release_func = alloc_shm_free;
891 alloc_info->attr = shmid;
897 #if defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS
899 static void alloc_hugetlb_free(struct alloc_t *alloc_info){
901 #if defined(OS_LINUX) || defined(OS_AIX)
902 if (shmdt(alloc_info)) {
903 printf("OpenBLAS : Hugepage unmap failed.\n");
909 munmap(alloc_info, allocation_block_size);
915 VirtualFree(alloc_info, allocation_block_size, MEM_LARGE_PAGES | MEM_DECOMMIT);
921 static void *alloc_hugetlb(void *address){
923 void *map_address = (void *)-1;
925 #if defined(OS_LINUX) || defined(OS_AIX)
928 shmid = shmget(IPC_PRIVATE, allocation_block_size,
933 SHM_LGPAGE | SHM_PIN |
935 IPC_CREAT | SHM_R | SHM_W);
938 map_address = (void *)shmat(shmid, address, SHM_RND);
941 my_mbind(map_address, allocation_block_size, MPOL_PREFERRED, NULL, 0, 0);
944 if (map_address != (void *)-1){
945 shmctl(shmid, IPC_RMID, 0);
951 struct memcntl_mha mha;
953 mha.mha_cmd = MHA_MAPSIZE_BSSBRK;
955 mha.mha_pagesize = HUGE_PAGESIZE;
956 memcntl(NULL, 0, MC_HAT_ADVISE, (char *)&mha, 0, 0);
958 map_address = (BLASULONG)memalign(HUGE_PAGESIZE, allocation_block_size);
966 if (OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES, &hToken) != TRUE) return (void *) -1;
968 tp.PrivilegeCount = 1;
969 tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED;
971 if (LookupPrivilegeValue(NULL, SE_LOCK_MEMORY_NAME, &tp.Privileges[0].Luid) != TRUE) {
976 if (AdjustTokenPrivileges(hToken, FALSE, &tp, 0, NULL, NULL) != TRUE) {
981 map_address = (void *)VirtualAlloc(address,
982 allocation_block_size,
983 MEM_LARGE_PAGES | MEM_RESERVE | MEM_COMMIT,
986 tp.Privileges[0].Attributes = 0;
987 AdjustTokenPrivileges(hToken, FALSE, &tp, 0, NULL, NULL);
989 if (map_address == (void *)NULL) map_address = (void *)-1;
993 STORE_RELEASE_FUNC(map_address, alloc_hugetlb_free);
1001 #ifdef ALLOC_HUGETLBFILE
1003 static int hugetlb_pid = 0;
1005 static void alloc_hugetlbfile_free(struct alloc_t *alloc_info){
1007 int attr = alloc_info -> attr;
1008 if (munmap(alloc_info, allocation_block_size)) {
1009 printf("OpenBLAS : HugeTLBfs unmap failed.\n");
1013 printf("OpenBLAS : HugeTLBfs close failed.\n");
1017 static void *alloc_hugetlbfile(void *address){
1019 void *map_address = (void *)-1;
1023 if (!hugetlb_pid) hugetlb_pid = getpid();
1025 sprintf(filename, "%s/gotoblas.%d", HUGETLB_FILE_NAME, hugetlb_pid);
1027 if ((fd = open(filename, O_RDWR | O_CREAT, 0700)) < 0) {
1033 map_address = mmap(address, allocation_block_size,
1034 PROT_READ | PROT_WRITE,
1038 STORE_RELEASE_FUNC_WITH_ATTR(map_address, alloc_hugetlbfile_free, fd);
1046 static BLASULONG base_address = 0UL;
1048 static BLASULONG base_address = BASE_ADDRESS;
1051 #if __STDC_VERSION__ >= 201112L
1052 static _Atomic int memory_initialized = 0;
1054 static volatile int memory_initialized = 0;
1057 /* Memory allocation routine */
1058 /* procpos ... indicates where it comes from */
1059 /* 0 : Level 3 functions */
1060 /* 1 : Level 2 functions */
// Walks a memory table of NUM_BUFFERS entries (ptr is cast to
// struct alloc_t **), calls each entry's release_func on it and clears the
// slot. Passed to pthread_key_create() as the key's destructor in
// blas_memory_init.
// NOTE(review): any NULL guard on alloc_info before the call is not visible here.
1063 static void blas_memory_cleanup(void* ptr){
1065 struct alloc_t ** table = (struct alloc_t **)ptr;
1067 for (pos = 0; pos < NUM_BUFFERS; pos ++){
1068 struct alloc_t *alloc_info = table[pos];
1070 alloc_info->release_func(alloc_info);
1071 table[pos] = (void *)0;
// Creates the thread-local storage key used by get_memory_table():
// TlsAlloc() on Windows, otherwise pthread_key_create() with
// blas_memory_cleanup as the destructor. Also registered as the child-side
// fork handler in openblas_fork_handler.
1078 static void blas_memory_init(){
1080 # if defined(OS_WINDOWS)
1081 local_storage_key = TlsAlloc();
1083 pthread_key_create(&local_storage_key, blas_memory_cleanup);
1084 # endif /* defined(OS_WINDOWS) */
1085 #endif /* defined(SMP) */
// Hands out one work buffer. On first use it performs the one-time library
// initialization, then picks a free slot in the memory table returned by
// get_memory_table(); if the slot has no backing memory yet, it tries the
// allocator functions in memoryalloc[] until one succeeds. The returned
// pointer is the address just past the struct alloc_t header of the slot.
// NOTE(review): procpos is not referenced in any of the lines visible here.
1088 void *blas_memory_alloc(int procpos){
// Table of allocator backends; which entries exist depends on the ALLOC_* macros.
1094 void *(*memoryalloc[])(void *address) = {
1095 #ifdef ALLOC_DEVICEDRIVER
1098 /* Hugetlb implicitly assumes ALLOC_SHM */
1102 #if ((defined ALLOC_SHM) && (defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS))
1111 #ifdef ALLOC_WINDOWS
1119 void *(**func)(void *address);
1120 struct alloc_t * alloc_info;
1121 struct alloc_t ** alloc_table;
// Without OpenMP, memory_initialized is read under alloc_lock into mi.
1124 #if defined(SMP) && !defined(USE_OPENMP)
1126 LOCK_COMMAND(&alloc_lock);
1127 mi=memory_initialized;
1128 UNLOCK_COMMAND(&alloc_lock);
1129 if (!LIKELY_ONE(mi)) {
1131 if (!LIKELY_ONE(memory_initialized)) {
1133 #if defined(SMP) && !defined(USE_OPENMP)
1134 /* Only allow a single thread to initialize memory system */
1135 LOCK_COMMAND(&alloc_lock);
// Re-check the flag after taking the lock: another thread may have finished first.
1137 if (!memory_initialized) {
1141 gotoblas_dynamic_init();
1144 #if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY)
1145 gotoblas_affinity_init();
1149 if (!blas_num_threads) blas_cpu_number = blas_get_cpu_number();
1152 #if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64)
1153 #ifndef DYNAMIC_ARCH
1154 blas_set_parameter();
1158 memory_initialized = 1;
1160 #if defined(SMP) && !defined(USE_OPENMP)
1162 UNLOCK_COMMAND(&alloc_lock);
1167 printf("Alloc Start ...\n");
// Scan the table for a slot that is either empty or not marked as used.
1171 alloc_table = get_memory_table();
1173 if (!alloc_table[position] || !alloc_table[position]->used) goto allocation;
1176 } while (position < NUM_BUFFERS);
1183 printf(" Position -> %d\n", position);
1186 alloc_info = alloc_table[position];
1190 printf("Allocation Start : %lx\n", base_address);
// (void *)-1 is the "allocation failed" sentinel used by the allocator backends.
1193 map_address = (void *)-1;
1195 func = &memoryalloc[0];
// NOTE(review): func holds the address of an array element, so func != NULL
// is always true; confirm whether *func != NULL was intended.
1197 while ((func != NULL) && (map_address == (void *) -1)) {
1199 map_address = (*func)((void *)base_address);
1201 #ifdef ALLOC_DEVICEDRIVER
1202 if ((*func == alloc_devicedirver) && (map_address == (void *)-1)) {
1203 fprintf(stderr, "OpenBLAS Warning ... Physically contiguous allocation failed.\n");
1207 #ifdef ALLOC_HUGETLBFILE
1208 if ((*func == alloc_hugetlbfile) && (map_address == (void *)-1)) {
1210 fprintf(stderr, "OpenBLAS Warning ... HugeTLB(File) allocation failed.\n");
1215 #if (defined ALLOC_SHM) && (defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS)
// Record that at least one buffer was obtained through alloc_hugetlb.
1216 if ((*func == alloc_hugetlb) && (map_address != (void *)-1)) hugetlb_allocated = 1;
1223 printf(" Success -> %08lx\n", map_address);
// If every attempt failed, drop the address hint (base_address = 0) before
// retrying; otherwise advance the hint past the block just mapped.
1225 if (((BLASLONG) map_address) == -1) base_address = 0UL;
1227 if (base_address) base_address += allocation_block_size + FIXED_PAGESIZE;
1229 } while ((BLASLONG)map_address == -1);
1231 alloc_table[position] = alloc_info = map_address;
1234 printf(" Mapping Succeeded. %p(%d)\n", (void *)alloc_info, position);
1239 printf("Mapped : %p %3d\n\n", (void *)alloc_info, position);
// Mark the slot busy and return the memory that follows the alloc_t header.
1242 alloc_info->used = 1;
1244 return (void *)(((char *)alloc_info) + sizeof(struct alloc_t));
// Reached when no free slot was found among the NUM_BUFFERS entries.
1247 printf("OpenBLAS : Program will terminate because you tried to allocate too many memory regions.\n");
// Returns a buffer obtained from blas_memory_alloc to the pool: the alloc_t
// header that precedes the buffer is marked as unused. No release function is
// called in the lines visible here.
1252 void blas_memory_free(void *buffer){
1255 struct alloc_t ** alloc_table;
1257 /* Since we passed an offset pointer to the caller, get back to the actual allocation */
1258 struct alloc_t *alloc_info = (void *)(((char *)buffer) - sizeof(struct alloc_t));
1261 printf("Unmapped Start : %p ...\n", alloc_info);
1264 alloc_info->used = 0;
1267 printf("Unmap Succeeded.\n\n");
// NOTE(review): the statements below dump every populated table slot; the
// control flow or guard that reaches them is not visible here.
1273 alloc_table = get_memory_table();
1274 for (position = 0; position < NUM_BUFFERS; position++){
1275 if (alloc_table[position]) {
1276 printf("%4ld %p : %d\n", position, alloc_table[position], alloc_table[position]->used);
1283 void *blas_memory_alloc_nolock(int unused) {
1285 map_address = (void *)malloc(BUFFER_SIZE + FIXED_PAGESIZE);
1289 void blas_memory_free_nolock(void * map_address) {
// Shuts the library down: stops the thread pool via blas_thread_shutdown,
// releases the calling thread's memory table through blas_memory_cleanup and
// resets base_address to BASE_ADDRESS.
// NOTE(review): the preprocessor guards around these statements are not visible here.
1293 void blas_shutdown(void){
1295 BLASFUNC(blas_thread_shutdown)();
1299 /* Only clean up if we were built for threading and TLS was initialized */
1300 if (local_storage_key)
1302 blas_memory_cleanup((void*)get_memory_table());
1307 base_address = BASE_ADDRESS;
1313 #if defined(OS_LINUX) && !defined(NO_WARMUP)
1316 #if defined(USE_PTHREAD_LOCK)
1317 static pthread_mutex_t init_lock = PTHREAD_MUTEX_INITIALIZER;
1318 #elif defined(USE_PTHREAD_SPINLOCK)
1319 static pthread_spinlock_t init_lock = 0;
1321 static BLASULONG init_lock = 0UL;
1325 static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n,
1326 void *sa, void *sb, BLASLONG pos) {
1328 #if !defined(ARCH_POWER) && !defined(ARCH_SPARC)
1333 size = allocation_block_size - PAGESIZE;
1334 buffer = (BLASULONG)sa + GEMM_OFFSET_A;
1336 #if defined(OS_LINUX) && !defined(NO_WARMUP)
1337 if (hot_alloc != 2) {
1341 LOCK_COMMAND(&init_lock);
1345 *(int *)buffer = size;
1351 UNLOCK_COMMAND(&init_lock);
1354 size = MIN((allocation_block_size - PAGESIZE), L2_SIZE);
1355 buffer = (BLASULONG)sa + GEMM_OFFSET_A;
1358 *(int *)buffer = size;
1363 #if defined(OS_LINUX) && !defined(NO_WARMUP)
// Builds a linked chain of blas_num_threads queue entries that each run
// _touch_memory, and submits the chain with exec_blas(). Only the first entry
// receives buffer as its sa argument.
1372 static void _init_thread_memory(void *buffer) {
1374 blas_queue_t queue[MAX_CPU_NUMBER];
1377 for (num_cpu = 0; num_cpu < blas_num_threads; num_cpu++) {
1379 blas_queue_init(&queue[num_cpu]);
1380 queue[num_cpu].mode = BLAS_DOUBLE | BLAS_REAL;
1381 queue[num_cpu].routine = &_touch_memory;
1382 queue[num_cpu].args = NULL;
// Link each entry to its successor; the last link is reset to NULL below.
1383 queue[num_cpu].next = &queue[num_cpu + 1];
1386 queue[num_cpu - 1].next = NULL;
1387 queue[0].sa = buffer;
1389 exec_blas(num_cpu, queue);
// Warm-up step: allocates one buffer with blas_memory_alloc(0), runs the
// memory-touching routine on it at offset GEMM_OFFSET_A, then hands the buffer
// back with blas_memory_free().
// NOTE(review): the guards selecting between the _init_thread_memory and the
// direct _touch_memory call are not visible here (presumably SMP vs. non-SMP).
1394 static void gotoblas_memory_init(void) {
1400 buffer = (void *)blas_memory_alloc(0);
// Make sure the thread count is known and the thread server is running.
1403 if (blas_cpu_number == 0) blas_get_cpu_number();
1405 if (blas_server_avail == 0) blas_thread_init();
1408 _init_thread_memory((void *)((BLASULONG)buffer + GEMM_OFFSET_A));
1412 _touch_memory(NULL, NULL, NULL, (void *)((BLASULONG)buffer + GEMM_OFFSET_A), NULL, 0);
1416 blas_memory_free(buffer);
1420 /* Initialization for all functions; this function should be called before main */
1422 static int gotoblas_initialized = 0;
1423 extern void openblas_read_env();
// Library initializer, declared with the CONSTRUCTOR macro. Does nothing if
// gotoblas_initialized is already set. Otherwise it installs the fork handler,
// reads the environment settings and runs the optional subsystem
// initializers selected by the preprocessor conditions below, then sets
// gotoblas_initialized to 1.
1425 void CONSTRUCTOR gotoblas_init(void) {
1427 if (gotoblas_initialized) return;
1430 openblas_fork_handler();
1433 openblas_read_env();
1440 gotoblas_dynamic_init();
1443 #if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY)
1444 gotoblas_affinity_init();
1447 #if defined(OS_LINUX) && !defined(NO_WARMUP)
1448 gotoblas_memory_init();
1451 //#if defined(OS_LINUX)
// Raises the soft stack limit to the hard limit when they differ.
// NOTE(review): the guard that enables or disables this section is not visible here.
1453 struct rlimit curlimit;
1454 if ( getrlimit(RLIMIT_STACK, &curlimit ) == 0 )
1456 if ( curlimit.rlim_cur != curlimit.rlim_max )
1458 curlimit.rlim_cur = curlimit.rlim_max;
1459 setrlimit(RLIMIT_STACK, &curlimit);
// Determine the thread count and start the thread server if not done yet.
1465 if (blas_cpu_number == 0) blas_get_cpu_number();
1467 if (blas_server_avail == 0) blas_thread_init();
1471 #ifdef FUNCTION_PROFILE
1472 gotoblas_profile_init();
1475 gotoblas_initialized = 1;
// Library finalizer, declared with the DESTRUCTOR macro. Does nothing if
// gotoblas_init never completed (gotoblas_initialized == 0). Otherwise it runs
// the optional subsystem teardown calls selected by the preprocessor
// conditions below and clears gotoblas_initialized.
// NOTE(review): statements between the early return and the first #ifdef are not visible here.
1483 void DESTRUCTOR gotoblas_quit(void) {
1485 if (gotoblas_initialized == 0) return;
1493 #ifdef FUNCTION_PROFILE
1494 gotoblas_profile_quit();
1497 #if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY)
1498 gotoblas_affinity_quit();
1502 gotoblas_dynamic_quit();
1505 gotoblas_initialized = 0;
1512 #if defined(_MSC_VER) && !defined(__clang__)
1513 BOOL APIENTRY DllMain(HMODULE hModule, DWORD ul_reason_for_call, LPVOID lpReserved)
1515 switch (ul_reason_for_call)
1517 case DLL_PROCESS_ATTACH:
1520 case DLL_THREAD_ATTACH:
1522 case DLL_THREAD_DETACH:
1524 blas_memory_cleanup((void*)get_memory_table());
1527 case DLL_PROCESS_DETACH:
1537 This is to allow static linking.
1538 Code adapted from Google performance tools:
1539 https://gperftools.googlecode.com/git-history/perftools-1.0/src/windows/port.cc
1541 https://sourceware.org/ml/pthreads-win32/2008/msg00028.html
1542 http://ci.boost.org/svn-trac/browser/trunk/libs/thread/src/win32/tss_pe.cpp
1544 static int on_process_term(void)
1550 #pragma comment(linker, "/INCLUDE:_tls_used")
1552 #pragma comment(linker, "/INCLUDE:__tls_used")
1556 #pragma const_seg(".CRT$XLB")
1558 #pragma data_seg(".CRT$XLB")
1560 static void (APIENTRY *dll_callback)(HINSTANCE h, DWORD ul_reason_for_call, PVOID pv) = DllMain;
1568 #pragma const_seg(".CRT$XTU")
1570 #pragma data_seg(".CRT$XTU")
1572 static int(*p_process_term)(void) = on_process_term;
1580 #if (defined(C_PGI) || (!defined(C_SUN) && defined(F_INTERFACE_SUN))) && (defined(ARCH_X86) || defined(ARCH_X86_64))
1581 /* Don't call me; this is just work around for PGI / Sun bug */
1582 void gotoblas_dummy_for_PGI(void) {
1588 asm ("\t.section\t.ctors,\"aw\",@progbits; .align 8; .quad gotoblas_init; .section .text");
1589 asm ("\t.section\t.dtors,\"aw\",@progbits; .align 8; .quad gotoblas_quit; .section .text");
1591 asm (".section .init,\"ax\"; call gotoblas_init@PLT; .section .text");
1592 asm (".section .fini,\"ax\"; call gotoblas_quit@PLT; .section .text");
1601 #define ALLOC_WINDOWS
1602 #ifndef MEM_LARGE_PAGES
1603 #define MEM_LARGE_PAGES 0x20000000
1607 #define ALLOC_MALLOC
1615 #include <sys/mman.h>
1617 #include <sys/shm.h>
1619 #include <sys/ipc.h>
1622 #include <sys/types.h>
1625 #include <sys/sysinfo.h>
1628 #include <linux/unistd.h>
1629 #include <sys/syscall.h>
1630 #include <sys/time.h>
1631 #include <sys/resource.h>
1634 #if defined(OS_FREEBSD) || defined(OS_DARWIN)
1635 #include <sys/sysctl.h>
1636 #include <sys/resource.h>
1639 #if defined(OS_WINDOWS) && (defined(__MINGW32__) || defined(__MINGW64__))
1642 #define printf _cprintf
1647 #ifndef MPOL_PREFERRED
1648 #define MPOL_PREFERRED 1
1653 #if (defined(PPC440) || !defined(OS_LINUX) || defined(HPL)) && !defined(NO_WARMUP)
1658 #define SHM_HUGETLB 04000
1661 #ifndef FIXED_PAGESIZE
1662 #define FIXED_PAGESIZE 4096
/* BITMASK(a, b, c): shift a right by b bits and mask the result with c. */
1665 #define BITMASK(a, b, c) ((((a) >> (b)) & (c)))
/* CONSTRUCTOR / DESTRUCTOR decorate gotoblas_init / gotoblas_quit. */
1667 #if defined(_MSC_VER) && !defined(__clang__)
/* MSVC (not clang): only a calling convention is applied. */
1668 #define CONSTRUCTOR __cdecl
1669 #define DESTRUCTOR __cdecl
1670 #elif (defined(OS_DARWIN) || defined(OS_SUNOS)) && defined(C_GCC)
/* Darwin / SunOS with GCC: constructor/destructor attributes without a priority. */
1671 #define CONSTRUCTOR __attribute__ ((constructor))
1672 #define DESTRUCTOR __attribute__ ((destructor))
/* Remaining case: constructor/destructor attributes with priority 101. */
1674 #define CONSTRUCTOR __attribute__ ((constructor(101)))
1675 #define DESTRUCTOR __attribute__ ((destructor(101)))
/* Global pointer, NULL until set; blas_memory_alloc calls
   gotoblas_dynamic_init() when it finds it still NULL. */
1679 gotoblas_t *gotoblas = NULL;
/* Warning reporter defined elsewhere; used by openblas_fork_handler. */
1681 extern void openblas_warning(int verbose, const char * msg);
/* In this configuration both counts are compile-time constants equal to 1. */
1685 #define blas_cpu_number 1
1686 #define blas_num_threads 1
/* Dummy functions for the configuration in which blas_cpu_number and
   blas_num_threads are defined as the constant 1. */

/* Returns the processor count, which is always 1 in this configuration. */
int goto_get_num_procs(void) {
  return 1;
}

/* Accepts a requested thread count and ignores it. */
void goto_set_num_threads(int num_threads) {
  (void)num_threads;
}
1694 #if defined(OS_LINUX) || defined(OS_SUNOS) || defined(OS_NETBSD)
1696 int get_num_procs(void);
/* Returns the number of processors. The configured count from sysconf() is
   cached in a function-local static; depending on the glibc version checks
   below, the count is then replaced by the size of the calling process's
   affinity set. */
1698 int get_num_procs(void) {
1699 static int nums = 0;
1705 if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
1706 #if !defined(OS_LINUX)
1710 #if !defined(__GLIBC_PREREQ)
1713 #if !__GLIBC_PREREQ(2, 3)
1717 #if !__GLIBC_PREREQ(2, 7)
/* Fixed-size cpu_set_t path: query the affinity mask of the calling process (pid 0). */
1718 ret = sched_getaffinity(0,sizeof(cpu_set_t), cpusetp);
1719 if (ret!=0) return nums;
1721 #if !__GLIBC_PREREQ(2, 6)
/* Count the set bits by hand. */
1722 for (i=0;i<nums;i++)
1723 if (CPU_ISSET(i,cpusetp)) n++;
/* NOTE(review): glibc documents CPU_COUNT as taking only the set; the
   (size, set) argument pair used here matches CPU_COUNT_S - confirm. */
1726 nums = CPU_COUNT(sizeof(cpu_set_t),cpusetp);
/* Dynamically sized set path: allocate a set large enough for nums CPUs. */
1730 cpusetp = CPU_ALLOC(nums);
1731 if (cpusetp == NULL) return nums;
1732 size = CPU_ALLOC_SIZE(nums);
1733 ret = sched_getaffinity(0,size,cpusetp);
/* NOTE(review): this early return leaves the set obtained from CPU_ALLOC
   unreleased unless a CPU_FREE happens elsewhere - confirm. */
1734 if (ret!=0) return nums;
1735 nums = CPU_COUNT_S(size,cpusetp);
/* Processor count for this platform variant: the value of
   sysconf(_SC_NPROCESSORS_CONF) is fetched once and cached in a static. */
1745 int get_num_procs(void) {
1746 static int nums = 0;
1747 if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
/* Processor count for this platform variant: sysconf(_SC_NPROCESSORS_CONF)
   is queried on the first call only; later calls reuse the cached static. */
1753 int get_num_procs(void) {
1754 static int nums = 0;
1755 if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
/* Processor count for this platform variant, taken from
   sysconf(_SC_NPROCESSORS_CONF) and remembered in a function-local static. */
1761 int get_num_procs(void) {
1762 static int nums = 0;
1763 if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
/* Windows variant: reads the processor count from GetSystemInfo(). */
1770 int get_num_procs(void) {
1772 static int nums = 0;
1776 SYSTEM_INFO sysinfo;
1778 GetSystemInfo(&sysinfo);
/* dwNumberOfProcessors is stored in the static so it can be reused. */
1780 nums = sysinfo.dwNumberOfProcessors;
1788 #if defined(OS_FREEBSD)
/* FreeBSD variant: reads the processor count through sysctl(). */
1790 int get_num_procs(void) {
1792 static int nums = 0;
/* m is a two-element MIB name; the result is written into nums.
   The return value of sysctl() is not checked. */
1801 sysctl(m, 2, &nums, &len, NULL, 0);
1809 #if defined(OS_DARWIN)
/* Darwin variant: reads the "hw.physicalcpu" sysctl value into nums. */
1810 int get_num_procs(void) {
1811 static int nums = 0;
/* The return value of sysctlbyname() is not checked. */
1815 sysctlbyname("hw.physicalcpu", &nums, &len, NULL, 0);
/* Raises the soft RLIMIT_STACK limit to limitMB megabytes when the current
   soft limit is lower; reports a failure on stderr. */
1820 void set_stack_limit(int limitMB){
/* NOTE(review): the multiplication is carried out in int before the
   assignment, so large limitMB values may overflow - confirm. */
1825 StackSize=limitMB*1024*1024;
1826 result=getrlimit(RLIMIT_STACK, &rl);
/* Only ever raise the limit, never lower it. */
1828 if(rl.rlim_cur < StackSize){
1829 rl.rlim_cur=StackSize;
1830 result=setrlimit(RLIMIT_STACK, &rl);
1832 fprintf(stderr, "OpenBLAS: set stack limit error =%d\n", result);
1842 OpenBLAS uses the number of CPU cores for multithreading.
1843 It can be set by openblas_set_num_threads(int num_threads);
1845 int blas_cpu_number = 0;
1847 The number of threads in the thread pool.
1848 This value is equal to or larger than blas_cpu_number, which means some threads may be sleeping.
1850 int blas_num_threads = 0;
/* Returns the current value of the global blas_cpu_number. */
1852 int goto_get_num_procs (void) {
1853 return blas_cpu_number;
/* Installs a fork handler that shuts the thread pool down before fork(). */
1856 void openblas_fork_handler()
1858 // This handler shuts down the OpenBLAS-managed PTHREAD pool when OpenBLAS is
1859 // built with "make USE_OPENMP=0".
1860 // Hanging can still happen when OpenBLAS is built against the libgomp
1861 // implementation of OpenMP. The problem is tracked at:
1862 // http://gcc.gnu.org/bugzilla/show_bug.cgi?id=60035
1863 // In the meantime, build with USE_OPENMP=0 or link against another
1864 // implementation of OpenMP.
1865 #if !(defined(OS_WINDOWS) || defined(OS_ANDROID)) && defined(SMP_SERVER)
/* blas_thread_shutdown is registered as the first (prepare) handler;
   no parent or child handlers are installed. */
1867 err = pthread_atfork ((void (*)(void)) BLASFUNC(blas_thread_shutdown), NULL, NULL);
1869 openblas_warning(0, "OpenBLAS Warning ... cannot install fork handler. You may meet hang after fork.\n");
/* Accessors for the thread-count settings, defined elsewhere. */
1873 extern int openblas_num_threads_env();
1874 extern int openblas_goto_num_threads_env();
1875 extern int openblas_omp_num_threads_env();
/* Determines the number of threads to use, stores it in blas_num_threads and
   blas_cpu_number, and returns it. Once blas_num_threads is non-zero the
   stored value is returned immediately. */
1877 int blas_get_cpu_number(void){
1878 #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID)
1881 int blas_goto_num = 0;
1882 int blas_omp_num = 0;
1884 if (blas_num_threads) return blas_num_threads;
1886 #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID)
1887 max_num = get_num_procs();
/* First choice: openblas_num_threads_env(); negative values count as unset. */
1892 blas_goto_num=openblas_num_threads_env();
1893 if (blas_goto_num < 0) blas_goto_num = 0;
/* Second choice, consulted only when the first one is unset. */
1895 if (blas_goto_num == 0) {
1896 blas_goto_num=openblas_goto_num_threads_env();
1897 if (blas_goto_num < 0) blas_goto_num = 0;
1903 blas_omp_num=openblas_omp_num_threads_env();
1904 if (blas_omp_num < 0) blas_omp_num = 0;
/* Precedence: OpenBLAS/Goto setting, then the OMP setting, then MAX_CPU_NUMBER. */
1906 if (blas_goto_num > 0) blas_num_threads = blas_goto_num;
1907 else if (blas_omp_num > 0) blas_num_threads = blas_omp_num;
1908 else blas_num_threads = MAX_CPU_NUMBER;
1910 #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID)
/* Clamp to the detected processor count on the listed platforms ... */
1911 if (blas_num_threads > max_num) blas_num_threads = max_num;
/* ... and always to the compile-time maximum. */
1914 if (blas_num_threads > MAX_CPU_NUMBER) blas_num_threads = MAX_CPU_NUMBER;
1917 printf( "Adjusted number of threads : %3d\n", blas_num_threads);
1920 blas_cpu_number = blas_num_threads;
1922 return blas_num_threads;
/* Public wrapper: returns the value of get_num_procs(). */
1927 int openblas_get_num_procs(void) {
1931 return get_num_procs();
/* Public accessor: returns blas_cpu_number after making sure it has been set. */
1935 int openblas_get_num_threads(void) {
1939 // init blas_cpu_number if needed
1940 blas_get_cpu_number();
1941 return blas_cpu_number;
/* Callback that releases the resource described by the entry; blas_shutdown
   invokes it for every recorded entry. */
1947 void (*func)(struct release_t *);
/* Set to 1 by blas_memory_alloc when alloc_hugetlb succeeds. */
1951 int hugetlb_allocated = 0;
/* Table of recorded allocations and the index used when recording the next one. */
1953 static struct release_t release_info[NUM_BUFFERS];
1954 static int release_pos = 0;
1956 #if defined(OS_LINUX) && !defined(NO_WARMUP)
/* State flag tested by alloc_mmap (== 0) and _touch_memory (!= 2). */
1957 static int hot_alloc = 0;
1960 /* Global lock for memory allocation */
/* The lock type depends on the configured locking primitive. */
1962 #if defined(USE_PTHREAD_LOCK)
1963 static pthread_mutex_t alloc_lock = PTHREAD_MUTEX_INITIALIZER;
1964 #elif defined(USE_PTHREAD_SPINLOCK)
1965 static pthread_spinlock_t alloc_lock = 0;
1967 static BLASULONG alloc_lock = 0UL;
/* Release callback for buffers obtained by alloc_mmap: unmaps BUFFER_SIZE
   bytes at release->address and prints a message if munmap fails. */
1972 static void alloc_mmap_free(struct release_t *release){
1974 if (munmap(release -> address, BUFFER_SIZE)) {
1975 printf("OpenBLAS : munmap failed\n");
/* Maps a buffer with mmap, using address as the placement hint, and records
   the mapping together with its release callback. */
1983 static void *alloc_mmap(void *address){
/* Two alternative calls: the first forces the address with MAP_FIXED,
   the second treats it as a hint only. */
1987 map_address = mmap(address,
1989 MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0);
1991 map_address = mmap(address,
1993 MMAP_ACCESS, MMAP_POLICY, -1, 0);
/* (void *)-1 is the failure value; on success record the mapping under alloc_lock. */
1996 if (map_address != (void *)-1) {
1997 LOCK_COMMAND(&alloc_lock);
1998 release_info[release_pos].address = map_address;
1999 release_info[release_pos].func = alloc_mmap_free;
2001 UNLOCK_COMMAND(&alloc_lock);
/* Request the MPOL_PREFERRED memory policy for the mapped range. */
2005 my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
/* Number of timed passes made by run_bench. */
2013 #define BENCH_ITERATION 4
/* Times a pointer chase through the region [address, address + size) and
   returns the smallest elapsed value seen over BENCH_ITERATION passes.
   The region is expected to hold a chain of pointers, one per page, as
   prepared by alloc_mmap. */
2016 static inline BLASULONG run_bench(BLASULONG address, BLASULONG size) {
2018 BLASULONG original, *p;
2019 BLASULONG start, stop, min;
2022 min = (BLASULONG)-1;
/* Save the link stored in the last page, then point it back to the start
   so the chain becomes a cycle limited to this region. */
2024 original = *(BLASULONG *)(address + size - PAGESIZE);
2026 *(BLASULONG *)(address + size - PAGESIZE) = (BLASULONG)address;
2028 for (iter = 0; iter < BENCH_ITERATION; iter ++ ) {
2030 p = (BLASULONG *)address;
2032 count = size / PAGESIZE;
/* Follow one link per page. */
2036 for (i = 0; i < count; i ++) {
2037 p = (BLASULONG *)(*p);
2042 if (min > stop - start) min = stop - start;
/* Restore the saved link. The final pointer is also stored 8 bytes further on;
   NOTE(review): presumably to keep the chase from being optimized away - confirm. */
2045 *(BLASULONG *)(address + size - PAGESIZE + 0) = original;
2046 *(BLASULONG *)(address + size - PAGESIZE + 8) = (BLASULONG)p;
/* Benchmarking variant of alloc_mmap. Depending on configuration it either
   maps BUFFER_SIZE bytes directly, or maps SCALING times as much, times
   candidate windows with run_bench, keeps the best BUFFER_SIZE window and
   unmaps the rest. A successful mapping is recorded in release_info. */
2051 static void *alloc_mmap(void *address){
2052 void *map_address, *best_address;
2053 BLASULONG best, start, current;
2054 BLASULONG allocsize;
2057 /* Just give up on the advanced operation */
2058 map_address = mmap(address, BUFFER_SIZE, MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0);
2061 my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
2065 #if defined(OS_LINUX) && !defined(NO_WARMUP)
/* While hot_alloc is 0 a plain mapping without an address hint is used. */
2066 if (hot_alloc == 0) {
2067 map_address = mmap(NULL, BUFFER_SIZE, MMAP_ACCESS, MMAP_POLICY, -1, 0);
2070 my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
/* Over-allocate by a factor of SCALING so that a window can be selected. */
2076 map_address = mmap(NULL, BUFFER_SIZE * SCALING,
2077 MMAP_ACCESS, MMAP_POLICY, -1, 0);
2079 if (map_address != (void *)-1) {
2084 ret=my_mbind(map_address, BUFFER_SIZE * SCALING, MPOL_PREFERRED, NULL, 0, 0);
2087 perror("OpenBLAS alloc_mmap:");
2088 printf("error code=%d,\tmap_address=%lx\n",errsv,map_address);
2092 my_mbind(map_address, BUFFER_SIZE * SCALING, MPOL_PREFERRED, NULL, 0, 0);
/* Size of the window handed to run_bench. */
2097 allocsize = DGEMM_P * DGEMM_Q * sizeof(double);
/* Build the pointer chain: each page's first word points to the next page. */
2099 start = (BLASULONG)map_address;
2100 current = (SCALING - 1) * BUFFER_SIZE;
2102 while(current > 0) {
2103 *(BLASLONG *)start = (BLASLONG)start + PAGESIZE;
2105 current -= PAGESIZE;
/* The last page links back to the beginning of the mapping. */
2108 *(BLASLONG *)(start - PAGESIZE) = (BLASULONG)map_address;
2110 start = (BLASULONG)map_address;
2112 best = (BLASULONG)-1;
2113 best_address = map_address;
/* Time each candidate window and remember the fastest one. */
2115 while ((start + allocsize < (BLASULONG)map_address + (SCALING - 1) * BUFFER_SIZE)) {
2117 current = run_bench(start, allocsize);
2119 if (best > current) {
2121 best_address = (void *)start;
/* Give back everything before and after the chosen BUFFER_SIZE window. */
2128 if ((BLASULONG)best_address > (BLASULONG)map_address)
2129 munmap(map_address, (BLASULONG)best_address - (BLASULONG)map_address);
2131 munmap((void *)((BLASULONG)best_address + BUFFER_SIZE), (SCALING - 1) * BUFFER_SIZE + (BLASULONG)map_address - (BLASULONG)best_address);
2133 map_address = best_address;
2135 #if defined(OS_LINUX) && !defined(NO_WARMUP)
2140 #if defined(OS_LINUX) && !defined(NO_WARMUP)
/* Record the surviving mapping under alloc_lock. */
2143 LOCK_COMMAND(&alloc_lock);
2145 if (map_address != (void *)-1) {
2146 release_info[release_pos].address = map_address;
2147 release_info[release_pos].func = alloc_mmap_free;
2150 UNLOCK_COMMAND(&alloc_lock);
/* Release callback for buffers obtained by alloc_malloc: frees release->address. */
2162 static void alloc_malloc_free(struct release_t *release){
2164 free(release -> address);
/* Allocates a buffer with malloc and records it in release_info.
   The address argument is not used by the visible statements. */
2168 static void *alloc_malloc(void *address){
/* One extra FIXED_PAGESIZE is requested on top of BUFFER_SIZE. */
2172 map_address = (void *)malloc(BUFFER_SIZE + FIXED_PAGESIZE);
/* Translate malloc's NULL into the (void *)-1 failure value used by the allocators. */
2174 if (map_address == (void *)NULL) map_address = (void *)-1;
2176 if (map_address != (void *)-1) {
2177 release_info[release_pos].address = map_address;
2178 release_info[release_pos].func = alloc_malloc_free;
/* External qalloc / qfree allocator interface. */
2190 void *qalloc(int flags, size_t bytes);
2191 void *qfree (void *address);
2193 #define QNONCACHE 0x1
/* Release callback for buffers obtained by alloc_qalloc. */
2197 static void alloc_qalloc_free(struct release_t *release){
2199 qfree(release -> address);
/* Allocates a buffer with qalloc, records it in release_info and returns the
   address rounded up to a FIXED_PAGESIZE boundary. */
2203 static void *alloc_qalloc(void *address){
2206 map_address = (void *)qalloc(QCOMMS | QFAST, BUFFER_SIZE + FIXED_PAGESIZE);
/* Translate NULL into the (void *)-1 failure value. */
2208 if (map_address == (void *)NULL) map_address = (void *)-1;
/* The unrounded address is what gets recorded for release. */
2210 if (map_address != (void *)-1) {
2211 release_info[release_pos].address = map_address;
2212 release_info[release_pos].func = alloc_qalloc_free;
/* Round up to the next multiple of FIXED_PAGESIZE. */
2216 return (void *)(((BLASULONG)map_address + FIXED_PAGESIZE - 1) & ~(FIXED_PAGESIZE - 1));
2221 #ifdef ALLOC_WINDOWS
/* Release callback for buffers obtained by alloc_windows. */
2223 static void alloc_windows_free(struct release_t *release){
/* NOTE(review): MEM_DECOMMIT decommits the pages but keeps the address range
   reserved; releasing a MEM_RESERVE allocation normally uses MEM_RELEASE with
   size 0 - confirm whether keeping the reservation is intended. */
2225 VirtualFree(release -> address, BUFFER_SIZE, MEM_DECOMMIT);
/* Reserves and commits a buffer with VirtualAlloc, using address as the
   requested base, and records it in release_info. */
2229 static void *alloc_windows(void *address){
2232 map_address = VirtualAlloc(address,
2234 MEM_RESERVE | MEM_COMMIT,
/* Translate VirtualAlloc's NULL into the (void *)-1 failure value. */
2237 if (map_address == (void *)NULL) map_address = (void *)-1;
2239 if (map_address != (void *)-1) {
2240 release_info[release_pos].address = map_address;
2241 release_info[release_pos].func = alloc_windows_free;
2250 #ifdef ALLOC_DEVICEDRIVER
/* Default device path, used unless DEVICEDRIVER_NAME is already defined. */
2251 #ifndef DEVICEDRIVER_NAME
2252 #define DEVICEDRIVER_NAME "/dev/mapper"
/* Release callback for alloc_devicedirver: unmaps the buffer and closes the
   descriptor kept in release->attr, printing a message if either step fails. */
2255 static void alloc_devicedirver_free(struct release_t *release){
2257 if (munmap(release -> address, BUFFER_SIZE)) {
2258 printf("OpenBLAS : Bugphysarea unmap failed.\n");
2261 if (close(release -> attr)) {
2262 printf("OpenBLAS : Bugphysarea close failed.\n");
/* Opens DEVICEDRIVER_NAME and maps BUFFER_SIZE bytes of it as shared,
   read/write memory; on success the mapping and descriptor are recorded. */
2267 static void *alloc_devicedirver(void *address){
2272 if ((fd = open(DEVICEDRIVER_NAME, O_RDWR | O_SYNC)) < 0) {
2278 map_address = mmap(address, BUFFER_SIZE,
2279 PROT_READ | PROT_WRITE,
2280 MAP_FILE | MAP_SHARED,
/* Keep the descriptor in attr so the release callback can close it. */
2283 if (map_address != (void *)-1) {
2284 release_info[release_pos].address = map_address;
2285 release_info[release_pos].attr = fd;
2286 release_info[release_pos].func = alloc_devicedirver_free;
/* Release callback for alloc_shm: detaches the segment at release->address
   and prints a message if shmdt fails. */
2297 static void alloc_shm_free(struct release_t *release){
2299 if (shmdt(release -> address)) {
2300 printf("OpenBLAS : Shared memory unmap failed.\n");
/* Creates a private System V shared memory segment of BUFFER_SIZE bytes,
   attaches it at the requested address and records it in release_info. */
2304 static void *alloc_shm(void *address){
/* Mode 0600: owner read/write only. The shmget result is not checked here. */
2308 shmid = shmget(IPC_PRIVATE, BUFFER_SIZE,IPC_CREAT | 0600);
2310 map_address = (void *)shmat(shmid, address, 0);
2312 if (map_address != (void *)-1){
2315 my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
/* Mark the segment for removal right after attaching it. */
2318 shmctl(shmid, IPC_RMID, 0);
2320 release_info[release_pos].address = map_address;
2321 release_info[release_pos].attr = shmid;
2322 release_info[release_pos].func = alloc_shm_free;
2329 #if defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS
/* Release callback for alloc_hugetlb; the release call depends on the platform. */
2331 static void alloc_hugetlb_free(struct release_t *release){
2333 #if defined(OS_LINUX) || defined(OS_AIX)
/* Linux / AIX: the buffer is a shared memory attachment. */
2334 if (shmdt(release -> address)) {
2335 printf("OpenBLAS : Hugepage unmap failed.\n");
2341 munmap(release -> address, BUFFER_SIZE);
/* NOTE(review): MEM_LARGE_PAGES is an allocation-type flag; confirm that it
   is accepted as part of VirtualFree's free type. */
2347 VirtualFree(release -> address, BUFFER_SIZE, MEM_LARGE_PAGES | MEM_DECOMMIT);
/* Allocates a buffer backed by large pages using a platform-specific
   mechanism and records it in release_info. map_address starts out as the
   (void *)-1 failure value. */
2353 static void *alloc_hugetlb(void *address){
2355 void *map_address = (void *)-1;
2357 #if defined(OS_LINUX) || defined(OS_AIX)
/* Linux / AIX: private shared memory segment, attached with SHM_RND. */
2360 shmid = shmget(IPC_PRIVATE, BUFFER_SIZE,
2365 SHM_LGPAGE | SHM_PIN |
2367 IPC_CREAT | SHM_R | SHM_W);
2370 map_address = (void *)shmat(shmid, address, SHM_RND);
2373 my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
/* Mark the segment for removal once it has been attached. */
2376 if (map_address != (void *)-1){
2377 shmctl(shmid, IPC_RMID, 0);
/* memcntl path: advise HUGE_PAGESIZE pages, then allocate an aligned block. */
2383 struct memcntl_mha mha;
2385 mha.mha_cmd = MHA_MAPSIZE_BSSBRK;
2387 mha.mha_pagesize = HUGE_PAGESIZE;
2388 memcntl(NULL, 0, MC_HAT_ADVISE, (char *)&mha, 0, 0);
/* NOTE(review): the result is cast to BLASULONG but stored in a void *;
   a failed memalign would also yield NULL rather than (void *)-1 - confirm. */
2390 map_address = (BLASULONG)memalign(HUGE_PAGESIZE, BUFFER_SIZE);
/* Windows path: enable SE_LOCK_MEMORY_NAME on the process token, allocate
   with MEM_LARGE_PAGES, then clear the privilege attribute again. */
2396 TOKEN_PRIVILEGES tp;
2398 if (OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES, &hToken) != TRUE) return (void *) -1;
2400 tp.PrivilegeCount = 1;
2401 tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED;
2403 if (LookupPrivilegeValue(NULL, SE_LOCK_MEMORY_NAME, &tp.Privileges[0].Luid) != TRUE) {
2404 CloseHandle(hToken);
2408 if (AdjustTokenPrivileges(hToken, FALSE, &tp, 0, NULL, NULL) != TRUE) {
2409 CloseHandle(hToken);
2413 map_address = (void *)VirtualAlloc(address,
2415 MEM_LARGE_PAGES | MEM_RESERVE | MEM_COMMIT,
2418 tp.Privileges[0].Attributes = 0;
2419 AdjustTokenPrivileges(hToken, FALSE, &tp, 0, NULL, NULL);
/* Translate VirtualAlloc's NULL into the (void *)-1 failure value. */
2421 if (map_address == (void *)NULL) map_address = (void *)-1;
2425 if (map_address != (void *)-1){
2426 release_info[release_pos].address = map_address;
2427 release_info[release_pos].func = alloc_hugetlb_free;
2437 #ifdef ALLOC_HUGETLBFILE
/* Process id used in the backing file name; filled in on first use. */
2439 static int hugetlb_pid = 0;
/* Release callback for alloc_hugetlbfile: unmaps the buffer and closes the
   descriptor kept in release->attr, printing a message if either step fails. */
2441 static void alloc_hugetlbfile_free(struct release_t *release){
2443 if (munmap(release -> address, BUFFER_SIZE)) {
2444 printf("OpenBLAS : HugeTLBfs unmap failed.\n");
2447 if (close(release -> attr)) {
2448 printf("OpenBLAS : HugeTLBfs close failed.\n");
/* Maps a buffer from a per-process file created under HUGETLB_FILE_NAME and
   records the mapping and descriptor in release_info. */
2452 static void *alloc_hugetlbfile(void *address){
2454 void *map_address = (void *)-1;
2458 if (!hugetlb_pid) hugetlb_pid = getpid();
/* File name is "<HUGETLB_FILE_NAME>/gotoblas.<pid>".
   NOTE(review): sprintf does not bound the write; confirm that filename is
   large enough for the configured path. */
2460 sprintf(filename, "%s/gotoblas.%d", HUGETLB_FILE_NAME, hugetlb_pid);
2462 if ((fd = open(filename, O_RDWR | O_CREAT, 0700)) < 0) {
2468 map_address = mmap(address, BUFFER_SIZE,
2469 PROT_READ | PROT_WRITE,
/* Keep the descriptor in attr so the release callback can close it. */
2473 if (map_address != (void *)-1) {
2474 release_info[release_pos].address = map_address;
2475 release_info[release_pos].attr = fd;
2476 release_info[release_pos].func = alloc_hugetlbfile_free;
/* Address hint passed to the allocators by blas_memory_alloc; two alternative
   initial values (0 or BASE_ADDRESS) are provided. */
2486 static BLASULONG base_address = 0UL;
2488 static BLASULONG base_address = BASE_ADDRESS;
/* Table of buffer slots, one entry per buffer. blas_memory_alloc and
   blas_shutdown use the fields addr, used and lock, plus pos when WHEREAMI
   is defined without USE_OPENMP. */
2491 static volatile struct {
2494 #if defined(WHEREAMI) && !defined(USE_OPENMP)
2504 } memory[NUM_BUFFERS];
/* Initialisation stage: 0 at start, set to 1 and later to 2 by blas_memory_alloc. */
2506 static int memory_initialized = 0;
/* Hands out one buffer from the static memory[] table: the first slot that is
   not in use is claimed, a buffer is mapped for it the first time the slot is
   used, and the slot's address is returned. */
2508 /* Memory allocation routine */
2509 /* procpos ... indicates where it comes from */
2510 /* 0 : Level 3 functions */
2511 /* 1 : Level 2 functions */
2514 void *blas_memory_alloc(int procpos){
2517 #if defined(WHEREAMI) && !defined(USE_OPENMP)
/* Ordered list of candidate allocators; the entries depend on the ALLOC_*
   configuration macros. */
2523 void *(*memoryalloc[])(void *address) = {
2524 #ifdef ALLOC_DEVICEDRIVER
2527 /* Hugetlb implicitly assumes ALLOC_SHM */
2531 #if ((defined ALLOC_SHM) && (defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS))
2540 #ifdef ALLOC_WINDOWS
2548 void *(**func)(void *address);
/* One-time initialisation, performed under alloc_lock. */
2549 LOCK_COMMAND(&alloc_lock);
2551 if (!memory_initialized) {
2553 #if defined(WHEREAMI) && !defined(USE_OPENMP)
2554 for (position = 0; position < NUM_BUFFERS; position ++){
2555 memory[position].addr = (void *)0;
2556 memory[position].pos = -1;
2557 memory[position].used = 0;
2558 memory[position].lock = 0;
2563 gotoblas_dynamic_init();
2566 #if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY)
2567 gotoblas_affinity_init();
2571 if (!blas_num_threads) blas_cpu_number = blas_get_cpu_number();
2574 #if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64)
2575 #ifndef DYNAMIC_ARCH
2576 blas_set_parameter();
2580 memory_initialized = 1;
2583 UNLOCK_COMMAND(&alloc_lock);
2586 printf("Alloc Start ...\n");
2589 /* #if defined(WHEREAMI) && !defined(USE_OPENMP)
2594 while (position >= NUM_BUFFERS) position >>= 1;
2597 if (!memory[position].used && (memory[position].pos == mypos)) {
2598 LOCK_COMMAND(&alloc_lock);
2599 // blas_lock(&memory[position].lock);
2601 if (!memory[position].used) goto allocation;
2603 UNLOCK_COMMAND(&alloc_lock);
2604 // blas_unlock(&memory[position].lock);
2609 } while (position < NUM_BUFFERS);
2616 LOCK_COMMAND(&alloc_lock);
2618 /* if (!memory[position].used) { */
2619 /* blas_lock(&memory[position].lock);*/
/* Take the first slot that is not in use (alloc_lock is held). */
2621 if (!memory[position].used) goto allocation;
2623 /* blas_unlock(&memory[position].lock);*/
2628 } while (position < NUM_BUFFERS);
2629 UNLOCK_COMMAND(&alloc_lock);
2636 printf(" Position -> %d\n", position);
/* Claim the slot and release the lock before mapping. */
2639 memory[position].used = 1;
2641 UNLOCK_COMMAND(&alloc_lock);
2642 /* blas_unlock(&memory[position].lock);*/
/* Map a buffer only the first time this slot is handed out. */
2644 if (!memory[position].addr) {
2647 printf("Allocation Start : %lx\n", base_address);
2650 map_address = (void *)-1;
2652 func = &memoryalloc[0];
/* Try the allocators in order until one succeeds.
   NOTE(review): func points into the memoryalloc array and is never NULL
   itself; the end-of-list test was presumably meant to be *func != NULL -
   confirm. */
2654 while ((func != NULL) && (map_address == (void *) -1)) {
2656 map_address = (*func)((void *)base_address);
2658 #ifdef ALLOC_DEVICEDRIVER
2659 if ((*func == alloc_devicedirver) && (map_address == (void *)-1)) {
2660 fprintf(stderr, "OpenBLAS Warning ... Physically contigous allocation was failed.\n");
2664 #ifdef ALLOC_HUGETLBFILE
2665 if ((*func == alloc_hugetlbfile) && (map_address == (void *)-1)) {
2667 fprintf(stderr, "OpenBLAS Warning ... HugeTLB(File) allocation was failed.\n");
2672 #if (defined ALLOC_SHM) && (defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS)
2673 if ((*func == alloc_hugetlb) && (map_address != (void *)-1)) hugetlb_allocated = 1;
2680 printf(" Success -> %08lx\n", map_address);
/* After a failed round the address hint is dropped (0) and the round is
   repeated; a non-zero hint is advanced past the buffer just requested. */
2682 if (((BLASLONG) map_address) == -1) base_address = 0UL;
2684 if (base_address) base_address += BUFFER_SIZE + FIXED_PAGESIZE;
2686 } while ((BLASLONG)map_address == -1);
/* Publish the mapped address in the slot under alloc_lock. */
2688 LOCK_COMMAND(&alloc_lock);
2689 memory[position].addr = map_address;
2690 UNLOCK_COMMAND(&alloc_lock);
2693 printf(" Mapping Succeeded. %p(%d)\n", (void *)memory[position].addr, position);
2697 #if defined(WHEREAMI) && !defined(USE_OPENMP)
2699 if (memory[position].pos == -1) memory[position].pos = mypos;
/* Second initialisation stage: memory_initialized is tested before and after
   taking the lock, and moves from 1 to 2 once gotoblas has been set up. */
2705 if (memory_initialized == 1) {
2707 LOCK_COMMAND(&alloc_lock);
2709 if (memory_initialized == 1) {
2711 if (!gotoblas) gotoblas_dynamic_init();
2713 memory_initialized = 2;
2716 UNLOCK_COMMAND(&alloc_lock);
2723 printf("Mapped : %p %3d\n\n",
2724 (void *)memory[position].addr, position);
2727 return (void *)memory[position].addr;
/* Error message for the case in which no free slot was found. */
2730 printf("BLAS : Program is Terminated. Because you tried to allocate too many memory regions.\n");
/* Marks the slot whose address equals free_area as unused. The mapping itself
   is kept so the slot can be handed out again. An address that is not found
   is reported together with a dump of the slot table. */
2735 void blas_memory_free(void *free_area){
2740 printf("Unmapped Start : %p ...\n", free_area);
2744 LOCK_COMMAND(&alloc_lock);
/* Linear search for the slot that owns free_area. */
2746 while ((position < NUM_BUFFERS) && (memory[position].addr != free_area))
/* NOTE(review): when the loop above ends with position == NUM_BUFFERS this
   test reads memory[NUM_BUFFERS], one element past the table - confirm and
   consider testing position first. */
2749 if (memory[position].addr != free_area) goto error;
2752 printf(" Position : %d\n", position);
2755 // arm: ensure all writes are finished before other thread takes this memory
2758 memory[position].used = 0;
2759 UNLOCK_COMMAND(&alloc_lock);
2762 printf("Unmap Succeeded.\n\n");
/* Error path: report the bad address, dump every slot, then release the lock. */
2768 printf("BLAS : Bad memory unallocation! : %4d %p\n", position, free_area);
2771 for (position = 0; position < NUM_BUFFERS; position++)
2772 printf("%4ld %p : %d\n", position, memory[position].addr, memory[position].used);
2774 UNLOCK_COMMAND(&alloc_lock);
/* Allocates BUFFER_SIZE + FIXED_PAGESIZE bytes directly with malloc, without
   touching alloc_lock or the slot table. The parameter is unused. */
2779 void *blas_memory_alloc_nolock(int unused) {
2781 map_address = (void *)malloc(BUFFER_SIZE + FIXED_PAGESIZE);
/* Counterpart of blas_memory_alloc_nolock; takes the address that function returned. */
2785 void blas_memory_free_nolock(void * map_address) {
/* Shuts the library down: stops the worker threads, runs every recorded
   release callback and resets the slot table, all under alloc_lock. */
2789 void blas_shutdown(void){
2794 BLASFUNC(blas_thread_shutdown)();
2797 LOCK_COMMAND(&alloc_lock);
/* Release every allocation recorded in release_info. */
2799 for (pos = 0; pos < release_pos; pos ++) {
2800 release_info[pos].func(&release_info[pos]);
/* Restore the address hint to its configured starting value. */
2806 base_address = BASE_ADDRESS;
/* Clear every slot so that a later allocation starts from scratch. */
2809 for (pos = 0; pos < NUM_BUFFERS; pos ++){
2810 memory[pos].addr = (void *)0;
2811 memory[pos].used = 0;
2812 #if defined(WHEREAMI) && !defined(USE_OPENMP)
2813 memory[pos].pos = -1;
2815 memory[pos].lock = 0;
2818 UNLOCK_COMMAND(&alloc_lock);
2823 #if defined(OS_LINUX) && !defined(NO_WARMUP)
/* Lock taken by _touch_memory; its type follows the configured locking primitive. */
2826 #if defined(USE_PTHREAD_LOCK)
2827 static pthread_mutex_t init_lock = PTHREAD_MUTEX_INITIALIZER;
2828 #elif defined(USE_PTHREAD_SPINLOCK)
2829 static pthread_spinlock_t init_lock = 0;
2831 static BLASULONG init_lock = 0UL;
/* Writes into the buffer that starts at sa + GEMM_OFFSET_A. The signature
   matches the routine type stored in blas_queue_t by _init_thread_memory;
   only sa is used by the visible statements. */
2835 static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n,
2836 void *sa, void *sb, BLASLONG pos) {
2838 #if !defined(ARCH_POWER) && !defined(ARCH_SPARC)
/* First pass covers the buffer minus one page. */
2843 size = BUFFER_SIZE - PAGESIZE;
2844 buffer = (BLASULONG)sa + GEMM_OFFSET_A;
2846 #if defined(OS_LINUX) && !defined(NO_WARMUP)
2847 if (hot_alloc != 2) {
/* The writes of this pass are made while holding init_lock. */
2851 LOCK_COMMAND(&init_lock);
2855 *(int *)buffer = size;
2861 UNLOCK_COMMAND(&init_lock);
/* Second pass is limited to at most L2_SIZE bytes. */
2864 size = MIN((BUFFER_SIZE - PAGESIZE), L2_SIZE);
2865 buffer = (BLASULONG)sa + GEMM_OFFSET_A;
2868 *(int *)buffer = size;
2873 #if defined(OS_LINUX) && !defined(NO_WARMUP)
/* Builds one queue entry per thread, each running _touch_memory, links the
   entries into a list and submits the list through exec_blas. */
2882 static void _init_thread_memory(void *buffer) {
2884 blas_queue_t queue[MAX_CPU_NUMBER];
2887 for (num_cpu = 0; num_cpu < blas_num_threads; num_cpu++) {
2889 blas_queue_init(&queue[num_cpu]);
2890 queue[num_cpu].mode = BLAS_DOUBLE | BLAS_REAL;
2891 queue[num_cpu].routine = &_touch_memory;
2892 queue[num_cpu].args = NULL;
2893 queue[num_cpu].next = &queue[num_cpu + 1];
/* Terminate the list at the last entry that was filled in.
   NOTE(review): with blas_num_threads == 0 this would index queue[-1] -
   confirm that callers guarantee at least one thread. */
2896 queue[num_cpu - 1].next = NULL;
/* Only the first entry receives the caller's buffer as sa. */
2897 queue[0].sa = buffer;
2899 exec_blas(num_cpu, queue);
/* Warm-up step: allocates one buffer, touches it (through the worker threads
   or directly, depending on configuration) and frees the buffer again. */
2904 static void gotoblas_memory_init(void) {
2910 buffer = (void *)blas_memory_alloc(0);
/* Make sure the thread count and the thread server are set up first. */
2913 if (blas_cpu_number == 0) blas_get_cpu_number();
2915 if (blas_server_avail == 0) blas_thread_init();
/* Threaded variant. */
2918 _init_thread_memory((void *)((BLASULONG)buffer + GEMM_OFFSET_A));
/* Direct variant, run on the calling thread. */
2922 _touch_memory(NULL, NULL, NULL, (void *)((BLASULONG)buffer + GEMM_OFFSET_A), NULL, 0);
2926 blas_memory_free(buffer);
2930 /* Initialization for all functions; this function should be called before main */
/* Guard flag: non-zero once gotoblas_init has completed; cleared by gotoblas_quit. */
2932 static int gotoblas_initialized = 0;
2933 extern void openblas_read_env();
/* Library initialiser, decorated with CONSTRUCTOR. Does nothing when the
   library is already initialised. */
2935 void CONSTRUCTOR gotoblas_init(void) {
2937 if (gotoblas_initialized) return;
2940 openblas_fork_handler();
2943 openblas_read_env();
2950 gotoblas_dynamic_init();
2953 #if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY)
2954 gotoblas_affinity_init();
2957 #if defined(OS_LINUX) && !defined(NO_WARMUP)
2958 gotoblas_memory_init();
2961 //#if defined(OS_LINUX)
/* Raises the soft RLIMIT_STACK limit to the hard limit when the two differ.
   NOTE(review): check the surrounding preprocessor guard to see whether this
   section is compiled in. */
2963 struct rlimit curlimit;
2964 if ( getrlimit(RLIMIT_STACK, &curlimit ) == 0 )
2966 if ( curlimit.rlim_cur != curlimit.rlim_max )
2968 curlimit.rlim_cur = curlimit.rlim_max;
2969 setrlimit(RLIMIT_STACK, &curlimit);
/* Determine the thread count and start the thread server if necessary. */
2975 if (blas_cpu_number == 0) blas_get_cpu_number();
2977 if (blas_server_avail == 0) blas_thread_init();
2981 #ifdef FUNCTION_PROFILE
2982 gotoblas_profile_init();
2985 gotoblas_initialized = 1;
/* Library finaliser, decorated with DESTRUCTOR. Does nothing unless
   gotoblas_init has completed; clears the initialised flag at the end. */
2993 void DESTRUCTOR gotoblas_quit(void) {
2995 if (gotoblas_initialized == 0) return;
3003 #ifdef FUNCTION_PROFILE
3004 gotoblas_profile_quit();
3007 #if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY)
3008 gotoblas_affinity_quit();
3012 gotoblas_dynamic_quit();
3015 gotoblas_initialized = 0;
3022 #if defined(_MSC_VER) && !defined(__clang__)
/* DLL entry point for MSVC (non-clang) builds; dispatches on the notification
   reason passed in ul_reason_for_call.
   NOTE(review): presumably process attach / detach run gotoblas_init /
   gotoblas_quit, since CONSTRUCTOR / DESTRUCTOR expand to plain __cdecl for
   this compiler - confirm. */
3023 BOOL APIENTRY DllMain(HMODULE hModule, DWORD ul_reason_for_call, LPVOID lpReserved)
3025 switch (ul_reason_for_call)
3027 case DLL_PROCESS_ATTACH:
3030 case DLL_THREAD_ATTACH:
3032 case DLL_THREAD_DETACH:
3034 case DLL_PROCESS_DETACH:
3044 This is to allow static linking.
3045 Code adapted from Google performance tools:
3046 https://gperftools.googlecode.com/git-history/perftools-1.0/src/windows/port.cc
3048 https://sourceware.org/ml/pthreads-win32/2008/msg00028.html
3049 http://ci.boost.org/svn-trac/browser/trunk/libs/thread/src/win32/tss_pe.cpp
/* Termination hook; p_process_term below holds its address. */
3051 static int on_process_term(void)
/* Keep the _tls_used / __tls_used symbol in the link (both spellings are listed). */
3057 #pragma comment(linker, "/INCLUDE:_tls_used")
3059 #pragma comment(linker, "/INCLUDE:__tls_used")
/* A pointer to DllMain is placed in the .CRT$XLB section.
   NOTE(review): presumably so that the CRT calls DllMain as a TLS callback in
   statically linked builds - confirm against the CRT documentation. */
3063 #pragma const_seg(".CRT$XLB")
3065 #pragma data_seg(".CRT$XLB")
3067 static void (APIENTRY *dll_callback)(HINSTANCE h, DWORD ul_reason_for_call, PVOID pv) = DllMain;
/* A pointer to on_process_term is placed in the .CRT$XTU section. */
3075 #pragma const_seg(".CRT$XTU")
3077 #pragma data_seg(".CRT$XTU")
3079 static int(*p_process_term)(void) = on_process_term;
3087 #if (defined(C_PGI) || (!defined(C_SUN) && defined(F_INTERFACE_SUN))) && (defined(ARCH_X86) || defined(ARCH_X86_64))
3088 /* Do not call this function; it exists only as a workaround for a PGI / Sun bug. */
3089 void gotoblas_dummy_for_PGI(void) {
3095 asm ("\t.section\t.ctors,\"aw\",@progbits; .align 8; .quad gotoblas_init; .section .text");
3096 asm ("\t.section\t.dtors,\"aw\",@progbits; .align 8; .quad gotoblas_quit; .section .text");
3098 asm (".section .init,\"ax\"; call gotoblas_init@PLT; .section .text");
3099 asm (".section .fini,\"ax\"; call gotoblas_quit@PLT; .section .text");