1 /*****************************************************************************
2 Copyright (c) 2011-2014, The OpenBLAS Project
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions are
9 1. Redistributions of source code must retain the above copyright
10 notice, this list of conditions and the following disclaimer.
12 2. Redistributions in binary form must reproduce the above copyright
13 notice, this list of conditions and the following disclaimer in
14 the documentation and/or other materials provided with the
16 3. Neither the name of the OpenBLAS project nor the names of
17 its contributors may be used to endorse or promote products
18 derived from this software without specific prior written
21 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
25 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
27 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
28 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
29 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
30 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 **********************************************************************************/
34 /*********************************************************************/
35 /* Copyright 2009, 2010 The University of Texas at Austin. */
36 /* All rights reserved. */
38 /* Redistribution and use in source and binary forms, with or */
39 /* without modification, are permitted provided that the following */
40 /* conditions are met: */
42 /* 1. Redistributions of source code must retain the above */
43 /* copyright notice, this list of conditions and the following */
46 /* 2. Redistributions in binary form must reproduce the above */
47 /* copyright notice, this list of conditions and the following */
48 /* disclaimer in the documentation and/or other materials */
49 /* provided with the distribution. */
51 /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
52 /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
53 /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
54 /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
55 /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
56 /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
57 /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
58 /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
59 /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
60 /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
61 /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
62 /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
63 /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
64 /* POSSIBILITY OF SUCH DAMAGE. */
66 /* The views and conclusions contained in the software and */
67 /* documentation are those of the authors and should not be */
68 /* interpreted as representing official policies, either expressed */
69 /* or implied, of The University of Texas at Austin. */
70 /*********************************************************************/
76 #if defined(USE_TLS) && defined(SMP)
83 #if defined(__GLIBC_PREREQ)
84 #if !__GLIBC_PREREQ(2,20)
90 #if defined(COMPILE_TLS)
94 #if defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT)
96 #ifndef MEM_LARGE_PAGES
97 #define MEM_LARGE_PAGES 0x20000000
108 #if !defined(OS_WINDOWS) || defined(OS_CYGWIN_NT)
109 #include <sys/mman.h>
116 #include <sys/types.h>
119 #include <sys/sysinfo.h>
122 #include <linux/unistd.h>
123 #include <sys/syscall.h>
124 #include <sys/time.h>
125 #include <sys/resource.h>
132 #if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN)
133 #include <sys/sysctl.h>
134 #include <sys/resource.h>
137 #if defined(OS_WINDOWS) && (defined(__MINGW32__) || defined(__MINGW64__))
140 #define printf _cprintf
145 #ifndef MPOL_PREFERRED
146 #define MPOL_PREFERRED 1
151 #if (defined(PPC440) || !defined(OS_LINUX) || defined(HPL)) && !defined(NO_WARMUP)
156 #define SHM_HUGETLB 04000
159 #ifndef FIXED_PAGESIZE
160 #define FIXED_PAGESIZE 4096
/* Extract a bit field: shift `value` right by `shift` bits, then AND the
   result with `mask`. Every argument is evaluated exactly once. */
#define BITMASK(value, shift, mask) (((value) >> (shift)) & (mask))
165 #if defined(_MSC_VER) && !defined(__clang__)
166 #define CONSTRUCTOR __cdecl
167 #define DESTRUCTOR __cdecl
168 #elif (defined(OS_DARWIN) || defined(OS_SUNOS)) && defined(C_GCC)
169 #define CONSTRUCTOR __attribute__ ((constructor))
170 #define DESTRUCTOR __attribute__ ((destructor))
171 #elif __GNUC__ && INIT_PRIORITY && ((GCC_VERSION >= 40300) || (CLANG_VERSION >= 20900))
172 #define CONSTRUCTOR __attribute__ ((constructor(101)))
173 #define DESTRUCTOR __attribute__ ((destructor(101)))
175 #define CONSTRUCTOR __attribute__ ((constructor))
176 #define DESTRUCTOR __attribute__ ((destructor))
180 gotoblas_t *gotoblas = NULL;
182 extern void openblas_warning(int verbose, const char * msg);
186 #define blas_cpu_number 1
187 #define blas_num_threads 1
/* Dummy implementation: always reports a single processor.
   (Presumably the variant compiled when threading support is disabled --
   confirm against the enclosing preprocessor conditional.)
   Returns: 1, unconditionally. */
int goto_get_num_procs(void) {
  return 1;
}
/* Dummy implementation: the requested thread count is accepted and ignored.
   num_threads: requested number of threads (unused here). */
void goto_set_num_threads(int num_threads) {
  (void)num_threads;  /* intentionally unused */
}
195 #if defined(OS_LINUX) || defined(OS_SUNOS) || defined(OS_NETBSD)
197 int get_num_procs(void);
199 int get_num_procs(void) {
206 if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
207 #if !defined(OS_LINUX)
211 #if !defined(__GLIBC_PREREQ)
214 #if !__GLIBC_PREREQ(2, 3)
218 #if !__GLIBC_PREREQ(2, 7)
219 ret = sched_getaffinity(0,sizeof(cpu_set_t), cpusetp);
220 if (ret!=0) return nums;
222 #if !__GLIBC_PREREQ(2, 6)
224 if (CPU_ISSET(i,cpusetp)) n++;
227 nums = CPU_COUNT(sizeof(cpu_set_t),cpusetp);
231 cpusetp = CPU_ALLOC(nums);
232 if (cpusetp == NULL) return nums;
233 size = CPU_ALLOC_SIZE(nums);
234 ret = sched_getaffinity(0,size,cpusetp);
235 if (ret!=0) return nums;
236 ret = CPU_COUNT_S(size,cpusetp);
237 if (ret > 0 && ret < nums) nums = ret;
247 int get_num_procs(void) {
249 if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
255 int get_num_procs(void) {
257 if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
263 int get_num_procs(void) {
265 if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
274 int get_num_procs(void) {
282 GetSystemInfo(&sysinfo);
284 nums = sysinfo.dwNumberOfProcessors;
292 #if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY)
294 int get_num_procs(void) {
305 sysctl(m, 2, &nums, &len, NULL, 0);
313 #if defined(OS_DARWIN)
314 int get_num_procs(void) {
319 sysctlbyname("hw.physicalcpu", &nums, &len, NULL, 0);
324 void set_stack_limit(int limitMB){
329 StackSize=limitMB*1024*1024;
330 result=getrlimit(RLIMIT_STACK, &rl);
332 if(rl.rlim_cur < StackSize){
333 rl.rlim_cur=StackSize;
334 result=setrlimit(RLIMIT_STACK, &rl);
336 fprintf(stderr, "OpenBLAS: set stack limit error =%d\n", result);
The number of CPU cores OpenBLAS uses for multithreading.
It can be set with openblas_set_num_threads(int num_threads).
349 int blas_cpu_number = 0;
The number of threads in the thread pool.
This value is equal to or larger than blas_cpu_number, which means some threads may be sleeping.
354 int blas_num_threads = 0;
356 int goto_get_num_procs (void) {
357 return blas_cpu_number;
360 static void blas_memory_init();
362 void openblas_fork_handler()
364 // This handler shuts down the OpenBLAS-managed PTHREAD pool when OpenBLAS is
365 // built with "make USE_OPENMP=0".
366 // Hanging can still happen when OpenBLAS is built against the libgomp
367 // implementation of OpenMP. The problem is tracked at:
368 // http://gcc.gnu.org/bugzilla/show_bug.cgi?id=60035
// In the meantime build with USE_OPENMP=0 or link against another
370 // implementation of OpenMP.
371 #if !((defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT)) || defined(OS_ANDROID)) && defined(SMP_SERVER)
373 err = pthread_atfork ((void (*)(void)) BLASFUNC(blas_thread_shutdown), NULL, blas_memory_init);
375 openblas_warning(0, "OpenBLAS Warning ... cannot install fork handler. You may meet hang after fork.\n");
379 extern int openblas_num_threads_env();
380 extern int openblas_goto_num_threads_env();
381 extern int openblas_omp_num_threads_env();
383 int blas_get_cpu_number(void){
384 #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
387 int blas_goto_num = 0;
388 int blas_omp_num = 0;
390 if (blas_num_threads) return blas_num_threads;
392 #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
393 max_num = get_num_procs();
396 // blas_goto_num = 0;
397 #ifndef USE_OPENMP_UNUSED
398 blas_goto_num=openblas_num_threads_env();
399 if (blas_goto_num < 0) blas_goto_num = 0;
401 if (blas_goto_num == 0) {
402 blas_goto_num=openblas_goto_num_threads_env();
403 if (blas_goto_num < 0) blas_goto_num = 0;
409 blas_omp_num=openblas_omp_num_threads_env();
410 if (blas_omp_num < 0) blas_omp_num = 0;
412 if (blas_goto_num > 0) blas_num_threads = blas_goto_num;
413 else if (blas_omp_num > 0) blas_num_threads = blas_omp_num;
414 else blas_num_threads = MAX_CPU_NUMBER;
416 #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
417 if (blas_num_threads > max_num) blas_num_threads = max_num;
420 if (blas_num_threads > MAX_CPU_NUMBER) blas_num_threads = MAX_CPU_NUMBER;
423 printf( "Adjusted number of threads : %3d\n", blas_num_threads);
426 blas_cpu_number = blas_num_threads;
428 return blas_num_threads;
433 int openblas_get_num_procs(void) {
437 return get_num_procs();
441 int openblas_get_num_threads(void) {
445 // init blas_cpu_number if needed
446 blas_get_cpu_number();
447 return blas_cpu_number;
451 int hugetlb_allocated = 0;
453 #if defined(OS_WINDOWS)
454 #define LIKELY_ONE(x) (x)
456 #define LIKELY_ONE(x) (__builtin_expect(x, 1))
459 /* Stores information about the allocation and how to release it */
461 /* Whether this allocation is being used */
463 /* Any special attributes needed when releasing this allocation */
465 /* Function that can properly release this memory */
466 void (*release_func)(struct alloc_t *);
467 /* Pad to 64-byte alignment */
468 char pad[64 - 2 * sizeof(int) - sizeof(void(*))];
471 /* Convenience macros for storing release funcs */
472 #define STORE_RELEASE_FUNC(address, func) \
473 if (address != (void *)-1) { \
474 struct alloc_t *alloc_info = (struct alloc_t *)address; \
475 alloc_info->release_func = func; \
478 #define STORE_RELEASE_FUNC_WITH_ATTR(address, func, attr) \
479 if (address != (void *)-1) { \
480 struct alloc_t *alloc_info = (struct alloc_t *)address; \
481 alloc_info->release_func = func; \
482 alloc_info->attr = attr; \
485 /* The number of bytes that will be allocated for each buffer. When allocating
486 memory, we store an alloc_t followed by the actual buffer memory. This means
487 that each allocation always has its associated alloc_t, without the need
488 for an auxiliary tracking structure. */
489 static const int allocation_block_size = BUFFER_SIZE + sizeof(struct alloc_t);
492 # if defined(OS_WINDOWS)
493 static DWORD local_storage_key = 0;
497 static pthread_key_t local_storage_key = 0;
499 # endif /* defined(OS_WINDOWS) */
500 #endif /* defined(SMP) */
502 #if defined(OS_LINUX) && !defined(NO_WARMUP)
503 static int hot_alloc = 0;
506 /* Global lock for memory allocation */
508 #if defined(USE_PTHREAD_LOCK)
509 static pthread_mutex_t alloc_lock = PTHREAD_MUTEX_INITIALIZER;
510 #elif defined(USE_PTHREAD_SPINLOCK)
511 static pthread_spinlock_t alloc_lock = 0;
513 static BLASULONG alloc_lock = 0UL;
516 #if defined(USE_PTHREAD_LOCK)
517 static pthread_mutex_t key_lock = PTHREAD_MUTEX_INITIALIZER;
518 #elif defined(USE_PTHREAD_SPINLOCK)
519 static pthread_spinlock_t key_lock = 0;
521 static BLASULONG key_lock = 0UL;
524 /* Returns a pointer to the start of the per-thread memory allocation data */
525 static __inline struct alloc_t ** get_memory_table() {
527 LOCK_COMMAND(&key_lock);
528 lsk=local_storage_key;
529 UNLOCK_COMMAND(&key_lock);
533 # if defined(OS_WINDOWS)
534 struct alloc_t ** local_memory_table = (struct alloc_t **)TlsGetValue(local_storage_key);
536 struct alloc_t ** local_memory_table = (struct alloc_t **)pthread_getspecific(local_storage_key);
537 # endif /* defined(OS_WINDOWS) */
539 static struct alloc_t ** local_memory_table = NULL;
540 #endif /* defined(SMP) */
542 LOCK_COMMAND(&key_lock);
543 lsk=local_storage_key;
544 UNLOCK_COMMAND(&key_lock);
545 if (lsk && !local_memory_table) {
547 if (!local_memory_table) {
548 #endif /* defined(SMP) */
549 local_memory_table = (struct alloc_t **)malloc(sizeof(struct alloc_t *) * NUM_BUFFERS);
550 memset(local_memory_table, 0, sizeof(struct alloc_t *) * NUM_BUFFERS);
552 # if defined(OS_WINDOWS)
553 LOCK_COMMAND(&key_lock);
554 TlsSetValue(local_storage_key, (void*)local_memory_table);
555 UNLOCK_COMMAND(&key_lock);
557 LOCK_COMMAND(&key_lock);
558 pthread_setspecific(local_storage_key, (void*)local_memory_table);
559 UNLOCK_COMMAND(&key_lock);
560 # endif /* defined(OS_WINDOWS) */
561 #endif /* defined(SMP) */
563 return local_memory_table;
568 static void alloc_mmap_free(struct alloc_t *alloc_info){
570 if (munmap(alloc_info, allocation_block_size)) {
571 printf("OpenBLAS : munmap failed\n");
579 static void *alloc_mmap(void *address){
583 map_address = mmap(address,
584 allocation_block_size,
585 MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0);
587 map_address = mmap(address,
588 allocation_block_size,
589 MMAP_ACCESS, MMAP_POLICY, -1, 0);
592 STORE_RELEASE_FUNC(map_address, alloc_mmap_free);
595 my_mbind(map_address, allocation_block_size, MPOL_PREFERRED, NULL, 0, 0);
603 #define BENCH_ITERATION 4
606 static inline BLASULONG run_bench(BLASULONG address, BLASULONG size) {
608 BLASULONG original, *p;
609 BLASULONG start, stop, min;
614 original = *(BLASULONG *)(address + size - PAGESIZE);
616 *(BLASULONG *)(address + size - PAGESIZE) = (BLASULONG)address;
618 for (iter = 0; iter < BENCH_ITERATION; iter ++ ) {
620 p = (BLASULONG *)address;
622 count = size / PAGESIZE;
626 for (i = 0; i < count; i ++) {
627 p = (BLASULONG *)(*p);
632 if (min > stop - start) min = stop - start;
635 *(BLASULONG *)(address + size - PAGESIZE + 0) = original;
636 *(BLASULONG *)(address + size - PAGESIZE + 8) = (BLASULONG)p;
641 static void *alloc_mmap(void *address){
642 void *map_address, *best_address;
643 BLASULONG best, start, current, original;
/* Just give up on using the advanced operation */
648 map_address = mmap(address, allocation_block_size, MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0);
651 my_mbind(map_address, allocation_block_size, MPOL_PREFERRED, NULL, 0, 0);
655 #if defined(OS_LINUX) && !defined(NO_WARMUP)
656 if (hot_alloc == 0) {
657 map_address = mmap(NULL, allocation_block_size, MMAP_ACCESS, MMAP_POLICY, -1, 0);
660 my_mbind(map_address, allocation_block_size, MPOL_PREFERRED, NULL, 0, 0);
666 map_address = mmap(NULL, allocation_block_size * SCALING,
667 MMAP_ACCESS, MMAP_POLICY, -1, 0);
669 if (map_address != (void *)-1) {
674 ret=my_mbind(map_address, allocation_block_size * SCALING, MPOL_PREFERRED, NULL, 0, 0);
677 perror("OpenBLAS alloc_mmap:");
678 printf("error code=%d,\tmap_address=%lx\n",errsv,map_address);
682 my_mbind(map_address, allocation_block_size * SCALING, MPOL_PREFERRED, NULL, 0, 0);
687 allocsize = DGEMM_P * DGEMM_Q * sizeof(double);
689 start = (BLASULONG)map_address;
690 current = (SCALING - 1) * allocation_block_size;
693 while(current > 0 && current <= original) {
694 *(BLASLONG *)start = (BLASLONG)start + PAGESIZE;
699 *(BLASLONG *)(start - PAGESIZE) = (BLASULONG)map_address;
701 start = (BLASULONG)map_address;
703 best = (BLASULONG)-1;
704 best_address = map_address;
706 while ((start + allocsize < (BLASULONG)map_address + (SCALING - 1) * allocation_block_size)) {
708 current = run_bench(start, allocsize);
710 if (best > current) {
712 best_address = (void *)start;
719 if ((BLASULONG)best_address > (BLASULONG)map_address)
720 munmap(map_address, (BLASULONG)best_address - (BLASULONG)map_address);
722 munmap((void *)((BLASULONG)best_address + allocation_block_size), (SCALING - 1) * allocation_block_size + (BLASULONG)map_address - (BLASULONG)best_address);
724 map_address = best_address;
726 #if defined(OS_LINUX) && !defined(NO_WARMUP)
731 #if defined(OS_LINUX) && !defined(NO_WARMUP)
735 STORE_RELEASE_FUNC(map_address, alloc_mmap_free);
747 static void alloc_malloc_free(struct alloc_t *alloc_info){
753 static void *alloc_malloc(void *address){
757 map_address = (void *)malloc(allocation_block_size + FIXED_PAGESIZE);
759 if (map_address == (void *)NULL) map_address = (void *)-1;
761 STORE_RELEASE_FUNC(map_address, alloc_malloc_free);
771 void *qalloc(int flags, size_t bytes);
772 void *qfree (void *address);
774 #define QNONCACHE 0x1
778 static void alloc_qalloc_free(struct alloc_t *alloc_info){
784 static void *alloc_qalloc(void *address){
787 map_address = (void *)qalloc(QCOMMS | QFAST, allocation_block_size + FIXED_PAGESIZE);
789 if (map_address == (void *)NULL) map_address = (void *)-1;
791 STORE_RELEASE_FUNC(map_address, alloc_qalloc_free);
793 return (void *)(((BLASULONG)map_address + FIXED_PAGESIZE - 1) & ~(FIXED_PAGESIZE - 1));
800 static void alloc_windows_free(struct alloc_t *alloc_info){
802 VirtualFree(alloc_info, allocation_block_size, MEM_DECOMMIT);
806 static void *alloc_windows(void *address){
809 map_address = VirtualAlloc(address,
810 allocation_block_size,
811 MEM_RESERVE | MEM_COMMIT,
814 if (map_address == (void *)NULL) map_address = (void *)-1;
816 STORE_RELEASE_FUNC(map_address, alloc_windows_free);
823 #ifdef ALLOC_DEVICEDRIVER
824 #ifndef DEVICEDRIVER_NAME
825 #define DEVICEDRIVER_NAME "/dev/mapper"
828 static void alloc_devicedirver_free(struct alloc_t *alloc_info){
830 int attr = alloc_info -> attr;
831 if (munmap(address, allocation_block_size)) {
832 printf("OpenBLAS : Bugphysarea unmap failed.\n");
836 printf("OpenBLAS : Bugphysarea close failed.\n");
841 static void *alloc_devicedirver(void *address){
846 if ((fd = open(DEVICEDRIVER_NAME, O_RDWR | O_SYNC)) < 0) {
852 map_address = mmap(address, allocation_block_size,
853 PROT_READ | PROT_WRITE,
854 MAP_FILE | MAP_SHARED,
857 STORE_RELEASE_FUNC_WITH_ATTR(map_address, alloc_devicedirver_free, fd);
866 static void alloc_shm_free(struct alloc_t *alloc_info){
868 if (shmdt(alloc_info)) {
869 printf("OpenBLAS : Shared memory unmap failed.\n");
873 static void *alloc_shm(void *address){
877 shmid = shmget(IPC_PRIVATE, allocation_block_size,IPC_CREAT | 0600);
879 map_address = (void *)shmat(shmid, address, 0);
881 if (map_address != (void *)-1){
884 my_mbind(map_address, allocation_block_size, MPOL_PREFERRED, NULL, 0, 0);
887 shmctl(shmid, IPC_RMID, 0);
889 struct alloc_t *alloc_info = (struct alloc_t *)map_address;
890 alloc_info->release_func = alloc_shm_free;
891 alloc_info->attr = shmid;
897 #if defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS
899 static void alloc_hugetlb_free(struct alloc_t *alloc_info){
901 #if defined(OS_LINUX) || defined(OS_AIX)
902 if (shmdt(alloc_info)) {
903 printf("OpenBLAS : Hugepage unmap failed.\n");
909 munmap(alloc_info, allocation_block_size);
915 VirtualFree(alloc_info, allocation_block_size, MEM_LARGE_PAGES | MEM_DECOMMIT);
921 static void *alloc_hugetlb(void *address){
923 void *map_address = (void *)-1;
925 #if defined(OS_LINUX) || defined(OS_AIX)
928 shmid = shmget(IPC_PRIVATE, allocation_block_size,
933 SHM_LGPAGE | SHM_PIN |
935 IPC_CREAT | SHM_R | SHM_W);
938 map_address = (void *)shmat(shmid, address, SHM_RND);
941 my_mbind(map_address, allocation_block_size, MPOL_PREFERRED, NULL, 0, 0);
944 if (map_address != (void *)-1){
945 shmctl(shmid, IPC_RMID, 0);
951 struct memcntl_mha mha;
953 mha.mha_cmd = MHA_MAPSIZE_BSSBRK;
955 mha.mha_pagesize = HUGE_PAGESIZE;
956 memcntl(NULL, 0, MC_HAT_ADVISE, (char *)&mha, 0, 0);
958 map_address = (BLASULONG)memalign(HUGE_PAGESIZE, allocation_block_size);
966 if (OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES, &hToken) != TRUE) return (void *) -1;
968 tp.PrivilegeCount = 1;
969 tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED;
971 if (LookupPrivilegeValue(NULL, SE_LOCK_MEMORY_NAME, &tp.Privileges[0].Luid) != TRUE) {
976 if (AdjustTokenPrivileges(hToken, FALSE, &tp, 0, NULL, NULL) != TRUE) {
981 map_address = (void *)VirtualAlloc(address,
982 allocation_block_size,
983 MEM_LARGE_PAGES | MEM_RESERVE | MEM_COMMIT,
986 tp.Privileges[0].Attributes = 0;
987 AdjustTokenPrivileges(hToken, FALSE, &tp, 0, NULL, NULL);
989 if (map_address == (void *)NULL) map_address = (void *)-1;
993 STORE_RELEASE_FUNC(map_address, alloc_hugetlb_free);
1001 #ifdef ALLOC_HUGETLBFILE
1003 static int hugetlb_pid = 0;
1005 static void alloc_hugetlbfile_free(struct alloc_t *alloc_info){
1007 int attr = alloc_info -> attr;
1008 if (munmap(alloc_info, allocation_block_size)) {
1009 printf("OpenBLAS : HugeTLBfs unmap failed.\n");
1013 printf("OpenBLAS : HugeTLBfs close failed.\n");
1017 static void *alloc_hugetlbfile(void *address){
1019 void *map_address = (void *)-1;
1023 if (!hugetlb_pid) hugetlb_pid = getpid();
1025 sprintf(filename, "%s/gotoblas.%d", HUGETLB_FILE_NAME, hugetlb_pid);
1027 if ((fd = open(filename, O_RDWR | O_CREAT, 0700)) < 0) {
1033 map_address = mmap(address, allocation_block_size,
1034 PROT_READ | PROT_WRITE,
1038 STORE_RELEASE_FUNC_WITH_ATTR(map_address, alloc_hugetlbfile_free, fd);
1046 static BLASULONG base_address = 0UL;
1048 static BLASULONG base_address = BASE_ADDRESS;
1051 #if __STDC_VERSION__ >= 201112L
1052 static _Atomic int memory_initialized = 0;
1054 static volatile int memory_initialized = 0;
1057 /* Memory allocation routine */
1058 /* procpos ... indicates where it comes from */
1059 /* 0 : Level 3 functions */
1060 /* 1 : Level 2 functions */
1063 static void blas_memory_cleanup(void* ptr){
1065 struct alloc_t ** table = (struct alloc_t **)ptr;
1067 for (pos = 0; pos < NUM_BUFFERS; pos ++){
1068 struct alloc_t *alloc_info = table[pos];
1070 alloc_info->release_func(alloc_info);
1071 table[pos] = (void *)0;
1076 #if defined(OS_WINDOWS)
1077 TlsFree(local_storage_key);
1079 pthread_key_delete(local_storage_key);
1083 static void blas_memory_init(){
1085 # if defined(OS_WINDOWS)
1086 local_storage_key = TlsAlloc();
1088 pthread_key_create(&local_storage_key, blas_memory_cleanup);
1089 # endif /* defined(OS_WINDOWS) */
1090 #endif /* defined(SMP) */
1093 void *blas_memory_alloc(int procpos){
1099 void *(*memoryalloc[])(void *address) = {
1100 #ifdef ALLOC_DEVICEDRIVER
1103 /* Hugetlb implicitly assumes ALLOC_SHM */
1107 #if ((defined ALLOC_SHM) && (defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS))
1116 #ifdef ALLOC_WINDOWS
1124 void *(**func)(void *address);
1125 struct alloc_t * alloc_info;
1126 struct alloc_t ** alloc_table;
1129 #if defined(SMP) && !defined(USE_OPENMP)
1131 LOCK_COMMAND(&alloc_lock);
1132 mi=memory_initialized;
1133 UNLOCK_COMMAND(&alloc_lock);
1134 if (!LIKELY_ONE(mi)) {
1136 if (!LIKELY_ONE(memory_initialized)) {
1138 #if defined(SMP) && !defined(USE_OPENMP)
1139 /* Only allow a single thread to initialize memory system */
1140 LOCK_COMMAND(&alloc_lock);
1142 if (!memory_initialized) {
1146 gotoblas_dynamic_init();
1149 #if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY)
1150 gotoblas_affinity_init();
1154 if (!blas_num_threads) blas_cpu_number = blas_get_cpu_number();
1157 #if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64)
1158 #ifndef DYNAMIC_ARCH
1159 blas_set_parameter();
1163 memory_initialized = 1;
1165 #if defined(SMP) && !defined(USE_OPENMP)
1167 UNLOCK_COMMAND(&alloc_lock);
1172 printf("Alloc Start ...\n");
1176 alloc_table = get_memory_table();
1178 if (!alloc_table[position] || !alloc_table[position]->used) goto allocation;
1181 } while (position < NUM_BUFFERS);
1188 printf(" Position -> %d\n", position);
1191 alloc_info = alloc_table[position];
1195 printf("Allocation Start : %lx\n", base_address);
1198 map_address = (void *)-1;
1200 func = &memoryalloc[0];
1202 while ((func != NULL) && (map_address == (void *) -1)) {
1204 map_address = (*func)((void *)base_address);
1206 #ifdef ALLOC_DEVICEDRIVER
1207 if ((*func == alloc_devicedirver) && (map_address == (void *)-1)) {
1208 fprintf(stderr, "OpenBLAS Warning ... Physically contiguous allocation failed.\n");
1212 #ifdef ALLOC_HUGETLBFILE
1213 if ((*func == alloc_hugetlbfile) && (map_address == (void *)-1)) {
1215 fprintf(stderr, "OpenBLAS Warning ... HugeTLB(File) allocation failed.\n");
1220 #if (defined ALLOC_SHM) && (defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS)
1221 if ((*func == alloc_hugetlb) && (map_address != (void *)-1)) hugetlb_allocated = 1;
1228 printf(" Success -> %08lx\n", map_address);
1230 if (((BLASLONG) map_address) == -1) base_address = 0UL;
1232 if (base_address) base_address += allocation_block_size + FIXED_PAGESIZE;
1234 } while ((BLASLONG)map_address == -1);
1236 alloc_table[position] = alloc_info = map_address;
1239 printf(" Mapping Succeeded. %p(%d)\n", (void *)alloc_info, position);
1244 printf("Mapped : %p %3d\n\n", (void *)alloc_info, position);
1247 alloc_info->used = 1;
1249 return (void *)(((char *)alloc_info) + sizeof(struct alloc_t));
1252 printf("OpenBLAS : Program will terminate because you tried to allocate too many memory regions.\n");
1257 void blas_memory_free(void *buffer){
1260 struct alloc_t ** alloc_table;
1262 /* Since we passed an offset pointer to the caller, get back to the actual allocation */
1263 struct alloc_t *alloc_info = (void *)(((char *)buffer) - sizeof(struct alloc_t));
1266 printf("Unmapped Start : %p ...\n", alloc_info);
1269 alloc_info->used = 0;
1272 printf("Unmap Succeeded.\n\n");
1278 alloc_table = get_memory_table();
1279 for (position = 0; position < NUM_BUFFERS; position++){
1280 if (alloc_table[position]) {
1281 printf("%4ld %p : %d\n", position, alloc_table[position], alloc_table[position]->used);
1288 void *blas_memory_alloc_nolock(int unused) {
1290 map_address = (void *)malloc(BUFFER_SIZE + FIXED_PAGESIZE);
1294 void blas_memory_free_nolock(void * map_address) {
1298 void blas_shutdown(void){
1300 BLASFUNC(blas_thread_shutdown)();
/* Only clean up if we were built for threading and TLS was initialized */
1305 if (local_storage_key)
1307 blas_memory_cleanup((void*)get_memory_table());
1312 base_address = BASE_ADDRESS;
1318 #if defined(OS_LINUX) && !defined(NO_WARMUP)
1321 #if defined(USE_PTHREAD_LOCK)
1322 static pthread_mutex_t init_lock = PTHREAD_MUTEX_INITIALIZER;
1323 #elif defined(USE_PTHREAD_SPINLOCK)
1324 static pthread_spinlock_t init_lock = 0;
1326 static BLASULONG init_lock = 0UL;
1330 static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n,
1331 void *sa, void *sb, BLASLONG pos) {
1333 #if !defined(ARCH_POWER) && !defined(ARCH_SPARC)
1338 size = allocation_block_size - PAGESIZE;
1339 buffer = (BLASULONG)sa + GEMM_OFFSET_A;
1341 #if defined(OS_LINUX) && !defined(NO_WARMUP)
1342 if (hot_alloc != 2) {
1346 LOCK_COMMAND(&init_lock);
1350 *(int *)buffer = size;
1356 UNLOCK_COMMAND(&init_lock);
1359 size = MIN((allocation_block_size - PAGESIZE), L2_SIZE);
1360 buffer = (BLASULONG)sa + GEMM_OFFSET_A;
1363 *(int *)buffer = size;
1368 #if defined(OS_LINUX) && !defined(NO_WARMUP)
1377 static void _init_thread_memory(void *buffer) {
1379 blas_queue_t queue[MAX_CPU_NUMBER];
1382 for (num_cpu = 0; num_cpu < blas_num_threads; num_cpu++) {
1384 blas_queue_init(&queue[num_cpu]);
1385 queue[num_cpu].mode = BLAS_DOUBLE | BLAS_REAL;
1386 queue[num_cpu].routine = &_touch_memory;
1387 queue[num_cpu].args = NULL;
1388 queue[num_cpu].next = &queue[num_cpu + 1];
1391 queue[num_cpu - 1].next = NULL;
1392 queue[0].sa = buffer;
1394 exec_blas(num_cpu, queue);
1399 static void gotoblas_memory_init(void) {
1405 buffer = (void *)blas_memory_alloc(0);
1408 if (blas_cpu_number == 0) blas_get_cpu_number();
1410 if (blas_server_avail == 0) blas_thread_init();
1413 _init_thread_memory((void *)((BLASULONG)buffer + GEMM_OFFSET_A));
1417 _touch_memory(NULL, NULL, NULL, (void *)((BLASULONG)buffer + GEMM_OFFSET_A), NULL, 0);
1421 blas_memory_free(buffer);
/* Initialization for all functions; this function should be called before main */
1427 static int gotoblas_initialized = 0;
1428 extern void openblas_read_env();
1430 void CONSTRUCTOR gotoblas_init(void) {
1432 if (gotoblas_initialized) return;
1435 openblas_fork_handler();
1438 openblas_read_env();
1445 gotoblas_dynamic_init();
1448 #if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY)
1449 gotoblas_affinity_init();
1452 #if defined(OS_LINUX) && !defined(NO_WARMUP)
1453 gotoblas_memory_init();
1456 //#if defined(OS_LINUX)
1458 struct rlimit curlimit;
1459 if ( getrlimit(RLIMIT_STACK, &curlimit ) == 0 )
1461 if ( curlimit.rlim_cur != curlimit.rlim_max )
1463 curlimit.rlim_cur = curlimit.rlim_max;
1464 setrlimit(RLIMIT_STACK, &curlimit);
1470 if (blas_cpu_number == 0) blas_get_cpu_number();
1472 if (blas_server_avail == 0) blas_thread_init();
1476 #ifdef FUNCTION_PROFILE
1477 gotoblas_profile_init();
1480 gotoblas_initialized = 1;
1488 void DESTRUCTOR gotoblas_quit(void) {
1490 if (gotoblas_initialized == 0) return;
1498 #ifdef FUNCTION_PROFILE
1499 gotoblas_profile_quit();
1502 #if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY)
1503 gotoblas_affinity_quit();
1507 gotoblas_dynamic_quit();
1510 gotoblas_initialized = 0;
1517 #if defined(_MSC_VER) && !defined(__clang__)
1518 BOOL APIENTRY DllMain(HMODULE hModule, DWORD ul_reason_for_call, LPVOID lpReserved)
1520 switch (ul_reason_for_call)
1522 case DLL_PROCESS_ATTACH:
1525 case DLL_THREAD_ATTACH:
1527 case DLL_THREAD_DETACH:
1529 blas_memory_cleanup((void*)get_memory_table());
1532 case DLL_PROCESS_DETACH:
1542 This is to allow static linking.
1543 Code adapted from Google performance tools:
1544 https://gperftools.googlecode.com/git-history/perftools-1.0/src/windows/port.cc
1546 https://sourceware.org/ml/pthreads-win32/2008/msg00028.html
1547 http://ci.boost.org/svn-trac/browser/trunk/libs/thread/src/win32/tss_pe.cpp
1549 static int on_process_term(void)
1555 #pragma comment(linker, "/INCLUDE:_tls_used")
1557 #pragma comment(linker, "/INCLUDE:__tls_used")
1561 #pragma const_seg(".CRT$XLB")
1563 #pragma data_seg(".CRT$XLB")
1565 static void (APIENTRY *dll_callback)(HINSTANCE h, DWORD ul_reason_for_call, PVOID pv) = DllMain;
1573 #pragma const_seg(".CRT$XTU")
1575 #pragma data_seg(".CRT$XTU")
1577 static int(*p_process_term)(void) = on_process_term;
1585 #if (defined(C_PGI) || (!defined(C_SUN) && defined(F_INTERFACE_SUN))) && (defined(ARCH_X86) || defined(ARCH_X86_64))
/* Don't call me; this is just a workaround for a PGI / Sun bug */
1587 void gotoblas_dummy_for_PGI(void) {
1593 asm ("\t.section\t.ctors,\"aw\",@progbits; .align 8; .quad gotoblas_init; .section .text");
1594 asm ("\t.section\t.dtors,\"aw\",@progbits; .align 8; .quad gotoblas_quit; .section .text");
1596 asm (".section .init,\"ax\"; call gotoblas_init@PLT; .section .text");
1597 asm (".section .fini,\"ax\"; call gotoblas_quit@PLT; .section .text");
1606 #define ALLOC_WINDOWS
1607 #ifndef MEM_LARGE_PAGES
1608 #define MEM_LARGE_PAGES 0x20000000
1612 #define ALLOC_MALLOC
1620 #include <sys/mman.h>
1622 #include <sys/shm.h>
1624 #include <sys/ipc.h>
1627 #include <sys/types.h>
1630 #include <sys/sysinfo.h>
1633 #include <linux/unistd.h>
1634 #include <sys/syscall.h>
1635 #include <sys/time.h>
1636 #include <sys/resource.h>
1639 #if defined(OS_FREEBSD) || defined(OS_DARWIN)
1640 #include <sys/sysctl.h>
1641 #include <sys/resource.h>
1644 #if defined(OS_WINDOWS) && (defined(__MINGW32__) || defined(__MINGW64__))
1647 #define printf _cprintf
1652 #ifndef MPOL_PREFERRED
1653 #define MPOL_PREFERRED 1
1658 #if (defined(PPC440) || !defined(OS_LINUX) || defined(HPL)) && !defined(NO_WARMUP)
1663 #define SHM_HUGETLB 04000
1666 #ifndef FIXED_PAGESIZE
1667 #define FIXED_PAGESIZE 4096
1670 #define BITMASK(a, b, c) ((((a) >> (b)) & (c)))
1672 #if defined(_MSC_VER) && !defined(__clang__)
1673 #define CONSTRUCTOR __cdecl
1674 #define DESTRUCTOR __cdecl
1675 #elif (defined(OS_DARWIN) || defined(OS_SUNOS)) && defined(C_GCC)
1676 #define CONSTRUCTOR __attribute__ ((constructor))
1677 #define DESTRUCTOR __attribute__ ((destructor))
1679 #define CONSTRUCTOR __attribute__ ((constructor(101)))
1680 #define DESTRUCTOR __attribute__ ((destructor(101)))
1684 gotoblas_t *gotoblas = NULL;
1686 extern void openblas_warning(int verbose, const char * msg);
1690 #define blas_cpu_number 1
1691 #define blas_num_threads 1
1693 /* Dummy Function */
1694 int goto_get_num_procs (void) { return 1;};
1695 void goto_set_num_threads(int num_threads) {};
1699 #if defined(OS_LINUX) || defined(OS_SUNOS) || defined(OS_NETBSD)
1701 int get_num_procs(void);
1703 int get_num_procs(void) {
1704 static int nums = 0;
1710 if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
1711 #if !defined(OS_LINUX)
1715 #if !defined(__GLIBC_PREREQ)
1718 #if !__GLIBC_PREREQ(2, 3)
1722 #if !__GLIBC_PREREQ(2, 7)
1723 ret = sched_getaffinity(0,sizeof(cpu_set_t), cpusetp);
1724 if (ret!=0) return nums;
1726 #if !__GLIBC_PREREQ(2, 6)
1727 for (i=0;i<nums;i++)
1728 if (CPU_ISSET(i,cpusetp)) n++;
1731 nums = CPU_COUNT(sizeof(cpu_set_t),cpusetp);
1735 cpusetp = CPU_ALLOC(nums);
1736 if (cpusetp == NULL) return nums;
1737 size = CPU_ALLOC_SIZE(nums);
1738 ret = sched_getaffinity(0,size,cpusetp);
1739 if (ret!=0) return nums;
1740 nums = CPU_COUNT_S(size,cpusetp);
1750 int get_num_procs(void) {
1751 static int nums = 0;
1752 if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
1758 int get_num_procs(void) {
1759 static int nums = 0;
1760 if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
1766 int get_num_procs(void) {
1767 static int nums = 0;
1768 if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
1775 int get_num_procs(void) {
1777 static int nums = 0;
1781 SYSTEM_INFO sysinfo;
1783 GetSystemInfo(&sysinfo);
1785 nums = sysinfo.dwNumberOfProcessors;
1793 #if defined(OS_FREEBSD)
1795 int get_num_procs(void) {
1797 static int nums = 0;
1806 sysctl(m, 2, &nums, &len, NULL, 0);
1814 #if defined(OS_DARWIN)
1815 int get_num_procs(void) {
1816 static int nums = 0;
1820 sysctlbyname("hw.physicalcpu", &nums, &len, NULL, 0);
/*
 * Raise the soft stack-size limit (RLIMIT_STACK) of the calling process to
 * at least limitMB mebibytes.
 *
 * limitMB : requested stack size in MiB; values <= 0 are ignored.
 *
 * The limit is only ever increased: if the current soft limit is already at
 * or above the requested size nothing is changed.  A failing getrlimit() is
 * ignored silently; a failing setrlimit() is reported on stderr.
 */
void set_stack_limit(int limitMB){
  struct rlimit rl;
  rlim_t StackSize;
  int result;

  /* A negative request would wrap to an enormous unsigned rlim_t value. */
  if (limitMB <= 0) return;

  /* Widen before multiplying: limitMB*1024*1024 overflows int at 2048 MiB. */
  StackSize = (rlim_t)limitMB * 1024 * 1024;

  result = getrlimit(RLIMIT_STACK, &rl);
  if (result != 0) return;

  if (rl.rlim_cur < StackSize) {
    rl.rlim_cur = StackSize;
    result = setrlimit(RLIMIT_STACK, &rl);
    if (result != 0) {
      fprintf(stderr, "OpenBLAS: set stack limit error =%d\n", result);
    }
  }
}
1847 OpenBLAS uses the number of CPU cores for multithreading.
1848 It can be set by openblas_set_num_threads(int num_threads);
1850 int blas_cpu_number = 0;
1852 The number of threads in the thread pool.
1853 This value is equal to or larger than blas_cpu_number, which means some threads may be asleep.
1855 int blas_num_threads = 0;
1857 int goto_get_num_procs (void) {
1858 return blas_cpu_number;
1861 void openblas_fork_handler()
1863 // This handler shuts down the OpenBLAS-managed PTHREAD pool when OpenBLAS is
1864 // built with "make USE_OPENMP=0".
1865 // Hanging can still happen when OpenBLAS is built against the libgomp
1866 // implementation of OpenMP. The problem is tracked at:
1867 // http://gcc.gnu.org/bugzilla/show_bug.cgi?id=60035
1868 // In the mean time build with USE_OPENMP=0 or link against another
1869 // implementation of OpenMP.
1870 #if !(defined(OS_WINDOWS) || defined(OS_ANDROID)) && defined(SMP_SERVER)
1872 err = pthread_atfork ((void (*)(void)) BLASFUNC(blas_thread_shutdown), NULL, NULL);
1874 openblas_warning(0, "OpenBLAS Warning ... cannot install fork handler. You may meet hang after fork.\n");
1878 extern int openblas_num_threads_env();
1879 extern int openblas_goto_num_threads_env();
1880 extern int openblas_omp_num_threads_env();
1882 int blas_get_cpu_number(void){
1883 #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID)
1886 int blas_goto_num = 0;
1887 int blas_omp_num = 0;
1889 if (blas_num_threads) return blas_num_threads;
1891 #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID)
1892 max_num = get_num_procs();
1897 blas_goto_num=openblas_num_threads_env();
1898 if (blas_goto_num < 0) blas_goto_num = 0;
1900 if (blas_goto_num == 0) {
1901 blas_goto_num=openblas_goto_num_threads_env();
1902 if (blas_goto_num < 0) blas_goto_num = 0;
1908 blas_omp_num=openblas_omp_num_threads_env();
1909 if (blas_omp_num < 0) blas_omp_num = 0;
1911 if (blas_goto_num > 0) blas_num_threads = blas_goto_num;
1912 else if (blas_omp_num > 0) blas_num_threads = blas_omp_num;
1913 else blas_num_threads = MAX_CPU_NUMBER;
1915 #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID)
1916 if (blas_num_threads > max_num) blas_num_threads = max_num;
1919 if (blas_num_threads > MAX_CPU_NUMBER) blas_num_threads = MAX_CPU_NUMBER;
1922 printf( "Adjusted number of threads : %3d\n", blas_num_threads);
1925 blas_cpu_number = blas_num_threads;
1927 return blas_num_threads;
1932 int openblas_get_num_procs(void) {
1936 return get_num_procs();
/*
 * Report the number of threads OpenBLAS is currently set to use.
 * In the threaded build blas_get_cpu_number() is called first so that
 * blas_cpu_number is initialised before it is returned.
 */
int openblas_get_num_threads(void) {
#ifdef SMP
  /* make sure blas_cpu_number has been set up */
  blas_get_cpu_number();
  return blas_cpu_number;
#else
  return 1;
#endif
}
1952 void (*func)(struct release_t *);
1956 int hugetlb_allocated = 0;
1958 static struct release_t release_info[NUM_BUFFERS];
1959 static int release_pos = 0;
1961 #if defined(OS_LINUX) && !defined(NO_WARMUP)
1962 static int hot_alloc = 0;
1965 /* Global lock for memory allocation */
1967 #if defined(USE_PTHREAD_LOCK)
1968 static pthread_mutex_t alloc_lock = PTHREAD_MUTEX_INITIALIZER;
1969 #elif defined(USE_PTHREAD_SPINLOCK)
1970 static pthread_spinlock_t alloc_lock = 0;
1972 static BLASULONG alloc_lock = 0UL;
1977 static void alloc_mmap_free(struct release_t *release){
1979 if (munmap(release -> address, BUFFER_SIZE)) {
1980 printf("OpenBLAS : munmap failed\n");
1988 static void *alloc_mmap(void *address){
1992 map_address = mmap(address,
1994 MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0);
1996 map_address = mmap(address,
1998 MMAP_ACCESS, MMAP_POLICY, -1, 0);
2001 if (map_address != (void *)-1) {
2002 LOCK_COMMAND(&alloc_lock);
2003 release_info[release_pos].address = map_address;
2004 release_info[release_pos].func = alloc_mmap_free;
2006 UNLOCK_COMMAND(&alloc_lock);
2010 my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
2018 #define BENCH_ITERATION 4
2021 static inline BLASULONG run_bench(BLASULONG address, BLASULONG size) {
2023 BLASULONG original, *p;
2024 BLASULONG start, stop, min;
2027 min = (BLASULONG)-1;
2029 original = *(BLASULONG *)(address + size - PAGESIZE);
2031 *(BLASULONG *)(address + size - PAGESIZE) = (BLASULONG)address;
2033 for (iter = 0; iter < BENCH_ITERATION; iter ++ ) {
2035 p = (BLASULONG *)address;
2037 count = size / PAGESIZE;
2041 for (i = 0; i < count; i ++) {
2042 p = (BLASULONG *)(*p);
2047 if (min > stop - start) min = stop - start;
2050 *(BLASULONG *)(address + size - PAGESIZE + 0) = original;
2051 *(BLASULONG *)(address + size - PAGESIZE + 8) = (BLASULONG)p;
2056 static void *alloc_mmap(void *address){
2057 void *map_address, *best_address;
2058 BLASULONG best, start, current;
2059 BLASULONG allocsize;
2062 /* Just give up use advanced operation */
2063 map_address = mmap(address, BUFFER_SIZE, MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0);
2066 my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
2070 #if defined(OS_LINUX) && !defined(NO_WARMUP)
2071 if (hot_alloc == 0) {
2072 map_address = mmap(NULL, BUFFER_SIZE, MMAP_ACCESS, MMAP_POLICY, -1, 0);
2075 my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
2081 map_address = mmap(NULL, BUFFER_SIZE * SCALING,
2082 MMAP_ACCESS, MMAP_POLICY, -1, 0);
2084 if (map_address != (void *)-1) {
2089 ret=my_mbind(map_address, BUFFER_SIZE * SCALING, MPOL_PREFERRED, NULL, 0, 0);
2092 perror("OpenBLAS alloc_mmap:");
2093 printf("error code=%d,\tmap_address=%lx\n",errsv,map_address);
2097 my_mbind(map_address, BUFFER_SIZE * SCALING, MPOL_PREFERRED, NULL, 0, 0);
2102 allocsize = DGEMM_P * DGEMM_Q * sizeof(double);
2104 start = (BLASULONG)map_address;
2105 current = (SCALING - 1) * BUFFER_SIZE;
2107 while(current > 0) {
2108 *(BLASLONG *)start = (BLASLONG)start + PAGESIZE;
2110 current -= PAGESIZE;
2113 *(BLASLONG *)(start - PAGESIZE) = (BLASULONG)map_address;
2115 start = (BLASULONG)map_address;
2117 best = (BLASULONG)-1;
2118 best_address = map_address;
2120 while ((start + allocsize < (BLASULONG)map_address + (SCALING - 1) * BUFFER_SIZE)) {
2122 current = run_bench(start, allocsize);
2124 if (best > current) {
2126 best_address = (void *)start;
2133 if ((BLASULONG)best_address > (BLASULONG)map_address)
2134 munmap(map_address, (BLASULONG)best_address - (BLASULONG)map_address);
2136 munmap((void *)((BLASULONG)best_address + BUFFER_SIZE), (SCALING - 1) * BUFFER_SIZE + (BLASULONG)map_address - (BLASULONG)best_address);
2138 map_address = best_address;
2140 #if defined(OS_LINUX) && !defined(NO_WARMUP)
2145 #if defined(OS_LINUX) && !defined(NO_WARMUP)
2148 LOCK_COMMAND(&alloc_lock);
2150 if (map_address != (void *)-1) {
2151 release_info[release_pos].address = map_address;
2152 release_info[release_pos].func = alloc_mmap_free;
2155 UNLOCK_COMMAND(&alloc_lock);
2167 static void alloc_malloc_free(struct release_t *release){
2169 free(release -> address);
2173 static void *alloc_malloc(void *address){
2177 map_address = (void *)malloc(BUFFER_SIZE + FIXED_PAGESIZE);
2179 if (map_address == (void *)NULL) map_address = (void *)-1;
2181 if (map_address != (void *)-1) {
2182 release_info[release_pos].address = map_address;
2183 release_info[release_pos].func = alloc_malloc_free;
2195 void *qalloc(int flags, size_t bytes);
2196 void *qfree (void *address);
2198 #define QNONCACHE 0x1
2202 static void alloc_qalloc_free(struct release_t *release){
2204 qfree(release -> address);
2208 static void *alloc_qalloc(void *address){
2211 map_address = (void *)qalloc(QCOMMS | QFAST, BUFFER_SIZE + FIXED_PAGESIZE);
2213 if (map_address == (void *)NULL) map_address = (void *)-1;
2215 if (map_address != (void *)-1) {
2216 release_info[release_pos].address = map_address;
2217 release_info[release_pos].func = alloc_qalloc_free;
2221 return (void *)(((BLASULONG)map_address + FIXED_PAGESIZE - 1) & ~(FIXED_PAGESIZE - 1));
2226 #ifdef ALLOC_WINDOWS
2228 static void alloc_windows_free(struct release_t *release){
2230 VirtualFree(release -> address, BUFFER_SIZE, MEM_DECOMMIT);
2234 static void *alloc_windows(void *address){
2237 map_address = VirtualAlloc(address,
2239 MEM_RESERVE | MEM_COMMIT,
2242 if (map_address == (void *)NULL) map_address = (void *)-1;
2244 if (map_address != (void *)-1) {
2245 release_info[release_pos].address = map_address;
2246 release_info[release_pos].func = alloc_windows_free;
2255 #ifdef ALLOC_DEVICEDRIVER
2256 #ifndef DEVICEDRIVER_NAME
2257 #define DEVICEDRIVER_NAME "/dev/mapper"
2260 static void alloc_devicedirver_free(struct release_t *release){
2262 if (munmap(release -> address, BUFFER_SIZE)) {
2263 printf("OpenBLAS : Bugphysarea unmap failed.\n");
2266 if (close(release -> attr)) {
2267 printf("OpenBLAS : Bugphysarea close failed.\n");
2272 static void *alloc_devicedirver(void *address){
2277 if ((fd = open(DEVICEDRIVER_NAME, O_RDWR | O_SYNC)) < 0) {
2283 map_address = mmap(address, BUFFER_SIZE,
2284 PROT_READ | PROT_WRITE,
2285 MAP_FILE | MAP_SHARED,
2288 if (map_address != (void *)-1) {
2289 release_info[release_pos].address = map_address;
2290 release_info[release_pos].attr = fd;
2291 release_info[release_pos].func = alloc_devicedirver_free;
2302 static void alloc_shm_free(struct release_t *release){
2304 if (shmdt(release -> address)) {
2305 printf("OpenBLAS : Shared memory unmap failed.\n");
2309 static void *alloc_shm(void *address){
2313 shmid = shmget(IPC_PRIVATE, BUFFER_SIZE,IPC_CREAT | 0600);
2315 map_address = (void *)shmat(shmid, address, 0);
2317 if (map_address != (void *)-1){
2320 my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
2323 shmctl(shmid, IPC_RMID, 0);
2325 release_info[release_pos].address = map_address;
2326 release_info[release_pos].attr = shmid;
2327 release_info[release_pos].func = alloc_shm_free;
2334 #if defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS
/*
 * Release callback for buffers registered by alloc_hugetlb().
 *
 * Linux / AIX : detaches the shared memory segment with shmdt() and prints
 *               a message if that fails.
 * Solaris     : unmaps BUFFER_SIZE bytes at release->address.
 * Windows     : releases the VirtualAlloc() region.  MEM_RELEASE with a
 *               size of 0 is required to give the reservation back;
 *               MEM_DECOMMIT would leave the address range reserved.
 */
static void alloc_hugetlb_free(struct release_t *release){

#if defined(OS_LINUX) || defined(OS_AIX)
  if (shmdt(release -> address)) {
    printf("OpenBLAS : Hugepage unmap failed.\n");
  }
#endif

#ifdef __sun__

  /* NOTE(review): alloc_hugetlb() obtains this buffer with memalign() on
     Solaris; confirm that munmap() is the intended way to release it. */
  munmap(release -> address, BUFFER_SIZE);

#endif

#ifdef OS_WINDOWS

  VirtualFree(release -> address, 0, MEM_RELEASE);

#endif

}
2358 static void *alloc_hugetlb(void *address){
2360 void *map_address = (void *)-1;
2362 #if defined(OS_LINUX) || defined(OS_AIX)
2365 shmid = shmget(IPC_PRIVATE, BUFFER_SIZE,
2370 SHM_LGPAGE | SHM_PIN |
2372 IPC_CREAT | SHM_R | SHM_W);
2375 map_address = (void *)shmat(shmid, address, SHM_RND);
2378 my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
2381 if (map_address != (void *)-1){
2382 shmctl(shmid, IPC_RMID, 0);
2388 struct memcntl_mha mha;
2390 mha.mha_cmd = MHA_MAPSIZE_BSSBRK;
2392 mha.mha_pagesize = HUGE_PAGESIZE;
2393 memcntl(NULL, 0, MC_HAT_ADVISE, (char *)&mha, 0, 0);
2395 map_address = (BLASULONG)memalign(HUGE_PAGESIZE, BUFFER_SIZE);
2401 TOKEN_PRIVILEGES tp;
2403 if (OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES, &hToken) != TRUE) return (void *) -1;
2405 tp.PrivilegeCount = 1;
2406 tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED;
2408 if (LookupPrivilegeValue(NULL, SE_LOCK_MEMORY_NAME, &tp.Privileges[0].Luid) != TRUE) {
2409 CloseHandle(hToken);
2413 if (AdjustTokenPrivileges(hToken, FALSE, &tp, 0, NULL, NULL) != TRUE) {
2414 CloseHandle(hToken);
2418 map_address = (void *)VirtualAlloc(address,
2420 MEM_LARGE_PAGES | MEM_RESERVE | MEM_COMMIT,
2423 tp.Privileges[0].Attributes = 0;
2424 AdjustTokenPrivileges(hToken, FALSE, &tp, 0, NULL, NULL);
2426 if (map_address == (void *)NULL) map_address = (void *)-1;
2430 if (map_address != (void *)-1){
2431 release_info[release_pos].address = map_address;
2432 release_info[release_pos].func = alloc_hugetlb_free;
2442 #ifdef ALLOC_HUGETLBFILE
2444 static int hugetlb_pid = 0;
2446 static void alloc_hugetlbfile_free(struct release_t *release){
2448 if (munmap(release -> address, BUFFER_SIZE)) {
2449 printf("OpenBLAS : HugeTLBfs unmap failed.\n");
2452 if (close(release -> attr)) {
2453 printf("OpenBLAS : HugeTLBfs close failed.\n");
2457 static void *alloc_hugetlbfile(void *address){
2459 void *map_address = (void *)-1;
2463 if (!hugetlb_pid) hugetlb_pid = getpid();
2465 sprintf(filename, "%s/gotoblas.%d", HUGETLB_FILE_NAME, hugetlb_pid);
2467 if ((fd = open(filename, O_RDWR | O_CREAT, 0700)) < 0) {
2473 map_address = mmap(address, BUFFER_SIZE,
2474 PROT_READ | PROT_WRITE,
2478 if (map_address != (void *)-1) {
2479 release_info[release_pos].address = map_address;
2480 release_info[release_pos].attr = fd;
2481 release_info[release_pos].func = alloc_hugetlbfile_free;
2491 static BLASULONG base_address = 0UL;
2493 static BLASULONG base_address = BASE_ADDRESS;
2496 static volatile struct {
2499 #if defined(WHEREAMI) && !defined(USE_OPENMP)
2509 } memory[NUM_BUFFERS];
2511 static int memory_initialized = 0;
2513 /* Memory allocation routine */
2514 /* procpos ... indicates where it comes from */
2515 /* 0 : Level 3 functions */
2516 /* 1 : Level 2 functions */
2519 void *blas_memory_alloc(int procpos){
2522 #if defined(WHEREAMI) && !defined(USE_OPENMP)
2528 void *(*memoryalloc[])(void *address) = {
2529 #ifdef ALLOC_DEVICEDRIVER
2532 /* Hugetlb implicitly assumes ALLOC_SHM */
2536 #if ((defined ALLOC_SHM) && (defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS))
2545 #ifdef ALLOC_WINDOWS
2553 void *(**func)(void *address);
2554 LOCK_COMMAND(&alloc_lock);
2556 if (!memory_initialized) {
2558 #if defined(WHEREAMI) && !defined(USE_OPENMP)
2559 for (position = 0; position < NUM_BUFFERS; position ++){
2560 memory[position].addr = (void *)0;
2561 memory[position].pos = -1;
2562 memory[position].used = 0;
2563 memory[position].lock = 0;
2568 gotoblas_dynamic_init();
2571 #if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY)
2572 gotoblas_affinity_init();
2576 if (!blas_num_threads) blas_cpu_number = blas_get_cpu_number();
2579 #if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64)
2580 #ifndef DYNAMIC_ARCH
2581 blas_set_parameter();
2585 memory_initialized = 1;
2588 UNLOCK_COMMAND(&alloc_lock);
2591 printf("Alloc Start ...\n");
2594 /* #if defined(WHEREAMI) && !defined(USE_OPENMP)
2599 while (position >= NUM_BUFFERS) position >>= 1;
2602 if (!memory[position].used && (memory[position].pos == mypos)) {
2603 LOCK_COMMAND(&alloc_lock);
2604 // blas_lock(&memory[position].lock);
2606 if (!memory[position].used) goto allocation;
2608 UNLOCK_COMMAND(&alloc_lock);
2609 // blas_unlock(&memory[position].lock);
2614 } while (position < NUM_BUFFERS);
2621 LOCK_COMMAND(&alloc_lock);
2623 /* if (!memory[position].used) { */
2624 /* blas_lock(&memory[position].lock);*/
2626 if (!memory[position].used) goto allocation;
2628 /* blas_unlock(&memory[position].lock);*/
2633 } while (position < NUM_BUFFERS);
2634 UNLOCK_COMMAND(&alloc_lock);
2641 printf(" Position -> %d\n", position);
2644 memory[position].used = 1;
2646 UNLOCK_COMMAND(&alloc_lock);
2647 /* blas_unlock(&memory[position].lock);*/
2649 if (!memory[position].addr) {
2652 printf("Allocation Start : %lx\n", base_address);
2655 map_address = (void *)-1;
2657 func = &memoryalloc[0];
2659 while ((func != NULL) && (map_address == (void *) -1)) {
2661 map_address = (*func)((void *)base_address);
2663 #ifdef ALLOC_DEVICEDRIVER
2664 if ((*func == alloc_devicedirver) && (map_address == (void *)-1)) {
2665 fprintf(stderr, "OpenBLAS Warning ... Physically contigous allocation was failed.\n");
2669 #ifdef ALLOC_HUGETLBFILE
2670 if ((*func == alloc_hugetlbfile) && (map_address == (void *)-1)) {
2672 fprintf(stderr, "OpenBLAS Warning ... HugeTLB(File) allocation was failed.\n");
2677 #if (defined ALLOC_SHM) && (defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS)
2678 if ((*func == alloc_hugetlb) && (map_address != (void *)-1)) hugetlb_allocated = 1;
2685 printf(" Success -> %08lx\n", map_address);
2687 if (((BLASLONG) map_address) == -1) base_address = 0UL;
2689 if (base_address) base_address += BUFFER_SIZE + FIXED_PAGESIZE;
2691 } while ((BLASLONG)map_address == -1);
2693 LOCK_COMMAND(&alloc_lock);
2694 memory[position].addr = map_address;
2695 UNLOCK_COMMAND(&alloc_lock);
2698 printf(" Mapping Succeeded. %p(%d)\n", (void *)memory[position].addr, position);
2702 #if defined(WHEREAMI) && !defined(USE_OPENMP)
2704 if (memory[position].pos == -1) memory[position].pos = mypos;
2710 if (memory_initialized == 1) {
2712 LOCK_COMMAND(&alloc_lock);
2714 if (memory_initialized == 1) {
2716 if (!gotoblas) gotoblas_dynamic_init();
2718 memory_initialized = 2;
2721 UNLOCK_COMMAND(&alloc_lock);
2728 printf("Mapped : %p %3d\n\n",
2729 (void *)memory[position].addr, position);
2732 return (void *)memory[position].addr;
2735 printf("BLAS : Program is Terminated. Because you tried to allocate too many memory regions.\n");
2740 void blas_memory_free(void *free_area){
2745 printf("Unmapped Start : %p ...\n", free_area);
2749 LOCK_COMMAND(&alloc_lock);
2751 while ((position < NUM_BUFFERS) && (memory[position].addr != free_area))
2754 if (memory[position].addr != free_area) goto error;
2757 printf(" Position : %d\n", position);
2760 // arm: ensure all writes are finished before other thread takes this memory
2763 memory[position].used = 0;
2764 UNLOCK_COMMAND(&alloc_lock);
2767 printf("Unmap Succeeded.\n\n");
2773 printf("BLAS : Bad memory unallocation! : %4d %p\n", position, free_area);
2776 for (position = 0; position < NUM_BUFFERS; position++)
2777 printf("%4ld %p : %d\n", position, memory[position].addr, memory[position].used);
2779 UNLOCK_COMMAND(&alloc_lock);
2784 void *blas_memory_alloc_nolock(int unused) {
2786 map_address = (void *)malloc(BUFFER_SIZE + FIXED_PAGESIZE);
2790 void blas_memory_free_nolock(void * map_address) {
2794 void blas_shutdown(void){
2799 BLASFUNC(blas_thread_shutdown)();
2802 LOCK_COMMAND(&alloc_lock);
2804 for (pos = 0; pos < release_pos; pos ++) {
2805 release_info[pos].func(&release_info[pos]);
2811 base_address = BASE_ADDRESS;
2814 for (pos = 0; pos < NUM_BUFFERS; pos ++){
2815 memory[pos].addr = (void *)0;
2816 memory[pos].used = 0;
2817 #if defined(WHEREAMI) && !defined(USE_OPENMP)
2818 memory[pos].pos = -1;
2820 memory[pos].lock = 0;
2823 UNLOCK_COMMAND(&alloc_lock);
2828 #if defined(OS_LINUX) && !defined(NO_WARMUP)
2831 #if defined(USE_PTHREAD_LOCK)
2832 static pthread_mutex_t init_lock = PTHREAD_MUTEX_INITIALIZER;
2833 #elif defined(USE_PTHREAD_SPINLOCK)
2834 static pthread_spinlock_t init_lock = 0;
2836 static BLASULONG init_lock = 0UL;
2840 static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n,
2841 void *sa, void *sb, BLASLONG pos) {
2843 #if !defined(ARCH_POWER) && !defined(ARCH_SPARC)
2848 size = BUFFER_SIZE - PAGESIZE;
2849 buffer = (BLASULONG)sa + GEMM_OFFSET_A;
2851 #if defined(OS_LINUX) && !defined(NO_WARMUP)
2852 if (hot_alloc != 2) {
2856 LOCK_COMMAND(&init_lock);
2860 *(int *)buffer = size;
2866 UNLOCK_COMMAND(&init_lock);
2869 size = MIN((BUFFER_SIZE - PAGESIZE), L2_SIZE);
2870 buffer = (BLASULONG)sa + GEMM_OFFSET_A;
2873 *(int *)buffer = size;
2878 #if defined(OS_LINUX) && !defined(NO_WARMUP)
2887 static void _init_thread_memory(void *buffer) {
2889 blas_queue_t queue[MAX_CPU_NUMBER];
2892 for (num_cpu = 0; num_cpu < blas_num_threads; num_cpu++) {
2894 blas_queue_init(&queue[num_cpu]);
2895 queue[num_cpu].mode = BLAS_DOUBLE | BLAS_REAL;
2896 queue[num_cpu].routine = &_touch_memory;
2897 queue[num_cpu].args = NULL;
2898 queue[num_cpu].next = &queue[num_cpu + 1];
2901 queue[num_cpu - 1].next = NULL;
2902 queue[0].sa = buffer;
2904 exec_blas(num_cpu, queue);
2909 static void gotoblas_memory_init(void) {
2915 buffer = (void *)blas_memory_alloc(0);
2918 if (blas_cpu_number == 0) blas_get_cpu_number();
2920 if (blas_server_avail == 0) blas_thread_init();
2923 _init_thread_memory((void *)((BLASULONG)buffer + GEMM_OFFSET_A));
2927 _touch_memory(NULL, NULL, NULL, (void *)((BLASULONG)buffer + GEMM_OFFSET_A), NULL, 0);
2931 blas_memory_free(buffer);
2935 /* Initialization for all function; this function should be called before main */
2937 static int gotoblas_initialized = 0;
2938 extern void openblas_read_env();
2940 void CONSTRUCTOR gotoblas_init(void) {
2942 if (gotoblas_initialized) return;
2945 openblas_fork_handler();
2948 openblas_read_env();
2955 gotoblas_dynamic_init();
2958 #if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY)
2959 gotoblas_affinity_init();
2962 #if defined(OS_LINUX) && !defined(NO_WARMUP)
2963 gotoblas_memory_init();
2966 //#if defined(OS_LINUX)
2968 struct rlimit curlimit;
2969 if ( getrlimit(RLIMIT_STACK, &curlimit ) == 0 )
2971 if ( curlimit.rlim_cur != curlimit.rlim_max )
2973 curlimit.rlim_cur = curlimit.rlim_max;
2974 setrlimit(RLIMIT_STACK, &curlimit);
2980 if (blas_cpu_number == 0) blas_get_cpu_number();
2982 if (blas_server_avail == 0) blas_thread_init();
2986 #ifdef FUNCTION_PROFILE
2987 gotoblas_profile_init();
2990 gotoblas_initialized = 1;
2998 void DESTRUCTOR gotoblas_quit(void) {
3000 if (gotoblas_initialized == 0) return;
3008 #ifdef FUNCTION_PROFILE
3009 gotoblas_profile_quit();
3012 #if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY)
3013 gotoblas_affinity_quit();
3017 gotoblas_dynamic_quit();
3020 gotoblas_initialized = 0;
3027 #if defined(_MSC_VER) && !defined(__clang__)
3028 BOOL APIENTRY DllMain(HMODULE hModule, DWORD ul_reason_for_call, LPVOID lpReserved)
3030 switch (ul_reason_for_call)
3032 case DLL_PROCESS_ATTACH:
3035 case DLL_THREAD_ATTACH:
3037 case DLL_THREAD_DETACH:
3039 case DLL_PROCESS_DETACH:
3049 This is to allow static linking.
3050 Code adapted from Google performance tools:
3051 https://gperftools.googlecode.com/git-history/perftools-1.0/src/windows/port.cc
3053 https://sourceware.org/ml/pthreads-win32/2008/msg00028.html
3054 http://ci.boost.org/svn-trac/browser/trunk/libs/thread/src/win32/tss_pe.cpp
3056 static int on_process_term(void)
3062 #pragma comment(linker, "/INCLUDE:_tls_used")
3064 #pragma comment(linker, "/INCLUDE:__tls_used")
3068 #pragma const_seg(".CRT$XLB")
3070 #pragma data_seg(".CRT$XLB")
3072 static void (APIENTRY *dll_callback)(HINSTANCE h, DWORD ul_reason_for_call, PVOID pv) = DllMain;
3080 #pragma const_seg(".CRT$XTU")
3082 #pragma data_seg(".CRT$XTU")
3084 static int(*p_process_term)(void) = on_process_term;
3092 #if (defined(C_PGI) || (!defined(C_SUN) && defined(F_INTERFACE_SUN))) && (defined(ARCH_X86) || defined(ARCH_X86_64))
3093 /* Don't call me; this is just work around for PGI / Sun bug */
3094 void gotoblas_dummy_for_PGI(void) {
3100 asm ("\t.section\t.ctors,\"aw\",@progbits; .align 8; .quad gotoblas_init; .section .text");
3101 asm ("\t.section\t.dtors,\"aw\",@progbits; .align 8; .quad gotoblas_quit; .section .text");
3103 asm (".section .init,\"ax\"; call gotoblas_init@PLT; .section .text");
3104 asm (".section .fini,\"ax\"; call gotoblas_quit@PLT; .section .text");