1 /*****************************************************************************
2 Copyright (c) 2011-2014, The OpenBLAS Project
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions are
9 1. Redistributions of source code must retain the above copyright
10 notice, this list of conditions and the following disclaimer.
12 2. Redistributions in binary form must reproduce the above copyright
13 notice, this list of conditions and the following disclaimer in
14 the documentation and/or other materials provided with the
16 3. Neither the name of the OpenBLAS project nor the names of
17 its contributors may be used to endorse or promote products
18 derived from this software without specific prior written
21 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
25 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
27 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
28 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
29 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
30 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 **********************************************************************************/
34 /*********************************************************************/
35 /* Copyright 2009, 2010 The University of Texas at Austin. */
36 /* All rights reserved. */
38 /* Redistribution and use in source and binary forms, with or */
39 /* without modification, are permitted provided that the following */
40 /* conditions are met: */
42 /* 1. Redistributions of source code must retain the above */
43 /* copyright notice, this list of conditions and the following */
46 /* 2. Redistributions in binary form must reproduce the above */
47 /* copyright notice, this list of conditions and the following */
48 /* disclaimer in the documentation and/or other materials */
49 /* provided with the distribution. */
51 /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
52 /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
53 /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
54 /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
55 /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
56 /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
57 /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
58 /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
59 /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
60 /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
61 /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
62 /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
63 /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
64 /* POSSIBILITY OF SUCH DAMAGE. */
66 /* The views and conclusions contained in the software and */
67 /* documentation are those of the authors and should not be */
68 /* interpreted as representing official policies, either expressed */
69 /* or implied, of The University of Texas at Austin. */
70 /*********************************************************************/
79 #ifndef MEM_LARGE_PAGES
80 #define MEM_LARGE_PAGES 0x20000000
99 #include <sys/types.h>
102 #include <sys/sysinfo.h>
105 #include <linux/unistd.h>
106 #include <sys/syscall.h>
107 #include <sys/time.h>
108 #include <sys/resource.h>
111 #if defined(OS_FREEBSD) || defined(OS_DARWIN)
112 #include <sys/sysctl.h>
113 #include <sys/resource.h>
116 #if defined(OS_WINDOWS) && (defined(__MINGW32__) || defined(__MINGW64__))
119 #define printf _cprintf
124 #ifndef MPOL_PREFERRED
125 #define MPOL_PREFERRED 1
130 #if (defined(PPC440) || !defined(OS_LINUX) || defined(HPL)) && !defined(NO_WARMUP)
135 #define SHM_HUGETLB 04000
138 #ifndef FIXED_PAGESIZE
139 #define FIXED_PAGESIZE 4096
142 #define BITMASK(a, b, c) ((((a) >> (b)) & (c)))
144 #if defined(_MSC_VER) && !defined(__clang__)
145 #define CONSTRUCTOR __cdecl
146 #define DESTRUCTOR __cdecl
147 #elif (defined(OS_DARWIN) || defined(OS_SUNOS)) && defined(C_GCC)
148 #define CONSTRUCTOR __attribute__ ((constructor))
149 #define DESTRUCTOR __attribute__ ((destructor))
151 #define CONSTRUCTOR __attribute__ ((constructor(101)))
152 #define DESTRUCTOR __attribute__ ((destructor(101)))
156 gotoblas_t *gotoblas = NULL;
159 extern void openblas_warning(int verbose, const char * msg);
/* Single-threaded stubs: both thread-count names are fixed at 1. */
/* NOTE(review): presumably the non-SMP branch of a conditional whose directives are not visible here. */
163 #define blas_cpu_number 1
164 #define blas_num_threads 1
/* Always reports one processor. NOTE(review): the ';' after the body is an empty file-scope declaration that strict ISO C rejects. */
167 int goto_get_num_procs (void) { return 1;};
/* Accepts the requested thread count and ignores it (empty body). */
168 void goto_set_num_threads(int num_threads) {};
172 #if defined(OS_LINUX) || defined(OS_SUNOS) || defined(OS_NETBSD)
174 int get_num_procs(void);
/* Returns the processor count obtained from sysconf(_SC_NPROCESSORS_CONF). */
176 int get_num_procs(void) {
/* Query the OS only while nums is still zero; nums is declared on a line not shown here (presumably a static cache). */
178 if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
/* Second get_num_procs variant; the conditional that selects it is not visible here. */
185 int get_num_procs(void) {
/* Same approach as above: sysconf() is consulted only while nums is zero. */
187 if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
/* Variant built on the Win32 GetSystemInfo() call. */
194 int get_num_procs(void) {
/* Fill sysinfo (declared on a line not shown). */
202 GetSystemInfo(&sysinfo);
/* The processor count is taken from dwNumberOfProcessors. */
204 nums = sysinfo.dwNumberOfProcessors;
212 #if defined(OS_FREEBSD)
/* FreeBSD variant: obtains the processor count through sysctl(). */
214 int get_num_procs(void) {
/* m is a two-element MIB prepared on lines not shown; the sysctl() return value is not checked on this line. */
225 sysctl(m, 2, &nums, &len, NULL, 0);
233 #if defined(OS_DARWIN)
/* Darwin variant: obtains the processor count through sysctlbyname(). */
234 int get_num_procs(void) {
/* Reads the "hw.physicalcpu" value into nums; the return value is not checked on this line. */
239 sysctlbyname("hw.physicalcpu", &nums, &len, NULL, 0);
/*
 * Raise the soft stack limit (RLIMIT_STACK) to at least limitMB megabytes.
 * The limit is only ever raised: nothing is changed when the current soft
 * limit is already at or above the requested size.
 */
244 void set_stack_limit(int limitMB){
/* Widen before multiplying: limitMB*1024*1024 evaluated in int overflows for limitMB >= 2048. */
249 StackSize=(rlim_t)limitMB*1024*1024;
250 result=getrlimit(RLIMIT_STACK, &rl);
252 if(rl.rlim_cur < StackSize){
253 rl.rlim_cur=StackSize;
254 result=setrlimit(RLIMIT_STACK, &rl);
/* Reports the setrlimit() status on stderr; the condition guarding this line is not shown. */
256 fprintf(stderr, "OpenBLAS: set stack limit error =%d\n", result);
266 OpenBLAS uses the number of CPU cores for multithreading.
267 It can be set with openblas_set_num_threads(int num_threads);
269 int blas_cpu_number = 0;
271 The number of threads in the thread pool.
272 This value is equal to or larger than blas_cpu_number, which means some threads may be sleeping.
274 int blas_num_threads = 0;
/* Returns blas_cpu_number, which is defined with the initial value 0 and assigned by blas_get_cpu_number(). */
276 int goto_get_num_procs (void) {
277 return blas_cpu_number;
/* Registers a pre-fork hook that shuts down the OpenBLAS thread pool. */
280 void openblas_fork_handler()
282 // This handler shuts down the OpenBLAS-managed PTHREAD pool when OpenBLAS is
283 // built with "make USE_OPENMP=0".
284 // Hanging can still happen when OpenBLAS is built against the libgomp
285 // implementation of OpenMP. The problem is tracked at:
286 // http://gcc.gnu.org/bugzilla/show_bug.cgi?id=60035
287 // In the meantime, build with USE_OPENMP=0 or link against another
288 // implementation of OpenMP.
289 #if !(defined(OS_WINDOWS) || defined(OS_ANDROID)) && defined(SMP_SERVER)
/* blas_thread_shutdown is passed as the first (prepare) handler of pthread_atfork; no parent or child handler is installed. */
291 err = pthread_atfork ((void (*)(void)) BLASFUNC(blas_thread_shutdown), NULL, NULL);
/* Failure is reported through openblas_warning(); the test of err is on a line not shown. */
293 openblas_warning(0, "OpenBLAS Warning ... cannot install fork handler. You may meet hang after fork.\n");
297 extern int openblas_num_threads_env();
298 extern int openblas_goto_num_threads_env();
299 extern int openblas_omp_num_threads_env();
/*
 * Decides how many threads OpenBLAS uses, stores the result in both
 * blas_num_threads and blas_cpu_number, and returns it.
 */
301 int blas_get_cpu_number(void){
302 #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID)
305 int blas_goto_num = 0;
306 int blas_omp_num = 0;
/* A non-zero blas_num_threads means the value was already chosen: return it unchanged. */
308 if (blas_num_threads) return blas_num_threads;
310 #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID)
/* Processor count, used further down as an upper bound. */
311 max_num = get_num_procs();
/* First source: openblas_num_threads_env(); a negative value is treated as 0. */
316 blas_goto_num=openblas_num_threads_env();
317 if (blas_goto_num < 0) blas_goto_num = 0;
/* Second source, consulted only when the first yielded 0. */
319 if (blas_goto_num == 0) {
320 blas_goto_num=openblas_goto_num_threads_env();
321 if (blas_goto_num < 0) blas_goto_num = 0;
/* Third source: openblas_omp_num_threads_env(), also clamped to be non-negative. */
327 blas_omp_num=openblas_omp_num_threads_env();
328 if (blas_omp_num < 0) blas_omp_num = 0;
/* Priority: blas_goto_num, then blas_omp_num, then the MAX_CPU_NUMBER default. */
330 if (blas_goto_num > 0) blas_num_threads = blas_goto_num;
331 else if (blas_omp_num > 0) blas_num_threads = blas_omp_num;
332 else blas_num_threads = MAX_CPU_NUMBER;
334 #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID)
/* Never exceed the processor count on the platforms listed above. */
335 if (blas_num_threads > max_num) blas_num_threads = max_num;
/* Never exceed MAX_CPU_NUMBER. */
338 if (blas_num_threads > MAX_CPU_NUMBER) blas_num_threads = MAX_CPU_NUMBER;
/* Diagnostic output; the conditional that enables it is not shown. */
341 printf( "Adjusted number of threads : %3d\n", blas_num_threads);
/* Keep blas_cpu_number in step with the chosen thread count. */
344 blas_cpu_number = blas_num_threads;
346 return blas_num_threads;
/* Public accessor for the processor count; the visible path forwards to get_num_procs(). */
351 int openblas_get_num_procs(void) {
355 return get_num_procs();
/* Public accessor for the thread count; the visible path returns blas_cpu_number after making sure it was computed. */
359 int openblas_get_num_threads(void) {
363 // init blas_cpu_number if needed
364 blas_get_cpu_number();
365 return blas_cpu_number;
371 void (*func)(struct release_t *);
375 int hugetlb_allocated = 0;
377 static struct release_t release_info[NUM_BUFFERS];
378 static int release_pos = 0;
380 #if defined(OS_LINUX) && !defined(NO_WARMUP)
381 static int hot_alloc = 0;
384 /* Global lock for memory allocation */
386 #if defined(USE_PTHREAD_LOCK)
387 static pthread_mutex_t alloc_lock = PTHREAD_MUTEX_INITIALIZER;
388 #elif defined(USE_PTHREAD_SPINLOCK)
389 static pthread_spinlock_t alloc_lock = 0;
391 static BLASULONG alloc_lock = 0UL;
/* Release callback for buffers obtained with mmap(): unmaps BUFFER_SIZE bytes at release->address. */
396 static void alloc_mmap_free(struct release_t *release){
/* munmap() returns non-zero on failure; the failure is only reported. */
398 if (munmap(release -> address, BUFFER_SIZE)) {
399 printf("OpenBLAS : munmap failed\n");
/*
 * Maps a buffer with mmap() and records it in release_info[] so that
 * alloc_mmap_free() can undo the mapping later.
 * address is the placement passed to mmap().
 */
407 static void *alloc_mmap(void *address){
/* Two alternative mmap() calls are visible, with and without MAP_FIXED; the lines that choose between them are not shown. */
411 map_address = mmap(address,
413 MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0);
415 map_address = mmap(address,
417 MMAP_ACCESS, MMAP_POLICY, -1, 0);
/* (void *)-1 is the failure value tested throughout this file. */
420 if (map_address != (void *)-1) {
/* The release table is updated while holding alloc_lock. */
421 LOCK_COMMAND(&alloc_lock);
422 release_info[release_pos].address = map_address;
423 release_info[release_pos].func = alloc_mmap_free;
425 UNLOCK_COMMAND(&alloc_lock);
/* Memory-policy request for the new mapping (MPOL_PREFERRED); its result is not used on this line. */
429 my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
437 #define BENCH_ITERATION 4
/*
 * Times a pointer chase through the region [address, address + size) and
 * keeps the smallest elapsed time of BENCH_ITERATION passes in min.
 * The timer reads that set start/stop and the return statement are on
 * lines not shown.
 */
440 static inline BLASULONG run_bench(BLASULONG address, BLASULONG size) {
442 BLASULONG original, *p;
443 BLASULONG start, stop, min;
/* Save the first word of the last page so it can be restored afterwards. */
448 original = *(BLASULONG *)(address + size - PAGESIZE);
/* Make the last page point back to the region start. */
450 *(BLASULONG *)(address + size - PAGESIZE) = (BLASULONG)address;
452 for (iter = 0; iter < BENCH_ITERATION; iter ++ ) {
454 p = (BLASULONG *)address;
/* One hop per page of the region. */
456 count = size / PAGESIZE;
460 for (i = 0; i < count; i ++) {
/* Follow the address stored at the current location. */
461 p = (BLASULONG *)(*p);
466 if (min > stop - start) min = stop - start;
/* Restore the saved word. */
469 *(BLASULONG *)(address + size - PAGESIZE + 0) = original;
/* Store the final pointer into the buffer - presumably so the chase is not optimized away. */
470 *(BLASULONG *)(address + size - PAGESIZE + 8) = (BLASULONG)p;
/*
 * Benchmarking variant of alloc_mmap().
 * Visible strategy: map SCALING buffers' worth of memory, time candidate
 * start addresses with run_bench(), keep the BUFFER_SIZE window at the
 * fastest address and unmap what lies before and after it. The chosen
 * mapping is recorded in release_info[] for alloc_mmap_free().
 * Several branches (fixed-address request, hot_alloc shortcut) skip the
 * benchmark; the lines that select them are only partly visible.
 */
475 static void *alloc_mmap(void *address){
476 void *map_address, *best_address;
477 BLASULONG best, start, current;
481 /* Give up on the advanced placement and map at the requested address */
482 map_address = mmap(address, BUFFER_SIZE, MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0);
485 my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
489 #if defined(OS_LINUX) && !defined(NO_WARMUP)
/* hot_alloc == 0: plain single-buffer mapping, no benchmark. */
490 if (hot_alloc == 0) {
491 map_address = mmap(NULL, BUFFER_SIZE, MMAP_ACCESS, MMAP_POLICY, -1, 0);
494 my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
/* Over-allocate so that several candidate windows can be compared. */
500 map_address = mmap(NULL, BUFFER_SIZE * SCALING,
501 MMAP_ACCESS, MMAP_POLICY, -1, 0);
503 if (map_address != (void *)-1) {
508 ret=my_mbind(map_address, BUFFER_SIZE * SCALING, MPOL_PREFERRED, NULL, 0, 0);
511 perror("OpenBLAS alloc_mmap:");
/* %lx expects unsigned long; passing the void * directly is a format/argument mismatch. */
512 printf("error code=%d,\tmap_address=%lx\n",errsv,(unsigned long)map_address);
516 my_mbind(map_address, BUFFER_SIZE * SCALING, MPOL_PREFERRED, NULL, 0, 0);
/* Size of the region timed for each candidate. */
521 allocsize = DGEMM_P * DGEMM_Q * sizeof(double);
523 start = (BLASULONG)map_address;
524 current = (SCALING - 1) * BUFFER_SIZE;
/* Build the page chain used by run_bench(): each page's first word holds the address of the next page (loop lines not shown). */
527 *(BLASLONG *)start = (BLASLONG)start + PAGESIZE;
/* The page before the final start position points back to the beginning of the mapping. */
532 *(BLASLONG *)(start - PAGESIZE) = (BLASULONG)map_address;
534 start = (BLASULONG)map_address;
/* best starts at the largest BLASULONG so the first measurement always wins. */
536 best = (BLASULONG)-1;
537 best_address = map_address;
539 while ((start + allocsize < (BLASULONG)map_address + (SCALING - 1) * BUFFER_SIZE)) {
541 current = run_bench(start, allocsize);
/* Lower time is better. */
543 if (best > current) {
545 best_address = (void *)start;
/* Give back the part of the mapping in front of the chosen window ... */
552 if ((BLASULONG)best_address > (BLASULONG)map_address)
553 munmap(map_address, (BLASULONG)best_address - (BLASULONG)map_address);
/* ... and the part behind it. */
555 munmap((void *)((BLASULONG)best_address + BUFFER_SIZE), (SCALING - 1) * BUFFER_SIZE + (BLASULONG)map_address - (BLASULONG)best_address);
557 map_address = best_address;
559 #if defined(OS_LINUX) && !defined(NO_WARMUP)
564 #if defined(OS_LINUX) && !defined(NO_WARMUP)
/* Record the mapping for later release while holding alloc_lock. */
567 LOCK_COMMAND(&alloc_lock);
569 if (map_address != (void *)-1) {
570 release_info[release_pos].address = map_address;
571 release_info[release_pos].func = alloc_mmap_free;
574 UNLOCK_COMMAND(&alloc_lock);
/* Release callback for malloc()-backed buffers: frees release->address. */
586 static void alloc_malloc_free(struct release_t *release){
588 free(release -> address);
/*
 * malloc()-backed allocator. The address argument is not used on any
 * visible line. The return statement is not shown.
 */
592 static void *alloc_malloc(void *address){
/* FIXED_PAGESIZE extra bytes are requested - presumably slack for aligning the returned pointer. */
596 map_address = (void *)malloc(BUFFER_SIZE + FIXED_PAGESIZE);
/* Translate malloc's NULL into the (void *)-1 failure value used by the other allocators. */
598 if (map_address == (void *)NULL) map_address = (void *)-1;
600 if (map_address != (void *)-1) {
/* The pointer returned by malloc is what gets recorded for free(). */
/* NOTE(review): no LOCK_COMMAND is visible around this update, unlike alloc_mmap - confirm the caller's locking. */
601 release_info[release_pos].address = map_address;
602 release_info[release_pos].func = alloc_malloc_free;
614 void *qalloc(int flags, size_t bytes);
615 void *qfree (void *address);
617 #define QNONCACHE 0x1
/* Release callback for qalloc()-backed buffers: hands release->address to qfree(). */
621 static void alloc_qalloc_free(struct release_t *release){
623 qfree(release -> address);
/*
 * qalloc()-backed allocator. The address argument is not used on any
 * visible line. Returns the allocation rounded up to a FIXED_PAGESIZE
 * boundary.
 */
627 static void *alloc_qalloc(void *address){
/* FIXED_PAGESIZE extra bytes leave room for the round-up below. */
630 map_address = (void *)qalloc(QCOMMS | QFAST, BUFFER_SIZE + FIXED_PAGESIZE);
/* Translate NULL into the (void *)-1 failure value. */
632 if (map_address == (void *)NULL) map_address = (void *)-1;
634 if (map_address != (void *)-1) {
/* The unaligned pointer is recorded, since that is what qfree() must receive. */
635 release_info[release_pos].address = map_address;
636 release_info[release_pos].func = alloc_qalloc_free;
/* Round up to the next multiple of FIXED_PAGESIZE. NOTE(review): on failure this rounds the -1 sentinel as well - confirm callers never see that path. */
640 return (void *)(((BLASULONG)map_address + FIXED_PAGESIZE - 1) & ~(FIXED_PAGESIZE - 1));
/*
 * Release callback for buffers obtained by alloc_windows().
 * alloc_windows() reserves and commits the region in one VirtualAlloc call,
 * so the whole reservation is returned here: MEM_RELEASE with a size of 0
 * and the base address VirtualAlloc returned. The former MEM_DECOMMIT only
 * gave back the committed pages and left the address range reserved.
 */
647 static void alloc_windows_free(struct release_t *release){
649 VirtualFree(release -> address, 0, MEM_RELEASE);
/*
 * VirtualAlloc()-backed allocator: reserves and commits the region in one
 * call and records it for alloc_windows_free().
 */
653 static void *alloc_windows(void *address){
656 map_address = VirtualAlloc(address,
658 MEM_RESERVE | MEM_COMMIT,
/* Translate VirtualAlloc's NULL into the (void *)-1 failure value. */
661 if (map_address == (void *)NULL) map_address = (void *)-1;
663 if (map_address != (void *)-1) {
664 release_info[release_pos].address = map_address;
665 release_info[release_pos].func = alloc_windows_free;
674 #ifdef ALLOC_DEVICEDRIVER
675 #ifndef DEVICEDRIVER_NAME
676 #define DEVICEDRIVER_NAME "/dev/mapper"
/* Release callback for device-backed buffers: unmaps the region, then closes the descriptor kept in release->attr. */
679 static void alloc_devicedirver_free(struct release_t *release){
/* Both calls return non-zero on failure; failures are only reported. */
681 if (munmap(release -> address, BUFFER_SIZE)) {
682 printf("OpenBLAS : Bugphysarea unmap failed.\n");
685 if (close(release -> attr)) {
686 printf("OpenBLAS : Bugphysarea close failed.\n");
/*
 * Allocator that maps BUFFER_SIZE bytes from the device DEVICEDRIVER_NAME
 * and records both the mapping and the descriptor for later release.
 */
691 static void *alloc_devicedirver(void *address){
/* Open the device read/write with O_SYNC; the handling of a failed open is on lines not shown. */
696 if ((fd = open(DEVICEDRIVER_NAME, O_RDWR | O_SYNC)) < 0) {
702 map_address = mmap(address, BUFFER_SIZE,
703 PROT_READ | PROT_WRITE,
704 MAP_FILE | MAP_SHARED,
/* NOTE(review): no close(fd) is visible for the case where mmap fails - confirm against the full source. */
707 if (map_address != (void *)-1) {
708 release_info[release_pos].address = map_address;
/* attr carries the descriptor so the release callback can close it. */
709 release_info[release_pos].attr = fd;
710 release_info[release_pos].func = alloc_devicedirver_free;
/* Release callback for System V shared-memory buffers: detaches release->address. */
721 static void alloc_shm_free(struct release_t *release){
/* shmdt() returns non-zero on failure; the failure is only reported. */
723 if (shmdt(release -> address)) {
724 printf("OpenBLAS : Shared memory unmap failed.\n");
/*
 * System V shared-memory allocator: creates a private BUFFER_SIZE segment,
 * attaches it at address and records it for alloc_shm_free().
 */
728 static void *alloc_shm(void *address){
/* Private segment, owner read/write (0600). NOTE(review): no check of shmid is visible before shmat(). */
732 shmid = shmget(IPC_PRIVATE, BUFFER_SIZE,IPC_CREAT | 0600);
734 map_address = (void *)shmat(shmid, address, 0);
736 if (map_address != (void *)-1){
739 my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
/* Mark the segment for removal right after attaching. */
742 shmctl(shmid, IPC_RMID, 0);
744 release_info[release_pos].address = map_address;
745 release_info[release_pos].attr = shmid;
746 release_info[release_pos].func = alloc_shm_free;
753 #if defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS
/*
 * Release callback for huge-page buffers. Three release calls are visible
 * (shmdt, munmap, VirtualFree); apart from the Linux/AIX test, the
 * conditionals that select between them are not shown.
 */
755 static void alloc_hugetlb_free(struct release_t *release){
757 #if defined(OS_LINUX) || defined(OS_AIX)
/* shmdt() returns non-zero on failure; the failure is only reported. */
758 if (shmdt(release -> address)) {
759 printf("OpenBLAS : Hugepage unmap failed.\n");
/* NOTE(review): if this pairs with the memalign() allocation in alloc_hugetlb, free() would be the matching call - confirm which platform branch this is. */
765 munmap(release -> address, BUFFER_SIZE);
/* NOTE(review): MEM_LARGE_PAGES is an allocation-type flag; verify this dwFreeType against the VirtualFree documentation. */
771 VirtualFree(release -> address, BUFFER_SIZE, MEM_LARGE_PAGES | MEM_DECOMMIT);
/*
 * Huge-page allocator. Three platform paths are visible: System V shared
 * memory (Linux/AIX), memcntl()+memalign(), and VirtualAlloc() with
 * MEM_LARGE_PAGES. Apart from the Linux/AIX test, the conditionals that
 * select between them are not shown. A successful allocation is recorded
 * in release_info[] for alloc_hugetlb_free(). map_address starts as the
 * (void *)-1 failure value.
 */
777 static void *alloc_hugetlb(void *address){
779 void *map_address = (void *)-1;
781 #if defined(OS_LINUX) || defined(OS_AIX)
784 shmid = shmget(IPC_PRIVATE, BUFFER_SIZE,
789 SHM_LGPAGE | SHM_PIN |
791 IPC_CREAT | SHM_R | SHM_W);
794 map_address = (void *)shmat(shmid, address, SHM_RND);
797 my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
/* Mark the segment for removal once it is attached. */
800 if (map_address != (void *)-1){
801 shmctl(shmid, IPC_RMID, 0);
807 struct memcntl_mha mha;
809 mha.mha_cmd = MHA_MAPSIZE_BSSBRK;
811 mha.mha_pagesize = HUGE_PAGESIZE;
812 memcntl(NULL, 0, MC_HAT_ADVISE, (char *)&mha, 0, 0);
/* map_address is a void *: the previous (BLASULONG) cast assigned an integer to a pointer without conversion. */
/* NOTE(review): memalign() reports failure with NULL; no translation to (void *)-1 is visible in this path. */
814 map_address = (void *)memalign(HUGE_PAGESIZE, BUFFER_SIZE);
/* Large pages need the SE_LOCK_MEMORY_NAME privilege: enable it on the process token first. */
822 if (OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES, &hToken) != TRUE) return (void *) -1;
824 tp.PrivilegeCount = 1;
825 tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED;
827 if (LookupPrivilegeValue(NULL, SE_LOCK_MEMORY_NAME, &tp.Privileges[0].Luid) != TRUE) {
832 if (AdjustTokenPrivileges(hToken, FALSE, &tp, 0, NULL, NULL) != TRUE) {
837 map_address = (void *)VirtualAlloc(address,
839 MEM_LARGE_PAGES | MEM_RESERVE | MEM_COMMIT,
/* Drop the privilege again after the allocation attempt. */
842 tp.Privileges[0].Attributes = 0;
843 AdjustTokenPrivileges(hToken, FALSE, &tp, 0, NULL, NULL);
/* Translate VirtualAlloc's NULL into the (void *)-1 failure value. */
845 if (map_address == (void *)NULL) map_address = (void *)-1;
849 if (map_address != (void *)-1){
850 release_info[release_pos].address = map_address;
851 release_info[release_pos].func = alloc_hugetlb_free;
861 #ifdef ALLOC_HUGETLBFILE
863 static int hugetlb_pid = 0;
/* Release callback for hugetlbfs-backed buffers: unmaps the region, then closes the descriptor kept in release->attr. */
865 static void alloc_hugetlbfile_free(struct release_t *release){
/* Both calls return non-zero on failure; failures are only reported. */
867 if (munmap(release -> address, BUFFER_SIZE)) {
868 printf("OpenBLAS : HugeTLBfs unmap failed.\n");
871 if (close(release -> attr)) {
872 printf("OpenBLAS : HugeTLBfs close failed.\n");
/*
 * Allocator backed by a file under HUGETLB_FILE_NAME: opens (creating if
 * needed) "gotoblas.<pid>", maps BUFFER_SIZE bytes of it and records the
 * mapping and descriptor for alloc_hugetlbfile_free().
 */
876 static void *alloc_hugetlbfile(void *address){
878 void *map_address = (void *)-1;
/* The pid is looked up once and kept in hugetlb_pid. */
882 if (!hugetlb_pid) hugetlb_pid = getpid();
/* NOTE(review): unbounded sprintf; filename's declaration and size are not visible - confirm it can hold HUGETLB_FILE_NAME plus the suffix. */
884 sprintf(filename, "%s/gotoblas.%d", HUGETLB_FILE_NAME, hugetlb_pid);
/* The handling of a failed open is on lines not shown. */
886 if ((fd = open(filename, O_RDWR | O_CREAT, 0700)) < 0) {
892 map_address = mmap(address, BUFFER_SIZE,
893 PROT_READ | PROT_WRITE,
897 if (map_address != (void *)-1) {
898 release_info[release_pos].address = map_address;
/* attr carries the descriptor so the release callback can close it. */
899 release_info[release_pos].attr = fd;
900 release_info[release_pos].func = alloc_hugetlbfile_free;
910 static BLASULONG base_address = 0UL;
912 static BLASULONG base_address = BASE_ADDRESS;
915 static volatile struct {
918 #if defined(WHEREAMI) && !defined(USE_OPENMP)
928 } memory[NUM_BUFFERS];
930 static int memory_initialized = 0;
932 /* Memory allocation routine */
933 /* procpos ... indicates where it comes from */
934 /* 0 : Level 3 functions */
935 /* 1 : Level 2 functions */
/*
 * Hands out one of the NUM_BUFFERS work buffers, mapping it on first use.
 *
 * procpos: origin of the request (see the comment preceding this function);
 *          it is not used on any visible line.
 * Returns memory[position].addr for the slot that was claimed. When no slot
 * is free, the "too many memory regions" message at the end is printed; what
 * follows that message is on lines not shown.
 *
 * Steps visible here: one-time initialisation under alloc_lock, search for
 * an unused slot (claimed under the slot's own lock), allocation through
 * the memoryalloc[] table if the slot has no address yet, and a second
 * initialisation stage once memory_initialized is 1.
 */
938 void *blas_memory_alloc(int procpos){
941 #if defined(WHEREAMI) && !defined(USE_OPENMP)
/* Allocator candidates, tried in array order; entries are selected by the ALLOC_* conditionals. */
947 void *(*memoryalloc[])(void *address) = {
948 #ifdef ALLOC_DEVICEDRIVER
951 /* Hugetlb implicitly assumes ALLOC_SHM */
955 #if ((defined ALLOC_SHM) && (defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS))
972 void *(**func)(void *address);
/* One-time initialisation of the slot table and runtime state, under alloc_lock. */
973 LOCK_COMMAND(&alloc_lock);
975 if (!memory_initialized) {
977 #if defined(WHEREAMI) && !defined(USE_OPENMP)
978 for (position = 0; position < NUM_BUFFERS; position ++){
979 memory[position].addr = (void *)0;
980 memory[position].pos = -1;
981 memory[position].used = 0;
982 memory[position].lock = 0;
987 gotoblas_dynamic_init();
990 #if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY)
991 gotoblas_affinity_init();
995 if (!blas_num_threads) blas_cpu_number = blas_get_cpu_number();
998 #if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64)
1000 blas_set_parameter();
1004 memory_initialized = 1;
1007 UNLOCK_COMMAND(&alloc_lock);
1010 printf("Alloc Start ...\n");
1013 #if defined(WHEREAMI) && !defined(USE_OPENMP)
/* Fold the starting index into the valid range 0 .. NUM_BUFFERS-1. The former '>' test let position == NUM_BUFFERS through, one past the end of memory[]. */
1018 while (position >= NUM_BUFFERS) position >>= 1;
/* First pass: prefer a free slot whose pos matches mypos. */
1021 if (!memory[position].used && (memory[position].pos == mypos)) {
/* Re-check under the slot lock before claiming it. */
1023 blas_lock(&memory[position].lock);
1025 if (!memory[position].used) goto allocation;
1027 blas_unlock(&memory[position].lock);
1032 } while (position < NUM_BUFFERS);
/* Second pass: take any free slot. */
1040 /* if (!memory[position].used) { */
1042 blas_lock(&memory[position].lock);
1044 if (!memory[position].used) goto allocation;
1046 blas_unlock(&memory[position].lock);
1051 } while (position < NUM_BUFFERS);
1058 printf(" Position -> %d\n", position);
/* The slot lock is still held here: mark the slot used, then release the lock. */
1061 memory[position].used = 1;
1063 blas_unlock(&memory[position].lock);
/* A slot without an address has never been mapped: allocate it now. */
1065 if (!memory[position].addr) {
1068 printf("Allocation Start : %lx\n", base_address);
1071 map_address = (void *)-1;
1073 func = &memoryalloc[0];
/* func always points into memoryalloc[], so the pointer itself is never NULL; it is the table entry that must be tested. */
/* NOTE(review): this relies on memoryalloc[] ending with a NULL entry - the tail of the initializer is not visible here. */
1075 while ((*func != NULL) && (map_address == (void *) -1)) {
1077 map_address = (*func)((void *)base_address);
1079 #ifdef ALLOC_DEVICEDRIVER
1080 if ((*func == alloc_devicedirver) && (map_address == (void *)-1)) {
1081 fprintf(stderr, "OpenBLAS Warning ... Physically contigous allocation was failed.\n");
1085 #ifdef ALLOC_HUGETLBFILE
1086 if ((*func == alloc_hugetlbfile) && (map_address == (void *)-1)) {
1088 fprintf(stderr, "OpenBLAS Warning ... HugeTLB(File) allocation was failed.\n");
1093 #if (defined ALLOC_SHM) && (defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS)
1094 if ((*func == alloc_hugetlb) && (map_address != (void *)-1)) hugetlb_allocated = 1;
1101 printf(" Success -> %08lx\n", map_address);
/* After a failed round, drop the address hint so the next round passes a zero address. */
1103 if (((BLASLONG) map_address) == -1) base_address = 0UL;
/* Otherwise advance the hint past the buffer just handed out. */
1105 if (base_address) base_address += BUFFER_SIZE + FIXED_PAGESIZE;
1107 } while ((BLASLONG)map_address == -1);
1109 LOCK_COMMAND(&alloc_lock);
1110 memory[position].addr = map_address;
1111 UNLOCK_COMMAND(&alloc_lock);
1114 printf(" Mapping Succeeded. %p(%d)\n", (void *)memory[position].addr, position);
1118 #if defined(WHEREAMI) && !defined(USE_OPENMP)
/* Remember which mypos value first used this slot. */
1120 if (memory[position].pos == -1) memory[position].pos = mypos;
/* Second initialisation stage: checked once without the lock and again with it held. */
1126 if (memory_initialized == 1) {
1128 LOCK_COMMAND(&alloc_lock);
1130 if (memory_initialized == 1) {
1132 if (!gotoblas) gotoblas_dynamic_init();
1134 memory_initialized = 2;
1137 UNLOCK_COMMAND(&alloc_lock);
1144 printf("Mapped : %p %3d\n\n",
1145 (void *)memory[position].addr, position);
1148 return (void *)memory[position].addr;
1151 printf("BLAS : Program is Terminated. Because you tried to allocate too many memory regions.\n");
/*
 * Returns a buffer obtained from blas_memory_alloc() to the pool.
 *
 * free_area: the address that blas_memory_alloc() returned.
 * The slot is only marked unused; the mapping itself stays in place.
 * An address that matches no slot is reported together with a dump of the
 * whole slot table. The search and the update run under alloc_lock.
 */
1156 void blas_memory_free(void *free_area){
1161 printf("Unmapped Start : %p ...\n", free_area);
1165 LOCK_COMMAND(&alloc_lock);
/* Test the bound before touching memory[position]: the former operand order read memory[NUM_BUFFERS] when free_area matched no slot. */
1167 while ((position < NUM_BUFFERS)
1168 && (memory[position].addr != free_area)) position++;
/* Not found: decide from the index rather than reading one element past the array. */
1170 if (position >= NUM_BUFFERS) goto error;
1173 printf(" Position : %d\n", position);
1176 // arm: ensure all writes are finished before other thread takes this memory
1179 memory[position].used = 0;
1180 UNLOCK_COMMAND(&alloc_lock);
1183 printf("Unmap Succeeded.\n\n");
1189 printf("BLAS : Bad memory unallocation! : %4d %p\n", position, free_area);
/* NOTE(review): position is printed with %4d above and %4ld below; its declaration is not visible, so one of the two formats needs confirming. */
1192 for (position = 0; position < NUM_BUFFERS; position++)
1193 printf("%4ld %p : %d\n", position, memory[position].addr, memory[position].used);
1195 UNLOCK_COMMAND(&alloc_lock);
/* Allocation path that takes no lock: a plain malloc() of BUFFER_SIZE + FIXED_PAGESIZE bytes. The parameter is unused; the return statement is not shown. */
1200 void *blas_memory_alloc_nolock(int unused) {
1202 map_address = (void *)malloc(BUFFER_SIZE + FIXED_PAGESIZE);
/* Counterpart of blas_memory_alloc_nolock(); the body is not visible here (presumably it frees map_address). */
1206 void blas_memory_free_nolock(void * map_address) {
/*
 * Tears down the allocator state: stops the thread pool, runs every
 * recorded release callback and resets the slot table.
 */
1210 void blas_shutdown(void){
/* Stop the worker threads first; the conditional guarding this call is not shown. */
1215 BLASFUNC(blas_thread_shutdown)();
1218 LOCK_COMMAND(&alloc_lock);
/* Undo every allocation recorded in release_info[], in the order it was recorded. */
1220 for (pos = 0; pos < release_pos; pos ++) {
1221 release_info[pos].func(&release_info[pos]);
/* NOTE(review): no reset of release_pos is visible in this function - confirm against the full source. */
1227 base_address = BASE_ADDRESS;
/* Return every slot to its pristine state. */
1230 for (pos = 0; pos < NUM_BUFFERS; pos ++){
1231 memory[pos].addr = (void *)0;
1232 memory[pos].used = 0;
1233 #if defined(WHEREAMI) && !defined(USE_OPENMP)
1234 memory[pos].pos = -1;
1236 memory[pos].lock = 0;
1239 UNLOCK_COMMAND(&alloc_lock);
1244 #if defined(OS_LINUX) && !defined(NO_WARMUP)
1247 #if defined(USE_PTHREAD_LOCK)
1248 static pthread_mutex_t init_lock = PTHREAD_MUTEX_INITIALIZER;
1249 #elif defined(USE_PTHREAD_SPINLOCK)
1250 static pthread_spinlock_t init_lock = 0;
1252 static BLASULONG init_lock = 0UL;
/*
 * Writes into the buffer that starts at sa + GEMM_OFFSET_A. Only sa is
 * used on the visible lines; the signature matches what
 * _init_thread_memory() stores in queue[].routine.
 * The loops that step through the buffer are on lines not shown
 * (presumably one write per page).
 */
1256 static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n,
1257 void *sa, void *sb, BLASLONG pos) {
1259 #if !defined(ARCH_POWER) && !defined(ARCH_SPARC)
/* First phase covers the buffer minus one page. */
1264 size = BUFFER_SIZE - PAGESIZE;
1265 buffer = (BLASULONG)sa + GEMM_OFFSET_A;
1267 #if defined(OS_LINUX) && !defined(NO_WARMUP)
1268 if (hot_alloc != 2) {
/* The write phase is serialized across callers with init_lock. */
1272 LOCK_COMMAND(&init_lock);
1276 *(int *)buffer = size;
1282 UNLOCK_COMMAND(&init_lock);
/* Second phase is limited to at most L2_SIZE bytes. */
1285 size = MIN((BUFFER_SIZE - PAGESIZE), L2_SIZE);
1286 buffer = (BLASULONG)sa + GEMM_OFFSET_A;
1289 *(int *)buffer = size;
1294 #if defined(OS_LINUX) && !defined(NO_WARMUP)
/*
 * Builds a chain of blas_num_threads queue entries that each run
 * _touch_memory, then submits the chain with exec_blas().
 */
1303 static void _init_thread_memory(void *buffer) {
1305 blas_queue_t queue[MAX_CPU_NUMBER];
1308 for (num_cpu = 0; num_cpu < blas_num_threads; num_cpu++) {
1310 blas_queue_init(&queue[num_cpu]);
1311 queue[num_cpu].mode = BLAS_DOUBLE | BLAS_REAL;
1312 queue[num_cpu].routine = &_touch_memory;
1313 queue[num_cpu].args = NULL;
/* Link to the following entry; the last link is cut after the loop. */
1314 queue[num_cpu].next = &queue[num_cpu + 1];
/* Terminate the chain. NOTE(review): indexes queue[-1] if blas_num_threads is 0 - confirm callers initialise it first. */
1317 queue[num_cpu - 1].next = NULL;
/* Only the first entry receives the buffer pointer. */
1318 queue[0].sa = buffer;
1320 exec_blas(num_cpu, queue);
/*
 * Warm-up at start-up: takes one buffer from blas_memory_alloc(), has it
 * touched, and gives it back. Two touching paths are visible (threaded via
 * _init_thread_memory, direct via _touch_memory); the conditionals that
 * select between them are not shown.
 */
1325 static void gotoblas_memory_init(void) {
1331 buffer = (void *)blas_memory_alloc(0);
/* Make sure the thread count is known and the thread server is running before dispatching work. */
1334 if (blas_cpu_number == 0) blas_get_cpu_number();
1336 if (blas_server_avail == 0) blas_thread_init();
1339 _init_thread_memory((void *)((BLASULONG)buffer + GEMM_OFFSET_A));
1343 _touch_memory(NULL, NULL, NULL, (void *)((BLASULONG)buffer + GEMM_OFFSET_A), NULL, 0);
1347 blas_memory_free(buffer);
1351 /* Initialization for all functions; this function should be called before main */
1353 static int gotoblas_initialized = 0;
1354 extern void openblas_read_env();
/*
 * Library initializer, declared with the CONSTRUCTOR attribute macro.
 * It returns early once gotoblas_initialized is set, so repeated calls do
 * nothing. Several steps are guarded by conditionals that are only partly
 * visible.
 */
1356 void CONSTRUCTOR gotoblas_init(void) {
1358 if (gotoblas_initialized) return;
/* Install the pre-fork thread-pool shutdown hook. */
1361 openblas_fork_handler();
1364 openblas_read_env();
1371 gotoblas_dynamic_init();
1374 #if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY)
1375 gotoblas_affinity_init();
1378 #if defined(OS_LINUX) && !defined(NO_WARMUP)
1379 gotoblas_memory_init();
1382 //#if defined(OS_LINUX)
/* Raise the soft stack limit to the hard limit when they differ; the setrlimit() result is not checked. */
1384 struct rlimit curlimit;
1385 if ( getrlimit(RLIMIT_STACK, &curlimit ) == 0 )
1387 if ( curlimit.rlim_cur != curlimit.rlim_max )
1389 curlimit.rlim_cur = curlimit.rlim_max;
1390 setrlimit(RLIMIT_STACK, &curlimit);
/* Make sure the thread count is known and the thread server is started. */
1396 if (blas_cpu_number == 0) blas_get_cpu_number();
1398 if (blas_server_avail == 0) blas_thread_init();
1402 #ifdef FUNCTION_PROFILE
1403 gotoblas_profile_init();
1406 gotoblas_initialized = 1;
/*
 * Library finalizer, declared with the DESTRUCTOR attribute macro.
 * It does nothing unless gotoblas_init() completed, and clears the
 * initialized flag at the end. Some teardown steps are on lines not shown.
 */
1414 void DESTRUCTOR gotoblas_quit(void) {
1416 if (gotoblas_initialized == 0) return;
1424 #ifdef FUNCTION_PROFILE
1425 gotoblas_profile_quit();
1428 #if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY)
1429 gotoblas_affinity_quit();
1433 gotoblas_dynamic_quit();
1436 gotoblas_initialized = 0;
1443 #if defined(_MSC_VER) && !defined(__clang__)
/* DLL entry point for the MSVC build; the action taken for each notification is on lines not shown. */
1444 BOOL APIENTRY DllMain(HMODULE hModule, DWORD ul_reason_for_call, LPVOID lpReserved)
1446 switch (ul_reason_for_call)
/* All four loader notifications have a case label. */
1448 case DLL_PROCESS_ATTACH:
1451 case DLL_THREAD_ATTACH:
1453 case DLL_THREAD_DETACH:
1455 case DLL_PROCESS_DETACH:
1465 This is to allow static linking.
1466 Code adapted from Google performance tools:
1467 https://gperftools.googlecode.com/git-history/perftools-1.0/src/windows/port.cc
1469 https://sourceware.org/ml/pthreads-win32/2008/msg00028.html
1470 http://ci.boost.org/svn-trac/browser/trunk/libs/thread/src/win32/tss_pe.cpp
/* Process-termination callback; its address is stored in p_process_term inside the .CRT$XTU section. The body is not visible here. */
1472 static int on_process_term(void)
1478 #pragma comment(linker, "/INCLUDE:_tls_used")
1480 #pragma comment(linker, "/INCLUDE:__tls_used")
1482 #pragma data_seg(push, old_seg)
1483 #pragma data_seg(".CRT$XLB")
1484 static void (APIENTRY *dll_callback)(HINSTANCE h, DWORD ul_reason_for_call, PVOID pv) = DllMain;
1485 #pragma data_seg(".CRT$XTU")
1486 static int(*p_process_term)(void) = on_process_term;
1487 #pragma data_seg(pop, old_seg)
1490 #if (defined(C_PGI) || (!defined(C_SUN) && defined(F_INTERFACE_SUN))) && (defined(ARCH_X86) || defined(ARCH_X86_64))
1491 /* Don't call me; this is just a workaround for a PGI / Sun bug */
/*
 * Carrier for inline assembly that hooks gotoblas_init / gotoblas_quit into
 * program start-up and shutdown. Two alternatives are visible; the
 * conditional that chooses between them is not shown.
 */
1492 void gotoblas_dummy_for_PGI(void) {
/* Alternative 1: place the two function addresses in the .ctors / .dtors sections. */
1498 asm ("\t.section\t.ctors,\"aw\",@progbits; .align 8; .quad gotoblas_init; .section .text");
1499 asm ("\t.section\t.dtors,\"aw\",@progbits; .align 8; .quad gotoblas_quit; .section .text");
/* Alternative 2: emit direct calls into the .init / .fini sections. */
1501 asm (".section .init,\"ax\"; call gotoblas_init@PLT; .section .text");
1502 asm (".section .fini,\"ax\"; call gotoblas_quit@PLT; .section .text");