From fd42ca462d2df0eece73b26865fa55f7bfa07e53 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 25 Aug 2018 19:35:16 +0200 Subject: [PATCH] Combo of default pre-0.3.1 memory.c and band-aided version of PR1739 --- driver/others/memory.c | 1725 ++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 1606 insertions(+), 119 deletions(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 98bcfb2..6bca1e1 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -72,6 +72,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. //#undef DEBUG #include "common.h" + +#if defined(USE_TLS) && ( !defined(__GLIBC_PREREQ) || __GLIBC_PREREQ(2,20)) +#warning "using tls version of memory.c" #include #if defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT) @@ -108,6 +111,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #endif +#ifdef OS_HAIKU +#include +#endif + #if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) #include #include @@ -139,14 +146,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define FIXED_PAGESIZE 4096 #endif -#ifndef BUFFERS_PER_THREAD -#ifdef USE_OPENMP -#define BUFFERS_PER_THREAD (MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER) -#else -#define BUFFERS_PER_THREAD NUM_BUFFERS -#endif -#endif - #define BITMASK(a, b, c) ((((a) >> (b)) & (c))) #if defined(_MSC_VER) && !defined(__clang__) @@ -238,6 +237,14 @@ int get_num_procs(void) { } #endif +#ifdef OS_HAIKU +int get_num_procs(void) { + static int nums = 0; + if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF); + return nums; +} +#endif + #ifdef OS_WINDOWS int get_num_procs(void) { @@ -363,7 +370,7 @@ int blas_get_cpu_number(void){ #endif // blas_goto_num = 0; -#ifndef USE_OPENMP +#ifndef USE_OPENMP_UNUSED blas_goto_num=openblas_num_threads_env(); if (blas_goto_num < 0) blas_goto_num = 0; @@ -420,10 +427,8 @@ int openblas_get_num_threads(void) { int hugetlb_allocated = 0; #if defined(OS_WINDOWS) -#define THREAD_LOCAL __declspec(thread) #define LIKELY_ONE(x) (x) #else -#define THREAD_LOCAL __thread #define LIKELY_ONE(x) (__builtin_expect(x, 1)) #endif @@ -459,62 +464,15 @@ struct alloc_t { for an auxiliary tracking structure. 
*/ static const int allocation_block_size = BUFFER_SIZE + sizeof(struct alloc_t); -/* Clang supports TLS from version 2.8 */ -#if defined(__clang__) && __clang_major__ > 2 || \ - (__clang_minor__ == 2 || __clang_minor__ == 8) -#define HAS_COMPILER_TLS -#endif - -/* GCC supports TLS from version 4.1 */ -#if !defined(__clang__) && defined(__GNUC__) && \ - (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 1)) -#define HAS_COMPILER_TLS -#endif - -/* MSVC supports TLS from version 2005 */ -#if defined(_MSC_VER) && _MSC_VER >= 1400 -#define HAS_COMPILER_TLS -#endif - -/* Versions of XCode before 8 did not properly support TLS */ -#if defined(__apple_build_version__) && __apple_build_version__ < 8000042 -#undef HAS_COMPILER_TLS -#endif - -/* Android NDK's before version 12b did not support TLS */ -#if defined(__ANDROID__) && defined(__clang__) -#if __has_include() -#include -#endif -#if defined(__ANDROID__) && defined(__clang__) && defined(__NDK_MAJOR__) && \ - defined(__NDK_MINOR__) && \ - ((__NDK_MAJOR__ < 12) || ((__NDK_MAJOR__ == 12) && (__NDK_MINOR__ < 1))) -#undef HAS_COMPILER_TLS -#endif -#endif - -/* Holds pointers to allocated memory */ -#if defined(SMP) && !defined(USE_OPENMP) -/* This is the number of threads than can be spawned by the server, which is the - server plus the number of threads in the thread pool */ -# define MAX_ALLOCATING_THREADS MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER +1 -static int next_memory_table_pos = 0; -# if defined(HAS_COMPILER_TLS) -/* Use compiler generated thread-local-storage */ -static int THREAD_LOCAL local_memory_table_pos = 0; +#if defined(SMP) +# if defined(OS_WINDOWS) +static DWORD local_storage_key = 0; +DWORD lsk; # else -/* Use system-dependent thread-local-storage */ -# if defined(OS_WINDOWS) -static DWORD local_storage_key; -# else -static pthread_key_t local_storage_key; -# endif /* defined(OS_WINDOWS) */ -# endif /* defined(HAS_COMPILER_TLS) */ -#else -/* There is only one allocating thread when in single-threaded 
mode and when using OpenMP */ -# define MAX_ALLOCATING_THREADS 1 -#endif /* defined(SMP) && !defined(USE_OPENMP) */ -static struct alloc_t * local_memory_table[MAX_ALLOCATING_THREADS][BUFFERS_PER_THREAD]; +static pthread_key_t local_storage_key = 0; +pthread_key_t lsk; +# endif /* defined(OS_WINDOWS) */ +#endif /* defined(SMP) */ #if defined(OS_LINUX) && !defined(NO_WARMUP) static int hot_alloc = 0; @@ -530,34 +488,54 @@ static pthread_spinlock_t alloc_lock = 0; static BLASULONG alloc_lock = 0UL; #endif +#if defined(USE_PTHREAD_LOCK) +static pthread_mutex_t key_lock = PTHREAD_MUTEX_INITIALIZER; +#elif defined(USE_PTHREAD_SPINLOCK) +static pthread_spinlock_t key_lock = 0; +#else +static BLASULONG key_lock = 0UL; +#endif + /* Returns a pointer to the start of the per-thread memory allocation data */ static __inline struct alloc_t ** get_memory_table() { -#if defined(SMP) && !defined(USE_OPENMP) -# if !defined(HAS_COMPILER_TLS) -# if defined(OS_WINDOWS) - int local_memory_table_pos = (int)::TlsGetValue(local_storage_key); -# else - int local_memory_table_pos = (int)pthread_getspecific(local_storage_key); -# endif /* defined(OS_WINDOWS) */ -# endif /* !defined(HAS_COMPILER_TLS) */ - if (!local_memory_table_pos) { - LOCK_COMMAND(&alloc_lock); - local_memory_table_pos = next_memory_table_pos++; - if (next_memory_table_pos > MAX_ALLOCATING_THREADS) - printf("OpenBLAS : Program will terminate because you tried to start too many threads.\n"); - UNLOCK_COMMAND(&alloc_lock); -# if !defined(HAS_COMPILER_TLS) -# if defined(OS_WINDOWS) - ::TlsSetValue(local_storage_key, (void*)local_memory_table_pos); -# else - pthread_setspecific(local_storage_key, (void*)local_memory_table_pos); -# endif /* defined(OS_WINDOWS) */ -# endif /* !defined(HAS_COMPILER_TLS) */ - } - return local_memory_table[local_memory_table_pos]; +#if defined(SMP) +LOCK_COMMAND(&key_lock); +lsk=local_storage_key; +UNLOCK_COMMAND(&key_lock); + if (!lsk) { + blas_memory_init(); + } +# if defined(OS_WINDOWS) + 
struct alloc_t ** local_memory_table = (struct alloc_t **)TlsGetValue(local_storage_key); +# else + struct alloc_t ** local_memory_table = (struct alloc_t **)pthread_getspecific(local_storage_key); +# endif /* defined(OS_WINDOWS) */ +#else + static struct alloc_t ** local_memory_table = NULL; +#endif /* defined(SMP) */ +#if defined (SMP) +LOCK_COMMAND(&key_lock); +lsk=local_storage_key; +UNLOCK_COMMAND(&key_lock); + if (lsk && !local_memory_table) { #else - return local_memory_table[0]; -#endif /* defined(SMP) && !defined(USE_OPENMP) */ + if (!local_memory_table) { +#endif /* defined(SMP) */ + local_memory_table = (struct alloc_t **)malloc(sizeof(struct alloc_t *) * NUM_BUFFERS); + memset(local_memory_table, 0, sizeof(struct alloc_t *) * NUM_BUFFERS); +#if defined(SMP) +# if defined(OS_WINDOWS) +LOCK_COMMAND(&key_lock); + TlsSetValue(local_storage_key, (void*)local_memory_table); +UNLOCK_COMMAND(&key_lock); +# else +LOCK_COMMAND(&key_lock); + pthread_setspecific(local_storage_key, (void*)local_memory_table); +UNLOCK_COMMAND(&key_lock); +# endif /* defined(OS_WINDOWS) */ +#endif /* defined(SMP) */ + } + return local_memory_table; } #ifdef ALLOC_MMAP @@ -637,7 +615,7 @@ static inline BLASULONG run_bench(BLASULONG address, BLASULONG size) { static void *alloc_mmap(void *address){ void *map_address, *best_address; - BLASULONG best, start, current; + BLASULONG best, start, current, original; BLASULONG allocsize; if (address){ @@ -685,8 +663,9 @@ static void *alloc_mmap(void *address){ start = (BLASULONG)map_address; current = (SCALING - 1) * allocation_block_size; + original = current; - while(current > 0) { + while(current > 0 && current <= original) { *(BLASLONG *)start = (BLASLONG)start + PAGESIZE; start += PAGESIZE; current -= PAGESIZE; @@ -1056,18 +1035,29 @@ static volatile int memory_initialized = 0; /* 1 : Level 2 functions */ /* 2 : Thread */ + static void blas_memory_cleanup(void* ptr){ + if (ptr) { + struct alloc_t ** table = (struct alloc_t **)ptr; + int 
pos; + for (pos = 0; pos < NUM_BUFFERS; pos ++){ + struct alloc_t *alloc_info = table[pos]; + if (alloc_info) { + alloc_info->release_func(alloc_info); + table[pos] = (void *)0; + } + } + free(table); + } +} + static void blas_memory_init(){ -#if defined(SMP) && !defined(USE_OPENMP) - next_memory_table_pos = 0; -# if !defined(HAS_COMPILER_TLS) -# if defined(OS_WINDOWS) - local_storage_key = ::TlsAlloc(); -# else - pthread_key_create(&local_storage_key, NULL); -# endif /* defined(OS_WINDOWS) */ -# endif /* defined(HAS_COMPILER_TLS) */ -#endif /* defined(SMP) && !defined(USE_OPENMP) */ - memset(local_memory_table, 0, sizeof(local_memory_table)); +#if defined(SMP) +# if defined(OS_WINDOWS) + local_storage_key = TlsAlloc(); +# else + pthread_key_create(&local_storage_key, blas_memory_cleanup); +# endif /* defined(OS_WINDOWS) */ +#endif /* defined(SMP) */ } void *blas_memory_alloc(int procpos){ @@ -1105,7 +1095,16 @@ void *blas_memory_alloc(int procpos){ struct alloc_t * alloc_info; struct alloc_t ** alloc_table; + +#if defined(SMP) && !defined(USE_OPENMP) +int mi; +LOCK_COMMAND(&alloc_lock); +mi=memory_initialized; +UNLOCK_COMMAND(&alloc_lock); + if (!LIKELY_ONE(mi)) { +#else if (!LIKELY_ONE(memory_initialized)) { +#endif #if defined(SMP) && !defined(USE_OPENMP) /* Only allow a single thread to initialize memory system */ LOCK_COMMAND(&alloc_lock); @@ -1149,7 +1148,7 @@ void *blas_memory_alloc(int procpos){ if (!alloc_table[position] || !alloc_table[position]->used) goto allocation; position ++; - } while (position < BUFFERS_PER_THREAD); + } while (position < NUM_BUFFERS); goto error; @@ -1247,7 +1246,7 @@ void blas_memory_free(void *buffer){ #ifdef DEBUG alloc_table = get_memory_table(); - for (position = 0; position < BUFFERS_PER_THREAD; position++){ + for (position = 0; position < NUM_BUFFERS; position++){ if (alloc_table[position]) { printf("%4ld %p : %d\n", position, alloc_table[position], alloc_table[position]->used); } @@ -1267,22 +1266,14 @@ void 
blas_memory_free_nolock(void * map_address) { } void blas_shutdown(void){ - - int pos, thread; - #ifdef SMP BLASFUNC(blas_thread_shutdown)(); #endif - - for (thread = 0; thread < MAX_ALLOCATING_THREADS; thread ++){ - for (pos = 0; pos < BUFFERS_PER_THREAD; pos ++){ - struct alloc_t *alloc_info = local_memory_table[thread][pos]; - if (alloc_info) { - alloc_info->release_func(alloc_info); - alloc_info = (void *)0; - } - } - } +#ifdef SMP + /* Only cleanupIf we were built for threading and TLS was initialized */ + if (local_storage_key) +#endif + blas_memory_cleanup((void*)get_memory_table()); #ifdef SEEK_ADDRESS base_address = 0UL; @@ -1503,6 +1494,9 @@ BOOL APIENTRY DllMain(HMODULE hModule, DWORD ul_reason_for_call, LPVOID lpReser case DLL_THREAD_ATTACH: break; case DLL_THREAD_DETACH: +#if defined(SMP) + blas_memory_cleanup((void*)get_memory_table()); +#endif break; case DLL_PROCESS_DETACH: gotoblas_quit(); @@ -1573,3 +1567,1496 @@ void gotoblas_dummy_for_PGI(void) { #endif } #endif + +#else +#include + +#ifdef OS_WINDOWS +#define ALLOC_WINDOWS +#ifndef MEM_LARGE_PAGES +#define MEM_LARGE_PAGES 0x20000000 +#endif +#else +#define ALLOC_MMAP +#define ALLOC_MALLOC +#endif + +#include +#include +#include + +#ifndef OS_WINDOWS +#include +#ifndef NO_SYSV_IPC +#include +#endif +#include +#endif + +#include + +#ifdef OS_LINUX +#include +#include +#include +#include +#include +#include +#include +#endif + +#if defined(OS_FREEBSD) || defined(OS_DARWIN) +#include +#include +#endif + +#if defined(OS_WINDOWS) && (defined(__MINGW32__) || defined(__MINGW64__)) +#include +#undef printf +#define printf _cprintf +#endif + +#ifdef OS_LINUX + +#ifndef MPOL_PREFERRED +#define MPOL_PREFERRED 1 +#endif + +#endif + +#if (defined(PPC440) || !defined(OS_LINUX) || defined(HPL)) && !defined(NO_WARMUP) +#define NO_WARMUP +#endif + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + +#ifndef FIXED_PAGESIZE +#define FIXED_PAGESIZE 4096 +#endif + +#define BITMASK(a, b, c) ((((a) >> (b)) & 
(c))) + +#if defined(_MSC_VER) && !defined(__clang__) +#define CONSTRUCTOR __cdecl +#define DESTRUCTOR __cdecl +#elif (defined(OS_DARWIN) || defined(OS_SUNOS)) && defined(C_GCC) +#define CONSTRUCTOR __attribute__ ((constructor)) +#define DESTRUCTOR __attribute__ ((destructor)) +#else +#define CONSTRUCTOR __attribute__ ((constructor(101))) +#define DESTRUCTOR __attribute__ ((destructor(101))) +#endif + +#ifdef DYNAMIC_ARCH +gotoblas_t *gotoblas = NULL; +#endif +extern void openblas_warning(int verbose, const char * msg); + +#ifndef SMP + +#define blas_cpu_number 1 +#define blas_num_threads 1 + +/* Dummy Function */ +int goto_get_num_procs (void) { return 1;}; +void goto_set_num_threads(int num_threads) {}; + +#else + +#if defined(OS_LINUX) || defined(OS_SUNOS) || defined(OS_NETBSD) +#ifndef NO_AFFINITY +int get_num_procs(void); +#else +int get_num_procs(void) { + static int nums = 0; +cpu_set_t *cpusetp; +size_t size; +int ret; +int i,n; + + if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF); +#if !defined(OS_LINUX) + return nums; +#endif + +#if !defined(__GLIBC_PREREQ) + return nums; +#else + #if !__GLIBC_PREREQ(2, 3) + return nums; + #endif + + #if !__GLIBC_PREREQ(2, 7) + ret = sched_getaffinity(0,sizeof(cpu_set_t), cpusetp); + if (ret!=0) return nums; + n=0; + #if !__GLIBC_PREREQ(2, 6) + for (i=0;i 0) blas_num_threads = blas_goto_num; + else if (blas_omp_num > 0) blas_num_threads = blas_omp_num; + else blas_num_threads = MAX_CPU_NUMBER; + +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) + if (blas_num_threads > max_num) blas_num_threads = max_num; +#endif + + if (blas_num_threads > MAX_CPU_NUMBER) blas_num_threads = MAX_CPU_NUMBER; + +#ifdef DEBUG + printf( "Adjusted number of threads : %3d\n", blas_num_threads); +#endif + + blas_cpu_number = blas_num_threads; + + return blas_num_threads; +} +#endif + + +int openblas_get_num_procs(void) { +#ifndef SMP + return 1; +#else + return get_num_procs(); 
+#endif +} + +int openblas_get_num_threads(void) { +#ifndef SMP + return 1; +#else + // init blas_cpu_number if needed + blas_get_cpu_number(); + return blas_cpu_number; +#endif +} + +struct release_t { + void *address; + void (*func)(struct release_t *); + long attr; +}; + +int hugetlb_allocated = 0; + +static struct release_t release_info[NUM_BUFFERS]; +static int release_pos = 0; + +#if defined(OS_LINUX) && !defined(NO_WARMUP) +static int hot_alloc = 0; +#endif + +/* Global lock for memory allocation */ + +#if defined(USE_PTHREAD_LOCK) +static pthread_mutex_t alloc_lock = PTHREAD_MUTEX_INITIALIZER; +#elif defined(USE_PTHREAD_SPINLOCK) +static pthread_spinlock_t alloc_lock = 0; +#else +static BLASULONG alloc_lock = 0UL; +#endif + +#ifdef ALLOC_MMAP + +static void alloc_mmap_free(struct release_t *release){ + + if (munmap(release -> address, BUFFER_SIZE)) { + printf("OpenBLAS : munmap failed\n"); + } +} + + + +#ifdef NO_WARMUP + +static void *alloc_mmap(void *address){ + void *map_address; + + if (address){ + map_address = mmap(address, + BUFFER_SIZE, + MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0); + } else { + map_address = mmap(address, + BUFFER_SIZE, + MMAP_ACCESS, MMAP_POLICY, -1, 0); + } + + if (map_address != (void *)-1) { + LOCK_COMMAND(&alloc_lock); + release_info[release_pos].address = map_address; + release_info[release_pos].func = alloc_mmap_free; + release_pos ++; + UNLOCK_COMMAND(&alloc_lock); + } + +#ifdef OS_LINUX + my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0); +#endif + + return map_address; +} + +#else + +#define BENCH_ITERATION 4 +#define SCALING 2 + +static inline BLASULONG run_bench(BLASULONG address, BLASULONG size) { + + BLASULONG original, *p; + BLASULONG start, stop, min; + int iter, i, count; + + min = (BLASULONG)-1; + + original = *(BLASULONG *)(address + size - PAGESIZE); + + *(BLASULONG *)(address + size - PAGESIZE) = (BLASULONG)address; + + for (iter = 0; iter < BENCH_ITERATION; iter ++ ) { + + p = (BLASULONG 
*)address; + + count = size / PAGESIZE; + + start = rpcc(); + + for (i = 0; i < count; i ++) { + p = (BLASULONG *)(*p); + } + + stop = rpcc(); + + if (min > stop - start) min = stop - start; + } + + *(BLASULONG *)(address + size - PAGESIZE + 0) = original; + *(BLASULONG *)(address + size - PAGESIZE + 8) = (BLASULONG)p; + + return min; +} + +static void *alloc_mmap(void *address){ + void *map_address, *best_address; + BLASULONG best, start, current; + BLASULONG allocsize; + + if (address){ + /* Just give up use advanced operation */ + map_address = mmap(address, BUFFER_SIZE, MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0); + +#ifdef OS_LINUX + my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0); +#endif + + } else { +#if defined(OS_LINUX) && !defined(NO_WARMUP) + if (hot_alloc == 0) { + map_address = mmap(NULL, BUFFER_SIZE, MMAP_ACCESS, MMAP_POLICY, -1, 0); + +#ifdef OS_LINUX + my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0); +#endif + + } else { +#endif + + map_address = mmap(NULL, BUFFER_SIZE * SCALING, + MMAP_ACCESS, MMAP_POLICY, -1, 0); + + if (map_address != (void *)-1) { + +#ifdef OS_LINUX +#ifdef DEBUG + int ret=0; + ret=my_mbind(map_address, BUFFER_SIZE * SCALING, MPOL_PREFERRED, NULL, 0, 0); + if(ret==-1){ + int errsv=errno; + perror("OpenBLAS alloc_mmap:"); + printf("error code=%d,\tmap_address=%lx\n",errsv,map_address); + } + +#else + my_mbind(map_address, BUFFER_SIZE * SCALING, MPOL_PREFERRED, NULL, 0, 0); +#endif +#endif + + + allocsize = DGEMM_P * DGEMM_Q * sizeof(double); + + start = (BLASULONG)map_address; + current = (SCALING - 1) * BUFFER_SIZE; + + while(current > 0) { + *(BLASLONG *)start = (BLASLONG)start + PAGESIZE; + start += PAGESIZE; + current -= PAGESIZE; + } + + *(BLASLONG *)(start - PAGESIZE) = (BLASULONG)map_address; + + start = (BLASULONG)map_address; + + best = (BLASULONG)-1; + best_address = map_address; + + while ((start + allocsize < (BLASULONG)map_address + (SCALING - 1) * BUFFER_SIZE)) { + + current = 
run_bench(start, allocsize); + + if (best > current) { + best = current; + best_address = (void *)start; + } + + start += PAGESIZE; + + } + + if ((BLASULONG)best_address > (BLASULONG)map_address) + munmap(map_address, (BLASULONG)best_address - (BLASULONG)map_address); + + munmap((void *)((BLASULONG)best_address + BUFFER_SIZE), (SCALING - 1) * BUFFER_SIZE + (BLASULONG)map_address - (BLASULONG)best_address); + + map_address = best_address; + +#if defined(OS_LINUX) && !defined(NO_WARMUP) + hot_alloc = 2; +#endif + } + } +#if defined(OS_LINUX) && !defined(NO_WARMUP) + } +#endif + LOCK_COMMAND(&alloc_lock); + + if (map_address != (void *)-1) { + release_info[release_pos].address = map_address; + release_info[release_pos].func = alloc_mmap_free; + release_pos ++; + } + UNLOCK_COMMAND(&alloc_lock); + + return map_address; +} + +#endif + +#endif + + +#ifdef ALLOC_MALLOC + +static void alloc_malloc_free(struct release_t *release){ + + free(release -> address); + +} + +static void *alloc_malloc(void *address){ + + void *map_address; + + map_address = (void *)malloc(BUFFER_SIZE + FIXED_PAGESIZE); + + if (map_address == (void *)NULL) map_address = (void *)-1; + + if (map_address != (void *)-1) { + release_info[release_pos].address = map_address; + release_info[release_pos].func = alloc_malloc_free; + release_pos ++; + } + + return map_address; + +} + +#endif + +#ifdef ALLOC_QALLOC + +void *qalloc(int flags, size_t bytes); +void *qfree (void *address); + +#define QNONCACHE 0x1 +#define QCOMMS 0x2 +#define QFAST 0x4 + +static void alloc_qalloc_free(struct release_t *release){ + + qfree(release -> address); + +} + +static void *alloc_qalloc(void *address){ + void *map_address; + + map_address = (void *)qalloc(QCOMMS | QFAST, BUFFER_SIZE + FIXED_PAGESIZE); + + if (map_address == (void *)NULL) map_address = (void *)-1; + + if (map_address != (void *)-1) { + release_info[release_pos].address = map_address; + release_info[release_pos].func = alloc_qalloc_free; + release_pos ++; + } + 
+ return (void *)(((BLASULONG)map_address + FIXED_PAGESIZE - 1) & ~(FIXED_PAGESIZE - 1)); +} + +#endif + +#ifdef ALLOC_WINDOWS + +static void alloc_windows_free(struct release_t *release){ + + VirtualFree(release -> address, BUFFER_SIZE, MEM_DECOMMIT); + +} + +static void *alloc_windows(void *address){ + void *map_address; + + map_address = VirtualAlloc(address, + BUFFER_SIZE, + MEM_RESERVE | MEM_COMMIT, + PAGE_READWRITE); + + if (map_address == (void *)NULL) map_address = (void *)-1; + + if (map_address != (void *)-1) { + release_info[release_pos].address = map_address; + release_info[release_pos].func = alloc_windows_free; + release_pos ++; + } + + return map_address; +} + +#endif + +#ifdef ALLOC_DEVICEDRIVER +#ifndef DEVICEDRIVER_NAME +#define DEVICEDRIVER_NAME "/dev/mapper" +#endif + +static void alloc_devicedirver_free(struct release_t *release){ + + if (munmap(release -> address, BUFFER_SIZE)) { + printf("OpenBLAS : Bugphysarea unmap failed.\n"); + } + + if (close(release -> attr)) { + printf("OpenBLAS : Bugphysarea close failed.\n"); + } + +} + +static void *alloc_devicedirver(void *address){ + + int fd; + void *map_address; + + if ((fd = open(DEVICEDRIVER_NAME, O_RDWR | O_SYNC)) < 0) { + + return (void *)-1; + + } + + map_address = mmap(address, BUFFER_SIZE, + PROT_READ | PROT_WRITE, + MAP_FILE | MAP_SHARED, + fd, 0); + + if (map_address != (void *)-1) { + release_info[release_pos].address = map_address; + release_info[release_pos].attr = fd; + release_info[release_pos].func = alloc_devicedirver_free; + release_pos ++; + } + + return map_address; +} + +#endif + +#ifdef ALLOC_SHM + +static void alloc_shm_free(struct release_t *release){ + + if (shmdt(release -> address)) { + printf("OpenBLAS : Shared memory unmap failed.\n"); + } +} + +static void *alloc_shm(void *address){ + void *map_address; + int shmid; + + shmid = shmget(IPC_PRIVATE, BUFFER_SIZE,IPC_CREAT | 0600); + + map_address = (void *)shmat(shmid, address, 0); + + if (map_address != (void *)-1){ + 
+#ifdef OS_LINUX + my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0); +#endif + + shmctl(shmid, IPC_RMID, 0); + + release_info[release_pos].address = map_address; + release_info[release_pos].attr = shmid; + release_info[release_pos].func = alloc_shm_free; + release_pos ++; + } + + return map_address; +} + +#if defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS + +static void alloc_hugetlb_free(struct release_t *release){ + +#if defined(OS_LINUX) || defined(OS_AIX) + if (shmdt(release -> address)) { + printf("OpenBLAS : Hugepage unmap failed.\n"); + } +#endif + +#ifdef __sun__ + + munmap(release -> address, BUFFER_SIZE); + +#endif + +#ifdef OS_WINDOWS + + VirtualFree(release -> address, BUFFER_SIZE, MEM_LARGE_PAGES | MEM_DECOMMIT); + +#endif + +} + +static void *alloc_hugetlb(void *address){ + + void *map_address = (void *)-1; + +#if defined(OS_LINUX) || defined(OS_AIX) + int shmid; + + shmid = shmget(IPC_PRIVATE, BUFFER_SIZE, +#ifdef OS_LINUX + SHM_HUGETLB | +#endif +#ifdef OS_AIX + SHM_LGPAGE | SHM_PIN | +#endif + IPC_CREAT | SHM_R | SHM_W); + + if (shmid != -1) { + map_address = (void *)shmat(shmid, address, SHM_RND); + +#ifdef OS_LINUX + my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0); +#endif + + if (map_address != (void *)-1){ + shmctl(shmid, IPC_RMID, 0); + } + } +#endif + +#ifdef __sun__ + struct memcntl_mha mha; + + mha.mha_cmd = MHA_MAPSIZE_BSSBRK; + mha.mha_flags = 0; + mha.mha_pagesize = HUGE_PAGESIZE; + memcntl(NULL, 0, MC_HAT_ADVISE, (char *)&mha, 0, 0); + + map_address = (BLASULONG)memalign(HUGE_PAGESIZE, BUFFER_SIZE); +#endif + +#ifdef OS_WINDOWS + + HANDLE hToken; + TOKEN_PRIVILEGES tp; + + if (OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES, &hToken) != TRUE) return (void *) -1; + + tp.PrivilegeCount = 1; + tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED; + + if (LookupPrivilegeValue(NULL, SE_LOCK_MEMORY_NAME, &tp.Privileges[0].Luid) != TRUE) { + CloseHandle(hToken); + return 
(void*)-1; + } + + if (AdjustTokenPrivileges(hToken, FALSE, &tp, 0, NULL, NULL) != TRUE) { + CloseHandle(hToken); + return (void*)-1; + } + + map_address = (void *)VirtualAlloc(address, + BUFFER_SIZE, + MEM_LARGE_PAGES | MEM_RESERVE | MEM_COMMIT, + PAGE_READWRITE); + + tp.Privileges[0].Attributes = 0; + AdjustTokenPrivileges(hToken, FALSE, &tp, 0, NULL, NULL); + + if (map_address == (void *)NULL) map_address = (void *)-1; + +#endif + + if (map_address != (void *)-1){ + release_info[release_pos].address = map_address; + release_info[release_pos].func = alloc_hugetlb_free; + release_pos ++; + } + + return map_address; +} +#endif + +#endif + +#ifdef ALLOC_HUGETLBFILE + +static int hugetlb_pid = 0; + +static void alloc_hugetlbfile_free(struct release_t *release){ + + if (munmap(release -> address, BUFFER_SIZE)) { + printf("OpenBLAS : HugeTLBfs unmap failed.\n"); + } + + if (close(release -> attr)) { + printf("OpenBLAS : HugeTLBfs close failed.\n"); + } +} + +static void *alloc_hugetlbfile(void *address){ + + void *map_address = (void *)-1; + int fd; + char filename[64]; + + if (!hugetlb_pid) hugetlb_pid = getpid(); + + sprintf(filename, "%s/gotoblas.%d", HUGETLB_FILE_NAME, hugetlb_pid); + + if ((fd = open(filename, O_RDWR | O_CREAT, 0700)) < 0) { + return (void *)-1; + } + + unlink(filename); + + map_address = mmap(address, BUFFER_SIZE, + PROT_READ | PROT_WRITE, + MAP_SHARED, + fd, 0); + + if (map_address != (void *)-1) { + release_info[release_pos].address = map_address; + release_info[release_pos].attr = fd; + release_info[release_pos].func = alloc_hugetlbfile_free; + release_pos ++; + } + + return map_address; +} +#endif + + +#ifdef SEEK_ADDRESS +static BLASULONG base_address = 0UL; +#else +static BLASULONG base_address = BASE_ADDRESS; +#endif + +static volatile struct { + BLASULONG lock; + void *addr; +#if defined(WHEREAMI) && !defined(USE_OPENMP) + int pos; +#endif + int used; +#ifndef __64BIT__ + char dummy[48]; +#else + char dummy[40]; +#endif + +} 
memory[NUM_BUFFERS]; + +static int memory_initialized = 0; + +/* Memory allocation routine */ +/* procpos ... indicates where it comes from */ +/* 0 : Level 3 functions */ +/* 1 : Level 2 functions */ +/* 2 : Thread */ + +void *blas_memory_alloc(int procpos){ + + int position; +#if defined(WHEREAMI) && !defined(USE_OPENMP) + int mypos; +#endif + + void *map_address; + + void *(*memoryalloc[])(void *address) = { +#ifdef ALLOC_DEVICEDRIVER + alloc_devicedirver, +#endif +/* Hugetlb implicitly assumes ALLOC_SHM */ +#ifdef ALLOC_SHM + alloc_shm, +#endif +#if ((defined ALLOC_SHM) && (defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS)) + alloc_hugetlb, +#endif +#ifdef ALLOC_MMAP + alloc_mmap, +#endif +#ifdef ALLOC_QALLOC + alloc_qalloc, +#endif +#ifdef ALLOC_WINDOWS + alloc_windows, +#endif +#ifdef ALLOC_MALLOC + alloc_malloc, +#endif + NULL, + }; + void *(**func)(void *address); + LOCK_COMMAND(&alloc_lock); + + if (!memory_initialized) { + +#if defined(WHEREAMI) && !defined(USE_OPENMP) + for (position = 0; position < NUM_BUFFERS; position ++){ + memory[position].addr = (void *)0; + memory[position].pos = -1; + memory[position].used = 0; + memory[position].lock = 0; + } +#endif + +#ifdef DYNAMIC_ARCH + gotoblas_dynamic_init(); +#endif + +#if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY) + gotoblas_affinity_init(); +#endif + +#ifdef SMP + if (!blas_num_threads) blas_cpu_number = blas_get_cpu_number(); +#endif + +#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64) +#ifndef DYNAMIC_ARCH + blas_set_parameter(); +#endif +#endif + + memory_initialized = 1; + + } + UNLOCK_COMMAND(&alloc_lock); + +#ifdef DEBUG + printf("Alloc Start ...\n"); +#endif + +#if defined(WHEREAMI) && !defined(USE_OPENMP) + + mypos = WhereAmI(); + + position = mypos; + while (position >= NUM_BUFFERS) position >>= 1; + + do { + if (!memory[position].used && (memory[position].pos == mypos)) { + 
LOCK_COMMAND(&alloc_lock); +/* blas_lock(&memory[position].lock);*/ + + if (!memory[position].used) goto allocation; + + UNLOCK_COMMAND(&alloc_lock); +/* blas_unlock(&memory[position].lock);*/ + } + + position ++; + + } while (position < NUM_BUFFERS); + + +#endif + + position = 0; + + do { +/* if (!memory[position].used) { */ + LOCK_COMMAND(&alloc_lock); +/* blas_lock(&memory[position].lock);*/ + + if (!memory[position].used) goto allocation; + + UNLOCK_COMMAND(&alloc_lock); +/* blas_unlock(&memory[position].lock);*/ +/* } */ + + position ++; + + } while (position < NUM_BUFFERS); + + goto error; + + allocation : + +#ifdef DEBUG + printf(" Position -> %d\n", position); +#endif + + memory[position].used = 1; + + UNLOCK_COMMAND(&alloc_lock); +/* blas_unlock(&memory[position].lock);*/ + + if (!memory[position].addr) { + do { +#ifdef DEBUG + printf("Allocation Start : %lx\n", base_address); +#endif + + map_address = (void *)-1; + + func = &memoryalloc[0]; + + while ((func != NULL) && (map_address == (void *) -1)) { + + map_address = (*func)((void *)base_address); + +#ifdef ALLOC_DEVICEDRIVER + if ((*func == alloc_devicedirver) && (map_address == (void *)-1)) { + fprintf(stderr, "OpenBLAS Warning ... Physically contigous allocation was failed.\n"); + } +#endif + +#ifdef ALLOC_HUGETLBFILE + if ((*func == alloc_hugetlbfile) && (map_address == (void *)-1)) { +#ifndef OS_WINDOWS + fprintf(stderr, "OpenBLAS Warning ... 
HugeTLB(File) allocation was failed.\n"); +#endif + } +#endif + +#if (defined ALLOC_SHM) && (defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS) + if ((*func == alloc_hugetlb) && (map_address != (void *)-1)) hugetlb_allocated = 1; +#endif + + func ++; + } + +#ifdef DEBUG + printf(" Success -> %08lx\n", map_address); +#endif + if (((BLASLONG) map_address) == -1) base_address = 0UL; + + if (base_address) base_address += BUFFER_SIZE + FIXED_PAGESIZE; + + } while ((BLASLONG)map_address == -1); + + LOCK_COMMAND(&alloc_lock); + memory[position].addr = map_address; + UNLOCK_COMMAND(&alloc_lock); + +#ifdef DEBUG + printf(" Mapping Succeeded. %p(%d)\n", (void *)memory[position].addr, position); +#endif + } + +#if defined(WHEREAMI) && !defined(USE_OPENMP) + + if (memory[position].pos == -1) memory[position].pos = mypos; + +#endif + +#ifdef DYNAMIC_ARCH + + if (memory_initialized == 1) { + + LOCK_COMMAND(&alloc_lock); + + if (memory_initialized == 1) { + + if (!gotoblas) gotoblas_dynamic_init(); + + memory_initialized = 2; + } + + UNLOCK_COMMAND(&alloc_lock); + + } +#endif + + +#ifdef DEBUG + printf("Mapped : %p %3d\n\n", + (void *)memory[position].addr, position); +#endif + + return (void *)memory[position].addr; + + error: + printf("BLAS : Program is Terminated. 
Because you tried to allocate too many memory regions.\n"); + + return NULL; +} + +void blas_memory_free(void *free_area){ + + int position; + +#ifdef DEBUG + printf("Unmapped Start : %p ...\n", free_area); +#endif + + position = 0; + LOCK_COMMAND(&alloc_lock); + + while ((position < NUM_BUFFERS) && (memory[position].addr != free_area)) + position++; + + if (memory[position].addr != free_area) goto error; + +#ifdef DEBUG + printf(" Position : %d\n", position); +#endif + + // arm: ensure all writes are finished before other thread takes this memory + WMB; + + memory[position].used = 0; + UNLOCK_COMMAND(&alloc_lock); + +#ifdef DEBUG + printf("Unmap Succeeded.\n\n"); +#endif + + return; + + error: + printf("BLAS : Bad memory unallocation! : %4d %p\n", position, free_area); + +#ifdef DEBUG + for (position = 0; position < NUM_BUFFERS; position++) + printf("%4ld %p : %d\n", position, memory[position].addr, memory[position].used); +#endif + UNLOCK_COMMAND(&alloc_lock); + + return; +} + +void *blas_memory_alloc_nolock(int unused) { + void *map_address; + map_address = (void *)malloc(BUFFER_SIZE + FIXED_PAGESIZE); + return map_address; +} + +void blas_memory_free_nolock(void * map_address) { + free(map_address); +} + +void blas_shutdown(void){ + + int pos; + +#ifdef SMP + BLASFUNC(blas_thread_shutdown)(); +#endif + + LOCK_COMMAND(&alloc_lock); + + for (pos = 0; pos < release_pos; pos ++) { + release_info[pos].func(&release_info[pos]); + } + +#ifdef SEEK_ADDRESS + base_address = 0UL; +#else + base_address = BASE_ADDRESS; +#endif + + for (pos = 0; pos < NUM_BUFFERS; pos ++){ + memory[pos].addr = (void *)0; + memory[pos].used = 0; +#if defined(WHEREAMI) && !defined(USE_OPENMP) + memory[pos].pos = -1; +#endif + memory[pos].lock = 0; + } + + UNLOCK_COMMAND(&alloc_lock); + + return; +} + +#if defined(OS_LINUX) && !defined(NO_WARMUP) + +#ifdef SMP +#if defined(USE_PTHREAD_LOCK) +static pthread_mutex_t init_lock = PTHREAD_MUTEX_INITIALIZER; +#elif defined(USE_PTHREAD_SPINLOCK) 
+static pthread_spinlock_t init_lock = 0;
+#else
+static BLASULONG init_lock = 0UL;
+#endif
+#endif
+/* init_lock is taken around the page-stride loop in _touch_memory(). */
+
+/*
+ * Write into the buffer at sa + GEMM_OFFSET_A: first one int every PAGESIZE
+ * bytes across BUFFER_SIZE - PAGESIZE bytes (under init_lock when SMP), then
+ * one int every 64 bytes across MIN(BUFFER_SIZE - PAGESIZE, L2_SIZE) bytes.
+ * Presumably this faults the pages in and warms the cache (the enclosing
+ * guard is NO_WARMUP) -- confirm.
+ *
+ * Only sa is used; arg, range_m, range_n, sb and pos are ignored.  The
+ * signature presumably matches what the queue's routine field expects.
+ * The body is compiled out on ARCH_POWER and ARCH_SPARC, and does nothing
+ * when hot_alloc == 2.
+ *
+ * NOTE(review): gotoblas_memory_init() already adds GEMM_OFFSET_A to the
+ * pointer it passes down, so on that path the offset is applied twice --
+ * confirm this is intended.
+ */
+static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n,
+                          void *sa, void *sb, BLASLONG pos) {
+
+#if !defined(ARCH_POWER) && !defined(ARCH_SPARC)
+
+  size_t size;
+  BLASULONG buffer;
+
+  size = BUFFER_SIZE - PAGESIZE;
+  buffer = (BLASULONG)sa + GEMM_OFFSET_A;
+
+  /* This condition repeats the one already guarding the whole warm-up
+     section, so it is always true here. */
+#if defined(OS_LINUX) && !defined(NO_WARMUP)
+  if (hot_alloc != 2) {
+#endif
+
+#ifdef SMP
+  LOCK_COMMAND(&init_lock);
+#endif
+
+  /* One store per page.  size is unsigned, so the loop only terminates if
+     BUFFER_SIZE - PAGESIZE is a multiple of PAGESIZE. */
+  while (size > 0) {
+    *(int *)buffer = size;
+    buffer += PAGESIZE;
+    size -= PAGESIZE;
+  }
+
+#ifdef SMP
+  UNLOCK_COMMAND(&init_lock);
+#endif
+
+  size = MIN((BUFFER_SIZE - PAGESIZE), L2_SIZE);
+  buffer = (BLASULONG)sa + GEMM_OFFSET_A;
+
+  /* Second pass from the start of the buffer in 64-byte steps (presumably
+     one cache line); same unsigned-termination caveat with a step of 64. */
+  while (size > 0) {
+    *(int *)buffer = size;
+    buffer += 64;
+    size -= 64;
+  }
+
+#if defined(OS_LINUX) && !defined(NO_WARMUP)
+  }
+#endif
+
+#endif
+}
+
+#ifdef SMP
+
+/*
+ * Build a chain of blas_num_threads queue entries that each run
+ * _touch_memory(), give the first entry `buffer` as its sa, and hand the
+ * chain to exec_blas().
+ *
+ * NOTE(review): assumes 1 <= blas_num_threads <= MAX_CPU_NUMBER; neither
+ * bound is checked here (0 would make the write below index queue[-1]).
+ */
+static void _init_thread_memory(void *buffer) {
+
+  blas_queue_t queue[MAX_CPU_NUMBER];
+  int num_cpu;
+
+  for (num_cpu = 0; num_cpu < blas_num_threads; num_cpu++) {
+
+    blas_queue_init(&queue[num_cpu]);
+    queue[num_cpu].mode    = BLAS_DOUBLE | BLAS_REAL;
+    queue[num_cpu].routine = &_touch_memory;
+    queue[num_cpu].args    = NULL;
+    queue[num_cpu].next    = &queue[num_cpu + 1];
+  }
+
+  /* Terminate the chain at the last entry that was filled in. */
+  queue[num_cpu - 1].next = NULL;
+  queue[0].sa = buffer;
+
+  exec_blas(num_cpu, queue);
+
+}
+#endif
+
+/*
+ * Warm-up performed from gotoblas_init(): sets hot_alloc to 1, takes one
+ * buffer from blas_memory_alloc(), touches it (through the thread queue when
+ * SMP, directly otherwise) and gives it back with blas_memory_free().
+ *
+ * NOTE(review): the result of blas_memory_alloc() is not checked, although
+ * its error path returns NULL.
+ */
+static void gotoblas_memory_init(void) {
+
+  void *buffer;
+
+  hot_alloc = 1;
+
+  buffer = (void *)blas_memory_alloc(0);
+
+#ifdef SMP
+  /* The thread count (and, with SMP_SERVER, the worker threads) must exist
+     before work can be queued. */
+  if (blas_cpu_number == 0) blas_get_cpu_number();
+#ifdef SMP_SERVER
+  if (blas_server_avail == 0) blas_thread_init();
+#endif
+
+  _init_thread_memory((void *)((BLASULONG)buffer + GEMM_OFFSET_A));
+
+#else
+
+  _touch_memory(NULL, NULL, NULL, (void *)((BLASULONG)buffer + GEMM_OFFSET_A), NULL, 0);
+
+#endif
+
+  blas_memory_free(buffer);
+}
+#endif
+
+/* Library-wide initialization; this must run before main(). */
+
+static int 
gotoblas_initialized = 0;
+/* gotoblas_initialized is 1 between a completed gotoblas_init() and the
+   next gotoblas_quit(); it makes both calls safe to repeat. */
+extern void openblas_read_env();
+
+/*
+ * Library start-up, declared with the CONSTRUCTOR attribute macro.
+ * Returns immediately if initialization has already completed.  Otherwise
+ * it runs the steps below in order, each one compiled in only for the
+ * configurations named by its guard, and finally sets gotoblas_initialized.
+ */
+void CONSTRUCTOR gotoblas_init(void) {
+
+  if (gotoblas_initialized) return;
+
+#ifdef SMP
+  openblas_fork_handler();
+#endif
+
+  openblas_read_env();
+
+  /* With PROFILE, moncontrol(0) here and moncontrol(1) at the end bracket
+     the initialization (presumably pausing profiling -- confirm). */
+#ifdef PROFILE
+  moncontrol (0);
+#endif
+
+#ifdef DYNAMIC_ARCH
+  gotoblas_dynamic_init();
+#endif
+
+#if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY)
+  gotoblas_affinity_init();
+#endif
+
+#if defined(OS_LINUX) && !defined(NO_WARMUP)
+  gotoblas_memory_init();
+#endif
+
+  /* Disabled code: would raise the soft stack limit to the hard limit. */
+//#if defined(OS_LINUX)
+#if 0
+  struct rlimit curlimit;
+  if ( getrlimit(RLIMIT_STACK, &curlimit ) == 0 )
+  {
+    if ( curlimit.rlim_cur != curlimit.rlim_max )
+    {
+      curlimit.rlim_cur = curlimit.rlim_max;
+      setrlimit(RLIMIT_STACK, &curlimit);
+    }
+  }
+#endif
+
+#ifdef SMP
+  /* Set up the thread count and, with SMP_SERVER, the worker threads if
+     that has not been done yet. */
+  if (blas_cpu_number == 0) blas_get_cpu_number();
+#ifdef SMP_SERVER
+  if (blas_server_avail == 0) blas_thread_init();
+#endif
+#endif
+
+#ifdef FUNCTION_PROFILE
+  gotoblas_profile_init();
+#endif
+
+  gotoblas_initialized = 1;
+
+#ifdef PROFILE
+  moncontrol (1);
+#endif
+
+}
+
+/*
+ * Library tear-down, declared with the DESTRUCTOR attribute macro.
+ * Does nothing unless gotoblas_init() has completed.  Calls blas_shutdown()
+ * first, then the profile / affinity / dynamic-arch quit hooks that are
+ * compiled in, and clears gotoblas_initialized.
+ */
+void DESTRUCTOR gotoblas_quit(void) {
+
+  if (gotoblas_initialized == 0) return;
+
+  blas_shutdown();
+
+#ifdef PROFILE
+  moncontrol (0);
+#endif
+
+#ifdef FUNCTION_PROFILE
+  gotoblas_profile_quit();
+#endif
+
+#if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY)
+  gotoblas_affinity_quit();
+#endif
+
+#ifdef DYNAMIC_ARCH
+  gotoblas_dynamic_quit();
+#endif
+
+  gotoblas_initialized = 0;
+
+#ifdef PROFILE
+  moncontrol (1);
+#endif
+}
+
+#if defined(_MSC_VER) && !defined(__clang__)
+/*
+ * MSVC-only entry point: runs gotoblas_init() on process attach and
+ * gotoblas_quit() on process detach; thread attach/detach are ignored.
+ * Always returns TRUE.
+ */
+BOOL APIENTRY DllMain(HMODULE hModule, DWORD ul_reason_for_call, LPVOID lpReserved)
+{
+  switch (ul_reason_for_call)
+  {
+    case DLL_PROCESS_ATTACH:
+      gotoblas_init();
+      break;
+    case DLL_THREAD_ATTACH:
+      break;
+    case DLL_THREAD_DETACH:
+      break;
+    case DLL_PROCESS_DETACH:
+      gotoblas_quit();
+      break;
+    default:
+      break;
+  }
+  return TRUE;
+}
+
+/*
+  This is to allow static linking.
+  Code adapted from Google performance tools:
+  https://gperftools.googlecode.com/git-history/perftools-1.0/src/windows/port.cc
+  Reference:
+  https://sourceware.org/ml/pthreads-win32/2008/msg00028.html
+  http://ci.boost.org/svn-trac/browser/trunk/libs/thread/src/win32/tss_pe.cpp
+*/
+/* Termination hook for the static-link case: runs gotoblas_quit() and
+   returns 0. */
+static int on_process_term(void)
+{
+  gotoblas_quit();
+  return 0;
+}
+/* Force the linker to keep the TLS directory symbol (the name differs
+   between 64-bit and 32-bit builds). */
+#ifdef _WIN64
+#pragma comment(linker, "/INCLUDE:_tls_used")
+#else
+#pragma comment(linker, "/INCLUDE:__tls_used")
+#endif
+
+/* Place a pointer to DllMain in the .CRT$XLB section.  Presumably the
+   loader then calls it as a TLS callback with the same reason codes a DLL
+   would receive -- see the references above.
+   NOTE(review): the pointer type here returns void and takes HINSTANCE,
+   while DllMain returns BOOL and takes HMODULE -- confirm this mismatch is
+   acceptable. */
+#ifdef _WIN64
+#pragma const_seg(".CRT$XLB")
+#else
+#pragma data_seg(".CRT$XLB")
+#endif
+static void (APIENTRY *dll_callback)(HINSTANCE h, DWORD ul_reason_for_call, PVOID pv) = DllMain;
+#ifdef _WIN64
+#pragma const_seg()
+#else
+#pragma data_seg()
+#endif
+
+/* Place a pointer to on_process_term in the .CRT$XTU section.  Presumably
+   the C runtime walks this section at process termination -- confirm. */
+#ifdef _WIN64
+#pragma const_seg(".CRT$XTU")
+#else
+#pragma data_seg(".CRT$XTU")
+#endif
+static int(*p_process_term)(void) = on_process_term;
+#ifdef _WIN64
+#pragma const_seg()
+#else
+#pragma data_seg()
+#endif
+#endif
+
+#if (defined(C_PGI) || (!defined(C_SUN) && defined(F_INTERFACE_SUN))) && (defined(ARCH_X86) || defined(ARCH_X86_64))
+/* Do not call this function; it exists only as a workaround for a
+   PGI / Sun toolchain bug.  Its body references gotoblas_init() and
+   gotoblas_quit(), and the active asm statements emit calls to them into
+   the .init and .fini sections. */
+void gotoblas_dummy_for_PGI(void) {
+
+  gotoblas_init();
+  gotoblas_quit();
+
+  /* The disabled variant registered the two functions through
+     .ctors/.dtors instead. */
+#if 0
+  asm ("\t.section\t.ctors,\"aw\",@progbits; .align 8; .quad gotoblas_init; .section .text");
+  asm ("\t.section\t.dtors,\"aw\",@progbits; .align 8; .quad gotoblas_quit; .section .text");
+#else
+  asm (".section .init,\"ax\"; call gotoblas_init@PLT; .section .text");
+  asm (".section .fini,\"ax\"; call gotoblas_quit@PLT; .section .text");
+#endif
+}
+#endif
+
+/* Closes a conditional opened earlier in the file (outside this excerpt). */
+#endif
-- 2.7.4