When NUM_THREADS (MAX_CPU_NUMBER) is very large, e.g. 256, consider:
typedef struct {
volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
} job_t;
job_t job[MAX_CPU_NUMBER];
The size of the job array is then 8MB.
Thus, we use malloc instead of stack allocation.
#define SWITCH_RATIO 2
#endif
+//The array of job_t may overflow the stack.
+//Instead, use malloc to alloc job_t.
+#if MAX_CPU_NUMBER > 210
+#define USE_ALLOC_HEAP
+#endif
+
#ifndef GEMM3M_LOCAL
#if defined(NN)
#define GEMM3M_LOCAL GEMM3M_NN
BLASLONG range_M[MAX_CPU_NUMBER + 1];
BLASLONG range_N[MAX_CPU_NUMBER + 1];
- job_t job[MAX_CPU_NUMBER];
+#ifndef USE_ALLOC_HEAP
+ job_t job[MAX_CPU_NUMBER];
+#else
+ job_t * job = NULL;
+#endif
BLASLONG num_cpu_m, num_cpu_n;
newarg.alpha = args -> alpha;
newarg.beta = args -> beta;
newarg.nthreads = args -> nthreads;
+
+#ifdef USE_ALLOC_HEAP
+ job = (job_t*)malloc(MAX_CPU_NUMBER * sizeof(job_t));
+ if(job==NULL){
+ fprintf(stderr, "OpenBLAS: malloc failed in %s\n", __func__);
+ exit(1);
+ }
+#endif
+
newarg.common = (void *)job;
if (!range_m) {
exec_blas(num_cpu_m, queue);
}
+#ifdef USE_ALLOC_HEAP
+ free(job);
+#endif
+
return 0;
}
#define SWITCH_RATIO 2
#endif
+//The array of job_t may overflow the stack.
+//Instead, use malloc to alloc job_t.
+#if MAX_CPU_NUMBER > 210
+#define USE_ALLOC_HEAP
+#endif
+
#ifndef SYRK_LOCAL
#if !defined(LOWER) && !defined(TRANS)
#define SYRK_LOCAL SYRK_UN
blas_arg_t newarg;
+#ifndef USE_ALLOC_HEAP
job_t job[MAX_CPU_NUMBER];
+#else
+ job_t * job = NULL;
+#endif
+
blas_queue_t queue[MAX_CPU_NUMBER];
BLASLONG range[MAX_CPU_NUMBER + 100];
newarg.ldc = args -> ldc;
newarg.alpha = args -> alpha;
newarg.beta = args -> beta;
+
+#ifdef USE_ALLOC_HEAP
+ job = (job_t*)malloc(MAX_CPU_NUMBER * sizeof(job_t));
+ if(job==NULL){
+ fprintf(stderr, "OpenBLAS: malloc failed in %s\n", __func__);
+ exit(1);
+ }
+#endif
+
newarg.common = (void *)job;
if (!range_n) {
exec_blas(num_cpu, queue);
}
+#ifdef USE_ALLOC_HEAP
+ free(job);
+#endif
return 0;
}
#define SWITCH_RATIO 2
#endif
+//The array of job_t may overflow the stack.
+//Instead, use malloc to alloc job_t.
+#if MAX_CPU_NUMBER > 210
+#define USE_ALLOC_HEAP
+#endif
+
#ifndef GEMM_LOCAL
#if defined(NN)
#define GEMM_LOCAL GEMM_NN
blas_arg_t newarg;
+#ifndef USE_ALLOC_HEAP
job_t job[MAX_CPU_NUMBER];
+#else
+ job_t * job = NULL;
+#endif
+
blas_queue_t queue[MAX_CPU_NUMBER];
BLASLONG range_M[MAX_CPU_NUMBER + 1];
newarg.alpha = args -> alpha;
newarg.beta = args -> beta;
newarg.nthreads = args -> nthreads;
+
+#ifdef USE_ALLOC_HEAP
+ job = (job_t*)malloc(MAX_CPU_NUMBER * sizeof(job_t));
+ if(job==NULL){
+ fprintf(stderr, "OpenBLAS: malloc failed in %s\n", __func__);
+ exit(1);
+ }
+#endif
+
newarg.common = (void *)job;
#ifdef PARAMTEST
exec_blas(num_cpu_m, queue);
}
+#ifdef USE_ALLOC_HEAP
+ free(job);
+#endif
+
return 0;
}
}
return nums;
}
-
+/*
void set_stack_limit(int limitMB){
int result=0;
struct rlimit rl;
}
}
}
+*/
#endif
/*
#endif
#ifdef DYNAMIC_ARCH
-#if defined(SMP) && defined(OS_DARWIN) && MAX_CPU_NUMBER > 128
- //Set stack limit to 16MB on Mac OS X
- //when NUM_THREADS>128 and DYNAMIC_ARCH=1.
- //Prevent the SEGFAULT bug.
- set_stack_limit(16);
-#endif
gotoblas_dynamic_init();
#endif
double sqrt(double);
+//getrf_parallel is recursive, so a job_t array on the stack may overflow it.
+//Instead, use malloc to allocate the job_t array.
+#if MAX_CPU_NUMBER > 90
+#define USE_ALLOC_HEAP
+#endif
+
#ifndef CACHE_LINE_SIZE
#define CACHE_LINE_SIZE 8
#endif
BLASLONG range_M[MAX_CPU_NUMBER + 1];
BLASLONG range_N[MAX_CPU_NUMBER + 1];
+#ifndef USE_ALLOC_HEAP
job_t job[MAX_CPU_NUMBER];
+#else
+ job_t * job=NULL;
+#endif
BLASLONG width, nn, mm;
BLASLONG i, j, k, is, bk;
newarg.c = ipiv;
newarg.lda = lda;
- newarg.common = (void *)job;
info = 0;
if (iinfo && !info) info = iinfo;
+#ifdef USE_ALLOC_HEAP
+ job = (job_t*)malloc(MAX_CPU_NUMBER * sizeof(job_t));
+ if(job==NULL){
+ fprintf(stderr, "OpenBLAS: malloc failed in %s\n", __func__);
+ exit(1);
+ }
+#endif
+
+ newarg.common = (void *)job;
+
TRSM_ILTCOPY(bk, bk, a, lda, 0, sb);
sbb = (FLOAT *)((((long)(sb + bk * bk * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B);
is += bk;
}
+#ifdef USE_ALLOC_HEAP
+ free(job);
+#endif
+
return info;
}
#ifndef USE_SIMPLE_THREADED_LEVEL3
+//The array of job_t may overflow the stack.
+//Instead, use malloc to alloc job_t.
+#if MAX_CPU_NUMBER > 210
+#define USE_ALLOC_HEAP
+#endif
+
+
static FLOAT dm1 = -1.;
#ifndef KERNEL_FUNC
blas_arg_t newarg;
+#ifndef USE_ALLOC_HEAP
job_t job[MAX_CPU_NUMBER];
+#else
+ job_t * job = NULL;
+#endif
+
blas_queue_t queue[MAX_CPU_NUMBER];
BLASLONG range[MAX_CPU_NUMBER + 100];
newarg.c = args -> c;
newarg.lda = args -> lda;
newarg.alpha = args -> alpha;
+
+#ifdef USE_ALLOC_HEAP
+ job = (job_t*)malloc(MAX_CPU_NUMBER * sizeof(job_t));
+ if(job==NULL){
+ fprintf(stderr, "OpenBLAS: malloc failed in %s\n", __func__);
+ exit(1);
+ }
+#endif
+
newarg.common = (void *)job;
n_from = 0;
exec_blas(num_cpu, queue);
}
+#ifdef USE_ALLOC_HEAP
+ free(job);
+#endif
+
return 0;
}