#define SWITCH_RATIO 2
#endif
+#ifndef GEMM_PREFERED_SIZE /* fallback, used only when no earlier definition exists */
+#define GEMM_PREFERED_SIZE 1 /* multiple handed to round_up() for the M and N chunk widths; with this default of 1, round_up() returns every width unchanged */
+#endif
+
// The array of job_t may overflow the stack.
// Instead, use malloc to allocate the job_t array.
#if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD
return 0;
}
+static int round_up(int remainder, int width, int multiple) /*
+ * Widen a partition chunk so that its size is a whole multiple of `multiple`.
+ *
+ * remainder: gemm_driver passes the part of m (or n) that has not yet been
+ *            assigned to a partition.
+ * width:     proposed size of the next chunk.
+ * multiple:  granularity to round up to; gemm_driver passes
+ *            GEMM_PREFERED_SIZE.
+ *
+ * Returns `width` unchanged when `multiple > remainder` or
+ * `width <= multiple`; otherwise, for a positive `multiple`, the smallest
+ * multiple of `multiple` that is >= `width`.  The result is not clamped to
+ * `remainder`: the call sites in gemm_driver subtract the returned width from
+ * the running total and shrink the chunk again if that total goes negative.
+ *
+ * NOTE(review): all three parameters and the return value are plain int.  If
+ * the m / n / width values at the call sites are BLASLONG and wider than int
+ * they are narrowed here -- TODO confirm against their declarations, which
+ * are outside this excerpt.
+ */
+{
+ if (multiple > remainder || width <= multiple) /* less than one full multiple left, or chunk no wider than one multiple: keep width as is */
+ return width;
+ width = (width + multiple - 1) / multiple; /* integer ceiling of width / multiple (operands are positive on this path when multiple > 0) */
+ width = width * multiple; /* scale back up: width is now a whole multiple of `multiple` */
+ return width;
+}
+
+
static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
*range_n, FLOAT *sa, FLOAT *sb,
BLASLONG nthreads_m, BLASLONG nthreads_n) {
num_parts = 0;
while (m > 0){
width = blas_quickdivide(m + nthreads_m - num_parts - 1, nthreads_m - num_parts);
+
+ width = round_up(m, width, GEMM_PREFERED_SIZE);
+
m -= width;
+
if (m < 0) width = width + m;
range_M[num_parts + 1] = range_M[num_parts] + width;
+
num_parts ++;
}
for (i = num_parts; i < MAX_CPU_NUMBER; i++) {
if (width < SWITCH_RATIO) {
width = SWITCH_RATIO;
}
+ width = round_up(n, width, GEMM_PREFERED_SIZE);
+
n -= width;
if (n < 0) width = width + n;
range_N[num_parts + 1] = range_N[num_parts] + width;
+
num_parts ++;
}
for (j = num_parts; j < MAX_CPU_NUMBER; j++) {