Update level3_thread.c
author wjc404 <52632443+wjc404@users.noreply.github.com>
Tue, 4 Feb 2020 12:33:08 +0000 (20:33 +0800)
committer GitHub <noreply@github.com>
Tue, 4 Feb 2020 12:33:08 +0000 (20:33 +0800)
driver/level3/level3_thread.c

index cfbff75..bf55844 100644 (file)
@@ -365,12 +365,16 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
       /* Split local region of B into parts */
       for(jjs = js; jjs < MIN(n_to, js + div_n); jjs += min_jj){
        min_jj = MIN(n_to, js + div_n) - jjs;
+#ifdef SKYLAKEX
+       /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
+       if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
+#else
        if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N;
        else
           if (min_jj >= 2*GEMM_UNROLL_N) min_jj = 2*GEMM_UNROLL_N;
           else
             if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
-
+#endif
         /* Copy part of local region of B into workspace */
        START_RPCC();
        OCOPY_OPERATION(min_l, min_jj, b, ldb, ls, jjs,