Adding performance patch for trmm, just like #2836
author: fossum <fossum@us.ibm.com>
Tue, 15 Sep 2020 13:59:50 +0000 (08:59 -0500)
committer: fossum <fossum@us.ibm.com>
Tue, 15 Sep 2020 13:59:50 +0000 (08:59 -0500)
driver/level3/trmm_L.c
driver/level3/trmm_R.c

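diff --git a/driver/level3/trmm_L.c b/driver/level3/trmm_L.c
index 1027c0c..ae8435d 100644
--- a/driver/level3/trmm_L.c
+++ b/driver/level3/trmm_L.c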
@@ -139,7 +139,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
       /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
       if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
 #else
-      if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
+      if (min_jj >= GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
       else
        if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
 #endif
@@ -209,7 +209,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
        /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
        if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
 #else
-        if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
+        if (min_jj >= GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
         else
          if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
 #endif
@@ -304,7 +304,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
       /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
       if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
 #else
-      if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
+      if (min_jj >= GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
       else
         if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
 #endif
@@ -374,7 +374,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
        /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
        if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
 #else
-        if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
+        if (min_jj >= GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
         else
          if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
 #endif
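diff --git a/driver/level3/trmm_R.c b/driver/level3/trmm_R.c
index e8df7fb..3be43ed 100644
--- a/driver/level3/trmm_R.c
+++ b/driver/level3/trmm_R.c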
@@ -126,7 +126,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
        /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
        if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
 #else
-       if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
+       if (min_jj >= GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
        else
          if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
 #endif
@@ -150,7 +150,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
        /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
        if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
 #else
-       if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
+       if (min_jj >= GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
        else
          if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
 #endif
@@ -207,7 +207,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
        /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
        if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
 #else
-       if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
+       if (min_jj >= GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
        else
          if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
 #endif
@@ -262,7 +262,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
        /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
        if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
 #else
-       if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
+       if (min_jj >= GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
        else
          if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
 #endif
@@ -287,7 +287,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
        /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
        if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
 #else
-       if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
+       if (min_jj >= GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
        else
          if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
 #endif
@@ -348,7 +348,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
        /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
        if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
 #else
-       if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
+       if (min_jj >= GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
        else
          if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
 #endif