prepared driver/level3 functions for UNROLL values that are not a power of two
author Werner Saar <wernsaar@googlemail.com>
Mon, 9 Jan 2017 09:38:15 +0000 (10:38 +0100)
committer Werner Saar <wernsaar@googlemail.com>
Mon, 9 Jan 2017 09:38:15 +0000 (10:38 +0100)
driver/level3/gemm3m_level3.c
driver/level3/level3.c
driver/level3/level3_gemm3m_thread.c
driver/level3/level3_syr2k.c
driver/level3/level3_syrk.c
driver/level3/level3_syrk_threaded.c
driver/level3/level3_thread.c
driver/level3/syrk_kernel.c
driver/level3/zher2k_kernel.c
driver/level3/zherk_kernel.c

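diff --git a/driver/level3/gemm3m_level3.c b/driver/level3/gemm3m_level3.c
index 0649682..bbde7e5 100644
--- a/driver/level3/gemm3m_level3.c
+++ b/driver/level3/gemm3m_level3.c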
@@ -316,7 +316,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
        if (min_l > GEMM3M_Q) {
          min_l = (min_l + 1) / 2;
 #ifdef UNROLL_X
-         min_l = (min_l + UNROLL_X - 1) & ~(UNROLL_X - 1);
+         min_l = ((min_l + UNROLL_X - 1)/UNROLL_X) * UNROLL_X;
 #endif
        }
       }
@@ -326,7 +326,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
        min_i = GEMM3M_P;
       } else {
        if (min_i > GEMM3M_P) {
-         min_i = (min_i / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1);
+         min_i = ((min_i / 2 + GEMM3M_UNROLL_M - 1)/GEMM3M_UNROLL_M) * GEMM3M_UNROLL_M;
        }
       }
 
@@ -365,7 +365,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
          min_i = GEMM3M_P;
        } else
          if (min_i > GEMM3M_P) {
-           min_i = (min_i / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1);
+           min_i = ((min_i / 2 + GEMM3M_UNROLL_M - 1)/GEMM3M_UNROLL_M) * GEMM3M_UNROLL_M;
          }
 
        START_RPCC();
@@ -386,7 +386,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
        min_i = GEMM3M_P;
       } else {
        if (min_i > GEMM3M_P) {
-         min_i = (min_i / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1);
+         min_i = ((min_i / 2 + GEMM3M_UNROLL_M - 1)/GEMM3M_UNROLL_M) * GEMM3M_UNROLL_M;
        }
       }
 
@@ -429,7 +429,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
          min_i = GEMM3M_P;
        } else
          if (min_i > GEMM3M_P) {
-           min_i = (min_i / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1);
+           min_i = ((min_i / 2 + GEMM3M_UNROLL_M - 1)/GEMM3M_UNROLL_M) * GEMM3M_UNROLL_M;
          }
 
        START_RPCC();
@@ -451,7 +451,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
        min_i = GEMM3M_P;
       } else {
        if (min_i > GEMM3M_P) {
-         min_i = (min_i / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1);
+         min_i = ((min_i / 2 + GEMM3M_UNROLL_M - 1)/GEMM3M_UNROLL_M) * GEMM3M_UNROLL_M;
        }
       }
 
@@ -494,7 +494,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
          min_i = GEMM3M_P;
        } else
          if (min_i > GEMM3M_P) {
-           min_i = (min_i / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1);
+           min_i = ((min_i / 2 + GEMM3M_UNROLL_M - 1)/GEMM3M_UNROLL_M) * GEMM3M_UNROLL_M;
          }
 
        START_RPCC();
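diff --git a/driver/level3/level3.c b/driver/level3/level3.c
index 1ede8a2..0ee189a 100644
--- a/driver/level3/level3.c
+++ b/driver/level3/level3.c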
@@ -297,9 +297,9 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
        min_l  = GEMM_Q;
       } else {
        if (min_l > GEMM_Q) {
-         min_l = (min_l / 2 + GEMM_UNROLL_M - 1) & ~(GEMM_UNROLL_M - 1);
+         min_l = ((min_l / 2 + GEMM_UNROLL_M - 1)/GEMM_UNROLL_M) * GEMM_UNROLL_M;
        }
-       gemm_p = ((l2size / min_l + GEMM_UNROLL_M - 1) & ~(GEMM_UNROLL_M - 1));
+       gemm_p = ((l2size / min_l + GEMM_UNROLL_M - 1)/GEMM_UNROLL_M) * GEMM_UNROLL_M;
        while (gemm_p * min_l > l2size) gemm_p -= GEMM_UNROLL_M;
       }
 
@@ -311,7 +311,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
        min_i = GEMM_P;
       } else {
        if (min_i > GEMM_P) {
-         min_i = (min_i / 2 + GEMM_UNROLL_M - 1) & ~(GEMM_UNROLL_M - 1);
+         min_i = ((min_i / 2 + GEMM_UNROLL_M - 1)/GEMM_UNROLL_M) * GEMM_UNROLL_M;
        } else {
          l1stride = 0;
        }
@@ -369,7 +369,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
          min_i = GEMM_P;
        } else
          if (min_i > GEMM_P) {
-           min_i = (min_i / 2 + GEMM_UNROLL_M - 1) & ~(GEMM_UNROLL_M - 1);
+           min_i = ((min_i / 2 + GEMM_UNROLL_M - 1)/GEMM_UNROLL_M) * GEMM_UNROLL_M;
          }
 
        START_RPCC();
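diff --git a/driver/level3/level3_gemm3m_thread.c b/driver/level3/level3_gemm3m_thread.c
index 02bf57e..3400666 100644
--- a/driver/level3/level3_gemm3m_thread.c
+++ b/driver/level3/level3_gemm3m_thread.c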
@@ -365,7 +365,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
 
   buffer[0] = sb;
   for (i = 1; i < DIVIDE_RATE; i++) {
-    buffer[i] = buffer[i - 1] + GEMM3M_Q * ((div_n + GEMM3M_UNROLL_N - 1) & ~(GEMM3M_UNROLL_N - 1));
+    buffer[i] = buffer[i - 1] + GEMM3M_Q * (((div_n + GEMM3M_UNROLL_N - 1)/GEMM3M_UNROLL_N) * GEMM3M_UNROLL_N);
   }
 
   for(ls = 0; ls < k; ls += min_l){
@@ -384,7 +384,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
       min_i = GEMM3M_P;
     } else {
       if (min_i > GEMM3M_P) {
-       min_i = (min_i / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1);
+       min_i = ((min_i / 2 + GEMM3M_UNROLL_M - 1)/GEMM3M_UNROLL_M) * GEMM3M_UNROLL_M;
       }
     }
 
@@ -482,7 +482,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
        min_i = GEMM3M_P;
       } else
        if (min_i > GEMM3M_P) {
-         min_i = ((min_i + 1) / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1);
+         min_i = (((min_i + 1) / 2 + GEMM3M_UNROLL_M - 1)/GEMM3M_UNROLL_M) * GEMM3M_UNROLL_M;
        }
 
       START_RPCC();
@@ -618,7 +618,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
        min_i = GEMM3M_P;
       } else
        if (min_i > GEMM3M_P) {
-         min_i = ((min_i + 1) / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1);
+         min_i = (((min_i + 1) / 2 + GEMM3M_UNROLL_M - 1)/GEMM3M_UNROLL_M) * GEMM3M_UNROLL_M;
        }
 
       START_RPCC();
@@ -754,7 +754,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
        min_i = GEMM3M_P;
       } else
        if (min_i > GEMM3M_P) {
-         min_i = ((min_i + 1) / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1);
+         min_i = (((min_i + 1) / 2 + GEMM3M_UNROLL_M - 1)/GEMM3M_UNROLL_M) * GEMM3M_UNROLL_M;
        }
 
       START_RPCC();
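diff --git a/driver/level3/level3_syr2k.c b/driver/level3/level3_syr2k.c
index a75d379..8bdd921 100644
--- a/driver/level3/level3_syr2k.c
+++ b/driver/level3/level3_syr2k.c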
@@ -189,7 +189,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
        min_i = GEMM_P;
       } else
        if (min_i > GEMM_P) {
-         min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
+         min_i = ((min_i / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
        }
 
 #ifndef LOWER
@@ -230,7 +230,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
          min_i = GEMM_P;
        } else
          if (min_i > GEMM_P) {
-           min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
+           min_i = ((min_i / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
          }
 
        ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa);
@@ -245,7 +245,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
        min_i = GEMM_P;
       } else
        if (min_i > GEMM_P) {
-         min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
+         min_i = ((min_i / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
        }
 
       if (m_start >= js) {
@@ -284,7 +284,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
          min_i = GEMM_P;
        } else
          if (min_i > GEMM_P) {
-           min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
+           min_i = ((min_i / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
          }
 
        ICOPY_OPERATION(min_l, min_i, b, ldb, ls, is, sa);
@@ -322,7 +322,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
          min_i = GEMM_P;
        } else
          if (min_i > GEMM_P) {
-           min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
+           min_i = ((min_i / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
          }
 
        aa = sb + min_l * (is - js) * COMPSIZE;
@@ -353,7 +353,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
        min_i = GEMM_P;
       } else
        if (min_i > GEMM_P) {
-         min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
+         min_i = ((min_i / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
        }
 
       aa = sb + min_l * (m_start - js) * COMPSIZE;
@@ -383,7 +383,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
          min_i = GEMM_P;
        } else
          if (min_i > GEMM_P) {
-           min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
+           min_i = ((min_i / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
          }
 
        aa = sb + min_l * (is - js) * COMPSIZE;
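diff --git a/driver/level3/level3_syrk.c b/driver/level3/level3_syrk.c
index ba544a0..f3202eb 100644
--- a/driver/level3/level3_syrk.c
+++ b/driver/level3/level3_syrk.c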
@@ -198,7 +198,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
        min_i = GEMM_P;
       } else
        if (min_i > GEMM_P) {
-         min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
+         min_i = ((min_i / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
        }
 
 #ifndef LOWER
@@ -239,7 +239,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
            min_i = GEMM_P;
          } else
            if (min_i > GEMM_P) {
-             min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
+             min_i = ((min_i / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
            }
 
          aa = sb + min_l * (is - js)  * COMPSIZE;
@@ -303,7 +303,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
            min_i = GEMM_P;
          } else
            if (min_i > GEMM_P) {
-             min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
+             min_i = ((min_i / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
            }
 
          START_RPCC();
@@ -375,7 +375,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
            min_i = GEMM_P;
          } else
            if (min_i > GEMM_P) {
-             min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
+             min_i = ((min_i / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
            }
 
          if (is  < js + min_j) {
@@ -460,7 +460,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
            min_i = GEMM_P;
          } else
            if (min_i > GEMM_P) {
-             min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
+             min_i = ((min_i / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
            }
 
          START_RPCC();
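diff --git a/driver/level3/level3_syrk_threaded.c b/driver/level3/level3_syrk_threaded.c
index 5119baa..6673289 100644
--- a/driver/level3/level3_syrk_threaded.c
+++ b/driver/level3/level3_syrk_threaded.c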
@@ -210,8 +210,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
   fprintf(stderr, "Thread[%ld]  m_from : %ld m_to : %ld n_from : %ld n_to : %ld\n",  mypos, m_from, m_to, n_from, n_to);
 #endif
 
-  div_n = ((m_to - m_from + DIVIDE_RATE - 1) / DIVIDE_RATE
-                                   + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
+  div_n = (((m_to - m_from + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
 
   buffer[0] = sb;
   for (i = 1; i < DIVIDE_RATE; i++) {
@@ -233,7 +232,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
       min_i = GEMM_P;
     } else {
       if (min_i > GEMM_P) {
-       min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
+       min_i = ((min_i / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
       }
     }
 
@@ -253,8 +252,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
 
     STOP_RPCC(copy_A);
 
-    div_n = ((m_to - m_from + DIVIDE_RATE - 1) / DIVIDE_RATE
-                                     + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
+    div_n = (((m_to - m_from + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
 
     for (xxx = m_from, bufferside = 0; xxx < m_to; xxx += div_n, bufferside ++) {
 
@@ -353,9 +351,8 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
     while (current >= 0) {
 #endif
 
-       div_n = ((range_n[current + 1]  - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE
-                + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
-
+       div_n = (((range_n[current + 1]  - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
        for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {
 
          START_RPCC();
@@ -412,7 +409,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
        min_i = GEMM_P;
       } else
        if (min_i > GEMM_P) {
-         min_i = ((min_i + 1) / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
+         min_i = (((min_i + 1) / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
        }
 
       START_RPCC();
@@ -425,8 +422,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
 
       do {
 
-       div_n = ((range_n[current + 1]  - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE
-                                                                    + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
+       div_n = (((range_n[current + 1]  - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
 
        for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {
 
@@ -602,9 +598,9 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
 
       double di   = (double)i;
 
-      width = (((BLASLONG)(sqrt(di * di + dnum) - di) + mask) & ~mask);
+      width = (((BLASLONG)((sqrt(di * di + dnum) - di) + mask)/(mask+1)) * (mask+1) );
 
-      if (num_cpu == 0) width = n - ((n - width) & ~mask);
+      if (num_cpu == 0) width = n - (((n - width)/(mask+1)) * (mask+1) );
 
       if ((width > n - i) || (width < mask)) width = n - i;
 
@@ -644,7 +640,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
 
        double di   = (double)i;
 
-       width = (((BLASLONG)(sqrt(di * di + dnum) - di) + mask) & ~mask);
+       width = (((BLASLONG)((sqrt(di * di + dnum) - di) + mask)/(mask+1)) * (mask+1));
 
       if ((width > n - i) || (width < mask)) width = n - i;
 
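diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c
index 0382743..fec873e 100644
--- a/driver/level3/level3_thread.c
+++ b/driver/level3/level3_thread.c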
@@ -310,7 +310,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
 
   buffer[0] = sb;
   for (i = 1; i < DIVIDE_RATE; i++) {
-    buffer[i] = buffer[i - 1] + GEMM_Q * ((div_n + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1)) * COMPSIZE;
+    buffer[i] = buffer[i - 1] + GEMM_Q * ((div_n + GEMM_UNROLL_N - 1)/GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE;
   }
 
 
@@ -331,7 +331,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
       min_i = GEMM_P;
     } else {
       if (min_i > GEMM_P) {
-       min_i = (min_i / 2 + GEMM_UNROLL_M - 1) & ~(GEMM_UNROLL_M - 1);
+       min_i = ((min_i / 2 + GEMM_UNROLL_M - 1)/GEMM_UNROLL_M) * GEMM_UNROLL_M;
       } else {
        if (args -> nthreads == 1) l1stride = 0;
       }
@@ -443,7 +443,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
        min_i = GEMM_P;
       } else
        if (min_i > GEMM_P) {
-         min_i = ((min_i + 1) / 2 + GEMM_UNROLL_M - 1) & ~(GEMM_UNROLL_M - 1);
+         min_i = (((min_i + 1) / 2 + GEMM_UNROLL_M - 1)/GEMM_UNROLL_M) * GEMM_UNROLL_M;
        }
 
       START_RPCC();
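diff --git a/driver/level3/syrk_kernel.c b/driver/level3/syrk_kernel.c
index 434d2f6..6f224d0 100644
--- a/driver/level3/syrk_kernel.c
+++ b/driver/level3/syrk_kernel.c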
@@ -158,7 +158,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r,
 
     int mm, nn;
 
-    mm = (loop & ~(GEMM_UNROLL_MN - 1));
+    mm = (loop/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
     nn = MIN(GEMM_UNROLL_MN, n - loop);
 
 #ifndef LOWER
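diff --git a/driver/level3/zher2k_kernel.c b/driver/level3/zher2k_kernel.c
index 92aef88..f67e9bd 100644
--- a/driver/level3/zher2k_kernel.c
+++ b/driver/level3/zher2k_kernel.c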
@@ -149,7 +149,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i,
 
     int mm, nn;
 
-    mm = (loop & ~(GEMM_UNROLL_MN - 1));
+    mm = (loop/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
     nn = MIN(GEMM_UNROLL_MN, n - loop);
 
 #ifndef LOWER
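diff --git a/driver/level3/zherk_kernel.c b/driver/level3/zherk_kernel.c
index e4c9e27..cebcc16 100644
--- a/driver/level3/zherk_kernel.c
+++ b/driver/level3/zherk_kernel.c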
@@ -132,7 +132,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r,
 
     int mm, nn;
 
-    mm = (loop & ~(GEMM_UNROLL_MN - 1));
+    mm = (loop/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
     nn = MIN(GEMM_UNROLL_MN, n - loop);
 
 #ifndef LOWER