Rewrite &= -> = and simplify the initial blocking phase.

author Craig Donner <cdonner@google.com>

Mon, 25 Jun 2018 12:53:11 +0000 (13:53 +0100)

committer Craig Donner <cdonner@google.com>

Mon, 25 Jun 2018 14:08:55 +0000 (15:08 +0100)
author Craig Donner <cdonner@google.com>
Mon, 25 Jun 2018 12:53:11 +0000 (13:53 +0100)
committer Craig Donner <cdonner@google.com>
Mon, 25 Jun 2018 14:08:55 +0000 (15:08 +0100)
diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c

index aeb5e6e..ee3e3b9 100644 (file)
--- a/driver/level3/level3_thread.c
+++ b/driver/level3/level3_thread.c
@@ -344,12 +344,6 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
      div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE;
      for (js = n_from, bufferside = 0; js < n_to; js += div_n, bufferside ++) {
  
-      /* Make sure if no one is using workspace */
-      START_RPCC();
-      for (i = 0; i < args -> nthreads; i++)
-       while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;};
-      STOP_RPCC(waiting1);
-
  #if defined(FUSED_GEMM) && !defined(TIMING)
  
        /* Fused operation to copy region of B into workspace and apply kernel */
@@ -387,10 +381,15 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
        }
  #endif
  
-      /* Set flag so other threads can access local region of B */
-      for (i = mypos_n * nthreads_m; i < (mypos_n + 1) * nthreads_m; i++)
+      for (i = mypos_n * nthreads_m; i < (mypos_n + 1) * nthreads_m; i++) {
+        /* Make sure if no one is using workspace */
+        START_RPCC();
+        while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;};
+        STOP_RPCC(waiting1);
+        /* Set flag so other threads can access local region of B */
          job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
-      WMB;
+        WMB;
+      }
      }
  
      /* Get regions of B from other threads and apply kernel */
@@ -426,13 +425,13 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
  
          /* Clear synchronization flag if this thread is done with other region of B */
         if (m_to - m_from == min_i) {
-         job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
+         job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
           WMB;
         }
        }
      } while (current != mypos);
  
-    /* Iterate through steps of m 
+    /* Iterate through steps of m
       * Note: First step has already been finished */
      for(is = m_from + min_i; is < m_to; is += min_i){
        min_i = m_to - is;
@@ -462,14 +461,14 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
                            sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
                            c, ldc, is, js);
            STOP_RPCC(kernel);
-          
+
  #ifdef TIMING
            ops += 2 * min_i * MIN(range_n[current + 1]  - js, div_n) * min_l;
  #endif
-          
+
            /* Clear synchronization flag if this thread is done with region of B */
            if (is + min_i >= m_to) {
-            job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
+            job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
              WMB;
            }
         }
author	Craig Donner <cdonner@google.com>
	Mon, 25 Jun 2018 12:53:11 +0000 (13:53 +0100)
committer	Craig Donner <cdonner@google.com>
	Mon, 25 Jun 2018 14:08:55 +0000 (15:08 +0100)