div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE;
for (js = n_from, bufferside = 0; js < n_to; js += div_n, bufferside ++) {
- /* Make sure if no one is using workspace */
- START_RPCC();
- for (i = 0; i < args -> nthreads; i++)
- while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;};
- STOP_RPCC(waiting1);
-
#if defined(FUSED_GEMM) && !defined(TIMING)
/* Fused operation to copy region of B into workspace and apply kernel */
}
#endif
- /* Set flag so other threads can access local region of B */
- for (i = mypos_n * nthreads_m; i < (mypos_n + 1) * nthreads_m; i++)
+ for (i = mypos_n * nthreads_m; i < (mypos_n + 1) * nthreads_m; i++) {
+ /* Make sure if no one is using workspace */
+ START_RPCC();
+ while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;};
+ STOP_RPCC(waiting1);
+ /* Set flag so other threads can access local region of B */
job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
- WMB;
+ WMB;
+ }
}
/* Get regions of B from other threads and apply kernel */
/* Clear synchronization flag if this thread is done with other region of B */
if (m_to - m_from == min_i) {
- job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
+ job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
WMB;
}
}
} while (current != mypos);
- /* Iterate through steps of m
+ /* Iterate through steps of m
* Note: First step has already been finished */
for(is = m_from + min_i; is < m_to; is += min_i){
min_i = m_to - is;
sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
c, ldc, is, js);
STOP_RPCC(kernel);
-
+
#ifdef TIMING
ops += 2 * min_i * MIN(range_n[current + 1] - js, div_n) * min_l;
#endif
-
+
/* Clear synchronization flag if this thread is done with region of B */
if (is + min_i >= m_to) {
- job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
+ job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
WMB;
}
}