#define GETRF_FACTOR 1.00
-#if defined(USE_PTHREAD_LOCK)
-static pthread_mutex_t getrf_lock = PTHREAD_MUTEX_INITIALIZER;
-#elif defined(USE_PTHREAD_SPINLOCK)
-static pthread_spinlock_t getrf_lock = 0;
+#if (__STDC_VERSION__ >= 201112L)
+#define atomic_load_long(p) __atomic_load_n(p, __ATOMIC_RELAXED)
+#define atomic_store_long(p, v) __atomic_store_n(p, v, __ATOMIC_RELAXED)
#else
-static BLASULONG getrf_lock = 0UL;
-#endif
-
-#if defined(USE_PTHREAD_LOCK)
-static pthread_mutex_t getrf_flag_lock = PTHREAD_MUTEX_INITIALIZER;
-#elif defined(USE_PTHREAD_SPINLOCK)
-static pthread_spinlock_t getrf_flag_lock = 0;
-#else
-static BLASULONG getrf_flag_lock = 0UL;
+#define atomic_load_long(p) (BLASLONG)(*(volatile BLASLONG*)(p))
+#define atomic_store_long(p, v) (*(volatile BLASLONG *)(p)) = (v)
#endif
-
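+/*
+ * Note that __atomic_load_n / __atomic_store_n are GCC/Clang builtins, not
+ * part of C11 itself, so the guard above effectively selects them whenever a
+ * GNU-compatible compiler reports C11. A minimal sketch of the same relaxed
+ * load/store pair written against standard <stdatomic.h> (this assumes the
+ * shared words were declared _Atomic BLASLONG, which this file does not do):
+ *
+ *   #include <stdatomic.h>
+ *   #define atomic_load_long(p)     atomic_load_explicit((p), memory_order_relaxed)
+ *   #define atomic_store_long(p, v) atomic_store_explicit((p), (v), memory_order_relaxed)
+ *
+ * The builtins are used here precisely because they also work on ordinary
+ * (non-_Atomic) objects such as the volatile BLASLONG arrays below.
+ */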
static __inline BLASLONG FORMULA1(BLASLONG M, BLASLONG N, BLASLONG IS, BLASLONG BK, BLASLONG T) {
double m = (double)(M - IS - BK);
FLOAT *d = (FLOAT *)args -> b + (k + k * lda) * COMPSIZE;
FLOAT *sbb = sb;
-#if __STDC_VERSION__ >= 201112L
- _Atomic BLASLONG *flag = (_Atomic BLASLONG *)args -> d;
-#else
volatile BLASLONG *flag = (volatile BLASLONG *)args -> d;
-#endif
blasint *ipiv = (blasint *)args -> c;
}
}
- if ((js + REAL_GEMM_R >= n) && (mypos >= 0)) flag[mypos * CACHE_LINE_SIZE] = 0;
+ if ((js + REAL_GEMM_R >= n) && (mypos >= 0)) {
+ MB;
+ atomic_store_long(&flag[mypos * CACHE_LINE_SIZE], 0);
+ }
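+ /* The MB-then-relaxed-store pair above is this patch's stand-in for a
+    release store: the full barrier makes every prior write to the shared
+    buffers visible before the flag is cleared, so a waiter that sees
+    flag == 0 also sees the finished data. A C11 sketch of the equivalent
+    single operation, assuming flag pointed at _Atomic BLASLONG:
+
+      atomic_store_explicit(&flag[mypos * CACHE_LINE_SIZE], 0,
+                            memory_order_release);
+ */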
for (is = 0; is < m; is += GEMM_P){
min_i = m - is;
/* Non-blocking implementation */
typedef struct {
-#if __STDC_VERSION__ >= 201112L
- _Atomic
-#else
- volatile
-#endif
- BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
+ volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
} job_t;
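+/* Each per-thread slot in working[][] is strided by CACHE_LINE_SIZE
+   elements so that the word one thread spins on does not share a cache
+   line with the word another thread writes (false sharing); the same
+   spacing is applied to the flag[] arrays below. */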
+
#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ITCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER);
#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ONCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER);
blasint *ipiv = (blasint *)args -> c;
BLASLONG jw;
-#if __STDC_VERSION__ >= 201112L
- _Atomic BLASLONG *flag = (_Atomic BLASLONG *)args -> d;
-#else
volatile BLASLONG *flag = (volatile BLASLONG *)args -> d;
-#endif
+
if (args -> a == NULL) {
TRSM_ILTCOPY(k, k, (FLOAT *)args -> b, lda, 0, sb);
sbb = (FLOAT *)((((BLASULONG)(sb + k * k * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B);
#if 1
{
do {
- LOCK_COMMAND(&getrf_lock);
- jw = job[mypos].working[i][CACHE_LINE_SIZE * bufferside];
- UNLOCK_COMMAND(&getrf_lock);
+ jw = atomic_load_long(&job[mypos].working[i][CACHE_LINE_SIZE * bufferside]);
} while (jw);
+ MB;
}
#else
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {};
}
MB;
for (i = 0; i < args -> nthreads; i++) {
- LOCK_COMMAND(&getrf_lock);
- job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
- UNLOCK_COMMAND(&getrf_lock);
+ atomic_store_long(&job[mypos].working[i][CACHE_LINE_SIZE * bufferside], (BLASLONG)buffer[bufferside]);
}
}
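+ /* Producer side of the buffer handshake: spin with relaxed loads until
+    every consumer has stored 0 into its slot (the buffer is free), take a
+    barrier so refilling the buffer cannot be reordered before the wait,
+    then publish the buffer pointer to each consumer. A C11 sketch of the
+    wait with the barrier folded into the acquire load, assuming a
+    hypothetical _Atomic BLASLONG *slot:
+
+      while (atomic_load_explicit(slot, memory_order_acquire) != 0)
+        ;
+ */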
- LOCK_COMMAND(&getrf_flag_lock);
- flag[mypos * CACHE_LINE_SIZE] = 0;
- UNLOCK_COMMAND(&getrf_flag_lock);
+ MB;
+ atomic_store_long(&flag[mypos * CACHE_LINE_SIZE], 0);
if (m == 0) {
+ MB;
for (xxx = 0; xxx < DIVIDE_RATE; xxx++) {
- LOCK_COMMAND(&getrf_lock);
- job[mypos].working[mypos][CACHE_LINE_SIZE * xxx] = 0;
- UNLOCK_COMMAND(&getrf_lock);
+ atomic_store_long(&job[mypos].working[mypos][CACHE_LINE_SIZE * xxx], 0);
}
}
if ((current != mypos) && (!is)) {
#if 1
do {
- LOCK_COMMAND(&getrf_lock);
- jw = job[current].working[mypos][CACHE_LINE_SIZE * bufferside];
- UNLOCK_COMMAND(&getrf_lock);
+ jw = atomic_load_long(&job[current].working[mypos][CACHE_LINE_SIZE * bufferside]);
} while (jw == 0);
+ MB;
#else
while (job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {};
#endif
MB;
if (is + min_i >= m) {
- LOCK_COMMAND(&getrf_lock);
- job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
- UNLOCK_COMMAND(&getrf_lock);
+ atomic_store_long(&job[current].working[mypos][CACHE_LINE_SIZE * bufferside], 0);
}
}
for (xxx = 0; xxx < DIVIDE_RATE; xxx++) {
#if 1
do {
- LOCK_COMMAND(&getrf_lock);
- jw = job[mypos].working[i][CACHE_LINE_SIZE *xxx];
- UNLOCK_COMMAND(&getrf_lock);
+ jw = atomic_load_long(&job[mypos].working[i][CACHE_LINE_SIZE * xxx]);
} while (jw != 0);
+ MB;
#else
while (job[mypos].working[i][CACHE_LINE_SIZE * xxx]) {};
#endif
#ifdef _MSC_VER
BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE];
#else
-#if __STDC_VERSION__ >= 201112L
- _Atomic
-#else
- volatile
-#endif
- BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE] __attribute__((aligned(128)));
+ volatile BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE] __attribute__((aligned(128)));
#endif
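+/* __attribute__((aligned(128))) is GCC/Clang syntax, hence the separate
+   _MSC_VER branch, which currently loses the alignment. If the padding
+   matters under MSVC as well, a sketch of the equivalent declaration:
+
+     __declspec(align(128)) BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE];
+ */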
#ifndef COMPLEX
if (width > mn - is - bk) width = mn - is - bk;
}
- if (num_cpu > 0) exec_blas_async_wait(num_cpu, &queue[0]);
+
+ if (num_cpu > 0) {
+ WMB;
+ exec_blas_async_wait(num_cpu, &queue[0]);
+ }
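+ /* WMB orders the stores that set up the queue entries and flag words
+    ahead of the hand-off to the worker threads, so a worker can never
+    observe a partially initialized queue entry; the same pattern precedes
+    the exec_blas_async() calls below. */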
mm = m - bk - is;
nn = n - bk - is;
queue[num_cpu].sa = NULL;
queue[num_cpu].sb = NULL;
queue[num_cpu].next = &queue[num_cpu + 1];
- flag[num_cpu * CACHE_LINE_SIZE] = 1;
+ atomic_store_long(&flag[num_cpu * CACHE_LINE_SIZE], 1);
num_cpu ++;
if (num_cpu > 0) {
queue[num_cpu - 1].next = NULL;
+ WMB;
+
exec_blas_async(0, &queue[0]);
inner_basic_thread(&newarg, NULL, range_n_mine, sa, sbb, -1);
for (i = 0; i < num_cpu; i ++) {
#if 1
- LOCK_COMMAND(&getrf_flag_lock);
- f=flag[i*CACHE_LINE_SIZE];
- UNLOCK_COMMAND(&getrf_flag_lock);
- while (f!=0) {
- LOCK_COMMAND(&getrf_flag_lock);
- f=flag[i*CACHE_LINE_SIZE];
- UNLOCK_COMMAND(&getrf_flag_lock);
- };
+ do {
+ f = atomic_load_long(&flag[i*CACHE_LINE_SIZE]);
+ } while (f != 0);
+ MB;
#else
while (flag[i*CACHE_LINE_SIZE]) {};
#endif
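+ /* The master thread runs its own share inline (inner_basic_thread with
+    mypos == -1, which the mypos >= 0 guards above skip) and then polls each
+    worker's completion flag. The do/while form reads the flag at least once
+    and takes the MB only after the worker has stored 0, i.e. after its
+    results have been published. */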
BLASLONG range[MAX_CPU_NUMBER + 1];
BLASLONG width, nn, num_cpu;
-#if __STDC_VERSION__ >= 201112L
- _Atomic
-#else
- volatile
-#endif
- BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE] __attribute__((aligned(128)));
+ volatile BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE] __attribute__((aligned(128)));
#ifndef COMPLEX
#ifdef XDOUBLE
nn = n - bk - is;
if (width > nn) width = nn;
+ WMB;
+
if (num_cpu > 1) exec_blas_async_wait(num_cpu - 1, &queue[1]);
range[0] = 0;
queue[num_cpu].sa = NULL;
queue[num_cpu].sb = NULL;
queue[num_cpu].next = &queue[num_cpu + 1];
- flag[num_cpu * CACHE_LINE_SIZE] = 1;
+ atomic_store_long(&flag[num_cpu * CACHE_LINE_SIZE], 1);
num_cpu ++;
}
range_n_new[0] = offset + is;
range_n_new[1] = offset + is + bk;
+ WMB;
if (num_cpu > 1) {
exec_blas_async(1, &queue[1]);
#endif
- for (i = 1; i < num_cpu; i ++) while (flag[i * CACHE_LINE_SIZE]) {};
+ for (i = 1; i < num_cpu; i ++) while (atomic_load_long(&flag[i * CACHE_LINE_SIZE])) {};
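+/* Unlike the waits above, this spin is not followed by an MB; mirroring the
+   load-then-MB pattern of the other loops would be the conservative choice
+   on weakly ordered CPUs, depending on what the code below reads. */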
TRSM_ILTCOPY(bk, bk, a + (is + is * lda) * COMPSIZE, lda, 0, sb);