a = (FLOAT *)args -> a;
x = (FLOAT *)args -> b;
- y = (FLOAT *)args -> c;
lda = args -> lda;
incx = args -> ldb;
n_from = 0;
n_to = n;
+ // Use y as this thread's n * COMPSIZE elements taken from the front of the sb buffer
+ y = buffer;
+ buffer += ((COMPSIZE * n + 1023) & ~1023);
+
if (range_m) {
n_from = *(range_m + 0);
n_to = *(range_m + 1);
a += n_from * lda * COMPSIZE;
}
- if (range_n) y += *range_n * COMPSIZE;
if (incx != 1) {
COPY_K(n, x, incx, buffer, 1);
if (num_cpu) {
queue[0].sa = NULL;
- queue[0].sb = buffer + num_cpu * (((n + 255) & ~255) + 16) * COMPSIZE;
+ queue[0].sb = buffer;
queue[num_cpu - 1].next = NULL;
exec_blas(num_cpu, queue);
#else
ONE, ZERO,
#endif
- buffer + range_n[i] * COMPSIZE, 1, buffer, 1, NULL, 0);
+ (FLOAT*)(queue[i].sb), 1, buffer, 1, NULL, 0);
}
AXPYU_K(n, 0, 0,