Add complete gemv function on Loongson3a platform.
author traz <wangqian10@iscas.ac.cn>
Thu, 3 Nov 2011 13:53:48 +0000 (13:53 +0000)
committer traz <wangqian10@iscas.ac.cn>
Thu, 3 Nov 2011 13:53:48 +0000 (13:53 +0000)
kernel/mips64/KERNEL.LOONGSON3A
kernel/mips64/gemv_n_loongson3a.c [new file with mode: 0644]
kernel/mips64/gemv_t_loongson3a.c [new file with mode: 0644]
kernel/mips64/zgemv_n_loongson3a.c [new file with mode: 0644]
kernel/mips64/zgemv_t_loongson3a.c [new file with mode: 0644]

index 91f2e7d..fc247e4 100644 (file)
@@ -1,6 +1,16 @@
 SAXPYKERNEL=axpy_loongson3a.S
 DAXPYKERNEL=daxpy_loongson3a_simd.S
 
+SGEMVNKERNEL = gemv_n_loongson3a.c
+SGEMVTKERNEL = gemv_t_loongson3a.c
+DGEMVNKERNEL = gemv_n_loongson3a.c
+DGEMVTKERNEL = gemv_t_loongson3a.c
+CGEMVNKERNEL = zgemv_n_loongson3a.c
+CGEMVTKERNEL = zgemv_t_loongson3a.c
+ZGEMVNKERNEL = zgemv_n_loongson3a.c
+ZGEMVTKERNEL = zgemv_t_loongson3a.c
+
+
 SGEMMKERNEL    =  sgemm_kernel_8x4_ps.S                
 SGEMMINCOPY    =  ../generic/gemm_ncopy_8.c
 SGEMMITCOPY    =  ../generic/gemm_tcopy_8.c
diff --git a/kernel/mips64/gemv_n_loongson3a.c b/kernel/mips64/gemv_n_loongson3a.c
new file mode 100644 (file)
index 0000000..bb27379
--- /dev/null
@@ -0,0 +1,98 @@
+#include "common.h"
+
+//These are auto-tuned codes for the Loongson-3A platform.
+//prefetch(x) hands lvalue x to a doubleword load ("ld") whose destination is register $0; presumably only the memory access is wanted. Other definitions are kept commented out below.
+//#define prefetch(x) __builtin_prefetch(x)
+//#define prefetch(x) do {_mm_prefetch((char *)(x), _MM_HINT_T0);} while(0)
+#define prefetch(x) __asm__ __volatile__("ld $0, %0"::"m"(x))
+#define likely(x) __builtin_expect(!!(x), 1)
+#define unlikely(x) __builtin_expect(!!(x), 0)
+//One-element update steps expanded inside CNAME (they use its locals i, j, k, h and advance i themselves). spec_* index Y by i, norm_* by the strided index h; *_alpha1 omit the ALPHA multiply.
+#define spec_loop_alpha1 do {Y[i] += A[LDA * j + i] * X[k]; i++;} while(0)
+#define spec_loop do {Y[i] += ALPHA * A[LDA * j + i] * X[k]; i++;} while(0)
+#define norm_loop_alpha1 do {Y[h] += A[LDA * j + i] * X[k]; i++; h += INCY;} while(0)
+#define norm_loop do {Y[h] += ALPHA * A[LDA * j + i] * X[k]; i++; h += INCY;} while(0)
+
+int CNAME(BLASLONG M, BLASLONG N, BLASLONG UNUSED, FLOAT ALPHA, FLOAT *A, BLASLONG LDA, FLOAT *X, BLASLONG INCX, FLOAT *Y, BLASLONG INCY, FLOAT *BUFFER) 
+{
+       // Adds ALPHA * A * X into Y: for every j < N and i < M, Y[i * |INCY|] += ALPHA * A[LDA * j + i] * X[j * |INCX|]. A zero ALPHA leaves Y untouched. UNUSED and BUFFER are never read. Always returns 0.
+       if(!ALPHA)
+               return 0;
+       // Only the magnitude of each stride is used from here on. NOTE(review): this walks X and Y upwards from the given pointers even when the caller's stride was negative -- confirm against the caller's contract.
+       if(INCX < 0)
+               INCX = -INCX;
+       if(INCY < 0)
+               INCY = -INCY;
+       // fahead: prefetch distance in FLOAT elements; spec_unroll: number of update steps per main-loop pass; tMQ: M rounded down to a multiple of spec_unroll.
+       BLASLONG fahead = 30;
+       BLASLONG spec_unroll = 4;
+       BLASLONG tMQ = M - M % spec_unroll;
+       BLASLONG j = 0, k = 0;
+       // Four copies of the same loop nest: ALPHA == 1 skips the scaling multiply, INCY == 1 indexes Y with i instead of the separate strided index h.
+       if(ALPHA == 1) {
+               if(INCY == 1) {
+                       for(; likely(j < N); j++, k += INCX) { // j selects the column of A, k the matching entry of X
+                               BLASLONG i = 0;
+                               for(; likely(i < tMQ);) { // i is advanced inside the loop macros, four steps per pass
+                                       prefetch(A[LDA * j + i + fahead]); // NOTE(review): near the end of the data this addresses memory past A (and Y below) -- confirm that is harmless on the target
+                                       prefetch(Y[i + fahead]);
+                                       /*loop_mark*/ spec_loop_alpha1;
+                                       /*loop_mark*/ spec_loop_alpha1;
+                                       /*loop_mark*/ spec_loop_alpha1;
+                                       /*loop_mark*/ spec_loop_alpha1;
+                               }
+                               for(; likely(i < M);) { // remaining M % spec_unroll rows, one step at a time
+                                       spec_loop_alpha1;
+                               }
+                       }
+               } else {
+                       for(; likely(j < N); j++, k += INCX) {
+                               BLASLONG i = 0, h = 0; // h is the strided offset into Y, restarted for every column
+                               for(; likely(i < tMQ);) {
+                                       prefetch(A[LDA * j + i + fahead]);
+                                       prefetch(Y[h + fahead]);
+                                       /*loop_mark*/ norm_loop_alpha1;
+                                       /*loop_mark*/ norm_loop_alpha1;
+                                       /*loop_mark*/ norm_loop_alpha1;
+                                       /*loop_mark*/ norm_loop_alpha1;
+                               }
+                               for(; likely(i < M);) {
+                                       norm_loop_alpha1;
+                               }
+                       }
+               }
+       } else {
+               if(INCY == 1) {
+                       for(; likely(j < N); j++, k += INCX) {
+                               BLASLONG i = 0;
+                               for(; likely(i < tMQ);) {
+                                       prefetch(A[LDA * j + i + fahead]);
+                                       prefetch(Y[i + fahead]);
+                                       /*loop_mark*/ spec_loop;
+                                       /*loop_mark*/ spec_loop;
+                                       /*loop_mark*/ spec_loop;
+                                       /*loop_mark*/ spec_loop;
+                               }
+                               for(; likely(i < M);) {
+                                       spec_loop;
+                               }
+                       }
+               } else {
+                       for(; likely(j < N); j++, k += INCX) {
+                               BLASLONG i = 0, h = 0;
+                               for(; likely(i < tMQ);) {
+                                       prefetch(A[LDA * j + i + fahead]);
+                                       prefetch(Y[h + fahead]);
+                                       /*loop_mark*/ norm_loop;
+                                       /*loop_mark*/ norm_loop;
+                                       /*loop_mark*/ norm_loop;
+                                       /*loop_mark*/ norm_loop;
+                               }
+                               for(; likely(i < M);) {
+                                       norm_loop;
+                               }
+                       }
+               }
+       }
+       return 0;
+}
diff --git a/kernel/mips64/gemv_t_loongson3a.c b/kernel/mips64/gemv_t_loongson3a.c
new file mode 100644 (file)
index 0000000..5c6c838
--- /dev/null
@@ -0,0 +1,93 @@
+#include "common.h"
+
+//These are auto-tuned codes for the Loongson-3A platform.
+//prefetch(x) hands lvalue x to a doubleword load ("ld") whose destination is register $0; presumably only the memory access is wanted. Other definitions are kept commented out below.
+//#define prefetch(x) __builtin_prefetch(x)
+//#define prefetch(x) do {_mm_prefetch((char *)(x), _MM_HINT_T0);} while(0)
+#define prefetch(x) __asm__ __volatile__("ld $0, %0"::"m"(x))
+#define likely(x) __builtin_expect(!!(x), 1)
+#define unlikely(x) __builtin_expect(!!(x), 0)
+//One-term accumulation steps expanded inside CNAME (they use its locals i, j, k, h and advance i themselves). Each adds one product into Y[k]; spec_* index X by i, norm_* by the strided index h; *_alpha1 omit the ALPHA multiply.
+#define spec_loop_alpha1 do {Y[k] += A[LDA * j + i] * X[i]; i++;} while(0)
+#define spec_loop do {Y[k] += ALPHA * A[LDA * j + i] * X[i]; i++;} while(0)
+#define norm_loop_alpha1 do {Y[k] += A[LDA * j + i] * X[h]; i++; h += INCX;} while(0)
+#define norm_loop do {Y[k] += ALPHA * A[LDA * j + i] * X[h]; i++; h += INCX;} while(0)
+
+int CNAME(BLASLONG M, BLASLONG N, BLASLONG UNUSED, FLOAT ALPHA, FLOAT *A, BLASLONG LDA, FLOAT *X, BLASLONG INCX, FLOAT *Y, BLASLONG INCY, FLOAT *BUFFER) {
+       // Transposed product: for every j < N, each term ALPHA * A[LDA * j + i] * X[i * |INCX|] (i < M) is added, one at a time, into Y[j * |INCY|]. A zero ALPHA leaves Y untouched. UNUSED and BUFFER are never read. Always returns 0.
+       if(!ALPHA)
+               return 0;
+       // Only the magnitude of each stride is used from here on. NOTE(review): this walks X and Y upwards from the given pointers even when the caller's stride was negative -- confirm against the caller's contract.
+       if(INCX < 0)
+               INCX = -INCX;
+       if(INCY < 0)
+               INCY = -INCY;
+       // fahead: prefetch distance in FLOAT elements; spec_unroll: number of accumulation steps per main-loop pass; tMQ: M rounded down to a multiple of spec_unroll.
+       BLASLONG fahead = 30;
+       BLASLONG spec_unroll = 3;
+       BLASLONG tMQ = M - M % spec_unroll;
+       BLASLONG j = 0, k = 0;
+       // Four copies of the same loop nest: ALPHA == 1 skips the scaling multiply, INCX == 1 indexes X with i instead of the separate strided index h.
+       if(ALPHA == 1) {
+               if(INCX == 1) {
+                       for(; likely(j < N); j++, k += INCY) { // j selects the column of A, k the entry of Y that receives its dot product with X
+                               BLASLONG i = 0;
+                               for(; likely(i < tMQ);) { // i is advanced inside the loop macros, three steps per pass
+                                       prefetch(A[LDA * j + i + fahead]); // NOTE(review): near the end of the data this addresses memory past A (and X below) -- confirm that is harmless on the target
+                                       prefetch(X[i + fahead]);
+                                       /*loop_mark*/ spec_loop_alpha1;
+                                       /*loop_mark*/ spec_loop_alpha1;
+                                       /*loop_mark*/ spec_loop_alpha1;
+                               }
+                               for(; likely(i < M);) { // remaining M % spec_unroll rows, one step at a time
+                                       spec_loop_alpha1;
+                               }
+                       }
+               } else {
+                       for(; likely(j < N); j++, k += INCY) {
+                               BLASLONG i = 0, h = 0; // h is the strided offset into X, restarted for every column
+                               for(; likely(i < tMQ);) {
+                                       prefetch(A[LDA * j + i + fahead]);
+                                       prefetch(X[h + fahead]);
+                                       /*loop_mark*/ norm_loop_alpha1;
+                                       /*loop_mark*/ norm_loop_alpha1;
+                                       /*loop_mark*/ norm_loop_alpha1;
+                               }
+                               for(; likely(i < M);) {
+                                       norm_loop_alpha1;
+                               }
+                       }
+               }
+       } else {
+               if(INCX == 1) {
+                       for(; likely(j < N); j++, k += INCY) {
+                               BLASLONG i = 0;
+                               for(; likely(i < tMQ);) {
+                                       prefetch(A[LDA * j + i + fahead]);
+                                       prefetch(X[i + fahead]);
+                                       /*loop_mark*/ spec_loop;
+                                       /*loop_mark*/ spec_loop;
+                                       /*loop_mark*/ spec_loop;
+                               }
+                               for(; likely(i < M);) {
+                                       spec_loop;
+                               }
+                       }
+               } else {
+                       for(; likely(j < N); j++, k += INCY) {
+                               BLASLONG i = 0, h = 0;
+                               for(; likely(i < tMQ);) {
+                                       prefetch(A[LDA * j + i + fahead]);
+                                       prefetch(X[h + fahead]);
+                                       /*loop_mark*/ norm_loop;
+                                       /*loop_mark*/ norm_loop;
+                                       /*loop_mark*/ norm_loop;
+                               }
+                               for(; likely(i < M);) {
+                                       norm_loop;
+                               }
+                       }
+               }
+       }
+       return 0;
+}
diff --git a/kernel/mips64/zgemv_n_loongson3a.c b/kernel/mips64/zgemv_n_loongson3a.c
new file mode 100644 (file)
index 0000000..f8275c3
--- /dev/null
@@ -0,0 +1,92 @@
+#include "common.h"
+
+//These are auto-tuned codes for the Loongson-3A platform.
+//prefetch(x) hands lvalue x to a doubleword load ("ld") whose destination is register $0; presumably only the memory access is wanted. Other definitions are kept commented out below.
+//#define prefetch(x) __builtin_prefetch(x)
+//#define prefetch(x) do {_mm_prefetch((char *)(x), _MM_HINT_T0);} while(0)
+#define prefetch(x) __asm__ __volatile__("ld $0, %0"::"m"(x))
+#define likely(x) __builtin_expect(!!(x), 1)
+#define unlikely(x) __builtin_expect(!!(x), 0)
+//One complex update step each, expanded inside CNAME (they use its locals ii, iii, jj, k, rTmp, iTmp). Offset +0 is treated as the real part and +1 as the imaginary part. spec_* index Y by ii, norm_* by the strided offset iii; *_alpha1 omit the multiply by (rALPHA, iALPHA).
+#define spec_loop_alpha1 do {Y[ii] += A[jj + ii] * X[k]; Y[ii + 1] += A[jj + ii + 1] * X[k]; Y[ii + 1] += A[jj + ii] * X[k + 1]; Y[ii] -= A[jj + ii + 1] * X[k + 1]; ii += 2;} while(0)
+//#define spec_loop_alpha1 do {Y[ii] += A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; Y[ii + 1] += A[jj + ii + 1] * X[k] + A[jj + ii] * X[k + 1]; ii += 2;} while(0)
+#define spec_loop do {rTmp = A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; iTmp = A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; Y[ii] += rTmp * rALPHA - iTmp * iALPHA; Y[ii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0)
+#define norm_loop_alpha1 do {Y[iii] += A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; Y[iii + 1] += A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; ii += 2; iii += INCY * 2;} while(0)
+#define norm_loop do {rTmp = A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; iTmp = A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; Y[iii] += rTmp * rALPHA - iTmp * iALPHA; Y[iii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCY * 2;} while(0)
+
+int CNAME(BLASLONG M, BLASLONG N, BLASLONG UNUSED, FLOAT rALPHA, FLOAT iALPHA, FLOAT *A, BLASLONG LDA, FLOAT *X, BLASLONG INCX, FLOAT *Y, BLASLONG INCY, FLOAT *BUFFER) {
+       // Complex Y += alpha * A * X with alpha = (rALPHA, iALPHA). A, X and Y are read as interleaved (real, imaginary) FLOAT pairs, unconjugated; M, N, LDA, INCX and INCY count complex elements. UNUSED and BUFFER are never read. Always returns 0.
+       if(!rALPHA && !iALPHA)
+               return 0; // alpha is exactly zero in both parts: skip the update (a purely imaginary alpha must still be applied)
+       // Only the magnitude of each stride is used from here on. NOTE(review): this walks X and Y upwards from the given pointers even when the caller's stride was negative -- confirm against the caller's contract.
+       if(INCX < 0)
+               INCX = -INCX;
+       if(INCY < 0)
+               INCY = -INCY;
+       // fahead: prefetch distance in FLOAT elements; spec_unroll: complex rows handled per main-loop pass; tMQ: M rounded down to a multiple of spec_unroll.
+       BLASLONG fahead = 60;
+       BLASLONG spec_unroll = 2;
+       BLASLONG tMQ = M - M % spec_unroll;
+       BLASLONG j = 0, k = 0, jj=0;
+       // j counts columns; k and jj are the FLOAT offsets of the current X entry and of column j inside A.
+       // Four copies of the same loop nest: alpha == (1, 0) skips the scaling, INCY == 1 indexes Y with ii instead of the separate strided offset iii.
+       if(rALPHA == 1 && iALPHA == 0) {
+               if(INCY == 1) {
+                       for(; likely(j < N); j++, k += INCX * 2, jj += LDA * 2) {
+                               BLASLONG i = 0, ii = 0; // i counts complex rows; ii is the matching FLOAT offset, advanced by the loop macros
+                               for(; likely(i < tMQ); i += spec_unroll) {
+                                       prefetch(A[jj + ii + fahead]); // NOTE(review): near the end of the data this addresses memory past A (and Y below) -- confirm that is harmless on the target
+                                       prefetch(Y[ii + fahead]);
+                                       /*loop_mark*/ spec_loop_alpha1;
+                                       /*loop_mark*/ spec_loop_alpha1;
+                               }
+                               for(; likely(i < M); i++) { // remaining M % spec_unroll rows
+                                       spec_loop_alpha1;
+                               }
+                       }
+               } else {
+                       for(; likely(j < N); j++, k += INCX * 2, jj += LDA * 2) {
+                               BLASLONG i = 0, ii = 0, iii = 0; // iii is the strided FLOAT offset into Y, restarted for every column
+                               for(; likely(i < tMQ); i += spec_unroll) {
+                                       prefetch(A[jj + ii + fahead]);
+                                       prefetch(Y[iii + fahead]);
+                                       /*loop_mark*/ norm_loop_alpha1;
+                                       /*loop_mark*/ norm_loop_alpha1;
+                               }
+                               for(; likely(i < M); i++) {
+                                       norm_loop_alpha1;
+                               }
+                       }
+               }
+       } else {
+               FLOAT rTmp, iTmp; // real and imaginary part of one A * X product, written by spec_loop / norm_loop
+               if(INCY == 1) {
+                       for(; likely(j < N); j++, k += INCX * 2, jj += LDA * 2) {
+                               BLASLONG i = 0, ii = 0;
+                               for(; likely(i < tMQ); i += spec_unroll) {
+                                       prefetch(A[jj + ii + fahead]);
+                                       prefetch(Y[ii + fahead]);
+                                       /*loop_mark*/ spec_loop;
+                                       /*loop_mark*/ spec_loop;
+                               }
+                               for(; likely(i < M); i++) {
+                                       spec_loop;
+                               }
+                       }
+               } else {
+                       for(; likely(j < N); j++, k += INCX * 2, jj += LDA * 2) {
+                               BLASLONG i = 0, ii = 0, iii = 0;
+                               for(; likely(i < tMQ); i += spec_unroll) {
+                                       prefetch(A[jj + ii + fahead]);
+                                       prefetch(Y[iii + fahead]);
+                                       /*loop_mark*/ norm_loop;
+                                       /*loop_mark*/ norm_loop;
+                               }
+                               for(; likely(i < M); i++) {
+                                       norm_loop;
+                               }
+                       }
+               }
+       }
+       return 0;
+}
diff --git a/kernel/mips64/zgemv_t_loongson3a.c b/kernel/mips64/zgemv_t_loongson3a.c
new file mode 100644 (file)
index 0000000..4b2c2b6
--- /dev/null
@@ -0,0 +1,91 @@
+#include "common.h"
+
+//These are auto-tuned codes for the Loongson-3A platform. prefetch(x) below hands lvalue x to a doubleword load ("ld") whose destination is register $0; presumably only the memory access is wanted.
+//#define prefetch(x) __builtin_prefetch(x)
+//#define prefetch(x) do {_mm_prefetch((char *)(x), _MM_HINT_T0);} while(0)
+#define prefetch(x) __asm__ __volatile__("ld $0, %0"::"m"(x))
+#define likely(x) __builtin_expect(!!(x), 1)
+#define unlikely(x) __builtin_expect(!!(x), 0)
+//One complex accumulation step each, expanded inside CNAME (they use its locals ii, iii, jj, k, rTmp, iTmp). Offset +0 is treated as the real part and +1 as the imaginary part. Each adds one product into (Y[k], Y[k + 1]); spec_* index X by ii, norm_* by the strided offset iii; *_alpha1 omit the multiply by (rALPHA, iALPHA).
+#define spec_loop_alpha1 do {Y[k] += A[jj + ii] * X[ii]; Y[k + 1] += A[jj + ii + 1] * X[ii]; Y[k + 1] += A[jj + ii] * X[ii + 1]; Y[k] -= A[jj + ii + 1] * X[ii + 1]; ii += 2;} while(0)
+//#define spec_loop_alpha1 do {Y[k] += A[jj + ii] * X[ii] - A[jj + ii + 1] * X[ii + 1]; Y[k + 1] += A[jj + ii + 1] * X[ii] + A[jj + ii] * X[ii + 1]; ii += 2;} while(0)
+#define spec_loop do {rTmp = A[jj + ii] * X[ii] - A[jj + ii + 1] * X[ii + 1]; iTmp = A[jj + ii] * X[ii + 1] + A[jj + ii + 1] * X[ii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0)
+#define norm_loop_alpha1 do {Y[k] += A[jj + ii] * X[iii] - A[jj + ii + 1] * X[iii + 1]; Y[k + 1] += A[jj + ii] * X[iii + 1] + A[jj + ii + 1] * X[iii]; ii += 2; iii += INCX * 2;} while(0)
+#define norm_loop do {rTmp = A[jj + ii] * X[iii] - A[jj + ii + 1] * X[iii + 1]; iTmp = A[jj + ii] * X[iii + 1] + A[jj + ii + 1] * X[iii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCX * 2;} while(0)
+
+int CNAME(BLASLONG M, BLASLONG N, BLASLONG UNUSED, FLOAT rALPHA, FLOAT iALPHA, FLOAT *A, BLASLONG LDA, FLOAT *X, BLASLONG INCX, FLOAT *Y, BLASLONG INCY, FLOAT *BUFFER) {
+       // Complex transposed product with alpha = (rALPHA, iALPHA): for every j < N, each term alpha * A(i, j) * X(i) (i < M) is added, one at a time, into Y(j). A, X and Y are read as interleaved (real, imaginary) FLOAT pairs, unconjugated; M, N, LDA, INCX and INCY count complex elements. UNUSED and BUFFER are never read. Always returns 0.
+       if(!rALPHA && !iALPHA)
+               return 0; // alpha is exactly zero in both parts: skip the update (a purely imaginary alpha must still be applied)
+       // Only the magnitude of each stride is used from here on. NOTE(review): this walks X and Y upwards from the given pointers even when the caller's stride was negative -- confirm against the caller's contract.
+       if(INCX < 0)
+               INCX = -INCX;
+       if(INCY < 0)
+               INCY = -INCY;
+       // fahead: prefetch distance in FLOAT elements; spec_unroll: complex rows handled per main-loop pass; tMQ: M rounded down to a multiple of spec_unroll.
+       BLASLONG fahead = 30;
+       BLASLONG spec_unroll = 2;
+       BLASLONG tMQ = M - M % spec_unroll;
+       BLASLONG j = 0, k = 0, jj=0;
+       // j counts columns; k and jj are the FLOAT offsets of the Y entry being accumulated and of column j inside A.
+       // Four copies of the same loop nest: alpha == (1, 0) skips the scaling, INCX == 1 indexes X with ii instead of the separate strided offset iii.
+       if(rALPHA == 1 && iALPHA == 0) {
+               if(INCX == 1) {
+                       for(; likely(j < N); j++, k += INCY * 2, jj += LDA * 2) {
+                               BLASLONG i = 0, ii = 0; // i counts complex rows; ii is the matching FLOAT offset, advanced by the loop macros
+                               for(; likely(i < tMQ); i += spec_unroll) {
+                                       prefetch(A[jj + ii + fahead]); // NOTE(review): near the end of the data this addresses memory past A (and X below) -- confirm that is harmless on the target
+                                       prefetch(X[ii + fahead]);
+                                       /*loop_mark*/ spec_loop_alpha1;
+                                       /*loop_mark*/ spec_loop_alpha1;
+                               }
+                               for(; likely(i < M); i++) { // remaining M % spec_unroll rows
+                                       spec_loop_alpha1;
+                               }
+                       }
+               } else {
+                       for(; likely(j < N); j++, k += INCY * 2, jj += LDA * 2) {
+                               BLASLONG i = 0, ii = 0, iii = 0; // iii is the strided FLOAT offset into X, restarted for every column
+                               for(; likely(i < tMQ); i += spec_unroll) {
+                                       prefetch(A[jj + ii + fahead]);
+                                       prefetch(X[iii + fahead]);
+                                       /*loop_mark*/ norm_loop_alpha1;
+                                       /*loop_mark*/ norm_loop_alpha1;
+                               }
+                               for(; likely(i < M); i++) {
+                                       norm_loop_alpha1;
+                               }
+                       }
+               }
+       } else {
+               FLOAT rTmp, iTmp; // real and imaginary part of one A * X product, written by spec_loop / norm_loop
+               if(INCX == 1) {
+                       for(; likely(j < N); j++, k += INCY * 2, jj += LDA * 2) {
+                               BLASLONG i = 0, ii = 0;
+                               for(; likely(i < tMQ); i += spec_unroll) {
+                                       prefetch(A[jj + ii + fahead]);
+                                       prefetch(X[ii + fahead]);
+                                       /*loop_mark*/ spec_loop;
+                                       /*loop_mark*/ spec_loop;
+                               }
+                               for(; likely(i < M); i++) {
+                                       spec_loop;
+                               }
+                       }
+               } else {
+                       for(; likely(j < N); j++, k += INCY * 2, jj += LDA * 2) {
+                               BLASLONG i = 0, ii = 0, iii = 0;
+                               for(; likely(i < tMQ); i += spec_unroll) {
+                                       prefetch(A[jj + ii + fahead]);
+                                       prefetch(X[iii + fahead]);
+                                       /*loop_mark*/ norm_loop;
+                                       /*loop_mark*/ norm_loop;
+                               }
+                               for(; likely(i < M); i++) {
+                                       norm_loop;
+                               }
+                       }
+               }
+       }
+       return 0;
+}