From f405b5bcc545366005ba9f4cdbd5acd3856679cc Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Fri, 18 Mar 2011 23:05:56 +0000 Subject: [PATCH] Fixed the bug about Loongson3A gsLQC1 & gsSQC1 instructions in daxpy kernel. Now daxpy is correct. --- kernel/mips64/daxpy_loongson3a_simd.S | 235 +++++++++++++++++++++------------- 1 file changed, 143 insertions(+), 92 deletions(-) diff --git a/kernel/mips64/daxpy_loongson3a_simd.S b/kernel/mips64/daxpy_loongson3a_simd.S index 9a0b8f1..8f53441 100644 --- a/kernel/mips64/daxpy_loongson3a_simd.S +++ b/kernel/mips64/daxpy_loongson3a_simd.S @@ -228,20 +228,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L11: //X & Y algin - gsLQC1(X_BASE,A2,A1,0*SIZE) - gsLQC1(X_BASE,A4,A3,2*SIZE) - gsLQC1(X_BASE,A6,A5,4*SIZE) - gsLQC1(X_BASE,A8,A7,6*SIZE) - - gsLQC1(X_BASE,A10,A9,8*SIZE) - gsLQC1(X_BASE,A12,A11,10*SIZE) - gsLQC1(X_BASE,A14,A13,12*SIZE) - gsLQC1(X_BASE,A16,A15,14*SIZE) - - gsLQC1(Y_BASE,B2,B1,0*SIZE) - gsLQC1(Y_BASE,B4,B3,2*SIZE) - gsLQC1(Y_BASE,B6,B5,4*SIZE) - gsLQC1(Y_BASE,B8,B7,6*SIZE) + gsLQC1(X_BASE,A2,A1,0) + gsLQC1(X_BASE,A4,A3,1) + gsLQC1(X_BASE,A6,A5,2) + gsLQC1(X_BASE,A8,A7,3) + + gsLQC1(X_BASE,A10,A9,4) + gsLQC1(X_BASE,A12,A11,5) + gsLQC1(X_BASE,A14,A13,6) + gsLQC1(X_BASE,A16,A15,7) + + gsLQC1(Y_BASE,B2,B1,0) + gsLQC1(Y_BASE,B4,B3,1) + gsLQC1(Y_BASE,B6,B5,2) + gsLQC1(Y_BASE,B8,B7,3) blez I, .L13 NOP @@ -251,65 +251,65 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. MADD t1, b1, ALPHA, a1 MADD t2, b2, ALPHA, a2 - gsSQC1(Y_BASE, T2, T1, 0*SIZE) - gsLQC1(Y_BASE,B2,B1,8*SIZE) + gsSQC1(Y_BASE, T2, T1, 0) + gsLQC1(Y_BASE,B2,B1,4) MADD t3, b3, ALPHA, a3 MADD t4, b4, ALPHA, a4 - gsSQC1(Y_BASE, T4, T3, 2*SIZE) - gsLQC1(Y_BASE,B4,B3,10*SIZE) + gsSQC1(Y_BASE, T4, T3, 1) + gsLQC1(Y_BASE,B4,B3,5) PREFETCHD(PREFETCH_DISTANCE*SIZE(Y)) PREFETCHD((PREFETCH_DISTANCE+4)*SIZE(Y)) MADD t1, b5, ALPHA, a5 MADD t2, b6, ALPHA, a6 - gsSQC1(Y_BASE, T2, T1, 4*SIZE) - gsLQC1(Y_BASE,B6,B5,12*SIZE) + gsSQC1(Y_BASE, T2, T1, 2) + gsLQC1(Y_BASE,B6,B5,6) MADD t3, b7, ALPHA, a7 MADD t4, b8, ALPHA, a8 - gsSQC1(Y_BASE, T4, T3, 6*SIZE) - gsLQC1(Y_BASE,B8,B7,14*SIZE) + gsSQC1(Y_BASE, T4, T3, 3) + gsLQC1(Y_BASE,B8,B7, 7) PREFETCHD((PREFETCH_DISTANCE+8)*SIZE(Y)) PREFETCHD((PREFETCH_DISTANCE+12)*SIZE(Y)) MADD t1, b1, ALPHA, a9 MADD t2, b2, ALPHA, a10 - gsSQC1(Y_BASE, T2, T1, 8*SIZE) - gsLQC1(Y_BASE,B2,B1,16*SIZE) + gsSQC1(Y_BASE, T2, T1, 4) + gsLQC1(Y_BASE,B2,B1,8) MADD t3, b3, ALPHA, a11 MADD t4, b4, ALPHA, a12 - gsSQC1(Y_BASE, T4, T3, 10*SIZE) - gsLQC1(Y_BASE,B4,B3,18*SIZE) + gsSQC1(Y_BASE, T4, T3, 5) + gsLQC1(Y_BASE,B4,B3,9) PREFETCHD(PREFETCH_DISTANCE*SIZE(X)) PREFETCHD((PREFETCH_DISTANCE+4)*SIZE(X)) MADD t1, b5, ALPHA, a13 MADD t2, b6, ALPHA, a14 - gsSQC1(Y_BASE, T2, T1, 12*SIZE) - gsLQC1(Y_BASE,B6,B5,20*SIZE) + gsSQC1(Y_BASE, T2, T1, 6) + gsLQC1(Y_BASE,B6,B5,10) MADD t3, b7, ALPHA, a15 MADD t4, b8, ALPHA, a16 - gsSQC1(Y_BASE, T4, T3, 14*SIZE) - gsLQC1(Y_BASE,B8,B7,22*SIZE) + gsSQC1(Y_BASE, T4, T3, 7) + gsLQC1(Y_BASE,B8,B7,11) PREFETCHD((PREFETCH_DISTANCE+8)*SIZE(X)) PREFETCHD((PREFETCH_DISTANCE+12)*SIZE(X)) - gsLQC1(X_BASE,A2,A1,16*SIZE) - gsLQC1(X_BASE,A4,A3,18*SIZE) - gsLQC1(X_BASE,A6,A5,20*SIZE) - gsLQC1(X_BASE,A8,A7,22*SIZE) + gsLQC1(X_BASE,A2,A1,8) + gsLQC1(X_BASE,A4,A3,9) + gsLQC1(X_BASE,A6,A5,10) + gsLQC1(X_BASE,A8,A7,11) - gsLQC1(X_BASE,A10,A9,24*SIZE) - gsLQC1(X_BASE,A12,A11,26*SIZE) - gsLQC1(X_BASE,A14,A13,28*SIZE) - gsLQC1(X_BASE,A16,A15,30*SIZE) + gsLQC1(X_BASE,A10,A9,12) + gsLQC1(X_BASE,A12,A11,13) + gsLQC1(X_BASE,A14,A13,14) + gsLQC1(X_BASE,A16,A15,15) daddiu I, I, -1 @@ -324,44 +324,44 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. MADD t1, b1, ALPHA, a1 MADD t2, b2, ALPHA, a2 - gsSQC1(Y_BASE, T2, T1, 0*SIZE) - gsLQC1(Y_BASE,B2,B1,8*SIZE) + gsSQC1(Y_BASE, T2, T1, 0) + gsLQC1(Y_BASE,B2,B1,4) MADD t3, b3, ALPHA, a3 MADD t4, b4, ALPHA, a4 - gsSQC1(Y_BASE, T4, T3, 2*SIZE) - gsLQC1(Y_BASE,B4,B3,10*SIZE) + gsSQC1(Y_BASE, T4, T3, 1) + gsLQC1(Y_BASE,B4,B3,5) MADD t1, b5, ALPHA, a5 MADD t2, b6, ALPHA, a6 - gsSQC1(Y_BASE, T2, T1, 4*SIZE) - gsLQC1(Y_BASE,B6,B5,12*SIZE) + gsSQC1(Y_BASE, T2, T1, 2) + gsLQC1(Y_BASE,B6,B5,6) MADD t3, b7, ALPHA, a7 MADD t4, b8, ALPHA, a8 - gsSQC1(Y_BASE, T4, T3, 6*SIZE) - gsLQC1(Y_BASE,B8,B7,14*SIZE) + gsSQC1(Y_BASE, T4, T3, 3) + gsLQC1(Y_BASE,B8,B7,7) MADD t1, b1, ALPHA, a9 MADD t2, b2, ALPHA, a10 - gsSQC1(Y_BASE, T2, T1, 8*SIZE) + gsSQC1(Y_BASE, T2, T1, 4) MADD t3, b3, ALPHA, a11 MADD t4, b4, ALPHA, a12 - gsSQC1(Y_BASE, T4, T3, 10*SIZE) + gsSQC1(Y_BASE, T4, T3, 5) MADD t1, b5, ALPHA, a13 MADD t2, b6, ALPHA, a14 - gsSQC1(Y_BASE, T2, T1, 12*SIZE) + gsSQC1(Y_BASE, T2, T1, 6) MADD t3, b7, ALPHA, a15 MADD t4, b8, ALPHA, a16 - gsSQC1(Y_BASE, T4, T3, 14*SIZE) + gsSQC1(Y_BASE, T4, T3, 7) daddiu X, X, 16 * SIZE @@ -415,97 +415,148 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. //unloop 16 LD a1, 0 * SIZE(X) - gsLQC1(X_BASE,A3,A2,1*SIZE) - gsLQC1(X_BASE,A5,A4,3*SIZE) - gsLQC1(X_BASE,A7,A6,5*SIZE) - gsLQC1(X_BASE,A9,A8,7*SIZE) - - gsLQC1(X_BASE,A11,A10,8*SIZE) - gsLQC1(X_BASE,A13,A12,11*SIZE) - gsLQC1(X_BASE,A15,A14,13*SIZE) - LD a16, 15 * SIZE(X) + daddiu X, X, SIZE + gsLQC1(X_BASE,A3,A2,0) + gsLQC1(X_BASE,A5,A4,1) + gsLQC1(X_BASE,A7,A6,2) + gsLQC1(X_BASE,A9,A8,3) + + gsLQC1(X_BASE,A11,A10,4) + gsLQC1(X_BASE,A13,A12,5) + gsLQC1(X_BASE,A15,A14,6) + LD a16, 14 * SIZE(X) + - gsLQC1(Y_BASE,B2,B1,0*SIZE) - gsLQC1(Y_BASE,B4,B3,2*SIZE) - gsLQC1(Y_BASE,B6,B5,4*SIZE) - gsLQC1(Y_BASE,B8,B7,6*SIZE) + gsLQC1(Y_BASE,B2,B1,0) + gsLQC1(Y_BASE,B4,B3,1) + gsLQC1(Y_BASE,B6,B5,2) + gsLQC1(Y_BASE,B8,B7,3) - blez I, .L13 + blez I, .L32 NOP .align 5 .L31: MADD t1, b1, ALPHA, a1 MADD t2, b2, ALPHA, a2 - gsSQC1(Y_BASE, T2, T1, 0*SIZE) - gsLQC1(Y_BASE,B2,B1,8*SIZE) + gsSQC1(Y_BASE, T2, T1, 0) + gsLQC1(Y_BASE,B2,B1,4) MADD t3, b3, ALPHA, a3 MADD t4, b4, ALPHA, a4 - gsSQC1(Y_BASE, T4, T3, 2*SIZE) - gsLQC1(Y_BASE,B4,B3,10*SIZE) + gsSQC1(Y_BASE, T4, T3, 1) + gsLQC1(Y_BASE,B4,B3,5) PREFETCHD(PREFETCH_DISTANCE*SIZE(Y)) PREFETCHD((PREFETCH_DISTANCE+4)*SIZE(Y)) MADD t1, b5, ALPHA, a5 MADD t2, b6, ALPHA, a6 - gsSQC1(Y_BASE, T2, T1, 4*SIZE) - gsLQC1(Y_BASE,B6,B5,12*SIZE) + gsSQC1(Y_BASE, T2, T1, 2) + gsLQC1(Y_BASE,B6,B5,6) MADD t3, b7, ALPHA, a7 MADD t4, b8, ALPHA, a8 - gsSQC1(Y_BASE, T4, T3, 6*SIZE) - gsLQC1(Y_BASE,B8,B7,14*SIZE) + gsSQC1(Y_BASE, T4, T3, 3) + gsLQC1(Y_BASE,B8,B7,7) PREFETCHD((PREFETCH_DISTANCE+8)*SIZE(Y)) PREFETCHD((PREFETCH_DISTANCE+12)*SIZE(Y)) MADD t1, b1, ALPHA, a9 MADD t2, b2, ALPHA, a10 - gsSQC1(Y_BASE, T2, T1, 8*SIZE) - gsLQC1(Y_BASE,B2,B1,16*SIZE) + gsSQC1(Y_BASE, T2, T1, 4) + gsLQC1(Y_BASE,B2,B1,8) MADD t3, b3, ALPHA, a11 MADD t4, b4, ALPHA, a12 - gsSQC1(Y_BASE, T4, T3, 10*SIZE) - gsLQC1(Y_BASE,B4,B3,18*SIZE) + gsSQC1(Y_BASE, T4, T3, 5) + gsLQC1(Y_BASE,B4,B3,9) PREFETCHD(PREFETCH_DISTANCE*SIZE(X)) PREFETCHD((PREFETCH_DISTANCE+4)*SIZE(X)) MADD t1, b5, ALPHA, a13 MADD t2, b6, ALPHA, a14 - gsSQC1(Y_BASE, T2, T1, 12*SIZE) - gsLQC1(Y_BASE,B6,B5,20*SIZE) + gsSQC1(Y_BASE, T2, T1, 6) + gsLQC1(Y_BASE,B6,B5,10) MADD t3, b7, ALPHA, a15 MADD t4, b8, ALPHA, a16 - gsSQC1(Y_BASE, T4, T3, 14*SIZE) - gsLQC1(Y_BASE,B8,B7,22*SIZE) + gsSQC1(Y_BASE, T4, T3, 7) + gsLQC1(Y_BASE,B8,B7,11) PREFETCHD((PREFETCH_DISTANCE+8)*SIZE(X)) PREFETCHD((PREFETCH_DISTANCE+12)*SIZE(X)) - LD a1, 16 * SIZE(X) - gsLQC1(X_BASE,A3,A2,17*SIZE) - gsLQC1(X_BASE,A5,A4,19*SIZE) - gsLQC1(X_BASE,A7,A6,21*SIZE) - gsLQC1(X_BASE,A9,A8,23*SIZE) + LD a1, 15 * SIZE(X) + gsLQC1(X_BASE,A3,A2,8) + gsLQC1(X_BASE,A5,A4,9) + gsLQC1(X_BASE,A7,A6,10) + gsLQC1(X_BASE,A9,A8,11) - gsLQC1(X_BASE,A11,A10,25*SIZE) - gsLQC1(X_BASE,A13,A12,27*SIZE) - gsLQC1(X_BASE,A15,A14,29*SIZE) - LD a16, 31 * SIZE(X) + gsLQC1(X_BASE,A11,A10,12) + gsLQC1(X_BASE,A13,A12,13) + gsLQC1(X_BASE,A15,A14,14) + LD a16, 30 * SIZE(X) daddiu I, I, -1 daddiu Y, Y, 16 * SIZE daddiu X, X, 16 * SIZE bgtz I, .L31 - - //jump back to the remain loop process. - b .L13 + + .align 5 +//Loop end: +.L32: + + MADD t1, b1, ALPHA, a1 + MADD t2, b2, ALPHA, a2 + gsSQC1(Y_BASE, T2, T1, 0) + gsLQC1(Y_BASE,B2,B1,4) + + MADD t3, b3, ALPHA, a3 + MADD t4, b4, ALPHA, a4 + gsSQC1(Y_BASE, T4, T3, 1) + gsLQC1(Y_BASE,B4,B3,5) + + + MADD t1, b5, ALPHA, a5 + MADD t2, b6, ALPHA, a6 + gsSQC1(Y_BASE, T2, T1, 2) + gsLQC1(Y_BASE,B6,B5,6) + + MADD t3, b7, ALPHA, a7 + MADD t4, b8, ALPHA, a8 + gsSQC1(Y_BASE, T4, T3, 3) + gsLQC1(Y_BASE,B8,B7,7) + + + MADD t1, b1, ALPHA, a9 + MADD t2, b2, ALPHA, a10 + gsSQC1(Y_BASE, T2, T1, 4) + + + MADD t3, b3, ALPHA, a11 + MADD t4, b4, ALPHA, a12 + gsSQC1(Y_BASE, T4, T3, 5) + + + MADD t1, b5, ALPHA, a13 + MADD t2, b6, ALPHA, a14 + gsSQC1(Y_BASE, T2, T1, 6) + + + MADD t3, b7, ALPHA, a15 + MADD t4, b8, ALPHA, a16 + gsSQC1(Y_BASE, T4, T3, 7) + + + daddiu X, X, 15 * SIZE + daddiu Y, Y, 16 * SIZE + + //jump back to the remain process. + b .L15 .align 5 //INCX!=1 or INCY != 1 -- 2.7.4