.L11:
//X & Y algin
- gsLQC1(X_BASE,A2,A1,0*SIZE)
- gsLQC1(X_BASE,A4,A3,2*SIZE)
- gsLQC1(X_BASE,A6,A5,4*SIZE)
- gsLQC1(X_BASE,A8,A7,6*SIZE)
-
- gsLQC1(X_BASE,A10,A9,8*SIZE)
- gsLQC1(X_BASE,A12,A11,10*SIZE)
- gsLQC1(X_BASE,A14,A13,12*SIZE)
- gsLQC1(X_BASE,A16,A15,14*SIZE)
-
- gsLQC1(Y_BASE,B2,B1,0*SIZE)
- gsLQC1(Y_BASE,B4,B3,2*SIZE)
- gsLQC1(Y_BASE,B6,B5,4*SIZE)
- gsLQC1(Y_BASE,B8,B7,6*SIZE)
+ gsLQC1(X_BASE,A2,A1,0)
+ gsLQC1(X_BASE,A4,A3,1)
+ gsLQC1(X_BASE,A6,A5,2)
+ gsLQC1(X_BASE,A8,A7,3)
+
+ gsLQC1(X_BASE,A10,A9,4)
+ gsLQC1(X_BASE,A12,A11,5)
+ gsLQC1(X_BASE,A14,A13,6)
+ gsLQC1(X_BASE,A16,A15,7)
+
+ gsLQC1(Y_BASE,B2,B1,0)
+ gsLQC1(Y_BASE,B4,B3,1)
+ gsLQC1(Y_BASE,B6,B5,2)
+ gsLQC1(Y_BASE,B8,B7,3)
blez I, .L13
NOP
MADD t1, b1, ALPHA, a1
MADD t2, b2, ALPHA, a2
- gsSQC1(Y_BASE, T2, T1, 0*SIZE)
- gsLQC1(Y_BASE,B2,B1,8*SIZE)
+ gsSQC1(Y_BASE, T2, T1, 0)
+ gsLQC1(Y_BASE,B2,B1,4)
MADD t3, b3, ALPHA, a3
MADD t4, b4, ALPHA, a4
- gsSQC1(Y_BASE, T4, T3, 2*SIZE)
- gsLQC1(Y_BASE,B4,B3,10*SIZE)
+ gsSQC1(Y_BASE, T4, T3, 1)
+ gsLQC1(Y_BASE,B4,B3,5)
PREFETCHD(PREFETCH_DISTANCE*SIZE(Y))
PREFETCHD((PREFETCH_DISTANCE+4)*SIZE(Y))
MADD t1, b5, ALPHA, a5
MADD t2, b6, ALPHA, a6
- gsSQC1(Y_BASE, T2, T1, 4*SIZE)
- gsLQC1(Y_BASE,B6,B5,12*SIZE)
+ gsSQC1(Y_BASE, T2, T1, 2)
+ gsLQC1(Y_BASE,B6,B5,6)
MADD t3, b7, ALPHA, a7
MADD t4, b8, ALPHA, a8
- gsSQC1(Y_BASE, T4, T3, 6*SIZE)
- gsLQC1(Y_BASE,B8,B7,14*SIZE)
+ gsSQC1(Y_BASE, T4, T3, 3)
+ gsLQC1(Y_BASE,B8,B7, 7)
PREFETCHD((PREFETCH_DISTANCE+8)*SIZE(Y))
PREFETCHD((PREFETCH_DISTANCE+12)*SIZE(Y))
MADD t1, b1, ALPHA, a9
MADD t2, b2, ALPHA, a10
- gsSQC1(Y_BASE, T2, T1, 8*SIZE)
- gsLQC1(Y_BASE,B2,B1,16*SIZE)
+ gsSQC1(Y_BASE, T2, T1, 4)
+ gsLQC1(Y_BASE,B2,B1,8)
MADD t3, b3, ALPHA, a11
MADD t4, b4, ALPHA, a12
- gsSQC1(Y_BASE, T4, T3, 10*SIZE)
- gsLQC1(Y_BASE,B4,B3,18*SIZE)
+ gsSQC1(Y_BASE, T4, T3, 5)
+ gsLQC1(Y_BASE,B4,B3,9)
PREFETCHD(PREFETCH_DISTANCE*SIZE(X))
PREFETCHD((PREFETCH_DISTANCE+4)*SIZE(X))
MADD t1, b5, ALPHA, a13
MADD t2, b6, ALPHA, a14
- gsSQC1(Y_BASE, T2, T1, 12*SIZE)
- gsLQC1(Y_BASE,B6,B5,20*SIZE)
+ gsSQC1(Y_BASE, T2, T1, 6)
+ gsLQC1(Y_BASE,B6,B5,10)
MADD t3, b7, ALPHA, a15
MADD t4, b8, ALPHA, a16
- gsSQC1(Y_BASE, T4, T3, 14*SIZE)
- gsLQC1(Y_BASE,B8,B7,22*SIZE)
+ gsSQC1(Y_BASE, T4, T3, 7)
+ gsLQC1(Y_BASE,B8,B7,11)
PREFETCHD((PREFETCH_DISTANCE+8)*SIZE(X))
PREFETCHD((PREFETCH_DISTANCE+12)*SIZE(X))
- gsLQC1(X_BASE,A2,A1,16*SIZE)
- gsLQC1(X_BASE,A4,A3,18*SIZE)
- gsLQC1(X_BASE,A6,A5,20*SIZE)
- gsLQC1(X_BASE,A8,A7,22*SIZE)
+ gsLQC1(X_BASE,A2,A1,8)
+ gsLQC1(X_BASE,A4,A3,9)
+ gsLQC1(X_BASE,A6,A5,10)
+ gsLQC1(X_BASE,A8,A7,11)
- gsLQC1(X_BASE,A10,A9,24*SIZE)
- gsLQC1(X_BASE,A12,A11,26*SIZE)
- gsLQC1(X_BASE,A14,A13,28*SIZE)
- gsLQC1(X_BASE,A16,A15,30*SIZE)
+ gsLQC1(X_BASE,A10,A9,12)
+ gsLQC1(X_BASE,A12,A11,13)
+ gsLQC1(X_BASE,A14,A13,14)
+ gsLQC1(X_BASE,A16,A15,15)
daddiu I, I, -1
MADD t1, b1, ALPHA, a1
MADD t2, b2, ALPHA, a2
- gsSQC1(Y_BASE, T2, T1, 0*SIZE)
- gsLQC1(Y_BASE,B2,B1,8*SIZE)
+ gsSQC1(Y_BASE, T2, T1, 0)
+ gsLQC1(Y_BASE,B2,B1,4)
MADD t3, b3, ALPHA, a3
MADD t4, b4, ALPHA, a4
- gsSQC1(Y_BASE, T4, T3, 2*SIZE)
- gsLQC1(Y_BASE,B4,B3,10*SIZE)
+ gsSQC1(Y_BASE, T4, T3, 1)
+ gsLQC1(Y_BASE,B4,B3,5)
MADD t1, b5, ALPHA, a5
MADD t2, b6, ALPHA, a6
- gsSQC1(Y_BASE, T2, T1, 4*SIZE)
- gsLQC1(Y_BASE,B6,B5,12*SIZE)
+ gsSQC1(Y_BASE, T2, T1, 2)
+ gsLQC1(Y_BASE,B6,B5,6)
MADD t3, b7, ALPHA, a7
MADD t4, b8, ALPHA, a8
- gsSQC1(Y_BASE, T4, T3, 6*SIZE)
- gsLQC1(Y_BASE,B8,B7,14*SIZE)
+ gsSQC1(Y_BASE, T4, T3, 3)
+ gsLQC1(Y_BASE,B8,B7,7)
MADD t1, b1, ALPHA, a9
MADD t2, b2, ALPHA, a10
- gsSQC1(Y_BASE, T2, T1, 8*SIZE)
+ gsSQC1(Y_BASE, T2, T1, 4)
MADD t3, b3, ALPHA, a11
MADD t4, b4, ALPHA, a12
- gsSQC1(Y_BASE, T4, T3, 10*SIZE)
+ gsSQC1(Y_BASE, T4, T3, 5)
MADD t1, b5, ALPHA, a13
MADD t2, b6, ALPHA, a14
- gsSQC1(Y_BASE, T2, T1, 12*SIZE)
+ gsSQC1(Y_BASE, T2, T1, 6)
MADD t3, b7, ALPHA, a15
MADD t4, b8, ALPHA, a16
- gsSQC1(Y_BASE, T4, T3, 14*SIZE)
+ gsSQC1(Y_BASE, T4, T3, 7)
daddiu X, X, 16 * SIZE
//unloop 16
LD a1, 0 * SIZE(X)
- gsLQC1(X_BASE,A3,A2,1*SIZE)
- gsLQC1(X_BASE,A5,A4,3*SIZE)
- gsLQC1(X_BASE,A7,A6,5*SIZE)
- gsLQC1(X_BASE,A9,A8,7*SIZE)
-
- gsLQC1(X_BASE,A11,A10,8*SIZE)
- gsLQC1(X_BASE,A13,A12,11*SIZE)
- gsLQC1(X_BASE,A15,A14,13*SIZE)
- LD a16, 15 * SIZE(X)
+ daddiu X, X, SIZE
+ gsLQC1(X_BASE,A3,A2,0)
+ gsLQC1(X_BASE,A5,A4,1)
+ gsLQC1(X_BASE,A7,A6,2)
+ gsLQC1(X_BASE,A9,A8,3)
+
+ gsLQC1(X_BASE,A11,A10,4)
+ gsLQC1(X_BASE,A13,A12,5)
+ gsLQC1(X_BASE,A15,A14,6)
+ LD a16, 14 * SIZE(X)
+
- gsLQC1(Y_BASE,B2,B1,0*SIZE)
- gsLQC1(Y_BASE,B4,B3,2*SIZE)
- gsLQC1(Y_BASE,B6,B5,4*SIZE)
- gsLQC1(Y_BASE,B8,B7,6*SIZE)
+ gsLQC1(Y_BASE,B2,B1,0)
+ gsLQC1(Y_BASE,B4,B3,1)
+ gsLQC1(Y_BASE,B6,B5,2)
+ gsLQC1(Y_BASE,B8,B7,3)
- blez I, .L13
+ blez I, .L32
NOP
.align 5
.L31:
MADD t1, b1, ALPHA, a1
MADD t2, b2, ALPHA, a2
- gsSQC1(Y_BASE, T2, T1, 0*SIZE)
- gsLQC1(Y_BASE,B2,B1,8*SIZE)
+ gsSQC1(Y_BASE, T2, T1, 0) // this patch rewrites every N*SIZE offset as N/2; presumably the macro offset counts 16-byte slots -- confirm in the macro definition
+ gsLQC1(Y_BASE,B2,B1,4) // refill b1/b2 from slot 4; they are reused below with a9/a10
MADD t3, b3, ALPHA, a3
MADD t4, b4, ALPHA, a4
- gsSQC1(Y_BASE, T4, T3, 2*SIZE)
- gsLQC1(Y_BASE,B4,B3,10*SIZE)
+ gsSQC1(Y_BASE, T4, T3, 1)
+ gsLQC1(Y_BASE,B4,B3,5) // b3/b4 are reused below with a11/a12
PREFETCHD(PREFETCH_DISTANCE*SIZE(Y))
PREFETCHD((PREFETCH_DISTANCE+4)*SIZE(Y))
MADD t1, b5, ALPHA, a5
MADD t2, b6, ALPHA, a6
- gsSQC1(Y_BASE, T2, T1, 4*SIZE)
- gsLQC1(Y_BASE,B6,B5,12*SIZE)
+ gsSQC1(Y_BASE, T2, T1, 2)
+ gsLQC1(Y_BASE,B6,B5,6) // b5/b6 are reused below with a13/a14
MADD t3, b7, ALPHA, a7
MADD t4, b8, ALPHA, a8
- gsSQC1(Y_BASE, T4, T3, 6*SIZE)
- gsLQC1(Y_BASE,B8,B7,14*SIZE)
+ gsSQC1(Y_BASE, T4, T3, 3)
+ gsLQC1(Y_BASE,B8,B7,7) // b7/b8 are reused below with a15/a16
PREFETCHD((PREFETCH_DISTANCE+8)*SIZE(Y))
PREFETCHD((PREFETCH_DISTANCE+12)*SIZE(Y))
MADD t1, b1, ALPHA, a9
MADD t2, b2, ALPHA, a10
- gsSQC1(Y_BASE, T2, T1, 8*SIZE)
- gsLQC1(Y_BASE,B2,B1,16*SIZE)
+ gsSQC1(Y_BASE, T2, T1, 4)
+ gsLQC1(Y_BASE,B2,B1,8) // slots 8-11 are read before Y is advanced at the loop bottom, so they feed the next pass (or .L32)
MADD t3, b3, ALPHA, a11
MADD t4, b4, ALPHA, a12
- gsSQC1(Y_BASE, T4, T3, 10*SIZE)
- gsLQC1(Y_BASE,B4,B3,18*SIZE)
+ gsSQC1(Y_BASE, T4, T3, 5)
+ gsLQC1(Y_BASE,B4,B3,9)
PREFETCHD(PREFETCH_DISTANCE*SIZE(X))
PREFETCHD((PREFETCH_DISTANCE+4)*SIZE(X))
MADD t1, b5, ALPHA, a13
MADD t2, b6, ALPHA, a14
- gsSQC1(Y_BASE, T2, T1, 12*SIZE)
- gsLQC1(Y_BASE,B6,B5,20*SIZE)
+ gsSQC1(Y_BASE, T2, T1, 6)
+ gsLQC1(Y_BASE,B6,B5,10)
MADD t3, b7, ALPHA, a15
MADD t4, b8, ALPHA, a16
- gsSQC1(Y_BASE, T4, T3, 14*SIZE)
- gsLQC1(Y_BASE,B8,B7,22*SIZE)
+ gsSQC1(Y_BASE, T4, T3, 7)
+ gsLQC1(Y_BASE,B8,B7,11)
PREFETCHD((PREFETCH_DISTANCE+8)*SIZE(X))
PREFETCHD((PREFETCH_DISTANCE+12)*SIZE(X))
- LD a1, 16 * SIZE(X)
- gsLQC1(X_BASE,A3,A2,17*SIZE)
- gsLQC1(X_BASE,A5,A4,19*SIZE)
- gsLQC1(X_BASE,A7,A6,21*SIZE)
- gsLQC1(X_BASE,A9,A8,23*SIZE)
+ LD a1, 15 * SIZE(X) // X was bumped by one SIZE before the loop (daddiu X, X, SIZE), so 15*SIZE(X) is the first element of the next 16-element group
+ gsLQC1(X_BASE,A3,A2,8) // paired loads for a2..a15 of the next group, relative to the bumped X
+ gsLQC1(X_BASE,A5,A4,9)
+ gsLQC1(X_BASE,A7,A6,10)
+ gsLQC1(X_BASE,A9,A8,11)
- gsLQC1(X_BASE,A11,A10,25*SIZE)
- gsLQC1(X_BASE,A13,A12,27*SIZE)
- gsLQC1(X_BASE,A15,A14,29*SIZE)
- LD a16, 31 * SIZE(X)
+ gsLQC1(X_BASE,A11,A10,12)
+ gsLQC1(X_BASE,A13,A12,13)
+ gsLQC1(X_BASE,A15,A14,14)
+ LD a16, 30 * SIZE(X) // scalar load of the element right after the A15/A14 pair; 30 rather than 31 because of the one-SIZE bump of X
daddiu I, I, -1
daddiu Y, Y, 16 * SIZE
daddiu X, X, 16 * SIZE
bgtz I, .L31
-
- //jump back to the remain loop process.
- b .L13
+
+ .align 5
+//Loop end (.L32): drain the last 16-element group of the path that loads a1/a16 with scalar LD. Reached from "blez I, .L32" or by falling out of the .L31 loop; a1..a16 and b1..b8 are already loaded on both entries, so no X loads and no prefetches are issued here.
+.L32:
+
+ MADD t1, b1, ALPHA, a1 // presumably t = b + ALPHA*a; MADD is defined outside this chunk -- confirm
+ MADD t2, b2, ALPHA, a2
+ gsSQC1(Y_BASE, T2, T1, 0) // write the first result pair back to Y
+ gsLQC1(Y_BASE,B2,B1,4) // refill b1/b2; they are reused below with a9/a10
+
+ MADD t3, b3, ALPHA, a3
+ MADD t4, b4, ALPHA, a4
+ gsSQC1(Y_BASE, T4, T3, 1)
+ gsLQC1(Y_BASE,B4,B3,5) // b3/b4 are reused below with a11/a12
+
+
+ MADD t1, b5, ALPHA, a5
+ MADD t2, b6, ALPHA, a6
+ gsSQC1(Y_BASE, T2, T1, 2)
+ gsLQC1(Y_BASE,B6,B5,6) // b5/b6 are reused below with a13/a14
+
+ MADD t3, b7, ALPHA, a7
+ MADD t4, b8, ALPHA, a8
+ gsSQC1(Y_BASE, T4, T3, 3)
+ gsLQC1(Y_BASE,B8,B7,7) // last Y read of this block; b7/b8 are reused below with a15/a16
+
+
+ MADD t1, b1, ALPHA, a9
+ MADD t2, b2, ALPHA, a10
+ gsSQC1(Y_BASE, T2, T1, 4) // second half: stores only, B is not reloaded again
+
+
+ MADD t3, b3, ALPHA, a11
+ MADD t4, b4, ALPHA, a12
+ gsSQC1(Y_BASE, T4, T3, 5)
+
+
+ MADD t1, b5, ALPHA, a13
+ MADD t2, b6, ALPHA, a14
+ gsSQC1(Y_BASE, T2, T1, 6)
+
+
+ MADD t3, b7, ALPHA, a15
+ MADD t4, b8, ALPHA, a16
+ gsSQC1(Y_BASE, T4, T3, 7)
+
+
+ daddiu X, X, 15 * SIZE // 15, not 16: X was already advanced by one SIZE before the first gsLQC1 of this path
+ daddiu Y, Y, 16 * SIZE // Y was not pre-advanced, so it moves by a full group
+
+ //Jump back to the remainder processing at .L15 (outside this chunk).
+ b .L15 // NOTE(review): no NOP follows this branch, unlike the blez/NOP pairs earlier -- confirm the delay slot is covered
.align 5
//INCX!=1 or INCY != 1