Fixed the bug about Loongson3A gsLQC1 & gsSQC1 instructions in daxpy kernel. Now...
author Xianyi Zhang <traits.zhang@gmail.com>
Fri, 18 Mar 2011 23:05:56 +0000 (23:05 +0000)
committer Xianyi Zhang <traits.zhang@gmail.com>
Fri, 18 Mar 2011 23:05:56 +0000 (23:05 +0000)
kernel/mips64/daxpy_loongson3a_simd.S

index 9a0b8f1..8f53441 100644 (file)
@@ -228,20 +228,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 .L11:
        //X & Y algin
-       gsLQC1(X_BASE,A2,A1,0*SIZE)
-       gsLQC1(X_BASE,A4,A3,2*SIZE)
-       gsLQC1(X_BASE,A6,A5,4*SIZE)
-       gsLQC1(X_BASE,A8,A7,6*SIZE)
-
-       gsLQC1(X_BASE,A10,A9,8*SIZE)
-       gsLQC1(X_BASE,A12,A11,10*SIZE)
-       gsLQC1(X_BASE,A14,A13,12*SIZE)
-       gsLQC1(X_BASE,A16,A15,14*SIZE)
-
-       gsLQC1(Y_BASE,B2,B1,0*SIZE)
-       gsLQC1(Y_BASE,B4,B3,2*SIZE)
-       gsLQC1(Y_BASE,B6,B5,4*SIZE)
-       gsLQC1(Y_BASE,B8,B7,6*SIZE)
+       gsLQC1(X_BASE,A2,A1,0)
+       gsLQC1(X_BASE,A4,A3,1)
+       gsLQC1(X_BASE,A6,A5,2)
+       gsLQC1(X_BASE,A8,A7,3)
+
+       gsLQC1(X_BASE,A10,A9,4)
+       gsLQC1(X_BASE,A12,A11,5)
+       gsLQC1(X_BASE,A14,A13,6)
+       gsLQC1(X_BASE,A16,A15,7)
+
+       gsLQC1(Y_BASE,B2,B1,0)
+       gsLQC1(Y_BASE,B4,B3,1)
+       gsLQC1(Y_BASE,B6,B5,2)
+       gsLQC1(Y_BASE,B8,B7,3)
                
        blez    I, .L13
        NOP
@@ -251,65 +251,65 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
                
        MADD    t1, b1, ALPHA, a1       
        MADD    t2, b2, ALPHA, a2
-       gsSQC1(Y_BASE, T2, T1, 0*SIZE)          
-       gsLQC1(Y_BASE,B2,B1,8*SIZE)
+       gsSQC1(Y_BASE, T2, T1, 0)               
+       gsLQC1(Y_BASE,B2,B1,4)
 
        MADD    t3, b3, ALPHA, a3
        MADD    t4, b4, ALPHA, a4
-       gsSQC1(Y_BASE, T4, T3, 2*SIZE)
-       gsLQC1(Y_BASE,B4,B3,10*SIZE)
+       gsSQC1(Y_BASE, T4, T3, 1)
+       gsLQC1(Y_BASE,B4,B3,5)
 
        PREFETCHD(PREFETCH_DISTANCE*SIZE(Y))
        PREFETCHD((PREFETCH_DISTANCE+4)*SIZE(Y))
 
        MADD    t1, b5, ALPHA, a5
        MADD    t2, b6, ALPHA, a6
-       gsSQC1(Y_BASE, T2, T1, 4*SIZE)          
-       gsLQC1(Y_BASE,B6,B5,12*SIZE)
+       gsSQC1(Y_BASE, T2, T1, 2)               
+       gsLQC1(Y_BASE,B6,B5,6)
 
        MADD    t3, b7, ALPHA, a7
        MADD    t4, b8, ALPHA, a8
-       gsSQC1(Y_BASE, T4, T3, 6*SIZE)
-       gsLQC1(Y_BASE,B8,B7,14*SIZE)
+       gsSQC1(Y_BASE, T4, T3, 3)
+       gsLQC1(Y_BASE,B8,B7, 7)
 
        PREFETCHD((PREFETCH_DISTANCE+8)*SIZE(Y))
        PREFETCHD((PREFETCH_DISTANCE+12)*SIZE(Y))
 
        MADD    t1, b1, ALPHA, a9       
        MADD    t2, b2, ALPHA, a10
-       gsSQC1(Y_BASE, T2, T1, 8*SIZE)          
-       gsLQC1(Y_BASE,B2,B1,16*SIZE)
+       gsSQC1(Y_BASE, T2, T1, 4)               
+       gsLQC1(Y_BASE,B2,B1,8)
 
        MADD    t3, b3, ALPHA, a11
        MADD    t4, b4, ALPHA, a12
-       gsSQC1(Y_BASE, T4, T3, 10*SIZE)
-       gsLQC1(Y_BASE,B4,B3,18*SIZE)
+       gsSQC1(Y_BASE, T4, T3, 5)
+       gsLQC1(Y_BASE,B4,B3,9)
 
        PREFETCHD(PREFETCH_DISTANCE*SIZE(X))
        PREFETCHD((PREFETCH_DISTANCE+4)*SIZE(X))
 
        MADD    t1, b5, ALPHA, a13      
        MADD    t2, b6, ALPHA, a14
-       gsSQC1(Y_BASE, T2, T1, 12*SIZE)         
-       gsLQC1(Y_BASE,B6,B5,20*SIZE)
+       gsSQC1(Y_BASE, T2, T1, 6)               
+       gsLQC1(Y_BASE,B6,B5,10)
 
        MADD    t3, b7, ALPHA, a15
        MADD    t4, b8, ALPHA, a16
-       gsSQC1(Y_BASE, T4, T3, 14*SIZE)
-       gsLQC1(Y_BASE,B8,B7,22*SIZE)
+       gsSQC1(Y_BASE, T4, T3, 7)
+       gsLQC1(Y_BASE,B8,B7,11)
                
        PREFETCHD((PREFETCH_DISTANCE+8)*SIZE(X))
        PREFETCHD((PREFETCH_DISTANCE+12)*SIZE(X))
 
-       gsLQC1(X_BASE,A2,A1,16*SIZE)
-       gsLQC1(X_BASE,A4,A3,18*SIZE)
-       gsLQC1(X_BASE,A6,A5,20*SIZE)
-       gsLQC1(X_BASE,A8,A7,22*SIZE)
+       gsLQC1(X_BASE,A2,A1,8)
+       gsLQC1(X_BASE,A4,A3,9)
+       gsLQC1(X_BASE,A6,A5,10)
+       gsLQC1(X_BASE,A8,A7,11)
 
-       gsLQC1(X_BASE,A10,A9,24*SIZE)
-       gsLQC1(X_BASE,A12,A11,26*SIZE)
-       gsLQC1(X_BASE,A14,A13,28*SIZE)
-       gsLQC1(X_BASE,A16,A15,30*SIZE)
+       gsLQC1(X_BASE,A10,A9,12)
+       gsLQC1(X_BASE,A12,A11,13)
+       gsLQC1(X_BASE,A14,A13,14)
+       gsLQC1(X_BASE,A16,A15,15)
 
 
        daddiu  I, I, -1
@@ -324,44 +324,44 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        MADD    t1, b1, ALPHA, a1       
        MADD    t2, b2, ALPHA, a2
-       gsSQC1(Y_BASE, T2, T1, 0*SIZE)          
-       gsLQC1(Y_BASE,B2,B1,8*SIZE)
+       gsSQC1(Y_BASE, T2, T1, 0)
+       gsLQC1(Y_BASE,B2,B1,4)
 
        MADD    t3, b3, ALPHA, a3
        MADD    t4, b4, ALPHA, a4
-       gsSQC1(Y_BASE, T4, T3, 2*SIZE)
-       gsLQC1(Y_BASE,B4,B3,10*SIZE)
+       gsSQC1(Y_BASE, T4, T3, 1)
+       gsLQC1(Y_BASE,B4,B3,5)
 
 
        MADD    t1, b5, ALPHA, a5
        MADD    t2, b6, ALPHA, a6
-       gsSQC1(Y_BASE, T2, T1, 4*SIZE)          
-       gsLQC1(Y_BASE,B6,B5,12*SIZE)
+       gsSQC1(Y_BASE, T2, T1, 2)
+       gsLQC1(Y_BASE,B6,B5,6)
 
        MADD    t3, b7, ALPHA, a7
        MADD    t4, b8, ALPHA, a8
-       gsSQC1(Y_BASE, T4, T3, 6*SIZE)
-       gsLQC1(Y_BASE,B8,B7,14*SIZE)
+       gsSQC1(Y_BASE, T4, T3, 3)
+       gsLQC1(Y_BASE,B8,B7,7)
 
 
        MADD    t1, b1, ALPHA, a9       
        MADD    t2, b2, ALPHA, a10
-       gsSQC1(Y_BASE, T2, T1, 8*SIZE)          
+       gsSQC1(Y_BASE, T2, T1, 4)
 
 
        MADD    t3, b3, ALPHA, a11
        MADD    t4, b4, ALPHA, a12
-       gsSQC1(Y_BASE, T4, T3, 10*SIZE)
+       gsSQC1(Y_BASE, T4, T3, 5)
 
 
        MADD    t1, b5, ALPHA, a13      
        MADD    t2, b6, ALPHA, a14
-       gsSQC1(Y_BASE, T2, T1, 12*SIZE)         
+       gsSQC1(Y_BASE, T2, T1, 6)
 
 
        MADD    t3, b7, ALPHA, a15
        MADD    t4, b8, ALPHA, a16
-       gsSQC1(Y_BASE, T4, T3, 14*SIZE)
+       gsSQC1(Y_BASE, T4, T3, 7)
 
 
        daddiu  X, X, 16 * SIZE
@@ -415,97 +415,148 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        //unloop 16
        
        LD      a1,  0 * SIZE(X)
-       gsLQC1(X_BASE,A3,A2,1*SIZE)
-       gsLQC1(X_BASE,A5,A4,3*SIZE)
-       gsLQC1(X_BASE,A7,A6,5*SIZE)
-       gsLQC1(X_BASE,A9,A8,7*SIZE)
-
-       gsLQC1(X_BASE,A11,A10,8*SIZE)
-       gsLQC1(X_BASE,A13,A12,11*SIZE)
-       gsLQC1(X_BASE,A15,A14,13*SIZE)
-       LD      a16,  15 * SIZE(X)
+       daddiu  X, X, SIZE
+       gsLQC1(X_BASE,A3,A2,0)
+       gsLQC1(X_BASE,A5,A4,1)
+       gsLQC1(X_BASE,A7,A6,2)
+       gsLQC1(X_BASE,A9,A8,3)
+
+       gsLQC1(X_BASE,A11,A10,4)
+       gsLQC1(X_BASE,A13,A12,5)
+       gsLQC1(X_BASE,A15,A14,6)
+       LD      a16,  14 * SIZE(X)
+
                
-       gsLQC1(Y_BASE,B2,B1,0*SIZE)
-       gsLQC1(Y_BASE,B4,B3,2*SIZE)
-       gsLQC1(Y_BASE,B6,B5,4*SIZE)
-       gsLQC1(Y_BASE,B8,B7,6*SIZE)
+       gsLQC1(Y_BASE,B2,B1,0)
+       gsLQC1(Y_BASE,B4,B3,1)
+       gsLQC1(Y_BASE,B6,B5,2)
+       gsLQC1(Y_BASE,B8,B7,3)
                
-       blez    I, .L13
+       blez    I, .L32
        NOP
        .align 5
        
 .L31:
        MADD    t1, b1, ALPHA, a1       
        MADD    t2, b2, ALPHA, a2
-       gsSQC1(Y_BASE, T2, T1, 0*SIZE)          
-       gsLQC1(Y_BASE,B2,B1,8*SIZE)
+       gsSQC1(Y_BASE, T2, T1, 0)
+       gsLQC1(Y_BASE,B2,B1,4)
 
        MADD    t3, b3, ALPHA, a3
        MADD    t4, b4, ALPHA, a4
-       gsSQC1(Y_BASE, T4, T3, 2*SIZE)
-       gsLQC1(Y_BASE,B4,B3,10*SIZE)
+       gsSQC1(Y_BASE, T4, T3, 1)
+       gsLQC1(Y_BASE,B4,B3,5)
 
        PREFETCHD(PREFETCH_DISTANCE*SIZE(Y))
        PREFETCHD((PREFETCH_DISTANCE+4)*SIZE(Y))
 
        MADD    t1, b5, ALPHA, a5
        MADD    t2, b6, ALPHA, a6
-       gsSQC1(Y_BASE, T2, T1, 4*SIZE)          
-       gsLQC1(Y_BASE,B6,B5,12*SIZE)
+       gsSQC1(Y_BASE, T2, T1, 2)
+       gsLQC1(Y_BASE,B6,B5,6)
 
        MADD    t3, b7, ALPHA, a7
        MADD    t4, b8, ALPHA, a8
-       gsSQC1(Y_BASE, T4, T3, 6*SIZE)
-       gsLQC1(Y_BASE,B8,B7,14*SIZE)
+       gsSQC1(Y_BASE, T4, T3, 3)
+       gsLQC1(Y_BASE,B8,B7,7)
 
        PREFETCHD((PREFETCH_DISTANCE+8)*SIZE(Y))
        PREFETCHD((PREFETCH_DISTANCE+12)*SIZE(Y))
 
        MADD    t1, b1, ALPHA, a9       
        MADD    t2, b2, ALPHA, a10
-       gsSQC1(Y_BASE, T2, T1, 8*SIZE)          
-       gsLQC1(Y_BASE,B2,B1,16*SIZE)
+       gsSQC1(Y_BASE, T2, T1, 4)
+       gsLQC1(Y_BASE,B2,B1,8)
 
        MADD    t3, b3, ALPHA, a11
        MADD    t4, b4, ALPHA, a12
-       gsSQC1(Y_BASE, T4, T3, 10*SIZE)
-       gsLQC1(Y_BASE,B4,B3,18*SIZE)
+       gsSQC1(Y_BASE, T4, T3, 5)
+       gsLQC1(Y_BASE,B4,B3,9)
 
        PREFETCHD(PREFETCH_DISTANCE*SIZE(X))
        PREFETCHD((PREFETCH_DISTANCE+4)*SIZE(X))
 
        MADD    t1, b5, ALPHA, a13      
        MADD    t2, b6, ALPHA, a14
-       gsSQC1(Y_BASE, T2, T1, 12*SIZE)         
-       gsLQC1(Y_BASE,B6,B5,20*SIZE)
+       gsSQC1(Y_BASE, T2, T1, 6)
+       gsLQC1(Y_BASE,B6,B5,10)
 
        MADD    t3, b7, ALPHA, a15
        MADD    t4, b8, ALPHA, a16
-       gsSQC1(Y_BASE, T4, T3, 14*SIZE)
-       gsLQC1(Y_BASE,B8,B7,22*SIZE)
+       gsSQC1(Y_BASE, T4, T3, 7)
+       gsLQC1(Y_BASE,B8,B7,11)
                
        PREFETCHD((PREFETCH_DISTANCE+8)*SIZE(X))
        PREFETCHD((PREFETCH_DISTANCE+12)*SIZE(X))
 
-       LD      a1,  16 * SIZE(X)
-       gsLQC1(X_BASE,A3,A2,17*SIZE)
-       gsLQC1(X_BASE,A5,A4,19*SIZE)
-       gsLQC1(X_BASE,A7,A6,21*SIZE)
-       gsLQC1(X_BASE,A9,A8,23*SIZE)
+       LD      a1,  15 * SIZE(X)
+       gsLQC1(X_BASE,A3,A2,8)
+       gsLQC1(X_BASE,A5,A4,9)
+       gsLQC1(X_BASE,A7,A6,10)
+       gsLQC1(X_BASE,A9,A8,11)
 
-       gsLQC1(X_BASE,A11,A10,25*SIZE)
-       gsLQC1(X_BASE,A13,A12,27*SIZE)
-       gsLQC1(X_BASE,A15,A14,29*SIZE)
-       LD      a16,  31 * SIZE(X)
+       gsLQC1(X_BASE,A11,A10,12)
+       gsLQC1(X_BASE,A13,A12,13)
+       gsLQC1(X_BASE,A15,A14,14)
+       LD      a16,  30 * SIZE(X)
 
        daddiu  I, I, -1
        daddiu  Y, Y, 16 * SIZE
                
        daddiu  X, X, 16 * SIZE
        bgtz    I, .L31
-               
-       //jump back to the remain loop process.
-       b       .L13
+       
+       .align 5
+//Loop end:
+.L32:
+       
+       MADD    t1, b1, ALPHA, a1       
+       MADD    t2, b2, ALPHA, a2
+       gsSQC1(Y_BASE, T2, T1, 0)
+       gsLQC1(Y_BASE,B2,B1,4)
+
+       MADD    t3, b3, ALPHA, a3
+       MADD    t4, b4, ALPHA, a4
+       gsSQC1(Y_BASE, T4, T3, 1)
+       gsLQC1(Y_BASE,B4,B3,5)
+
+
+       MADD    t1, b5, ALPHA, a5
+       MADD    t2, b6, ALPHA, a6
+       gsSQC1(Y_BASE, T2, T1, 2)
+       gsLQC1(Y_BASE,B6,B5,6)
+
+       MADD    t3, b7, ALPHA, a7
+       MADD    t4, b8, ALPHA, a8
+       gsSQC1(Y_BASE, T4, T3, 3)
+       gsLQC1(Y_BASE,B8,B7,7)
+
+
+       MADD    t1, b1, ALPHA, a9       
+       MADD    t2, b2, ALPHA, a10
+       gsSQC1(Y_BASE, T2, T1, 4)
+
+
+       MADD    t3, b3, ALPHA, a11
+       MADD    t4, b4, ALPHA, a12
+       gsSQC1(Y_BASE, T4, T3, 5)
+
+
+       MADD    t1, b5, ALPHA, a13      
+       MADD    t2, b6, ALPHA, a14
+       gsSQC1(Y_BASE, T2, T1, 6)
+
+
+       MADD    t3, b7, ALPHA, a15
+       MADD    t4, b8, ALPHA, a16
+       gsSQC1(Y_BASE, T4, T3, 7)
+
+
+       daddiu  X, X, 15 * SIZE
+       daddiu  Y, Y, 16 * SIZE
+
+       //jump back to the remain process.
+       b       .L15
        .align 5
        
 //INCX!=1 or INCY != 1