From: Xianyi Zhang
Date: Sat, 5 Mar 2011 02:17:10 +0000 (+0800)
Subject: Support unalign address in daxpy on loongson3a simd..
X-Git-Tag: v0.1alpha1~2^2~3
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=5838f129954fbed873162b66aeae1bb4b9730470;p=platform%2Fupstream%2Fopenblas.git

Support unalign address in daxpy on loongson3a simd..
---

diff --git a/kernel/mips64/daxpy_loongson3a_simd.S b/kernel/mips64/daxpy_loongson3a_simd.S
index 543f1cef..9a0b8f17 100644
--- a/kernel/mips64/daxpy_loongson3a_simd.S
+++ b/kernel/mips64/daxpy_loongson3a_simd.S
@@ -72,7 +72,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 
 
-#define PREFETCH_DISTANCE 1864
+#define PREFETCH_DISTANCE 2016
 
 #define N	$4
 
@@ -195,11 +195,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	dsll	INCY, INCY, BASE_SHIFT
 
 	bne	INCY, TEMP, .L20
+
+	// Is Y 16-byte aligned?  (Only address bit 3 is tested.)
+	andi	TEMP, Y, 8
+	beq	TEMP, $0, .L10
+	// Y is not aligned: process one element with scalar code so that
+	// Y advances by SIZE before the unrolled loops below.
+	LD	a1, 0 * SIZE(X)
+	LD	b1, 0 * SIZE(Y)
+
+	daddiu	X, X, SIZE
+	daddiu	Y, Y, SIZE
+
+	MADD	t1, b1, ALPHA, a1
+	daddiu	N, N, -1
+
+	ST	t1, -1 * SIZE(Y)
+	blez	N, .L999
+	NOP				// explicit filler after the branch, as after "blez I, .L999"
+	.align 5
+
+.L10:
+
 	dsra	I, N, 4
 
 	blez	I, .L15
 	daddiu	I, I, -1
+
+	// Y is aligned here; now test X.
+	// Is X 16-byte aligned?  If not, use the unaligned-X loop at .L30.
+	andi	TEMP, X, 8
+	bne	TEMP, $0, .L30
+	NOP				// keeps the aligned-only load at .L11 out of the branch slot
+	.align 5
 
+.L11:
+	// X and Y are both aligned.
 	gsLQC1(X_BASE,A2,A1,0*SIZE)
 	gsLQC1(X_BASE,A4,A3,2*SIZE)
 	gsLQC1(X_BASE,A6,A5,4*SIZE)
@@ -345,7 +376,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	blez	I, .L999
 	NOP
 
-	.align 3
+	.align 5
 
 .L16:
 	LD	a1, 0 * SIZE(X)
@@ -382,6 +413,114 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	NOP
 
 	.align 5
+.L30:
+	// Y aligned, X not aligned, INCX == INCY == 1; unrolled by 16.
+	// X is fetched as one scalar (a1), seven gsLQC1 pairs at odd element
+	// offsets, and one scalar (a16).
+	// NOTE(review): gsLQC1 is defined outside this diff; this assumes its
+	// last argument may be an odd multiple of SIZE - confirm.
+
+	LD	a1, 0 * SIZE(X)
+	gsLQC1(X_BASE,A3,A2,1*SIZE)
+	gsLQC1(X_BASE,A5,A4,3*SIZE)
+	gsLQC1(X_BASE,A7,A6,5*SIZE)
+	gsLQC1(X_BASE,A9,A8,7*SIZE)
+
+	// Element 9 (not 8): continues after the 7*SIZE pair, as 25*SIZE does in .L31.
+	gsLQC1(X_BASE,A11,A10,9*SIZE)
+	gsLQC1(X_BASE,A13,A12,11*SIZE)
+	gsLQC1(X_BASE,A15,A14,13*SIZE)
+	LD	a16, 15 * SIZE(X)
+
+	gsLQC1(Y_BASE,B2,B1,0*SIZE)
+	gsLQC1(Y_BASE,B4,B3,2*SIZE)
+	gsLQC1(Y_BASE,B6,B5,4*SIZE)
+	gsLQC1(Y_BASE,B8,B7,6*SIZE)
+
+	blez	I, .L13
+	NOP
+	.align 5
+
+.L31:
+	// Each pass uses a1..a16 (loaded by .L30 or by the previous pass),
+	// stores 16 results to Y, and reloads X and Y for the next pass.
+	MADD	t1, b1, ALPHA, a1
+	MADD	t2, b2, ALPHA, a2
+	gsSQC1(Y_BASE, T2, T1, 0*SIZE)
+	gsLQC1(Y_BASE,B2,B1,8*SIZE)
+
+	MADD	t3, b3, ALPHA, a3
+	MADD	t4, b4, ALPHA, a4
+	gsSQC1(Y_BASE, T4, T3, 2*SIZE)
+	gsLQC1(Y_BASE,B4,B3,10*SIZE)
+
+	PREFETCHD(PREFETCH_DISTANCE*SIZE(Y))
+	PREFETCHD((PREFETCH_DISTANCE+4)*SIZE(Y))
+
+	MADD	t1, b5, ALPHA, a5
+	MADD	t2, b6, ALPHA, a6
+	gsSQC1(Y_BASE, T2, T1, 4*SIZE)
+	gsLQC1(Y_BASE,B6,B5,12*SIZE)
+
+	MADD	t3, b7, ALPHA, a7
+	MADD	t4, b8, ALPHA, a8
+	gsSQC1(Y_BASE, T4, T3, 6*SIZE)
+	gsLQC1(Y_BASE,B8,B7,14*SIZE)
+
+	PREFETCHD((PREFETCH_DISTANCE+8)*SIZE(Y))
+	PREFETCHD((PREFETCH_DISTANCE+12)*SIZE(Y))
+
+	MADD	t1, b1, ALPHA, a9
+	MADD	t2, b2, ALPHA, a10
+	gsSQC1(Y_BASE, T2, T1, 8*SIZE)
+	gsLQC1(Y_BASE,B2,B1,16*SIZE)
+
+	MADD	t3, b3, ALPHA, a11
+	MADD	t4, b4, ALPHA, a12
+	gsSQC1(Y_BASE, T4, T3, 10*SIZE)
+	gsLQC1(Y_BASE,B4,B3,18*SIZE)
+
+	PREFETCHD(PREFETCH_DISTANCE*SIZE(X))
+	PREFETCHD((PREFETCH_DISTANCE+4)*SIZE(X))
+
+	MADD	t1, b5, ALPHA, a13
+	MADD	t2, b6, ALPHA, a14
+	gsSQC1(Y_BASE, T2, T1, 12*SIZE)
+	gsLQC1(Y_BASE,B6,B5,20*SIZE)
+
+	MADD	t3, b7, ALPHA, a15
+	MADD	t4, b8, ALPHA, a16
+	gsSQC1(Y_BASE, T4, T3, 14*SIZE)
+	gsLQC1(Y_BASE,B8,B7,22*SIZE)
+
+	PREFETCHD((PREFETCH_DISTANCE+8)*SIZE(X))
+	PREFETCHD((PREFETCH_DISTANCE+12)*SIZE(X))
+
+	// Reload X for the next pass: same layout as in .L30, 16 elements further on.
+	LD	a1, 16 * SIZE(X)
+	gsLQC1(X_BASE,A3,A2,17*SIZE)
+	gsLQC1(X_BASE,A5,A4,19*SIZE)
+	gsLQC1(X_BASE,A7,A6,21*SIZE)
+	gsLQC1(X_BASE,A9,A8,23*SIZE)
+
+	gsLQC1(X_BASE,A11,A10,25*SIZE)
+	gsLQC1(X_BASE,A13,A12,27*SIZE)
+	gsLQC1(X_BASE,A15,A14,29*SIZE)
+	LD	a16, 31 * SIZE(X)
+
+	daddiu	I, I, -1
+	daddiu	Y, Y, 16 * SIZE
+
+	daddiu	X, X, 16 * SIZE
+	bgtz	I, .L31
+	NOP				// keeps the "b .L13" below from directly following this branch
+
+	// Loop finished: jump back to the remaining-elements code at .L13.
+	b	.L13
+	NOP				// keeps the first instruction of .L20 from directly following the jump
+	.align 5
+
+// INCX != 1 or INCY != 1
 .L20:
 	dsra	I, N, 3
 	move	YY, Y
@@ -538,7 +677,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	blez	I, .L999
 	NOP
 
-	.align 3
+	.align 5
 
 .L26:
 	LD	a1, 0 * SIZE(X)