#define ASSEMBLER
#include "common.h"
-
#define FETCH ld
#define gsLQC1(base,fq,ft,offset) .word(0x32<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq)
#define gsSQC1(base,fq,ft,offset) .word(0x3A<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq)
-
#define STACKSIZE 160
#define M $4
#define N $5
## MADD3 a*d
## MADD4 d*b
##################################
-####if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define MADD1 MADD
#define MADD2 MADD
#define MADD3 MADD
#define MADD4 NMSUB
-###endif
+#endif
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
#define MADD1 MADD
dsra J, N, 1 # J=N/2
ST ALPHA_R, 128($sp) # store alpha_r & alpha_i
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ neg KK, OFFSET # KK = -OFFSET
+#endif
dsll LDC, LDC, ZBASE_SHIFT # LDC*SIZE*COMPSIZE
blez J, .L20 # J <= 0: branch to .L20
.align 5
.L10: # loop head; .L19 branches back here while J > 0
+#if defined(TRMMKERNEL) && defined(LEFT)
+ move KK, OFFSET # KK = OFFSET at the start of every pass
+#endif
+
daddiu J, J, -1 # one pass of the J loop consumed
dsra I, M, 1 # I=M/2
daddu CO2, C, LDC # CO2 = C + LDC
move AO, A # Reset AO
- daddu PREB, PREB, B # PREA=A+panel size
-
blez I, .L30 # I <= 0: branch to .L30
daddu PREA, PREA, A # PREA=A+panel size
.L11: # tile setup: pick AO/BO start, clear c11..c44, put the unrolled trip count in L
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ move BO, B # BO = B, AO left as is
+#else
+ dsll L, KK, 1 + ZBASE_SHIFT # MR=NR=2
+ dsll TEMP, KK, 1 + ZBASE_SHIFT # TEMP = KK << (1 + ZBASE_SHIFT), same amount as L
+
+ daddu AO, AO, L # AO += L
+ daddu BO, B, TEMP # BO = B + TEMP
+#endif
+ MTC $0, c11 # Clear results regs
+ MOV c12, c11 # remaining accumulators are copied from the cleared c11
+ gsLQC1(R12, F1, F0, 0) # R:a1 I:a2
+
+ MOV c13, c11
+ MOV c14, c11
+ gsLQC1(R13, F5, F4, 0) # R:b1 I:b2
+
+ MOV c21, c11
+ MOV c22, c11
+ gsLQC1(R12, F3, F2, 1) # R:a3 I:a4
+
+ MOV c23, c11
+ MOV c24, c11
+ gsLQC1(R13, F7, F6, 1) # R:b2 I:b3
+
+ FETCH $0, 0 * SIZE(CO2) # ld into $0: value discarded (presumably warms the cache for C)
+ MOV c31, c11
+ MOV c32, c11
+
+ FETCH $0, 0 * SIZE(CO1) # same discarded load, CO1 side
+ MOV c33, c11
+ MOV c34, c11
+
+ FETCH $0, 4 * SIZE(CO2) # discarded load 4 elements further on
+ MOV c41, c11
+ MOV c42, c11
+
+ FETCH $0, 4 * SIZE(CO1) # discarded load 4 elements further on
+ MOV c43, c11
+ MOV c44, c11
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ dsubu TEMP, K, KK # TEMP = K - KK
+#elif defined(LEFT)
+ daddiu TEMP, KK, 2 # TEMP = KK + 2
+#else
+ daddiu TEMP, KK, 2 # TEMP = KK + 2 (same value as the LEFT branch)
+#endif
+ dsra L, TEMP, 2 # L = TEMP >> 2 (count for the 4x unrolled loop)
+ daddu PREB, PREB, B # PREB = PREB + B
+ blez L, .L15 # L <= 0: branch to .L15
+ NOP
+
+#else
+
dsra L, K, 2 # Unroll K 4 times
move BO, B # BO = B
MOV c24, c11
gsLQC1(R13, F7, F6, 1) # R:b2 I:b3
+ FETCH $0, 0 * SIZE(CO2) # ld into $0: value discarded (presumably warms the cache for C)
MOV c31, c11
MOV c32, c11
+ FETCH $0, 0 * SIZE(CO1) # same discarded load, CO1 side
MOV c33, c11
MOV c34, c11
+ FETCH $0, 4 * SIZE(CO2) # discarded load 4 elements further on
MOV c41, c11
MOV c42, c11
+ FETCH $0, 4 * SIZE(CO1) # discarded load 4 elements further on
MOV c43, c11
+
+ daddu PREB, PREB, B # PREB = PREB + B
blez L, .L15 # L <= 0: branch to .L15
MOV c44, c11 # follows the branch; runs in its delay slot if .set noreorder is active (not visible here)
+#endif
.align 5
.align 5
.L15: # remainder count: L = low two bits of the K-loop length, then reload alpha
+#ifndef TRMMKERNEL
andi L, K, 3 # L = K & 3
LD ALPHA_R, 128($sp) # ALPHA_R <- 128($sp)
+#else
+ andi L, TEMP, 3 # L = TEMP & 3 (TRMM builds use TEMP as the loop length)
+ LD ALPHA_R, 128($sp) # ALPHA_R <- 128($sp)
+#endif
blez L, .L18 # L <= 0: branch to .L18
LD ALPHA_I, 136($sp) # ALPHA_I <- 136($sp)
NOP
.L18: # write-back of the tile addressed by CO1/CO2, then advance to the next tile
-
+#ifndef TRMMKERNEL
ADD c11, c14, c11 # c11 = c14 + c11 (dst-first operand order assumed; ADD is defined outside this view)
LD a1, 0 * SIZE(CO1) # a1 <- element 0 at CO1
ADD c12, c13, c12
ST b3, 2 * SIZE(CO2) # b3 -> element 2 at CO2
ST b4, 3 * SIZE(CO2) # b4 -> element 3 at CO2
- FETCH $0, 4 * SIZE(CO2)
- FETCH $0, 4 * SIZE(CO1)
- FETCH $0, 8 * SIZE(CO2)
- FETCH $0, 8 * SIZE(CO1)
- FETCH $0, 12 * SIZE(CO2)
- FETCH $0, 12 * SIZE(CO1)
- FETCH $0, 16 * SIZE(CO2)
- FETCH $0, 16 * SIZE(CO1)
+#else
+ ADD c11, c14, c11 # pairwise fold of the accumulators into c11, c12, c21, c22, c31, c32, c41, c42
+ ADD c12, c13, c12
+ ADD c21, c24, c21
+ ADD c22, c23, c22
+ ADD c31, c34, c31
+ ADD c32, c33, c32
+ ADD c41, c44, c41
+ ADD c42, c43, c42
+
+ daddiu I, I, -1 # I = I - 1, tested by the bgtz below
+ MUL a1, ALPHA_R, c11 # first the ALPHA_R products
+ MUL a2, ALPHA_R, c12
+ MUL b1, ALPHA_R, c21
+ MUL b2, ALPHA_R, c22
+
+ NMSUB a1, a1, ALPHA_I, c12 # then the ALPHA_I cross terms; presumably a1 -= ALPHA_I*c12 (TODO confirm NMSUB macro)
+ MADD a2, a2, ALPHA_I, c11 # presumably a2 += ALPHA_I*c11 (TODO confirm MADD macro)
+ NMSUB b1, b1, ALPHA_I, c22
+ MADD b2, b2, ALPHA_I, c21
+
+ MUL a3, ALPHA_R, c31 # same two steps for the CO2 half
+ MUL a4, ALPHA_R, c32
+ MUL b3, ALPHA_R, c41
+ MUL b4, ALPHA_R, c42
+
+ NMSUB a3, a3, ALPHA_I, c32
+ MADD a4, a4, ALPHA_I, c31
+ NMSUB b3, b3, ALPHA_I, c42
+ MADD b4, b4, ALPHA_I, c41
+
+ ST a1, 0 * SIZE(CO1) # this branch stores without first loading from CO1/CO2
+ ST a2, 1 * SIZE(CO1)
+ ST b1, 2 * SIZE(CO1)
+ ST b2, 3 * SIZE(CO1)
+
+ ST a3, 0 * SIZE(CO2)
+ ST a4, 1 * SIZE(CO2)
+ ST b3, 2 * SIZE(CO2)
+ ST b4, 3 * SIZE(CO2)
+
+#if ( defined(LEFT) && defined(TRANSA)) || \
+ (!defined(LEFT) && !defined(TRANSA))
+ dsubu TEMP, K, KK # TEMP = K - KK
+#ifdef LEFT
+ daddiu TEMP, TEMP, -2 # TEMP = K - KK - 2
+#else
+ daddiu TEMP, TEMP, -2 # TEMP = K - KK - 2 (same as the LEFT branch)
+#endif
+
+ dsll L, TEMP, 1 + ZBASE_SHIFT # L = TEMP << (1 + ZBASE_SHIFT)
+ dsll TEMP, TEMP, 1 + ZBASE_SHIFT # TEMP scaled by the same amount
+
+ daddu AO, AO, L # AO += L
+ daddu BO, BO, TEMP # BO += TEMP
+#endif
+
+#ifdef LEFT
+ daddiu KK, KK, 2 # KK += 2
+#endif
+#endif
+
+ dsll PREB, K, 1 + ZBASE_SHIFT # PREB = K << (1 + ZBASE_SHIFT)
daddiu CO1,CO1, 4 * SIZE # CO1 += 4 elements
bgtz I, .L11 # I > 0: branch back to .L11
daddiu CO2,CO2, 4 * SIZE # follows the branch; runs in its delay slot if .set noreorder is active (not visible here)
-
+ .align 5
.L30: # entered when I <= 0 at .L10 (or by fall-through): handles the I = M & 1 case
andi I, M, 1 # I = M & 1
daddu C, C, LDC # Change C to next panel
blez I, .L19 # I <= 0: branch to .L19
daddu C, C, LDC # Change C to next panel
- dsra L, K, 2 # Unroll K 4 times
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move BO, B # BO = B, AO left as is
+#else
+ dsll L, KK, ZBASE_SHIFT # MR=1
+ dsll TEMP, KK, 1 + ZBASE_SHIFT # NR=2
+
+ daddu AO, AO, L # AO += L
+ daddu BO, B, TEMP # BO = B + TEMP
+#endif
gsLQC1(R12, F1, F0, 0) # R:a1 I:a2
MTC $0, c11 # Clear results regs
MOV c32, c11
FETCH $0, 0 * SIZE(PREB) # ld into $0 at PREB: value discarded
+ MOV c33, c11
+ MOV c34, c11
+
+ FETCH $0, 0 * SIZE(CO1) # discarded loads from CO1/CO2 (presumably warm the cache for C)
+ FETCH $0, 0 * SIZE(CO2)
+ FETCH $0, 4 * SIZE(CO1)
+ FETCH $0, 4 * SIZE(CO2)
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ dsubu TEMP, K, KK # TEMP = K - KK
+#elif defined(LEFT)
+ daddiu TEMP, KK, 1 # MR=1
+#else
+ daddiu TEMP, KK, 2 # NR=2
+#endif
+ dsra L, TEMP, 2 # L = TEMP >> 2 (count for the 4x unrolled loop)
+ blez L, .L35 # L <= 0: branch to .L35
+ NOP
+
+#else
+
+ gsLQC1(R12, F1, F0, 0) # R:a1 I:a2
+ dsra L, K, 2 # Unroll K 4 times
+ move BO, B # BO = B
+
+ gsLQC1(R13, F5, F4, 0) # R:b1 I:b2
+ MTC $0, c11 # Clear results regs
+ MOV c12, c11 # remaining accumulators are copied from the cleared c11
+
+ gsLQC1(R13, F7, F6, 1) # R:b2 I:b3
+ MOV c13, c11
+ MOV c14, c11
+
+ FETCH $0, 0 * SIZE(PREB) # ld into $0 at PREB: value discarded
+ MOV c31, c11
+ MOV c32, c11
+
FETCH $0, 0 * SIZE(CO1) # discarded loads from CO1/CO2 (presumably warm the cache for C)
FETCH $0, 0 * SIZE(CO2)
FETCH $0, 4 * SIZE(CO1)
MOV c33, c11
blez L, .L35 # L <= 0: branch to .L35
MOV c34, c11 # follows the branch; runs in its delay slot if .set noreorder is active (not visible here)
+#endif
.align 5
.L35: # remainder count: L = low two bits of the K-loop length, then reload alpha
+#ifndef TRMMKERNEL
andi L, K, 3 # L = K & 3
LD ALPHA_R, 128($sp) # ALPHA_R <- 128($sp)
- NOP
+#else
+ andi L, TEMP, 3 # L = TEMP & 3 (TRMM builds use TEMP as the loop length)
+ LD ALPHA_R, 128($sp) # ALPHA_R <- 128($sp)
+#endif
blez L, .L38 # L <= 0: branch to .L38
LD ALPHA_I, 136($sp) # ALPHA_I <- 136($sp)
.align 5
.L36: # remainder step; only its first lines are visible in this view
-
daddiu L, L, -1 # L = L - 1
MADD1 c11, c11, a1, b1 # axc A1xB1
MADD3 c13, c13, a1, b2 # axd
gsLQC1(R13, F7, F6, 1) # R:b2 I:b3
.L38: # write-back of one element pair to each of CO1 and CO2, then TRMM pointer fix-up
+#ifndef TRMMKERNEL
ADD c11, c14, c11 # c11 = c14 + c11 (dst-first operand order assumed; ADD is defined outside this view)
LD a1, 0 * SIZE(CO1) # a1 <- element 0 at CO1
ADD c12, c13, c12
daddiu CO1,CO1, 2 * SIZE # CO1 += 2 elements
daddiu CO2,CO2, 2 * SIZE # CO2 += 2 elements
+#else
+ ADD c11, c14, c11 # pairwise fold of the accumulators into c11, c12, c31, c32
+ ADD c12, c13, c12
+
+ ADD c31, c34, c31
+ ADD c32, c33, c32
+
+ MUL a1, ALPHA_R, c11 # first the ALPHA_R products
+ MUL a2, ALPHA_R, c12
+ MUL a3, ALPHA_R, c31
+ MUL a4, ALPHA_R, c32
+
+ NMSUB a1, a1, ALPHA_I, c12 # then the ALPHA_I cross terms; presumably a1 -= ALPHA_I*c12 (TODO confirm NMSUB macro)
+ MADD a2, a2, ALPHA_I, c11 # presumably a2 += ALPHA_I*c11 (TODO confirm MADD macro)
+
+ NMSUB a3, a3, ALPHA_I, c32
+ MADD a4, a4, ALPHA_I, c31
+
+ ST a1, 0 * SIZE(CO1) # this branch stores without first loading from CO1/CO2
+ ST a2, 1 * SIZE(CO1)
+
+ ST a3, 0 * SIZE(CO2)
+ ST a4, 1 * SIZE(CO2)
+
+ daddiu CO1,CO1, 2 * SIZE # CO1 += 2 elements
+ daddiu CO2,CO2, 2 * SIZE # CO2 += 2 elements
+
+#if ( defined(LEFT) && defined(TRANSA)) || \
+ (!defined(LEFT) && !defined(TRANSA))
+ dsubu TEMP, K, KK # TEMP = K - KK
+#ifdef LEFT
+ daddiu TEMP, TEMP, -1 # TEMP = K - KK - 1
+#else
+ daddiu TEMP, TEMP, -2 # TEMP = K - KK - 2
+#endif
+ dsll L, TEMP, ZBASE_SHIFT # L = TEMP << ZBASE_SHIFT
+ dsll TEMP, TEMP, 1 + ZBASE_SHIFT # TEMP = TEMP << (1 + ZBASE_SHIFT), twice L
+
+ daddu AO, AO, L # AO += L
+ daddu BO, BO, TEMP # BO += TEMP
+#endif
+
+#ifdef LEFT
+ daddiu KK, KK, 1 # KK += 1
+#endif
+#endif
.align 5
.L19: # bottom of the J loop started at .L10
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ daddiu KK, KK, 2 # KK += 2
+#endif
+
bgtz J, .L10 # J > 0: branch back to .L10
move B, BO # B = BO; follows the branch, runs in its delay slot if .set noreorder is active (not visible here)
dsra I, M, 1 # I=M/2
move CO1, C # CO1 = C
+#if defined(TRMMKERNEL) && defined(LEFT)
+ move KK, OFFSET # KK = OFFSET
+#endif
+
move AO, A # Reset AO
blez I, .L29 # I <= 0: branch to .L29
daddu PREA, PREA, A # PREA = PREA + A
.L21: # tile setup (CO1 only): pick AO/BO start, clear c11..c24, put the unrolled trip count in L
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ move BO, B # BO = B, AO left as is
+#else
+ dsll L, KK, 1 + ZBASE_SHIFT # L = KK << (1 + ZBASE_SHIFT)
+ dsll TEMP, KK, ZBASE_SHIFT # TEMP = KK << ZBASE_SHIFT, half of L
+
+ daddu AO, AO, L # AO += L
+ daddu BO, B, TEMP # BO = B + TEMP
+#endif
+ gsLQC1(R12, F1, F0, 0) # R:a1 I:a2
+ MTC $0, c11 # Clear results regs
+ MOV c12, c11 # remaining accumulators are copied from the cleared c11
+
+ gsLQC1(R13, F5, F4, 0) # R:b1 I:b2
+ MOV c13, c11
+ MOV c14, c11
+
+ gsLQC1(R12, F3, F2, 1) # R:a3 I:a4
+ MOV c21, c11
+ MOV c22, c11
+
+ FETCH $0, 0 * SIZE(PREA) # ld into $0 at PREA: value discarded
+ MOV c23, c11
+ MOV c24, c11
+
+ FETCH $0, 0 * SIZE(CO1) # discarded loads from CO1 (presumably warm the cache for C)
+ FETCH $0, 4 * SIZE(CO1)
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ dsubu TEMP, K, KK # TEMP = K - KK
+#elif defined(LEFT)
+ daddiu TEMP, KK, 2 # define Mr=2
+#else
+ daddiu TEMP, KK, 1 # define NR=1
+#endif
+ dsra L, TEMP, 2 # L = TEMP >> 2 (count for the 4x unrolled loop)
+ blez L, .L25 # L <= 0: branch to .L25
+ NOP
+
+#else
dsra L, K, 2 # Unroll K 4 times
move BO, B # BO = B
blez L, .L25 # L <= 0: branch to .L25
NOP
+#endif
- .align 3
+ .align 5
.L22: # unrolled loop body; only its first line is visible in this view
gsLQC1(R12, F9, F8, 2) # Unroll K=1
.L25: # remainder count: L = low two bits of the K-loop length, then reload alpha
+#ifndef TRMMKERNEL
andi L, K, 3 # L = K & 3
LD ALPHA_R, 128($sp) # ALPHA_R <- 128($sp)
-
+#else
+ andi L, TEMP, 3 # L = TEMP & 3 (TRMM builds use TEMP as the loop length)
+ LD ALPHA_R, 128($sp) # ALPHA_R <- 128($sp)
+#endif
blez L, .L28 # L <= 0: branch to .L28
LD ALPHA_I, 136($sp) # ALPHA_I <- 136($sp)
.align 3
.L26: # remainder step; only its first lines are visible in this view
-
daddiu L, L, -1 # L = L - 1
MADD1 c11, c11, a1, b1 # axc A1xB1
MADD3 c13, c13, a1, b2 # axd
FETCH $0, 0 * SIZE(PREA) # ld into $0 at PREA: value discarded
.L28: # write-back of four elements at CO1, then TRMM pointer fix-up and loop test on I
+#ifndef TRMMKERNEL
ADD c11, c14, c11 # c11 = c14 + c11 (dst-first operand order assumed; ADD is defined outside this view)
LD a1, 0 * SIZE(CO1) # a1 <- element 0 at CO1
ADD c12, c13, c12
ST b1, 2 * SIZE(CO1) # b1 -> element 2 at CO1
ST b2, 3 * SIZE(CO1) # b2 -> element 3 at CO1
+#else
+ ADD c11, c14, c11 # pairwise fold of the accumulators into c11, c12, c21, c22
+ ADD c12, c13, c12
+ ADD c21, c24, c21
+ ADD c22, c23, c22
+
+ daddiu I, I, -1 # I = I - 1, tested by the bgtz below
+ MUL a1, ALPHA_R, c11 # first the ALPHA_R products
+ MUL a2, ALPHA_R, c12
+ MUL b1, ALPHA_R, c21
+ MUL b2, ALPHA_R, c22
+
+ NMSUB a1, a1, ALPHA_I, c12 # then the ALPHA_I cross terms; presumably a1 -= ALPHA_I*c12 (TODO confirm NMSUB macro)
+ MADD a2, a2, ALPHA_I, c11 # presumably a2 += ALPHA_I*c11 (TODO confirm MADD macro)
+ NMSUB b1, b1, ALPHA_I, c22
+ MADD b2, b2, ALPHA_I, c21
+
+ ST a1, 0 * SIZE(CO1) # this branch stores without first loading from CO1
+ ST a2, 1 * SIZE(CO1)
+ ST b1, 2 * SIZE(CO1)
+ ST b2, 3 * SIZE(CO1)
+
+#if ( defined(LEFT) && defined(TRANSA)) || \
+ (!defined(LEFT) && !defined(TRANSA))
+ dsubu TEMP, K, KK # TEMP = K - KK
+#ifdef LEFT
+ daddiu TEMP, TEMP, -2 # TEMP = K - KK - 2
+#else
+ daddiu TEMP, TEMP, -1 # TEMP = K - KK - 1
+#endif
+
+ dsll L, TEMP, 1 + ZBASE_SHIFT # L = TEMP << (1 + ZBASE_SHIFT)
+ dsll TEMP, TEMP, ZBASE_SHIFT # TEMP = TEMP << ZBASE_SHIFT, half of L
+
+ daddu AO, AO, L # AO += L
+ daddu BO, BO, TEMP # BO += TEMP
+#endif
+
+#ifdef LEFT
+ daddiu KK, KK, 2 # KK += 2
+#endif
+#endif
daddiu CO1,CO1, 4 * SIZE # CO1 += 4 elements
bgtz I, .L21 # I > 0: branch back to .L21
NOP
blez I, .L999 # I <= 0: branch to .L999 (the code that sets I for this test is not visible here)
NOP
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ move BO, B # tile setup: BO = B, AO left as is
+#else
+ dsll TEMP, KK, ZBASE_SHIFT # tile setup: TEMP = KK << ZBASE_SHIFT
+
+ daddu AO, AO, TEMP # AO += TEMP
+ daddu BO, B, TEMP # BO = B + TEMP (same offset as AO)
+#endif
+
+ gsLQC1(R12, F1, F0, 0) # R:a1 I:a2
+ MTC $0, c11 # Clear results regs
+ MOV c12, c11 # remaining accumulators are copied from the cleared c11
+
+ gsLQC1(R13, F5, F4, 0) # R:b1 I:b2
+ MOV c13, c11
+ MOV c14, c11
+
+ FETCH $0, 0 * SIZE(PREA) # ld into $0 at PREA: value discarded
+ FETCH $0, 4 * SIZE(PREA)
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ dsubu TEMP, K, KK # TEMP = K - KK
+#elif defined(LEFT)
+ daddiu TEMP, KK, 1 # TEMP = KK + 1
+#else
+ daddiu TEMP, KK, 1 # TEMP = KK + 1 (same value as the LEFT branch)
+#endif
+ dsra L, TEMP, 2 # L = TEMP >> 2 (count for the 4x unrolled loop)
+ blez L, .L45 # L <= 0: branch to .L45
+ NOP
+
+#else
dsra L, K, 2 # Unroll K 4 times
move BO, B # BO = B
FETCH $0, 4 * SIZE(PREA) # ld into $0 at PREA: value discarded
blez L, .L45 # L <= 0: branch to .L45
NOP
+#endif
.align 3
.align 5
.L45: # remainder count: L = low two bits of the K-loop length, then reload alpha
+#ifndef TRMMKERNEL
andi L, K, 3 # L = K & 3
LD ALPHA_R, 128($sp) # ALPHA_R <- 128($sp)
+#else
+ andi L, TEMP, 3 # L = TEMP & 3 (TRMM builds use TEMP as the loop length)
+ LD ALPHA_R, 128($sp) # ALPHA_R <- 128($sp)
+#endif
blez L, .L48 # L <= 0: branch to .L48
LD ALPHA_I, 136($sp) # ALPHA_I <- 136($sp)
NOP
.L48: # write-back of one element pair at CO1, then TRMM pointer fix-up
+#ifndef TRMMKERNEL
ADD c11, c14, c11 # c11 = c14 + c11 (dst-first operand order assumed; ADD is defined outside this view)
ADD c12, c13, c12
ST a1, 0 * SIZE(CO1) # a1 -> element 0 at CO1
ST a2, 1 * SIZE(CO1) # a2 -> element 1 at CO1
+#else
+ ADD c11, c14, c11 # pairwise fold of the accumulators into c11, c12
+ ADD c12, c13, c12
+
+ MUL a1, ALPHA_R, c11 # first the ALPHA_R products
+ MUL a2, ALPHA_R, c12
+
+ NMSUB a1, a1, ALPHA_I, c12 # then the ALPHA_I cross terms; presumably a1 -= ALPHA_I*c12 (TODO confirm NMSUB macro)
+ MADD a2, a2, ALPHA_I, c11 # presumably a2 += ALPHA_I*c11 (TODO confirm MADD macro)
+
+ ST a1, 0 * SIZE(CO1) # this branch stores without first loading from CO1
+ ST a2, 1 * SIZE(CO1)
+
+#if ( defined(LEFT) && defined(TRANSA)) || \
+ (!defined(LEFT) && !defined(TRANSA))
+ dsubu TEMP, K, KK # TEMP = K - KK
+#ifdef LEFT
+ daddiu TEMP, TEMP, -1 # TEMP = K - KK - 1
+#else
+ daddiu TEMP, TEMP, -1 # TEMP = K - KK - 1 (same as the LEFT branch)
+#endif
+
+ dsll TEMP, TEMP, ZBASE_SHIFT # TEMP = TEMP << ZBASE_SHIFT
+
+ daddu AO, AO, TEMP # AO += TEMP
+ daddu BO, BO, TEMP # BO += TEMP (same offset as AO)
+#endif
+
+#ifdef LEFT
+ daddiu KK, KK, 1 # KK += 1
+#endif
+
daddiu CO1,CO1, 2 * SIZE # CO1 += 2 elements
+#endif