#define N [fp, #-260 ]
#define K [fp, #-264 ]
+#define FP_ZERO [fp, #-236] // 8-byte frame slot; the prologue stores zero words here and the INIT macros load it as a double
+#define FP_ZERO_0 [fp, #-236] // first word of the slot (same address as FP_ZERO)
+#define FP_ZERO_1 [fp, #-232] // second word of the slot, 4 bytes above FP_ZERO_0
+
#define ALPHA_I [fp, #-272]
#define ALPHA_R [fp, #-280]
+#if !defined(__ARM_PCS_VFP) // not built for the hard-float (VFP) procedure-call variant
+#define OLD_ALPHAR_SOFTFP [fp, #4] // stack location the prologue loads into OLD_ALPHA_R
+#define OLD_ALPHAI_SOFTFP [fp, #12] // stack location the prologue loads into OLD_ALPHA_I (8 bytes after the real part)
+#define OLD_A_SOFTFP [fp, #20 ] // stack location the prologue loads into OLD_A
+#define B [fp, #24 ] // B, C, OLD_LDC and OFFSET each sit 20 bytes higher than in the #else branch
+#define C [fp, #28 ]
+#define OLD_LDC [fp, #32 ]
+#define OFFSET [fp, #36 ]
+#else // __ARM_PCS_VFP is defined: stack arguments start directly with B
#define B [fp, #4 ]
#define C [fp, #8 ]
#define OLD_LDC [fp, #12 ]
#define OFFSET [fp, #16 ]
+#endif
#define I r0
#define J r1
#define B_PRE 96
#define C_PRE 64
-#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define FADD_R fsubd
#define FADD_I faddd
- #define FMAC_R1 fnmuld
- #define FMAC_R2 fnmacd
+ #define FMAC_R1 vnmul.f64 // unified-syntax spelling of the replaced fnmuld
+ #define FMAC_R2 vmls.f64 // unified-syntax spelling of the replaced fnmacd
#define FMAC_I1 fmuld
- #define FMAC_I2 fnmacd
+ #define FMAC_I2 vmls.f64 // unified-syntax spelling of the replaced fnmacd
#elif defined(CN) || defined(CT)
#define FMAC_R1 fmuld
#define FMAC_R2 fmacd
- #define FMAC_I1 fnmuld
+ #define FMAC_I1 vnmul.f64 // unified-syntax spelling of the replaced fnmuld
#define FMAC_I2 fmacd
#elif defined(NC) || defined(TC)
#define FADD_I fsubd
#define FMAC_R1 fmuld
- #define FMAC_R2 fnmacd
+ #define FMAC_R2 vmls.f64 // unified-syntax spelling of the replaced fnmacd
#define FMAC_I1 fmuld
#define FMAC_I2 fmacd
#define FADD_R fsubd
#define FADD_I faddd
- #define FMAC_R1 fnmuld
+ #define FMAC_R1 vnmul.f64 // unified-syntax spelling of the replaced fnmuld
#define FMAC_R2 fmacd
- #define FMAC_I1 fnmuld
- #define FMAC_I2 fnmacd
+ #define FMAC_I1 vnmul.f64 // unified-syntax spelling of the replaced fnmuld
+ #define FMAC_I2 vmls.f64 // unified-syntax spelling of the replaced fnmacd
#endif
.macro INIT2x2
- vsub.f64 d16 , d16 , d16
+ fldd d16 , FP_ZERO // d16 = contents of the zeroed FP_ZERO slot; unlike d16 - d16, independent of d16's old value (x - x is NaN for NaN/Inf)
vmov.f64 d17, d16
vmov.f64 d18, d16
vmov.f64 d19, d16
.macro INIT1x2
- vsub.f64 d16 , d16 , d16
+ fldd d16 , FP_ZERO // d16 = contents of the zeroed FP_ZERO slot; unlike d16 - d16, independent of d16's old value (x - x is NaN for NaN/Inf)
vmov.f64 d17, d16
vmov.f64 d20, d16
vmov.f64 d21, d16
.macro INIT2x1
- vsub.f64 d16 , d16 , d16
+ fldd d16 , FP_ZERO // d16 = contents of the zeroed FP_ZERO slot; unlike d16 - d16, independent of d16's old value (x - x is NaN for NaN/Inf)
vmov.f64 d17, d16
vmov.f64 d18, d16
vmov.f64 d19, d16
.macro INIT1x1
- vsub.f64 d16 , d16 , d16
+ fldd d16 , FP_ZERO // d16 = contents of the zeroed FP_ZERO slot; unlike d16 - d16, independent of d16's old value (x - x is NaN for NaN/Inf)
vmov.f64 d17, d16
vmov.f64 d24, d16
vmov.f64 d25, d16
add fp, sp, #24
sub sp, sp, #STACKSIZE // reserve stack
+#if !defined(__ARM_PCS_VFP) // without the hard-float calling convention, alpha and A are read from the stack
+ vldr OLD_ALPHA_R, OLD_ALPHAR_SOFTFP // load from [fp, #4]
+ vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP // load from [fp, #12]
+ ldr OLD_A, OLD_A_SOFTFP // load from [fp, #20]
+#endif
str OLD_M, M
str OLD_N, N
str OLD_K, K
sub r3, fp, #128
vstm r3, { d8 - d15} // store floating point registers
+ movs r4, #0 // r4 = 0 (the S suffix also updates the condition flags)
+ str r4, FP_ZERO // zero the first word of the FP_ZERO slot
+ str r4, FP_ZERO_1 // zero the second word, so the 8-byte slot reads back as double 0.0
+
ldr r3, OLD_LDC
lsl r3, r3, #4 // ldc = ldc * 8 * 2
str r3, LDC
ble _L1_BEGIN
_L2_BEGIN:
-
+
ldr CO1, C // CO1 = C
ldr r4 , LDC
lsl r4 , r4 , #1 // LDC * 2
b _L2_M2_44
-
+
_L2_M2_30:
tst L, #3
ble _L2_M2_40
subs L, L, #1
bne _L2_M2_46
-
+
_L2_M2_100:
SAVE2x2
subs L, L, #1
bgt _L2_M1_22
-
+
_L2_M1_40:
-
+
ands L , K1, #7 // L = L % 8
ble _L2_M1_100
subs L, L, #1
bgt _L2_M1_42
-
+
_L2_M1_100:
SAVE1x2
lsl r4, r4, #5 // k * 2 * 8 * 2
add r3, r3, r4 // B = B + K * 4 * 8
mov BC, r3
-
+
#if !defined(LEFT)
ldr r3 , KK
add r3 , r3 , #2 // number of values in BO
tst J , #1
ble _L999
-
+
ldr CO1, C // CO1 = C
ldr r4 , LDC
add r3 , r4, CO1
b _L1_M2_44
-
+
_L1_M2_30:
tst L, #3
ble _L1_M2_40
subs L, L, #1
bne _L1_M2_46
-
+
_L1_M2_100:
SAVE2x1
subs L, L, #1
bgt _L1_M1_22
-
+
_L1_M1_40:
-
+
ands L , K1, #7 // L = L % 8
ble _L1_M1_100
subs L, L, #1
bgt _L1_M1_42
-
+
_L1_M1_100:
SAVE1x1