#define N [fp, #-260 ] // frame slot at fp-260; the prologue stores OLD_N here
#define K [fp, #-264 ] // frame slot at fp-264; the prologue stores OLD_K here
+#define FP_ZERO [fp, #-236] // 8-byte slot zero-filled by the prologue; the INIT* macros load d16 from it with fldd
+#define FP_ZERO_0 [fp, #-236] // first 32-bit word of that slot (same address as FP_ZERO)
+#define FP_ZERO_1 [fp, #-232] // second 32-bit word of that slot, 4 bytes above FP_ZERO_0
+
#define ALPHA_I [fp, #-272] // frame slot at fp-272 -- NOTE(review): presumably holds the imaginary part of alpha; its use is not visible in this chunk
#define ALPHA_R [fp, #-280] // frame slot at fp-280 -- NOTE(review): presumably holds the real part of alpha; its use is not visible in this chunk
+#if !defined(__ARM_PCS_VFP) // layout used when the hard-float calling-convention macro is absent
+#define OLD_ALPHAR_SOFTFP [fp, #4] // the prologue loads OLD_ALPHA_R from here with vldr
+#define OLD_ALPHAI_SOFTFP [fp, #12] // the prologue loads OLD_ALPHA_I from here with vldr (8 bytes above the previous slot)
+#define OLD_A_SOFTFP [fp, #20 ] // the prologue loads OLD_A from here with ldr
+#define B [fp, #24 ] // in this branch B, C, OLD_LDC and OFFSET follow the three slots above
+#define C [fp, #28 ]
+#define OLD_LDC [fp, #32 ] // read by the prologue and scaled before being stored to LDC
+#define OFFSET [fp, #36 ]
+#else // __ARM_PCS_VFP defined: the same four names sit 20 bytes lower, starting at fp+4
#define B [fp, #4 ]
#define C [fp, #8 ]
#define OLD_LDC [fp, #12 ] // read by the prologue and scaled before being stored to LDC
#define OFFSET [fp, #16 ]
+#endif // !defined(__ARM_PCS_VFP)
#define I r0
#define J r1
#define FADD_R fsubd
#define FADD_I faddd
- #define FMAC_R1 fnmuld
- #define FMAC_R2 fnmacd
+ #define FMAC_R1 vnmul.f64
+ #define FMAC_R2 vmls.f64
#define FMAC_I1 fmuld
- #define FMAC_I2 fnmacd
+ #define FMAC_I2 vmls.f64
#elif defined(CN) || defined(CT)
#define FMAC_R1 fmuld
#define FMAC_R2 fmacd
- #define FMAC_I1 fnmuld
+ #define FMAC_I1 vnmul.f64
#define FMAC_I2 fmacd
#elif defined(NC) || defined(TC)
#define FADD_I fsubd
#define FMAC_R1 fmuld
- #define FMAC_R2 fnmacd
+ #define FMAC_R2 vmls.f64
#define FMAC_I1 fmuld
#define FMAC_I2 fmacd
#define FADD_R fsubd
#define FADD_I faddd
- #define FMAC_R1 fnmuld
+ #define FMAC_R1 vnmul.f64
#define FMAC_R2 fmacd
- #define FMAC_I1 fnmuld
- #define FMAC_I2 fnmacd
+ #define FMAC_I1 vnmul.f64
+ #define FMAC_I2 vmls.f64
#endif
.macro INIT2x2
- vldr.f64 d16 , =0
+ fldd d16 , FP_ZERO
vmov.f64 d17, d16
vmov.f64 d18, d16
vmov.f64 d19, d16
.macro INIT1x2
- vldr.f64 d16 , =0
+ fldd d16 , FP_ZERO
vmov.f64 d17, d16
vmov.f64 d20, d16
vmov.f64 d21, d16
.macro INIT2x1
- vldr.f64 d16 , =0
+ fldd d16 , FP_ZERO
vmov.f64 d17, d16
vmov.f64 d18, d16
vmov.f64 d19, d16
.macro INIT1x1
- vldr.f64 d16 , =0
+ fldd d16 , FP_ZERO
vmov.f64 d17, d16
vmov.f64 d24, d16
vmov.f64 d25, d16
add fp, sp, #24 // fp = sp + 24; every named slot in this file is addressed relative to fp
sub sp, sp, #STACKSIZE // reserve stack
+#if !defined(__ARM_PCS_VFP) // without the hard-float ABI macro, fetch alpha and A from fp-relative slots
+ vldr OLD_ALPHA_R, OLD_ALPHAR_SOFTFP // OLD_ALPHA_R <- [fp, #4]
+ vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP // OLD_ALPHA_I <- [fp, #12]
+ ldr OLD_A, OLD_A_SOFTFP // OLD_A <- [fp, #20]
+#endif // !defined(__ARM_PCS_VFP)
str OLD_M, M // copy the incoming M, N, K values into their frame slots
str OLD_N, N // N is [fp, #-260]
str OLD_K, K // K is [fp, #-264]
sub r3, fp, #128 // r3 = fp - 128: base address of the d8-d15 save area
vstm r3, { d8 - d15} // store floating point registers
+ movs r4, #0 // r4 = 0 (movs also updates the condition flags)
+ str r4, FP_ZERO // zero the word at [fp, #-236]
+ str r4, FP_ZERO_1 // zero the word at [fp, #-232]; the 8 zero bytes read back as 0.0 via `fldd d16, FP_ZERO` in the INIT* macros
+
ldr r3, OLD_LDC // r3 = value in the OLD_LDC slot
lsl r3, r3, #4 // ldc = ldc * 8 * 2
str r3, LDC // store the scaled value (r3 << 4) into the LDC slot