#define STACKSIZE 256
-#define OLD_LDA [fp, #0 ]
-#define X [fp, #4 ]
-#define OLD_INC_X [fp, #8 ]
-#define Y [fp, #12 ]
-#define OLD_INC_Y [fp, #16 ]
// fp-relative locations of the stack-passed arguments. The layout is chosen by
// __ARM_PCS_VFP (the ACLE macro for the VFP / hard-float calling convention).
// Without it, OLD_ALPHAR is taken from r3, OLD_ALPHAI and OLD_A_SOFTFP occupy
// [fp, #0] and [fp, #4], and OLD_LDA .. OLD_INC_Y each sit 8 bytes higher than
// in the __ARM_PCS_VFP layout.
+#if !defined(__ARM_PCS_VFP)
+#define OLD_ALPHAR r3
+#define OLD_ALPHAI [fp, #0 ]
+#define OLD_A_SOFTFP [fp, #4 ]
+#define OLD_LDA [fp, #8 ]
+#define X [fp, #12 ]
+#define OLD_INC_X [fp, #16 ]
+#define Y [fp, #20 ]
+#define OLD_INC_Y [fp, #24 ]
+#else
// __ARM_PCS_VFP layout: same offsets as the definitions removed above.
+#define OLD_LDA [fp, #0 ]
+#define X [fp, #4 ]
+#define OLD_INC_X [fp, #8 ]
+#define Y [fp, #12 ]
+#define OLD_INC_Y [fp, #16 ]
+#endif
+
// Register aliases. OLD_A uses r3, the same register as OLD_ALPHAR in the
// non-__ARM_PCS_VFP build, so that build must consume OLD_ALPHAR before
// OLD_A is loaded.
#define OLD_A r3
#define OLD_N r1
#define I r12
// fp-relative words that the prologue fills with integer 0. FP_ZERO and
// FP_ZERO_0 name the same word; FP_ZERO_1 is the word 4 bytes above it.
// The INIT_* macros load FP_ZERO to clear their accumulators.
+#define FP_ZERO [fp, #-228]
+#define FP_ZERO_0 [fp, #-228]
+#define FP_ZERO_1 [fp, #-224]
+
// fp-relative slots where the prologue saves OLD_N and OLD_A.
#define N [fp, #-252 ]
#define A [fp, #-256 ]
// Per-variant choice of accumulate instruction for the KMAC_* / FMAC_* steps,
// keyed on CONJ and XCONJ. Each variant makes exactly two of the six steps
// subtract the product (vmls.f32) while the others add it (fmacs).
// vmls.f32 is the UAL mnemonic for the operation performed by the removed
// pre-UAL fnmacs (Sd = Sd - Sn * Sm).
#if !defined(CONJ) && !defined(XCONJ)
// Neither CONJ nor XCONJ: KMAC_R and FMAC_R2 subtract.
- #define KMAC_R fnmacs
+ #define KMAC_R vmls.f32
 #define KMAC_I fmacs
 #define FMAC_R1 fmacs
- #define FMAC_R2 fnmacs
+ #define FMAC_R2 vmls.f32
 #define FMAC_I1 fmacs
 #define FMAC_I2 fmacs
#elif defined(CONJ) && !defined(XCONJ)
// CONJ only: KMAC_I and FMAC_R2 subtract.
 #define KMAC_R fmacs
- #define KMAC_I fnmacs
+ #define KMAC_I vmls.f32
 #define FMAC_R1 fmacs
- #define FMAC_R2 fnmacs
+ #define FMAC_R2 vmls.f32
 #define FMAC_I1 fmacs
 #define FMAC_I2 fmacs
#elif !defined(CONJ) && defined(XCONJ)
// XCONJ only: KMAC_I and FMAC_I1 subtract.
 #define KMAC_R fmacs
- #define KMAC_I fnmacs
+ #define KMAC_I vmls.f32
 #define FMAC_R1 fmacs
 #define FMAC_R2 fmacs
- #define FMAC_I1 fnmacs
+ #define FMAC_I1 vmls.f32
 #define FMAC_I2 fmacs
#else
// Remaining case (CONJ and XCONJ both defined): KMAC_R and FMAC_I1 subtract.
- #define KMAC_R fnmacs
+ #define KMAC_R vmls.f32
 #define KMAC_I fmacs
 #define FMAC_R1 fmacs
 #define FMAC_R2 fmacs
- #define FMAC_I1 fnmacs
+ #define FMAC_I1 vmls.f32
 #define FMAC_I2 fmacs
#endif
// INIT_F2: set s12-s15 to the value stored at FP_ZERO (the prologue writes
// integer 0 there, i.e. the bit pattern of +0.0f).
// Loading from memory, instead of the removed "vsub.f32 sN, sN, sN", gives a
// result that does not depend on the registers' previous contents
// (x - x is NaN when x is NaN or infinity).
// Clobbers: s12, s13, s14, s15.
.macro INIT_F2
- vsub.f32 s12, s12, s12
- vsub.f32 s13, s13, s13
- vsub.f32 s14, s14, s14
- vsub.f32 s15, s15, s15
+ flds s12, FP_ZERO
+ vmov.f32 s13, s12
+ vmov.f32 s14, s12
+ vmov.f32 s15, s12
.endm
// INIT_F1: set s12 and s13 to the value stored at FP_ZERO (the prologue
// writes integer 0 there, i.e. the bit pattern of +0.0f).
// Unlike the removed "vsub.f32 sN, sN, sN", the result does not depend on the
// registers' previous contents.
// Clobbers: s12, s13.
.macro INIT_F1
- vsub.f32 s12, s12, s12
- vsub.f32 s13, s13, s13
+ flds s12, FP_ZERO
+ vmov.f32 s13, s12
.endm
// INIT_S2: set s12-s15 to the value stored at FP_ZERO (the prologue writes
// integer 0 there, i.e. the bit pattern of +0.0f).
// The body is identical to INIT_F2. Unlike the removed
// "vsub.f32 sN, sN, sN", the result does not depend on the registers'
// previous contents.
// Clobbers: s12, s13, s14, s15.
.macro INIT_S2
- vsub.f32 s12, s12, s12
- vsub.f32 s13, s13, s13
- vsub.f32 s14, s14, s14
- vsub.f32 s15, s15, s15
+ flds s12, FP_ZERO
+ vmov.f32 s13, s12
+ vmov.f32 s14, s12
+ vmov.f32 s15, s12
.endm
// INIT_S1: set s12 and s13 to the value stored at FP_ZERO (the prologue
// writes integer 0 there, i.e. the bit pattern of +0.0f).
// The body is identical to INIT_F1. Unlike the removed
// "vsub.f32 sN, sN, sN", the result does not depend on the registers'
// previous contents.
// Clobbers: s12, s13.
.macro INIT_S1
- vsub.f32 s12, s12, s12
- vsub.f32 s13, s13, s13
+ flds s12, FP_ZERO
+ vmov.f32 s13, s12
.endm
vstm r12, { s8 - s15 } // store floating point registers
#endif
// Fill the FP_ZERO and FP_ZERO_1 stack words with integer 0 so the INIT_*
// macros can load a zero from FP_ZERO. r12 (also aliased as I) is used as
// scratch; the flags set by movs are overwritten by the cmp below.
+ movs r12, #0
+ str r12, FP_ZERO
+ str r12, FP_ZERO_1
+
// Branch to cgemvt_kernel_L999 when M <= 0 or OLD_N <= 0 (signed compares).
 cmp M, #0
 ble cgemvt_kernel_L999
 cmp OLD_N, #0
 ble cgemvt_kernel_L999
// Non-__ARM_PCS_VFP build: copy OLD_ALPHAR (r3) into s0, load OLD_ALPHAI from
// the stack into s1, then load the stack-passed OLD_A_SOFTFP into OLD_A.
// OLD_A is r3 as well, so the vmov must come before the ldr.
// NOTE(review): presumably these are the real/imaginary parts of alpha and
// the matrix pointer, going by the names -- confirm against the caller.
+#if !defined(__ARM_PCS_VFP)
+ vmov s0, OLD_ALPHAR
+ vldr s1, OLD_ALPHAI
+ ldr OLD_A, OLD_A_SOFTFP
+#endif
+
// Save OLD_A and OLD_N to their fp-relative slots A and N.
 str OLD_A, A
 str OLD_N, N