#define N [fp, #-260 ]
#define K [fp, #-264 ]
+#define FP_ZERO [fp, #-240] // frame slot the INIT* macros load their starting accumulator value from
+#define FP_ZERO_0 [fp, # -240] // same address as FP_ZERO: first 32-bit word of the slot
+#define FP_ZERO_1 [fp, # -236] // second 32-bit word of the slot, 4 bytes after FP_ZERO_0
+
#define ALPHA_I [fp, #-272]
#define ALPHA_R [fp, #-280]
.macro INIT2x2
- vsub.f32 s8 , s8 , s8
+ flds s8 , FP_ZERO
vmov.f32 s9 , s8
vmov.f32 s10, s8
vmov.f32 s11, s8
.macro INIT1x2
- vsub.f32 s8 , s8 , s8
+ flds s8 , FP_ZERO
vmov.f32 s9 , s8
vmov.f32 s12, s8
vmov.f32 s13, s8
.macro INIT2x1
- vsub.f32 s8 , s8 , s8
+ flds s8 , FP_ZERO
vmov.f32 s9 , s8
vmov.f32 s10, s8
vmov.f32 s11, s8
.macro INIT1x1
- vsub.f32 s8 , s8 , s8
+ flds s8 , FP_ZERO // s8 = 32-bit value at FP_ZERO; unlike s8 - s8 this cannot yield NaN when s8 previously held NaN or Inf
vmov.f32 s9 , s8
.endm
sub r3, fp, #128
vstm r3, { s8 - s15} // store floating point registers
+ movs r4, #0 // r4 = 0; an all-zero word is +0.0 as a single-precision float. NOTE(review): overwrites r4 - confirm it is saved earlier and not live here
+ str r4, FP_ZERO // zero the word at fp-240, the one the flds in the INIT* macros reads
+ str r4, FP_ZERO_1 // zero the following word at fp-236 as well; the flds loads visible in this patch only read the first word
+
ldr r3, OLD_LDC
lsl r3, r3, #3 // ldc = ldc * 4 * 2
str r3, LDC
#define K [fp, #-264 ]
#define A [fp, #-268 ]
+#define FP_ZERO [fp, #-240] // 8-byte frame slot the INIT* macros load with fldd to get their starting value
+#define FP_ZERO_0 [fp, # -240] // same address as FP_ZERO: first 32-bit word of the slot
+#define FP_ZERO_1 [fp, # -236] // second 32-bit word of the slot, 4 bytes after FP_ZERO_0
+
#define ALPHA [fp, #-280]
+
#define B [fp, #4 ]
#define C [fp, #8 ]
#define OLD_LDC [fp, #12 ]
.macro INIT4x2
- vsub.f64 d8 , d8 , d8
+ fldd d8, FP_ZERO
vmov.f64 d9, d8
vmov.f64 d10, d8
vmov.f64 d11, d8
.macro INIT2x2
- vsub.f64 d8 , d8 , d8
+ fldd d8, FP_ZERO
vmov.f64 d9, d8
vmov.f64 d12, d8
vmov.f64 d13, d8
.macro INIT1x2
- vsub.f64 d8 , d8 , d8
+ fldd d8, FP_ZERO // d8 = 64-bit value at FP_ZERO; unlike d8 - d8 this cannot yield NaN when d8 previously held NaN or Inf
vmov.f64 d12, d8
.endm
.macro INIT4x1
- vsub.f64 d8 , d8 , d8
+ fldd d8, FP_ZERO
vmov.f64 d9, d8
vmov.f64 d10, d8
vmov.f64 d11, d8
.macro INIT2x1
- vsub.f64 d8 , d8 , d8
+ fldd d8, FP_ZERO // d8 = 64-bit value at FP_ZERO, independent of whatever d8 held before
vmov.f64 d9 , d8
.endm
.macro INIT1x1
- vsub.f64 d8 , d8 , d8
+ fldd d8, FP_ZERO // d8 = 64-bit value at FP_ZERO, independent of whatever d8 held before
.endm
sub r3, fp, #128
vstm r3, { d8 - d15} // store floating point registers
+ movs r4, #0 // r4 = 0. NOTE(review): overwrites r4 - confirm it is saved earlier and not live here
+ str r4, FP_ZERO // zero the first word of the 8-byte slot at fp-240
+ str r4, FP_ZERO_1 // zero the second word at fp-236; eight zero bytes read back as +0.0 through fldd
+
ldr r3, OLD_LDC
lsl r3, r3, #3 // ldc = ldc * 8
str r3, LDC
#define K [fp, #-264 ]
#define A [fp, #-268 ]
+#define FP_ZERO [fp, #-240] // frame slot the INIT* macros load with flds to get their starting value
+#define FP_ZERO_0 [fp, # -240] // same address as FP_ZERO: first 32-bit word of the slot
+#define FP_ZERO_1 [fp, # -236] // second 32-bit word of the slot, 4 bytes after FP_ZERO_0
+
#define ALPHA [fp, #-280]
#define B [fp, #4 ]
.macro INIT4x2
- vsub.f32 s8 , s8 , s8
+ flds s8, FP_ZERO
vmov.f32 s9, s8
vmov.f32 s10, s8
vmov.f32 s11, s8
.macro INIT2x2
- vsub.f32 s8 , s8 , s8
+ flds s8, FP_ZERO
vmov.f32 s9, s8
vmov.f32 s12, s8
vmov.f32 s13, s8
.macro INIT1x2
- vsub.f32 s8 , s8 , s8
+ flds s8, FP_ZERO // s8 = 32-bit value at FP_ZERO; unlike s8 - s8 this cannot yield NaN when s8 previously held NaN or Inf
vmov.f32 s12, s8
.endm
.macro INIT4x1
- vsub.f32 s8 , s8 , s8
+ flds s8, FP_ZERO
vmov.f32 s9, s8
vmov.f32 s10, s8
vmov.f32 s11, s8
.macro INIT2x1
- vsub.f32 s8 , s8 , s8
+ flds s8, FP_ZERO // s8 = 32-bit value at FP_ZERO, independent of whatever s8 held before
vmov.f32 s9 , s8
.endm
.macro INIT1x1
- vsub.f32 s8 , s8 , s8
+ flds s8, FP_ZERO // s8 = 32-bit value at FP_ZERO, independent of whatever s8 held before
.endm
sub r3, fp, #128
vstm r3, { s8 - s15} // store floating point registers
+ movs r4, #0 // r4 = 0; an all-zero word is +0.0 as a single-precision float. NOTE(review): overwrites r4 - confirm it is saved earlier and not live here
+ str r4, FP_ZERO // zero the word at fp-240, the one the flds in the INIT* macros reads
+ str r4, FP_ZERO_1 // zero the following word at fp-236 as well; the flds loads visible in this patch only read the first word
+
ldr r3, OLD_LDC
lsl r3, r3, #2 // ldc = ldc * 4
str r3, LDC
#define N [fp, #-260 ]
#define K [fp, #-264 ]
+#define FP_ZERO [fp, #-240] // 8-byte frame slot the INIT* macros load with fldd to get their starting value
+#define FP_ZERO_0 [fp, # -240] // same address as FP_ZERO: first 32-bit word of the slot
+#define FP_ZERO_1 [fp, # -236] // second 32-bit word of the slot, 4 bytes after FP_ZERO_0
+
#define ALPHA_I [fp, #-272]
#define ALPHA_R [fp, #-280]
.macro INIT2x2
- vsub.f64 d8 , d8 , d8
+ fldd d8 , FP_ZERO
vmov.f64 d9 , d8
vmov.f64 d10, d8
vmov.f64 d11, d8
.macro INIT1x2
- vsub.f64 d8 , d8 , d8
+ fldd d8 , FP_ZERO
vmov.f64 d9 , d8
vmov.f64 d12, d8
vmov.f64 d13, d8
.macro INIT2x1
- vsub.f64 d8 , d8 , d8
+ fldd d8 , FP_ZERO
vmov.f64 d9 , d8
vmov.f64 d10, d8
vmov.f64 d11, d8
.macro INIT1x1
- vsub.f64 d8 , d8 , d8
+ fldd d8 , FP_ZERO // d8 = 64-bit value at FP_ZERO; unlike d8 - d8 this cannot yield NaN when d8 previously held NaN or Inf
vmov.f64 d9 , d8
.endm
sub r3, fp, #128
vstm r3, { d8 - d15} // store floating point registers
+ movs r4, #0 // r4 = 0. NOTE(review): overwrites r4 - confirm it is saved earlier and not live here
+ str r4, FP_ZERO // zero the first word of the 8-byte slot at fp-240
+ str r4, FP_ZERO_1 // zero the second word at fp-236; eight zero bytes read back as +0.0 through fldd
+
ldr r3, OLD_LDC
lsl r3, r3, #4 // ldc = ldc * 8 * 2
str r3, LDC