From 59d2cfa644789a360a2819bbe7891b37e4bfcb43 Mon Sep 17 00:00:00 2001
From: yang <yang.zhang@arm.com>
Date: Fri, 20 Jul 2012 11:50:26 +0800
Subject: [PATCH] fix bugs in sub: drop look-ahead load of the next set in NEON loops

---
 source/NE10_sub.neon.s | 245 +++++++++++++++++++------------------------------
 1 file changed, 92 insertions(+), 153 deletions(-)

diff --git a/source/NE10_sub.neon.s b/source/NE10_sub.neon.s
index a29a342..6bd660e 100644
--- a/source/NE10_sub.neon.s
+++ b/source/NE10_sub.neon.s
@@ -26,7 +26,7 @@
 
 
 
-        .balign   4
+        .align   4
         .global   sub_float_neon
         .thumb
         .thumb_func
@@ -56,42 +56,32 @@ sub_float_neon:
         cbz               r3, .L_check_float
 
         @ load the 1st set of values
-          vld1.32         {q0}, [r1]!
-          vld1.32         {q1}, [r2]!
-          subs            r3, r3, #8          @ 4 for this set, and 4 for the 2nd set
+        vld1.32         {q0}, [r1]!
+        vld1.32         {q1}, [r2]!
+        subs            r3, r3, #4          @ 4 for this set
 
         @ calculate values for the 1st set
-          vsub.f32        q3, q0, q1         @ q3 = q0 - q1
+        vsub.f32        q3, q0, q1         @ q3 = q0 - q1
 
-        @ load the 2nd set of values
-          vld1.32         {q0}, [r1]!
-          vld1.32         {q1}, [r2]!
-
-          ble             .L_mainloopend_float
+        ble             .L_mainloopend_float
 
 .L_mainloop_float:
-        @ store the result for the 1st/next (e.g. 3rd) set
-          vst1.32         {d6,d7}, [r0]!
+        @ store the result for the current set
+        vst1.32         {d6,d7}, [r0]!
 
-        @ calculate values for the 2nd/next (e.g. 3rd) set
-          vsub.f32        q3, q0, q1         @ q3 = q0 - q1
+        @ load the next set of values
+        vld1.32         {q0}, [r1]!
+        vld1.32         {q1}, [r2]!
+        subs            r3, r3, #4
 
-       @ load the next (e.g. 3rd) set of values
-          vld1.32         {q0}, [r1]!
-          vld1.32         {q1}, [r2]!
-          subs            r3, r3, #4
+        @ calculate values for the next set
+        vsub.f32        q3, q0, q1         @ q3 = q0 - q1
 
-        bgt             .L_mainloop_float             @ loop if r2 is > r3, if we have at least another 4 floats
+        bgt             .L_mainloop_float             @ loop while r3 > 0, i.e. while we have at least another 4 floats to process
 
 .L_mainloopend_float:
         @ the last iteration for this call
-        @ store the result for the set of values before the last one (e.g 2nd set)
-          vst1.32         {d6,d7}, [r0]!
-
-        @ calculate values for the last (e.g. 3rd) set
-          vsub.f32        q3, q0, q1         @ q3 = q0 - q1
-
-        @ store the result for the last (e.g. 3rd) set
+        @ store the result for the last set
           vst1.32         {d6,d7}, [r0]!
 
 
@@ -124,7 +114,7 @@ sub_float_neon:
 
 
 
-        .balign   4
+        .align   4
         .global   sub_vec2f_neon
         .thumb
         .thumb_func
@@ -154,45 +144,34 @@ sub_vec2f_neon:
         cbz               r3, .L_check_vec2
 
         @ load the 1st set of values
-          vld2.32         {q0-q1}, [r1]!
-          vld2.32         {q2-q3}, [r2]!
-          subs            r3, r3, #8          @ 4 for this set, and 4 for the 2nd set
+        vld2.32         {q0-q1}, [r1]!
+        vld2.32         {q2-q3}, [r2]!
+        subs            r3, r3, #4          @ 4 for this set
 
         @ calculate values for the 1st set
-          vsub.f32        q8, q0, q2
-          vsub.f32        q9, q1, q3
-
-        @ load the 2nd set of values
-          vld2.32         {q0-q1}, [r1]!
-          vld2.32         {q2-q3}, [r2]!
+        vsub.f32        q8, q0, q2
+        vsub.f32        q9, q1, q3
 
-          ble             .L_mainloopend_vec2
+        ble             .L_mainloopend_vec2
 
 .L_mainloop_vec2:
-        @ store the result for the 1st/next (e.g. 3rd) set
-          vst2.32         {d16,d17,d18,d19}, [r0]!
+        @ store the result for the current set
+        vst2.32         {d16,d17,d18,d19}, [r0]!
 
-        @ calculate values for the 2nd/next (e.g. 3rd) set
-          vsub.f32        q8, q0, q2
-          vsub.f32        q9, q1, q3
+        @ load the next set of values
+        vld2.32         {q0-q1}, [r1]!
+        vld2.32         {q2-q3}, [r2]!
+        subs            r3, r3, #4
 
-       @ load the next (e.g. 3rd) set of values
-          vld2.32         {q0-q1}, [r1]!
-          vld2.32         {q2-q3}, [r2]!
-          subs            r3, r3, #4
+        @ calculate values for the next set
+        vsub.f32        q8, q0, q2
+        vsub.f32        q9, q1, q3
 
-        bgt             .L_mainloop_vec2             @ loop if r2 is > r3, if we have at least another 4 vectors (8 floats) to process
+        bgt             .L_mainloop_vec2             @ loop while r3 > 0, i.e. while we have at least another 4 vectors (8 floats) to process
 
 .L_mainloopend_vec2:
         @ the last iteration for this call
-        @ store the result for the set of values before the last one (e.g 2nd set)
-          vst2.32         {d16,d17,d18,d19}, [r0]!
-
-        @ calculate values for the last (e.g. 3rd) set
-          vsub.f32        q8, q0, q2
-          vsub.f32        q9, q1, q3
-
-        @ store the result for the last (e.g. 3rd) set
+        @ store the result for the last set
           vst2.32         {d16,d17,d18,d19}, [r0]!
 
 .L_check_vec2:
@@ -223,7 +202,7 @@ sub_vec2f_neon:
 
 
 
-        .align  2
+        .align  4
         .global sub_vec3f_neon
         .thumb
         .thumb_func
@@ -253,56 +232,41 @@ sub_vec3f_neon:
         beq               .L_check_vec3
 
         @ load the 1st set of values
-          vld3.32         {d0, d2, d4}, [r1]!
-          vld3.32         {d1, d3, d5}, [r1]!
-          vld3.32         {d18, d20, d22}, [r2]!
-          vld3.32         {d19, d21, d23}, [r2]!
-          subs            r3, r3, #8          @ 4 for this set, and 4 for the 2nd set
+        vld3.32         {d0, d2, d4}, [r1]!
+        vld3.32         {d1, d3, d5}, [r1]!
+        vld3.32         {d18, d20, d22}, [r2]!
+        vld3.32         {d19, d21, d23}, [r2]!
+        subs            r3, r3, #4          @ 4 for this set
 
         @ calculate values for the 1st set
-          vsub.f32        q12, q0, q9
-          vsub.f32        q13, q1, q10
-          vsub.f32        q14, q2, q11
-
-        @ load the 2nd set of values
-          vld3.32         {d0, d2, d4}, [r1]!
-          vld3.32         {d1, d3, d5}, [r1]!
-          vld3.32         {d18, d20, d22}, [r2]!
-          vld3.32         {d19, d21, d23}, [r2]!
+        vsub.f32        q12, q0, q9
+        vsub.f32        q13, q1, q10
+        vsub.f32        q14, q2, q11
 
-          ble             .L_mainloopend_vec3
+        ble             .L_mainloopend_vec3
 
 .L_mainloop_vec3:
-        @ store the result for the 1st/next (e.g. 3rd) set
-          vst3.32         {d24, d26, d28}, [r0]!
-          vst3.32         {d25, d27, d29}, [r0]!
+        @ store the result for the current set
+        vst3.32         {d24, d26, d28}, [r0]!
+        vst3.32         {d25, d27, d29}, [r0]!
 
-        @ calculate values for the 2nd/next (e.g. 3rd) set
-          vsub.f32        q12, q0, q9
-          vsub.f32        q13, q1, q10
-          vsub.f32        q14, q2, q11
+        @ load the next set of values
+        vld3.32         {d0, d2, d4}, [r1]!
+        vld3.32         {d1, d3, d5}, [r1]!
+        vld3.32         {d18, d20, d22}, [r2]!
+        vld3.32         {d19, d21, d23}, [r2]!
+        subs            r3, r3, #4
 
-       @ load the next (e.g. 3rd) set of values
-          vld3.32         {d0, d2, d4}, [r1]!
-          vld3.32         {d1, d3, d5}, [r1]!
-          vld3.32         {d18, d20, d22}, [r2]!
-          vld3.32         {d19, d21, d23}, [r2]!
-          subs            r3, r3, #4
+        @ calculate values for the next set
+        vsub.f32        q12, q0, q9
+        vsub.f32        q13, q1, q10
+        vsub.f32        q14, q2, q11
 
-        bgt               .L_mainloop_vec3             @ loop if r2 is > r3, if we have at least another 4 vectors (12 floats) to process
+        bgt               .L_mainloop_vec3             @ loop while r3 > 0, i.e. while we have at least another 4 vectors (12 floats) to process
 
 .L_mainloopend_vec3:
         @ the last iteration for this call
-        @ store the result for the set of values before the last one (e.g 2nd set)
-          vst3.32         {d24, d26, d28}, [r0]!
-          vst3.32         {d25, d27, d29}, [r0]!
-
-        @ calculate values for the last (e.g. 3rd) set
-          vsub.f32        q12, q0, q9
-          vsub.f32        q13, q1, q10
-          vsub.f32        q14, q2, q11
-
-        @ store the result for the last (e.g. 3rd) set
+        @ store the result for the last set
           vst3.32         {d24, d26, d28}, [r0]!
           vst3.32         {d25, d27, d29}, [r0]!
 
@@ -342,7 +306,7 @@ sub_vec3f_neon:
 
 
 
-        .align  2
+        .align  4
         .global sub_vec4f_neon
         .thumb
         .thumb_func
@@ -372,60 +336,44 @@ sub_vec4f_neon:
         beq               .L_check_vec4
 
         @ load the 1st set of values
-          vld4.32         {d0, d2, d4, d6}, [r1]!
-          vld4.32         {d1, d3, d5, d7}, [r1]!
-          vld4.32         {d16, d18, d20, d22}, [r2]!
-          vld4.32         {d17, d19, d21, d23}, [r2]!
+        vld4.32         {d0, d2, d4, d6}, [r1]!
+        vld4.32         {d1, d3, d5, d7}, [r1]!
+        vld4.32         {d16, d18, d20, d22}, [r2]!
+        vld4.32         {d17, d19, d21, d23}, [r2]!
 
-          subs            r3, r3, #8          @ 4 for this set, and 4 for the 2nd set
+        subs            r3, r3, #4          @ 4 for this set
 
         @ calculate values for the 1st set
-          vsub.f32        q12, q0, q8
-          vsub.f32        q13, q1, q9
-          vsub.f32        q14, q2, q10
-          vsub.f32        q15, q3, q11
-
-        @ load the 2nd set of values
-          vld4.32         {d0, d2, d4, d6}, [r1]!
-          vld4.32         {d1, d3, d5, d7}, [r1]!
-          vld4.32         {d16, d18, d20, d22}, [r2]!
-          vld4.32         {d17, d19, d21, d23}, [r2]!
+        vsub.f32        q12, q0, q8
+        vsub.f32        q13, q1, q9
+        vsub.f32        q14, q2, q10
+        vsub.f32        q15, q3, q11
 
-          ble             .L_mainloopend_vec4
+        ble             .L_mainloopend_vec4
 
 .L_mainloop_vec4:
-        @ store the result for the 1st/next (e.g. 3rd) set
-          vst4.32         {d24, d26, d28, d30}, [r0]!
-          vst4.32         {d25, d27, d29, d31}, [r0]!
+        @ store the result for the current set
+        vst4.32         {d24, d26, d28, d30}, [r0]!
+        vst4.32         {d25, d27, d29, d31}, [r0]!
 
-        @ calculate values for the 2nd/next (e.g. 3rd) set
-          vsub.f32        q12, q0, q8
-          vsub.f32        q13, q1, q9
-          vsub.f32        q14, q2, q10
-          vsub.f32        q15, q3, q11
+        @ load the next set of values
+        vld4.32         {d0, d2, d4, d6}, [r1]!
+        vld4.32         {d1, d3, d5, d7}, [r1]!
+        vld4.32         {d16, d18, d20, d22}, [r2]!
+        vld4.32         {d17, d19, d21, d23}, [r2]!
+        subs            r3, r3, #4
 
-       @ load the next (e.g. 3rd) set of values
-          vld4.32         {d0, d2, d4, d6}, [r1]!
-          vld4.32         {d1, d3, d5, d7}, [r1]!
-          vld4.32         {d16, d18, d20, d22}, [r2]!
-          vld4.32         {d17, d19, d21, d23}, [r2]!
-          subs            r3, r3, #4
+        @ calculate values for the next set
+        vsub.f32        q12, q0, q8
+        vsub.f32        q13, q1, q9
+        vsub.f32        q14, q2, q10
+        vsub.f32        q15, q3, q11
 
-        bgt               .L_mainloop_vec4             @ loop if r2 is > r3, if we have at least another 4 vectors (16 floats) to process
+        bgt               .L_mainloop_vec4             @ loop while r3 > 0, i.e. while we have at least another 4 vectors (16 floats) to process
 
 .L_mainloopend_vec4:
         @ the last iteration for this call
-        @ store the result for the set of values before the last one (e.g 2nd set)
-          vst4.32         {d24, d26, d28, d30}, [r0]!
-          vst4.32         {d25, d27, d29, d31}, [r0]!
-
-        @ calculate values for the last (e.g. 3rd) set
-          vsub.f32        q12, q0, q8
-          vsub.f32        q13, q1, q9
-          vsub.f32        q14, q2, q10
-          vsub.f32        q15, q3, q11
-
-        @ store the result for the last (e.g. 3rd) set
+        @ store the result for the last set
           vst4.32         {d24, d26, d28, d30}, [r0]!
           vst4.32         {d25, d27, d29, d31}, [r0]!
 
@@ -436,26 +384,17 @@ sub_vec4f_neon:
 
 .L_secondloop_vec4:
      @ process the last few items left in the input array
-        vld4.f32          {d0[0], d2[0], d4[0], d6[0]}, [r1]!     @ The values are loaded like so:
-                                                                  @      q0 = { V1.x, -, -, - };
-                                                                  @      q1 = { V1.y, -, -, - };
-                                                                  @      q2 = { V1.z, -, -, - };
-                                                                  @      q3 = { V1.w, -, -, - };
-        vld4.f32          {d1[0], d3[0], d5[0], d7[0]}, [r2]!     @ The values are loaded like so:
-                                                                  @      q0 = { V1.x, -, V2.x, - };
-                                                                  @      q1 = { V1.y, -, V2.y, - };
-                                                                  @      q2 = { V1.z, -, V2.z, - };
-                                                                  @      q3 = { V1.w, -, V2.w, - };
+        vld1.f32          {d0, d1}, [r1]!     @ The values are loaded like so:
+                                                                  @      q0 = { V1.x, V1.y, V1.z, V1.w };
+        vld1.f32          {d2, d3}, [r2]!     @ The values are loaded like so:
+                                                                  @      q1 = { V2.x, V2.y, V2.z, V2.w };
 
         subs              r4, r4, #1
 
         @ calculate values
-        vsub.f32          d0, d0, d1
-        vsub.f32          d2, d2, d3
-        vsub.f32          d4, d4, d5
-        vsub.f32          d6, d6, d7
+        vsub.f32          q0, q0, q1
 
-        vst4.32          {d0[0], d2[0], d4[0], d6[0]}, [r0]!
+        vst1.32          {d0, d1}, [r0]!
 
         bgt               .L_secondloop_vec4
 
-- 
2.7.4