fix the bugs in abs

author yang <yang.zhang@arm.com>

Wed, 18 Jul 2012 07:16:31 +0000 (15:16 +0800)

committer yang <yang.zhang@arm.com>

Wed, 18 Jul 2012 07:16:31 +0000 (15:16 +0800)
author yang <yang.zhang@arm.com>
Wed, 18 Jul 2012 07:16:31 +0000 (15:16 +0800)
committer yang <yang.zhang@arm.com>
Wed, 18 Jul 2012 07:16:31 +0000 (15:16 +0800)
diff --git a/source/NE10_abs.neon.s b/source/NE10_abs.neon.s

index 15c96f9..c0ece9b 100644 (file)
--- a/source/NE10_abs.neon.s
+++ b/source/NE10_abs.neon.s
@@ -26,7 +26,7 @@
  
  
  
-        .balign   4
+        .align   4
          .global   abs_float_neon
          .thumb
          .thumb_func
@@ -42,79 +42,58 @@ abs_float_neon:
          @  r1: *src & current src entry's address
          @  r2: int count & the number of items in the input array that can be
          @                   processed in chunks of 4 vectors
-        @  r3: the number of items that are left to be processed at the end of
+        @  r3: the number of residual items that will be processed at the beginning of
          @                   the input array
          @
          @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
  
-        and               r3, r2, #3          @ r3 = count % 4;
-        sub               r2, r2, r3          @ count = count - r3; This is what's left to be processed after this loop
-
-        cbz               r2, .L_check_float
-
-        @ load the 1st set of values
-          vld1.32         {q0}, [r1]!
-          subs            r2, r2, #8          @ 4 for this set, and 4 for the 2nd set
-
-        @ absolute values of the 1st set
-          vabs.f32        q3, q0         @ q3 = abs( q0 )
+        and               r3, r2, #3          @ r3 = count % 4; calculate the residual loop
+        asr               r2, r2, #2          @ r2 = count >> 2; calculate the main loop
  
-        @ load the 2nd set of values
-          vld1.32         {q0}, [r1]!
+        cbz               r3, .L_check_mainloop_float
  
-          ble             .L_mainloopend_float
+.L_residualloop_float:
+        @ process the residual items in the input array
+        vld1.f32          d0[0], [r1]!           @ Load one float into d0[0]; d0[1] is left unchanged
  
-.L_mainloop_float:
-        @ store the result for the 1st/next (e.g. 3rd) set
-          vst1.32         {d6,d7}, [r0]!
-
-        @ absolute values of the 2nd/next (e.g. 3rd) set
-          vabs.f32        q3, q0         @ q3 = abs( q0 )
-
-       @ load the next (e.g. 3rd) set of values
-        vld1.32           {q0}, [r1]!
-        subs              r2, r2, #4
+        subs              r3, r3, #1
  
-        bgt             .L_mainloop_float             @ loop if r2 is > r3, if we have at least another 4 floats
+        @ absolute values
+        vabs.f32          d0, d0
  
-.L_mainloopend_float:
-        @ the last iteration for this call
-        @ store the result for the set of values before the last one (e.g 2nd set)
-          vst1.32         {d6,d7}, [r0]!
+        vst1.32           {d0[0]}, [r0]!
  
-        @ absolute values of the last (e.g. 3rd) set
-          vabs.f32        q3, q0
+        bgt               .L_residualloop_float
  
-        @ store the result for the last (e.g. 3rd) set
-          vst1.32         {d6,d7}, [r0]!
+.L_check_mainloop_float:
+        cbz               r2, .L_return_float
  
-.L_check_float:
-     @ check if anything left to process at the end of the input array
-        cmp               r3, #0
-        ble               .L_return_float
+        @ load the current set of values
+        vld1.32         {q0}, [r1]!         @ for current set
  
-.L_secondloop_float:
-     @ process the last few items left in the input array
-        vld1.f32          d0[0], [r1]!           @ Fill in d0 = { V.x, V.y };
+.L_mainloop_float:
+        @ absolute values of the current set
+        vabs.f32        q3, q0         @ q3 = abs( q0 )
  
-        subs              r3, r3, #1
+        @ store the result for the current set
+        vst1.32         {d6,d7}, [r0]!
  
-        @ absolute values
-        vabs.f32          d0, d0
+        subs              r2, r2, #1
  
-        vst1.32           {d0[0]}, [r0]!
+        @ load the next set (on the final iteration this reads one set, 4 floats, past the end of src)
+        vld1.32           {q0}, [r1]!
  
-        bgt               .L_secondloop_float
+        bgt             .L_mainloop_float             @ loop if r2 > 0, if we have another 4 floats
  
  .L_return_float:
-     @ return
+        @ return
          mov               r0, #0
          bx                lr
  
  
  
  
-        .balign   4
+        .align   4
          .global   abs_vec2f_neon
          .thumb
          .thumb_func
@@ -135,77 +114,52 @@ abs_vec2f_neon:
          @
          @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
  
-        and               r3, r2, #3          @ r3 = count % 4;
-        sub               r2, r2, r3          @ count = count - r3; This is what's left to be processed after this loop
-
-        cbz               r2, .L_check_vec2
-
-        @ load the 1st set of values
-          vld2.32         {q0-q1}, [r1]!
-          subs            r2, r2, #8          @ 4 for this set, and 4 for the 2nd set
+        and               r3, r2, #3          @ r3 = count % 4; calculate the residual loop
+        asr               r2, r2, #2          @ r2 = count >> 2; calculate the main loop
  
-        @ absolute values of the 1st set
-          vabs.f32        q3, q0         @ q3 = abs( q0 )
-          vabs.f32        q4, q1         @ q4 = abs( q1 )
-
-        @ load the 2nd set of values
-          vld2.32         {q0-q1}, [r1]!
-
-          ble             .L_mainloopend_vec2
-
-.L_mainloop_vec2:
-        @ store the result for the 1st/next (e.g. 3rd) set
-          vst2.32         {d6,d7,d8,d9}, [r0]!
+        cbz               r3, .L_check_mainloop_vec2
  
-        @ absolute values of the 2nd/next (e.g. 3rd) set
-          vabs.f32        q3, q0         @ q3 = abs( q0 )
-          vabs.f32        q4, q1         @ q4 = abs( q1 )
+.L_residualloop_vec2:
+        @ process the residual items in the input array
+        vld1.f32          d0, [r1]!           @ Fill in d0 = { V.x, V.y };
  
-       @ load the next (e.g. 3rd) set of values
-          vld2.32         {q0-q1}, [r1]!
-          subs            r2, r2, #4
+        subs              r3, r3, #1
  
-        bgt             .L_mainloop_vec2             @ loop if r2 is > r3, if we have at least another 4 vectors (8 floats) to process
+        @ absolute values
+        vabs.f32          d0, d0
  
-.L_mainloopend_vec2:
-        @ the last iteration for this call
-        @ store the result for the set of values before the last one (e.g 2nd set)
-          vst2.32         {d6,d7,d8,d9}, [r0]!
+        vst1.32           {d0}, [r0]!
  
-        @ absolute values of the last (e.g. 3rd) set
-          vabs.f32        q3, q0         @ q3 = abs( q0 )
-          vabs.f32        q4, q1         @ q4 = abs( q1 )
+        bgt               .L_residualloop_vec2
  
-        @ store the result for the last (e.g. 3rd) set
-          vst2.32         {d6,d7,d8,d9}, [r0]!
+.L_check_mainloop_vec2:
+        cbz               r2, .L_return_vec2
  
-.L_check_vec2:
-     @ check if anything left to process at the end of the input array
-        cmp               r3, #0
-        ble               .L_return_vec2
+        @ load the current set of values
+        vld2.32         {q0-q1}, [r1]!        @ for current set
  
-.L_secondloop_vec2:
-     @ process the last few items left in the input array
-        vld1.f32          d0, [r1]!           @ Fill in d0 = { V.x, V.y };
+.L_mainloop_vec2:
+        @ absolute values of the current set
+        vabs.f32        q3, q0         @ q3 = abs( q0 )
+        vabs.f32        q4, q1         @ q4 = abs( q1 )
  
-        subs              r3, r3, #1
+        @ store the result for the current set
+        vst2.32         {d6,d7,d8,d9}, [r0]!
  
-        @ absolute values
-        vabs.f32          d0, d0
+        subs              r2, r2, #1
  
-        vst1.32           {d0}, [r0]!
+        @ load the next set (on the final iteration this reads one set, 4 vec2s, past the end of the input array)
+        vld2.32         {q0-q1}, [r1]!
  
-        bgt               .L_secondloop_vec2
+        bgt             .L_mainloop_vec2             @ loop if r2 > 0, if we have another 4 vec2s
  
  .L_return_vec2:
-     @ return
+        @ return
          mov               r0, #0
          bx                lr
  
  
-
-
-        .align  2
+        .align  4
          .global abs_vec3f_neon
          .thumb
          .thumb_func
@@ -225,67 +179,13 @@ abs_vec3f_neon:
          @
          @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
  
-        and               r3, r2, #3          @ r3 = count % 4;
-        sub               r2, r2, r3          @ count = count - r3; This is what's left to be processed after this loop
-
-        cmp               r2, #0
-        beq               .L_check_vec3
-
-        @ load the 1st set of values
-          vld3.32         {d0, d2, d4}, [r1]!
-          vld3.32         {d1, d3, d5}, [r1]!
-          subs            r2, r2, #4          @ 4 for this set, and 4 for the 2nd set
+        and               r3, r2, #3          @ r3 = count % 4; calculate the residual loop
+        asr               r2, r2, #2          @ r2 = count >> 2; calculate the main loop
  
-        @ absolute values of the 1st set
-          vabs.f32        q5, q0
-          vabs.f32        q6, q1
-          vabs.f32        q7, q2
+        cbz               r3, .L_check_mainloop_vec3
  
-        @ load the 2nd set of values
-          vld3.32         {d0, d2, d4}, [r1]!
-          vld3.32         {d1, d3, d5}, [r1]!
-
-          ble             .L_mainloopend_vec3
-
-.L_mainloop_vec3:
-        @ store the result for the 1st/next (e.g. 3rd) set
-          vst3.32         {d10, d12, d14}, [r0]!
-          vst3.32         {d11, d13, d15}, [r0]!
-
-        @ absolute values of the 2nd/next (e.g. 3rd) set
-          vabs.f32        q5, q0
-          vabs.f32        q6, q1
-          vabs.f32        q7, q2
-
-       @ load the next (e.g. 3rd) set of values
-          vld3.32         {d0, d2, d4}, [r1]!
-          vld3.32         {d1, d3, d5}, [r1]!
-          subs            r2, r2, #4
-
-        bgt               .L_mainloop_vec3             @ loop if r2 is > r3, if we have at least another 4 vectors (12 floats) to process
-
-.L_mainloopend_vec3:
-        @ the last iteration for this call
-        @ store the result for the set of values before the last one (e.g 2nd set)
-          vst3.32         {d10, d12, d14}, [r0]!
-          vst3.32         {d11, d13, d15}, [r0]!
-
-        @ absolute values of the last (e.g. 3rd) set
-          vabs.f32        q5, q0
-          vabs.f32        q6, q1
-          vabs.f32        q7, q2
-
-        @ store the result for the last (e.g. 3rd) set
-          vst3.32         {d10, d12, d14}, [r0]!
-          vst3.32         {d11, d13, d15}, [r0]!
-
-.L_check_vec3:
-     @ check if anything left to process at the end of the input array
-        cmp               r3, #0
-        ble               .L_return_vec3
-
-.L_secondloop_vec3:
-     @ process the last few items left in the input array
+.L_residualloop_vec3:
+        @ process the residual items in the input array
          vld3.f32          {d0[0], d2[0], d4[0]}, [r1]!     @ The values are loaded like so:
                                                             @      q0 = { V.x, -, -, - };
                                                             @      q1 = { V.y, -, -, - };
@@ -294,22 +194,47 @@ abs_vec3f_neon:
  
          @ absolute values
          vabs.f32          d0, d0
-        vabs.f32          d1, d1
          vabs.f32          d2, d2
+        vabs.f32          d4, d4
  
          vst3.32           {d0[0], d2[0], d4[0]}, [r0]!
  
-        bgt               .L_secondloop_vec3
+        bgt               .L_residualloop_vec3
+
+.L_check_mainloop_vec3:
+        cbz               r2, .L_return_vec3
+
+        @ load the current set of values
+        vld3.32         {d0, d2, d4}, [r1]!
+        vld3.32         {d1, d3, d5}, [r1]!        @ for current set
+
+.L_mainloop_vec3:
+        @ absolute values of the current set
+        vabs.f32        q5, q0
+        vabs.f32        q6, q1
+        vabs.f32        q7, q2
+
+        @ store the result for the current set
+        vst3.32         {d10, d12, d14}, [r0]!
+        vst3.32         {d11, d13, d15}, [r0]!
+
+        subs              r2, r2, #1
+
+        @ load the next set (on the final iteration this reads one set, 4 vec3s, past the end of the input array)
+        vld3.32         {d0, d2, d4}, [r1]!
+        vld3.32         {d1, d3, d5}, [r1]!        @ for next set
+
+        bgt             .L_mainloop_vec3             @ loop if r2 > 0, if we have another 4 vec3s
  
  .L_return_vec3:
-     @ return
+        @ return
          mov               r0, #0
          bx                lr
  
  
  
  
-        .align  2
+        .align  4
          .global abs_vec4f_neon
          .thumb
          .thumb_func
@@ -329,87 +254,52 @@ abs_vec4f_neon:
          @
          @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
  
-        and               r3, r2, #3          @ r3 = count % 4;
-        sub               r2, r2, r3          @ count = count - r3; This is what's left to be processed after this loop
+        and               r3, r2, #3          @ r3 = count % 4; calculate the residual loop
+        asr               r2, r2, #2          @ r2 = count >> 2; calculate the main loop
  
-        cmp               r2, #0
-        beq               .L_check_vec4
+        cbz               r3, .L_check_mainloop_vec4
+
+.L_residualloop_vec4:
+        @ process the residual items in the input array
+        vld1.f32          {d0, d1}, [r1]!     @ The values are loaded like so:
+                                              @      q0 = { V.x, V.y, V.z, V.w };
+        subs              r3, r3, #1
+
+        @ absolute values
+        vabs.f32          q0, q0
  
-        @ load the 1st set of values
-          vld4.32         {d0, d2, d4, d6}, [r1]!
-          vld4.32         {d1, d3, d5, d7}, [r1]!
-          subs            r2, r2, #8          @ 4 for this set, and 4 for the 2nd set
+        vst1.32          {d0, d1}, [r0]!
  
-        @ absolute values of the 1st set
-          vabs.f32        q10, q0
-          vabs.f32        q11, q1
-          vabs.f32        q12, q2
-          vabs.f32        q13, q3
+        bgt               .L_residualloop_vec4
  
-        @ load the 2nd set of values
-          vld4.32         {d0, d2, d4, d6}, [r1]!
-          vld4.32         {d1, d3, d5, d7}, [r1]!
+.L_check_mainloop_vec4:
+        cbz               r2, .L_return_vec4
  
-          ble             .L_mainloopend_vec4
+        @ load the current set of values
+        vld4.32         {d0, d2, d4, d6}, [r1]!
+        vld4.32         {d1, d3, d5, d7}, [r1]!     @ for current set
  
  .L_mainloop_vec4:
-        @ store the result for the 1st/next (e.g. 3rd) set
-          vst4.32         {d20, d22, d24, d26}, [r0]!
-          vst4.32         {d21, d23, d25, d27}, [r0]!
-
-        @ absolute values of the 2nd/next (e.g. 3rd) set
-          vabs.f32        q10, q0
-          vabs.f32        q11, q1
-          vabs.f32        q12, q2
-          vabs.f32        q13, q3
-
-       @ load the next (e.g. 3rd) set of values
-          vld4.32         {d0, d2, d4, d6}, [r1]!
-          vld4.32         {d1, d3, d5, d7}, [r1]!
-          subs            r2, r2, #4
-
-        bgt               .L_mainloop_vec4             @ loop if r2 is > r3, if we have at least another 4 vectors (16 floats) to process
-
-.L_mainloopend_vec4:
-        @ the last iteration for this call
-        @ store the result for the set of values before the last one (e.g 2nd set)
-          vst4.32         {d20, d22, d24, d26}, [r0]!
-          vst4.32         {d21, d23, d25, d27}, [r0]!
-
-        @ absolute values of the last (e.g. 3rd) set
-          vabs.f32        q10, q0
-          vabs.f32        q11, q1
-          vabs.f32        q12, q2
-          vabs.f32        q13, q3
-
-        @ store the result for the last (e.g. 3rd) set
-          vst4.32         {d20, d22, d24, d26}, [r0]!
-          vst4.32         {d21, d23, d25, d27}, [r0]!
-
-.L_check_vec4:
-     @ check if anything left to process at the end of the input array
-        cmp               r3, #0
-        ble               .L_return_vec4
-
-.L_secondloop_vec4:
-     @ process the last few items left in the input array
-        vld4.f32          {d0[0], d2[0], d4[0], d6[0]}, [r1]!     @ The values are loaded like so:
-                                                                  @      q0 = { V.x, -, -, - };
-                                                                  @      q1 = { V.y, -, -, - };
-                                                                  @      q2 = { V.z, -, -, - };
-        subs              r3, r3, #1
+        @ absolute values of the current set
+        vabs.f32        q10, q0
+        vabs.f32        q11, q1
+        vabs.f32        q12, q2
+        vabs.f32        q13, q3
  
-        @ absolute values
-        vabs.f32          d0, d0
-        vabs.f32          d2, d2
-        vabs.f32          d4, d4
-        vabs.f32          d6, d6
+        @ store the result for the current set
+        vst4.32         {d20, d22, d24, d26}, [r0]!
+        vst4.32         {d21, d23, d25, d27}, [r0]!
+
+        subs              r2, r2, #1
  
-        vst4.32          {d0[0], d2[0], d4[0], d6[0]}, [r0]!     @ The values are loaded like so:
+        @ load the next set (on the final iteration this reads one set, 4 vec4s, past the end of the input array)
+        vld4.32         {d0, d2, d4, d6}, [r1]!
+        vld4.32         {d1, d3, d5, d7}, [r1]!      @ for next set
  
-        bgt               .L_secondloop_vec4
+        bgt             .L_mainloop_vec4             @ loop if r2 > 0, if we have another 4 vec4s
  
  .L_return_vec4:
-     @ return
+        @ return
          mov               r0, #0
          bx                lr
+
author	yang <yang.zhang@arm.com>
	Wed, 18 Jul 2012 07:16:31 +0000 (15:16 +0800)
committer	yang <yang.zhang@arm.com>
	Wed, 18 Jul 2012 07:16:31 +0000 (15:16 +0800)