fix bugs in mul

author yang <yang.zhang@arm.com>

Fri, 20 Jul 2012 02:58:03 +0000 (10:58 +0800)

committer yang <yang.zhang@arm.com>

Fri, 20 Jul 2012 02:58:03 +0000 (10:58 +0800)
author yang <yang.zhang@arm.com>
Fri, 20 Jul 2012 02:58:03 +0000 (10:58 +0800)
committer yang <yang.zhang@arm.com>
Fri, 20 Jul 2012 02:58:03 +0000 (10:58 +0800)
diff --git a/source/NE10_mul.neon.s b/source/NE10_mul.neon.s

index e420d8ee7f6c59c3a86bd07b5228a67aa5c577b8..dc2aec561077e911aec008f9ae75df72fa59e181 100644 (file)
--- a/source/NE10_mul.neon.s
+++ b/source/NE10_mul.neon.s
@@ -26,7 +26,7 @@
  
  
  
-        .balign   4
+        .align   4
          .global   mul_float_neon
          .thumb
          .thumb_func
@@ -55,46 +55,35 @@ mul_float_neon:
  
          cbz               r3, .L_check_float
  
-        @ load the 1st set of values
-          vld1.32         {q0}, [r1]!
-          vld1.32         {q1}, [r2]!
-          subs            r3, r3, #8          @ 4 for this set, and 4 for the 2nd set
-
-        @ calculate values for the 1st set
-          vmul.f32        q3, q0, q1         @ q3 = q0 * q1
+        @ load the current set of values
+        vld1.32         {q0}, [r1]!
+        vld1.32         {q1}, [r2]!
+        subs            r3, r3, #4          @ 4 for this set
  
-        @ load the 2nd set of values
-          vld1.32         {q0}, [r1]!
-          vld1.32         {q1}, [r2]!
+        @ calculate values for the current set
+        vmul.f32        q3, q0, q1         @ q3 = q0 * q1
  
-          ble             .L_mainloopend_float
+        ble             .L_mainloopend_float
  
  .L_mainloop_float:
-        @ store the result for the 1st/next (e.g. 3rd) set
-          vst1.32         {d6,d7}, [r0]!
+        @ store the result for the current set
+        vst1.32         {d6,d7}, [r0]!
  
-        @ calculate values for the 2nd/next (e.g. 3rd) set
-          vmul.f32        q3, q0, q1         @ q3 = q0 * q1
+        @ load the next set of values
+        vld1.32         {q0}, [r1]!
+        vld1.32         {q1}, [r2]!
+        subs            r3, r3, #4
  
-       @ load the next (e.g. 3rd) set of values
-          vld1.32         {q0}, [r1]!
-          vld1.32         {q1}, [r2]!
-          subs            r3, r3, #4
+        @ calculate values for the next set
+        vmul.f32        q3, q0, q1         @ q3 = q0 * q1
  
-        bgt             .L_mainloop_float             @ loop if r2 is > r3, if we have at least another 4 floats
+        bgt             .L_mainloop_float             @ loop if r3 > 0, i.e. if we have at least another 4 floats
  
  .L_mainloopend_float:
          @ the last iteration for this call
-        @ store the result for the set of values before the last one (e.g 2nd set)
-          vst1.32         {d6,d7}, [r0]!
-
-        @ calculate values for the last (e.g. 3rd) set
-          vmul.f32        q3, q0, q1         @ q3 = q0 * q1
-
-        @ store the result for the last (e.g. 3rd) set
+        @ store the result for the last set
            vst1.32         {d6,d7}, [r0]!
  
-
  .L_check_float:
       @ check if anything left to process at the end of the input array
          cmp               r4, #0
@@ -105,7 +94,6 @@ mul_float_neon:
          vld1.f32          d0[0], [r1]!           @ Fill in d0[0]
          vld1.f32          d1[0], [r2]!           @ Fill in d1[1]
  
-
          subs              r4, r4, #1
  
          @ values
@@ -116,7 +104,7 @@ mul_float_neon:
          bgt               .L_secondloop_float
  
  .L_return_float:
-     @ return
+        @ return
          pop               {r4}
          mov               r0, #0
          bx                lr
@@ -124,7 +112,7 @@ mul_float_neon:
  
  
  
-        .balign   4
+        .align   4
          .global   vmul_vec2f_neon
          .thumb
          .thumb_func
@@ -154,46 +142,35 @@ vmul_vec2f_neon:
          cbz               r3, .L_check_vec2
  
          @ load the 1st set of values
-          vld2.32         {q0-q1}, [r1]!
-          vld2.32         {q2-q3}, [r2]!
-          subs            r3, r3, #8          @ 4 for this set, and 4 for the 2nd set
+        vld2.32         {q0-q1}, [r1]!
+        vld2.32         {q2-q3}, [r2]!
+        subs            r3, r3, #4          @ 4 for this set
  
          @ calculate values for the 1st set
-          vmul.f32        q4, q0, q2
-          vmul.f32        q5, q1, q3
+        vmul.f32        q4, q0, q2
+        vmul.f32        q5, q1, q3
  
-        @ load the 2nd set of values
-          vld2.32         {q0-q1}, [r1]!
-          vld2.32         {q2-q3}, [r2]!
-
-          ble             .L_mainloopend_vec2
+        ble             .L_mainloopend_vec2
  
  .L_mainloop_vec2:
-        @ store the result for the 1st/next (e.g. 3rd) set
-          vst2.32         {d8,d9,d10,d11}, [r0]!
+        @ store the result for the current set
+        vst2.32         {d8,d9,d10,d11}, [r0]!
  
-        @ calculate values for the 2nd/next (e.g. 3rd) set
-          vmul.f32        q4, q0, q2
-          vmul.f32        q5, q1, q3
+        @ load the next set of values
+        vld2.32         {q0-q1}, [r1]!
+        vld2.32         {q2-q3}, [r2]!
+        subs            r3, r3, #4
  
-       @ load the next (e.g. 3rd) set of values
-          vld2.32         {q0-q1}, [r1]!
-          vld2.32         {q2-q3}, [r2]!
-          subs            r3, r3, #4
+        @ calculate values for the next set
+        vmul.f32        q4, q0, q2
+        vmul.f32        q5, q1, q3
  
-        bgt             .L_mainloop_vec2             @ loop if r2 is > r3, if we have at least another 4 vectors (8 floats) to process
+        bgt             .L_mainloop_vec2             @ loop if r3 > 0, i.e. if we have at least another 4 vectors (8 floats) to process
  
  .L_mainloopend_vec2:
          @ the last iteration for this call
-        @ store the result for the set of values before the last one (e.g 2nd set)
-          vst2.32         {d8,d9,d10,d11}, [r0]!
-
-        @ calculate values for the last (e.g. 3rd) set
-          vmul.f32        q4, q0, q2
-          vmul.f32        q5, q1, q3
-
-        @ store the result for the last (e.g. 3rd) set
-          vst2.32         {d8,d9,d10,d11}, [r0]!
+        @ store the result for the last set
+        vst2.32         {d8,d9,d10,d11}, [r0]!
  
  .L_check_vec2:
       @ check if anything left to process at the end of the input array
@@ -215,7 +192,7 @@ vmul_vec2f_neon:
          bgt               .L_secondloop_vec2
  
  .L_return_vec2:
-     @ return
+        @ return
          pop               {r4}
          mov               r0, #0
          bx                lr
@@ -223,7 +200,7 @@ vmul_vec2f_neon:
  
  
  
-        .align  2
+        .align  4
          .global vmul_vec3f_neon
          .thumb
          .thumb_func
@@ -253,67 +230,51 @@ vmul_vec3f_neon:
          beq               .L_check_vec3
  
          @ load the 1st set of values
-          vld3.32         {d0, d2, d4}, [r1]!
-          vld3.32         {d1, d3, d5}, [r1]!
-          vld3.32         {d6, d8, d10}, [r2]!
-          vld3.32         {d7, d9, d11}, [r2]!
-          subs            r3, r3, #4
+        vld3.32         {d0, d2, d4}, [r1]!
+        vld3.32         {d1, d3, d5}, [r1]!
+        vld3.32         {d6, d8, d10}, [r2]!
+        vld3.32         {d7, d9, d11}, [r2]!
+        subs            r3, r3, #4
  
          @ calculate values for the 1st set
-          vmul.f32        q10, q0, q3
-          vmul.f32        q11, q1, q4
-          vmul.f32        q12, q2, q5
-
-        @ load the 2nd set of values
-          vld3.32         {d0, d2, d4}, [r1]!
-          vld3.32         {d1, d3, d5}, [r1]!
-          vld3.32         {d6, d8, d10}, [r2]!
-          vld3.32         {d7, d9, d11}, [r2]!
-          subs            r3, r3, #4
+        vmul.f32        q10, q0, q3
+        vmul.f32        q11, q1, q4
+        vmul.f32        q12, q2, q5
  
-          ble             .L_mainloopend_vec3
+        ble             .L_mainloopend_vec3
  
  .L_mainloop_vec3:
-        @ store the result for the 1st/next (e.g. 3rd) set
-          vst3.32         {d20, d22, d24}, [r0]!
-          vst3.32         {d21, d23, d25}, [r0]!
+        @ store the result for the current set
+        vst3.32         {d20, d22, d24}, [r0]!
+        vst3.32         {d21, d23, d25}, [r0]!
  
-        @ calculate values for the 2nd/next (e.g. 3rd) set
-          vmul.f32        q10, q0, q3
-          vmul.f32        q11, q1, q4
-          vmul.f32        q12, q2, q5
+        @ load the next set of values
+        vld3.32         {d0, d2, d4}, [r1]!
+        vld3.32         {d1, d3, d5}, [r1]!
+        vld3.32         {d6, d8, d10}, [r2]!
+        vld3.32         {d7, d9, d11}, [r2]!
+        subs            r3, r3, #4
  
-       @ load the next (e.g. 3rd) set of values
-          vld3.32         {d0, d2, d4}, [r1]!
-          vld3.32         {d1, d3, d5}, [r1]!
-          vld3.32         {d6, d8, d10}, [r2]!
-          vld3.32         {d7, d9, d11}, [r2]!
-          subs            r3, r3, #4
+        @ calculate values for the next set
+        vmul.f32        q10, q0, q3
+        vmul.f32        q11, q1, q4
+        vmul.f32        q12, q2, q5
  
-        bgt               .L_mainloop_vec3             @ loop if r2 is > r3, if we have at least another 4 vectors (12 floats) to process
+        bgt               .L_mainloop_vec3             @ loop if r3 > 0, i.e. if we have at least another 4 vectors (12 floats) to process
  
  .L_mainloopend_vec3:
          @ the last iteration for this call
-        @ store the result for the set of values before the last one (e.g 2nd set)
-          vst3.32         {d20, d22, d24}, [r0]!
-          vst3.32         {d21, d23, d25}, [r0]!
-
-        @ calculate values for the last (e.g. 3rd) set
-          vmul.f32        q10, q0, q3
-          vmul.f32        q11, q1, q4
-          vmul.f32        q12, q2, q5
-
-        @ store the result for the last (e.g. 3rd) set
+        @ store the result for the last set
            vst3.32         {d20, d22, d24}, [r0]!
            vst3.32         {d21, d23, d25}, [r0]!
  
  .L_check_vec3:
-     @ check if anything left to process at the end of the input array
+        @ check if anything left to process at the end of the input array
          cmp               r4, #0
          ble               .L_return_vec3
  
  .L_secondloop_vec3:
-     @ process the last few items left in the input array
+        @ process the last few items left in the input array
          vld3.f32          {d0[0], d2[0], d4[0]}, [r1]!     @ The values are loaded like so:
                                                             @      q0 = { V1.x, -, -, - };
                                                             @      q1 = { V1.y, -, -, - };
@@ -335,7 +296,7 @@ vmul_vec3f_neon:
          bgt               .L_secondloop_vec3
  
  .L_return_vec3:
-     @ return
+        @ return
          pop               {r4}
          mov               r0, #0
          bx                lr
@@ -343,7 +304,7 @@ vmul_vec3f_neon:
  
  
  
-        .align  2
+        .align  4
          .global vmul_vec4f_neon
          .thumb
          .thumb_func
@@ -373,63 +334,46 @@ vmul_vec4f_neon:
          beq               .L_check_vec4
  
          @ load the 1st set of values
-          vld4.32         {d0, d2, d4, d6}, [r1]!
-          vld4.32         {d1, d3, d5, d7}, [r1]!
-          vld4.32         {d8, d10, d12, d14}, [r2]!
-          vld4.32         {d9, d11, d13, d15}, [r2]!
+        vld4.32         {d0, d2, d4, d6}, [r1]!
+        vld4.32         {d1, d3, d5, d7}, [r1]!
+        vld4.32         {d8, d10, d12, d14}, [r2]!
+        vld4.32         {d9, d11, d13, d15}, [r2]!
  
-          subs            r3, r3, #4
+        subs            r3, r3, #4
  
          @ calculate values for the 1st set
-          vmul.f32        q10, q0, q4
-          vmul.f32        q11, q1, q5
-          vmul.f32        q12, q2, q6
-          vmul.f32        q13, q3, q7
+        vmul.f32        q10, q0, q4
+        vmul.f32        q11, q1, q5
+        vmul.f32        q12, q2, q6
+        vmul.f32        q13, q3, q7
  
-        @ load the 2nd set of values
-          vld4.32         {d0, d2, d4, d6}, [r1]!
-          vld4.32         {d1, d3, d5, d7}, [r1]!
-          vld4.32         {d8, d10, d12, d14}, [r2]!
-          vld4.32         {d9, d11, d13, d15}, [r2]!
-          subs            r3, r3, #4
-
-          ble             .L_mainloopend_vec4
+        ble             .L_mainloopend_vec4
  
  .L_mainloop_vec4:
-        @ store the result for the 1st/next (e.g. 3rd) set
-          vst4.32         {d20, d22, d24, d26}, [r0]!
-          vst4.32         {d21, d23, d25, d27}, [r0]!
+        @ store the result for the current set
+        vst4.32         {d20, d22, d24, d26}, [r0]!
+        vst4.32         {d21, d23, d25, d27}, [r0]!
  
-        @ calculate values for the 2nd/next (e.g. 3rd) set
-          vmul.f32        q10, q0, q4
-          vmul.f32        q11, q1, q5
-          vmul.f32        q12, q2, q6
-          vmul.f32        q13, q3, q7
+        @ load the next set of values
+        vld4.32         {d0, d2, d4, d6}, [r1]!
+        vld4.32         {d1, d3, d5, d7}, [r1]!
+        vld4.32         {d8, d10, d12, d14}, [r2]!
+        vld4.32         {d9, d11, d13, d15}, [r2]!
+        subs            r3, r3, #4
  
-       @ load the next (e.g. 3rd) set of values
-          vld4.32         {d0, d2, d4, d6}, [r1]!
-          vld4.32         {d1, d3, d5, d7}, [r1]!
-          vld4.32         {d8, d10, d12, d14}, [r2]!
-          vld4.32         {d9, d11, d13, d15}, [r2]!
-          subs            r3, r3, #4
+        @ calculate values for the next set
+        vmul.f32        q10, q0, q4
+        vmul.f32        q11, q1, q5
+        vmul.f32        q12, q2, q6
+        vmul.f32        q13, q3, q7
  
-        bgt               .L_mainloop_vec4             @ loop if r2 is > r3, if we have at least another 4 vectors (16 floats) to process
+        bgt               .L_mainloop_vec4             @ loop if r3 > 0, i.e. if we have at least another 4 vectors (16 floats) to process
  
  .L_mainloopend_vec4:
          @ the last iteration for this call
-        @ store the result for the set of values before the last one (e.g 2nd set)
-          vst4.32         {d20, d22, d24, d26}, [r0]!
-          vst4.32         {d21, d23, d25, d27}, [r0]!
-
-        @ calculate values for the last (e.g. 3rd) set
-          vmul.f32        q10, q0, q4
-          vmul.f32        q11, q1, q5
-          vmul.f32        q12, q2, q6
-          vmul.f32        q13, q3, q7
-
-        @ store the result for the last (e.g. 3rd) set
-          vst4.32         {d20, d22, d24, d26}, [r0]!
-          vst4.32         {d21, d23, d25, d27}, [r0]!
+        @ store the result for the last set
+        vst4.32         {d20, d22, d24, d26}, [r0]!
+        vst4.32         {d21, d23, d25, d27}, [r0]!
  
  .L_check_vec4:
       @ check if anything left to process at the end of the input array
@@ -438,31 +382,22 @@ vmul_vec4f_neon:
  
  .L_secondloop_vec4:
       @ process the last few items left in the input array
-        vld4.f32          {d0[0], d2[0], d4[0], d6[0]}, [r1]!     @ The values are loaded like so:
-                                                                  @      q0 = { V1.x, -, -, - };
-                                                                  @      q1 = { V1.y, -, -, - };
-                                                                  @      q2 = { V1.z, -, -, - };
-                                                                  @      q3 = { V1.w, -, -, - };
-        vld4.f32          {d1[0], d3[0], d5[0], d7[0]}, [r2]!     @ The values are loaded like so:
-                                                                  @      q0 = { V1.x, -, V2.x, - };
-                                                                  @      q1 = { V1.y, -, V2.y, - };
-                                                                  @      q2 = { V1.z, -, V2.z, - };
-                                                                  @      q3 = { V1.w, -, V2.w, - };
+        vld1.f32          {d0, d1}, [r1]!     @ The values are loaded like so:
+                                                                  @      q0 = { V1.x, V1.y, V1.z, V1.w };
+        vld1.f32          {d2, d3}, [r2]!     @ The values are loaded like so:
+                                                                  @      q1 = { V2.x, V2.y, V2.z, V2.w };
  
          subs              r4, r4, #1
  
          @ calculate values
-        vmul.f32          d0, d0, d1
-        vmul.f32          d2, d2, d3
-        vmul.f32          d4, d4, d5
-        vmul.f32          d6, d6, d7
+        vmul.f32          q0, q0, q1
  
-        vst4.32          {d0[0], d2[0], d4[0], d6[0]}, [r0]!
+        vst1.32          {d0, d1}, [r0]!
  
          bgt               .L_secondloop_vec4
  
  .L_return_vec4:
-     @ return
+        @ return
          pop               {r4}
          mov               r0, #0
          bx                lr
author	yang <yang.zhang@arm.com>
	Fri, 20 Jul 2012 02:58:03 +0000 (10:58 +0800)
committer	yang <yang.zhang@arm.com>
	Fri, 20 Jul 2012 02:58:03 +0000 (10:58 +0800)