From 91e7e0eb445f3363f72d795b9caaac9205404dd2 Mon Sep 17 00:00:00 2001
From: Ramin Zaghi <ramin@arm.com>
Date: Tue, 3 Apr 2012 10:52:18 +0000
Subject: [PATCH] the new vectorized division routines.

---
 source/NE10_div.c      |  30 +++
 source/NE10_div.neon.c |  46 ----
 source/NE10_div.neon.s | 640 +++++++++++++++++++++++++++++++++++++++++++++++++
 source/NE10_div_test.c |  34 ++-
 4 files changed, 701 insertions(+), 49 deletions(-)
 delete mode 100644 source/NE10_div.neon.c
 create mode 100644 source/NE10_div.neon.s

diff --git a/source/NE10_div.c b/source/NE10_div.c
index 1348de0..281a42d 100644
--- a/source/NE10_div.c
+++ b/source/NE10_div.c
@@ -30,3 +30,33 @@ arm_result_t div_float_c(arm_float_t * dst, arm_float_t * src1, arm_float_t * sr
     dst[ itr ] = src1[ itr ] / src2[ itr ];
   );
 }
+
+arm_result_t vdiv_vec2f_c(arm_vec2f_t * dst, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count)
+{
+  NE10_X_OPERATION_FLOAT_C
+  (
+    dst[ itr ].x = src1[ itr ].x / src2[ itr ].x;
+    dst[ itr ].y = src1[ itr ].y / src2[ itr ].y;
+  );
+}
+
+arm_result_t vdiv_vec3f_c(arm_vec3f_t * dst, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count)
+{
+  NE10_X_OPERATION_FLOAT_C
+  (
+    dst[ itr ].x = src1[ itr ].x / src2[ itr ].x;
+    dst[ itr ].y = src1[ itr ].y / src2[ itr ].y;
+    dst[ itr ].z = src1[ itr ].z / src2[ itr ].z;
+  );
+}
+
+arm_result_t vdiv_vec4f_c(arm_vec4f_t * dst, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count)
+{
+  NE10_X_OPERATION_FLOAT_C
+  (
+    dst[ itr ].x = src1[ itr ].x / src2[ itr ].x;
+    dst[ itr ].y = src1[ itr ].y / src2[ itr ].y;
+    dst[ itr ].z = src1[ itr ].z / src2[ itr ].z;
+    dst[ itr ].w = src1[ itr ].w / src2[ itr ].w;
+  );
+}
diff --git a/source/NE10_div.neon.c b/source/NE10_div.neon.c
deleted file mode 100644
index df90a6a..0000000
--- a/source/NE10_div.neon.c
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
- *  Copyright 2011-12 ARM Limited
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-/*
- * NE10 Library : source/NE10_div.neon.c
- */
-
-#include "NE10.h"
-#include "../headers/macros.h"
-
-#include <assert.h>
-#include <arm_neon.h>
-
- #include <stdio.h>
- #include <stdlib.h>
-
-arm_result_t div_float_neon(arm_float_t * dst, arm_float_t * src1, arm_float_t * src2, unsigned int count)
-{
-    NE10_X_OPERATION_FLOAT_NEON
-    (
-      /* a single division operation */
-      float32x4_t rec = vrecpeq_f32( n_src2 );
-      rec = vmulq_f32(vrecpsq_f32(n_src2, rec), rec);
-      rec = vmulq_f32(vrecpsq_f32(n_src2, rec), rec);
-      n_dst = vmulq_f32( n_src , rec );
-      ,
-      /* a single division operation */
-      float32x2_t rec = vrecpe_f32( n_tmp_src2 );
-      rec = vmul_f32(vrecps_f32(n_tmp_src2, rec), rec);
-      rec = vmul_f32(vrecps_f32(n_tmp_src2, rec), rec);
-      n_tmp_src = vmul_f32( n_tmp_src, rec );
-    );
-}
diff --git a/source/NE10_div.neon.s b/source/NE10_div.neon.s
new file mode 100644
index 0000000..039f349
--- /dev/null
+++ b/source/NE10_div.neon.s
@@ -0,0 +1,640 @@
+@
+@  Copyright 2011-12 ARM Limited
+@
+@  Licensed under the Apache License, Version 2.0 (the "License");
+@  you may not use this file except in compliance with the License.
+@  You may obtain a copy of the License at
+@
+@      http://www.apache.org/licenses/LICENSE-2.0
+@
+@  Unless required by applicable law or agreed to in writing, software
+@  distributed under the License is distributed on an "AS IS" BASIS,
+@  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@  See the License for the specific language governing permissions and
+@  limitations under the License.
+@
+
+@
+@ NE10 Library : source/NE10_div.neon.s
+@
+
+        .text
+        .syntax   unified
+
+.include "headers/NE10header.s"
+
+
+
+
+        .balign   4
+        .global   div_float_neon
+        .thumb
+        .thumb_func
+
+div_float_neon:
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+        @
+        @ arm_result_t div_float(arm_float_t * dst,
+        @                 arm_float_t * src1,
+        @                 arm_float_t * src2,
+        @                 unsigned int count)
+        @
+        @  r0: *dst & current dst entry's address
+        @  r1: *src1 & current src1 entry's address
+        @  r2: *src2 & current src2 entry's address
+        @  r3: int count & the number of items in the input array that can be
+        @                   processed in chunks of 4 vectors
+        @
+        @  r4:  the number of items that are left to be processed at the end of
+        @                   the input array
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+        push              {r4}
+        and               r4, r3, #3          @ r4 = count % 4;
+        sub               r3, r3, r4          @ count = count - r4; This is what's left to be processed after this loop
+
+        cbz               r3, .L_check_float
+
+        @ load the 1st set of values
+          vld1.32         {q0}, [r1]!
+          vld1.32         {q1}, [r2]!
+          subs            r3, r3, #8          @ 4 for this set, and 4 for the 2nd set
+
+        @ calculate values for the 1st set
+          vrecpe.f32 q3, q1
+          vrecps.f32 q1, q3, q1
+          vmul.f32   q3, q1, q3
+          vmul.f32   q3, q0, q3
+
+
+        @ load the 2nd set of values
+          vld1.32         {q0}, [r1]!
+          vld1.32         {q1}, [r2]!
+
+          ble             .L_mainloopend_float
+
+.L_mainloop_float:
+        @ store the result for the 1st/next (e.g. 3rd) set
+          vst1.32         {d6,d7}, [r0]!
+
+        @ calculate values for the 2nd/next (e.g. 3rd) set
+          vrecpe.f32 q3, q1
+          vrecps.f32 q1, q3, q1
+          vmul.f32   q3, q1, q3
+          vmul.f32   q3, q0, q3
+
+       @ load the next (e.g. 3rd) set of values
+          vld1.32         {q0}, [r1]!
+          vld1.32         {q1}, [r2]!
+          subs            r3, r3, #4
+
+        bgt             .L_mainloop_float             @ loop if r2 is > r3, if we have at least another 4 floats
+
+.L_mainloopend_float:
+        @ the last iteration for this call
+        @ store the result for the set of values before the last one (e.g 2nd set)
+          vst1.32         {d6,d7}, [r0]!
+
+        @ calculate values for the last (e.g. 3rd) set
+          vrecpe.f32 q3, q1
+          vrecps.f32 q1, q3, q1
+          vmul.f32   q3, q1, q3
+          vmul.f32   q3, q0, q3
+
+        @ store the result for the last (e.g. 3rd) set
+          vst1.32         {d6,d7}, [r0]!
+
+
+.L_check_float:
+     @ check if anything left to process at the end of the input array
+        cmp               r4, #0
+        ble               .L_return_float
+
+.L_secondloop_float:
+     @ process the last few items left in the input array
+        vld1.f32          d0[0], [r1]!           @ Fill in d0[0]
+        vld1.f32          d1[0], [r2]!           @ Fill in d1[1]
+
+
+        subs              r4, r4, #1
+
+        @ values d0 = d0 / d1
+          vrecpe.f32 d3, d1
+          vrecps.f32 d1, d3, d1
+          vmul.f32   d3, d1, d3
+          vmul.f32   d0, d0, d3
+
+        vst1.32           {d0[0]}, [r0]!
+
+        bgt               .L_secondloop_float
+
+.L_return_float:
+     @ return
+        pop               {r4}
+        mov               r0, #0
+        bx                lr
+
+
+
+
+        .balign   4
+        .global   vdiv_vec2f_neon
+        .thumb
+        .thumb_func
+
+vdiv_vec2f_neon:
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+        @
+        @ arm_result_t div_float(arm_vec2f_t * dst,
+        @                 arm_vec2f_t * src1,
+        @                 arm_vec2f_t * src2,
+        @                 unsigned int count)
+        @
+        @  r0: *dst & current dst entry's address
+        @  r1: *src1 & current src1 entry's address
+        @  r2: *src2 & current src2 entry's address
+        @  r3: int count & the number of items in the input array that can be
+        @                   processed in chunks of 4 vectors
+        @
+        @  r4:  the number of items that are left to be processed at the end of
+        @                   the input array
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+        push              {r4}
+        and               r4, r3, #3          @ r4 = count % 4;
+        sub               r3, r3, r4          @ count = count - r3; This is what's left to be processed after this loop
+
+        cmp               r3, #0
+        beq               .L_check_vec2
+
+        @ load the 1st set of values
+          vld2.32         {q0-q1}, [r1]!
+          vld2.32         {q2-q3}, [r2]!
+          subs            r3, r3, #8          @ 4 for this set, and 4 for the 2nd set
+
+        @ calculate values for the 1st set
+        @ q8 = q0 / q2
+          vrecpe.f32 q8, q2
+          vrecps.f32 q2, q8, q2
+          vmul.f32   q8, q2, q8
+          vmul.f32   q8, q0, q8
+
+        @ q9 = q1 / q3
+          vrecpe.f32 q9, q3
+          vrecps.f32 q3, q9, q3
+          vmul.f32   q9, q3, q9
+          vmul.f32   q9, q1, q9
+
+        @ load the 2nd set of values
+          vld2.32         {q0-q1}, [r1]!
+          vld2.32         {q2-q3}, [r2]!
+
+          ble             .L_mainloopend_vec2
+
+.L_mainloop_vec2:
+        @ store the result for the 1st/next (e.g. 3rd) set
+          vst2.32         {d16,d17,d18,d19}, [r0]!
+
+        @ calculate values for the 2nd/next (e.g. 3rd) set
+        @ q8 = q0 / q2
+          vrecpe.f32 q8, q2
+          vrecps.f32 q2, q8, q2
+          vmul.f32   q8, q2, q8
+          vmul.f32   q8, q0, q8
+
+        @ q9 = q1 / q3
+          vrecpe.f32 q9, q3
+          vrecps.f32 q3, q9, q3
+          vmul.f32   q9, q3, q9
+          vmul.f32   q9, q1, q9
+
+       @ load the next (e.g. 3rd) set of values
+          vld2.32         {q0-q1}, [r1]!
+          vld2.32         {q2-q3}, [r2]!
+          subs            r3, r3, #4
+
+        bgt             .L_mainloop_vec2             @ loop if r2 is > r3, if we have at least another 4 vectors (8 floats) to process
+
+.L_mainloopend_vec2:
+        @ the last iteration for this call
+        @ store the result for the set of values before the last one (e.g 2nd set)
+          vst2.32         {d16,d17,d18,d19}, [r0]!
+
+        @ calculate values for the last (e.g. 3rd) set
+        @ q8 = q0 / q2
+          vrecpe.f32 q8, q2
+          vrecps.f32 q2, q8, q2
+          vmul.f32   q8, q2, q8
+          vmul.f32   q8, q0, q8
+
+        @ q9 = q1 / q3
+          vrecpe.f32 q9, q3
+          vrecps.f32 q3, q9, q3
+          vmul.f32   q9, q3, q9
+          vmul.f32   q9, q1, q9
+
+        @ store the result for the last (e.g. 3rd) set
+          vst2.32         {d16,d17,d18,d19}, [r0]!
+
+.L_check_vec2:
+     @ check if anything left to process at the end of the input array
+        cmp               r4, #0
+        ble               .L_return_vec2
+
+.L_secondloop_vec2:
+     @ process the last few items left in the input array
+        vld1.f32          d0, [r1]!
+        vld1.f32          d1, [r2]!
+
+        subs              r4, r4, #1
+
+        @ calculate values
+        @ d0 = d0 / d1
+          vrecpe.f32 d4, d1
+          vrecps.f32 d1, d4, d1
+          vmul.f32   d4, d1, d4
+          vmul.f32   d0, d0, d4
+
+        vst1.32           {d0}, [r0]!
+
+        bgt               .L_secondloop_vec2
+
+.L_return_vec2:
+     @ return
+        pop               {r4}
+        mov               r0, #0
+        bx                lr
+
+
+
+
+        .align  2
+        .global vdiv_vec3f_neon
+        .thumb
+        .thumb_func
+vdiv_vec3f_neon:
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+        @
+        @ arm_result_t div_float(arm_vec3f_t * dst,
+        @                 arm_vec3f_t * src1,
+        @                 arm_vec3f_t * src2,
+        @                 unsigned int count)
+        @
+        @  r0: *dst & current dst entry's address
+        @  r1: *src1 & current src1 entry's address
+        @  r2: *src2 & current src2 entry's address
+        @  r3: int count & the number of items in the input array that can be
+        @                   processed in chunks of 4 vectors
+        @
+        @  r4:  the number of items that are left to be processed at the end of
+        @                   the input array
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+        push              {r4}
+        and               r4, r3, #3          @ r3 = count % 4;
+        sub               r3, r3, r4          @ count = count - r3; This is what's left to be processed after this loop
+
+        cmp               r3, #0
+        beq               .L_check_vec3
+
+        @ load the 1st set of values
+          vld3.32         {d0, d2, d4}, [r1]!
+          vld3.32         {d1, d3, d5}, [r1]!
+          vld3.32         {d18, d20, d22}, [r2]!
+          vld3.32         {d19, d21, d23}, [r2]!
+          subs            r3, r3, #8          @ 4 for this set, and 4 for the 2nd set
+
+        @ calculate values for the 1st set
+          @  q12 = q0 / q9
+          vrecpe.f32 q12, q9
+          vrecps.f32 q9 , q12, q9
+          vmul.f32   q12, q9 , q12
+          vmul.f32   q12, q0 , q12
+
+          @  q13 = q1 / q10
+          vrecpe.f32 q13, q10
+          vrecps.f32 q10 , q13, q10
+          vmul.f32   q13, q10 , q13
+          vmul.f32   q13, q1 , q13
+
+          @  q14 = q2 / q11
+          vrecpe.f32 q14, q11
+          vrecps.f32 q11 , q14, q11
+          vmul.f32   q14, q11 , q14
+          vmul.f32   q14, q2 , q14
+
+        @ load the 2nd set of values
+          vld3.32         {d0, d2, d4}, [r1]!
+          vld3.32         {d1, d3, d5}, [r1]!
+          vld3.32         {d18, d20, d22}, [r2]!
+          vld3.32         {d19, d21, d23}, [r2]!
+
+          ble             .L_mainloopend_vec3
+
+.L_mainloop_vec3:
+        @ store the result for the 1st/next (e.g. 3rd) set
+          vst3.32         {d24, d26, d28}, [r0]!
+          vst3.32         {d25, d27, d29}, [r0]!
+
+        @ calculate values for the 2nd/next (e.g. 3rd) set
+          @  q12 = q0 / q9
+          vrecpe.f32 q12, q9
+          vrecps.f32 q9 , q12, q9
+          vmul.f32   q12, q9 , q12
+          vmul.f32   q12, q0 , q12
+
+          @  q13 = q1 / q10
+          vrecpe.f32 q13, q10
+          vrecps.f32 q10 , q13, q10
+          vmul.f32   q13, q10 , q13
+          vmul.f32   q13, q1 , q13
+
+          @  q14 = q2 / q11
+          vrecpe.f32 q14, q11
+          vrecps.f32 q11 , q14, q11
+          vmul.f32   q14, q11 , q14
+          vmul.f32   q14, q2 , q14
+
+       @ load the next (e.g. 3rd) set of values
+          vld3.32         {d0, d2, d4}, [r1]!
+          vld3.32         {d1, d3, d5}, [r1]!
+          vld3.32         {d18, d20, d22}, [r2]!
+          vld3.32         {d19, d21, d23}, [r2]!
+          subs            r3, r3, #4
+
+        bgt               .L_mainloop_vec3             @ loop if r2 is > r3, if we have at least another 4 vectors (12 floats) to process
+
+.L_mainloopend_vec3:
+        @ the last iteration for this call
+        @ store the result for the set of values before the last one (e.g 2nd set)
+          vst3.32         {d24, d26, d28}, [r0]!
+          vst3.32         {d25, d27, d29}, [r0]!
+
+        @ calculate values for the last (e.g. 3rd) set
+          @  q12 = q0 / q9
+          vrecpe.f32 q12, q9
+          vrecps.f32 q9 , q12, q9
+          vmul.f32   q12, q9 , q12
+          vmul.f32   q12, q0 , q12
+
+          @  q13 = q1 / q10
+          vrecpe.f32 q13, q10
+          vrecps.f32 q10 , q13, q10
+          vmul.f32   q13, q10 , q13
+          vmul.f32   q13, q1 , q13
+
+          @  q14 = q2 / q11
+          vrecpe.f32 q14, q11
+          vrecps.f32 q11 , q14, q11
+          vmul.f32   q14, q11 , q14
+          vmul.f32   q14, q2 , q14
+
+        @ store the result for the last (e.g. 3rd) set
+          vst3.32         {d24, d26, d28}, [r0]!
+          vst3.32         {d25, d27, d29}, [r0]!
+
+.L_check_vec3:
+     @ check if anything left to process at the end of the input array
+        cmp               r4, #0
+        ble               .L_return_vec3
+
+.L_secondloop_vec3:
+     @ process the last few items left in the input array
+        vld3.f32          {d0[0], d2[0], d4[0]}, [r1]!     @ The values are loaded like so:
+                                                           @      q0 = { V1.x, -, -, - };
+                                                           @      q1 = { V1.y, -, -, - };
+                                                           @      q2 = { V1.z, -, -, - };
+        vld3.f32          {d1[0], d3[0], d5[0]}, [r2]!     @ The values are loaded like so:
+                                                           @      q0 = { V1.x, -, V2.x, - };
+                                                           @      q1 = { V1.y, -, V2.y, - };
+                                                           @      q2 = { V1.z, -, V2.z, - };
+
+        subs              r4, r4, #1
+
+        @ calculate values for
+          vrecpe.f32 d18, d1
+          vrecps.f32 d1 , d18, d1
+          vmul.f32   d18, d1 , d18
+          vmul.f32   d0 , d0 , d18
+
+          vrecpe.f32 d20, d3
+          vrecps.f32 d3 , d20, d3
+          vmul.f32   d20, d3 , d20
+          vmul.f32   d2 , d2 , d20
+
+          vrecpe.f32 d22, d5
+          vrecps.f32 d5 , d22, d5
+          vmul.f32   d22, d5 , d22
+          vmul.f32   d4 , d4 , d22
+
+        vst3.32           {d0[0], d2[0], d4[0]}, [r0]!
+
+        bgt               .L_secondloop_vec3
+
+.L_return_vec3:
+     @ return
+        pop               {r4}
+        mov               r0, #0
+        bx                lr
+
+
+
+
+        .align  2
+        .global vdiv_vec4f_neon
+        .thumb
+        .thumb_func
+vdiv_vec4f_neon:
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+        @
+        @ arm_result_t div_float(arm_vec4f_t * dst,
+        @                 arm_vec4f_t * src1,
+        @                 arm_vec4f_t * src2,
+        @                 unsigned int count)
+        @
+        @  r0: *dst & current dst entry's address
+        @  r1: *src1 & current src1 entry's address
+        @  r2: *src2 & current src2 entry's address
+        @  r3: int count & the number of items in the input array that can be
+        @                   processed in chunks of 4 vectors
+        @
+        @  r4:  the number of items that are left to be processed at the end of
+        @                   the input array
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+        push              {r4}
+        and               r4, r3, #3          @ r4 = count % 4;
+        sub               r3, r3, r4          @ count = count - r4; This is what's left to be processed after this loop
+
+        cmp               r3, #0
+        beq               .L_check_vec4
+
+        @ load the 1st set of values
+          vld4.32         {d0, d2, d4, d6}, [r1]!
+          vld4.32         {d1, d3, d5, d7}, [r1]!
+          vld4.32         {d16, d18, d20, d22}, [r2]!
+          vld4.32         {d17, d19, d21, d23}, [r2]!
+
+          subs            r3, r3, #8           @ 4 for this set, and 4 for the 2nd set
+
+        @ calculate values for the 1st set
+          @  q12 = q0 / q8
+          vrecpe.f32 q12, q8
+          vrecps.f32 q8 , q12, q8
+          vmul.f32   q12, q8 , q12
+          vmul.f32   q12, q0 , q12
+
+          @  q13 = q1 / q9
+          vrecpe.f32 q13, q9
+          vrecps.f32 q9 , q13, q9
+          vmul.f32   q13, q9 , q13
+          vmul.f32   q13, q1 , q13
+
+          @  q14 = q2 / q10
+          vrecpe.f32 q14, q10
+          vrecps.f32 q10 , q14, q10
+          vmul.f32   q14, q10 , q14
+          vmul.f32   q14, q2 , q14
+
+          @  q15 = q3 / q11
+          vrecpe.f32 q15, q11
+          vrecps.f32 q11 , q15, q11
+          vmul.f32   q15, q11 , q15
+          vmul.f32   q15, q3 , q15
+
+        @ load the 2nd set of values
+          vld4.32         {d0, d2, d4, d6}, [r1]!
+          vld4.32         {d1, d3, d5, d7}, [r1]!
+          vld4.32         {d16, d18, d20, d22}, [r2]!
+          vld4.32         {d17, d19, d21, d23}, [r2]!
+
+          ble             .L_mainloopend_vec4
+
+.L_mainloop_vec4:
+        @ store the result for the 1st/next (e.g. 3rd) set
+          vst4.32         {d24, d26, d28, d30}, [r0]!
+          vst4.32         {d25, d27, d29, d31}, [r0]!
+
+        @ calculate values for the 2nd/next (e.g. 3rd) set
+          @  q12 = q0 / q8
+          vrecpe.f32 q12, q8
+          vrecps.f32 q8 , q12, q8
+          vmul.f32   q12, q8 , q12
+          vmul.f32   q12, q0 , q12
+
+          @  q13 = q1 / q9
+          vrecpe.f32 q13, q9
+          vrecps.f32 q9 , q13, q9
+          vmul.f32   q13, q9 , q13
+          vmul.f32   q13, q1 , q13
+
+          @  q14 = q2 / q10
+          vrecpe.f32 q14, q10
+          vrecps.f32 q10 , q14, q10
+          vmul.f32   q14, q10 , q14
+          vmul.f32   q14, q2 , q14
+
+          @  q15 = q3 / q11
+          vrecpe.f32 q15, q11
+          vrecps.f32 q11 , q15, q11
+          vmul.f32   q15, q11 , q15
+          vmul.f32   q15, q3 , q15
+
+       @ load the next (e.g. 3rd) set of values
+          vld4.32         {d0, d2, d4, d6}, [r1]!
+          vld4.32         {d1, d3, d5, d7}, [r1]!
+          vld4.32         {d16, d18, d20, d22}, [r2]!
+          vld4.32         {d17, d19, d21, d23}, [r2]!
+          subs            r3, r3, #4
+
+        bgt               .L_mainloop_vec4             @ loop if r2 is > r3, if we have at least another 4 vectors (16 floats) to process
+
+.L_mainloopend_vec4:
+        @ the last iteration for this call
+        @ store the result for the set of values before the last one (e.g 2nd set)
+          vst4.32         {d24, d26, d28, d30}, [r0]!
+          vst4.32         {d25, d27, d29, d31}, [r0]!
+
+        @ calculate values for the last (e.g. 3rd) set
+          @  q12 = q0 / q8
+          vrecpe.f32 q12, q8
+          vrecps.f32 q8 , q12, q8
+          vmul.f32   q12, q8 , q12
+          vmul.f32   q12, q0 , q12
+
+          @  q13 = q1 / q9
+          vrecpe.f32 q13, q9
+          vrecps.f32 q9 , q13, q9
+          vmul.f32   q13, q9 , q13
+          vmul.f32   q13, q1 , q13
+
+          @  q14 = q2 / q10
+          vrecpe.f32 q14, q10
+          vrecps.f32 q10 , q14, q10
+          vmul.f32   q14, q10 , q14
+          vmul.f32   q14, q2 , q14
+
+          @  q15 = q3 / q11
+          vrecpe.f32 q15, q11
+          vrecps.f32 q11 , q15, q11
+          vmul.f32   q15, q11 , q15
+          vmul.f32   q15, q3 , q15
+
+        @ store the result for the last (e.g. 3rd) set
+          vst4.32         {d24, d26, d28, d30}, [r0]!
+          vst4.32         {d25, d27, d29, d31}, [r0]!
+
+.L_check_vec4:
+     @ check if anything left to process at the end of the input array
+        cmp               r4, #0
+        ble               .L_return_vec4
+
+.L_secondloop_vec4:
+     @ process the last few items left in the input array
+        vld4.f32          {d0[0], d2[0], d4[0], d6[0]}, [r1]!     @ The values are loaded like so:
+                                                                  @      q0 = { V1.x, -, -, - };
+                                                                  @      q1 = { V1.y, -, -, - };
+                                                                  @      q2 = { V1.z, -, -, - };
+                                                                  @      q3 = { V1.w, -, -, - };
+        vld4.f32          {d1[0], d3[0], d5[0], d7[0]}, [r2]!     @ The values are loaded like so:
+                                                                  @      q0 = { V1.x, -, V2.x, - };
+                                                                  @      q1 = { V1.y, -, V2.y, - };
+                                                                  @      q2 = { V1.z, -, V2.z, - };
+                                                                  @      q3 = { V1.w, -, V2.w, - };
+
+        subs              r4, r4, #1
+
+        @ calculate values
+          @  d0 = d0 / d1
+          vrecpe.f32 d18, d1
+          vrecps.f32 d1 , d18, d1
+          vmul.f32   d18, d1 , d18
+          vmul.f32   d0 , d0 , d18
+
+          @  d2 = d2 / d3
+          vrecpe.f32 d20, d3
+          vrecps.f32 d3 , d20, d3
+          vmul.f32   d20, d3 , d20
+          vmul.f32   d2 , d2 , d20
+
+          @  d4 = d4 / d5
+          vrecpe.f32 d22, d5
+          vrecps.f32 d5 , d22, d5
+          vmul.f32   d22, d5 , d22
+          vmul.f32   d4 , d4 , d22
+
+          @  d6 = d6 / d7
+          vrecpe.f32 d16, d7
+          vrecps.f32 d7 , d16, d7
+          vmul.f32   d16, d7 , d16
+          vmul.f32   d6 , d6 , d16
+
+        vst4.32          {d0[0], d2[0], d4[0], d6[0]}, [r0]!
+
+        bgt               .L_secondloop_vec4
+
+.L_return_vec4:
+     @ return
+        pop               {r4}
+        mov               r0, #0
+        bx                lr
diff --git a/source/NE10_div_test.c b/source/NE10_div_test.c
index 8527f97..3a605fc 100644
--- a/source/NE10_div_test.c
+++ b/source/NE10_div_test.c
@@ -23,20 +23,48 @@
 // length of the data arrays
 #define ARRLEN TEST_ARRLEN
 // number of the operations in a given unit
-#define OP_COUNT 1
+#define OP_COUNT 4
 // number of the different implementations of each of the functions (C, ASM, NEON, ...)
 #define IMPL_COUNT 3
 
 
-#include "../headers/unit_test_x_operation_x.h"
+#include "../headers/unit_test_x_operation_x_tolerant.h"
+
+extern arm_result_t div_float_c   (arm_float_t * dst, arm_float_t * src1, arm_float_t * src2, unsigned int count);
+//extern arm_result_t div_float_asm (arm_float_t * dst, arm_float_t * src1, arm_float_t * src2, unsigned int count); // the assembly versions haven't been implemented; these are for future use
+extern arm_result_t div_float_neon(arm_float_t * dst, arm_float_t * src1, arm_float_t * src2, unsigned int count);
+
+extern arm_result_t vdiv_vec2f_c   (arm_vec2f_t * dst, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count);
+//extern arm_result_t vdiv_vec2f_asm (arm_vec2f_t * dst, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count);
+extern arm_result_t vdiv_vec2f_neon(arm_vec2f_t * dst, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count);
+
+extern arm_result_t vdiv_vec3f_c   (arm_vec3f_t * dst, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
+//extern arm_result_t vdiv_vec3f_asm (arm_vec3f_t * dst, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
+extern arm_result_t vdiv_vec3f_neon(arm_vec3f_t * dst, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
+
+extern arm_result_t vdiv_vec4f_c   (arm_vec4f_t * dst, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count);
+//extern arm_result_t vdiv_vec4f_asm (arm_vec4f_t * dst, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count);
+extern arm_result_t vdiv_vec4f_neon(arm_vec4f_t * dst, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count);
 
 void init_ftbl()
 {
    // manually initialize the global function table with
    //  those functions that do have an actual implementation.
    ftbl[ 0] = (arm_func_4args_t) div_float_c;
-   ftbl[ 1] = (arm_func_4args_t) div_float_asm;
+   ftbl[ 1] = (arm_func_4args_t) div_float_c; // using the c version in place of the assembly version
    ftbl[ 2] = (arm_func_4args_t) div_float_neon;
+
+   ftbl[ 3] = (arm_func_4args_t) vdiv_vec2f_c;
+   ftbl[ 4] = (arm_func_4args_t) vdiv_vec2f_c; // using the c version in place of the assembly version
+   ftbl[ 5] = (arm_func_4args_t) vdiv_vec2f_neon;
+
+   ftbl[ 6] = (arm_func_4args_t) vdiv_vec3f_c;
+   ftbl[ 7] = (arm_func_4args_t) vdiv_vec3f_c; // using the c version in place of the assembly version
+   ftbl[ 8] = (arm_func_4args_t) vdiv_vec3f_neon;
+
+   ftbl[ 9] = (arm_func_4args_t) vdiv_vec4f_c;
+   ftbl[10] = (arm_func_4args_t) vdiv_vec4f_c; // using the c version in place of the assembly version
+   ftbl[11] = (arm_func_4args_t) vdiv_vec4f_neon;
 }
 
 arm_result_t main( int argc, char **argv )
-- 
2.7.4