From 8887904eeea5ec6a26b223f3fe3b5d48495689fa Mon Sep 17 00:00:00 2001
From: Ramin Zaghi <ramin@arm.com>
Date: Tue, 3 Apr 2012 10:16:03 +0000
Subject: [PATCH] preparing the code base for adding new routines.

---
 Makefile                                  |   2 +-
 headers/macros.h                          |   2 +
 headers/unit_test_abs_operation_x.h       |   2 +-
 headers/unit_test_common.h                |   3 +-
 headers/unit_test_len_operation_x.h       |   2 +-
 headers/unit_test_mla_operation_x.h       |   2 +-
 headers/unit_test_mlac_operation_x.h      |   2 +-
 headers/unit_test_normalize_operation_x.h |   2 +-
 headers/unit_test_setc_operation_x.h      |   2 +-
 headers/unit_test_x_operation_x.h         |   2 +-
 headers/unit_test_xc_operation_x.h        |   2 +-
 headers/versionheader.h                   |   4 +-
 headers/versionheader.s                   |   4 +-
 inc/NE10.h                                | 155 ++++++++++++++++++++++
 inc/NE10_asm.h                            |   6 +-
 inc/NE10_c.h                              |   6 +-
 inc/NE10_neon.h                           |   6 +-
 17 files changed, 181 insertions(+), 23 deletions(-)

diff --git a/Makefile b/Makefile
index 6a15ab7..1080b86 100644
--- a/Makefile
+++ b/Makefile
@@ -28,7 +28,7 @@ OPTIMIZE_FLAGS = -O3
 LDFLAGS+=-L.  -L/usr/local/lib -L/client/lib -L/lib/arm-linux-gnueabi
 LDFLAGS+=-lm
 
-ALLFILES = NE10_addc.c_r.o NE10_subc.c_r.o NE10_rsbc.c_r.o NE10_mulc.c_r.o NE10_divc.c_r.o NE10_mlac.c_r.o NE10_setc.c_r.o NE10_add.c_r.o NE10_sub.c_r.o NE10_mul.c_r.o NE10_div.c_r.o NE10_mla.c_r.o NE10_abs.c_r.o NE10_len.c_r.o NE10_normalize.c_r.o NE10_addc.neon_r.o NE10_subc.neon_r.o NE10_rsbc.neon_r.o NE10_mulc.neon_r.o NE10_divc.neon_r.o NE10_mlac.neon_r.o NE10_setc.neon_r.o NE10_add.neon_r.o NE10_sub.neon_r.o NE10_mul.neon_r.o NE10_div.neon_r.o NE10_mla.neon_r.o NE10_abs.neon_r.o NE10_len.neon_r.o NE10_normalize.neon_r.o
+ALLFILES = NE10_addc.c_r.o NE10_subc.c_r.o NE10_rsbc.c_r.o NE10_mulc.c_r.o NE10_divc.c_r.o NE10_mlac.c_r.o NE10_setc.c_r.o NE10_add.c_r.o NE10_sub.c_r.o NE10_mul.c_r.o NE10_div.c_r.o NE10_mla.c_r.o NE10_abs.c_r.o NE10_len.c_r.o NE10_normalize.c_r.o NE10_addc.neon_r.o NE10_subc.neon_r.o NE10_rsbc.neon_r.o NE10_mulc.neon_r.o NE10_divc.neon_r.o NE10_mlac.neon_r.o NE10_setc.neon_r.o NE10_add.neon_r.o NE10_sub.neon_r.o NE10_mul.neon_r.o NE10_div.neon_r.o NE10_mla.neon_r.o NE10_abs.neon_r.o NE10_len.neon_r.o NE10_normalize.neon_r.o NE10_dot.c_r.o NE10_dot.neon_r.o NE10_cross.c_r.o NE10_cross.neon_r.o
 
 #TARGET_ARCH = stdc
 
diff --git a/headers/macros.h b/headers/macros.h
index a8ae6d3..185b1a8 100644
--- a/headers/macros.h
+++ b/headers/macros.h
@@ -165,6 +165,8 @@
    ); \
   }
 
+#define NE10_DOT_OPERATION_X_C NE10_X_OPERATION_FLOAT_C
+
 ///// The "DstSrc" group of functions //////
 
 #define NE10_ABS_OPERATION_X_C(loopCode) { \
diff --git a/headers/unit_test_abs_operation_x.h b/headers/unit_test_abs_operation_x.h
index 93d511c..94d4ce7 100644
--- a/headers/unit_test_abs_operation_x.h
+++ b/headers/unit_test_abs_operation_x.h
@@ -174,7 +174,7 @@ arm_result_t run_test( int argc, char **argv )
                          assert ( _output[ ((1-1)*item_width)+pos ] == _output[ ((1-1)*item_width)+pos ] ); // check for not-a-number
                          assert ( _output[ ((impl-1)*item_width)+pos ] == _output[ ((impl-1)*item_width)+pos ] );  // check for not-a-number
 
-                         if ( ! EQUALS_FLOAT( _output[ ((1-1)*item_width)+pos ] , _output[ ((impl-1)*item_width)+pos ] , ACCEPTABLE_ERROR ) )
+                         if ( ! EQUALS_FLOAT( _output[ ((1-1)*item_width)+pos ] , _output[ ((impl-1)*item_width)+pos ] , ERROR_MARGIN_SMALL ) )
                          { fprintf( stderr, "\t\t WARNING: In opcode [%d], implementation [1] != implemenation [%d] on item [%d -> %d]\n",
                                     opcode, impl, i, pos+1 );
                              warns++; }
diff --git a/headers/unit_test_common.h b/headers/unit_test_common.h
index 4648170..40e91d3 100644
--- a/headers/unit_test_common.h
+++ b/headers/unit_test_common.h
@@ -69,7 +69,8 @@
 #define EXPONENT_MASK 0x807FFFFF
 
 // What's the acceptable error between the integer representations of two float values
-#define ACCEPTABLE_ERROR 2
+#define ERROR_MARGIN_SMALL 0x02
+#define ERROR_MARGIN_LARGE 0xFF
 
 // What's the acceptable number of warnings in a test
 #define ACCEPTABLE_WARNS 10
diff --git a/headers/unit_test_len_operation_x.h b/headers/unit_test_len_operation_x.h
index ebc8b13..98e1e07 100644
--- a/headers/unit_test_len_operation_x.h
+++ b/headers/unit_test_len_operation_x.h
@@ -176,7 +176,7 @@ arm_result_t run_test( int argc, char **argv )
                          assert ( _output[ ((1-1)*item_width)+pos ] == _output[ ((1-1)*item_width)+pos ] ); // check for not-a-number
                          assert ( _output[ ((impl-1)*item_width)+pos ] == _output[ ((impl-1)*item_width)+pos ] );  // check for not-a-number
 
-                         if ( ! EQUALS_FLOAT( _output[ ((1-1)*item_width)+pos ] , _output[ ((impl-1)*item_width)+pos ] , 0xFF ) ) // accept larger errors as we're doing a single step
+                         if ( ! EQUALS_FLOAT( _output[ ((1-1)*item_width)+pos ] , _output[ ((impl-1)*item_width)+pos ] , ERROR_MARGIN_LARGE ) ) // accept larger errors as we're doing a single step
                          { fprintf( stderr, "\t\t WARNING: In opcode [%d], implementation [1] != implemenation [%d] on item [%d -> %d]\n",
                                     opcode, impl, i, pos+1 );
                              warns++; }
diff --git a/headers/unit_test_mla_operation_x.h b/headers/unit_test_mla_operation_x.h
index 77bf46a..abe2b70 100644
--- a/headers/unit_test_mla_operation_x.h
+++ b/headers/unit_test_mla_operation_x.h
@@ -190,7 +190,7 @@ arm_result_t run_test( int argc, char **argv )
                          assert ( _output[ ((1-1)*item_width)+pos ] == _output[ ((1-1)*item_width)+pos ] ); // check for not-a-number
                          assert ( _output[ ((impl-1)*item_width)+pos ] == _output[ ((impl-1)*item_width)+pos ] );  // check for not-a-number
 
-                         if ( ! EQUALS_FLOAT( _output[ ((1-1)*item_width)+pos ] , _output[ ((impl-1)*item_width)+pos ], ACCEPTABLE_ERROR ) )
+                         if ( ! EQUALS_FLOAT( _output[ ((1-1)*item_width)+pos ] , _output[ ((impl-1)*item_width)+pos ], ERROR_MARGIN_LARGE ) )
                          { fprintf( stderr, "\t\t WARNING: In opcode [%d], implementation [1] != implemenation [%d] on item [%d -> %d]\n",
                                     opcode, impl, i, pos+1 );
                              warns++; }
diff --git a/headers/unit_test_mlac_operation_x.h b/headers/unit_test_mlac_operation_x.h
index 71333b8..5b7dc6e 100644
--- a/headers/unit_test_mlac_operation_x.h
+++ b/headers/unit_test_mlac_operation_x.h
@@ -214,7 +214,7 @@ arm_result_t run_test( int argc, char **argv )
                          assert ( _output[ ((1-1)*item_width)+pos ] == _output[ ((1-1)*item_width)+pos ] ); // check for not-a-number
                          assert ( _output[ ((impl-1)*item_width)+pos ] == _output[ ((impl-1)*item_width)+pos ] );  // check for not-a-number
 
-                         if ( ! EQUALS_FLOAT( _output[ ((1-1)*item_width)+pos ] , _output[ ((impl-1)*item_width)+pos ], ACCEPTABLE_ERROR ) )
+                         if ( ! EQUALS_FLOAT( _output[ ((1-1)*item_width)+pos ] , _output[ ((impl-1)*item_width)+pos ], ERROR_MARGIN_SMALL ) )
                          { fprintf( stderr, "\t\t WARNING: In opcode [%d], implementation [1] != implemenation [%d] on item [%d -> %d]\n",
                                     opcode, impl, i, pos+1 );
                              warns++; }
diff --git a/headers/unit_test_normalize_operation_x.h b/headers/unit_test_normalize_operation_x.h
index 59b9e36..75307f7 100644
--- a/headers/unit_test_normalize_operation_x.h
+++ b/headers/unit_test_normalize_operation_x.h
@@ -177,7 +177,7 @@ arm_result_t run_test( int argc, char **argv )
                          assert ( _output[ ((1-1)*item_width)+pos ] == _output[ ((1-1)*item_width)+pos ] ); // check for not-a-number
                          assert ( _output[ ((impl-1)*item_width)+pos ] == _output[ ((impl-1)*item_width)+pos ] );  // check for not-a-number
 
-                         if ( ! EQUALS_FLOAT( _output[ ((1-1)*item_width)+pos ] , _output[ ((impl-1)*item_width)+pos ] , 0xFF ) ) // accept larger errors as we're doing a single step
+                         if ( ! EQUALS_FLOAT( _output[ ((1-1)*item_width)+pos ] , _output[ ((impl-1)*item_width)+pos ] , ERROR_MARGIN_LARGE ) ) // accept larger errors as we're doing a single step
                          { fprintf( stderr, "\t\t WARNING: In opcode [%d], implementation [1] != implemenation [%d] on item [%d -> %d]\n",
                                     opcode, impl, i, pos+1 );
                              warns++; }
diff --git a/headers/unit_test_setc_operation_x.h b/headers/unit_test_setc_operation_x.h
index 85d6519..3c1d1cd 100644
--- a/headers/unit_test_setc_operation_x.h
+++ b/headers/unit_test_setc_operation_x.h
@@ -195,7 +195,7 @@ arm_result_t run_test( int argc, char **argv )
                          assert ( _output[ ((1-1)*item_width)+pos ] == _output[ ((1-1)*item_width)+pos ] ); // check for not-a-number
                          assert ( _output[ ((impl-1)*item_width)+pos ] == _output[ ((impl-1)*item_width)+pos ] );  // check for not-a-number
 
-                         if ( ! EQUALS_FLOAT( _output[ ((1-1)*item_width)+pos ] , _output[ ((impl-1)*item_width)+pos ], ACCEPTABLE_ERROR ) )
+                         if ( ! EQUALS_FLOAT( _output[ ((1-1)*item_width)+pos ] , _output[ ((impl-1)*item_width)+pos ], ERROR_MARGIN_SMALL ) )
                          { fprintf( stderr, "\t\t WARNING: In opcode [%d], implementation [1] != implemenation [%d] on item [%d -> %d]\n",
                                     opcode, impl, i, pos+1 );
                              warns++; }
diff --git a/headers/unit_test_x_operation_x.h b/headers/unit_test_x_operation_x.h
index 7c6aa46..0a969f3 100644
--- a/headers/unit_test_x_operation_x.h
+++ b/headers/unit_test_x_operation_x.h
@@ -182,7 +182,7 @@ arm_result_t run_test( int argc, char **argv )
                          assert ( _output[ ((1-1)*item_width)+pos ] == _output[ ((1-1)*item_width)+pos ] ); // check for not-a-number
                          assert ( _output[ ((impl-1)*item_width)+pos ] == _output[ ((impl-1)*item_width)+pos ] );  // check for not-a-number
 
-                         if ( ! EQUALS_FLOAT( _output[ ((1-1)*item_width)+pos ] , _output[ ((impl-1)*item_width)+pos ], ACCEPTABLE_ERROR ) )
+                         if ( ! EQUALS_FLOAT( _output[ ((1-1)*item_width)+pos ] , _output[ ((impl-1)*item_width)+pos ], ERROR_MARGIN_SMALL ) )
                          { fprintf( stderr, "\t\t WARNING: In opcode [%d], implementation [1] != implemenation [%d] on item [%d -> %d]\n",
                                     opcode, impl, i, pos+1 );
                              warns++; }
diff --git a/headers/unit_test_xc_operation_x.h b/headers/unit_test_xc_operation_x.h
index 6436e21..2bc81f6 100644
--- a/headers/unit_test_xc_operation_x.h
+++ b/headers/unit_test_xc_operation_x.h
@@ -203,7 +203,7 @@ arm_result_t run_test( int argc, char **argv )
                          assert ( _output[ ((1-1)*item_width)+pos ] == _output[ ((1-1)*item_width)+pos ] ); // check for not-a-number
                          assert ( _output[ ((impl-1)*item_width)+pos ] == _output[ ((impl-1)*item_width)+pos ] );  // check for not-a-number
 
-                         if ( ! EQUALS_FLOAT( _output[ ((1-1)*item_width)+pos ] , _output[ ((impl-1)*item_width)+pos ], ACCEPTABLE_ERROR ) )
+                         if ( ! EQUALS_FLOAT( _output[ ((1-1)*item_width)+pos ] , _output[ ((impl-1)*item_width)+pos ], ERROR_MARGIN_SMALL ) )
                          { fprintf( stderr, "\t\t WARNING: In opcode [%d], implementation [1] != implemenation [%d] on item [%d -> %d]\n",
                                     opcode, impl, i, pos+1 );
                              warns++; }
diff --git a/headers/versionheader.h b/headers/versionheader.h
index b07a91a..09a9977 100644
--- a/headers/versionheader.h
+++ b/headers/versionheader.h
@@ -23,9 +23,9 @@
 /////////////////////////////////////////////////////////
 
 #define VERSION_MAJOR      0
-#define VERSION_MINOR      0
+#define VERSION_MINOR      9
 #define VERSION_REVISION   10
 
 #define PHASE              1
-#define COPYRIGHT_YEAR     2011
+#define COPYRIGHT_YEAR     2012
 #define COPYRIGHT_HOLDER   "ARM Ltd."
diff --git a/headers/versionheader.s b/headers/versionheader.s
index 17a6fde..fcd55f5 100644
--- a/headers/versionheader.s
+++ b/headers/versionheader.s
@@ -23,11 +23,11 @@
 @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
 
         .equ      VERSION_MAJOR,    0
-        .equ      VERSION_MINOR,    0
+        .equ      VERSION_MINOR,    9
         .equ      VERSION_REVISION, 10
 
         .equ      PHASE,            1
-        .equ      COPYRIGHT_YEAR,   2011
+        .equ      COPYRIGHT_YEAR,   2012
 
 COPYRIGHT_HOLDER:
         .asciz                      "ARM Ltd."
diff --git a/inc/NE10.h b/inc/NE10.h
index 99bdee9..9b7bc6a 100644
--- a/inc/NE10.h
+++ b/inc/NE10.h
@@ -391,54 +391,209 @@ extern arm_result_t (*normalize_vec4f)(arm_vec4f_t * dst, arm_vec4f_t * src, uns
 
 
 
+
+/*!
+    Generates a 2D vector from the absolute values of each of the components of an input vector
+    @param[out] dst   Pointer to the destination array
+    @param[in]  src   Pointer to the source array
+    @param[in]  count The number of items in the input array
+ */
 extern arm_result_t (*abs_vec2f)(arm_vec2f_t * dst, arm_vec2f_t * src, unsigned int count);
+/*!
+    Generates a 3D vector from the absolute values of each of the components of an input vector
+    @param[out] dst   Pointer to the destination array
+    @param[in]  src   Pointer to the source array
+    @param[in]  count The number of items in the input array
+ */
 extern arm_result_t (*abs_vec3f)(arm_vec3f_t * dst, arm_vec3f_t * src, unsigned int count);
+/*!
+    Generates a 4D vector from the absolute values of each of the components of an input vector
+    @param[out] dst   Pointer to the destination array
+    @param[in]  src   Pointer to the source array
+    @param[in]  count The number of items in the input array
+ */
 extern arm_result_t (*abs_vec4f)(arm_vec4f_t * dst, arm_vec4f_t * src, unsigned int count);
 
 
 
 // ## SIMD Component-wise Arithmetic on Two Vectors ##
+
+/*!
+    Multiplies the components of a 2D vector with the corresponding components of another
+    @param[out] dst   Pointer to the destination array
+    @param[in]  src1   Pointer to the first source array
+    @param[in]  src2   Pointer to the second source array
+    @param[in]  count The number of items in the input arrays
+ */
 extern arm_result_t (*vmul_vec2f)(arm_vec2f_t * dst, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count);
+/*!
+    Multiplies the components of a 3D vector with the corresponding components of another
+    @param[out] dst   Pointer to the destination array
+    @param[in]  src1   Pointer to the first source array
+    @param[in]  src2   Pointer to the second source array
+    @param[in]  count The number of items in the input arrays
+ */
 extern arm_result_t (*vmul_vec3f)(arm_vec3f_t * dst, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
+/*!
+    Multiplies the components of a 4D vector with the corresponding components of another
+    @param[out] dst   Pointer to the destination array
+    @param[in]  src1   Pointer to the first source array
+    @param[in]  src2   Pointer to the second source array
+    @param[in]  count The number of items in the input arrays
+ */
 extern arm_result_t (*vmul_vec4f)(arm_vec4f_t * dst, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count);
 
 
 
+/*!
+    Divides the components of a 2D vector by the corresponding components of another
+    @param[out] dst   Pointer to the destination array
+    @param[in]  src1   Pointer to the numerators' source array
+    @param[in]  src2   Pointer to the denominators' source array
+    @param[in]  count The number of items in the input arrays
+ */
 extern arm_result_t (*vdiv_vec2f)(arm_vec2f_t * dst, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count);
+/*!
+    Divides the components of a 3D vector by the corresponding components of another
+    @param[out] dst   Pointer to the destination array
+    @param[in]  src1   Pointer to the numerators' source array
+    @param[in]  src2   Pointer to the denominators' source array
+    @param[in]  count The number of items in the input arrays
+ */
 extern arm_result_t (*vdiv_vec3f)(arm_vec3f_t * dst, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
+/*!
+    Divides the components of a 4D vector by the corresponding components of another
+    @param[out] dst   Pointer to the destination array
+    @param[in]  src1   Pointer to the numerators' source array
+    @param[in]  src2   Pointer to the denominators' source array
+    @param[in]  count The number of items in the input arrays
+ */
 extern arm_result_t (*vdiv_vec4f)(arm_vec4f_t * dst, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count);
 
 
 
+/*!
+    Performs a multiply and accumulate operation on the components of a 2D vector with the corresponding components of another
+    @param[in,out] acc   Pointer to the accumulator array (this declaration has no separate dst parameter)
+    @param[in]  src1   Pointer to the first source array
+    @param[in]  src2   Pointer to the second source array
+    @param[in]  count The number of items in the input arrays
+ */
 extern arm_result_t (*vmla_vec2f)(arm_vec2f_t * acc, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count);
+/*!
+    Performs a multiply and accumulate operation on the components of a 3D vector with the corresponding components of another
+    @param[in,out] acc   Pointer to the accumulator array (this declaration has no separate dst parameter)
+    @param[in]  src1   Pointer to the first source array
+    @param[in]  src2   Pointer to the second source array
+    @param[in]  count The number of items in the input arrays
+ */
 extern arm_result_t (*vmla_vec3f)(arm_vec3f_t * acc, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
+/*!
+    Performs a multiply and accumulate operation on the components of a 4D vector with the corresponding components of another
+    @param[in,out] acc   Pointer to the accumulator array (this declaration has no separate dst parameter)
+    @param[in]  src1   Pointer to the first source array
+    @param[in]  src2   Pointer to the second source array
+    @param[in]  count The number of items in the input arrays
+ */
 extern arm_result_t (*vmla_vec4f)(arm_vec4f_t * acc, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count);
 
 
 
 // ## Vector-Vector Algebra ##
+
+/*!
+    Vector addition of two 2D vectors
+    @param[out] dst   Pointer to the destination array
+    @param[in]  src1   Pointer to the first source array
+    @param[in]  src2   Pointer to the second source array
+    @param[in]  count The number of items in the input arrays
+ */
 extern arm_result_t (*add_vec2f)(arm_vec2f_t * dst, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count);
+/*!
+    Vector addition of two 3D vectors
+    @param[out] dst   Pointer to the destination array
+    @param[in]  src1   Pointer to the first source array
+    @param[in]  src2   Pointer to the second source array
+    @param[in]  count The number of items in the input arrays
+ */
 extern arm_result_t (*add_vec3f)(arm_vec3f_t * dst, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
+/*!
+    Vector addition of two 4D vectors
+    @param[out] dst   Pointer to the destination array
+    @param[in]  src1   Pointer to the first source array
+    @param[in]  src2   Pointer to the second source array
+    @param[in]  count The number of items in the input arrays
+ */
 extern arm_result_t (*add_vec4f)(arm_vec4f_t * dst, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count);
 
 
 
+/*!
+    Vector subtraction of two 2D vectors
+    @param[out] dst   Pointer to the destination array
+    @param[in]  src1   Pointer to the first source array
+    @param[in]  src2   Pointer to the second source array
+    @param[in]  count The number of items in the input arrays
+ */
 extern arm_result_t (*sub_vec2f)(arm_vec2f_t * dst, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count);
+/*!
+    Vector subtraction of two 3D vectors
+    @param[out] dst   Pointer to the destination array
+    @param[in]  src1   Pointer to the first source array
+    @param[in]  src2   Pointer to the second source array
+    @param[in]  count The number of items in the input arrays
+ */
 extern arm_result_t (*sub_vec3f)(arm_vec3f_t * dst, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
+/*!
+    Vector subtraction of two 4D vectors
+    @param[out] dst   Pointer to the destination array
+    @param[in]  src1   Pointer to the first source array
+    @param[in]  src2   Pointer to the second source array
+    @param[in]  count The number of items in the input arrays
+ */
 extern arm_result_t (*sub_vec4f)(arm_vec4f_t * dst, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count);
 
 
 
+/*!
+    Dot product of two 2D vectors
+    @param[out] dst   Pointer to the destination array
+    @param[in]  src1   Pointer to the first source array
+    @param[in]  src2   Pointer to the second source array
+    @param[in]  count The number of items in the input arrays
+ */
 extern arm_result_t (*dot_vec2f)(arm_float_t * dst, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count);
+/*!
+    Dot product of two 3D vectors
+    @param[out] dst   Pointer to the destination array
+    @param[in]  src1   Pointer to the first source array
+    @param[in]  src2   Pointer to the second source array
+    @param[in]  count The number of items in the input arrays
+ */
 extern arm_result_t (*dot_vec3f)(arm_float_t * dst, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
+/*!
+    Dot product of two 4D vectors
+    @param[out] dst   Pointer to the destination array
+    @param[in]  src1   Pointer to the first source array
+    @param[in]  src2   Pointer to the second source array
+    @param[in]  count The number of items in the input arrays
+ */
 extern arm_result_t (*dot_vec4f)(arm_float_t * dst, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count);
 
 
 
+/*!
+    Performs a cross product operation on the two input vectors
+    @param[out] dst   Pointer to the destination array
+    @param[in]  src1   Pointer to the first source array
+    @param[in]  src2   Pointer to the second source array
+    @param[in]  count The number of items in the input arrays
+ */
 extern arm_result_t (*cross_vec3f)(arm_vec3f_t * dst, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
 
 
 
+
 // ## Matrix-Constant Arithmetic ##
 
 // arm_mat4x4f_t
diff --git a/inc/NE10_asm.h b/inc/NE10_asm.h
index ae1ef16..2b56762 100644
--- a/inc/NE10_asm.h
+++ b/inc/NE10_asm.h
@@ -119,9 +119,9 @@ extern arm_result_t vdiv_vec4f_asm(arm_vec4f_t * dst, arm_vec4f_t * src1, arm_ve
 
 
 
-extern arm_result_t vmla_vec2f_asm(arm_vec2f_t * acc, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count);
-extern arm_result_t vmla_vec3f_asm(arm_vec3f_t * acc, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
-extern arm_result_t vmla_vec4f_asm(arm_vec4f_t * acc, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count);
+extern arm_result_t vmla_vec2f_asm(arm_vec2f_t * dst, arm_vec2f_t * acc, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count);
+extern arm_result_t vmla_vec3f_asm(arm_vec3f_t * dst, arm_vec3f_t * acc, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
+extern arm_result_t vmla_vec4f_asm(arm_vec4f_t * dst, arm_vec4f_t * acc, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count);
 
 
 
diff --git a/inc/NE10_c.h b/inc/NE10_c.h
index 2c68fa8..7e52e72 100644
--- a/inc/NE10_c.h
+++ b/inc/NE10_c.h
@@ -117,9 +117,9 @@ extern arm_result_t vdiv_vec4f_c(arm_vec4f_t * dst, arm_vec4f_t * src1, arm_vec4
 
 
 
-extern arm_result_t vmla_vec2f_c(arm_vec2f_t * acc, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count);
-extern arm_result_t vmla_vec3f_c(arm_vec3f_t * acc, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
-extern arm_result_t vmla_vec4f_c(arm_vec4f_t * acc, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count);
+extern arm_result_t vmla_vec2f_c(arm_vec2f_t * dst, arm_vec2f_t * acc, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count);
+extern arm_result_t vmla_vec3f_c(arm_vec3f_t * dst, arm_vec3f_t * acc, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
+extern arm_result_t vmla_vec4f_c(arm_vec4f_t * dst, arm_vec4f_t * acc, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count);
 
 
 
diff --git a/inc/NE10_neon.h b/inc/NE10_neon.h
index 411a659..815e356 100644
--- a/inc/NE10_neon.h
+++ b/inc/NE10_neon.h
@@ -119,9 +119,9 @@ extern arm_result_t vdiv_vec4f_neon(arm_vec4f_t * dst, arm_vec4f_t * src1, arm_v
 
 
 
-extern arm_result_t vmla_vec2f_neon(arm_vec2f_t * acc, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count);
-extern arm_result_t vmla_vec3f_neon(arm_vec3f_t * acc, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
-extern arm_result_t vmla_vec4f_neon(arm_vec4f_t * acc, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count);
+extern arm_result_t vmla_vec2f_neon(arm_vec2f_t * dst, arm_vec2f_t * acc, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count);
+extern arm_result_t vmla_vec3f_neon(arm_vec3f_t * dst, arm_vec3f_t * acc, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
+extern arm_result_t vmla_vec4f_neon(arm_vec4f_t * dst, arm_vec4f_t * acc, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count);
 
 
 
-- 
2.34.1