From: Ramin Zaghi Date: Wed, 4 Apr 2012 11:28:48 +0000 (+0000) Subject: New functions: Matrix multiplication routines. X-Git-Tag: v1.0.0~54^2~1 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=67ed64b8cb42caf97c2d92f196ef48ffac38ce9b;p=platform%2Fupstream%2Fne10.git New functions: Matrix multiplication routines. --- diff --git a/NE10_init.c b/NE10_init.c index c6108db..4d3ddeb 100644 --- a/NE10_init.c +++ b/NE10_init.c @@ -125,6 +125,9 @@ arm_result_t NE10_init() submat_2x2f = submat_2x2f_neon; submat_3x3f = submat_3x3f_neon; submat_4x4f = submat_4x4f_neon; + mulmat_2x2f = mulmat_2x2f_neon; + mulmat_3x3f = mulmat_3x3f_neon; + mulmat_4x4f = mulmat_4x4f_neon; } else { @@ -198,6 +201,9 @@ arm_result_t NE10_init() submat_2x2f = submat_2x2f_c; submat_3x3f = submat_3x3f_c; submat_4x4f = submat_4x4f_c; + mulmat_2x2f = mulmat_2x2f_c; + mulmat_3x3f = mulmat_3x3f_c; + mulmat_4x4f = mulmat_4x4f_c; } } @@ -272,4 +278,8 @@ arm_result_t (*addmat_4x4f)(arm_mat4x4f_t * dst, arm_mat4x4f_t * src1, arm_mat4x arm_result_t (*submat_2x2f)(arm_mat2x2f_t * dst, arm_mat2x2f_t * src1, arm_mat2x2f_t * src2, unsigned int count); arm_result_t (*submat_3x3f)(arm_mat3x3f_t * dst, arm_mat3x3f_t * src1, arm_mat3x3f_t * src2, unsigned int count); arm_result_t (*submat_4x4f)(arm_mat4x4f_t * dst, arm_mat4x4f_t * src1, arm_mat4x4f_t * src2, unsigned int count); +arm_result_t (*mulmat_2x2f)(arm_mat2x2f_t * dst, arm_mat2x2f_t * src1, arm_mat2x2f_t * src2, unsigned int count); +arm_result_t (*mulmat_3x3f)(arm_mat3x3f_t * dst, arm_mat3x3f_t * src1, arm_mat3x3f_t * src2, unsigned int count); +arm_result_t (*mulmat_4x4f)(arm_mat4x4f_t * dst, arm_mat4x4f_t * src1, arm_mat4x4f_t * src2, unsigned int count); + diff --git a/headers/NE10_random.h b/headers/NE10_random.h index 9253312..686553d 100644 --- a/headers/NE10_random.h +++ b/headers/NE10_random.h @@ -26,6 +26,7 @@ #include #include #include +#include // Please look at http://en.wikipedia.org/wiki/Linear_congruential_generator // According to 
this page, these values are the ones used in "glibc" @@ -170,8 +171,8 @@ float NE10_float_rng_max() // the same as above functions except the range of values are limited -#define IS_TOO_SMALL(f) ((f<1.0e-6)?1:0) -#define IS_TOO_BIG(f) ((f>1.0e12)?1:0) +#define IS_TOO_SMALL(f) ((fabs(f)<1.0e-6)?1:0) +#define IS_TOO_BIG(f) ((fabs(f)>1.0e12)?1:0) static NE10_float_rng_t __NE10_float_rng_limit; // local array for internal use only @@ -197,5 +198,34 @@ float NE10_float_rng_limit_max() return NE10_float_rng_max_g(NULL); } +// the same as above functions except the range of values are limited and all the values are greater than 1.0e-6 + +#define IS_TOO_SMALL_GT1(f) ((fabs(f)<1.0e-6)?1:0) +#define IS_TOO_BIG_GT1(f) ((fabs(f)>1.0e+3)?1:0) + +static NE10_float_rng_t __NE10_float_rng_limit_gt1; // local array for internal use only + +void NE10_float_rng_limit_gt1_init(uint32_t seed) +{ + NE10_float_rng_init_g( &__NE10_float_rng_limit_gt1 , seed ); +} + +float NE10_float_rng_limit_gt1_next() +{ + float ret = 0.0f; + + do + { + ret = NE10_float_rng_next_g( &__NE10_float_rng_limit_gt1 ); + } while ( IS_TOO_SMALL_GT1(ret) || IS_TOO_BIG_GT1(ret) ); + + return ret; +} + +float NE10_float_rng_limit_gt1_max() +{ + return NE10_float_rng_max_g(NULL); +} + #endif // NE10_RANDOM diff --git a/headers/unit_test_common.h b/headers/unit_test_common.h index 6e13412..de55e41 100644 --- a/headers/unit_test_common.h +++ b/headers/unit_test_common.h @@ -77,7 +77,7 @@ #define ACCEPTABLE_WARNS 12 #define ACCEPTABLE_WARNS_MATRICES 48 -inline void FILL_FLOAT_ARRAY( float *arr, unsigned int count ) +inline void FILL_FLOAT_ARRAY( arm_float_t *arr, unsigned int count ) { unsigned int i = 0; @@ -91,7 +91,7 @@ inline void FILL_FLOAT_ARRAY( float *arr, unsigned int count ) } } -inline void FILL_FLOAT_ARRAY_LIMIT( float *arr, unsigned int count ) +inline void FILL_FLOAT_ARRAY_LIMIT( arm_float_t *arr, unsigned int count ) { unsigned int i = 0; @@ -101,7 +101,21 @@ inline void FILL_FLOAT_ARRAY_LIMIT( float *arr, 
unsigned int count ) for ( i = 0; i < count; i++ ) { - arr[i] = NE10_float_rng_limit_next(); + arr[ i ] = NE10_float_rng_limit_next(); + } +} + +inline void FILL_FLOAT_ARRAY_LIMIT_GT1( arm_float_t *arr, unsigned int count ) +{ + unsigned int i = 0; + + sleep ( 1 ); + + NE10_float_rng_limit_gt1_init( time(NULL) ); + + for ( i = 0; i < count; i++ ) + { + arr[ i ] = NE10_float_rng_limit_gt1_next(); } } diff --git a/headers/unit_test_xmat_operation_x.h b/headers/unit_test_xmat_operation_x.h index 0949b53..9ae0468 100644 --- a/headers/unit_test_xmat_operation_x.h +++ b/headers/unit_test_xmat_operation_x.h @@ -19,6 +19,7 @@ */ #include "./unit_test_common.h" +#include "../inc/NE10_types.h" // This function signature applies the operations with the format "*c_*_*" (e.g. 'add'c_'float'_'neon') typedef arm_result_t (*arm_func_4args_t)(void * dst, void * src1, void * src2, unsigned int count); @@ -64,12 +65,12 @@ arm_result_t test_operation() guarded_src1 = (arm_float_t*) malloc( (2*ARRAY_GUARD_LEN) + fixed_length ); // 16 extra bytes at the begining and 16 extra bytes at the end GUARD_ARRAY( guarded_src1, (2*ARRAY_GUARD_LEN) + fixed_length ); thesrc1 = (arm_float_t*) ( (void*)guarded_src1 + 16); - FILL_FLOAT_ARRAY( thesrc1, ARRLEN * MAX_VEC_COMPONENTS ); // random initialization + FILL_FLOAT_ARRAY_LIMIT_GT1( thesrc1, ARRLEN * MAX_VEC_COMPONENTS ); // random initialization guarded_src2 = (arm_float_t*) malloc( (2*ARRAY_GUARD_LEN) + fixed_length ); // 16 extra bytes at the begining and 16 extra bytes at the end GUARD_ARRAY( guarded_src2, (2*ARRAY_GUARD_LEN) + fixed_length ); thesrc2 = (arm_float_t*) ( (void*)guarded_src2 + 16); - FILL_FLOAT_ARRAY( thesrc2, ARRLEN * MAX_VEC_COMPONENTS ); // random initialization + FILL_FLOAT_ARRAY_LIMIT_GT1( thesrc2, ARRLEN * MAX_VEC_COMPONENTS ); // random initialization for ( i = 0; i + +arm_result_t mulmat_2x2f_c(arm_mat2x2f_t * dst, arm_mat2x2f_t * src1, arm_mat2x2f_t * src2, unsigned int count) +{ + #define A1 src1[ itr ].c1.r1 + #define 
A2 src2[ itr ].c1.r1 + #define B1 src1[ itr ].c1.r2 + #define B2 src2[ itr ].c1.r2 + #define C1 src1[ itr ].c2.r1 + #define C2 src2[ itr ].c2.r1 + #define D1 src1[ itr ].c2.r2 + #define D2 src2[ itr ].c2.r2 + + NE10_X_OPERATION_FLOAT_C + ( + dst[ itr ].c1.r1 = (A1*A2)+(C1*B2); + dst[ itr ].c1.r2 = (B1*A2)+(D1*B2); + + dst[ itr ].c2.r1 = (A1*C2)+(C1*D2); + dst[ itr ].c2.r2 = (B1*C2)+(D1*D2); + ); + + #undef A1 + #undef A2 + #undef B1 + #undef B2 + #undef C1 + #undef C2 + #undef D1 + #undef D2 +} + +arm_result_t mulmat_3x3f_c(arm_mat3x3f_t * dst, arm_mat3x3f_t * src1, arm_mat3x3f_t * src2, unsigned int count) +{ + #define A1 src1[ itr ].c1.r1 + #define A2 src2[ itr ].c1.r1 + #define B1 src1[ itr ].c1.r2 + #define B2 src2[ itr ].c1.r2 + #define C1 src1[ itr ].c1.r3 + #define C2 src2[ itr ].c1.r3 + #define D1 src1[ itr ].c2.r1 + #define D2 src2[ itr ].c2.r1 + #define E1 src1[ itr ].c2.r2 + #define E2 src2[ itr ].c2.r2 + #define F1 src1[ itr ].c2.r3 + #define F2 src2[ itr ].c2.r3 + #define G1 src1[ itr ].c3.r1 + #define G2 src2[ itr ].c3.r1 + #define H1 src1[ itr ].c3.r2 + #define H2 src2[ itr ].c3.r2 + #define I1 src1[ itr ].c3.r3 + #define I2 src2[ itr ].c3.r3 + + NE10_X_OPERATION_FLOAT_C + ( + dst[ itr ].c1.r1 = (A1*A2)+(D1*B2)+(G1*C2); + dst[ itr ].c1.r2 = (B1*A2)+(E1*B2)+(H1*C2); + dst[ itr ].c1.r3 = (C1*A2)+(F1*B2)+(I1*C2); + + dst[ itr ].c2.r1 = (A1*D2)+(D1*E2)+(G1*F2); + dst[ itr ].c2.r2 = (B1*D2)+(E1*E2)+(H1*F2); + dst[ itr ].c2.r3 = (C1*D2)+(F1*E2)+(I1*F2); + + dst[ itr ].c3.r1 = (A1*G2)+(D1*H2)+(G1*I2); + dst[ itr ].c3.r2 = (B1*G2)+(E1*H2)+(H1*I2); + dst[ itr ].c3.r3 = (C1*G2)+(F1*H2)+(I1*I2); + ); + + #undef A1 + #undef A2 + #undef B1 + #undef B2 + #undef C1 + #undef C2 + #undef D1 + #undef D2 + #undef E1 + #undef E2 + #undef F1 + #undef F2 + #undef G1 + #undef G2 + #undef H1 + #undef H2 + #undef I1 + #undef I2 +} + +arm_result_t mulmat_4x4f_c(arm_mat4x4f_t * dst, arm_mat4x4f_t * src1, arm_mat4x4f_t * src2, unsigned int count) +{ + #define A1 src1[ itr 
].c1.r1 + #define A2 src2[ itr ].c1.r1 + #define B1 src1[ itr ].c1.r2 + #define B2 src2[ itr ].c1.r2 + #define C1 src1[ itr ].c1.r3 + #define C2 src2[ itr ].c1.r3 + #define D1 src1[ itr ].c1.r4 + #define D2 src2[ itr ].c1.r4 + + #define E1 src1[ itr ].c2.r1 + #define E2 src2[ itr ].c2.r1 + #define F1 src1[ itr ].c2.r2 + #define F2 src2[ itr ].c2.r2 + #define G1 src1[ itr ].c2.r3 + #define G2 src2[ itr ].c2.r3 + #define H1 src1[ itr ].c2.r4 + #define H2 src2[ itr ].c2.r4 + + #define I1 src1[ itr ].c3.r1 + #define I2 src2[ itr ].c3.r1 + #define J1 src1[ itr ].c3.r2 + #define J2 src2[ itr ].c3.r2 + #define K1 src1[ itr ].c3.r3 + #define K2 src2[ itr ].c3.r3 + #define L1 src1[ itr ].c3.r4 + #define L2 src2[ itr ].c3.r4 + + #define M1 src1[ itr ].c4.r1 + #define M2 src2[ itr ].c4.r1 + #define N1 src1[ itr ].c4.r2 + #define N2 src2[ itr ].c4.r2 + #define O1 src1[ itr ].c4.r3 + #define O2 src2[ itr ].c4.r3 + #define P1 src1[ itr ].c4.r4 + #define P2 src2[ itr ].c4.r4 + + NE10_X_OPERATION_FLOAT_C + ( + dst[ itr ].c1.r1 = (A1*A2)+(E1*B2)+(I1*C2)+(M1*D2); + dst[ itr ].c1.r2 = (B1*A2)+(F1*B2)+(J1*C2)+(N1*D2); + dst[ itr ].c1.r3 = (C1*A2)+(G1*B2)+(K1*C2)+(O1*D2); + dst[ itr ].c1.r4 = (D1*A2)+(H1*B2)+(L1*C2)+(P1*D2); + + dst[ itr ].c2.r1 = (A1*E2)+(E1*F2)+(I1*G2)+(M1*H2); + dst[ itr ].c2.r2 = (B1*E2)+(F1*F2)+(J1*G2)+(N1*H2); + dst[ itr ].c2.r3 = (C1*E2)+(G1*F2)+(K1*G2)+(O1*H2); + dst[ itr ].c2.r4 = (D1*E2)+(H1*F2)+(L1*G2)+(P1*H2); + + dst[ itr ].c3.r1 = (A1*I2)+(E1*J2)+(I1*K2)+(M1*L2); + dst[ itr ].c3.r2 = (B1*I2)+(F1*J2)+(J1*K2)+(N1*L2); + dst[ itr ].c3.r3 = (C1*I2)+(G1*J2)+(K1*K2)+(O1*L2); + dst[ itr ].c3.r4 = (D1*I2)+(H1*J2)+(L1*K2)+(P1*L2); + + dst[ itr ].c4.r1 = (A1*M2)+(E1*N2)+(I1*O2)+(M1*P2); + dst[ itr ].c4.r2 = (B1*M2)+(F1*N2)+(J1*O2)+(N1*P2); + dst[ itr ].c4.r3 = (C1*M2)+(G1*N2)+(K1*O2)+(O1*P2); + dst[ itr ].c4.r4 = (D1*M2)+(H1*N2)+(L1*O2)+(P1*P2); + ); + + #undef A1 + #undef A2 + #undef B1 + #undef B2 + #undef C1 + #undef C2 + #undef D1 + #undef D2 + #undef E1 + 
#undef E2 + #undef F1 + #undef F2 + #undef G1 + #undef G2 + #undef H1 + #undef H2 + #undef I1 + #undef I2 + #undef J1 + #undef J2 + #undef K1 + #undef K2 + #undef L1 + #undef L2 + #undef M1 + #undef M2 + #undef N1 + #undef N2 + #undef O1 + #undef O2 + #undef P1 + #undef P2 +} diff --git a/source/NE10_mulmat.neon.s b/source/NE10_mulmat.neon.s new file mode 100644 index 0000000..0711990 --- /dev/null +++ b/source/NE10_mulmat.neon.s @@ -0,0 +1,518 @@ +@ COPYRIGHT NOTICE TBD NOT FOR RELEASE + + .text + .syntax unified + +.include "headers/NE10header.s" + + + + + .balign 4 + .global mulmat_2x2f_neon + .thumb + .thumb_func + +mulmat_2x2f_neon: + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ + @ arm_result_t mulmat_2x2f(arm_mat2x2f_t * dst, + @ arm_mat2x2f_t * src1, + @ arm_mat2x2f_t * src2, + @ unsigned int count) + @ + @ r0: *dst & current dst entry's address + @ r1: *src1 & current src1 entry's address + @ r2: *src2 & current src2 entry's address + @ r3: int count & the number of items in the input array that can be + @ processed in chunks of 4 vectors + @ + @ r4: the number of items that are left to be processed at the end of + @ the input array + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + + push {r4} + and r4, r3, #3 @ r4 = count % 4; + sub r3, r3, r4 @ count = count - r3; This is what's left to be processed after this loop + + cmp r3, #0 + beq .L_check_mat2x2 + + @ We load four 2x2 matrices at a time, multiply them to + @ get two resulting 2x2 matrices, store them in the destination + @ and then move on to the next four matrices. + + @ load the 1st set of values + vld4.32 { d0, d1, d2, d3 }, [r1]! + vld4.32 { d4, d5, d6, d7 }, [r2]! 
+ subs r3, r3, #4 @ 2 for this set, and 2 for the 2nd set + + @ calculate values for the 1st set + vmul.f32 d16, d0, d4 + vmul.f32 d17, d1, d4 + vmul.f32 d18, d0, d6 + vmul.f32 d19, d1, d6 + + vmla.f32 d16, d2, d5 + vmla.f32 d17, d3, d5 + vmla.f32 d18, d2, d7 + vmla.f32 d19, d3, d7 + + + @ load the 2nd set of values + vld4.32 { d0, d1, d2, d3 }, [r1]! + vld4.32 { d4, d5, d6, d7 }, [r2]! + + ble .L_mainloopend_mat2x2 + +.L_mainloop_mat2x2: + @ store the result for the 1st/next (e.g. 3rd) set + vst4.32 { d16, d17, d18, d19}, [r0]! + + @ calculate values for the 2nd/next (e.g. 3rd) set + vmul.f32 d16, d0, d4 + vmul.f32 d17, d1, d4 + vmul.f32 d18, d0, d6 + vmul.f32 d19, d1, d6 + + vmla.f32 d16, d2, d5 + vmla.f32 d17, d3, d5 + vmla.f32 d18, d2, d7 + vmla.f32 d19, d3, d7 + + @ load the next (e.g. 3rd) set of values + subs r3, r3, #2 + vld4.32 { d0, d1, d2, d3 }, [r1]! + vld4.32 { d4, d5, d6, d7 }, [r2]! + + + bgt .L_mainloop_mat2x2 @ loop if r2 is > r3, if we have at least another 4 vectors (8 floats) to process + +.L_mainloopend_mat2x2: + @ the last iteration for this call + @ store the result for the set of values before the last one (e.g 2nd set) + vst4.32 { d16, d17, d18, d19}, [r0]! + + @ calculate values for the last (e.g. 3rd) set + vmul.f32 d16, d0, d4 + vmul.f32 d17, d1, d4 + vmul.f32 d18, d0, d6 + vmul.f32 d19, d1, d6 + + vmla.f32 d16, d2, d5 + vmla.f32 d17, d3, d5 + vmla.f32 d18, d2, d7 + vmla.f32 d19, d3, d7 + + @ store the result for the last (e.g. 3rd) set + vst4.32 { d16, d17, d18, d19}, [r0]! + + +.L_check_mat2x2: + @ check if anything left to process at the end of the input array + cmp r4, #0 + ble .L_return_mat2x2 + +.L_secondloop_mat2x2: + @ process the last few items left in the input array + vld4.32 { d0[0], d1[0], d2[0], d3[0] }, [r1]! + vld4.32 { d4[0], d5[0], d6[0], d7[0] }, [r2]! 
+ + subs r4, r4, #1 + + @ calculate values + vmul.f32 d16, d0, d4 + vmul.f32 d17, d1, d4 + vmul.f32 d18, d0, d6 + vmul.f32 d19, d1, d6 + + vmla.f32 d16, d2, d5 + vmla.f32 d17, d3, d5 + vmla.f32 d18, d2, d7 + vmla.f32 d19, d3, d7 + + vst4.32 { d16[0], d17[0], d18[0], d19[0] }, [r0]! + + bgt .L_secondloop_mat2x2 + +.L_return_mat2x2: + @ return + pop {r4} + mov r0, #0 + bx lr + + + + + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ A macro to load four 3x3 matrices, two from the first source which + @ according to the function signatures is src1 (r1) and + @ another two from the second source which is src2 (r2) + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + .macro LOAD_3x3MATS + + # load two 3x3 matrices from src1 + vld1.32 { q0-q1 }, [r1]! + vld1.32 { d8[0] }, [r1]! + vld1.32 { q2-q3 }, [r1]! + vld1.32 { d8[1] }, [r1]! + + # load two 3x3 matrices from src2 + vld1.32 { q8-q9 }, [r2]! + vld1.32 { d9[0] }, [r2]! + vld1.32 { q10-q11 }, [r2]! + vld1.32 { d9[1] }, [r2]! + + + # rearrange them both + vtrn.32 q0, q2 + vtrn.32 q1, q3 + + vtrn.32 q8, q10 + vtrn.32 q9, q11 + + .endm + + + + + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ This macro multiplies two pairs of 3x3 matrices that were + @ loaded using the above LOAD_3x3MATS macro in registers q0-q11. 
+ @ The two resulting matrices are returned in q12, q13, q14, q15, & d9 + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + .macro MULTIPLY_3x3MATS + + @ a = d0 & d16 + @ b = d4 & d20 + @ c = d1 & d17 + @ d = d5 & d21 + @ e = d2 & d18 + @ f = d6 & d22 + @ g = d3 & d19 + @ h = d7 & d23 + @ i = d8 & d9 + + vmul.f32 d24, d0, d16 + vmul.f32 d28, d4, d16 + vmul.f32 d25, d1, d16 + vmul.f32 d29, d0, d21 + vmul.f32 d26, d4, d21 + vmul.f32 d30, d1, d21 + vmul.f32 d27, d0, d19 + vmul.f32 d31, d4, d19 + vmul.f32 d10, d1, d19 + + vmla.f32 d24, d5, d20 + vmla.f32 d28, d2, d20 + vmla.f32 d25, d6, d20 + vmla.f32 d29, d5, d18 + vmla.f32 d26, d2, d18 + vmla.f32 d30, d6, d18 + vmla.f32 d27, d5, d23 + vmla.f32 d31, d2, d23 + vmla.f32 d10, d6, d23 + + vmla.f32 d24, d3, d17 + vmla.f32 d28, d7, d17 + vmla.f32 d25, d8, d17 + vmla.f32 d29, d3, d22 + vmla.f32 d26, d7, d22 + vmla.f32 d30, d8, d22 + vmla.f32 d27, d3, d9 + vmla.f32 d31, d7, d9 + vmla.f32 d10, d8, d9 + + .endm + + + + + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ A macro to store the two resulting 3x3 matrices from + @ the above MULTIPLY_3x3MATS macro (q12-q15, & d9 are stored) + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + .macro STORE_3x3MATS + + # rearrange them both + vtrn.32 q12, q14 + vtrn.32 q13, q15 + + # store two 3x3 matrices to dst + vst1.32 { q12-q13 }, [r0]! + vst1.32 { d10[0] }, [r0]! + vst1.32 { q14-q15 }, [r0]! + vst1.32 { d10[1] }, [r0]! 
+ + .endm + + + + + .align 2 + .global mulmat_3x3f_neon + .thumb + .thumb_func +mulmat_3x3f_neon: + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ + @ arm_result_t mulmat_3x3f(arm_mat3x3f_t * dst, + @ arm_mat3x3f_t * src1, + @ arm_mat3x3f_t * src2, + @ unsigned int count) + @ + @ r0: *dst & current dst entry's address + @ r1: *src1 & current src1 entry's address + @ r2: *src2 & current src2 entry's address + @ r3: int count & the number of items in the input array that can be + @ processed in chunks of 4 vectors + @ + @ r4: the number of items that are left to be processed at the end of + @ the input array + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + + push { r4 } + vpush { d8, d9, d10 } + and r4, r3, #3 @ r3 = count % 4; + sub r3, r3, r4 @ count = count - r3; This is what's left to be processed after this loop + + cmp r3, #0 + beq .L_check_mat3x3 + + @ load the 1st set of values + LOAD_3x3MATS + subs r3, r3, #4 @ 2 for this set, and 2 for the 2nd set + + @ calculate values for the 1st set + MULTIPLY_3x3MATS + + @ load the 2nd set of values + LOAD_3x3MATS + ble .L_mainloopend_mat3x3 + +.L_mainloop_mat3x3: + @ store the result for the 1st/next (e.g. 3rd) set + STORE_3x3MATS + + @ calculate values for the 2nd/next (e.g. 3rd) set + MULTIPLY_3x3MATS + + @ load the next (e.g. 3rd) set of values + LOAD_3x3MATS + + subs r3, r3, #2 + + bgt .L_mainloop_mat3x3 @ loop if r2 is > r3, if we have at least another 4 vectors (12 floats) to process + +.L_mainloopend_mat3x3: + @ the last iteration for this call + @ store the result for the set of values before the last one (e.g 2nd set) + STORE_3x3MATS + + @ calculate values for the last (e.g. 3rd) set + MULTIPLY_3x3MATS + + @ store the result for the last (e.g. 
3rd) set + STORE_3x3MATS + +.L_check_mat3x3: + @ check if anything left to process at the end of the input array + cmp r4, #0 + ble .L_return_mat3x3 + +.L_secondloop_mat3x3: + @ process the last few items left in the input array + @ load the next (e.g. 3rd) set of values + vld1.32 { q0-q1 }, [r1]! + vld1.32 { d8[0] }, [r1]! + vld1.32 { q8-q9 }, [r2]! + vld1.32 { d9[0] }, [r2]! + + vtrn.32 q0, q2 + vtrn.32 q1, q3 + + vtrn.32 q8, q10 + vtrn.32 q9, q11 + + subs r4, r4, #1 + + @ calculate values for the last (e.g. 3rd) set + MULTIPLY_3x3MATS + + @ store the result for the last (e.g. 3rd) set + vtrn.32 q12, q14 + vtrn.32 q13, q15 + + vst1.32 { q12-q13 }, [r0]! + vst1.32 { d10[0] }, [r0]! + + + bgt .L_secondloop_mat3x3 + +.L_return_mat3x3: + @ return + vpop { d8, d9, d10 } + pop { r4 } + mov r0, #0 + bx lr + + + + + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ A macro to load a pair of 4x4 matrices from src1 (r1) and + @ src2 (r2) into registers q0-q3 & q8-q11. + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + .macro LOAD_4x4MATS + + # load a 4x4 matrix from src1 + vld1.32 { q8-q9 }, [r1]! + vld1.32 {q10-q11}, [r1]! + + # load a 4x4 matrix from src2 + vld1.32 {q0-q1}, [r2]! + vld1.32 {q2-q3}, [r2]! + .endm + + + + + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ This macro multiplies the two 4x4 matrices loaded in the + @ above LOAD_4x4MATS macro and returns the resulting 4x4 + @ matrix in q12-q15. 
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + .macro MULTIPLY_4x4MATS + + vmul.f32 q12, q8, d0[0] + vmul.f32 q13, q8, d2[0] + vmul.f32 q14, q8, d4[0] + vmul.f32 q15, q8, d6[0] + + vmla.f32 q12, q9, d0[1] + vmla.f32 q13, q9, d2[1] + vmla.f32 q14, q9, d4[1] + vmla.f32 q15, q9, d6[1] + + + vmla.f32 q12, q10, d1[0] + vmla.f32 q13, q10, d3[0] + vmla.f32 q14, q10, d5[0] + vmla.f32 q15, q10, d7[0] + + vmla.f32 q12, q11, d1[1] + vmla.f32 q13, q11, d3[1] + vmla.f32 q14, q11, d5[1] + vmla.f32 q15, q11, d7[1] + + .endm + + + + + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ This macro stores the resulting 4x4 matrix which is + @ returned by the above MULTIPLY_4x4MATS macro from registers + @ q12-q15 into the dst (r0). + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + .macro STORE_4x4MATS + + # store two 3x3 matrices to dst + vst1.32 { q12-q13 }, [r0]! + vst1.32 { q14-q15 }, [r0]! + + .endm + + + + + .align 2 + .global mulmat_4x4f_neon + .thumb + .thumb_func +mulmat_4x4f_neon: + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ + @ arm_result_t mulmat_4x4f(arm_mat4x4f_t * dst, + @ arm_mat4x4f_t * src1, + @ arm_mat4x4f_t * src2, + @ unsigned int count) + @ + @ r0: *dst & current dst entry's address + @ r1: *src1 & current src1 entry's address + @ r2: *src2 & current src2 entry's address + @ r3: int count & the number of items in the input array that can be + @ processed in chunks of 4 vectors + @ + @ r4: the number of items that are left to be processed at the end of + @ the input array + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + + push {r4} + and r4, r3, #3 @ r4 = count % 4; + sub r3, r3, r4 @ count = count - r4; This is what's left to be processed after this loop + + cmp r3, #0 + beq .L_check_mat4x4 + + @ load the 1st set of values + LOAD_4x4MATS + + subs r3, r3, #2 + + @ calculate values for the 1st set + MULTIPLY_4x4MATS + + @ load the 2nd 
set of values + LOAD_4x4MATS + + ble .L_mainloopend_mat4x4 + +.L_mainloop_mat4x4: + @ store the result for the 1st/next (e.g. 3rd) set + STORE_4x4MATS + + @ calculate values for the 2nd/next (e.g. 3rd) set + MULTIPLY_4x4MATS + + @ load the next (e.g. 3rd) set of values + subs r3, r3, #1 + LOAD_4x4MATS + + bgt .L_mainloop_mat4x4 @ loop if r2 is > r3, if we have at least another 4 vectors (16 floats) to process + +.L_mainloopend_mat4x4: + @ the last iteration for this call + @ store the result for the set of values before the last one (e.g 2nd set) + STORE_4x4MATS + + @ calculate values for the last (e.g. 3rd) set + MULTIPLY_4x4MATS + + @ store the result for the last (e.g. 3rd) set + STORE_4x4MATS + +.L_check_mat4x4: + @ check if anything left to process at the end of the input array + cmp r4, #0 + ble .L_return_mat4x4 + +.L_secondloop_mat4x4: + @ process the last few items left in the input array + LOAD_4x4MATS + + subs r4, r4, #1 + + @ calculate values + MULTIPLY_4x4MATS + + @ store the results + STORE_4x4MATS + + bgt .L_secondloop_mat4x4 + +.L_return_mat4x4: + @ return + pop {r4} + mov r0, #0 + bx lr diff --git a/source/NE10_mulmat_test.c b/source/NE10_mulmat_test.c new file mode 100644 index 0000000..a62e5ec --- /dev/null +++ b/source/NE10_mulmat_test.c @@ -0,0 +1,64 @@ +/* + * Copyright 2011-12 ARM Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NE10 Library : source/NE10_mulmat_test.c + */ + +// TODO: Apply some sort of self-verifying method to the test system. + +//Make sure the following are defined before including "unit_test.h" + +// length of the data arrays +#define ARRLEN TEST_ARRLEN_MATRICES +// number of the operations in a given unit +#define OP_COUNT 3 +// number of the different implementations of each of the functions (C, ASM, NEON, ...) +#define IMPL_COUNT 3 + + +#include "../headers/unit_test_xmat_operation_x.h" + +extern arm_result_t mulmat_2x2f_c (arm_mat2x2f_t * dst, arm_mat2x2f_t * src1, arm_mat2x2f_t * src2, unsigned int count); +extern arm_result_t mulmat_2x2f_neon(arm_mat2x2f_t * dst, arm_mat2x2f_t * src1, arm_mat2x2f_t * src2, unsigned int count); + +extern arm_result_t mulmat_3x3f_c (arm_mat3x3f_t * dst, arm_mat3x3f_t * src1, arm_mat3x3f_t * src2, unsigned int count); +extern arm_result_t mulmat_3x3f_neon(arm_mat3x3f_t * dst, arm_mat3x3f_t * src1, arm_mat3x3f_t * src2, unsigned int count); + +extern arm_result_t mulmat_4x4f_c (arm_mat4x4f_t * dst, arm_mat4x4f_t * src1, arm_mat4x4f_t * src2, unsigned int count); +extern arm_result_t mulmat_4x4f_neon(arm_mat4x4f_t * dst, arm_mat4x4f_t * src1, arm_mat4x4f_t * src2, unsigned int count); + +void init_ftbl() +{ + // manually initialize the global function table with + // those functions that do have an actual implementation. 
+ ftbl[ 0] = (arm_func_4args_t) mulmat_2x2f_c; + ftbl[ 1] = (arm_func_4args_t) mulmat_2x2f_c; // using the c version in place of the assembly version + ftbl[ 2] = (arm_func_4args_t) mulmat_2x2f_neon; + + ftbl[ 3] = (arm_func_4args_t) mulmat_3x3f_c; + ftbl[ 4] = (arm_func_4args_t) mulmat_3x3f_c; // using the c version in place of the assembly version + ftbl[ 5] = (arm_func_4args_t) mulmat_3x3f_neon; + + ftbl[ 6] = (arm_func_4args_t) mulmat_4x4f_c; + ftbl[ 7] = (arm_func_4args_t) mulmat_4x4f_c; // using the c version in place of the assembly version + ftbl[ 8] = (arm_func_4args_t) mulmat_4x4f_neon; +} + +arm_result_t main( int argc, char **argv ) +{ + return run_test( argc, argv ); // defined in "unit_test.h" +}