LDFLAGS+=-L. -L/usr/local/lib -L/client/lib -L/lib/arm-linux-gnueabi
LDFLAGS+=-lm
-ALLFILES = NE10_addc.c_r.o NE10_subc.c_r.o NE10_rsbc.c_r.o NE10_mulc.c_r.o NE10_divc.c_r.o NE10_mlac.c_r.o NE10_setc.c_r.o NE10_add.c_r.o NE10_sub.c_r.o NE10_mul.c_r.o NE10_div.c_r.o NE10_mla.c_r.o NE10_abs.c_r.o NE10_len.c_r.o NE10_normalize.c_r.o NE10_addc.neon_r.o NE10_subc.neon_r.o NE10_rsbc.neon_r.o NE10_mulc.neon_r.o NE10_divc.neon_r.o NE10_mlac.neon_r.o NE10_setc.neon_r.o NE10_add.neon_r.o NE10_sub.neon_r.o NE10_mul.neon_r.o NE10_div.neon_r.o NE10_mla.neon_r.o NE10_abs.neon_r.o NE10_len.neon_r.o NE10_normalize.neon_r.o NE10_dot.c_r.o NE10_dot.neon_r.o NE10_cross.c_r.o NE10_cross.neon_r.o NE10_addmat.c_r.o NE10_addmat.neon_r.o NE10_submat.c_r.o NE10_submat.neon_r.o NE10_mulmat.c_r.o NE10_mulmat.neon_r.o
+ALLFILES = NE10_addc.c_r.o NE10_subc.c_r.o NE10_rsbc.c_r.o NE10_mulc.c_r.o NE10_divc.c_r.o NE10_mlac.c_r.o NE10_setc.c_r.o NE10_add.c_r.o NE10_sub.c_r.o NE10_mul.c_r.o NE10_div.c_r.o NE10_mla.c_r.o NE10_abs.c_r.o NE10_len.c_r.o NE10_normalize.c_r.o NE10_addc.neon_r.o NE10_subc.neon_r.o NE10_rsbc.neon_r.o NE10_mulc.neon_r.o NE10_divc.neon_r.o NE10_mlac.neon_r.o NE10_setc.neon_r.o NE10_add.neon_r.o NE10_sub.neon_r.o NE10_mul.neon_r.o NE10_div.neon_r.o NE10_mla.neon_r.o NE10_abs.neon_r.o NE10_len.neon_r.o NE10_normalize.neon_r.o NE10_dot.c_r.o NE10_dot.neon_r.o NE10_cross.c_r.o NE10_cross.neon_r.o NE10_addmat.c_r.o NE10_addmat.neon_r.o NE10_submat.c_r.o NE10_submat.neon_r.o NE10_mulmat.c_r.o NE10_mulmat.neon_r.o NE10_mulcmatvec.c_r.o NE10_mulcmatvec.neon_r.o
#TARGET_ARCH = stdc
mulmat_2x2f = mulmat_2x2f_neon;
mulmat_3x3f = mulmat_3x3f_neon;
mulmat_4x4f = mulmat_4x4f_neon;
+
+ mulcmatvec_cm2x2f_v2f = mulcmatvec_cm2x2f_v2f_neon;
+ mulcmatvec_cm3x3f_v3f = mulcmatvec_cm3x3f_v3f_neon;
+ mulcmatvec_cm4x4f_v4f = mulcmatvec_cm4x4f_v4f_neon;
}
else
{
mulmat_2x2f = mulmat_2x2f_c;
mulmat_3x3f = mulmat_3x3f_c;
mulmat_4x4f = mulmat_4x4f_c;
+
+ mulcmatvec_cm2x2f_v2f = mulcmatvec_cm2x2f_v2f_c;
+ mulcmatvec_cm3x3f_v3f = mulcmatvec_cm3x3f_v3f_c;
+ mulcmatvec_cm4x4f_v4f = mulcmatvec_cm4x4f_v4f_c;
}
}
arm_result_t (*mulmat_3x3f)(arm_mat3x3f_t * dst, arm_mat3x3f_t * src1, arm_mat3x3f_t * src2, unsigned int count);
arm_result_t (*mulmat_4x4f)(arm_mat4x4f_t * dst, arm_mat4x4f_t * src1, arm_mat4x4f_t * src2, unsigned int count);
-
+arm_result_t (*mulcmatvec_cm4x4f_v4f)(arm_vec4f_t * dst, const arm_mat4x4f_t * cst, arm_vec4f_t * src, unsigned int count);
+arm_result_t (*mulcmatvec_cm3x3f_v3f)(arm_vec3f_t * dst, const arm_mat3x3f_t * cst, arm_vec3f_t * src, unsigned int count);
+arm_result_t (*mulcmatvec_cm2x2f_v2f)(arm_vec2f_t * dst, const arm_mat2x2f_t * cst, arm_vec2f_t * src, unsigned int count);
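+
+/*
+ * Illustrative sketch (not part of the library): once the initialisation
+ * code above has bound these pointers to either the _c or the _neon
+ * implementations, a whole array is transformed by one constant matrix
+ * in a single call; the buffer sizes here are hypothetical.
+ *
+ *    arm_mat4x4f_t m = ...;            // the constant matrix
+ *    arm_vec4f_t in[64], out[64];      // hypothetical source/destination arrays
+ *    mulcmatvec_cm4x4f_v4f( out, &m, in, 64 );
+ */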
#define NE10_LEN_OPERATION_X_C NE10_ABS_OPERATION_X_C
+
+#define NE10_CMATVEC_OPERATION_X_C NE10_ABS_OPERATION_X_C
+
#define NE10_LEN_OPERATION_VEC2F_NEON(loopCode1, loopCode2) { \
NE10_DstSrc_OPERATION_VEC2F_NEON( \
NE10_CHECKPOINTER_DstSrcCst_OPERATION, \
--- /dev/null
+/*
+ * Copyright 2011-12 ARM Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NE10 Library : headers/unit_test_mulcmatvec_operation_x.h
+ */
+
+#include "./unit_test_common.h"
+#include "../inc/NE10_types.h"
+
+// This function signature covers all of the operations tested in this unit; each takes a destination, a constant, a source, and an element count (e.g. mulcmatvec_cm2x2f_v2f_neon)
+typedef arm_result_t (*arm_func_4args_t)(void * dst, void * cst, void * src, unsigned int count);
+arm_func_4args_t ftbl[ OP_COUNT * IMPL_COUNT ];
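+
+// Note on the table layout: judging by how init_ftbl() fills the table and
+// how FTBL_IDX is used below (FTBL_IDX itself comes from a shared header),
+// entries are grouped by operation with one slot per implementation, i.e.
+// something equivalent to:
+//    #define FTBL_IDX(op, impl)   ( ((op)-1) * IMPL_COUNT + ((impl)-1) )
+// so ftbl[ FTBL_IDX(1,1) ] is the C version of the first operation.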
+
+
+// This function is implemented in the unit test source file;
+// it initialises the function table defined above.
+extern void init_ftbl();
+
+
+unsigned int i = 0; // loop iterator
+unsigned int max = 0; // number of iterations in each function
+int opcode = -1; // the operation which will be tested (a single unit can have any number of operations/functions)
+int impl = -1; // selects which particular implementation of the chosen operation must run
+int mute = 0; // 0 == print output; 1 == do not print anything;
+
+struct timeval before, after, lapsed, dummy;
+double dt_test_overhead = 0.0;
+double dt_test_sample = 0.0;
+double elapsed = 0.0;
+struct timezone zone;
+
+// a matrix has at most 16 float components (the 4x4 case)
+#define MAX_VEC_COMPONENTS 16
+
+arm_float_t * guarded_cst = NULL;
+arm_float_t * guarded_src = NULL;
+arm_float_t * guarded_dst[IMPL_COUNT];
+
+arm_float_t * thecst = NULL;
+arm_float_t * thesrc = NULL;
+arm_float_t * thedst[IMPL_COUNT]; // outputs from the different implementations are stored in separate arrays for verification
+int done_init = 0;
+
+arm_result_t test_operation()
+{
+ const unsigned int fixed_length = ARRLEN * sizeof(arm_float_t) * MAX_VEC_COMPONENTS;
+
+ // initialize on the first call only
+ if ( 0 == done_init )
+ {
+ guarded_cst = (arm_float_t*) malloc( (2*ARRAY_GUARD_LEN) + fixed_length ); // ARRAY_GUARD_LEN extra bytes before and after the data
+ GUARD_ARRAY( guarded_cst, (2*ARRAY_GUARD_LEN) + fixed_length );
+ thecst = (arm_float_t*) ( (void*)guarded_cst + ARRAY_GUARD_LEN );
+ FILL_FLOAT_ARRAY_LIMIT_GT1( thecst, ARRLEN * MAX_VEC_COMPONENTS ); // random initialization
+
+ guarded_src = (arm_float_t*) malloc( (2*ARRAY_GUARD_LEN) + fixed_length ); // ARRAY_GUARD_LEN extra bytes before and after the data
+ GUARD_ARRAY( guarded_src, (2*ARRAY_GUARD_LEN) + fixed_length );
+ thesrc = (arm_float_t*) ( (void*)guarded_src + ARRAY_GUARD_LEN );
+ FILL_FLOAT_ARRAY_LIMIT_GT1( thesrc, ARRLEN * MAX_VEC_COMPONENTS ); // random initialization
+
+ for ( i = 0; i<IMPL_COUNT; i++ )
+ {
+ guarded_dst[i] = (arm_float_t*) malloc( (2*ARRAY_GUARD_LEN) + fixed_length ); // ARRAY_GUARD_LEN extra bytes before and after the data
+ GUARD_ARRAY( guarded_dst[i], (2*ARRAY_GUARD_LEN) + fixed_length );
+ thedst[i] = (arm_float_t*) ( (void*)guarded_dst[i] + ARRAY_GUARD_LEN );
+ }
+
+ done_init = 1;
+ }
+
+ // sample run
+ MEASURE( dt_test_sample,
+ ftbl [ FTBL_IDX(opcode, impl) ] ( thedst[ impl-1 ] , thecst, thesrc, ARRLEN );
+ );
+ if ( ! CHECK_ARRAY_GUARD(guarded_dst[ impl -1 ], (2*ARRAY_GUARD_LEN) + fixed_length) ||
+ ! CHECK_ARRAY_GUARD(guarded_cst, (2*ARRAY_GUARD_LEN) + fixed_length) ||
+ ! CHECK_ARRAY_GUARD(guarded_src, (2*ARRAY_GUARD_LEN) + fixed_length) )
+ {
+ fprintf ( stderr, "\t FATAL ERROR: Operation number %d implementation [%d] has failed the guard test. \n", opcode, impl );
+ exit( NE10_ERR );
+ }
+
+ // make sure that passing zero as the length does not cause a segmentation fault
+ ftbl [ FTBL_IDX(opcode, impl) ] ( thedst[ impl-1 ] , thecst, thesrc, 0 );
+
+ MEASURE( elapsed,
+ for ( i = 0; i < max; i++ )
+ {
+ // call the function
+ ftbl [ FTBL_IDX(opcode, impl) ] ( thedst[ impl -1 ] , thecst, thesrc, ARRLEN );
+ }
+ );
+
+ if ( !mute )
+ printf( "%02.8f;%013.3f\n", elapsed - dt_test_overhead,
+ ( 1.0f * max * ARRLEN / ( elapsed - dt_test_overhead )) );
+
+ return NE10_OK;
+}
+
+arm_result_t run_test( int argc, char **argv )
+{
+ if ( argc == 2 ) // requesting the number of available operations/routines in this unit
+ {
+ opcode = atoi ( argv[1] ); // get the command being requested, 0 = return the number of functions in this unit
+ if ( opcode == 0 ) return OP_COUNT;
+ exit( NE10_ERR );
+ } else if ( argc == 4 ) // requesting a particular implementation of one of the operations
+ {
+ opcode = atoi ( argv[1] );
+ if ( opcode <= 0 ) exit( NE10_ERR );
+ impl = atoi ( argv[2] );
+ if ( impl < 0 ) exit( NE10_ERR ); // impl == 0 means run all and compare the results to verify they produce identical outputs
+ max = atoi ( argv[3] );
+ if ( max <= 0 ) exit( NE10_ERR );
+ } else exit( NE10_ERR );
+
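+ // e.g. (with a hypothetical binary name) "./NE10_mulcmatvec_test 0" reports
+ // the number of operations in this unit, "./NE10_mulcmatvec_test 2 3 1000"
+ // benchmarks opcode 2 (cm3x3f_v3f) using implementation 3 (NEON) for 1000
+ // iterations, and "./NE10_mulcmatvec_test 2 0 1000" runs all implementations
+ // of opcode 2 and cross-checks their outputs.
+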
+ // initialize the table with NULL
+ memset( ftbl, 0, sizeof(ftbl));
+
+ // manually initialize the functions which have actual implementations
+ init_ftbl(); // this function is implemented in the unit test source file
+
+ if ( opcode <= 0 || opcode > OP_COUNT
+ || impl < 0 || impl > IMPL_COUNT )
+ {
+ fprintf ( stderr, "\t WARNING: Operation number %d and/or implementaion number %d are not acceptable values. \n", opcode, impl );
+ exit( NE10_ERR );
+ }
+
+ if ( impl == 0 ) // run all implementations and verify
+ {
+ // first, make sure all of the implementations do exist
+ for ( i = FTBL_IDX(opcode, 1); i <= FTBL_IDX(opcode, IMPL_COUNT); i++ )
+ {
+ if ( NULL == ftbl[i] )
+ {
+ fprintf ( stderr, "\t WARNING: One or more implementations of operation number %d were not found. \n", opcode );
+ exit( NE10_ERR );
+ }
+ }
+
+ // try all the implementations here...
+ mute = 1; // do not print anything
+
+ // opcode remains the same but we iterate through different implementations here..
+ for ( impl= 1; impl <= IMPL_COUNT; impl ++ )
+ {
+ test_operation();
+ }
+
+ // now verify
+ arm_float_t * _output = NULL; // [ IMPL_COUNT * MAX_VEC_COMPONENTS ]; // one for each implementation, c, asm, neon...
+ int warns = 0;
+ int item_width = opcode+1; // the opcode selects the output vector width (1=vec2, 2=vec3, 3=vec4)
+ _output = (arm_float_t*) malloc( IMPL_COUNT * sizeof(arm_float_t) * item_width );
+ for ( i = 0; i < ARRLEN; i++ )
+ {
+ for ( impl= 1; impl <= IMPL_COUNT; impl ++ )
+ {
+ memcpy ( &_output[ (impl-1) * item_width ], &thedst[ impl-1 ][ i * item_width ], sizeof(arm_float_t) * item_width );
+ }
+
+ int pos = 0;
+ for ( impl = 2; impl <= IMPL_COUNT; impl ++ ) // compare the output from the 2nd, 3rd, 4th, etc. to the first one so start at 2
+ {
+ for ( pos = 0; pos < item_width; pos++ ) // compare corresponding components of the items
+ {
+ assert ( _output[ ((1-1)*item_width)+pos ] == _output[ ((1-1)*item_width)+pos ] ); // NaN != NaN, so this catches not-a-number
+ assert ( _output[ ((impl-1)*item_width)+pos ] == _output[ ((impl-1)*item_width)+pos ] ); // NaN != NaN, so this catches not-a-number
+
+ if ( ! EQUALS_FLOAT( _output[ ((1-1)*item_width)+pos ] , _output[ ((impl-1)*item_width)+pos ], ERROR_MARGIN_SMALL ) )
+ { fprintf( stderr, "\t\t WARNING: In opcode [%d], implementation [1] != implementation [%d] on item [%d -> %d]\n",
+ opcode, impl, i, pos+1 );
+ warns++; }
+
+ // stop once too many warnings have accumulated
+ if ( warns >= ACCEPTABLE_WARNS_MATRICES )
+ { fprintf ( stderr, "\t WARNING: One or more mismatching values were found. \n" );
+ exit( NE10_ERR );
+ }
+ }
+ }
+ }
+ free( _output ); _output = (arm_float_t *) NULL;
+
+ if ( warns < ACCEPTABLE_WARNS_MATRICES )
+ {
+ return NE10_OK;
+ }
+
+ fprintf ( stderr, "\t WARNING: One or more mismatching values were found. \n" );
+ exit( NE10_ERR );
+ }
+ else // run a particular implementation
+ {
+ if ( !mute ) printf( "opcode=%d;impl=%d;%d;%d;", opcode, impl, ARRLEN, max );
+
+ // get the overhead of an empty measurement loop
+ MEASURE( dt_test_overhead,
+ for ( i = 0 ; i < max; i++ )
+ {
+ }
+ );
+
+ test_operation();
+ }
+
+
+
+ // free any allocated memory...
+ free( guarded_cst );
+ free( guarded_src );
+ for ( i = 0; i<IMPL_COUNT; i++ )
+ {
+ free( guarded_dst[i] );
+ }
+
+ return NE10_OK;
+}
// ## Operations on Matrices ##
extern arm_result_t (*invert_mat4x4f)(arm_mat4x4f_t * dst, arm_mat4x4f_t * src, unsigned int count);
-extern arm_result_t (*det_mat4x4f)(arm_mat4x4f_t * dst, arm_mat4x4f_t * src, unsigned int count);
+extern arm_result_t (*det_mat4x4f)(arm_float_t * dst, arm_mat4x4f_t * src, unsigned int count);
extern arm_result_t (*trans_mat4x4f)(arm_mat4x4f_t * dst, arm_mat4x4f_t * src, unsigned int count);
extern arm_result_t (*identity_mat4x4f)(arm_mat4x4f_t * dst, unsigned int count);
extern arm_result_t (*invert_mat3x3f)(arm_mat3x3f_t * dst, arm_mat3x3f_t * src, unsigned int count);
-extern arm_result_t (*det_mat3x3f)(arm_mat3x3f_t * dst, arm_mat3x3f_t * src, unsigned int count);
+extern arm_result_t (*det_mat3x3f)(arm_float_t * dst, arm_mat3x3f_t * src, unsigned int count);
extern arm_result_t (*trans_mat3x3f)(arm_mat3x3f_t * dst, arm_mat3x3f_t * src, unsigned int count);
extern arm_result_t (*identity_mat3x3f)(arm_mat3x3f_t * dst, unsigned int count);
extern arm_result_t (*invert_mat2x2f)(arm_mat2x2f_t * dst, arm_mat2x2f_t * src, unsigned int count);
-extern arm_result_t (*det_mat2x2f)(arm_mat2x2f_t * dst, arm_mat2x2f_t * src, unsigned int count);
+extern arm_result_t (*det_mat2x2f)(arm_float_t * dst, arm_mat2x2f_t * src, unsigned int count);
extern arm_result_t (*trans_mat2x2f)(arm_mat2x2f_t * dst, arm_mat2x2f_t * src, unsigned int count);
extern arm_result_t (*identity_mat2x2f)(arm_mat2x2f_t * dst, unsigned int count);
// ## Matrix-Vector Algebra ##
-extern arm_result_t (*trans_mat4x4f_vec4f)(arm_vec4f_t * dst, arm_mat4x4f_t * mat, arm_vec4f_t * vec, unsigned int count);
-extern arm_result_t (*trans_mat3x3f_vec4f)(arm_vec4f_t * dst, arm_mat3x3f_t * mat, arm_vec4f_t * vec, unsigned int count);
-extern arm_result_t (*trans_mat2x2f_vec4f)(arm_vec4f_t * dst, arm_mat2x2f_t * mat, arm_vec4f_t * vec, unsigned int count);
-
+extern arm_result_t (*mulcmatvec_cm4x4f_v4f)(arm_vec4f_t * dst, const arm_mat4x4f_t * cst, arm_vec4f_t * src, unsigned int count);
+extern arm_result_t (*mulcmatvec_cm3x3f_v3f)(arm_vec3f_t * dst, const arm_mat3x3f_t * cst, arm_vec3f_t * src, unsigned int count);
+extern arm_result_t (*mulcmatvec_cm2x2f_v2f)(arm_vec2f_t * dst, const arm_mat2x2f_t * cst, arm_vec2f_t * src, unsigned int count);
// ## Matrix-Matrix Algebra ##
// ## Matrix-Vector Algebra ##
-extern arm_result_t trans_mat4x4f_vec4f_asm(arm_vec4f_t * dst, arm_mat4x4f_t * mat, arm_vec4f_t * vec, unsigned int count);
-extern arm_result_t trans_mat3x3f_vec4f_asm(arm_vec4f_t * dst, arm_mat3x3f_t * mat, arm_vec4f_t * vec, unsigned int count);
-extern arm_result_t trans_mat2x2f_vec4f_asm(arm_vec4f_t * dst, arm_mat2x2f_t * mat, arm_vec4f_t * vec, unsigned int count);
+extern arm_result_t mulcmatvec_cm4x4f_v4f_asm(arm_vec4f_t * dst, const arm_mat4x4f_t * cst, arm_vec4f_t * src, unsigned int count);
+extern arm_result_t mulcmatvec_cm3x3f_v3f_asm(arm_vec3f_t * dst, const arm_mat3x3f_t * cst, arm_vec3f_t * src, unsigned int count);
+extern arm_result_t mulcmatvec_cm2x2f_v2f_asm(arm_vec2f_t * dst, const arm_mat2x2f_t * cst, arm_vec2f_t * src, unsigned int count);
+
// ## Matrix-Vector Algebra ##
-extern arm_result_t trans_mat4x4f_vec4f_c(arm_vec4f_t * dst, arm_mat4x4f_t * mat, arm_vec4f_t * vec, unsigned int count);
-extern arm_result_t trans_mat3x3f_vec4f_c(arm_vec4f_t * dst, arm_mat3x3f_t * mat, arm_vec4f_t * vec, unsigned int count);
-extern arm_result_t trans_mat2x2f_vec4f_c(arm_vec4f_t * dst, arm_mat2x2f_t * mat, arm_vec4f_t * vec, unsigned int count);
-
+extern arm_result_t mulcmatvec_cm4x4f_v4f_c(arm_vec4f_t * dst, const arm_mat4x4f_t * cst, arm_vec4f_t * src, unsigned int count);
+extern arm_result_t mulcmatvec_cm3x3f_v3f_c(arm_vec3f_t * dst, const arm_mat3x3f_t * cst, arm_vec3f_t * src, unsigned int count);
+extern arm_result_t mulcmatvec_cm2x2f_v2f_c(arm_vec2f_t * dst, const arm_mat2x2f_t * cst, arm_vec2f_t * src, unsigned int count);
// ## Matrix-Matrix Algebra ##
// ## Matrix-Vector Algebra ##
-extern arm_result_t trans_mat4x4f_vec4f_neon(arm_vec4f_t * dst, arm_mat4x4f_t * mat, arm_vec4f_t * vec, unsigned int count);
-extern arm_result_t trans_mat3x3f_vec4f_neon(arm_vec4f_t * dst, arm_mat3x3f_t * mat, arm_vec4f_t * vec, unsigned int count);
-extern arm_result_t trans_mat2x2f_vec4f_neon(arm_vec4f_t * dst, arm_mat2x2f_t * mat, arm_vec4f_t * vec, unsigned int count);
+extern arm_result_t mulcmatvec_cm4x4f_v4f_neon(arm_vec4f_t * dst, const arm_mat4x4f_t * cst, arm_vec4f_t * src, unsigned int count);
+extern arm_result_t mulcmatvec_cm3x3f_v3f_neon(arm_vec3f_t * dst, const arm_mat3x3f_t * cst, arm_vec3f_t * src, unsigned int count);
+extern arm_result_t mulcmatvec_cm2x2f_v2f_neon(arm_vec2f_t * dst, const arm_mat2x2f_t * cst, arm_vec2f_t * src, unsigned int count);
+
addmat
submat
mulmat
+mulcmatvec
--- /dev/null
+@
+@ Copyright 2011-12 ARM Limited
+@
+@ Licensed under the Apache License, Version 2.0 (the "License");
+@ you may not use this file except in compliance with the License.
+@ You may obtain a copy of the License at
+@
+@ http://www.apache.org/licenses/LICENSE-2.0
+@
+@ Unless required by applicable law or agreed to in writing, software
+@ distributed under the License is distributed on an "AS IS" BASIS,
+@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ See the License for the specific language governing permissions and
+@ limitations under the License.
+@
+
+@
+@ NE10 Library : source/NE10_mulcmatvec.asm.s
+@
--- /dev/null
+/*
+ * Copyright 2011-12 ARM Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NE10 Library : source/NE10_mulcmatvec.c
+ */
+
+#include "NE10.h"
+#include "../headers/macros.h"
+
+#include <assert.h>
+
+arm_result_t mulcmatvec_cm2x2f_v2f_c (arm_vec2f_t * dst, const arm_mat2x2f_t * cst, arm_vec2f_t * src, unsigned int count)
+{
+ #define A1 cst->c1.r1
+ #define B1 cst->c1.r2
+ #define C1 cst->c2.r1
+ #define D1 cst->c2.r2
+
+ NE10_CMATVEC_OPERATION_X_C
+ (
+ dst[ itr ].x = A1 * src[ itr ].x + C1 * src[ itr ].y;
+ dst[ itr ].y = B1 * src[ itr ].x + D1 * src[ itr ].y;
+ );
+
+ #undef A1
+ #undef B1
+ #undef C1
+ #undef D1
+}
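+
+/*
+ * Worked example for the 2x2 case (column-major storage, as the macros
+ * above indicate): with c1 = (1,2) and c2 = (3,4), i.e. the matrix
+ * [ 1 3 ; 2 4 ], and src[0] = (5,6), the result is
+ *    dst[0].x = 1*5 + 3*6 = 23
+ *    dst[0].y = 2*5 + 4*6 = 34
+ */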
+
+arm_result_t mulcmatvec_cm3x3f_v3f_c (arm_vec3f_t * dst, const arm_mat3x3f_t * cst, arm_vec3f_t * src, unsigned int count)
+{
+ #define A1 cst->c1.r1
+ #define B1 cst->c1.r2
+ #define C1 cst->c1.r3
+ #define D1 cst->c2.r1
+ #define E1 cst->c2.r2
+ #define F1 cst->c2.r3
+ #define G1 cst->c3.r1
+ #define H1 cst->c3.r2
+ #define I1 cst->c3.r3
+
+ NE10_CMATVEC_OPERATION_X_C
+ (
+ dst[ itr ].x = A1 * src[ itr ].x + D1 * src[ itr ].y + G1 * src[ itr ].z;
+ dst[ itr ].y = B1 * src[ itr ].x + E1 * src[ itr ].y + H1 * src[ itr ].z;
+ dst[ itr ].z = C1 * src[ itr ].x + F1 * src[ itr ].y + I1 * src[ itr ].z;
+ );
+
+ #undef A1
+ #undef B1
+ #undef C1
+ #undef D1
+ #undef E1
+ #undef F1
+ #undef G1
+ #undef H1
+ #undef I1
+}
+
+arm_result_t mulcmatvec_cm4x4f_v4f_c (arm_vec4f_t * dst, const arm_mat4x4f_t * cst, arm_vec4f_t * src, unsigned int count)
+{
+ #define A1 cst->c1.r1
+ #define B1 cst->c1.r2
+ #define C1 cst->c1.r3
+ #define D1 cst->c1.r4
+ #define E1 cst->c2.r1
+ #define F1 cst->c2.r2
+ #define G1 cst->c2.r3
+ #define H1 cst->c2.r4
+ #define I1 cst->c3.r1
+ #define J1 cst->c3.r2
+ #define K1 cst->c3.r3
+ #define L1 cst->c3.r4
+ #define M1 cst->c4.r1
+ #define N1 cst->c4.r2
+ #define O1 cst->c4.r3
+ #define P1 cst->c4.r4
+
+ NE10_CMATVEC_OPERATION_X_C
+ (
+ dst[ itr ].x = A1 * src[ itr ].x + E1 * src[ itr ].y + I1 * src[ itr ].z + M1 * src[ itr ].w;
+ dst[ itr ].y = B1 * src[ itr ].x + F1 * src[ itr ].y + J1 * src[ itr ].z + N1 * src[ itr ].w;
+ dst[ itr ].z = C1 * src[ itr ].x + G1 * src[ itr ].y + K1 * src[ itr ].z + O1 * src[ itr ].w;
+ dst[ itr ].w = D1 * src[ itr ].x + H1 * src[ itr ].y + L1 * src[ itr ].z + P1 * src[ itr ].w;
+ );
+
+ #undef A1
+ #undef B1
+ #undef C1
+ #undef D1
+ #undef E1
+ #undef F1
+ #undef G1
+ #undef H1
+ #undef I1
+ #undef J1
+ #undef K1
+ #undef L1
+ #undef M1
+ #undef N1
+ #undef O1
+ #undef P1
+}
--- /dev/null
+@
+@ Copyright 2011-12 ARM Limited
+@
+@ Licensed under the Apache License, Version 2.0 (the "License");
+@ you may not use this file except in compliance with the License.
+@ You may obtain a copy of the License at
+@
+@ http://www.apache.org/licenses/LICENSE-2.0
+@
+@ Unless required by applicable law or agreed to in writing, software
+@ distributed under the License is distributed on an "AS IS" BASIS,
+@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ See the License for the specific language governing permissions and
+@ limitations under the License.
+@
+
+@
+@ NE10 Library : source/NE10_mulcmatvec.neon.s
+@
+
+
+
+
+ .text
+ .syntax unified
+
+.include "headers/NE10header.s"
+
+
+
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ This macro multiplies a single 2x2 matrix by eight vec2's
+ @ The elements of the vectors are loaded into registers q8-q11
+ @ by the caller (mulcmatvec_cm2x2f_v2f_neon) in the following
+ @ order:
+ @
+ @ d16=(x1,x3) d18=(y1,y3) d20=(x2,x4) d22=(y2,y4);
+ @ d17=(x5,x7) d19=(y5,y7) d21=(x6,x8) d23=(y6,y8);
+ @
+ @ This macro multiplies these eight vectors by the 2x2 matrix
+ @ which is stored in registers d0[0],d1[0],d2[0], and d3[0].
+ @ The resulting eight vectors are returned in q12-q15
+ @ in the same order as shown above.
+ @
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ .macro MUL_MAT2x2_VEC2
+ vmul.f32 q12, q8 , d0[0] @ a*x1,x3
+ vmul.f32 q14, q10, d0[0] @ x2,x4
+ vmul.f32 q8 , q8 , d1[0] @ b*x1,x3
+ vmul.f32 q10, q10, d1[0] @ x2,x4
+ vmul.f32 q13, q9 , d2[0] @ c*y1,y3
+ vmul.f32 q15, q11, d2[0] @ y2,y4
+ vmul.f32 q9 , q9 , d3[0] @ d*y1,y3
+ vmul.f32 q11, q11, d3[0] @ y2,y4
+
+ vadd.f32 q14, q14, q15 @ 3) res24.x = a*x2,x4 + c*y2,y4 @ These results need to be stored in the order noted
+ vadd.f32 q12, q12, q13 @ 1) res13.x = a*x1,x3 + c*y1,y3
+ vadd.f32 q15, q10, q11 @ 4) res24.y = b*x2,x4 + d*y2,y4
+ vadd.f32 q13, q8 , q9 @ 2) res13.y = b*x1,x3 + d*y1,y3
+ .endm
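+
+ @ Sanity trace of lane 0 (vector V1): q12 lane 0 ends up holding
+ @ a*x1 + c*y1 and q13 lane 0 holds b*x1 + d*y1, matching the
+ @ column-major reference dst.x = A1*x + C1*y, dst.y = B1*x + D1*y
+ @ in the C implementation (NE10_mulcmatvec.c).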
+
+
+
+
+ .balign 4
+ .global mulcmatvec_cm2x2f_v2f_neon
+ .thumb
+ .thumb_func
+
+mulcmatvec_cm2x2f_v2f_neon:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @
+ @ arm_result_t mulcmatvec_cm2x2f_v2f ( arm_vec2f_t * dst,
+ @ const arm_mat2x2f_t * cst,
+ @ arm_vec2f_t * src,
+ @ unsigned int count)
+ @
+ @ r0: *dst & current dst entry's address
+ @ (this register is updated and moved to the next entry
+ @ after every store operation)
+ @ r1: *cst, memory pointer to where the constant matrix is kept
+ @ r2: *src & current src entry's address
+ @ r3: int count & the number of items in the input array
+ @
+ @ r4: the number of items that are left to be processed at the
+ @ end of the input array
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+ push {r4}
+ and r4, r3, #7 @ r4 = count % 8; the main loop below consumes eight vec2's at a time
+ sub r3, r3, r4 @ count = count - r4; this is what is left to be processed after the main loop
+
+ cmp r3, #0
+ beq .L_check_mat2x2
+
+ @ First we load the constant 2x2 matrix, then each time we load
+ @ eight vectors of 2-floats, multiply each vector with the matrix,
+ @ finally store the resulting vectors in the destination memory
+ @ address, and move on to the next eight vectors.
+
+ @ load the constant matrix
+ @ d0 = m11(a) d2 = m12(c)
+ @ d1 = m21(b) d3 = m22(d)
+ vld4.32 { d0[0], d1[0], d2[0], d3[0] }, [r1]
+
+
+ @ load the 1st set of values
+ @ if {V1, V2, ..., V8} are eight vec2's in memory
+ @ then after the load operations the eight vectors
+ @ are stored in registers q8-q11 like so:
+ @
+ @ d16=(x1,x3) d18=(y1,y3) d20=(x2,x4) d22=(y2,y4);
+ @ d17=(x5,x7) d19=(y5,y7) d21=(x6,x8) d23=(y6,y8);
+
+ vld4.32 { d16, d18, d20, d22 }, [r2]!
+ vld4.32 { d17, d19, d21, d23 }, [r2]!
+
+ subs r3, r3, #16 @ 8 for this set, and 8 for the 2nd set
+
+ @ calculate values for the 1st set
+ MUL_MAT2x2_VEC2
+
+ @ load the 2nd set of values
+ vld4.32 { d16, d18, d20, d22 }, [r2]!
+ vld4.32 { d17, d19, d21, d23 }, [r2]!
+
+ ble .L_mainloopend_mat2x2
+
+.L_mainloop_mat2x2:
+ @ store the result for the 1st/next (e.g. 3rd) set
+ vst4.32 { d24, d26, d28, d30 }, [r0]!
+ vst4.32 { d25, d27, d29, d31 }, [r0]!
+
+ @ calculate values for the 2nd/next (e.g. 3rd) set
+ MUL_MAT2x2_VEC2
+
+ @ load the next (e.g. 3rd) set of values
+ subs r3, r3, #8
+ vld4.32 { d16, d18, d20, d22 }, [r2]!
+ vld4.32 { d17, d19, d21, d23 }, [r2]!
+
+
+ bgt .L_mainloop_mat2x2 @ loop if r3 > 0, i.e. there are at least another 8 vec2's (16 floats) to process
+
+.L_mainloopend_mat2x2:
+ @ the last iteration for this call
+ @ store the result for the set of values before the last one (e.g. the 2nd set)
+ vst4.32 { d24, d26, d28, d30 }, [r0]!
+ vst4.32 { d25, d27, d29, d31 }, [r0]!
+
+
+ @ calculate values for the last (e.g. 3rd) set
+ MUL_MAT2x2_VEC2
+
+ @ store the result for the last (e.g. 3rd) set
+ vst4.32 { d24, d26, d28, d30 }, [r0]!
+ vst4.32 { d25, d27, d29, d31 }, [r0]!
+
+
+.L_check_mat2x2:
+ @ check if anything left to process at the end of the input array
+ cmp r4, #0
+ ble .L_return_mat2x2
+
+.L_secondloop_mat2x2:
+ @ process the last few items left in the input array
+ vld2.32 { d16[0], d18[0] }, [r2]!
+
+ subs r4, r4, #1
+
+ @ calculate values
+ MUL_MAT2x2_VEC2
+
+ @ store the results
+ vst2.32 { d24[0], d26[0] }, [r0]!
+
+ bgt .L_secondloop_mat2x2
+
+.L_return_mat2x2:
+ @ return
+ pop {r4}
+ mov r0, #0
+ bx lr
+
+
+
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ A macro to load four vec3's into registers q8-q10
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ .macro LOAD_FOUR_VEC3
+ vld3.32 { d16, d18, d20 }, [r2]!
+ vld3.32 { d17, d19, d21 }, [r2]!
+ .endm
+
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ This macro multiplies the constant 3x3 matrix loaded into
+ @ registers d0-d5 by four vec3's that the above macro LOAD_FOUR_VEC3
+ @ loads. The results are returned in registers q11, q12, and q13
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ .macro MUL_MAT3x3_VEC3
+
+ vmul.f32 q11, q8 , d0[0]
+ vmla.f32 q11, q9 , d0[1]
+ vmla.f32 q11, q10, d1[0]
+
+ vmul.f32 q12, q8 , d2[0]
+ vmla.f32 q12, q9 , d2[1]
+ vmla.f32 q12, q10, d3[0]
+
+ vmul.f32 q13, q8 , d4[0]
+ vmla.f32 q13, q9 , d4[1]
+ vmla.f32 q13, q10, d5[0]
+
+ .endm
+
+
+
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ A macro to store the resulting vec3's that were returned in
+ @ registers q11 to q13 in the above macro MUL_MAT3x3_VEC3.
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ .macro STORE_FOUR_VEC3
+
+ vst3.32 { d22, d24, d26 }, [r0]!
+ vst3.32 { d23, d25, d27 }, [r0]!
+
+ .endm
+
+
+
+
+ .align 2
+ .global mulcmatvec_cm3x3f_v3f_neon
+ .thumb
+ .thumb_func
+
+mulcmatvec_cm3x3f_v3f_neon:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @
+ @ arm_result_t mulcmatvec_cm3x3f_v3f ( arm_vec3f_t * dst,
+ @ const arm_mat3x3f_t * cst,
+ @ arm_vec3f_t * src,
+ @ unsigned int count)
+ @
+ @ r0: *dst & current dst entry's address
+ @ (this register is updated and moved to the next entry
+ @ after every store operation)
+ @ r1: *cst, memory pointer to where the constant matrix is kept
+ @ r2: *src & current src entry's address
+ @ r3: int count & the number of items in the input array
+ @
+ @ r4: the number of items that are left to be processed at the
+ @ end of the input array
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+ push { r4 }
+ and r4, r3, #3 @ r4 = count % 4;
+ sub r3, r3, r4 @ count = count - r4; this is what is left to be processed after the main loop
+
+ cmp r3, #0
+ beq .L_check_mat3x3
+
+ @ First we load the constant 3x3 matrix, then each time we load
+ @ four vectors of 3-floats, multiply each vector with the matrix,
+ @ finally store the resulting vectors in the destination memory
+ @ address, and move on to the next four vectors.
+
+ @ load the constant matrix into q0-q2
+ vld3.32 { d0 , d2 , d4 }, [r1]!
+ vld3.32 { d1[0], d3[0], d5[0] }, [r1]
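+
+ @ After these two loads the column-major matrix has been transposed
+ @ into rows: q0 = (m11, m12, m13, -), q1 = (m21, m22, m23, -),
+ @ q2 = (m31, m32, m33, -), which is why MUL_MAT3x3_VEC3 above can
+ @ take the three coefficients of dst.x from d0[0], d0[1] and d1[0].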
+
+
+ @ load the 1st set of values
+ LOAD_FOUR_VEC3
+ subs r3, r3, #8 @ 4 for this set, and 4 for the 2nd set
+
+ @ calculate values for the 1st set
+ MUL_MAT3x3_VEC3
+
+ @ load the 2nd set of values
+ LOAD_FOUR_VEC3
+ ble .L_mainloopend_mat3x3
+
+.L_mainloop_mat3x3:
+ @ store the result for the 1st/next (e.g. 3rd) set
+ STORE_FOUR_VEC3
+
+ @ calculate values for the 2nd/next (e.g. 3rd) set
+ MUL_MAT3x3_VEC3
+
+ @ load the next (e.g. 3rd) set of values
+ LOAD_FOUR_VEC3
+
+ subs r3, r3, #4
+
+ bgt .L_mainloop_mat3x3 @ loop if r3 > 0, i.e. there are at least another 4 vec3's (12 floats) to process
+
+.L_mainloopend_mat3x3:
+ @ the last iteration for this call
+ @ store the result for the set of values before the last one (e.g. the 2nd set)
+ STORE_FOUR_VEC3
+
+ @ calculate values for the last (e.g. 3rd) set
+ MUL_MAT3x3_VEC3
+
+ @ store the result for the last (e.g. 3rd) set
+ STORE_FOUR_VEC3
+
+.L_check_mat3x3:
+ @ check if anything left to process at the end of the input array
+ cmp r4, #0
+ ble .L_return_mat3x3
+
+.L_secondloop_mat3x3:
+ @ process the last few items left in the input array
+ vld3.32 { d16[0], d18[0], d20[0] }, [r2]!
+
+ subs r4, r4, #1
+
+ MUL_MAT3x3_VEC3
+
+ vst3.32 { d22[0], d24[0], d26[0] }, [r0]!
+
+ bgt .L_secondloop_mat3x3
+
+.L_return_mat3x3:
+ @ return
+ pop { r4 }
+ mov r0, #0
+ bx lr
+
+
+
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ A macro to load four vec4's into registers q8-q11.
+ @ This macro uses r2 (the third parameter of
+ @ mulcmatvec_cm4x4f_v4f_neon) as the address register.
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ .macro LOAD_FOUR_VEC4
+ vld4.32 { d16, d18, d20, d22 }, [r2]!
+ vld4.32 { d17, d19, d21, d23 }, [r2]!
+ .endm
+
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ This macro multiplies the constant 4x4 matrix that is loaded
+ @ in mulcmatvec_cm4x4f_v4f_neon by four vec4's that are loaded in
+ @ the above macro LOAD_FOUR_VEC4.
+ @ The resulting four vectors are returned in registers q12 to q15.
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ .macro MUL_MAT4x4_VEC4
+
+ vmul.f32 q12, q8 , d0[0]
+ vmla.f32 q12, q9 , d0[1]
+ vmla.f32 q12, q10, d1[0]
+ vmla.f32 q12, q11, d1[1]
+
+ vmul.f32 q13, q8 , d2[0]
+ vmla.f32 q13, q9 , d2[1]
+ vmla.f32 q13, q10, d3[0]
+ vmla.f32 q13, q11, d3[1]
+
+ vmul.f32 q14, q8 , d4[0]
+ vmla.f32 q14, q9 , d4[1]
+ vmla.f32 q14, q10, d5[0]
+ vmla.f32 q14, q11, d5[1]
+
+ vmul.f32 q15, q8 , d6[0]
+ vmla.f32 q15, q9 , d6[1]
+ vmla.f32 q15, q10, d7[0]
+ vmla.f32 q15, q11, d7[1]
+
+ .endm
+
+
+
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ This macro stores the results from the above macro MUL_MAT4x4_VEC4
+ @ from registers q12-q15 into the destination memory (r0), which is
+ @ the first parameter of mulcmatvec_cm4x4f_v4f_neon().
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ .macro STORE_FOUR_VEC4
+
+ vst4.32 { d24, d26, d28, d30 }, [r0]!
+ vst4.32 { d25, d27, d29, d31 }, [r0]!
+
+ .endm
+
+
+
+
+ .align 2
+ .global mulcmatvec_cm4x4f_v4f_neon
+ .thumb
+ .thumb_func
+
+mulcmatvec_cm4x4f_v4f_neon:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @
+ @ arm_result_t mulcmatvec_cm4x4f_v4f ( arm_vec4f_t * dst,
+ @ const arm_mat4x4f_t * cst,
+ @ arm_vec4f_t * src,
+ @ unsigned int count)
+ @
+ @ r0: *dst & current dst entry's address
+ @ (this register is updated and moved to the next entry
+ @ after every store operation)
+ @ r1: *cst, pointer to memory where the constant matrix is kept
+ @ r2: *src & current src entry's address
+ @ r3: int count & the number of items in the input array
+ @
+ @ r4: the number of items that are left to be processed at the
+ @ end of the input array
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+ push {r4}
+ and r4, r3, #3 @ r4 = count % 4;
+ sub r3, r3, r4 @ count = count - r4; This is what's left to be processed after this loop
+
+ cmp r3, #0
+ beq .L_check_mat4x4
+
+ @ First we load the constant 4x4 matrix, then each time we load
+ @ four vectors of 4-floats, multiply each vector with the matrix,
+ @ finally store the resulting vectors in the destination memory
+ @ address, and move on to the next four vectors.
+
+ @ load the constant matrix into q0-q3
+ vld4.32 { d0, d2, d4, d6 }, [r1]!
+ vld4.32 { d1, d3, d5, d7 }, [r1]
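+
+ @ After these two loads the column-major matrix has been transposed
+ @ into rows: q0 = (m11, m12, m13, m14), q1 = row 2, q2 = row 3 and
+ @ q3 = row 4, so each dst component computed in MUL_MAT4x4_VEC4 is a
+ @ row-times-vector dot product built from d0[0], d0[1], d1[0], d1[1], etc.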
+
+ @ load the 1st set of values
+ LOAD_FOUR_VEC4
+
+ subs r3, r3, #8
+
+ @ calculate values for the 1st set
+ MUL_MAT4x4_VEC4
+
+ @ load the 2nd set of values
+ LOAD_FOUR_VEC4
+
+ ble .L_mainloopend_mat4x4
+
+.L_mainloop_mat4x4:
+ @ store the result for the 1st/next (e.g. 3rd) set
+ STORE_FOUR_VEC4
+
+ @ calculate values for the 2nd/next (e.g. 3rd) set
+ MUL_MAT4x4_VEC4
+
+
+ @ load the next (e.g. 3rd) set of values
+ subs r3, r3, #4
+ LOAD_FOUR_VEC4
+
+
+ bgt .L_mainloop_mat4x4 @ loop if r3 > 0, i.e. there are at least another 4 vec4's (16 floats) to process
+
+.L_mainloopend_mat4x4:
+ @ the last iteration for this call
+ @ store the result for the set of values before the last one (e.g. the 2nd set)
+ STORE_FOUR_VEC4
+
+
+ @ calculate values for the last (e.g. 3rd) set
+ MUL_MAT4x4_VEC4
+
+
+ @ store the result for the last (e.g. 3rd) set
+ STORE_FOUR_VEC4
+
+
+.L_check_mat4x4:
+ @ check if anything left to process at the end of the input array
+ cmp r4, #0
+ ble .L_return_mat4x4
+
+.L_secondloop_mat4x4:
+ @ process the last few items left in the input array
+ vld4.32 { d16[0], d18[0], d20[0], d22[0] }, [r2]!
+
+
+ subs r4, r4, #1
+
+ @ calculate values
+ MUL_MAT4x4_VEC4
+
+
+ @ store the results
+ vst4.32 { d24[0], d26[0], d28[0], d30[0] }, [r0]!
+
+
+ bgt .L_secondloop_mat4x4
+
+.L_return_mat4x4:
+ @ return
+ pop {r4}
+ mov r0, #0
+ bx lr
--- /dev/null
+/*
+ * Copyright 2011-12 ARM Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NE10 Library : source/NE10_mulcmatvec_test.c
+ */
+
+// TODO: Apply some sort of self-verifying method to the test system.
+
+// Make sure the following are defined before including the unit test header below
+
+// length of the data arrays
+#define ARRLEN TEST_ARRLEN_MATRICES
+// number of the operations in a given unit
+#define OP_COUNT 3
+// number of the different implementations of each of the functions (C, ASM, NEON, ...)
+#define IMPL_COUNT 3
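+
+// With OP_COUNT == 3 and IMPL_COUNT == 3, the opcodes map to
+// 1 = cm2x2f_v2f, 2 = cm3x3f_v3f and 3 = cm4x4f_v4f, and within each
+// opcode the implementations are 1 = C, 2 = assembly (stubbed with the
+// C version below) and 3 = NEON; see init_ftbl() below.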
+
+
+#include "../headers/unit_test_mulcmatvec_operation_x.h"
+
+extern arm_result_t mulcmatvec_cm4x4f_v4f_c (arm_vec4f_t * dst, const arm_mat4x4f_t * cst, arm_vec4f_t * src, unsigned int count);
+extern arm_result_t mulcmatvec_cm4x4f_v4f_neon (arm_vec4f_t * dst, const arm_mat4x4f_t * cst, arm_vec4f_t * src, unsigned int count);
+
+extern arm_result_t mulcmatvec_cm3x3f_v3f_c (arm_vec3f_t * dst, const arm_mat3x3f_t * cst, arm_vec3f_t * src, unsigned int count);
+extern arm_result_t mulcmatvec_cm3x3f_v3f_neon (arm_vec3f_t * dst, const arm_mat3x3f_t * cst, arm_vec3f_t * src, unsigned int count);
+
+extern arm_result_t mulcmatvec_cm2x2f_v2f_c (arm_vec2f_t * dst, const arm_mat2x2f_t * cst, arm_vec2f_t * src, unsigned int count);
+extern arm_result_t mulcmatvec_cm2x2f_v2f_neon (arm_vec2f_t * dst, const arm_mat2x2f_t * cst, arm_vec2f_t * src, unsigned int count);
+
+void init_ftbl()
+{
+ // manually initialize the global function table with
+ // those functions that do have an actual implementation.
+ ftbl[ 0] = (arm_func_4args_t) mulcmatvec_cm2x2f_v2f_c;
+ ftbl[ 1] = (arm_func_4args_t) mulcmatvec_cm2x2f_v2f_c; // using the c version in place of the assembly version
+ ftbl[ 2] = (arm_func_4args_t) mulcmatvec_cm2x2f_v2f_neon;
+
+ ftbl[ 3] = (arm_func_4args_t) mulcmatvec_cm3x3f_v3f_c;
+ ftbl[ 4] = (arm_func_4args_t) mulcmatvec_cm3x3f_v3f_c; // using the c version in place of the assembly version
+ ftbl[ 5] = (arm_func_4args_t) mulcmatvec_cm3x3f_v3f_neon;
+
+ ftbl[ 6] = (arm_func_4args_t) mulcmatvec_cm4x4f_v4f_c;
+ ftbl[ 7] = (arm_func_4args_t) mulcmatvec_cm4x4f_v4f_c; // using the c version in place of the assembly version
+ ftbl[ 8] = (arm_func_4args_t) mulcmatvec_cm4x4f_v4f_neon;
+}
+
+arm_result_t main( int argc, char **argv )
+{
+ return run_test( argc, argv ); // defined in the unit test header included above
+}