LDFLAGS+=-L. -L/usr/local/lib -L/client/lib -L/lib/arm-linux-gnueabi
LDFLAGS+=-lm
-ALLFILES = NE10_addc.c_r.o NE10_subc.c_r.o NE10_rsbc.c_r.o NE10_mulc.c_r.o NE10_divc.c_r.o NE10_mlac.c_r.o NE10_setc.c_r.o NE10_add.c_r.o NE10_sub.c_r.o NE10_mul.c_r.o NE10_div.c_r.o NE10_mla.c_r.o NE10_abs.c_r.o NE10_len.c_r.o NE10_normalize.c_r.o NE10_addc.neon_r.o NE10_subc.neon_r.o NE10_rsbc.neon_r.o NE10_mulc.neon_r.o NE10_divc.neon_r.o NE10_mlac.neon_r.o NE10_setc.neon_r.o NE10_add.neon_r.o NE10_sub.neon_r.o NE10_mul.neon_r.o NE10_div.neon_r.o NE10_mla.neon_r.o NE10_abs.neon_r.o NE10_len.neon_r.o NE10_normalize.neon_r.o NE10_dot.c_r.o NE10_dot.neon_r.o NE10_cross.c_r.o NE10_cross.neon_r.o NE10_addmat.c_r.o NE10_addmat.neon_r.o NE10_submat.c_r.o NE10_submat.neon_r.o NE10_mulmat.c_r.o NE10_mulmat.neon_r.o NE10_mulcmatvec.c_r.o NE10_mulcmatvec.neon_r.o
+# Merge of both sides of the former conflict: the mulcmatvec objects (HEAD) and the new detmat objects are both required by the library.
+ALLFILES = NE10_addc.c_r.o NE10_subc.c_r.o NE10_rsbc.c_r.o NE10_mulc.c_r.o NE10_divc.c_r.o NE10_mlac.c_r.o NE10_setc.c_r.o NE10_add.c_r.o NE10_sub.c_r.o NE10_mul.c_r.o NE10_div.c_r.o NE10_mla.c_r.o NE10_abs.c_r.o NE10_len.c_r.o NE10_normalize.c_r.o NE10_addc.neon_r.o NE10_subc.neon_r.o NE10_rsbc.neon_r.o NE10_mulc.neon_r.o NE10_divc.neon_r.o NE10_mlac.neon_r.o NE10_setc.neon_r.o NE10_add.neon_r.o NE10_sub.neon_r.o NE10_mul.neon_r.o NE10_div.neon_r.o NE10_mla.neon_r.o NE10_abs.neon_r.o NE10_len.neon_r.o NE10_normalize.neon_r.o NE10_dot.c_r.o NE10_dot.neon_r.o NE10_cross.c_r.o NE10_cross.neon_r.o NE10_addmat.c_r.o NE10_addmat.neon_r.o NE10_submat.c_r.o NE10_submat.neon_r.o NE10_mulmat.c_r.o NE10_mulmat.neon_r.o NE10_mulcmatvec.c_r.o NE10_mulcmatvec.neon_r.o NE10_detmat.c_r.o NE10_detmat.neon_r.o
#TARGET_ARCH = stdc
mulmat_2x2f = mulmat_2x2f_neon;
mulmat_3x3f = mulmat_3x3f_neon;
mulmat_4x4f = mulmat_4x4f_neon;
-
mulcmatvec_cm2x2f_v2f = mulcmatvec_cm2x2f_v2f_neon;
mulcmatvec_cm3x3f_v3f = mulcmatvec_cm3x3f_v3f_neon;
mulcmatvec_cm4x4f_v4f = mulcmatvec_cm4x4f_v4f_neon;
+ detmat_2x2f = detmat_2x2f_neon;
+ detmat_3x3f = detmat_3x3f_neon;
+ detmat_4x4f = detmat_4x4f_neon;
}
else
{
mulmat_2x2f = mulmat_2x2f_c;
mulmat_3x3f = mulmat_3x3f_c;
mulmat_4x4f = mulmat_4x4f_c;
-
mulcmatvec_cm2x2f_v2f = mulcmatvec_cm2x2f_v2f_c;
mulcmatvec_cm3x3f_v3f = mulcmatvec_cm3x3f_v3f_c;
mulcmatvec_cm4x4f_v4f = mulcmatvec_cm4x4f_v4f_c;
+ detmat_2x2f = detmat_2x2f_c;
+ detmat_3x3f = detmat_3x3f_c;
+ detmat_4x4f = detmat_4x4f_c;
}
}
arm_result_t (*mulmat_2x2f)(arm_mat2x2f_t * dst, arm_mat2x2f_t * src1, arm_mat2x2f_t * src2, unsigned int count);
arm_result_t (*mulmat_3x3f)(arm_mat3x3f_t * dst, arm_mat3x3f_t * src1, arm_mat3x3f_t * src2, unsigned int count);
arm_result_t (*mulmat_4x4f)(arm_mat4x4f_t * dst, arm_mat4x4f_t * src1, arm_mat4x4f_t * src2, unsigned int count);
-
arm_result_t (*mulcmatvec_cm4x4f_v4f)(arm_vec4f_t * dst, const arm_mat4x4f_t * cst, arm_vec4f_t * src, unsigned int count);
arm_result_t (*mulcmatvec_cm3x3f_v3f)(arm_vec3f_t * dst, const arm_mat3x3f_t * cst, arm_vec3f_t * src, unsigned int count);
arm_result_t (*mulcmatvec_cm2x2f_v2f)(arm_vec2f_t * dst, const arm_mat2x2f_t * cst, arm_vec2f_t * src, unsigned int count);
+arm_result_t (*detmat_4x4f)(arm_float_t * dst, arm_mat4x4f_t * src, unsigned int count);
+arm_result_t (*detmat_3x3f)(arm_float_t * dst, arm_mat3x3f_t * src, unsigned int count);
+arm_result_t (*detmat_2x2f)(arm_float_t * dst, arm_mat2x2f_t * src, unsigned int count);
); \
}
+#define NE10_DETMAT_OPERATION_X_C NE10_ABS_OPERATION_X_C
+
///// The "DstAccSrc1Src2" group of functions //////
#define NE10_MLA_OPERATION_FLOAT_NEON(loopCode1, loopCode2) { \
--- /dev/null
+/*
+ * Copyright 2011-12 ARM Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NE10 Library : headers/unit_test_addmat_operation_x.h
+ */
+
+#include "./unit_test_common.h"
+#include "../inc/NE10_types.h"
+
+// Common signature of every implementation under test: (dst, src, count). Pointers are untyped so one table can hold the 2x2, 3x3 and 4x4 variants.
+typedef arm_result_t (*arm_func_3args_t)(void * dst, void * src, unsigned int count);
+arm_func_3args_t ftbl[ OP_COUNT * IMPL_COUNT ];
+
+
+// this function is implemented in the unit test source files
+// it is meant to initialise the function table defined above (entries are looked up with FTBL_IDX(opcode, impl)).
+extern void init_ftbl();
+
+
+unsigned int i = 0; // loop iterator (shared by every loop in this header)
+unsigned int max = 0; // number of timed iterations of the function under test
+int opcode = -1; // the operation which will be tested (a single unit can have any number of operations/functions)
+int impl = -1; // selects which particular implementation of the chosen operation must run (1-based; 0 == run all and compare)
+int mute = 0; // 0 == print output; 1 == do not print anything;
+
+struct timeval before, after, lapsed, dummy;
+double dt_test_overhead = 0.0;
+double dt_test_sample = 0.0;
+double elapsed = 0.0;
+struct timezone zone;
+
+// there is a max of "16" components in a matrix (the 4x4 case), so buffers are sized for ARRLEN such matrices
+#define MAX_VEC_COMPONENTS 16
+
+arm_float_t * guarded_src = NULL;
+arm_float_t * guarded_dst[IMPL_COUNT];
+
+arm_float_t * thesrc = NULL;
+arm_float_t * thedst[IMPL_COUNT]; // output from different implementations are stored in separate arrays for verification
+int done_init = 0;
+
+/*
+ * Runs implementation `impl` of operation `opcode` (both file-scope globals)
+ * on the shared source buffer and reports its timing.
+ *
+ * On the first call the guarded source buffer and one guarded destination
+ * buffer per implementation are allocated and the source is filled with
+ * random values. Each subsequent call:
+ *   1. performs one sample run and checks the guard zones around the buffers,
+ *   2. calls the function with count == 0 (must not crash),
+ *   3. times `max` back-to-back calls and, unless `mute` is set, prints the
+ *      elapsed time and the throughput.
+ *
+ * Terminates the process with NE10_ERR on allocation failure, on a missing
+ * implementation, or when a guard zone was overwritten.
+ * Returns NE10_OK otherwise.
+ */
+arm_result_t test_operation()
+{
+  const unsigned int fixed_length   = ARRLEN * sizeof(arm_float_t) * MAX_VEC_COMPONENTS;
+  const unsigned int guarded_length = (2*ARRAY_GUARD_LEN) + fixed_length; // payload plus one guard zone on each side
+
+  // initialize if not done so
+  if ( 0 == done_init )
+  {
+    guarded_src = malloc( guarded_length );
+    if ( NULL == guarded_src )
+    {
+      fprintf ( stderr, "\t FATAL ERROR: Could not allocate the source array. \n" );
+      exit( NE10_ERR );
+    }
+    GUARD_ARRAY( guarded_src, guarded_length );
+    // skip the leading guard zone; byte arithmetic is done on char* because
+    // arithmetic on void* is a compiler extension, not standard C
+    thesrc = (arm_float_t*) ( (char*)guarded_src + ARRAY_GUARD_LEN );
+    FILL_FLOAT_ARRAY_LIMIT_GT1( thesrc, ARRLEN * MAX_VEC_COMPONENTS ); // random initialization
+
+    for ( i = 0; i<IMPL_COUNT; i++ )
+    {
+      guarded_dst[i] = malloc( guarded_length );
+      if ( NULL == guarded_dst[i] )
+      {
+        fprintf ( stderr, "\t FATAL ERROR: Could not allocate a destination array. \n" );
+        exit( NE10_ERR );
+      }
+      GUARD_ARRAY( guarded_dst[i], guarded_length );
+      thedst[i] = (arm_float_t*) ( (char*)guarded_dst[i] + ARRAY_GUARD_LEN );
+    }
+
+    done_init = 1;
+  }
+
+  // refuse to jump through an empty table slot
+  if ( NULL == ftbl [ FTBL_IDX(opcode, impl) ] )
+  {
+    fprintf ( stderr, "\t FATAL ERROR: Operation number %d implementation [%d] was not found. \n", opcode, impl );
+    exit( NE10_ERR );
+  }
+
+  // sample run
+  MEASURE( dt_test_sample,
+    ftbl [ FTBL_IDX(opcode, impl) ] ( thedst[ impl-1 ] , thesrc, ARRLEN );
+  );
+  if ( ! CHECK_ARRAY_GUARD(guarded_dst[ impl -1 ], guarded_length) ||
+       ! CHECK_ARRAY_GUARD(guarded_src, guarded_length) )
+  {
+    fprintf ( stderr, "\t FATAL ERROR: Operation number %d implementation [%d] has failed the guard test. \n", opcode, impl );
+    exit( NE10_ERR );
+  }
+
+  // this test to make sure passing zero as the length won't cause segmentation faults
+  ftbl [ FTBL_IDX(opcode, impl) ] ( thedst[ impl-1 ] , thesrc, 0 );
+
+  MEASURE( elapsed,
+    for ( i = 0; i < max; i++ )
+    {
+      // call the function
+      ftbl [ FTBL_IDX(opcode, impl) ] ( thedst[ impl -1 ] , thesrc, ARRLEN );
+    }
+  );
+
+  if ( !mute )
+    printf( "%02.8f;%013.3f\n", elapsed - dt_test_overhead,
+            ( 1.0f * max * ARRLEN / ( elapsed - dt_test_overhead )) );
+
+  return NE10_OK;
+}
+
+/*
+ * Command-line driver for the determinant unit tests.
+ *
+ *   argv = { prog, "0" }                  -> returns OP_COUNT (number of operations in this unit)
+ *   argv = { prog, opcode, impl, max }    -> runs the test:
+ *        impl == 0 : run every implementation once, then compare the outputs
+ *                    of implementations 2..IMPL_COUNT against implementation 1
+ *        impl >= 1 : time that single implementation over `max` iterations
+ *
+ * Any malformed argument, missing implementation, NaN-free comparison that
+ * reaches ACCEPTABLE_WARNS_MATRICES mismatches, or allocation failure ends
+ * the process with NE10_ERR. Returns NE10_OK on success; the test buffers are
+ * released on both successful paths.
+ */
+arm_result_t run_test( int argc, char **argv )
+{
+  if ( argc == 2 ) // requesting the number of available operations/routines in this unit
+  {
+    opcode = atoi ( argv[1] ); // get the command being requested, 0 = return the number of functions in this unit
+    if ( opcode == 0 ) return OP_COUNT;
+    exit( NE10_ERR );
+  }
+  else if ( argc == 4 ) // requesting a particular implementation of one of the operations
+  {
+    opcode = atoi ( argv[1] );
+    if ( opcode <= 0 ) exit( NE10_ERR );
+    impl   = atoi ( argv[2] );
+    if ( impl   <  0 ) exit( NE10_ERR ); // impl == 0 means run all and compare the results to verify they produce identical outputs
+
+    // `max` is unsigned: validate the sign on a signed temporary, otherwise a
+    // negative argument silently becomes a huge iteration count
+    const int iterations = atoi ( argv[3] );
+    if ( iterations <= 0 ) exit( NE10_ERR );
+    max = (unsigned int) iterations;
+  }
+  else exit( NE10_ERR );
+
+  // initialize the table with NULL
+  memset( ftbl, 0, sizeof(ftbl));
+
+  // manually initialize the functions which have actual implementations
+  init_ftbl(); // this function is implemented in the unit test source file
+
+  if ( opcode <= 0 || opcode > OP_COUNT
+    || impl   <  0 || impl   > IMPL_COUNT )
+  {
+    fprintf ( stderr, "\t WARNING: Operation number %d and/or implementaion number %d are not acceptable values. \n", opcode, impl );
+    exit( NE10_ERR );
+  }
+
+  if ( impl == 0 ) // run all implementations and verify
+  {
+    // first, make sure all of the implementations do exist
+    for ( i = FTBL_IDX(opcode, 1); i <= FTBL_IDX(opcode, IMPL_COUNT); i++ )
+    {
+      if ( NULL == ftbl[i] )
+      {
+        fprintf ( stderr, "\t WARNING: One or more implementations of operation number %d were not found. \n", opcode );
+        exit( NE10_ERR );
+      }
+    }
+
+    // try all the implementations here..
+    mute = 1; // do not print anything
+
+    // opcode remains the same but we iterate through different implementations here..
+    for ( impl = 1; impl <= IMPL_COUNT; impl++ )
+    {
+      test_operation();
+    }
+
+    // now verify
+    int warns = 0;
+    const int item_width    = 1; // determinant is always a scalar value
+    const int item_width_p2 = item_width * item_width;
+
+    // one item per implementation (c, asm, neon...), refreshed for every index i
+    arm_float_t * outputs = malloc( IMPL_COUNT * sizeof(arm_float_t) * item_width_p2 );
+    if ( NULL == outputs )
+    {
+      fprintf ( stderr, "\t FATAL ERROR: Could not allocate the comparison buffer. \n" );
+      exit( NE10_ERR );
+    }
+
+    for ( i = 0; i < ARRLEN; i++ )
+    {
+      for ( impl = 1; impl <= IMPL_COUNT; impl++ )
+      {
+        memcpy ( &outputs[ (impl-1) * item_width_p2 ], &thedst[ impl-1 ][ i * item_width_p2 ], sizeof(arm_float_t) * item_width_p2 );
+      }
+
+      for ( impl = 2; impl <= IMPL_COUNT; impl++ ) // compare the output from the 2nd, 3rd, 4th, etc. to the first one so start at 2
+      {
+        for ( int pos = 0; pos < item_width_p2; pos++ ) // compare corresponding components of the items
+        {
+          const arm_float_t reference = outputs[ pos ];
+          const arm_float_t candidate = outputs[ ((impl-1)*item_width_p2)+pos ];
+
+          assert ( reference == reference ); // check for not-a-number
+          assert ( candidate == candidate ); // check for not-a-number
+
+          if ( ! EQUALS_FLOAT( reference , candidate, ERROR_MARGIN_SMALL ) )
+          {
+            // i is unsigned, hence %u for the item index
+            fprintf( stderr, "\t\t WARNING: In opcode [%d], implementation [1] != implemenation [%d] on item [%u -> %d]\n",
+                     opcode, impl, i, pos+1 );
+            warns++;
+          }
+
+          // stop once the tolerated number of warnings has been reached
+          if ( warns >= ACCEPTABLE_WARNS_MATRICES )
+          {
+            fprintf ( stderr, "\t WARNING: One or more mismatching values were found. \n" );
+            exit( NE10_ERR );
+          }
+        }
+      }
+    }
+    free( outputs ); outputs = NULL;
+
+    if ( warns >= ACCEPTABLE_WARNS_MATRICES )
+    {
+      fprintf ( stderr, "\t WARNING: One or more mismatching values were found. \n" );
+      exit( NE10_ERR );
+    }
+    // success: fall through so the test buffers are released below
+  }
+  else // run a particular implementation
+  {
+    if ( NULL == ftbl[ FTBL_IDX(opcode, impl) ] )
+    {
+      fprintf ( stderr, "\t WARNING: One or more implementations of operation number %d were not found. \n", opcode );
+      exit( NE10_ERR );
+    }
+
+    // max is unsigned, hence %u for the last field
+    if ( !mute ) printf( "opcode=%d;impl=%d;%d;%u;", opcode, impl, ARRLEN, max );
+
+    // get the overhead of the bare timing loop
+    MEASURE( dt_test_overhead,
+      for ( i = 0 ; i < max; i++ )
+      {
+      }
+    );
+
+    test_operation();
+  }
+
+  // free any allocated memory and reset the state so a later call re-initializes cleanly
+  free( guarded_src );
+  guarded_src = NULL;
+  thesrc      = NULL;
+  for ( i = 0; i<IMPL_COUNT; i++ )
+  {
+    free( guarded_dst[i] );
+    guarded_dst[i] = NULL;
+    thedst[i]      = NULL;
+  }
+  done_init = 0;
+
+  return NE10_OK;
+}
// ## Operations on Matrices ##
+
+extern arm_result_t (*detmat_4x4f)(arm_float_t * dst, arm_mat4x4f_t * src, unsigned int count);
+extern arm_result_t (*detmat_3x3f)(arm_float_t * dst, arm_mat3x3f_t * src, unsigned int count);
+extern arm_result_t (*detmat_2x2f)(arm_float_t * dst, arm_mat2x2f_t * src, unsigned int count);
+
extern arm_result_t (*invert_mat4x4f)(arm_mat4x4f_t * dst, arm_mat4x4f_t * src, unsigned int count);
-extern arm_result_t (*det_mat4x4f)(arm_float_t * dst, arm_mat4x4f_t * src, unsigned int count);
extern arm_result_t (*trans_mat4x4f)(arm_mat4x4f_t * dst, arm_mat4x4f_t * src, unsigned int count);
extern arm_result_t (*identity_mat4x4f)(arm_mat4x4f_t * dst, unsigned int count);
extern arm_result_t (*invert_mat3x3f)(arm_mat3x3f_t * dst, arm_mat3x3f_t * src, unsigned int count);
-extern arm_result_t (*det_mat3x3f)(arm_float_t * dst, arm_mat3x3f_t * src, unsigned int count);
extern arm_result_t (*trans_mat3x3f)(arm_mat3x3f_t * dst, arm_mat3x3f_t * src, unsigned int count);
extern arm_result_t (*identity_mat3x3f)(arm_mat3x3f_t * dst, unsigned int count);
extern arm_result_t (*invert_mat2x2f)(arm_mat2x2f_t * dst, arm_mat2x2f_t * src, unsigned int count);
-extern arm_result_t (*det_mat2x2f)(arm_float_t * dst, arm_mat2x2f_t * src, unsigned int count);
extern arm_result_t (*trans_mat2x2f)(arm_mat2x2f_t * dst, arm_mat2x2f_t * src, unsigned int count);
extern arm_result_t (*identity_mat2x2f)(arm_mat2x2f_t * dst, unsigned int count);
// ## Operations on Matrices ##
+
+extern arm_result_t detmat_4x4f_asm(arm_float_t * dst, arm_mat4x4f_t * src, unsigned int count);
+extern arm_result_t detmat_3x3f_asm(arm_float_t * dst, arm_mat3x3f_t * src, unsigned int count);
+extern arm_result_t detmat_2x2f_asm(arm_float_t * dst, arm_mat2x2f_t * src, unsigned int count);
+
extern arm_result_t invert_mat4x4f_asm(arm_mat4x4f_t * dst, arm_mat4x4f_t * src, unsigned int count);
-extern arm_result_t det_mat4x4f_asm(arm_mat4x4f_t * dst, arm_mat4x4f_t * src, unsigned int count);
extern arm_result_t trans_mat4x4f_asm(arm_mat4x4f_t * dst, arm_mat4x4f_t * src, unsigned int count);
extern arm_result_t identity_mat4x4f_asm(arm_mat4x4f_t * dst, unsigned int count);
extern arm_result_t invert_mat3x3f_asm(arm_mat3x3f_t * dst, arm_mat3x3f_t * src, unsigned int count);
-extern arm_result_t det_mat3x3f_asm(arm_mat3x3f_t * dst, arm_mat3x3f_t * src, unsigned int count);
extern arm_result_t trans_mat3x3f_asm(arm_mat3x3f_t * dst, arm_mat3x3f_t * src, unsigned int count);
extern arm_result_t identity_mat3x3f_asm(arm_mat3x3f_t * dst, unsigned int count);
extern arm_result_t invert_mat2x2f_asm(arm_mat2x2f_t * dst, arm_mat2x2f_t * src, unsigned int count);
-extern arm_result_t det_mat2x2f_asm(arm_mat2x2f_t * dst, arm_mat2x2f_t * src, unsigned int count);
extern arm_result_t trans_mat2x2f_asm(arm_mat2x2f_t * dst, arm_mat2x2f_t * src, unsigned int count);
extern arm_result_t identity_mat2x2f_asm(arm_mat2x2f_t * dst, unsigned int count);
// ## Operations on Matrices ##
+
+extern arm_result_t detmat_4x4f_c(arm_float_t * dst, arm_mat4x4f_t * src, unsigned int count);
+extern arm_result_t detmat_3x3f_c(arm_float_t * dst, arm_mat3x3f_t * src, unsigned int count);
+extern arm_result_t detmat_2x2f_c(arm_float_t * dst, arm_mat2x2f_t * src, unsigned int count);
+
extern arm_result_t invert_mat4x4f_c(arm_mat4x4f_t * dst, arm_mat4x4f_t * src, unsigned int count);
-extern arm_result_t det_mat4x4f_c(arm_mat4x4f_t * dst, arm_mat4x4f_t * src, unsigned int count);
extern arm_result_t trans_mat4x4f_c(arm_mat4x4f_t * dst, arm_mat4x4f_t * src, unsigned int count);
extern arm_result_t identity_mat4x4f_c(arm_mat4x4f_t * dst, unsigned int count);
extern arm_result_t invert_mat3x3f_c(arm_mat3x3f_t * dst, arm_mat3x3f_t * src, unsigned int count);
-extern arm_result_t det_mat3x3f_c(arm_mat3x3f_t * dst, arm_mat3x3f_t * src, unsigned int count);
extern arm_result_t trans_mat3x3f_c(arm_mat3x3f_t * dst, arm_mat3x3f_t * src, unsigned int count);
extern arm_result_t identity_mat3x3f_c(arm_mat3x3f_t * dst, unsigned int count);
extern arm_result_t invert_mat2x2f_c(arm_mat2x2f_t * dst, arm_mat2x2f_t * src, unsigned int count);
-extern arm_result_t det_mat2x2f_c(arm_mat2x2f_t * dst, arm_mat2x2f_t * src, unsigned int count);
extern arm_result_t trans_mat2x2f_c(arm_mat2x2f_t * dst, arm_mat2x2f_t * src, unsigned int count);
extern arm_result_t identity_mat2x2f_c(arm_mat2x2f_t * dst, unsigned int count);
// ## Operations on Matrices ##
+
+
+extern arm_result_t detmat_4x4f_neon(arm_float_t * dst, arm_mat4x4f_t * src, unsigned int count);
+extern arm_result_t detmat_3x3f_neon(arm_float_t * dst, arm_mat3x3f_t * src, unsigned int count);
+extern arm_result_t detmat_2x2f_neon(arm_float_t * dst, arm_mat2x2f_t * src, unsigned int count);
+
extern arm_result_t invert_mat4x4f_neon(arm_mat4x4f_t * dst, arm_mat4x4f_t * src, unsigned int count);
-extern arm_result_t det_mat4x4f_neon(arm_mat4x4f_t * dst, arm_mat4x4f_t * src, unsigned int count);
extern arm_result_t trans_mat4x4f_neon(arm_mat4x4f_t * dst, arm_mat4x4f_t * src, unsigned int count);
extern arm_result_t identity_mat4x4f_neon(arm_mat4x4f_t * dst, unsigned int count);
extern arm_result_t invert_mat3x3f_neon(arm_mat3x3f_t * dst, arm_mat3x3f_t * src, unsigned int count);
-extern arm_result_t det_mat3x3f_neon(arm_mat3x3f_t * dst, arm_mat3x3f_t * src, unsigned int count);
extern arm_result_t trans_mat3x3f_neon(arm_mat3x3f_t * dst, arm_mat3x3f_t * src, unsigned int count);
extern arm_result_t identity_mat3x3f_neon(arm_mat3x3f_t * dst, unsigned int count);
extern arm_result_t invert_mat2x2f_neon(arm_mat2x2f_t * dst, arm_mat2x2f_t * src, unsigned int count);
-extern arm_result_t det_mat2x2f_neon(arm_mat2x2f_t * dst, arm_mat2x2f_t * src, unsigned int count);
extern arm_result_t trans_mat2x2f_neon(arm_mat2x2f_t * dst, arm_mat2x2f_t * src, unsigned int count);
extern arm_result_t identity_mat2x2f_neon(arm_mat2x2f_t * dst, unsigned int count);
submat
mulmat
mulcmatvec
+detmat
--- /dev/null
+@
+@ Copyright 2011-12 ARM Limited
+@
+@ Licensed under the Apache License, Version 2.0 (the "License");
+@ you may not use this file except in compliance with the License.
+@ You may obtain a copy of the License at
+@
+@ http://www.apache.org/licenses/LICENSE-2.0
+@
+@ Unless required by applicable law or agreed to in writing, software
+@ distributed under the License is distributed on an "AS IS" BASIS,
+@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ See the License for the specific language governing permissions and
+@ limitations under the License.
+@
+
+@
+@ NE10 Library : source/NE10_detmat.asm.s
+@
--- /dev/null
+/*
+ * Copyright 2011-12 ARM Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NE10 Library : source/NE10_detmat.c
+ */
+
+#include "NE10.h"
+#include "../headers/macros.h"
+#include "NE10_detmat.c.h"
+
+#include <assert.h>
+
+/*
+ * C reference implementation: dst[k] = det(src[k]) for 2x2 matrices.
+ * The loop, the index variable `itr` and the return statement come from the
+ * NE10_DETMAT_OPERATION_X_C macro (defined in macros.h, not shown here).
+ */
+arm_result_t detmat_2x2f_c(arm_float_t * dst, arm_mat2x2f_t * src, unsigned int count)
+{
+    NE10_DETMAT_OPERATION_X_C( dst[itr] = DET2x2( src + itr ); );
+}
+
+/*
+ * C reference implementation: dst[k] = det(src[k]) for 3x3 matrices.
+ * The loop, the index variable `itr` and the return statement come from the
+ * NE10_DETMAT_OPERATION_X_C macro (defined in macros.h, not shown here).
+ */
+arm_result_t detmat_3x3f_c(arm_float_t * dst, arm_mat3x3f_t * src, unsigned int count)
+{
+    NE10_DETMAT_OPERATION_X_C( dst[itr] = DET3x3( src + itr ); );
+}
+
+/*
+ * C reference implementation: dst[k] = det(src[k]) for 4x4 matrices.
+ * The loop, the index variable `itr` and the return statement come from the
+ * NE10_DETMAT_OPERATION_X_C macro (defined in macros.h, not shown here).
+ */
+arm_result_t detmat_4x4f_c(arm_float_t * dst, arm_mat4x4f_t * src, unsigned int count)
+{
+    NE10_DETMAT_OPERATION_X_C( dst[itr] = DET4x4( src + itr ); );
+}
--- /dev/null
+/*
+ * Copyright 2011-12 ARM Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NE10 Library : source/NE10_detmat.c.h
+ */
+
+#include "NE10.h"
+#include "../headers/macros.h"
+
+#include <assert.h>
+
+/*
+ * Returns the determinant of one 2x2 matrix.
+ *
+ * Declared `static inline` because this definition lives in a header: a plain
+ * `inline` definition provides no external definition in C99 (link error when
+ * the call is not inlined) and a duplicate one under GNU89 rules.
+ * The matrix is only read, hence the const pointer.
+ */
+static inline arm_float_t DET2x2( const arm_mat2x2f_t * mat )
+{
+    // 2x2 matrix layout
+    //    c1r1 c2r1
+    //    c1r2 c2r2
+
+    return ( (mat->c1.r1 * mat->c2.r2)
+            -(mat->c2.r1 * mat->c1.r2) );
+}
+
+/*
+ * Returns the determinant of one 3x3 matrix by cofactor expansion along the
+ * first row: each first-row element multiplies the determinant of the 2x2
+ * matrix left after deleting its row and column, with alternating signs.
+ *
+ * Declared `static inline` because this definition lives in a header: a plain
+ * `inline` definition provides no external definition in C99 (link error when
+ * the call is not inlined) and a duplicate one under GNU89 rules.
+ * The matrix is only read, hence the const pointer.
+ */
+static inline arm_float_t DET3x3( const arm_mat3x3f_t * mat )
+{
+    // 3x3 matrix layout
+    //     c1r1 c2r1 c3r1
+    //     c1r2 c2r2 c3r2
+    //     c1r3 c2r3 c3r3
+
+    // submXY = the matrix without row 1 and without column X, stored column by column
+    arm_mat2x2f_t subm11 = { {mat->c2.r2, mat->c2.r3}, {mat->c3.r2, mat->c3.r3} };
+    arm_mat2x2f_t subm21 = { {mat->c1.r2, mat->c1.r3}, {mat->c3.r2, mat->c3.r3} };
+    arm_mat2x2f_t subm31 = { {mat->c1.r2, mat->c1.r3}, {mat->c2.r2, mat->c2.r3} };
+
+    return    (mat->c1.r1*DET2x2( &subm11 ))
+            - (mat->c2.r1*DET2x2( &subm21 ))
+            + (mat->c3.r1*DET2x2( &subm31 ));
+}
+
+/*
+ * Returns the determinant of one 4x4 matrix by cofactor expansion along the
+ * first row: each first-row element multiplies the determinant of the 3x3
+ * matrix left after deleting its row and column, with alternating signs.
+ *
+ * Declared `static inline` because this definition lives in a header: a plain
+ * `inline` definition provides no external definition in C99 (link error when
+ * the call is not inlined) and a duplicate one under GNU89 rules.
+ * The matrix is only read, hence the const pointer.
+ */
+static inline arm_float_t DET4x4( const arm_mat4x4f_t * mat )
+{
+    // 4x4 matrix layout
+    //    c1r1 c2r1 c3r1 c4r1
+    //    c1r2 c2r2 c3r2 c4r2
+    //    c1r3 c2r3 c3r3 c4r3
+    //    c1r4 c2r4 c3r4 c4r4
+
+    // submX1 = the matrix without row 1 and without column X, stored column by column
+    arm_mat3x3f_t subm11 = { {mat->c2.r2, mat->c2.r3, mat->c2.r4},
+                             {mat->c3.r2, mat->c3.r3, mat->c3.r4},
+                             {mat->c4.r2, mat->c4.r3, mat->c4.r4} };
+
+    arm_mat3x3f_t subm21 = { {mat->c1.r2, mat->c1.r3, mat->c1.r4},
+                             {mat->c3.r2, mat->c3.r3, mat->c3.r4},
+                             {mat->c4.r2, mat->c4.r3, mat->c4.r4} };
+
+    arm_mat3x3f_t subm31 = { {mat->c1.r2, mat->c1.r3, mat->c1.r4},
+                             {mat->c2.r2, mat->c2.r3, mat->c2.r4},
+                             {mat->c4.r2, mat->c4.r3, mat->c4.r4} };
+
+    arm_mat3x3f_t subm41 = { {mat->c1.r2, mat->c1.r3, mat->c1.r4},
+                             {mat->c2.r2, mat->c2.r3, mat->c2.r4},
+                             {mat->c3.r2, mat->c3.r3, mat->c3.r4} };
+
+    return    (mat->c1.r1*DET3x3( &subm11 ))
+            - (mat->c2.r1*DET3x3( &subm21 ))
+            + (mat->c3.r1*DET3x3( &subm31 ))
+            - (mat->c4.r1*DET3x3( &subm41 ));
+}
--- /dev/null
+@
+@ Copyright 2011-12 ARM Limited
+@
+@ Licensed under the Apache License, Version 2.0 (the "License");
+@ you may not use this file except in compliance with the License.
+@ You may obtain a copy of the License at
+@
+@ http://www.apache.org/licenses/LICENSE-2.0
+@
+@ Unless required by applicable law or agreed to in writing, software
+@ distributed under the License is distributed on an "AS IS" BASIS,
+@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ See the License for the specific language governing permissions and
+@ limitations under the License.
+@
+
+@
+@ NE10 Library : source/NE10_detmat.neon.inc.s
+@
+
+
+
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ A macro used inside detmat_3x3f_neon() to load 3x3 matrices.
+ @ Two 3x3 matrices (2 x 9 floats) are loaded from the address in \addr,
+ @ which is post-incremented past them, into registers dst00-11.
+ @ The corresponding qr00-qr05 registers are then transposed pairwise so
+ @ that each "d" register holds the same element of both matrices, which
+ @ is the order expected by the other macros below.
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ .macro LOAD_3x3MATS_ARGS dst00, dst01, dst02, dst03, dst04, dst05, dst06, dst07, dst08, dst09, dst10, dst11, qr00, qr01, qr02, qr03, qr04, qr05, addr
+
+ vld3.32 { \dst00, \dst02, \dst04 }, [\addr]!
+ vld3.32 { \dst01[0], \dst03[0], \dst05[0] }, [\addr]!
+ vld3.32 { \dst06, \dst08, \dst10 }, [\addr]!
+ vld3.32 { \dst07[0], \dst09[0], \dst11[0] }, [\addr]!
+
+ vtrn.32 \qr00, \qr03
+ vtrn.32 \qr01, \qr04
+ vtrn.32 \qr02, \qr05
+ .endm
+
+
+
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ This macro calculates the determinant of two 3x3 matrices
+ @ loaded using the above LOAD_3x3MATS_ARGS macro.
+ @ The result is stored in the \res register.
+ @ Registers \tmp2 and \tmp3 are used as scratch registers and will
+ @ not be restored in this macro - the caller needs to restore them
+ @ if needed. Each of the aa-ii parameters can be a "d" register
+ @ containing two floating-point values (one per matrix) which
+ @ correspond to the following reference matrix:
+ @
+ @ |aa dd gg|
+ @ M = |bb ee hh|
+ @ |cc ff ii|
+ @
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ .macro GET_DETERMINANT_of_3x3MATS_ARGS aa, bb, cc, dd, ee, ff, gg, hh, ii, res, tmp2, tmp3
+ @ det = a*(ei-fh) - d*(bi-ch) + g*(bf-ec)
+
+ vmul.f32 \res, \ee, \ii @ t1 = ei
+ vmul.f32 \tmp2, \bb, \ii @ t2 = bi
+ vmul.f32 \tmp3, \bb, \ff @ t3 = bf
+
+ vmls.f32 \res, \ff, \hh @ t1 = ei-fh
+ vmls.f32 \tmp2, \cc, \hh @ t2 = bi-ch
+ vmls.f32 \tmp3, \ee, \cc @ t3 = bf-ec
+
+ vmul.f32 \res, \aa, \res @ t1 = a*(ei-fh)
+ vmls.f32 \res, \dd, \tmp2 @ t1 = a*(ei-fh) - d*(bi-ch)
+ vmla.f32 \res, \gg, \tmp3 @ t1 = a*(ei-fh) - d*(bi-ch) + g*(bf-ec) = det(M1), det(M2)
+ .endm
+
+
+
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ A macro used inside detmat_4x4f_neon() to load 4x4 matrices.
+ @ Two 4x4 matrices (2 x 16 floats) are loaded from the address in the
+ @ \addr register, which is post-incremented past them, into registers
+ @ dst00-15. The corresponding qr00-qr07 registers are then transposed
+ @ pairwise so that each "d" register holds the same element of both
+ @ matrices, which is the order expected by the other macros below.
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ .macro LOAD_4x4MATS_ARGS dst00, dst01, dst02, dst03, dst04, dst05, dst06, dst07, dst08, dst09, dst10, dst11, dst12, dst13, dst14, dst15, qr00, qr01, qr02, qr03, qr04, qr05, qr06, qr07, addr
+
+ vld4.32 { \dst00, \dst02, \dst04, \dst06 }, [\addr]!
+ vld4.32 { \dst01, \dst03, \dst05, \dst07 }, [\addr]!
+ vld4.32 { \dst08, \dst10, \dst12, \dst14 }, [\addr]!
+ vld4.32 { \dst09, \dst11, \dst13, \dst15 }, [\addr]!
+
+ vtrn.32 \qr00, \qr04
+ vtrn.32 \qr01, \qr05
+ vtrn.32 \qr02, \qr06
+ vtrn.32 \qr03, \qr07
+ .endm
+
+
+
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ This macro calculates the determinant of 4x4 matrices
+ @ loaded using the above LOAD_4x4MATS_ARGS macro.
+ @ The result is stored in the \res register.
+ @ Registers \tmp2 to \tmp6 are used as scratch registers and will
+ @ not be restored in this macro - the caller needs to restore them
+ @ if needed (\tmp2-\tmp4 receive the 3x3 minors, \tmp5 and \tmp6 are
+ @ the scratch registers handed to the 3x3 macro).
+ @ Each of the aa-pp parameters can be a "d" register
+ @ containing two floating-point values which correspond to the
+ @ following reference matrix:
+ @
+ @ |aa ee ii mm|
+ @ M = |bb ff jj nn|
+ @ |cc gg kk oo|
+ @ |dd hh ll pp|
+ @
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ .macro GET_DETERMINANT_of_4x4MATS_ARGS aa, bb, cc, dd, ee, ff, gg, hh, ii, jj, kk, ll, mm, nn, oo, pp, res, tmp2, tmp3, tmp4, tmp5, tmp6
+
+ @ res = det(SubM11), i.e. M without its first row and first column
+ GET_DETERMINANT_of_3x3MATS_ARGS \ff, \gg, \hh, \jj, \kk, \ll, \nn, \oo, \pp, \res, \tmp5, \tmp6
+
+ @ tmp2 = det(SubM12), i.e. M without its first row and second column
+ GET_DETERMINANT_of_3x3MATS_ARGS \bb, \cc, \dd, \jj, \kk, \ll, \nn, \oo, \pp, \tmp2, \tmp5, \tmp6
+
+ @ tmp3 = det(SubM13), i.e. M without its first row and third column
+ GET_DETERMINANT_of_3x3MATS_ARGS \bb, \cc, \dd, \ff, \gg, \hh, \nn, \oo, \pp, \tmp3, \tmp5, \tmp6
+
+ @ tmp4 = det(SubM14), i.e. M without its first row and fourth column
+ GET_DETERMINANT_of_3x3MATS_ARGS \bb, \cc, \dd, \ff, \gg, \hh, \jj, \kk, \ll, \tmp4, \tmp5, \tmp6
+
+
+ vmul.f32 \res, \aa, \res
+ vmls.f32 \res, \ee, \tmp2
+ vmla.f32 \res, \ii, \tmp3
+ vmls.f32 \res, \mm, \tmp4
+ .endm
--- /dev/null
+@
+@ Copyright 2011-12 ARM Limited
+@
+@ Licensed under the Apache License, Version 2.0 (the "License");
+@ you may not use this file except in compliance with the License.
+@ You may obtain a copy of the License at
+@
+@ http://www.apache.org/licenses/LICENSE-2.0
+@
+@ Unless required by applicable law or agreed to in writing, software
+@ distributed under the License is distributed on an "AS IS" BASIS,
+@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ See the License for the specific language governing permissions and
+@ limitations under the License.
+@
+
+@
+@ NE10 Library : source/NE10_detmat.neon.s
+@
+
+
+
+
+ .text
+ .syntax unified
+
+.include "headers/NE10header.s"
+.include "source/NE10_detmat.neon.inc.s"
+
+
+
+        .balign   4
+        .global   detmat_2x2f_neon
+        .thumb
+        .thumb_func
+
+detmat_2x2f_neon:
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+        @
+        @ arm_result_t detmat_2x2f(arm_float_t * dst,
+        @                          arm_mat2x2f_t * src,
+        @                          unsigned int count)
+        @
+        @  r0: *dst & current dst entry's address
+        @  r1: *src & current src entry's address
+        @  r2: count rounded down to a multiple of 4, i.e. the number of
+        @      matrices that are processed four at a time by the main loop
+        @  r3: count % 4, the number of matrices left to be processed one
+        @      at a time at the end of the input array
+        @
+        @ Returns 0 in r0.
+        @
+        @ The main loop is software-pipelined: the determinants of one set
+        @ of four matrices stay in q15 while the next set is being loaded.
+        @ A set is only loaded when it exists, so for 4 <= count <= 7 the
+        @ routine no longer reads 64 bytes past the source array nor writes
+        @ four extra floats to the destination.
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+        and       r3, r2, #3          @ r3 = count % 4;
+        sub       r2, r2, r3          @ r2 = count - r3; a multiple of 4 handled by the main loop
+
+        cbz       r2, .L_check_mat2x2
+
+        @ We load four 2x2 matrices each time, calculate their
+        @ determinants, store the results in the destination
+        @ memory address, and move onto the next four.
+        @ After the two vld4 loads q0-q3 hold, respectively, the elements
+        @ c1r1, c1r2, c2r1 and c2r2 of the four matrices.
+
+        @ load the 1st set of values
+        vld4.32   {d0, d2, d4, d6}, [r1]!
+        vld4.32   {d1, d3, d5, d7}, [r1]!
+        subs      r2, r2, #4          @ 4 matrices consumed; the flags are used by the ble below
+
+        @ calculate values for the 1st set (NEON data processing leaves the flags untouched)
+        vmul.f32  q15, q0, q3
+        vmls.f32  q15, q1, q2
+
+        ble       .L_mainloopend_mat2x2    @ only one set of four: nothing more to load
+
+.L_mainloop_mat2x2:
+        @ load the next set of values
+        vld4.32   {d0, d2, d4, d6}, [r1]!
+        vld4.32   {d1, d3, d5, d7}, [r1]!
+        subs      r2, r2, #4
+
+        @ store the result for the previous set
+        vst1.32   {q15}, [r0]!
+
+        @ calculate values for the set that was just loaded
+        vmul.f32  q15, q0, q3
+        vmls.f32  q15, q1, q2
+
+        bgt       .L_mainloop_mat2x2  @ loop if r2 > 0, i.e. at least another 4 matrices (16 floats) remain
+
+.L_mainloopend_mat2x2:
+        @ store the result for the last set of four
+        vst1.32   {q15}, [r0]!
+
+.L_check_mat2x2:
+        @ check if anything left to process at the end of the input array
+        cmp       r3, #0
+        ble       .L_return_mat2x2
+
+.L_secondloop_mat2x2:
+        @ process the last few items left in the input array
+        vld1.32   {d0, d1}, [r1]!     @ Load matrix [A]
+
+        subs      r3, r3, #1
+
+        @ calculate det([A]) = |A|
+        vrev64.32 d1, d1
+        vmul.f32  d2, d0, d1
+        vrev64.32 d2, d2
+        vmls.f32  d2, d0, d1          @ At this point d2 = { -|A|, |A| }
+
+        @ store the result which is in d2[1]
+        vst1.32   {d2[1]}, [r0]!
+
+        bgt       .L_secondloop_mat2x2
+
+.L_return_mat2x2:
+        @ return
+        mov       r0, #0
+        bx        lr
+
+
+
+
+ .align 2
+ .global detmat_3x3f_neon
+ .thumb
+ .thumb_func
+detmat_3x3f_neon:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @
+ @ arm_result_t detmat_3x3f(arm_float_t * dst,
+ @ arm_mat3x3f_t * src,
+ @ unsigned int count)
+ @
+ @ r0: *dst & current dst entry's address
+ @ r1: *src & current src entry's address
+ @ r2: count rounded down to a multiple of 4; these matrices are
+ @ processed in sets of 2 by the main loop
+ @
+ @ r3: count % 4, the number of items that are left to be processed
+ @ one at a time at the end of the input array
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+ and r3, r2, #3 @ r3 = count % 4;
+ sub r2, r2, r3 @ r2 = count - r3; a non-zero r2 is a multiple of 4, so at least two sets of 2 matrices exist
+
+ cmp r2, #0
+ beq .L_check_mat3x3
+
+ @ We load two 3x3 matrices each time, calculate their
+ @ determinants, store the results in the destination
+ @ memory address, and move onto the next two.
+
+ @ load the 1st set of values
+ LOAD_3x3MATS_ARGS d0, d1, d2, d3, d4, d5, d16, d17, d18, d19, d20, d21, q0, q1, q2, q8, q9, q10, r1
+
+ subs r2, r2, #4 @ 2 for this set, and 2 for the 2nd set; the flags are consumed by the ble below
+
+ @ calculate values for the 1st set
+ GET_DETERMINANT_of_3x3MATS_ARGS d0, d2, d4, d16, d18, d20, d1, d3, d5, d22, d24, d26
+
+ @ load the 2nd set of values
+ LOAD_3x3MATS_ARGS d0, d1, d2, d3, d4, d5, d16, d17, d18, d19, d20, d21, q0, q1, q2, q8, q9, q10, r1
+
+ ble .L_mainloopend_mat3x3
+
+.L_mainloop_mat3x3:
+ @ store the result for the 1st/next (e.g. 3rd) set
+ vst1.32 {d22}, [r0]!
+
+ @ calculate values for the 2nd/next (e.g. 3rd) set
+ GET_DETERMINANT_of_3x3MATS_ARGS d0, d2, d4, d16, d18, d20, d1, d3, d5, d22, d24, d26
+
+ @ load the next (e.g. 3rd) set of values
+ LOAD_3x3MATS_ARGS d0, d1, d2, d3, d4, d5, d16, d17, d18, d19, d20, d21, q0, q1, q2, q8, q9, q10, r1
+
+ subs r2, r2, #2
+
+ bgt .L_mainloop_mat3x3 @ loop if r2 > 0, if we have at least another 2 matrices (18 floats) to process
+
+.L_mainloopend_mat3x3:
+ @ the last iteration for this call
+ @ store the result for the set of values before the last one (e.g 2nd set)
+ vst1.32 {d22}, [r0]!
+
+ @ calculate values for the last (e.g. 3rd) set
+ GET_DETERMINANT_of_3x3MATS_ARGS d0, d2, d4, d16, d18, d20, d1, d3, d5, d22, d24, d26
+
+ @ store the result for the last (e.g. 3rd) set
+ vst1.32 {d22}, [r0]!
+
+.L_check_mat3x3:
+ @ check if anything left to process at the end of the input array
+ cmp r3, #0
+ ble .L_return_mat3x3
+
+.L_secondloop_mat3x3:
+ @ process the last few items left in the input array
+
+ @ load a single matrix: one column (3 floats) per vld3, into lane 0 only
+ vld3.32 { d0[0], d2[0], d4[0]}, [r1]!
+ vld3.32 { d1[0], d3[0], d5[0]}, [r1]!
+ vld3.32 {d16[0], d18[0], d20[0]}, [r1]!
+
+ subs r3, r3, #1
+
+ @ calculate the determinant of this matrix (only lane 0 of the result is meaningful)
+ GET_DETERMINANT_of_3x3MATS_ARGS d0, d2, d4, d1, d3, d5, d16, d18, d20, d22, d24, d26
+
+ @ store the single result held in lane 0
+ vst1.32 {d22[0]}, [r0]!
+
+ bgt .L_secondloop_mat3x3
+
+.L_return_mat3x3:
+ @ return
+
+ mov r0, #0
+ bx lr
+
+
+
+
+ .align 2
+ .global detmat_4x4f_neon
+ .thumb
+ .thumb_func
+detmat_4x4f_neon:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ Computes the determinant of each 4x4 matrix in src and writes one float per matrix to dst.
+ @ arm_result_t detmat_4x4f(arm_float_t * dst,
+ @ arm_mat4x4f_t * src,
+ @ unsigned int count)
+ @
+ @ r0: *dst & current dst entry's address
+ @ r1: *src & current src entry's address
+ @ r2: int count & the number of items in the input array that can be
+ @ processed in chunks of 4 matrices
+ @
+ @ r3: the number of items that are left to be processed at the end
+ @ of the input array
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+ and r3, r2, #3 @ r3 = count % 4; remainder handled one matrix at a time in the second loop
+ sub r2, r2, r3 @ r2 = count - r3; the multiple-of-4 part handled by the main loop below
+
+ cmp r2, #0
+ beq .L_check_mat4x4 @ fewer than 4 matrices in total: skip the main loop
+
+
+ @ We load two 4x4 matrices each time, calculate their
+ @ determinants, store the results in the destination
+ @ memory address, and move onto the next two.
+
+ @ load the 1st set of values
+ LOAD_4x4MATS_ARGS d0, d1, d2, d3, d4, d5, d6, d7, d16, d17, d18, d19, d20, d21, d22, d23, q0, q1, q2, q3, q8, q9, q10, q11, r1
+
+ subs r2, r2, #4 @ 2 for this set and 2 for the next set
+ @ NOTE(review): the ble below consumes these flags; assumes the macros in between leave the flags untouched - confirm
+ @ calculate values for the 1st set
+ GET_DETERMINANT_of_4x4MATS_ARGS d0, d2, d4, d6, d16, d18, d20, d22, d1, d3, d5, d7, d17, d19, d21, d23, d24, d26, d28, d30, d25, d27
+
+ @ load the 2nd set of values
+ LOAD_4x4MATS_ARGS d0, d1, d2, d3, d4, d5, d6, d7, d16, d17, d18, d19, d20, d21, d22, d23, q0, q1, q2, q3, q8, q9, q10, q11, r1
+
+ ble .L_mainloopend_mat4x4 @ r2 <= 0: the two sets loaded so far are the only ones
+
+.L_mainloop_mat4x4:
+ @ store the result for the 1st/next (e.g. 3rd) set
+ vst1.32 {d24}, [r0]!
+
+ @ calculate values for the 2nd/next (e.g. 3rd) set
+ GET_DETERMINANT_of_4x4MATS_ARGS d0, d2, d4, d6, d16, d18, d20, d22, d1, d3, d5, d7, d17, d19, d21, d23, d24, d26, d28, d30, d25, d27
+
+ subs r2, r2, #2 @ account for the set (2 matrices) loaded just below; flags are consumed by the bgt
+ @ load the next (e.g. 3rd) set of values
+ LOAD_4x4MATS_ARGS d0, d1, d2, d3, d4, d5, d6, d7, d16, d17, d18, d19, d20, d21, d22, d23, q0, q1, q2, q3, q8, q9, q10, q11, r1
+
+
+ bgt .L_mainloop_mat4x4 @ loop while r2 > 0, i.e. while further sets of 2 matrices remain to be loaded
+
+.L_mainloopend_mat4x4:
+ @ the last iteration for this call
+ @ store the result for the set of values before the last one (e.g 2nd set)
+ vst1.32 {d24}, [r0]!
+
+ @ calculate values for the last (e.g. 3rd) set
+ GET_DETERMINANT_of_4x4MATS_ARGS d0, d2, d4, d6, d16, d18, d20, d22, d1, d3, d5, d7, d17, d19, d21, d23, d24, d26, d28, d30, d25, d27
+
+ @ store the result for the last (e.g. 3rd) set
+ vst1.32 {d24}, [r0]!
+
+.L_check_mat4x4:
+ @ check if anything left to process at the end of the input array
+ cmp r3, #0
+ ble .L_return_mat4x4 @ r3 is in 0..3 here, so this branches only when r3 == 0
+
+.L_secondloop_mat4x4:
+ @ process the last few items left in the input array: one 4x4 matrix (16 floats, 4 per load) into lane 0
+ vld4.32 { d0[0], d2[0], d4[0], d6[0]}, [r1]!
+ vld4.32 { d1[0], d3[0], d5[0], d7[0]}, [r1]!
+ vld4.32 { d16[0], d18[0], d20[0], d22[0]}, [r1]!
+ vld4.32 { d17[0], d19[0], d21[0], d23[0]}, [r1]!
+
+
+
+ subs r3, r3, #1 @ one fewer matrix left; flags are consumed by the bgt at the end of this loop
+ @ NOTE(review): register order below differs from the main-loop calls; presumably matches the vld4 lane layout above - confirm
+ @ calculate values
+ GET_DETERMINANT_of_4x4MATS_ARGS d0, d2, d4, d6, d1, d3, d5, d7, d16, d18, d20, d22, d17, d19, d21, d23, d24, d26, d28, d30, d25, d27
+
+ @ store the single result held in lane 0 of d24
+ vst1.32 {d24[0]}, [r0]!
+
+ bgt .L_secondloop_mat4x4 @ repeat while r3 > 0 (relies on the flags from the subs above)
+
+.L_return_mat4x4:
+ @ return
+ mov r0, #0 @ return value 0
+ bx lr
--- /dev/null
+/*
+ * Copyright 2011-12 ARM Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NE10 Library : source/NE10_detmat_test.c
+ */
+
+// TODO: Apply some sort of self-verifying method to the test system.
+
+//Make sure the following are defined before including "unit_test.h"
+
+// length of the data arrays
+#define ARRLEN TEST_ARRLEN_MATRICES
+// number of the operations in a given unit
+#define OP_COUNT 3
+// number of the different implementations of each of the functions (C, ASM, NEON, ...)
+#define IMPL_COUNT 3
+
+
+#include "../headers/unit_test_detmat_operation_x.h"
+
+extern arm_result_t detmat_2x2f_c (arm_float_t * dst, arm_mat2x2f_t * src, unsigned int count);
+extern arm_result_t detmat_2x2f_neon(arm_float_t * dst, arm_mat2x2f_t * src, unsigned int count);
+
+extern arm_result_t detmat_3x3f_c (arm_float_t * dst, arm_mat3x3f_t * src, unsigned int count);
+extern arm_result_t detmat_3x3f_neon(arm_float_t * dst, arm_mat3x3f_t * src, unsigned int count);
+
+extern arm_result_t detmat_4x4f_c (arm_float_t * dst, arm_mat4x4f_t * src, unsigned int count);
+extern arm_result_t detmat_4x4f_neon(arm_float_t * dst, arm_mat4x4f_t * src, unsigned int count);
+
+void init_ftbl()
+{
+ // Fills the function table ftbl (not declared in this file): three consecutive slots
+ // per matrix size (2x2, 3x3, 4x4), in the order C, assembly stand-in, NEON.
+ ftbl[ 0] = (arm_func_3args_t) detmat_2x2f_c;
+ ftbl[ 1] = (arm_func_3args_t) detmat_2x2f_c; // no separate assembly version: the C version fills this slot
+ ftbl[ 2] = (arm_func_3args_t) detmat_2x2f_neon;
+
+ ftbl[ 3] = (arm_func_3args_t) detmat_3x3f_c;
+ ftbl[ 4] = (arm_func_3args_t) detmat_3x3f_c; // no separate assembly version: the C version fills this slot
+ ftbl[ 5] = (arm_func_3args_t) detmat_3x3f_neon;
+
+ ftbl[ 6] = (arm_func_3args_t) detmat_4x4f_c;
+ ftbl[ 7] = (arm_func_3args_t) detmat_4x4f_c; // no separate assembly version: the C version fills this slot
+ ftbl[ 8] = (arm_func_3args_t) detmat_4x4f_neon;
+}
+
+arm_result_t main( int argc, char **argv )
+{
+ return run_test( argc, argv ); // forwards the command line to run_test (not defined in this file; said to live in "unit_test.h") and returns its result
+}