From: Ramin Zaghi Date: Wed, 4 Apr 2012 11:28:48 +0000 (+0000) Subject: New functions: Matrix multiplication routines. X-Git-Tag: v1.0.0~54^2~1 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=67ed64b8cb42caf97c2d92f196ef48ffac38ce9b;p=platform%2Fupstream%2Fne10.git New functions: Matrix multiplication routines. --- diff --git a/NE10_init.c b/NE10_init.c index c6108db..4d3ddeb 100644 --- a/NE10_init.c +++ b/NE10_init.c @@ -125,6 +125,9 @@ arm_result_t NE10_init() submat_2x2f = submat_2x2f_neon; submat_3x3f = submat_3x3f_neon; submat_4x4f = submat_4x4f_neon; + mulmat_2x2f = mulmat_2x2f_neon; + mulmat_3x3f = mulmat_3x3f_neon; + mulmat_4x4f = mulmat_4x4f_neon; } else { @@ -198,6 +201,9 @@ arm_result_t NE10_init() submat_2x2f = submat_2x2f_c; submat_3x3f = submat_3x3f_c; submat_4x4f = submat_4x4f_c; + mulmat_2x2f = mulmat_2x2f_c; + mulmat_3x3f = mulmat_3x3f_c; + mulmat_4x4f = mulmat_4x4f_c; } } @@ -272,4 +278,8 @@ arm_result_t (*addmat_4x4f)(arm_mat4x4f_t * dst, arm_mat4x4f_t * src1, arm_mat4x arm_result_t (*submat_2x2f)(arm_mat2x2f_t * dst, arm_mat2x2f_t * src1, arm_mat2x2f_t * src2, unsigned int count); arm_result_t (*submat_3x3f)(arm_mat3x3f_t * dst, arm_mat3x3f_t * src1, arm_mat3x3f_t * src2, unsigned int count); arm_result_t (*submat_4x4f)(arm_mat4x4f_t * dst, arm_mat4x4f_t * src1, arm_mat4x4f_t * src2, unsigned int count); +arm_result_t (*mulmat_2x2f)(arm_mat2x2f_t * dst, arm_mat2x2f_t * src1, arm_mat2x2f_t * src2, unsigned int count); +arm_result_t (*mulmat_3x3f)(arm_mat3x3f_t * dst, arm_mat3x3f_t * src1, arm_mat3x3f_t * src2, unsigned int count); +arm_result_t (*mulmat_4x4f)(arm_mat4x4f_t * dst, arm_mat4x4f_t * src1, arm_mat4x4f_t * src2, unsigned int count); + diff --git a/headers/NE10_random.h b/headers/NE10_random.h index 9253312..686553d 100644 --- a/headers/NE10_random.h +++ b/headers/NE10_random.h @@ -26,6 +26,7 @@ #include #include #include +#include // Please look at http://en.wikipedia.org/wiki/Linear_congruential_generator // According to 
this page, these values are the ones used in "glibc" @@ -170,8 +171,8 @@ float NE10_float_rng_max() // the same as above functions except the range of values are limited -#define IS_TOO_SMALL(f) ((f<1.0e-6)?1:0) -#define IS_TOO_BIG(f) ((f>1.0e12)?1:0) +#define IS_TOO_SMALL(f) ((fabs(f)<1.0e-6)?1:0) +#define IS_TOO_BIG(f) ((fabs(f)>1.0e12)?1:0) static NE10_float_rng_t __NE10_float_rng_limit; // local array for internal use only @@ -197,5 +198,34 @@ float NE10_float_rng_limit_max() return NE10_float_rng_max_g(NULL); } +// the same as above functions except the range of values are limited and all the values are greater than 1.0e-6 + +#define IS_TOO_SMALL_GT1(f) ((fabs(f)<1.0e-6)?1:0) +#define IS_TOO_BIG_GT1(f) ((fabs(f)>1.0e+3)?1:0) + +static NE10_float_rng_t __NE10_float_rng_limit_gt1; // local array for internal use only + +void NE10_float_rng_limit_gt1_init(uint32_t seed) +{ + NE10_float_rng_init_g( &__NE10_float_rng_limit_gt1 , seed ); +} + +float NE10_float_rng_limit_gt1_next() +{ + float ret = 0.0f; + + do + { + ret = NE10_float_rng_next_g( &__NE10_float_rng_limit_gt1 ); + } while ( IS_TOO_SMALL_GT1(ret) || IS_TOO_BIG_GT1(ret) ); + + return ret; +} + +float NE10_float_rng_limit_gt1_max() +{ + return NE10_float_rng_max_g(NULL); +} + #endif // NE10_RANDOM diff --git a/headers/unit_test_common.h b/headers/unit_test_common.h index 6e13412..de55e41 100644 --- a/headers/unit_test_common.h +++ b/headers/unit_test_common.h @@ -77,7 +77,7 @@ #define ACCEPTABLE_WARNS 12 #define ACCEPTABLE_WARNS_MATRICES 48 -inline void FILL_FLOAT_ARRAY( float *arr, unsigned int count ) +inline void FILL_FLOAT_ARRAY( arm_float_t *arr, unsigned int count ) { unsigned int i = 0; @@ -91,7 +91,7 @@ inline void FILL_FLOAT_ARRAY( float *arr, unsigned int count ) } } -inline void FILL_FLOAT_ARRAY_LIMIT( float *arr, unsigned int count ) +inline void FILL_FLOAT_ARRAY_LIMIT( arm_float_t *arr, unsigned int count ) { unsigned int i = 0; @@ -101,7 +101,21 @@ inline void FILL_FLOAT_ARRAY_LIMIT( float *arr, 
unsigned int count ) for ( i = 0; i < count; i++ ) { - arr[i] = NE10_float_rng_limit_next(); + arr[ i ] = NE10_float_rng_limit_next(); + } +} + +inline void FILL_FLOAT_ARRAY_LIMIT_GT1( arm_float_t *arr, unsigned int count ) +{ + unsigned int i = 0; + + sleep ( 1 ); + + NE10_float_rng_limit_gt1_init( time(NULL) ); + + for ( i = 0; i < count; i++ ) + { + arr[ i ] = NE10_float_rng_limit_gt1_next(); } } diff --git a/headers/unit_test_xmat_operation_x.h b/headers/unit_test_xmat_operation_x.h index 0949b53..9ae0468 100644 --- a/headers/unit_test_xmat_operation_x.h +++ b/headers/unit_test_xmat_operation_x.h @@ -19,6 +19,7 @@ */ #include "./unit_test_common.h" +#include "../inc/NE10_types.h" // This function signature applies the operations with the format "*c_*_*" (e.g. 'add'c_'float'_'neon') typedef arm_result_t (*arm_func_4args_t)(void * dst, void * src1, void * src2, unsigned int count); @@ -64,12 +65,12 @@ arm_result_t test_operation() guarded_src1 = (arm_float_t*) malloc( (2*ARRAY_GUARD_LEN) + fixed_length ); // 16 extra bytes at the begining and 16 extra bytes at the end GUARD_ARRAY( guarded_src1, (2*ARRAY_GUARD_LEN) + fixed_length ); thesrc1 = (arm_float_t*) ( (void*)guarded_src1 + 16); - FILL_FLOAT_ARRAY( thesrc1, ARRLEN * MAX_VEC_COMPONENTS ); // random initialization + FILL_FLOAT_ARRAY_LIMIT_GT1( thesrc1, ARRLEN * MAX_VEC_COMPONENTS ); // random initialization guarded_src2 = (arm_float_t*) malloc( (2*ARRAY_GUARD_LEN) + fixed_length ); // 16 extra bytes at the begining and 16 extra bytes at the end GUARD_ARRAY( guarded_src2, (2*ARRAY_GUARD_LEN) + fixed_length ); thesrc2 = (arm_float_t*) ( (void*)guarded_src2 + 16); - FILL_FLOAT_ARRAY( thesrc2, ARRLEN * MAX_VEC_COMPONENTS ); // random initialization + FILL_FLOAT_ARRAY_LIMIT_GT1( thesrc2, ARRLEN * MAX_VEC_COMPONENTS ); // random initialization for ( i = 0; i + +arm_result_t mulmat_2x2f_c(arm_mat2x2f_t * dst, arm_mat2x2f_t * src1, arm_mat2x2f_t * src2, unsigned int count) +{ + #define A1 src1[ itr ].c1.r1 + #define 
A2 src2[ itr ].c1.r1 + #define B1 src1[ itr ].c1.r2 + #define B2 src2[ itr ].c1.r2 + #define C1 src1[ itr ].c2.r1 + #define C2 src2[ itr ].c2.r1 + #define D1 src1[ itr ].c2.r2 + #define D2 src2[ itr ].c2.r2 + + NE10_X_OPERATION_FLOAT_C + ( + dst[ itr ].c1.r1 = (A1*A2)+(C1*B2); + dst[ itr ].c1.r2 = (B1*A2)+(D1*B2); + + dst[ itr ].c2.r1 = (A1*C2)+(C1*D2); + dst[ itr ].c2.r2 = (B1*C2)+(D1*D2); + ); + + #undef A1 + #undef A2 + #undef B1 + #undef B2 + #undef C1 + #undef C2 + #undef D1 + #undef D2 +} + +arm_result_t mulmat_3x3f_c(arm_mat3x3f_t * dst, arm_mat3x3f_t * src1, arm_mat3x3f_t * src2, unsigned int count) +{ + #define A1 src1[ itr ].c1.r1 + #define A2 src2[ itr ].c1.r1 + #define B1 src1[ itr ].c1.r2 + #define B2 src2[ itr ].c1.r2 + #define C1 src1[ itr ].c1.r3 + #define C2 src2[ itr ].c1.r3 + #define D1 src1[ itr ].c2.r1 + #define D2 src2[ itr ].c2.r1 + #define E1 src1[ itr ].c2.r2 + #define E2 src2[ itr ].c2.r2 + #define F1 src1[ itr ].c2.r3 + #define F2 src2[ itr ].c2.r3 + #define G1 src1[ itr ].c3.r1 + #define G2 src2[ itr ].c3.r1 + #define H1 src1[ itr ].c3.r2 + #define H2 src2[ itr ].c3.r2 + #define I1 src1[ itr ].c3.r3 + #define I2 src2[ itr ].c3.r3 + + NE10_X_OPERATION_FLOAT_C + ( + dst[ itr ].c1.r1 = (A1*A2)+(D1*B2)+(G1*C2); + dst[ itr ].c1.r2 = (B1*A2)+(E1*B2)+(H1*C2); + dst[ itr ].c1.r3 = (C1*A2)+(F1*B2)+(I1*C2); + + dst[ itr ].c2.r1 = (A1*D2)+(D1*E2)+(G1*F2); + dst[ itr ].c2.r2 = (B1*D2)+(E1*E2)+(H1*F2); + dst[ itr ].c2.r3 = (C1*D2)+(F1*E2)+(I1*F2); + + dst[ itr ].c3.r1 = (A1*G2)+(D1*H2)+(G1*I2); + dst[ itr ].c3.r2 = (B1*G2)+(E1*H2)+(H1*I2); + dst[ itr ].c3.r3 = (C1*G2)+(F1*H2)+(I1*I2); + ); + + #undef A1 + #undef A2 + #undef B1 + #undef B2 + #undef C1 + #undef C2 + #undef D1 + #undef D2 + #undef E1 + #undef E2 + #undef F1 + #undef F2 + #undef G1 + #undef G2 + #undef H1 + #undef H2 + #undef I1 + #undef I2 +} + +arm_result_t mulmat_4x4f_c(arm_mat4x4f_t * dst, arm_mat4x4f_t * src1, arm_mat4x4f_t * src2, unsigned int count) +{ + #define A1 src1[ itr 
].c1.r1 + #define A2 src2[ itr ].c1.r1 + #define B1 src1[ itr ].c1.r2 + #define B2 src2[ itr ].c1.r2 + #define C1 src1[ itr ].c1.r3 + #define C2 src2[ itr ].c1.r3 + #define D1 src1[ itr ].c1.r4 + #define D2 src2[ itr ].c1.r4 + + #define E1 src1[ itr ].c2.r1 + #define E2 src2[ itr ].c2.r1 + #define F1 src1[ itr ].c2.r2 + #define F2 src2[ itr ].c2.r2 + #define G1 src1[ itr ].c2.r3 + #define G2 src2[ itr ].c2.r3 + #define H1 src1[ itr ].c2.r4 + #define H2 src2[ itr ].c2.r4 + + #define I1 src1[ itr ].c3.r1 + #define I2 src2[ itr ].c3.r1 + #define J1 src1[ itr ].c3.r2 + #define J2 src2[ itr ].c3.r2 + #define K1 src1[ itr ].c3.r3 + #define K2 src2[ itr ].c3.r3 + #define L1 src1[ itr ].c3.r4 + #define L2 src2[ itr ].c3.r4 + + #define M1 src1[ itr ].c4.r1 + #define M2 src2[ itr ].c4.r1 + #define N1 src1[ itr ].c4.r2 + #define N2 src2[ itr ].c4.r2 + #define O1 src1[ itr ].c4.r3 + #define O2 src2[ itr ].c4.r3 + #define P1 src1[ itr ].c4.r4 + #define P2 src2[ itr ].c4.r4 + + NE10_X_OPERATION_FLOAT_C + ( + dst[ itr ].c1.r1 = (A1*A2)+(E1*B2)+(I1*C2)+(M1*D2); + dst[ itr ].c1.r2 = (B1*A2)+(F1*B2)+(J1*C2)+(N1*D2); + dst[ itr ].c1.r3 = (C1*A2)+(G1*B2)+(K1*C2)+(O1*D2); + dst[ itr ].c1.r4 = (D1*A2)+(H1*B2)+(L1*C2)+(P1*D2); + + dst[ itr ].c2.r1 = (A1*E2)+(E1*F2)+(I1*G2)+(M1*H2); + dst[ itr ].c2.r2 = (B1*E2)+(F1*F2)+(J1*G2)+(N1*H2); + dst[ itr ].c2.r3 = (C1*E2)+(G1*F2)+(K1*G2)+(O1*H2); + dst[ itr ].c2.r4 = (D1*E2)+(H1*F2)+(L1*G2)+(P1*H2); + + dst[ itr ].c3.r1 = (A1*I2)+(E1*J2)+(I1*K2)+(M1*L2); + dst[ itr ].c3.r2 = (B1*I2)+(F1*J2)+(J1*K2)+(N1*L2); + dst[ itr ].c3.r3 = (C1*I2)+(G1*J2)+(K1*K2)+(O1*L2); + dst[ itr ].c3.r4 = (D1*I2)+(H1*J2)+(L1*K2)+(P1*L2); + + dst[ itr ].c4.r1 = (A1*M2)+(E1*N2)+(I1*O2)+(M1*P2); + dst[ itr ].c4.r2 = (B1*M2)+(F1*N2)+(J1*O2)+(N1*P2); + dst[ itr ].c4.r3 = (C1*M2)+(G1*N2)+(K1*O2)+(O1*P2); + dst[ itr ].c4.r4 = (D1*M2)+(H1*N2)+(L1*O2)+(P1*P2); + ); + + #undef A1 + #undef A2 + #undef B1 + #undef B2 + #undef C1 + #undef C2 + #undef D1 + #undef D2 + #undef E1 + 
#undef E2 + #undef F1 + #undef F2 + #undef G1 + #undef G2 + #undef H1 + #undef H2 + #undef I1 + #undef I2 + #undef J1 + #undef J2 + #undef K1 + #undef K2 + #undef L1 + #undef L2 + #undef M1 + #undef M2 + #undef N1 + #undef N2 + #undef O1 + #undef O2 + #undef P1 + #undef P2 +} diff --git a/source/NE10_mulmat.neon.s b/source/NE10_mulmat.neon.s new file mode 100644 index 0000000..0711990 --- /dev/null +++ b/source/NE10_mulmat.neon.s @@ -0,0 +1,518 @@ +@ COPYRIGHT NOTICE TBD NOT FOR RELEASE + + .text + .syntax unified + +.include "headers/NE10header.s" + + + + + .balign 4 + .global mulmat_2x2f_neon + .thumb + .thumb_func + +mulmat_2x2f_neon: + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ + @ arm_result_t mulmat_2x2f(arm_mat2x2f_t * dst, + @ arm_mat2x2f_t * src1, + @ arm_mat2x2f_t * src2, + @ unsigned int count) + @ + @ r0: *dst & current dst entry's address + @ r1: *src1 & current src1 entry's address + @ r2: *src2 & current src2 entry's address + @ r3: int count & the number of items in the input array that can be + @ processed in chunks of 4 vectors + @ + @ r4: the number of items that are left to be processed at the end of + @ the input array + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + + push {r4} + and r4, r3, #3 @ r4 = count % 4; + sub r3, r3, r4 @ count = count - r3; This is what's left to be processed after this loop + + cmp r3, #0 + beq .L_check_mat2x2 + + @ We load four 2x2 matrices at a time, multiply them to + @ get two resulting 2x2 matrices, store them in the destination + @ and then move on to the next four matrices. + + @ load the 1st set of values + vld4.32 { d0, d1, d2, d3 }, [r1]! + vld4.32 { d4, d5, d6, d7 }, [r2]! 
+ subs r3, r3, #4 @ 2 for this set, and 2 for the 2nd set + + @ calculate values for the 1st set + vmul.f32 d16, d0, d4 + vmul.f32 d17, d1, d4 + vmul.f32 d18, d0, d6 + vmul.f32 d19, d1, d6 + + vmla.f32 d16, d2, d5 + vmla.f32 d17, d3, d5 + vmla.f32 d18, d2, d7 + vmla.f32 d19, d3, d7 + + + @ load the 2nd set of values + vld4.32 { d0, d1, d2, d3 }, [r1]! + vld4.32 { d4, d5, d6, d7 }, [r2]! + + ble .L_mainloopend_mat2x2 + +.L_mainloop_mat2x2: + @ store the result for the 1st/next (e.g. 3rd) set + vst4.32 { d16, d17, d18, d19}, [r0]! + + @ calculate values for the 2nd/next (e.g. 3rd) set + vmul.f32 d16, d0, d4 + vmul.f32 d17, d1, d4 + vmul.f32 d18, d0, d6 + vmul.f32 d19, d1, d6 + + vmla.f32 d16, d2, d5 + vmla.f32 d17, d3, d5 + vmla.f32 d18, d2, d7 + vmla.f32 d19, d3, d7 + + @ load the next (e.g. 3rd) set of values + subs r3, r3, #2 + vld4.32 { d0, d1, d2, d3 }, [r1]! + vld4.32 { d4, d5, d6, d7 }, [r2]! + + + bgt .L_mainloop_mat2x2 @ loop if r2 is > r3, if we have at least another 4 vectors (8 floats) to process + +.L_mainloopend_mat2x2: + @ the last iteration for this call + @ store the result for the set of values before the last one (e.g 2nd set) + vst4.32 { d16, d17, d18, d19}, [r0]! + + @ calculate values for the last (e.g. 3rd) set + vmul.f32 d16, d0, d4 + vmul.f32 d17, d1, d4 + vmul.f32 d18, d0, d6 + vmul.f32 d19, d1, d6 + + vmla.f32 d16, d2, d5 + vmla.f32 d17, d3, d5 + vmla.f32 d18, d2, d7 + vmla.f32 d19, d3, d7 + + @ store the result for the last (e.g. 3rd) set + vst4.32 { d16, d17, d18, d19}, [r0]! + + +.L_check_mat2x2: + @ check if anything left to process at the end of the input array + cmp r4, #0 + ble .L_return_mat2x2 + +.L_secondloop_mat2x2: + @ process the last few items left in the input array + vld4.32 { d0[0], d1[0], d2[0], d3[0] }, [r1]! + vld4.32 { d4[0], d5[0], d6[0], d7[0] }, [r2]! 
+ + subs r4, r4, #1 + + @ calculate values + vmul.f32 d16, d0, d4 + vmul.f32 d17, d1, d4 + vmul.f32 d18, d0, d6 + vmul.f32 d19, d1, d6 + + vmla.f32 d16, d2, d5 + vmla.f32 d17, d3, d5 + vmla.f32 d18, d2, d7 + vmla.f32 d19, d3, d7 + + vst4.32 { d16[0], d17[0], d18[0], d19[0] }, [r0]! + + bgt .L_secondloop_mat2x2 + +.L_return_mat2x2: + @ return + pop {r4} + mov r0, #0 + bx lr + + + + + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ A macro to load four 3x3 matrices, two from the first source which + @ according to the function signatures is src1 (r1) and + @ another two from the second source which is src2 (r2) + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + .macro LOAD_3x3MATS + + # load two 3x3 matrices from src1 + vld1.32 { q0-q1 }, [r1]! + vld1.32 { d8[0] }, [r1]! + vld1.32 { q2-q3 }, [r1]! + vld1.32 { d8[1] }, [r1]! + + # load two 3x3 matrices from src2 + vld1.32 { q8-q9 }, [r2]! + vld1.32 { d9[0] }, [r2]! + vld1.32 { q10-q11 }, [r2]! + vld1.32 { d9[1] }, [r2]! + + + # rearrange them both + vtrn.32 q0, q2 + vtrn.32 q1, q3 + + vtrn.32 q8, q10 + vtrn.32 q9, q11 + + .endm + + + + + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ This macro multiplies two pairs of 3x3 matrices that were + @ loaded using the above LOAD_3x3MATS macro in registers q0-q11. 
+ @ The two resulting matrices are returned in q12, q13, q14, q15, & d9 + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + .macro MULTIPLY_3x3MATS + + @ a = d0 & d16 + @ b = d4 & d20 + @ c = d1 & d17 + @ d = d5 & d21 + @ e = d2 & d18 + @ f = d6 & d22 + @ g = d3 & d19 + @ h = d7 & d23 + @ i = d8 & d9 + + vmul.f32 d24, d0, d16 + vmul.f32 d28, d4, d16 + vmul.f32 d25, d1, d16 + vmul.f32 d29, d0, d21 + vmul.f32 d26, d4, d21 + vmul.f32 d30, d1, d21 + vmul.f32 d27, d0, d19 + vmul.f32 d31, d4, d19 + vmul.f32 d10, d1, d19 + + vmla.f32 d24, d5, d20 + vmla.f32 d28, d2, d20 + vmla.f32 d25, d6, d20 + vmla.f32 d29, d5, d18 + vmla.f32 d26, d2, d18 + vmla.f32 d30, d6, d18 + vmla.f32 d27, d5, d23 + vmla.f32 d31, d2, d23 + vmla.f32 d10, d6, d23 + + vmla.f32 d24, d3, d17 + vmla.f32 d28, d7, d17 + vmla.f32 d25, d8, d17 + vmla.f32 d29, d3, d22 + vmla.f32 d26, d7, d22 + vmla.f32 d30, d8, d22 + vmla.f32 d27, d3, d9 + vmla.f32 d31, d7, d9 + vmla.f32 d10, d8, d9 + + .endm + + + + + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ A macro to store the two resulting 3x3 matrices from + @ the above MULTIPLY_3x3MATS macro (q12-q15, & d9 are stored) + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + .macro STORE_3x3MATS + + # rearrange them both + vtrn.32 q12, q14 + vtrn.32 q13, q15 + + # store two 3x3 matrices to dst + vst1.32 { q12-q13 }, [r0]! + vst1.32 { d10[0] }, [r0]! + vst1.32 { q14-q15 }, [r0]! + vst1.32 { d10[1] }, [r0]! 
+ + .endm + + + + + .align 2 + .global mulmat_3x3f_neon + .thumb + .thumb_func +mulmat_3x3f_neon: + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ + @ arm_result_t mulmat_3x3f(arm_mat3x3f_t * dst, + @ arm_mat3x3f_t * src1, + @ arm_mat3x3f_t * src2, + @ unsigned int count) + @ + @ r0: *dst & current dst entry's address + @ r1: *src1 & current src1 entry's address + @ r2: *src2 & current src2 entry's address + @ r3: int count & the number of items in the input array that can be + @ processed in chunks of 4 vectors + @ + @ r4: the number of items that are left to be processed at the end of + @ the input array + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + + push { r4 } + vpush { d8, d9, d10 } + and r4, r3, #3 @ r3 = count % 4; + sub r3, r3, r4 @ count = count - r3; This is what's left to be processed after this loop + + cmp r3, #0 + beq .L_check_mat3x3 + + @ load the 1st set of values + LOAD_3x3MATS + subs r3, r3, #4 @ 2 for this set, and 2 for the 2nd set + + @ calculate values for the 1st set + MULTIPLY_3x3MATS + + @ load the 2nd set of values + LOAD_3x3MATS + ble .L_mainloopend_mat3x3 + +.L_mainloop_mat3x3: + @ store the result for the 1st/next (e.g. 3rd) set + STORE_3x3MATS + + @ calculate values for the 2nd/next (e.g. 3rd) set + MULTIPLY_3x3MATS + + @ load the next (e.g. 3rd) set of values + LOAD_3x3MATS + + subs r3, r3, #2 + + bgt .L_mainloop_mat3x3 @ loop if r2 is > r3, if we have at least another 4 vectors (12 floats) to process + +.L_mainloopend_mat3x3: + @ the last iteration for this call + @ store the result for the set of values before the last one (e.g 2nd set) + STORE_3x3MATS + + @ calculate values for the last (e.g. 3rd) set + MULTIPLY_3x3MATS + + @ store the result for the last (e.g. 
3rd) set + STORE_3x3MATS + +.L_check_mat3x3: + @ check if anything left to process at the end of the input array + cmp r4, #0 + ble .L_return_mat3x3 + +.L_secondloop_mat3x3: + @ process the last few items left in the input array + @ load the next (e.g. 3rd) set of values + vld1.32 { q0-q1 }, [r1]! + vld1.32 { d8[0] }, [r1]! + vld1.32 { q8-q9 }, [r2]! + vld1.32 { d9[0] }, [r2]! + + vtrn.32 q0, q2 + vtrn.32 q1, q3 + + vtrn.32 q8, q10 + vtrn.32 q9, q11 + + subs r4, r4, #1 + + @ calculate values for the last (e.g. 3rd) set + MULTIPLY_3x3MATS + + @ store the result for the last (e.g. 3rd) set + vtrn.32 q12, q14 + vtrn.32 q13, q15 + + vst1.32 { q12-q13 }, [r0]! + vst1.32 { d10[0] }, [r0]! + + + bgt .L_secondloop_mat3x3 + +.L_return_mat3x3: + @ return + vpop { d8, d9, d10 } + pop { r4 } + mov r0, #0 + bx lr + + + + + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ A macro to load a pair of 4x4 matrices from src1 (r1) and + @ src2 (r2) into registers q0-q3 & q8-q11. + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + .macro LOAD_4x4MATS + + # load a 4x4 matrix from src1 + vld1.32 { q8-q9 }, [r1]! + vld1.32 {q10-q11}, [r1]! + + # load a 4x4 matrix from src2 + vld1.32 {q0-q1}, [r2]! + vld1.32 {q2-q3}, [r2]! + .endm + + + + + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ This macro multiplies the two 4x4 matrices loaded in the + @ above LOAD_4x4MATS macro and returns the resulting 4x4 + @ matrix in q12-q15. 
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + .macro MULTIPLY_4x4MATS + + vmul.f32 q12, q8, d0[0] + vmul.f32 q13, q8, d2[0] + vmul.f32 q14, q8, d4[0] + vmul.f32 q15, q8, d6[0] + + vmla.f32 q12, q9, d0[1] + vmla.f32 q13, q9, d2[1] + vmla.f32 q14, q9, d4[1] + vmla.f32 q15, q9, d6[1] + + + vmla.f32 q12, q10, d1[0] + vmla.f32 q13, q10, d3[0] + vmla.f32 q14, q10, d5[0] + vmla.f32 q15, q10, d7[0] + + vmla.f32 q12, q11, d1[1] + vmla.f32 q13, q11, d3[1] + vmla.f32 q14, q11, d5[1] + vmla.f32 q15, q11, d7[1] + + .endm + + + + + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ This macro stores the resulting 4x4 matrix which is + @ returned by the above MULTIPLY_4x4MATS macro from registers + @ q12-q15 into the dst (r0). + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + .macro STORE_4x4MATS + + # store two 3x3 matrices to dst + vst1.32 { q12-q13 }, [r0]! + vst1.32 { q14-q15 }, [r0]! + + .endm + + + + + .align 2 + .global mulmat_4x4f_neon + .thumb + .thumb_func +mulmat_4x4f_neon: + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ + @ arm_result_t mulmat_4x4f(arm_mat4x4f_t * dst, + @ arm_mat4x4f_t * src1, + @ arm_mat4x4f_t * src2, + @ unsigned int count) + @ + @ r0: *dst & current dst entry's address + @ r1: *src1 & current src1 entry's address + @ r2: *src2 & current src2 entry's address + @ r3: int count & the number of items in the input array that can be + @ processed in chunks of 4 vectors + @ + @ r4: the number of items that are left to be processed at the end of + @ the input array + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + + push {r4} + and r4, r3, #3 @ r4 = count % 4; + sub r3, r3, r4 @ count = count - r4; This is what's left to be processed after this loop + + cmp r3, #0 + beq .L_check_mat4x4 + + @ load the 1st set of values + LOAD_4x4MATS + + subs r3, r3, #2 + + @ calculate values for the 1st set + MULTIPLY_4x4MATS + + @ load the 2nd 
set of values + LOAD_4x4MATS + + ble .L_mainloopend_mat4x4 + +.L_mainloop_mat4x4: + @ store the result for the 1st/next (e.g. 3rd) set + STORE_4x4MATS + + @ calculate values for the 2nd/next (e.g. 3rd) set + MULTIPLY_4x4MATS + + @ load the next (e.g. 3rd) set of values + subs r3, r3, #1 + LOAD_4x4MATS + + bgt .L_mainloop_mat4x4 @ loop if r2 is > r3, if we have at least another 4 vectors (16 floats) to process + +.L_mainloopend_mat4x4: + @ the last iteration for this call + @ store the result for the set of values before the last one (e.g 2nd set) + STORE_4x4MATS + + @ calculate values for the last (e.g. 3rd) set + MULTIPLY_4x4MATS + + @ store the result for the last (e.g. 3rd) set + STORE_4x4MATS + +.L_check_mat4x4: + @ check if anything left to process at the end of the input array + cmp r4, #0 + ble .L_return_mat4x4 + +.L_secondloop_mat4x4: + @ process the last few items left in the input array + LOAD_4x4MATS + + subs r4, r4, #1 + + @ calculate values + MULTIPLY_4x4MATS + + @ store the results + STORE_4x4MATS + + bgt .L_secondloop_mat4x4 + +.L_return_mat4x4: + @ return + pop {r4} + mov r0, #0 + bx lr diff --git a/source/NE10_mulmat_test.c b/source/NE10_mulmat_test.c new file mode 100644 index 0000000..a62e5ec --- /dev/null +++ b/source/NE10_mulmat_test.c @@ -0,0 +1,64 @@ +/* + * Copyright 2011-12 ARM Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NE10 Library : source/NE10_mulmat_test.c + */ + +// TODO: Apply some sort of self-verifying method to the test system. + +//Make sure the following are defined before including "unit_test.h" + +// length of the data arrays +#define ARRLEN TEST_ARRLEN_MATRICES +// number of the operations in a given unit +#define OP_COUNT 3 +// number of the different implementations of each of the functions (C, ASM, NEON, ...) +#define IMPL_COUNT 3 + + +#include "../headers/unit_test_xmat_operation_x.h" + +extern arm_result_t mulmat_2x2f_c (arm_mat2x2f_t * dst, arm_mat2x2f_t * src1, arm_mat2x2f_t * src2, unsigned int count); +extern arm_result_t mulmat_2x2f_neon(arm_mat2x2f_t * dst, arm_mat2x2f_t * src1, arm_mat2x2f_t * src2, unsigned int count); + +extern arm_result_t mulmat_3x3f_c (arm_mat3x3f_t * dst, arm_mat3x3f_t * src1, arm_mat3x3f_t * src2, unsigned int count); +extern arm_result_t mulmat_3x3f_neon(arm_mat3x3f_t * dst, arm_mat3x3f_t * src1, arm_mat3x3f_t * src2, unsigned int count); + +extern arm_result_t mulmat_4x4f_c (arm_mat4x4f_t * dst, arm_mat4x4f_t * src1, arm_mat4x4f_t * src2, unsigned int count); +extern arm_result_t mulmat_4x4f_neon(arm_mat4x4f_t * dst, arm_mat4x4f_t * src1, arm_mat4x4f_t * src2, unsigned int count); + +void init_ftbl() +{ + // manually initialize the global function table with + // those functions that do have an actual implementation. 
+ ftbl[ 0] = (arm_func_4args_t) mulmat_2x2f_c; + ftbl[ 1] = (arm_func_4args_t) mulmat_2x2f_c; // using the c version in place of the assembly version + ftbl[ 2] = (arm_func_4args_t) mulmat_2x2f_neon; + + ftbl[ 3] = (arm_func_4args_t) mulmat_3x3f_c; + ftbl[ 4] = (arm_func_4args_t) mulmat_3x3f_c; // using the c version in place of the assembly version + ftbl[ 5] = (arm_func_4args_t) mulmat_3x3f_neon; + + ftbl[ 6] = (arm_func_4args_t) mulmat_4x4f_c; + ftbl[ 7] = (arm_func_4args_t) mulmat_4x4f_c; // using the c version in place of the assembly version + ftbl[ 8] = (arm_func_4args_t) mulmat_4x4f_neon; +} + +arm_result_t main( int argc, char **argv ) +{ + return run_test( argc, argv ); // defined in "unit_test.h" +}