From 91e7e0eb445f3363f72d795b9caaac9205404dd2 Mon Sep 17 00:00:00 2001 From: Ramin Zaghi Date: Tue, 3 Apr 2012 10:52:18 +0000 Subject: [PATCH] the new vectorized division routines. --- source/NE10_div.c | 30 +++ source/NE10_div.neon.c | 46 ---- source/NE10_div.neon.s | 640 +++++++++++++++++++++++++++++++++++++++++++++++++ source/NE10_div_test.c | 34 ++- 4 files changed, 701 insertions(+), 49 deletions(-) delete mode 100644 source/NE10_div.neon.c create mode 100644 source/NE10_div.neon.s diff --git a/source/NE10_div.c b/source/NE10_div.c index 1348de0..281a42d 100644 --- a/source/NE10_div.c +++ b/source/NE10_div.c @@ -30,3 +30,33 @@ arm_result_t div_float_c(arm_float_t * dst, arm_float_t * src1, arm_float_t * sr dst[ itr ] = src1[ itr ] / src2[ itr ]; ); } + +arm_result_t vdiv_vec2f_c(arm_vec2f_t * dst, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count) +{ + NE10_X_OPERATION_FLOAT_C + ( + dst[ itr ].x = src1[ itr ].x / src2[ itr ].x; + dst[ itr ].y = src1[ itr ].y / src2[ itr ].y; + ); +} + +arm_result_t vdiv_vec3f_c(arm_vec3f_t * dst, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count) +{ + NE10_X_OPERATION_FLOAT_C + ( + dst[ itr ].x = src1[ itr ].x / src2[ itr ].x; + dst[ itr ].y = src1[ itr ].y / src2[ itr ].y; + dst[ itr ].z = src1[ itr ].z / src2[ itr ].z; + ); +} + +arm_result_t vdiv_vec4f_c(arm_vec4f_t * dst, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count) +{ + NE10_X_OPERATION_FLOAT_C + ( + dst[ itr ].x = src1[ itr ].x / src2[ itr ].x; + dst[ itr ].y = src1[ itr ].y / src2[ itr ].y; + dst[ itr ].z = src1[ itr ].z / src2[ itr ].z; + dst[ itr ].w = src1[ itr ].w / src2[ itr ].w; + ); +} diff --git a/source/NE10_div.neon.c b/source/NE10_div.neon.c deleted file mode 100644 index df90a6a..0000000 --- a/source/NE10_div.neon.c +++ /dev/null @@ -1,46 +0,0 @@ -/* - * Copyright 2011-12 ARM Limited - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * NE10 Library : source/NE10_div.neon.c - */ - -#include "NE10.h" -#include "../headers/macros.h" - -#include -#include - - #include - #include - -arm_result_t div_float_neon(arm_float_t * dst, arm_float_t * src1, arm_float_t * src2, unsigned int count) -{ - NE10_X_OPERATION_FLOAT_NEON - ( - /* a single division operation */ - float32x4_t rec = vrecpeq_f32( n_src2 ); - rec = vmulq_f32(vrecpsq_f32(n_src2, rec), rec); - rec = vmulq_f32(vrecpsq_f32(n_src2, rec), rec); - n_dst = vmulq_f32( n_src , rec ); - , - /* a single division operation */ - float32x2_t rec = vrecpe_f32( n_tmp_src2 ); - rec = vmul_f32(vrecps_f32(n_tmp_src2, rec), rec); - rec = vmul_f32(vrecps_f32(n_tmp_src2, rec), rec); - n_tmp_src = vmul_f32( n_tmp_src, rec ); - ); -} diff --git a/source/NE10_div.neon.s b/source/NE10_div.neon.s new file mode 100644 index 0000000..039f349 --- /dev/null +++ b/source/NE10_div.neon.s @@ -0,0 +1,640 @@ +@ +@ Copyright 2011-12 ARM Limited +@ +@ Licensed under the Apache License, Version 2.0 (the "License"); +@ you may not use this file except in compliance with the License. +@ You may obtain a copy of the License at +@ +@ http://www.apache.org/licenses/LICENSE-2.0 +@ +@ Unless required by applicable law or agreed to in writing, software +@ distributed under the License is distributed on an "AS IS" BASIS, +@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@ See the License for the specific language governing permissions and +@ limitations under the License. +@ + +@ +@ NE10 Library : source/NE10_div.neon.s +@ + + .text + .syntax unified + +.include "headers/NE10header.s" + + + + + .balign 4 + .global div_float_neon + .thumb + .thumb_func + +div_float_neon: + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ + @ arm_result_t div_float(arm_float_t * dst, + @ arm_float_t * src1, + @ arm_float_t * src2, + @ unsigned int count) + @ + @ r0: *dst & current dst entry's address + @ r1: *src1 & current src1 entry's address + @ r2: *src2 & current src2 entry's address + @ r3: int count & the number of items in the input array that can be + @ processed in chunks of 4 vectors + @ + @ r4: the number of items that are left to be processed at the end of + @ the input array + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + + push {r4} + and r4, r3, #3 @ r4 = count % 4; + sub r3, r3, r4 @ count = count - r4; This is what's left to be processed after this loop + + cbz r3, .L_check_float + + @ load the 1st set of values + vld1.32 {q0}, [r1]! + vld1.32 {q1}, [r2]! + subs r3, r3, #8 @ 4 for this set, and 4 for the 2nd set + + @ calculate values for the 1st set + vrecpe.f32 q3, q1 + vrecps.f32 q1, q3, q1 + vmul.f32 q3, q1, q3 + vmul.f32 q3, q0, q3 + + + @ load the 2nd set of values + vld1.32 {q0}, [r1]! + vld1.32 {q1}, [r2]! + + ble .L_mainloopend_float + +.L_mainloop_float: + @ store the result for the 1st/next (e.g. 3rd) set + vst1.32 {d6,d7}, [r0]! + + @ calculate values for the 2nd/next (e.g. 3rd) set + vrecpe.f32 q3, q1 + vrecps.f32 q1, q3, q1 + vmul.f32 q3, q1, q3 + vmul.f32 q3, q0, q3 + + @ load the next (e.g. 3rd) set of values + vld1.32 {q0}, [r1]! + vld1.32 {q1}, [r2]! + subs r3, r3, #4 + + bgt .L_mainloop_float @ loop if r2 is > r3, if we have at least another 4 floats + +.L_mainloopend_float: + @ the last iteration for this call + @ store the result for the set of values before the last one (e.g 2nd set) + vst1.32 {d6,d7}, [r0]! + + @ calculate values for the last (e.g. 3rd) set + vrecpe.f32 q3, q1 + vrecps.f32 q1, q3, q1 + vmul.f32 q3, q1, q3 + vmul.f32 q3, q0, q3 + + @ store the result for the last (e.g. 3rd) set + vst1.32 {d6,d7}, [r0]! + + +.L_check_float: + @ check if anything left to process at the end of the input array + cmp r4, #0 + ble .L_return_float + +.L_secondloop_float: + @ process the last few items left in the input array + vld1.f32 d0[0], [r1]! @ Fill in d0[0] + vld1.f32 d1[0], [r2]! @ Fill in d1[1] + + + subs r4, r4, #1 + + @ values d0 = d0 / d1 + vrecpe.f32 d3, d1 + vrecps.f32 d1, d3, d1 + vmul.f32 d3, d1, d3 + vmul.f32 d0, d0, d3 + + vst1.32 {d0[0]}, [r0]! + + bgt .L_secondloop_float + +.L_return_float: + @ return + pop {r4} + mov r0, #0 + bx lr + + + + + .balign 4 + .global vdiv_vec2f_neon + .thumb + .thumb_func + +vdiv_vec2f_neon: + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ + @ arm_result_t div_float(arm_vec2f_t * dst, + @ arm_vec2f_t * src1, + @ arm_vec2f_t * src2, + @ unsigned int count) + @ + @ r0: *dst & current dst entry's address + @ r1: *src1 & current src1 entry's address + @ r2: *src2 & current src2 entry's address + @ r3: int count & the number of items in the input array that can be + @ processed in chunks of 4 vectors + @ + @ r4: the number of items that are left to be processed at the end of + @ the input array + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + + push {r4} + and r4, r3, #3 @ r4 = count % 4; + sub r3, r3, r4 @ count = count - r3; This is what's left to be processed after this loop + + cmp r3, #0 + beq .L_check_vec2 + + @ load the 1st set of values + vld2.32 {q0-q1}, [r1]! + vld2.32 {q2-q3}, [r2]! + subs r3, r3, #8 @ 4 for this set, and 4 for the 2nd set + + @ calculate values for the 1st set + @ q8 = q0 / q2 + vrecpe.f32 q8, q2 + vrecps.f32 q2, q8, q2 + vmul.f32 q8, q2, q8 + vmul.f32 q8, q0, q8 + + @ q9 = q1 / q3 + vrecpe.f32 q9, q3 + vrecps.f32 q3, q9, q3 + vmul.f32 q9, q3, q9 + vmul.f32 q9, q1, q9 + + @ load the 2nd set of values + vld2.32 {q0-q1}, [r1]! + vld2.32 {q2-q3}, [r2]! + + ble .L_mainloopend_vec2 + +.L_mainloop_vec2: + @ store the result for the 1st/next (e.g. 3rd) set + vst2.32 {d16,d17,d18,d19}, [r0]! + + @ calculate values for the 2nd/next (e.g. 3rd) set + @ q8 = q0 / q2 + vrecpe.f32 q8, q2 + vrecps.f32 q2, q8, q2 + vmul.f32 q8, q2, q8 + vmul.f32 q8, q0, q8 + + @ q9 = q1 / q3 + vrecpe.f32 q9, q3 + vrecps.f32 q3, q9, q3 + vmul.f32 q9, q3, q9 + vmul.f32 q9, q1, q9 + + @ load the next (e.g. 3rd) set of values + vld2.32 {q0-q1}, [r1]! + vld2.32 {q2-q3}, [r2]! + subs r3, r3, #4 + + bgt .L_mainloop_vec2 @ loop if r2 is > r3, if we have at least another 4 vectors (8 floats) to process + +.L_mainloopend_vec2: + @ the last iteration for this call + @ store the result for the set of values before the last one (e.g 2nd set) + vst2.32 {d16,d17,d18,d19}, [r0]! + + @ calculate values for the last (e.g. 3rd) set + @ q8 = q0 / q2 + vrecpe.f32 q8, q2 + vrecps.f32 q2, q8, q2 + vmul.f32 q8, q2, q8 + vmul.f32 q8, q0, q8 + + @ q9 = q1 / q3 + vrecpe.f32 q9, q3 + vrecps.f32 q3, q9, q3 + vmul.f32 q9, q3, q9 + vmul.f32 q9, q1, q9 + + @ store the result for the last (e.g. 3rd) set + vst2.32 {d16,d17,d18,d19}, [r0]! + +.L_check_vec2: + @ check if anything left to process at the end of the input array + cmp r4, #0 + ble .L_return_vec2 + +.L_secondloop_vec2: + @ process the last few items left in the input array + vld1.f32 d0, [r1]! + vld1.f32 d1, [r2]! + + subs r4, r4, #1 + + @ calculate values + @ d0 = d0 / d1 + vrecpe.f32 d4, d1 + vrecps.f32 d1, d4, d1 + vmul.f32 d4, d1, d4 + vmul.f32 d0, d0, d4 + + vst1.32 {d0}, [r0]! + + bgt .L_secondloop_vec2 + +.L_return_vec2: + @ return + pop {r4} + mov r0, #0 + bx lr + + + + + .align 2 + .global vdiv_vec3f_neon + .thumb + .thumb_func +vdiv_vec3f_neon: + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ + @ arm_result_t div_float(arm_vec3f_t * dst, + @ arm_vec3f_t * src1, + @ arm_vec3f_t * src2, + @ unsigned int count) + @ + @ r0: *dst & current dst entry's address + @ r1: *src1 & current src1 entry's address + @ r2: *src2 & current src2 entry's address + @ r3: int count & the number of items in the input array that can be + @ processed in chunks of 4 vectors + @ + @ r4: the number of items that are left to be processed at the end of + @ the input array + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + + push {r4} + and r4, r3, #3 @ r3 = count % 4; + sub r3, r3, r4 @ count = count - r3; This is what's left to be processed after this loop + + cmp r3, #0 + beq .L_check_vec3 + + @ load the 1st set of values + vld3.32 {d0, d2, d4}, [r1]! + vld3.32 {d1, d3, d5}, [r1]! + vld3.32 {d18, d20, d22}, [r2]! + vld3.32 {d19, d21, d23}, [r2]! + subs r3, r3, #8 @ 4 for this set, and 4 for the 2nd set + + @ calculate values for the 1st set + @ q12 = q0 / q9 + vrecpe.f32 q12, q9 + vrecps.f32 q9 , q12, q9 + vmul.f32 q12, q9 , q12 + vmul.f32 q12, q0 , q12 + + @ q13 = q1 / q10 + vrecpe.f32 q13, q10 + vrecps.f32 q10 , q13, q10 + vmul.f32 q13, q10 , q13 + vmul.f32 q13, q1 , q13 + + @ q14 = q2 / q11 + vrecpe.f32 q14, q11 + vrecps.f32 q11 , q14, q11 + vmul.f32 q14, q11 , q14 + vmul.f32 q14, q2 , q14 + + @ load the 2nd set of values + vld3.32 {d0, d2, d4}, [r1]! + vld3.32 {d1, d3, d5}, [r1]! + vld3.32 {d18, d20, d22}, [r2]! + vld3.32 {d19, d21, d23}, [r2]! + + ble .L_mainloopend_vec3 + +.L_mainloop_vec3: + @ store the result for the 1st/next (e.g. 3rd) set + vst3.32 {d24, d26, d28}, [r0]! + vst3.32 {d25, d27, d29}, [r0]! + + @ calculate values for the 2nd/next (e.g. 3rd) set + @ q12 = q0 / q9 + vrecpe.f32 q12, q9 + vrecps.f32 q9 , q12, q9 + vmul.f32 q12, q9 , q12 + vmul.f32 q12, q0 , q12 + + @ q13 = q1 / q10 + vrecpe.f32 q13, q10 + vrecps.f32 q10 , q13, q10 + vmul.f32 q13, q10 , q13 + vmul.f32 q13, q1 , q13 + + @ q14 = q2 / q11 + vrecpe.f32 q14, q11 + vrecps.f32 q11 , q14, q11 + vmul.f32 q14, q11 , q14 + vmul.f32 q14, q2 , q14 + + @ load the next (e.g. 3rd) set of values + vld3.32 {d0, d2, d4}, [r1]! + vld3.32 {d1, d3, d5}, [r1]! + vld3.32 {d18, d20, d22}, [r2]! + vld3.32 {d19, d21, d23}, [r2]! + subs r3, r3, #4 + + bgt .L_mainloop_vec3 @ loop if r2 is > r3, if we have at least another 4 vectors (12 floats) to process + +.L_mainloopend_vec3: + @ the last iteration for this call + @ store the result for the set of values before the last one (e.g 2nd set) + vst3.32 {d24, d26, d28}, [r0]! + vst3.32 {d25, d27, d29}, [r0]! + + @ calculate values for the last (e.g. 3rd) set + @ q12 = q0 / q9 + vrecpe.f32 q12, q9 + vrecps.f32 q9 , q12, q9 + vmul.f32 q12, q9 , q12 + vmul.f32 q12, q0 , q12 + + @ q13 = q1 / q10 + vrecpe.f32 q13, q10 + vrecps.f32 q10 , q13, q10 + vmul.f32 q13, q10 , q13 + vmul.f32 q13, q1 , q13 + + @ q14 = q2 / q11 + vrecpe.f32 q14, q11 + vrecps.f32 q11 , q14, q11 + vmul.f32 q14, q11 , q14 + vmul.f32 q14, q2 , q14 + + @ store the result for the last (e.g. 3rd) set + vst3.32 {d24, d26, d28}, [r0]! + vst3.32 {d25, d27, d29}, [r0]! + +.L_check_vec3: + @ check if anything left to process at the end of the input array + cmp r4, #0 + ble .L_return_vec3 + +.L_secondloop_vec3: + @ process the last few items left in the input array + vld3.f32 {d0[0], d2[0], d4[0]}, [r1]! @ The values are loaded like so: + @ q0 = { V1.x, -, -, - }; + @ q1 = { V1.y, -, -, - }; + @ q2 = { V1.z, -, -, - }; + vld3.f32 {d1[0], d3[0], d5[0]}, [r2]! @ The values are loaded like so: + @ q0 = { V1.x, -, V2.x, - }; + @ q1 = { V1.y, -, V2.y, - }; + @ q2 = { V1.z, -, V2.z, - }; + + subs r4, r4, #1 + + @ calculate values for + vrecpe.f32 d18, d1 + vrecps.f32 d1 , d18, d1 + vmul.f32 d18, d1 , d18 + vmul.f32 d0 , d0 , d18 + + vrecpe.f32 d20, d3 + vrecps.f32 d3 , d20, d3 + vmul.f32 d20, d3 , d20 + vmul.f32 d2 , d2 , d20 + + vrecpe.f32 d22, d5 + vrecps.f32 d5 , d22, d5 + vmul.f32 d22, d5 , d22 + vmul.f32 d4 , d4 , d22 + + vst3.32 {d0[0], d2[0], d4[0]}, [r0]! + + bgt .L_secondloop_vec3 + +.L_return_vec3: + @ return + pop {r4} + mov r0, #0 + bx lr + + + + + .align 2 + .global vdiv_vec4f_neon + .thumb + .thumb_func +vdiv_vec4f_neon: + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ + @ arm_result_t div_float(arm_vec4f_t * dst, + @ arm_vec4f_t * src1, + @ arm_vec4f_t * src2, + @ unsigned int count) + @ + @ r0: *dst & current dst entry's address + @ r1: *src1 & current src1 entry's address + @ r2: *src2 & current src2 entry's address + @ r3: int count & the number of items in the input array that can be + @ processed in chunks of 4 vectors + @ + @ r4: the number of items that are left to be processed at the end of + @ the input array + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + + push {r4} + and r4, r3, #3 @ r4 = count % 4; + sub r3, r3, r4 @ count = count - r4; This is what's left to be processed after this loop + + cmp r3, #0 + beq .L_check_vec4 + + @ load the 1st set of values + vld4.32 {d0, d2, d4, d6}, [r1]! + vld4.32 {d1, d3, d5, d7}, [r1]! + vld4.32 {d16, d18, d20, d22}, [r2]! + vld4.32 {d17, d19, d21, d23}, [r2]! + + subs r3, r3, #8 @ 4 for this set, and 4 for the 2nd set + + @ calculate values for the 1st set + @ q12 = q0 / q8 + vrecpe.f32 q12, q8 + vrecps.f32 q8 , q12, q8 + vmul.f32 q12, q8 , q12 + vmul.f32 q12, q0 , q12 + + @ q13 = q1 / q9 + vrecpe.f32 q13, q9 + vrecps.f32 q9 , q13, q9 + vmul.f32 q13, q9 , q13 + vmul.f32 q13, q1 , q13 + + @ q14 = q2 / q10 + vrecpe.f32 q14, q10 + vrecps.f32 q10 , q14, q10 + vmul.f32 q14, q10 , q14 + vmul.f32 q14, q2 , q14 + + @ q15 = q3 / q11 + vrecpe.f32 q15, q11 + vrecps.f32 q11 , q15, q11 + vmul.f32 q15, q11 , q15 + vmul.f32 q15, q3 , q15 + + @ load the 2nd set of values + vld4.32 {d0, d2, d4, d6}, [r1]! + vld4.32 {d1, d3, d5, d7}, [r1]! + vld4.32 {d16, d18, d20, d22}, [r2]! + vld4.32 {d17, d19, d21, d23}, [r2]! + + ble .L_mainloopend_vec4 + +.L_mainloop_vec4: + @ store the result for the 1st/next (e.g. 3rd) set + vst4.32 {d24, d26, d28, d30}, [r0]! + vst4.32 {d25, d27, d29, d31}, [r0]! + + @ calculate values for the 2nd/next (e.g. 3rd) set + @ q12 = q0 / q8 + vrecpe.f32 q12, q8 + vrecps.f32 q8 , q12, q8 + vmul.f32 q12, q8 , q12 + vmul.f32 q12, q0 , q12 + + @ q13 = q1 / q9 + vrecpe.f32 q13, q9 + vrecps.f32 q9 , q13, q9 + vmul.f32 q13, q9 , q13 + vmul.f32 q13, q1 , q13 + + @ q14 = q2 / q10 + vrecpe.f32 q14, q10 + vrecps.f32 q10 , q14, q10 + vmul.f32 q14, q10 , q14 + vmul.f32 q14, q2 , q14 + + @ q15 = q3 / q11 + vrecpe.f32 q15, q11 + vrecps.f32 q11 , q15, q11 + vmul.f32 q15, q11 , q15 + vmul.f32 q15, q3 , q15 + + @ load the next (e.g. 3rd) set of values + vld4.32 {d0, d2, d4, d6}, [r1]! + vld4.32 {d1, d3, d5, d7}, [r1]! + vld4.32 {d16, d18, d20, d22}, [r2]! + vld4.32 {d17, d19, d21, d23}, [r2]! + subs r3, r3, #4 + + bgt .L_mainloop_vec4 @ loop if r2 is > r3, if we have at least another 4 vectors (16 floats) to process + +.L_mainloopend_vec4: + @ the last iteration for this call + @ store the result for the set of values before the last one (e.g 2nd set) + vst4.32 {d24, d26, d28, d30}, [r0]! + vst4.32 {d25, d27, d29, d31}, [r0]! + + @ calculate values for the last (e.g. 3rd) set + @ q12 = q0 / q8 + vrecpe.f32 q12, q8 + vrecps.f32 q8 , q12, q8 + vmul.f32 q12, q8 , q12 + vmul.f32 q12, q0 , q12 + + @ q13 = q1 / q9 + vrecpe.f32 q13, q9 + vrecps.f32 q9 , q13, q9 + vmul.f32 q13, q9 , q13 + vmul.f32 q13, q1 , q13 + + @ q14 = q2 / q10 + vrecpe.f32 q14, q10 + vrecps.f32 q10 , q14, q10 + vmul.f32 q14, q10 , q14 + vmul.f32 q14, q2 , q14 + + @ q15 = q3 / q11 + vrecpe.f32 q15, q11 + vrecps.f32 q11 , q15, q11 + vmul.f32 q15, q11 , q15 + vmul.f32 q15, q3 , q15 + + @ store the result for the last (e.g. 3rd) set + vst4.32 {d24, d26, d28, d30}, [r0]! + vst4.32 {d25, d27, d29, d31}, [r0]! + +.L_check_vec4: + @ check if anything left to process at the end of the input array + cmp r4, #0 + ble .L_return_vec4 + +.L_secondloop_vec4: + @ process the last few items left in the input array + vld4.f32 {d0[0], d2[0], d4[0], d6[0]}, [r1]! @ The values are loaded like so: + @ q0 = { V1.x, -, -, - }; + @ q1 = { V1.y, -, -, - }; + @ q2 = { V1.z, -, -, - }; + @ q3 = { V1.w, -, -, - }; + vld4.f32 {d1[0], d3[0], d5[0], d7[0]}, [r2]! @ The values are loaded like so: + @ q0 = { V1.x, -, V2.x, - }; + @ q1 = { V1.y, -, V2.y, - }; + @ q2 = { V1.z, -, V2.z, - }; + @ q3 = { V1.w, -, V2.w, - }; + + subs r4, r4, #1 + + @ calculate values + @ d0 = d0 / d1 + vrecpe.f32 d18, d1 + vrecps.f32 d1 , d18, d1 + vmul.f32 d18, d1 , d18 + vmul.f32 d0 , d0 , d18 + + @ d2 = d2 / d3 + vrecpe.f32 d20, d3 + vrecps.f32 d3 , d20, d3 + vmul.f32 d20, d3 , d20 + vmul.f32 d2 , d2 , d20 + + @ d4 = d4 / d5 + vrecpe.f32 d22, d5 + vrecps.f32 d5 , d22, d5 + vmul.f32 d22, d5 , d22 + vmul.f32 d4 , d4 , d22 + + @ d6 = d6 / d7 + vrecpe.f32 d16, d7 + vrecps.f32 d7 , d16, d7 + vmul.f32 d16, d7 , d16 + vmul.f32 d6 , d6 , d16 + + vst4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]! + + bgt .L_secondloop_vec4 + +.L_return_vec4: + @ return + pop {r4} + mov r0, #0 + bx lr diff --git a/source/NE10_div_test.c b/source/NE10_div_test.c index 8527f97..3a605fc 100644 --- a/source/NE10_div_test.c +++ b/source/NE10_div_test.c @@ -23,20 +23,48 @@ // length of the data arrays #define ARRLEN TEST_ARRLEN // number of the operations in a given unit -#define OP_COUNT 1 +#define OP_COUNT 4 // number of the different implementations of each of the functions (C, ASM, NEON, ...) #define IMPL_COUNT 3 -#include "../headers/unit_test_x_operation_x.h" +#include "../headers/unit_test_x_operation_x_tolerant.h" + +extern arm_result_t div_float_c (arm_float_t * dst, arm_float_t * src1, arm_float_t * src2, unsigned int count); +//extern arm_result_t div_float_asm (arm_float_t * dst, arm_float_t * src1, arm_float_t * src2, unsigned int count); // the assembly versions haven't been implemented; these are for future use +extern arm_result_t div_float_neon(arm_float_t * dst, arm_float_t * src1, arm_float_t * src2, unsigned int count); + +extern arm_result_t vdiv_vec2f_c (arm_vec2f_t * dst, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count); +//extern arm_result_t vdiv_vec2f_asm (arm_vec2f_t * dst, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count); +extern arm_result_t vdiv_vec2f_neon(arm_vec2f_t * dst, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count); + +extern arm_result_t vdiv_vec3f_c (arm_vec3f_t * dst, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count); +//extern arm_result_t vdiv_vec3f_asm (arm_vec3f_t * dst, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count); +extern arm_result_t vdiv_vec3f_neon(arm_vec3f_t * dst, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count); + +extern arm_result_t vdiv_vec4f_c (arm_vec4f_t * dst, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count); +//extern arm_result_t vdiv_vec4f_asm (arm_vec4f_t * dst, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count); +extern arm_result_t vdiv_vec4f_neon(arm_vec4f_t * dst, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count); void init_ftbl() { // manually initialize the global function table with // those functions that do have an actual implementation. ftbl[ 0] = (arm_func_4args_t) div_float_c; - ftbl[ 1] = (arm_func_4args_t) div_float_asm; + ftbl[ 1] = (arm_func_4args_t) div_float_c; // using the c version in place of the assembly version ftbl[ 2] = (arm_func_4args_t) div_float_neon; + + ftbl[ 3] = (arm_func_4args_t) vdiv_vec2f_c; + ftbl[ 4] = (arm_func_4args_t) vdiv_vec2f_c; // using the c version in place of the assembly version + ftbl[ 5] = (arm_func_4args_t) vdiv_vec2f_neon; + + ftbl[ 6] = (arm_func_4args_t) vdiv_vec3f_c; + ftbl[ 7] = (arm_func_4args_t) vdiv_vec3f_c; // using the c version in place of the assembly version + ftbl[ 8] = (arm_func_4args_t) vdiv_vec3f_neon; + + ftbl[ 9] = (arm_func_4args_t) vdiv_vec4f_c; + ftbl[10] = (arm_func_4args_t) vdiv_vec4f_c; // using the c version in place of the assembly version + ftbl[11] = (arm_func_4args_t) vdiv_vec4f_neon; } arm_result_t main( int argc, char **argv ) -- 2.7.4