- .balign 4
+ .align 4
.global abs_float_neon
.thumb
.thumb_func
@ r1: *src & current src entry's address
@ r2: int count & the number of items in the input array that can be
@ processed in chunks of 4 vectors
- @ r3: the number of items that are left to be processed at the end of
+ @ r3: the number of residual items that will be processed at the beginning of
@ the input array
@
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- and r3, r2, #3 @ r3 = count % 4;
- sub r2, r2, r3 @ count = count - r3; This is what's left to be processed after this loop
-
- cbz r2, .L_check_float
-
- @ load the 1st set of values
- vld1.32 {q0}, [r1]!
- subs r2, r2, #8 @ 4 for this set, and 4 for the 2nd set
-
- @ absolute values of the 1st set
- vabs.f32 q3, q0 @ q3 = abs( q0 )
+ and r3, r2, #3 @ r3 = count % 4; calculate the residual loop
+ asr r2, r2, #2 @ r2 = count >> 2; calculate the main loop
- @ load the 2nd set of values
- vld1.32 {q0}, [r1]!
+ cbz r3, .L_check_mainloop_float
- ble .L_mainloopend_float
+.L_residualloop_float:
+ @ process the residual items in the input array
+ vld1.f32 d0[0], [r1]! @ Fill in d0 = { V.x, - }; lane 1 is left unchanged
-.L_mainloop_float:
- @ store the result for the 1st/next (e.g. 3rd) set
- vst1.32 {d6,d7}, [r0]!
-
- @ absolute values of the 2nd/next (e.g. 3rd) set
- vabs.f32 q3, q0 @ q3 = abs( q0 )
-
- @ load the next (e.g. 3rd) set of values
- vld1.32 {q0}, [r1]!
- subs r2, r2, #4
+ subs r3, r3, #1
- bgt .L_mainloop_float @ loop if r2 is > r3, if we have at least another 4 floats
+ @ absolute values
+ vabs.f32 d0, d0
-.L_mainloopend_float:
- @ the last iteration for this call
- @ store the result for the set of values before the last one (e.g 2nd set)
- vst1.32 {d6,d7}, [r0]!
+ vst1.32 {d0[0]}, [r0]!
- @ absolute values of the last (e.g. 3rd) set
- vabs.f32 q3, q0
+ bgt .L_residualloop_float
- @ store the result for the last (e.g. 3rd) set
- vst1.32 {d6,d7}, [r0]!
+.L_check_mainloop_float:
+ cbz r2, .L_return_float
-.L_check_float:
- @ check if anything left to process at the end of the input array
- cmp r3, #0
- ble .L_return_float
+ @ load the current set of values
+ vld1.32 {q0}, [r1]! @ for current set
-.L_secondloop_float:
- @ process the last few items left in the input array
- vld1.f32 d0[0], [r1]! @ Fill in d0 = { V.x, V.y };
+.L_mainloop_float:
+ @ absolute values of the current set
+ vabs.f32 q3, q0 @ q3 = abs( q0 )
- subs r3, r3, #1
+ @ store the result for the current set
+ vst1.32 {d6,d7}, [r0]!
- @ absolute values
- vabs.f32 d0, d0
+ subs r2, r2, #1
- vst1.32 {d0[0]}, [r0]!
+ @ load the next set
+ vld1.32 {q0}, [r1]!
- bgt .L_secondloop_float
+ bgt .L_mainloop_float @ loop if r2 > 0, if we have another 4 floats
.L_return_float:
- @ return
+ @ return
mov r0, #0
bx lr
- .balign 4
+ .align 4
.global abs_vec2f_neon
.thumb
.thumb_func
@
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- and r3, r2, #3 @ r3 = count % 4;
- sub r2, r2, r3 @ count = count - r3; This is what's left to be processed after this loop
-
- cbz r2, .L_check_vec2
-
- @ load the 1st set of values
- vld2.32 {q0-q1}, [r1]!
- subs r2, r2, #8 @ 4 for this set, and 4 for the 2nd set
+ and r3, r2, #3 @ r3 = count % 4; calculate the residual loop
+ asr r2, r2, #2 @ r2 = count >> 2; calculate the main loop
- @ absolute values of the 1st set
- vabs.f32 q3, q0 @ q3 = abs( q0 )
- vabs.f32 q4, q1 @ q4 = abs( q1 )
-
- @ load the 2nd set of values
- vld2.32 {q0-q1}, [r1]!
-
- ble .L_mainloopend_vec2
-
-.L_mainloop_vec2:
- @ store the result for the 1st/next (e.g. 3rd) set
- vst2.32 {d6,d7,d8,d9}, [r0]!
+ cbz r3, .L_check_mainloop_vec2
- @ absolute values of the 2nd/next (e.g. 3rd) set
- vabs.f32 q3, q0 @ q3 = abs( q0 )
- vabs.f32 q4, q1 @ q4 = abs( q1 )
+.L_residualloop_vec2:
+ @ process the residual items in the input array
+ vld1.f32 d0, [r1]! @ Fill in d0 = { V.x, V.y };
- @ load the next (e.g. 3rd) set of values
- vld2.32 {q0-q1}, [r1]!
- subs r2, r2, #4
+ subs r3, r3, #1
- bgt .L_mainloop_vec2 @ loop if r2 is > r3, if we have at least another 4 vectors (8 floats) to process
+ @ absolute values
+ vabs.f32 d0, d0
-.L_mainloopend_vec2:
- @ the last iteration for this call
- @ store the result for the set of values before the last one (e.g 2nd set)
- vst2.32 {d6,d7,d8,d9}, [r0]!
+ vst1.32 {d0}, [r0]!
- @ absolute values of the last (e.g. 3rd) set
- vabs.f32 q3, q0 @ q3 = abs( q0 )
- vabs.f32 q4, q1 @ q4 = abs( q1 )
+ bgt .L_residualloop_vec2
- @ store the result for the last (e.g. 3rd) set
- vst2.32 {d6,d7,d8,d9}, [r0]!
+.L_check_mainloop_vec2:
+ cbz r2, .L_return_vec2
-.L_check_vec2:
- @ check if anything left to process at the end of the input array
- cmp r3, #0
- ble .L_return_vec2
+ @ load the current set of values
+ vld2.32 {q0-q1}, [r1]! @ for current set
-.L_secondloop_vec2:
- @ process the last few items left in the input array
- vld1.f32 d0, [r1]! @ Fill in d0 = { V.x, V.y };
+.L_mainloop_vec2:
+ @ absolute values of the current set
+ vabs.f32 q3, q0 @ q3 = abs( q0 )
+ vabs.f32 q4, q1 @ q4 = abs( q1 )
- subs r3, r3, #1
+ @ store the result for the current set
+ vst2.32 {d6,d7,d8,d9}, [r0]!
- @ absolute values
- vabs.f32 d0, d0
+ subs r2, r2, #1
- vst1.32 {d0}, [r0]!
+ @ load the next set
+ vld2.32 {q0-q1}, [r1]!
- bgt .L_secondloop_vec2
+ bgt .L_mainloop_vec2 @ loop if r2 > 0, if we have another 4 vec2s
.L_return_vec2:
- @ return
+ @ return
mov r0, #0
bx lr
-
-
- .align 2
+ .align 4
.global abs_vec3f_neon
.thumb
.thumb_func
@
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- and r3, r2, #3 @ r3 = count % 4;
- sub r2, r2, r3 @ count = count - r3; This is what's left to be processed after this loop
-
- cmp r2, #0
- beq .L_check_vec3
-
- @ load the 1st set of values
- vld3.32 {d0, d2, d4}, [r1]!
- vld3.32 {d1, d3, d5}, [r1]!
- subs r2, r2, #4 @ 4 for this set, and 4 for the 2nd set
+ and r3, r2, #3 @ r3 = count % 4; calculate the residual loop
+ asr r2, r2, #2 @ r2 = count >> 2; calculate the main loop
- @ absolute values of the 1st set
- vabs.f32 q5, q0
- vabs.f32 q6, q1
- vabs.f32 q7, q2
+ cbz r3, .L_check_mainloop_vec3
- @ load the 2nd set of values
- vld3.32 {d0, d2, d4}, [r1]!
- vld3.32 {d1, d3, d5}, [r1]!
-
- ble .L_mainloopend_vec3
-
-.L_mainloop_vec3:
- @ store the result for the 1st/next (e.g. 3rd) set
- vst3.32 {d10, d12, d14}, [r0]!
- vst3.32 {d11, d13, d15}, [r0]!
-
- @ absolute values of the 2nd/next (e.g. 3rd) set
- vabs.f32 q5, q0
- vabs.f32 q6, q1
- vabs.f32 q7, q2
-
- @ load the next (e.g. 3rd) set of values
- vld3.32 {d0, d2, d4}, [r1]!
- vld3.32 {d1, d3, d5}, [r1]!
- subs r2, r2, #4
-
- bgt .L_mainloop_vec3 @ loop if r2 is > r3, if we have at least another 4 vectors (12 floats) to process
-
-.L_mainloopend_vec3:
- @ the last iteration for this call
- @ store the result for the set of values before the last one (e.g 2nd set)
- vst3.32 {d10, d12, d14}, [r0]!
- vst3.32 {d11, d13, d15}, [r0]!
-
- @ absolute values of the last (e.g. 3rd) set
- vabs.f32 q5, q0
- vabs.f32 q6, q1
- vabs.f32 q7, q2
-
- @ store the result for the last (e.g. 3rd) set
- vst3.32 {d10, d12, d14}, [r0]!
- vst3.32 {d11, d13, d15}, [r0]!
-
-.L_check_vec3:
- @ check if anything left to process at the end of the input array
- cmp r3, #0
- ble .L_return_vec3
-
-.L_secondloop_vec3:
- @ process the last few items left in the input array
+.L_residualloop_vec3:
+ @ process the residual items in the input array
vld3.f32 {d0[0], d2[0], d4[0]}, [r1]! @ The values are loaded like so:
@ q0 = { V.x, -, -, - };
@ q1 = { V.y, -, -, - };
@ absolute values
vabs.f32 d0, d0
- vabs.f32 d1, d1
vabs.f32 d2, d2
+ vabs.f32 d4, d4
vst3.32 {d0[0], d2[0], d4[0]}, [r0]!
- bgt .L_secondloop_vec3
+ bgt .L_residualloop_vec3
+
+.L_check_mainloop_vec3:
+ cbz r2, .L_return_vec3
+
+ @ load the current set of values
+ vld3.32 {d0, d2, d4}, [r1]!
+ vld3.32 {d1, d3, d5}, [r1]! @ for current set
+
+.L_mainloop_vec3:
+ @ absolute values of the current set
+ vabs.f32 q5, q0
+ vabs.f32 q6, q1
+ vabs.f32 q7, q2
+
+ @ store the result for the current set
+ vst3.32 {d10, d12, d14}, [r0]!
+ vst3.32 {d11, d13, d15}, [r0]!
+
+ subs r2, r2, #1
+
+ @ load the next set
+ vld3.32 {d0, d2, d4}, [r1]!
+ vld3.32 {d1, d3, d5}, [r1]! @ for next set
+
+ bgt .L_mainloop_vec3 @ loop if r2 > 0, if we have another 4 vec3s
.L_return_vec3:
- @ return
+ @ return
mov r0, #0
bx lr
- .align 2
+ .align 4
.global abs_vec4f_neon
.thumb
.thumb_func
@
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- and r3, r2, #3 @ r3 = count % 4;
- sub r2, r2, r3 @ count = count - r3; This is what's left to be processed after this loop
+ and r3, r2, #3 @ r3 = count % 4; calculate the residual loop
+ asr r2, r2, #2 @ r2 = count >> 2; calculate the main loop
- cmp r2, #0
- beq .L_check_vec4
+ cbz r3, .L_check_mainloop_vec4
+
+.L_residualloop_vec4:
+ @ process the residual items in the input array
+ vld1.f32 {d0, d1}, [r1]! @ The values are loaded like so:
+ @ q0 = { V.x, V.y, V.z, V.w };
+ subs r3, r3, #1
+
+ @ absolute values
+ vabs.f32 q0, q0
- @ load the 1st set of values
- vld4.32 {d0, d2, d4, d6}, [r1]!
- vld4.32 {d1, d3, d5, d7}, [r1]!
- subs r2, r2, #8 @ 4 for this set, and 4 for the 2nd set
+ vst1.32 {d0, d1}, [r0]!
- @ absolute values of the 1st set
- vabs.f32 q10, q0
- vabs.f32 q11, q1
- vabs.f32 q12, q2
- vabs.f32 q13, q3
+ bgt .L_residualloop_vec4
- @ load the 2nd set of values
- vld4.32 {d0, d2, d4, d6}, [r1]!
- vld4.32 {d1, d3, d5, d7}, [r1]!
+.L_check_mainloop_vec4:
+ cbz r2, .L_return_vec4
- ble .L_mainloopend_vec4
+ @ load the current set of values
+ vld4.32 {d0, d2, d4, d6}, [r1]!
+ vld4.32 {d1, d3, d5, d7}, [r1]! @ for current set
.L_mainloop_vec4:
- @ store the result for the 1st/next (e.g. 3rd) set
- vst4.32 {d20, d22, d24, d26}, [r0]!
- vst4.32 {d21, d23, d25, d27}, [r0]!
-
- @ absolute values of the 2nd/next (e.g. 3rd) set
- vabs.f32 q10, q0
- vabs.f32 q11, q1
- vabs.f32 q12, q2
- vabs.f32 q13, q3
-
- @ load the next (e.g. 3rd) set of values
- vld4.32 {d0, d2, d4, d6}, [r1]!
- vld4.32 {d1, d3, d5, d7}, [r1]!
- subs r2, r2, #4
-
- bgt .L_mainloop_vec4 @ loop if r2 is > r3, if we have at least another 4 vectors (16 floats) to process
-
-.L_mainloopend_vec4:
- @ the last iteration for this call
- @ store the result for the set of values before the last one (e.g 2nd set)
- vst4.32 {d20, d22, d24, d26}, [r0]!
- vst4.32 {d21, d23, d25, d27}, [r0]!
-
- @ absolute values of the last (e.g. 3rd) set
- vabs.f32 q10, q0
- vabs.f32 q11, q1
- vabs.f32 q12, q2
- vabs.f32 q13, q3
-
- @ store the result for the last (e.g. 3rd) set
- vst4.32 {d20, d22, d24, d26}, [r0]!
- vst4.32 {d21, d23, d25, d27}, [r0]!
-
-.L_check_vec4:
- @ check if anything left to process at the end of the input array
- cmp r3, #0
- ble .L_return_vec4
-
-.L_secondloop_vec4:
- @ process the last few items left in the input array
- vld4.f32 {d0[0], d2[0], d4[0], d6[0]}, [r1]! @ The values are loaded like so:
- @ q0 = { V.x, -, -, - };
- @ q1 = { V.y, -, -, - };
- @ q2 = { V.z, -, -, - };
- subs r3, r3, #1
+ @ absolute values of the current set
+ vabs.f32 q10, q0
+ vabs.f32 q11, q1
+ vabs.f32 q12, q2
+ vabs.f32 q13, q3
- @ absolute values
- vabs.f32 d0, d0
- vabs.f32 d2, d2
- vabs.f32 d4, d4
- vabs.f32 d6, d6
+ @ store the result for the current set
+ vst4.32 {d20, d22, d24, d26}, [r0]!
+ vst4.32 {d21, d23, d25, d27}, [r0]!
+
+ subs r2, r2, #1
- vst4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]! @ The values are loaded like so:
+ @ load the next set
+ vld4.32 {d0, d2, d4, d6}, [r1]!
+ vld4.32 {d1, d3, d5, d7}, [r1]! @ for next set
- bgt .L_secondloop_vec4
+ bgt .L_mainloop_vec4 @ loop if r2 > 0, if we have another 4 vec4s
.L_return_vec4:
- @ return
+ @ return
mov r0, #0
bx lr
+