- .balign 4
+ .align 4
.global dot_vec2f_neon
.thumb
.thumb_func
@ r3: int count & the number of items in the input array that can be
@ processed in chunks of 4 vectors
@
- @ r4: the number of items that are left to be processed at the end of
+ @ r4: the number of items that are residual that will be processed at the begin of
@ the input array
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
push {r4}
- and r4, r3, #3 @ r4 = count % 4;
- sub r3, r3, r4 @ count = count - r3; This is what's left to be processed after this loop
+ and r4, r3, #3 @ r4 = count % 4; calculate the residual loop
+ asr r3, r3, #2 @ r3 = count >> 2; calculate the main loop
- cbz r3, .L_check_vec2
+ cbz r4, .L_check_mainloop_vec2
- @ load the 1st set of values
- vld2.32 {q0-q1}, [r1]!
- vld2.32 {q2-q3}, [r2]!
- subs r3, r3, #8 @ 4 for this set, and 4 for the 2nd set
+.L_residualloop_vec2:
+ @ process the residual items in the input array
+ vld1.f32 d0, [r1]!
+ vld1.f32 d1, [r2]!
- @ calculate values for the 1st set
- vmul.f32 q8, q0, q2
- vmla.f32 q8, q1, q3
+ subs r4, r4, #1
- @ load the 2nd set of values
- vld2.32 {q0-q1}, [r1]!
- vld2.32 {q2-q3}, [r2]!
+ @ calculate values
+ vmul.f32 d0, d0, d1
+ vpadd.f32 d0, d0
- ble .L_mainloopend_vec2
+ vst1.32 {d0[0]}, [r0]!
-.L_mainloop_vec2:
- @ store the result for the 1st/next (e.g. 3rd) set
- vst1.32 {d16,d17}, [r0]!
+ bgt .L_residualloop_vec2
- @ calculate values for the 2nd/next (e.g. 3rd) set
- vmul.f32 q8, q0, q2
- vmla.f32 q8, q1, q3
+.L_check_mainloop_vec2:
+ cbz r3, .L_return_vec2
- @ load the next (e.g. 3rd) set of values
+ @ load the current set of values
vld2.32 {q0-q1}, [r1]!
vld2.32 {q2-q3}, [r2]!
- subs r3, r3, #4
-
- bgt .L_mainloop_vec2 @ loop if r2 is > r3, if we have at least another 4 vectors (8 floats) to process
-.L_mainloopend_vec2:
- @ the last iteration for this call
- @ store the result for the set of values before the last one (e.g 2nd set)
- vst1.32 {d16,d17}, [r0]!
-
- @ calculate values for the last (e.g. 3rd) set
- vmul.f32 q8, q0, q2
- vmla.f32 q8, q1, q3
-
- @ store the result for the last (e.g. 3rd) set
- vst1.32 {d16,d17}, [r0]!
-
-.L_check_vec2:
- @ check if anything left to process at the end of the input array
- cmp r4, #0
- ble .L_return_vec2
-
-.L_secondloop_vec2:
- @ process the last few items left in the input array
- vld1.f32 d0, [r1]!
- vld1.f32 d1, [r2]!
-
- subs r4, r4, #1
+.L_mainloop_vec2:
+ @ calculate values for current set
+ vmul.f32 q8, q0, q2
+ vmla.f32 q8, q1, q3
- @ calculate values
- vmul.f32 d0, d0, d1
- vpadd.f32 d0, d0
+ @ store the result for current set
+ vst1.32 {d16,d17}, [r0]!
+ subs r3, r3, #1
- vst1.32 {d0[0]}, [r0]!
+ @ load the next set of values
+ vld2.32 {q0-q1}, [r1]!
+ vld2.32 {q2-q3}, [r2]!
- bgt .L_secondloop_vec2
+ bgt .L_mainloop_vec2 @ loop if r3 > 0, if we have at least another 4 vectors (8 floats) to process
.L_return_vec2:
- @ return
+ @ return
pop {r4}
mov r0, #0
bx lr
- .align 2
+ .align 4
.global dot_vec3f_neon
.thumb
.thumb_func
@ r3: int count & the number of items in the input array that can be
@ processed in chunks of 4 vectors
@
- @ r4: the number of items that are left to be processed at the end of
+ @ r4: the number of items that are residual that will be processed at the begin of
@ the input array
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
push {r4}
- and r4, r3, #3 @ r3 = count % 4;
- sub r3, r3, r4 @ count = count - r3; This is what's left to be processed after this loop
+ and r4, r3, #3 @ r4 = count % 4; calculate the residual loop
+ asr r3, r3, #2 @ r3 = count >> 2; calculate the main loop
- cmp r3, #0
- beq .L_check_vec3
+ cbz r4, .L_check_mainloop_vec3
- @ load the 1st set of values
- vld3.32 {d0, d2, d4}, [r1]!
- vld3.32 {d1, d3, d5}, [r1]!
- vld3.32 {d16, d18, d20}, [r2]!
- vld3.32 {d17, d19, d21}, [r2]!
- subs r3, r3, #8 @ 4 for this set, and 4 for the 2nd set
-
- @ calculate values for the 1st set
- vmul.f32 q15, q0, q8
- vmla.f32 q15, q1, q9
- vmla.f32 q15, q2, q10
-
- @ load the 2nd set of values
- vld3.32 {d0, d2, d4}, [r1]!
- vld3.32 {d1, d3, d5}, [r1]!
- vld3.32 {d16, d18, d20}, [r2]!
- vld3.32 {d17, d19, d21}, [r2]!
-
- ble .L_mainloopend_vec3
-
-.L_mainloop_vec3:
- @ store the result for the 1st/next (e.g. 3rd) set
- vst1.32 {d30, d31}, [r0]!
-
- @ calculate values for the 2nd/next (e.g. 3rd) set
- vmul.f32 q15, q0, q8
- vmla.f32 q15, q1, q9
- vmla.f32 q15, q2, q10
-
- @ load the next (e.g. 3rd) set of values
- vld3.32 {d0, d2, d4}, [r1]!
- vld3.32 {d1, d3, d5}, [r1]!
- vld3.32 {d16, d18, d20}, [r2]!
- vld3.32 {d17, d19, d21}, [r2]!
- subs r3, r3, #4
-
- bgt .L_mainloop_vec3 @ loop if r2 is > r3, if we have at least another 4 vectors (12 floats) to process
-
-.L_mainloopend_vec3:
- @ the last iteration for this call
- @ store the result for the set of values before the last one (e.g 2nd set)
- vst1.32 {d30, d31}, [r0]!
-
- @ calculate values for the last (e.g. 3rd) set
- vmul.f32 q15, q0, q8
- vmla.f32 q15, q1, q9
- vmla.f32 q15, q2, q10
-
- @ store the result for the last (e.g. 3rd) set
- vst1.32 {d30, d31}, [r0]!
-
-.L_check_vec3:
- @ check if anything left to process at the end of the input array
- cmp r4, #0
- ble .L_return_vec3
-
-.L_secondloop_vec3:
- @ process the last few items left in the input array
+.L_residualloop_vec3:
+ @ process the residual items in the input array
vld3.f32 {d0[0], d2[0], d4[0]}, [r1]! @ The values are loaded like so:
@ q0 = { V1.x, -, -, - };
@ q1 = { V1.y, -, -, - };
vst1.32 {d0[0]}, [r0]!
- bgt .L_secondloop_vec3
+ bgt .L_residualloop_vec3
+
+.L_check_mainloop_vec3:
+ cbz r3, .L_return_vec3
+
+ @ load current set of values
+ vld3.32 {d0, d2, d4}, [r1]!
+ vld3.32 {d1, d3, d5}, [r1]!
+ vld3.32 {d16, d18, d20}, [r2]!
+ vld3.32 {d17, d19, d21}, [r2]!
+
+.L_mainloop_vec3:
+ @ calculate values for current set
+ vmul.f32 q15, q0, q8
+ vmla.f32 q15, q1, q9
+ vmla.f32 q15, q2, q10
+
+ @ store the result for current set
+ vst1.32 {d30, d31}, [r0]!
+ subs r3, r3, #1
+
+ @ load the next set of values
+ vld3.32 {d0, d2, d4}, [r1]!
+ vld3.32 {d1, d3, d5}, [r1]!
+ vld3.32 {d16, d18, d20}, [r2]!
+ vld3.32 {d17, d19, d21}, [r2]!
+
+ bgt .L_mainloop_vec3 @ loop if r3 > 0, if we have at least another 4 vectors (12 floats) to process
.L_return_vec3:
- @ return
+ @ return
pop {r4}
mov r0, #0
bx lr
- .align 2
+ .align 4
.global dot_vec4f_neon
.thumb
.thumb_func
@ r3: int count & the number of items in the input array that can be
@ processed in chunks of 4 vectors
@
- @ r4: the number of items that are left to be processed at the end of
+ @ r4: the number of items that are residual that will be processed at the begin of
@ the input array
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
push {r4}
- and r4, r3, #3 @ r4 = count % 4;
- sub r3, r3, r4 @ count = count - r4; This is what's left to be processed after this loop
+ and r4, r3, #3 @ r4 = count % 4; calculate the residual loop
+ asr r3, r3, #2 @ r3 = count >> 2; calculate the main loop
+
+ cbz r4, .L_check_mainloop_vec4
- cmp r3, #0
- beq .L_check_vec4
+.L_residualloop_vec4:
+ @ process the residual items in the input array
+ vld1.f32 {d0, d1}, [r1]! @ The values are loaded like so:
+ @ q0 = { V1.x, V1.y, V1.z, V1.w };
+ vld1.f32 {d2, d3}, [r2]! @ The values are loaded like so:
+ @ q1 = { V2.x, V2.y, V2.z, V2.w };
- @ load the 1st set of values
- vld4.32 {d0, d2, d4, d6}, [r1]!
- vld4.32 {d1, d3, d5, d7}, [r1]!
- vld4.32 {d16, d18, d20, d22}, [r2]!
- vld4.32 {d17, d19, d21, d23}, [r2]!
+ subs r4, r4, #1
- subs r3, r3, #8 @ 4 for this set, and 4 for the 2nd set
+ @ calculate values
+ vmul.f32 q0, q0, q1
+ vadd.f32 d0, d0, d1
+ vpadd.f32 d0, d0
- @ calculate values for the 1st set
- vmul.f32 q15, q0, q8
- vmla.f32 q15, q1, q9
- vmla.f32 q15, q2, q10
- vmla.f32 q15, q3, q11
+ vst1.32 {d0[0]}, [r0]!
- @ load the 2nd set of values
- vld4.32 {d0, d2, d4, d6}, [r1]!
- vld4.32 {d1, d3, d5, d7}, [r1]!
- vld4.32 {d16, d18, d20, d22}, [r2]!
- vld4.32 {d17, d19, d21, d23}, [r2]!
+ bgt .L_residualloop_vec4
- ble .L_mainloopend_vec4
+.L_check_mainloop_vec4:
+ cbz r3, .L_return_vec4
+
+ @ load current set of values
+ vld4.32 {d0, d2, d4, d6}, [r1]!
+ vld4.32 {d1, d3, d5, d7}, [r1]!
+ vld4.32 {d16, d18, d20, d22}, [r2]!
+ vld4.32 {d17, d19, d21, d23}, [r2]!
.L_mainloop_vec4:
- @ store the result for the 1st/next (e.g. 3rd) set
- vst1.32 {d30, d31}, [r0]!
-
- @ calculate values for the 2nd/next (e.g. 3rd) set
- vmul.f32 q15, q0, q8
- vmla.f32 q15, q1, q9
- vmla.f32 q15, q2, q10
- vmla.f32 q15, q3, q11
-
- @ load the next (e.g. 3rd) set of values
- vld4.32 {d0, d2, d4, d6}, [r1]!
- vld4.32 {d1, d3, d5, d7}, [r1]!
- vld4.32 {d16, d18, d20, d22}, [r2]!
- vld4.32 {d17, d19, d21, d23}, [r2]!
- subs r3, r3, #4
-
- bgt .L_mainloop_vec4 @ loop if r2 is > r3, if we have at least another 4 vectors (16 floats) to process
-
-.L_mainloopend_vec4:
- @ the last iteration for this call
- @ store the result for the set of values before the last one (e.g 2nd set)
- vst1.32 {d30, d31}, [r0]!
-
- @ calculate values for the last (e.g. 3rd) set
- vmul.f32 q15, q0, q8
- vmla.f32 q15, q1, q9
- vmla.f32 q15, q2, q10
- vmla.f32 q15, q3, q11
-
- @ store the result for the last (e.g. 3rd) set
- vst1.32 {d30, d31}, [r0]!
-
-.L_check_vec4:
- @ check if anything left to process at the end of the input array
- cmp r4, #0
- ble .L_return_vec4
-
-.L_secondloop_vec4:
- @ process the last few items left in the input array
- vld4.f32 {d0[0], d2[0], d4[0], d6[0]}, [r1]! @ The values are loaded like so:
- @ q0 = { V1.x, -, -, - };
- @ q1 = { V1.y, -, -, - };
- @ q2 = { V1.z, -, -, - };
- @ q3 = { V1.w, -, -, - };
- vld4.f32 {d1[0], d3[0], d5[0], d7[0]}, [r2]! @ The values are loaded like so:
- @ q0 = { V1.x, -, V2.x, - };
- @ q1 = { V1.y, -, V2.y, - };
- @ q2 = { V1.z, -, V2.z, - };
- @ q3 = { V1.w, -, V2.w, - };
+ @ calculate values for current set
+ vmul.f32 q15, q0, q8
+ vmla.f32 q15, q1, q9
+ vmla.f32 q15, q2, q10
+ vmla.f32 q15, q3, q11
- subs r4, r4, #1
+ @ store the result for current set
+ vst1.32 {d30, d31}, [r0]!
+ subs r3, r3, #1
- @ calculate values
- vmul.f32 d0, d0, d1
- vmla.f32 d0, d2, d3
- vmla.f32 d0, d4, d5
- vmla.f32 d0, d6, d7
+ @ load the next set of values
+ vld4.32 {d0, d2, d4, d6}, [r1]!
+ vld4.32 {d1, d3, d5, d7}, [r1]!
+ vld4.32 {d16, d18, d20, d22}, [r2]!
+ vld4.32 {d17, d19, d21, d23}, [r2]!
- vst1.32 {d0[0]}, [r0]!
-
- bgt .L_secondloop_vec4
+ bgt .L_mainloop_vec4 @ loop if r3 > 0, if we have at least another 4 vectors (12 floats) to process
.L_return_vec4:
- @ return
+ @ return
pop {r4}
mov r0, #0
bx lr
+