From 59d2cfa644789a360a2819bbe7891b37e4bfcb43 Mon Sep 17 00:00:00 2001 From: yang Date: Fri, 20 Jul 2012 11:50:26 +0800 Subject: [PATCH] fix bugs in sub: drop second-set preload from NEON main loops --- source/NE10_sub.neon.s | 245 +++++++++++++++++++------------------------------ 1 file changed, 92 insertions(+), 153 deletions(-) diff --git a/source/NE10_sub.neon.s b/source/NE10_sub.neon.s index a29a342..6bd660e 100644 --- a/source/NE10_sub.neon.s +++ b/source/NE10_sub.neon.s @@ -26,7 +26,7 @@ - .balign 4 + .align 4 .global sub_float_neon .thumb .thumb_func @@ -56,42 +56,32 @@ sub_float_neon: cbz r3, .L_check_float @ load the 1st set of values - vld1.32 {q0}, [r1]! - vld1.32 {q1}, [r2]! - subs r3, r3, #8 @ 4 for this set, and 4 for the 2nd set + vld1.32 {q0}, [r1]! + vld1.32 {q1}, [r2]! + subs r3, r3, #4 @ 4 for this set @ calculate values for the 1st set - vsub.f32 q3, q0, q1 @ q3 = q0 - q1 + vsub.f32 q3, q0, q1 @ q3 = q0 - q1 - @ load the 2nd set of values - vld1.32 {q0}, [r1]! - vld1.32 {q1}, [r2]! - - ble .L_mainloopend_float + ble .L_mainloopend_float .L_mainloop_float: - @ store the result for the 1st/next (e.g. 3rd) set - vst1.32 {d6,d7}, [r0]! + @ store the result for the current set + vst1.32 {d6,d7}, [r0]! - @ calculate values for the 2nd/next (e.g. 3rd) set - vsub.f32 q3, q0, q1 @ q3 = q0 - q1 + @ load the next set of values + vld1.32 {q0}, [r1]! + vld1.32 {q1}, [r2]! + subs r3, r3, #4 - @ load the next (e.g. 3rd) set of values - vld1.32 {q0}, [r1]! - vld1.32 {q1}, [r2]! - subs r3, r3, #4 + @ calculate values for the next set + vsub.f32 q3, q0, q1 @ q3 = q0 - q1 - bgt .L_mainloop_float @ loop if r2 is > r3, if we have at least another 4 floats + bgt .L_mainloop_float @ loop if r3 > 0, if we have at least another 4 floats .L_mainloopend_float: @ the last iteration for this call - @ store the result for the set of values before the last one (e.g 2nd set) - vst1.32 {d6,d7}, [r0]! - - @ calculate values for the last (e.g. 
3rd) set - vsub.f32 q3, q0, q1 @ q3 = q0 - q1 - - @ store the result for the last (e.g. 3rd) set + @ store the result for the last set vst1.32 {d6,d7}, [r0]! @@ -124,7 +114,7 @@ sub_float_neon: - .balign 4 + .align 4 .global sub_vec2f_neon .thumb .thumb_func @@ -154,45 +144,34 @@ sub_vec2f_neon: cbz r3, .L_check_vec2 @ load the 1st set of values - vld2.32 {q0-q1}, [r1]! - vld2.32 {q2-q3}, [r2]! - subs r3, r3, #8 @ 4 for this set, and 4 for the 2nd set + vld2.32 {q0-q1}, [r1]! + vld2.32 {q2-q3}, [r2]! + subs r3, r3, #4 @ 4 for this set @ calculate values for the 1st set - vsub.f32 q8, q0, q2 - vsub.f32 q9, q1, q3 - - @ load the 2nd set of values - vld2.32 {q0-q1}, [r1]! - vld2.32 {q2-q3}, [r2]! + vsub.f32 q8, q0, q2 + vsub.f32 q9, q1, q3 - ble .L_mainloopend_vec2 + ble .L_mainloopend_vec2 .L_mainloop_vec2: - @ store the result for the 1st/next (e.g. 3rd) set - vst2.32 {d16,d17,d18,d19}, [r0]! + @ store the result for the current set + vst2.32 {d16,d17,d18,d19}, [r0]! - @ calculate values for the 2nd/next (e.g. 3rd) set - vsub.f32 q8, q0, q2 - vsub.f32 q9, q1, q3 + @ load the next set of values + vld2.32 {q0-q1}, [r1]! + vld2.32 {q2-q3}, [r2]! + subs r3, r3, #4 - @ load the next (e.g. 3rd) set of values - vld2.32 {q0-q1}, [r1]! - vld2.32 {q2-q3}, [r2]! - subs r3, r3, #4 + @ calculate values for the next set + vsub.f32 q8, q0, q2 + vsub.f32 q9, q1, q3 - bgt .L_mainloop_vec2 @ loop if r2 is > r3, if we have at least another 4 vectors (8 floats) to process + bgt .L_mainloop_vec2 @ loop if r3 > 0, if we have at least another 4 vectors (8 floats) to process .L_mainloopend_vec2: @ the last iteration for this call - @ store the result for the set of values before the last one (e.g 2nd set) - vst2.32 {d16,d17,d18,d19}, [r0]! - - @ calculate values for the last (e.g. 3rd) set - vsub.f32 q8, q0, q2 - vsub.f32 q9, q1, q3 - - @ store the result for the last (e.g. 3rd) set + @ store the result for the last set vst2.32 {d16,d17,d18,d19}, [r0]! 
.L_check_vec2: @@ -223,7 +202,7 @@ sub_vec2f_neon: - .align 2 + .align 4 .global sub_vec3f_neon .thumb .thumb_func @@ -253,56 +232,41 @@ sub_vec3f_neon: beq .L_check_vec3 @ load the 1st set of values - vld3.32 {d0, d2, d4}, [r1]! - vld3.32 {d1, d3, d5}, [r1]! - vld3.32 {d18, d20, d22}, [r2]! - vld3.32 {d19, d21, d23}, [r2]! - subs r3, r3, #8 @ 4 for this set, and 4 for the 2nd set + vld3.32 {d0, d2, d4}, [r1]! + vld3.32 {d1, d3, d5}, [r1]! + vld3.32 {d18, d20, d22}, [r2]! + vld3.32 {d19, d21, d23}, [r2]! + subs r3, r3, #4 @ 4 for this set @ calculate values for the 1st set - vsub.f32 q12, q0, q9 - vsub.f32 q13, q1, q10 - vsub.f32 q14, q2, q11 - - @ load the 2nd set of values - vld3.32 {d0, d2, d4}, [r1]! - vld3.32 {d1, d3, d5}, [r1]! - vld3.32 {d18, d20, d22}, [r2]! - vld3.32 {d19, d21, d23}, [r2]! + vsub.f32 q12, q0, q9 + vsub.f32 q13, q1, q10 + vsub.f32 q14, q2, q11 - ble .L_mainloopend_vec3 + ble .L_mainloopend_vec3 .L_mainloop_vec3: - @ store the result for the 1st/next (e.g. 3rd) set - vst3.32 {d24, d26, d28}, [r0]! - vst3.32 {d25, d27, d29}, [r0]! + @ store the result for the current set + vst3.32 {d24, d26, d28}, [r0]! + vst3.32 {d25, d27, d29}, [r0]! - @ calculate values for the 2nd/next (e.g. 3rd) set - vsub.f32 q12, q0, q9 - vsub.f32 q13, q1, q10 - vsub.f32 q14, q2, q11 + @ load the next set of values + vld3.32 {d0, d2, d4}, [r1]! + vld3.32 {d1, d3, d5}, [r1]! + vld3.32 {d18, d20, d22}, [r2]! + vld3.32 {d19, d21, d23}, [r2]! + subs r3, r3, #4 - @ load the next (e.g. 3rd) set of values - vld3.32 {d0, d2, d4}, [r1]! - vld3.32 {d1, d3, d5}, [r1]! - vld3.32 {d18, d20, d22}, [r2]! - vld3.32 {d19, d21, d23}, [r2]! 
- subs r3, r3, #4 + @ calculate values for the next set + vsub.f32 q12, q0, q9 + vsub.f32 q13, q1, q10 + vsub.f32 q14, q2, q11 - bgt .L_mainloop_vec3 @ loop if r2 is > r3, if we have at least another 4 vectors (12 floats) to process + bgt .L_mainloop_vec3 @ loop if r3 > 0, if we have at least another 4 vectors (12 floats) to process .L_mainloopend_vec3: @ the last iteration for this call - @ store the result for the set of values before the last one (e.g 2nd set) - vst3.32 {d24, d26, d28}, [r0]! - vst3.32 {d25, d27, d29}, [r0]! - - @ calculate values for the last (e.g. 3rd) set - vsub.f32 q12, q0, q9 - vsub.f32 q13, q1, q10 - vsub.f32 q14, q2, q11 - - @ store the result for the last (e.g. 3rd) set + @ store the result for the last set vst3.32 {d24, d26, d28}, [r0]! vst3.32 {d25, d27, d29}, [r0]! @@ -342,7 +306,7 @@ sub_vec3f_neon: - .align 2 + .align 4 .global sub_vec4f_neon .thumb .thumb_func @@ -372,60 +336,44 @@ sub_vec4f_neon: beq .L_check_vec4 @ load the 1st set of values - vld4.32 {d0, d2, d4, d6}, [r1]! - vld4.32 {d1, d3, d5, d7}, [r1]! - vld4.32 {d16, d18, d20, d22}, [r2]! - vld4.32 {d17, d19, d21, d23}, [r2]! + vld4.32 {d0, d2, d4, d6}, [r1]! + vld4.32 {d1, d3, d5, d7}, [r1]! + vld4.32 {d16, d18, d20, d22}, [r2]! + vld4.32 {d17, d19, d21, d23}, [r2]! - subs r3, r3, #8 @ 4 for this set, and 4 for the 2nd set + subs r3, r3, #4 @ 4 for this set @ calculate values for the 1st set - vsub.f32 q12, q0, q8 - vsub.f32 q13, q1, q9 - vsub.f32 q14, q2, q10 - vsub.f32 q15, q3, q11 - - @ load the 2nd set of values - vld4.32 {d0, d2, d4, d6}, [r1]! - vld4.32 {d1, d3, d5, d7}, [r1]! - vld4.32 {d16, d18, d20, d22}, [r2]! - vld4.32 {d17, d19, d21, d23}, [r2]! + vsub.f32 q12, q0, q8 + vsub.f32 q13, q1, q9 + vsub.f32 q14, q2, q10 + vsub.f32 q15, q3, q11 - ble .L_mainloopend_vec4 + ble .L_mainloopend_vec4 .L_mainloop_vec4: - @ store the result for the 1st/next (e.g. 3rd) set - vst4.32 {d24, d26, d28, d30}, [r0]! - vst4.32 {d25, d27, d29, d31}, [r0]! 
+ @ store the result for the current set + vst4.32 {d24, d26, d28, d30}, [r0]! + vst4.32 {d25, d27, d29, d31}, [r0]! - @ calculate values for the 2nd/next (e.g. 3rd) set - vsub.f32 q12, q0, q8 - vsub.f32 q13, q1, q9 - vsub.f32 q14, q2, q10 - vsub.f32 q15, q3, q11 + @ load the next set of values + vld4.32 {d0, d2, d4, d6}, [r1]! + vld4.32 {d1, d3, d5, d7}, [r1]! + vld4.32 {d16, d18, d20, d22}, [r2]! + vld4.32 {d17, d19, d21, d23}, [r2]! + subs r3, r3, #4 - @ load the next (e.g. 3rd) set of values - vld4.32 {d0, d2, d4, d6}, [r1]! - vld4.32 {d1, d3, d5, d7}, [r1]! - vld4.32 {d16, d18, d20, d22}, [r2]! - vld4.32 {d17, d19, d21, d23}, [r2]! - subs r3, r3, #4 + @ calculate values for the next set + vsub.f32 q12, q0, q8 + vsub.f32 q13, q1, q9 + vsub.f32 q14, q2, q10 + vsub.f32 q15, q3, q11 - bgt .L_mainloop_vec4 @ loop if r2 is > r3, if we have at least another 4 vectors (16 floats) to process + bgt .L_mainloop_vec4 @ loop if r3 > 0, if we have at least another 4 vectors (16 floats) to process .L_mainloopend_vec4: @ the last iteration for this call - @ store the result for the set of values before the last one (e.g 2nd set) - vst4.32 {d24, d26, d28, d30}, [r0]! - vst4.32 {d25, d27, d29, d31}, [r0]! - - @ calculate values for the last (e.g. 3rd) set - vsub.f32 q12, q0, q8 - vsub.f32 q13, q1, q9 - vsub.f32 q14, q2, q10 - vsub.f32 q15, q3, q11 - - @ store the result for the last (e.g. 3rd) set + @ store the result for the last set vst4.32 {d24, d26, d28, d30}, [r0]! vst4.32 {d25, d27, d29, d31}, [r0]! @@ -436,26 +384,17 @@ sub_vec4f_neon: .L_secondloop_vec4: @ process the last few items left in the input array - vld4.f32 {d0[0], d2[0], d4[0], d6[0]}, [r1]! @ The values are loaded like so: - @ q0 = { V1.x, -, -, - }; - @ q1 = { V1.y, -, -, - }; - @ q2 = { V1.z, -, -, - }; - @ q3 = { V1.w, -, -, - }; - vld4.f32 {d1[0], d3[0], d5[0], d7[0]}, [r2]! 
@ The values are loaded like so: - @ q0 = { V1.x, -, V2.x, - }; - @ q1 = { V1.y, -, V2.y, - }; - @ q2 = { V1.z, -, V2.z, - }; - @ q3 = { V1.w, -, V2.w, - }; + vld1.f32 {d0, d1}, [r1]! @ The values are loaded like so: + @ q0 = { V1.x, V1.y, V1.z, V1.w }; + vld1.f32 {d2, d3}, [r2]! @ The values are loaded like so: + @ q1 = { V2.x, V2.y, V2.z, V2.w }; subs r4, r4, #1 @ calculate values - vsub.f32 d0, d0, d1 - vsub.f32 d2, d2, d3 - vsub.f32 d4, d4, d5 - vsub.f32 d6, d6, d7 + vsub.f32 q0, q0, q1 - vst4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]! + vst1.32 {d0, d1}, [r0]! bgt .L_secondloop_vec4 -- 2.7.4