If the remainder is not evenly divisible by 4, subtracting 4 per iteration
steps past zero, so we'd miss the check for zero and continue the loop until
crashing. Change the branch to also stop when the remainder goes negative.
This more closely matches the SSE loop.
" vld1.16 {d20}, [%[a]]!\n"
" subs %[remainder], %[remainder], #4\n"
" vmlal.s16 q0, d16, d20\n"
" vld1.16 {d20}, [%[a]]!\n"
" subs %[remainder], %[remainder], #4\n"
" vmlal.s16 q0, d16, d20\n"
"4:"
" vadd.s32 d0, d0, d1\n"
" vpadd.s32 d0, d0, d0\n"
"4:"
" vadd.s32 d0, d0, d1\n"
" vpadd.s32 d0, d0, d0\n"
" subs %[remainder], %[remainder], #4\n"
" vmlal.s16 q0, d16, d24\n"
" vmlal.s16 q1, d20, d24\n"
" subs %[remainder], %[remainder], #4\n"
" vmlal.s16 q0, d16, d24\n"
" vmlal.s16 q1, d20, d24\n"
"4:"
" vld2.16 {d20[], d21[]}, [%[ic]]\n"
" vshrn.s32 d0, q0, #15\n"
"4:"
" vld2.16 {d20[], d21[]}, [%[ic]]\n"
" vshrn.s32 d0, q0, #15\n"
" subs %[remainder], %[remainder], #4\n"
" vmlal.s32 q0, d16, d20\n"
" vmlal.s32 q0, d17, d21\n"
" subs %[remainder], %[remainder], #4\n"
" vmlal.s32 q0, d16, d20\n"
" vmlal.s32 q0, d17, d21\n"
"4:"
" vadd.s64 d0, d0, d1\n"
" vqrshrn.s64 d0, q0, #31\n"
"4:"
" vadd.s64 d0, d0, d1\n"
" vqrshrn.s64 d0, q0, #31\n"
" vld1.32 {q10}, [%[a]]!\n"
" subs %[remainder], %[remainder], #4\n"
" vmla.f32 q0, q6, q10\n"
" vld1.32 {q10}, [%[a]]!\n"
" subs %[remainder], %[remainder], #4\n"
" vmla.f32 q0, q6, q10\n"
"4:"
" vadd.f32 d0, d0, d1\n"
" vpadd.f32 d0, d0, d0\n"
"4:"
" vadd.f32 d0, d0, d1\n"
" vpadd.f32 d0, d0, d0\n"