acc[k] = vld1q_f32(acc_buffer + i + 4 * k);
}
for (int k = 0; k < 4; k++) {
- acc[k] = vmaxq_f32(
- vdupq_n_f32(output_activation_min),
- vminq_f32(vdupq_n_f32(output_activation_max), acc[k]));
- }
- for (int k = 0; k < 4; k++) {
vst1q_f32(output_ptr + 4 * k, acc[k]);
}
output_ptr += 16;
for (; i <= num_output_values - 4; i += 4) {
float32x4_t acc = vld1q_f32(acc_buffer + i);
- acc = vmaxq_f32(vdupq_n_f32(output_activation_min),
- vminq_f32(vdupq_n_f32(output_activation_max), acc));
-
vst1q_f32(output_ptr, acc);
output_ptr += 4;
}