24 #ifndef __ARM_COMPUTE_NEASYMM_H__ 25 #define __ARM_COMPUTE_NEASYMM_H__ 73 template <
bool is_bounded_relu>
75 int result_fixedpoint_multiplier,
77 int32x4_t result_offset_after_shift_s32,
81 const static int32x4_t zero_s32 = vdupq_n_s32(0);
84 in_s32.val[0] = vqrdmulhq_n_s32(in_s32.val[0], result_fixedpoint_multiplier);
85 in_s32.val[1] = vqrdmulhq_n_s32(in_s32.val[1], result_fixedpoint_multiplier);
86 in_s32.val[2] = vqrdmulhq_n_s32(in_s32.val[2], result_fixedpoint_multiplier);
87 in_s32.val[3] = vqrdmulhq_n_s32(in_s32.val[3], result_fixedpoint_multiplier);
96 in_s32.val[0] = vaddq_s32(in_s32.val[0], result_offset_after_shift_s32);
97 in_s32.val[1] = vaddq_s32(in_s32.val[1], result_offset_after_shift_s32);
98 in_s32.val[2] = vaddq_s32(in_s32.val[2], result_offset_after_shift_s32);
99 in_s32.val[3] = vaddq_s32(in_s32.val[3], result_offset_after_shift_s32);
102 in_s32.val[0] = vmaxq_s32(in_s32.val[0], zero_s32);
103 in_s32.val[1] = vmaxq_s32(in_s32.val[1], zero_s32);
104 in_s32.val[2] = vmaxq_s32(in_s32.val[2], zero_s32);
105 in_s32.val[3] = vmaxq_s32(in_s32.val[3], zero_s32);
108 const int16x8x2_t in_s16 =
111 vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
112 vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))
117 uint8x16_t out_u8 = vcombine_u8(vqmovun_s16(in_s16.val[0]), vqmovun_s16(in_s16.val[1]));
121 out_u8 = vmaxq_u8(out_u8, min_u8);
122 out_u8 = vminq_u8(out_u8, max_u8);
129 #endif // __ARM_COMPUTE_NEASYMM_H__
uint8x8x2_t qasymm8x8x2_t
8-bit quantized asymmetric vector with 16 elements
This file contains all available output stages for GEMMLowp on OpenCL.
uint8x16_t vmlaq_qasymm8(qasymm8x16_t vd, float32x4_t vs, float32x4_t vo)
Perform a multiply-accumulate on all 16 components of a QASYMM8 vector.
uint8x8x3_t qasymm8x8x3_t
8-bit quantized asymmetric vector with 24 elements
uint8x8x4_t qasymm8x8x4_t
8-bit quantized asymmetric vector with 32 elements
uint8x8_t qasymm8x8_t
8-bit quantized asymmetric vector with 8 elements
int32x4_t rounding_divide_by_pow2(int32x4_t x, int exponent)
Divide x by a power of two (2^exponent), rounding the result to the nearest integer.
uint8x16_t finalize_quantization(int32x4x4_t &in_s32, int result_fixedpoint_multiplier, int32_t result_shift, int32x4_t result_offset_after_shift_s32, uint8x16_t min_u8, uint8x16_t max_u8)
Performs the final quantization step on 16 elements.
uint8x16_t qasymm8x16_t
8-bit quantized asymmetric vector with 16 elements