32 static const std::array<qint8x8_t, 4> exp_tab_qs8 =
45 static const std::array<qint16x4_t, 4> exp_tab_qs16 =
58 static const std::array<qint8x16_t, 4> exp_tabq_qs8 =
71 static const std::array<qint16x8_t, 4> exp_tabq_qs16 =
84 static const std::array<qint8x8_t, 4> log_tab_qs8 =
97 static const std::array<qint16x4_t, 4> log_tab_qs16 =
110 static const std::array<qint8x16_t, 4> log_tabq_qs8 =
123 static const std::array<qint16x8_t, 4> log_tabq_qs16 =
127 vdupq_n_s16(-0x56AE),
129 vdupq_n_s16(-0x0AA7),
135 return vget_low_s8(a);
140 return vget_low_s16(a);
145 return vget_high_s8(a);
150 return vget_high_s16(a);
155 return vld1_s8(addr);
160 return vld1_s16(addr);
165 return vld1q_s8(addr);
170 return vld1q_s16(addr);
175 return vld1_dup_s8(addr);
180 return vld1_dup_s16(addr);
185 return vld1q_dup_s8(addr);
190 return vld1q_dup_s16(addr);
195 return vld2q_s16(addr);
225 return vqmovn_s16(a);
230 return vqmovn_s32(a);
240 return vdup_n_s16(a);
245 return vdupq_n_s8(a);
276 return vdupq_n_s16(a);
281 return vdupq_n_s32(a);
321 return vqabsq_s16(a);
326 return vmax_s8(a, b);
331 return vmax_s16(a, b);
336 return vmaxq_s8(a, b);
341 return vpmax_s8(a, b);
346 return vpmax_s16(a, b);
351 return vmaxq_s16(a, b);
356 return vmin_s8(a, b);
361 return vmin_s16(a, b);
366 return vminq_s8(a, b);
371 return vpmin_s8(a, b);
376 return vpmin_s16(a, b);
381 return vminq_s16(a, b);
386 return vadd_s8(a, b);
391 return vadd_s16(a, b);
396 return vaddq_s8(a, b);
401 return vaddq_s16(a, b);
406 return vqadd_s8(a, b);
411 return vqadd_s16(a, b);
416 return vqadd_s32(a, b);
421 return vqaddq_s8(a, b);
426 return vqaddq_s16(a, b);
431 return vqaddq_s32(a, b);
441 return vsub_s8(a, b);
446 return vsub_s16(a, b);
451 return vsubq_s8(a, b);
456 return vsubq_s16(a, b);
461 return vqsub_s8(a, b);
466 return vqsub_s16(a, b);
471 return vqsubq_s8(a, b);
476 return vqsubq_s16(a, b);
481 const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
484 qint16x8_t res = vdupq_n_s16(1 << (fixed_point_position - 1));
487 res = vmlal_s8(res, a, b);
490 res = vshlq_s16(res, fixed_point_position_s16);
493 return vmovn_s16(res);
498 const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
501 qint32x4_t res = vdupq_n_s32(1 << (fixed_point_position - 1));
504 res = vmlal_s16(res, a, b);
507 res = vshlq_s32(res, fixed_point_position_s32);
510 return vmovn_s32(res);
515 const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
518 qint16x8_t res0 = vdupq_n_s16(1 << (fixed_point_position - 1));
522 res0 = vmlal_s8(res0, vget_low_s8(a), vget_low_s8(b));
523 res1 = vmlal_s8(res1, vget_high_s8(a), vget_high_s8(b));
526 res0 = vshlq_s16(res0, fixed_point_position_s16);
527 res1 = vshlq_s16(res1, fixed_point_position_s16);
530 return vcombine_s8(vmovn_s16(res0), vmovn_s16(res1));
535 const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
538 qint32x4_t res0 = vdupq_n_s32(1 << (fixed_point_position - 1));
546 res0 = vshlq_s32(res0, fixed_point_position_s32);
547 res1 = vshlq_s32(res1, fixed_point_position_s32);
550 return vcombine_s16(vmovn_s32(res0), vmovn_s32(res1));
555 const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
558 qint16x8_t res = vdupq_n_s16(1 << (fixed_point_position - 1));
561 res = vmlal_s8(res, a, b);
564 res = vqshlq_s16(res, fixed_point_position_s16);
567 return vqmovn_s16(res);
572 const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
575 qint32x4_t res = vdupq_n_s32(1 << (fixed_point_position - 1));
578 res = vmlal_s16(res, a, b);
581 res = vqshlq_s32(res, fixed_point_position_s32);
584 return vqmovn_s32(res);
589 const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
592 qint16x8_t res0 = vdupq_n_s16(1 << (fixed_point_position - 1));
596 res0 = vmlal_s8(res0, vget_low_s8(a), vget_low_s8(b));
597 res1 = vmlal_s8(res1, vget_high_s8(a), vget_high_s8(b));
600 res0 = vqshlq_s16(res0, fixed_point_position_s16);
601 res1 = vqshlq_s16(res1, fixed_point_position_s16);
604 return vcombine_s8(vqmovn_s16(res0), vqmovn_s16(res1));
609 const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
612 qint32x4_t res0 = vdupq_n_s32(1 << (fixed_point_position - 1));
620 res0 = vqshlq_s32(res0, fixed_point_position_s32);
621 res1 = vqshlq_s32(res1, fixed_point_position_s32);
624 return vcombine_s16(vqmovn_s32(res0), vqmovn_s32(res1));
629 const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
633 return vqrshlq_s16(res, fixed_point_position_s16);
638 const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
641 qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));
647 tmp = vmlal_s16(tmp, a, b); // accumulate onto the rounding constant (plain vmull_s16 discarded the 1 << (fpp-1) init above)
647 return vqshlq_s32(tmp, fixed_point_position_s32);
652 const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
655 qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));
658 tmp = vmlal_s8(tmp, b, c);
661 tmp = vshlq_s16(tmp, fixed_point_position_s16);
664 return vadd_s8(a, vmovn_s16(tmp));
669 const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
672 qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));
675 tmp = vmlal_s16(tmp, b, c);
678 tmp = vshlq_s32(tmp, fixed_point_position_s32);
681 return vadd_s16(a, vmovn_s32(tmp));
686 const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
689 qint16x8_t tmp0 = vdupq_n_s16(1 << (fixed_point_position - 1));
693 tmp0 = vmlal_s8(tmp0, vget_low_s8(b), vget_low_s8(c));
694 tmp1 = vmlal_s8(tmp1, vget_high_s8(b), vget_high_s8(c));
697 tmp0 = vshlq_s16(tmp0, fixed_point_position_s16);
698 tmp1 = vshlq_s16(tmp1, fixed_point_position_s16);
701 return vcombine_s8(vadd_s8(vget_low_s8(a), vmovn_s16(tmp0)), vadd_s8(vget_high_s8(a), vmovn_s16(tmp1)));
706 const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
709 qint32x4_t tmp0 = vdupq_n_s32(1 << (fixed_point_position - 1));
717 tmp0 = vshlq_s32(tmp0, fixed_point_position_s32);
718 tmp1 = vshlq_s32(tmp1, fixed_point_position_s32);
726 const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
729 qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));
732 tmp = vmlal_s8(tmp, b, c);
735 tmp = vqshlq_s16(tmp, fixed_point_position_s16);
738 return vqadd_s8(a, vqmovn_s16(tmp));
743 const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
746 qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));
749 tmp = vmlal_s16(tmp, b, c);
752 tmp = vqshlq_s32(tmp, fixed_point_position_s32);
755 return vqadd_s16(a, vqmovn_s32(tmp));
760 const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
763 qint16x8_t tmp0 = vdupq_n_s16(1 << (fixed_point_position - 1));
767 tmp0 = vmlal_s8(tmp0, vget_low_s8(b), vget_low_s8(c));
768 tmp1 = vmlal_s8(tmp1, vget_high_s8(b), vget_high_s8(c));
771 tmp0 = vqshlq_s16(tmp0, fixed_point_position_s16);
772 tmp1 = vqshlq_s16(tmp1, fixed_point_position_s16);
775 qint8x16_t res = vcombine_s8(vqmovn_s16(tmp0), vqmovn_s16(tmp1));
776 return vqaddq_s8(a, res);
781 const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
784 qint32x4_t tmp0 = vdupq_n_s32(1 << (fixed_point_position - 1));
792 tmp0 = vqshlq_s32(tmp0, fixed_point_position_s32);
793 tmp1 = vqshlq_s32(tmp1, fixed_point_position_s32);
796 qint16x8_t res = vcombine_s16(vqmovn_s32(tmp0), vqmovn_s32(tmp1));
797 return vqaddq_s16(a, res);
802 const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
805 qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));
808 tmp = vmlal_s8(tmp, b, c);
811 tmp = vshlq_s16(tmp, fixed_point_position_s16);
814 return vaddq_s16(a, tmp);
819 const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
822 qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));
825 tmp = vmlal_s16(tmp, b, c);
828 tmp = vshlq_s32(tmp, fixed_point_position_s32);
831 return vaddq_s32(a, tmp);
836 const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
839 qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));
842 tmp = vmlal_s8(tmp, b, c);
845 tmp = vqshlq_s16(tmp, fixed_point_position_s16);
848 return vqaddq_s16(a, tmp);
853 const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
856 qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));
859 tmp = vmlal_s16(tmp, b, c);
862 tmp = vqshlq_s32(tmp, fixed_point_position_s32);
865 return vqaddq_s32(a, tmp);
870 const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));
872 float32x4x2_t res_f32 =
875 vbslq_f32(vcgeq_f32(a.val[0], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
876 vbslq_f32(vcgeq_f32(a.val[1], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f))
880 res_f32.val[0] = vmlaq_f32(res_f32.val[0], a.val[0], pow2);
881 res_f32.val[1] = vmlaq_f32(res_f32.val[1], a.val[1], pow2);
883 const int32x4x2_t res_s32 =
886 vcvtq_s32_f32(res_f32.val[0]),
887 vcvtq_s32_f32(res_f32.val[1]),
891 const int16x8_t res_s16 = vcombine_s16(vqmovn_s32(res_s32.val[0]), vqmovn_s32(res_s32.val[1]));
893 return vqmovn_s16(res_s16);
898 const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));
900 float32x4_t res_f32 = vbslq_f32(vcgeq_f32(a, vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f));
902 res_f32 = vmlaq_f32(res_f32, a, pow2);
904 const int32x4_t res_s32 = vcvtq_s32_f32(res_f32);
906 return vqmovn_s32(res_s32);
911 const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));
913 float32x4x4_t res_f32 =
916 vbslq_f32(vcgeq_f32(a.val[0], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
917 vbslq_f32(vcgeq_f32(a.val[1], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
918 vbslq_f32(vcgeq_f32(a.val[2], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
919 vbslq_f32(vcgeq_f32(a.val[3], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f))
923 res_f32.val[0] = vmlaq_f32(res_f32.val[0], a.val[0], pow2);
924 res_f32.val[1] = vmlaq_f32(res_f32.val[1], a.val[1], pow2);
925 res_f32.val[2] = vmlaq_f32(res_f32.val[2], a.val[2], pow2);
926 res_f32.val[3] = vmlaq_f32(res_f32.val[3], a.val[3], pow2);
928 const int32x4x4_t res_s32 =
931 vcvtq_s32_f32(res_f32.val[0]),
932 vcvtq_s32_f32(res_f32.val[1]),
933 vcvtq_s32_f32(res_f32.val[2]),
934 vcvtq_s32_f32(res_f32.val[3]),
938 const int16x8x2_t res_s16 =
941 vcombine_s16(vqmovn_s32(res_s32.val[0]), vqmovn_s32(res_s32.val[1])),
942 vcombine_s16(vqmovn_s32(res_s32.val[2]), vqmovn_s32(res_s32.val[3])),
946 return vcombine_s8(vqmovn_s16(res_s16.val[0]), vqmovn_s16(res_s16.val[1]));
951 const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));
953 float32x4x2_t res_f32 =
956 vbslq_f32(vcgeq_f32(a.val[0], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
957 vbslq_f32(vcgeq_f32(a.val[1], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f))
961 res_f32.val[0] = vmlaq_f32(res_f32.val[0], a.val[0], pow2);
962 res_f32.val[1] = vmlaq_f32(res_f32.val[1], a.val[1], pow2);
964 const int32x4x2_t res_s32 =
967 vcvtq_s32_f32(res_f32.val[0]),
968 vcvtq_s32_f32(res_f32.val[1])
972 return vcombine_s16(vqmovn_s32(res_s32.val[0]), vqmovn_s32(res_s32.val[1]));
977 const float32x4_t pow2 = vdupq_n_f32(1.0f / (1 << fixed_point_position));
979 const int16x8_t res_s16 = vmovl_s8(a);
981 const int32x4x2_t res_s32 =
989 float32x4x2_t res_f32 =
992 vcvtq_f32_s32(res_s32.val[0]),
993 vcvtq_f32_s32(res_s32.val[1])
997 res_f32.val[0] = vmulq_f32(res_f32.val[0], pow2);
998 res_f32.val[1] = vmulq_f32(res_f32.val[1], pow2);
1005 const float32x4_t pow2 = vdupq_n_f32(1.0f / (1 << fixed_point_position));
1006 const float32x4_t res_f32 = vcvtq_f32_s32(vmovl_s16(a));
1008 return vmulq_f32(res_f32, pow2);
1013 const float32x4_t pow2 = vdupq_n_f32(1.0f / (1 << fixed_point_position));
1015 const int16x8x2_t res_s16 =
1018 vmovl_s8(vget_low_s8(a)),
1019 vmovl_s8(vget_high_s8(a)),
1023 const int32x4x4_t res_s32 =
1033 float32x4x4_t res_f32 =
1036 vcvtq_f32_s32(res_s32.val[0]),
1037 vcvtq_f32_s32(res_s32.val[1]),
1038 vcvtq_f32_s32(res_s32.val[2]),
1039 vcvtq_f32_s32(res_s32.val[3])
1043 res_f32.val[0] = vmulq_f32(res_f32.val[0], pow2);
1044 res_f32.val[1] = vmulq_f32(res_f32.val[1], pow2);
1045 res_f32.val[2] = vmulq_f32(res_f32.val[2], pow2);
1046 res_f32.val[3] = vmulq_f32(res_f32.val[3], pow2);
1053 const float32x4_t pow2 = vdupq_n_f32(1.0f / (1 << fixed_point_position));
1055 const int32x4x2_t res_s32 =
1063 float32x4x2_t res_f32 =
1066 vcvtq_f32_s32(res_s32.val[0]),
1067 vcvtq_f32_s32(res_s32.val[1])
1071 res_f32.val[0] = vmulq_f32(res_f32.val[0], pow2);
1072 res_f32.val[1] = vmulq_f32(res_f32.val[1], pow2);
1080 const qint8x8_t const_48_over_17 = vdup_n_s8(0x5A >> (5 - fixed_point_position));
1081 const qint8x8_t const_32_over_17 = vdup_n_s8((0x3C >> (5 - fixed_point_position)));
1082 const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position);
1083 const qint8x8_t const_two = vdup_n_s8(2 << fixed_point_position);
1086 const qint8x8_t shift_value = vneg_s8(vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));
1087 const qint8x8_t temp = vshl_s8(a, shift_value);
1090 qint8x8_t x = vsub_s8(const_48_over_17,
vmul_qs8(temp, const_32_over_17, fixed_point_position));
1092 uint8x8_t set_one = vcgt_s8(x, const_one);
1093 x = vbsl_s8(set_one, const_one, x);
1096 x =
vmul_qs8(x, vsub_s8(const_two,
vmul_qs8(temp, x, fixed_point_position)), fixed_point_position);
1097 x =
vmul_qs8(x, vsub_s8(const_two,
vmul_qs8(temp, x, fixed_point_position)), fixed_point_position);
1098 x =
vmul_qs8(x, vsub_s8(const_two,
vmul_qs8(temp, x, fixed_point_position)), fixed_point_position);
1100 return vshl_s8(x, shift_value);
1106 const qint16x4_t const_48_over_17 = vdup_n_s16(0x5A5A >> (13 - fixed_point_position));
1107 const qint16x4_t const_32_over_17 = vdup_n_s16(0x3C3C >> (13 - fixed_point_position));
1108 const qint16x4_t const_one = vdup_n_s16(1 << fixed_point_position);
1109 const qint16x4_t const_two = vdup_n_s16(2 << fixed_point_position);
1112 const qint16x4_t shift_value = vneg_s16(vsub_s16(vdup_n_s16(16), vadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position)))); // 16-bit lanes have 16 leading bits (was 8 — copy-paste from the qs8 path; quad variant already uses 16)
1113 const qint16x4_t temp = vshl_s16(a, shift_value);
1116 qint16x4_t x = vsub_s16(const_48_over_17,
vmul_qs16(temp, const_32_over_17, fixed_point_position));
1118 uint16x4_t set_one = vcgt_s16(x, const_one);
1119 x = vbsl_s16(set_one, const_one, x);
1122 x =
vmul_qs16(x, vsub_s16(const_two,
vmul_qs16(temp, x, fixed_point_position)), fixed_point_position);
1123 x =
vmul_qs16(x, vsub_s16(const_two,
vmul_qs16(temp, x, fixed_point_position)), fixed_point_position);
1124 x =
vmul_qs16(x, vsub_s16(const_two,
vmul_qs16(temp, x, fixed_point_position)), fixed_point_position);
1125 x =
vmul_qs16(x, vsub_s16(const_two,
vmul_qs16(temp, x, fixed_point_position)), fixed_point_position);
1127 return vshl_s16(x, shift_value);
1133 const qint8x8_t const_48_over_17 = vdup_n_s8(0x5A >> (5 - fixed_point_position));
1134 const qint8x8_t const_32_over_17 = vdup_n_s8((0x3C >> (5 - fixed_point_position)));
1135 const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position);
1136 const qint8x8_t const_two = vdup_n_s8(2 << fixed_point_position);
1139 const qint8x8_t shift_value = vqneg_s8(vqsub_s8(vdup_n_s8(8), vqadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));
1140 const qint8x8_t temp = vqshl_s8(a, shift_value);
1143 qint8x8_t x = vqsub_s8(const_48_over_17,
vqmul_qs8(temp, const_32_over_17, fixed_point_position));
1145 uint8x8_t set_one = vcgt_s8(x, const_one);
1146 x = vbsl_s8(set_one, const_one, x);
1149 x =
vqmul_qs8(x, vqsub_s8(const_two,
vqmul_qs8(temp, x, fixed_point_position)), fixed_point_position);
1150 x =
vqmul_qs8(x, vqsub_s8(const_two,
vqmul_qs8(temp, x, fixed_point_position)), fixed_point_position);
1151 x =
vqmul_qs8(x, vqsub_s8(const_two,
vqmul_qs8(temp, x, fixed_point_position)), fixed_point_position);
1153 return vqshl_s8(x, shift_value);
1159 const qint16x4_t const_48_over_17 = vdup_n_s16(0x5A5A >> (13 - fixed_point_position));
1160 const qint16x4_t const_32_over_17 = vdup_n_s16(0x3C3C >> (13 - fixed_point_position));
1161 const qint16x4_t const_one = vdup_n_s16(1 << fixed_point_position);
1162 const qint16x4_t const_two = vdup_n_s16(2 << fixed_point_position);
1165 const qint16x4_t shift_value = vqneg_s16(vqsub_s16(vdup_n_s16(16), vqadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position)))); // 16-bit lanes have 16 leading bits (was 8 — copy-paste from the qs8 path; saturating quad variant already uses 16)
1166 const qint16x4_t temp = vqshl_s16(a, shift_value);
1169 qint16x4_t x = vqsub_s16(const_48_over_17,
vqmul_qs16(temp, const_32_over_17, fixed_point_position));
1171 uint16x4_t set_one = vcgt_s16(x, const_one);
1172 x = vbsl_s16(set_one, const_one, x);
1175 x =
vqmul_qs16(x, vqsub_s16(const_two,
vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position);
1176 x =
vqmul_qs16(x, vqsub_s16(const_two,
vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position);
1177 x =
vqmul_qs16(x, vqsub_s16(const_two,
vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position);
1178 x =
vqmul_qs16(x, vqsub_s16(const_two,
vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position);
1180 return vqshl_s16(x, shift_value);
1186 const qint8x16_t const_48_over_17 = vdupq_n_s8(0x5A >> (5 - fixed_point_position));
1187 const qint8x16_t const_32_over_17 = vdupq_n_s8((0x3C >> (5 - fixed_point_position)));
1188 const qint8x16_t const_one = vdupq_n_s8(1 << fixed_point_position);
1189 const qint8x16_t const_two = vdupq_n_s8(2 << fixed_point_position);
1192 const qint8x16_t shift_value = vnegq_s8(vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));
1193 const qint8x16_t temp = vshlq_s8(a, shift_value);
1199 uint8x16_t set_one = vcgtq_s8(x, const_one);
1200 x = vbslq_s8(set_one, const_one, x);
1203 x =
vmulq_qs8(x, vsubq_s8(const_two,
vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position);
1204 x =
vmulq_qs8(x, vsubq_s8(const_two,
vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position);
1205 x =
vmulq_qs8(x, vsubq_s8(const_two,
vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position);
1207 return vshlq_s8(x, shift_value);
1213 const qint16x8_t const_48_over_17 = vdupq_n_s16(0x5A5A >> (13 - fixed_point_position)); // 48/17 in Q13 = 23130 = 0x5A5A (0x5A56 was a typo; scalar variants use 0x5A5A)
1214 const qint16x8_t const_32_over_17 = vdupq_n_s16(0x3C3C >> (13 - fixed_point_position));
1215 const qint16x8_t const_one = vdupq_n_s16(1 << fixed_point_position);
1216 const qint16x8_t const_two = vdupq_n_s16(2 << fixed_point_position);
1219 const qint16x8_t shift_value = vnegq_s16(vsubq_s16(vdupq_n_s16(16), vaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));
1220 const qint16x8_t temp = vshlq_s16(a, shift_value);
1226 uint16x8_t set_one = vcgtq_s16(x, const_one);
1227 x = vbslq_s16(set_one, const_one, x);
1230 x =
vmulq_qs16(x, vsubq_s16(const_two,
vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position);
1231 x =
vmulq_qs16(x, vsubq_s16(const_two,
vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position);
1232 x =
vmulq_qs16(x, vsubq_s16(const_two,
vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position);
1233 x =
vmulq_qs16(x, vsubq_s16(const_two,
vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position);
1235 return vshlq_s16(x, shift_value);
1241 const qint8x16_t const_48_over_17 = vdupq_n_s8(0x5A >> (5 - fixed_point_position));
1242 const qint8x16_t const_32_over_17 = vdupq_n_s8((0x3C >> (5 - fixed_point_position)));
1243 const qint8x16_t const_one = vdupq_n_s8(1 << fixed_point_position);
1244 const qint8x16_t const_two = vdupq_n_s8(2 << fixed_point_position);
1247 const qint8x16_t shift_value = vqnegq_s8(vqsubq_s8(vdupq_n_s8(8), vqaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));
1248 const qint8x16_t temp = vqshlq_s8(a, shift_value);
1254 uint8x16_t set_one = vcgtq_s8(x, const_one);
1255 x = vbslq_s8(set_one, const_one, x);
1258 x =
vqmulq_qs8(x, vqsubq_s8(const_two,
vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position);
1259 x =
vqmulq_qs8(x, vqsubq_s8(const_two,
vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position);
1260 x =
vqmulq_qs8(x, vqsubq_s8(const_two,
vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position);
1262 return vqshlq_s8(x, shift_value);
1268 const qint16x8_t const_48_over_17 = vdupq_n_s16(0x5A5A >> (13 - fixed_point_position)); // 48/17 in Q13 = 23130 = 0x5A5A (0x5A56 was a typo; scalar variants use 0x5A5A)
1269 const qint16x8_t const_32_over_17 = vdupq_n_s16(0x3C3C >> (13 - fixed_point_position));
1270 const qint16x8_t const_one = vdupq_n_s16(1 << fixed_point_position);
1271 const qint16x8_t const_two = vdupq_n_s16(2 << fixed_point_position);
1274 const qint16x8_t shift_value = vqnegq_s16(vqsubq_s16(vdupq_n_s16(16), vqaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));
1275 const qint16x8_t temp = vqshlq_s16(a, shift_value);
1281 uint16x8_t set_one = vcgtq_s16(x, const_one);
1282 x = vbslq_s16(set_one, const_one, x);
1285 x =
vqmulq_qs16(x, vqsubq_s16(const_two,
vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position);
1286 x =
vqmulq_qs16(x, vqsubq_s16(const_two,
vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position);
1287 x =
vqmulq_qs16(x, vqsubq_s16(const_two,
vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position);
1288 x =
vqmulq_qs16(x, vqsubq_s16(const_two,
vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position);
1314 template <
bool islog>
1317 const qint8x8_t shift_value = vdup_n_s8(-(7 - fixed_point_position));
1318 const qint8x8_t const_one = vdup_n_s8(1);
1319 const qint8x8_t A = vrshl_s8(islog ? log_tab_qs8[0] : exp_tab_qs8[0], islog ? vadd_s8(shift_value, const_one) : shift_value);
1320 const qint8x8_t B = vrshl_s8(islog ? log_tab_qs8[1] : exp_tab_qs8[1], shift_value);
1321 const qint8x8_t C = vrshl_s8(islog ? log_tab_qs8[2] : exp_tab_qs8[2], shift_value);
1322 const qint8x8_t D = vrshl_s8(islog ? log_tab_qs8[3] : exp_tab_qs8[3], shift_value);
1330 template <
bool islog>
1333 const qint16x4_t shift_value = vdup_n_s16(-(15 - fixed_point_position));
1335 const qint16x4_t A = vrshl_s16(islog ? log_tab_qs16[0] : exp_tab_qs16[0], islog ? vadd_s16(shift_value, const_one) : shift_value);
1336 const qint16x4_t B = vrshl_s16(islog ? log_tab_qs16[1] : exp_tab_qs16[1], shift_value);
1337 const qint16x4_t C = vrshl_s16(islog ? log_tab_qs16[2] : exp_tab_qs16[2], shift_value);
1338 const qint16x4_t D = vrshl_s16(islog ? log_tab_qs16[3] : exp_tab_qs16[3], shift_value);
1346 template <
bool islog>
1349 const qint8x8_t shift_value = vdup_n_s8(-(7 - fixed_point_position));
1350 const qint8x8_t const_one = vdup_n_s8(1);
1351 const qint8x8_t A = vqrshl_s8(islog ? log_tab_qs8[0] : exp_tab_qs8[0], islog ? vqadd_s8(shift_value, const_one) : shift_value);
1352 const qint8x8_t B = vqrshl_s8(islog ? log_tab_qs8[1] : exp_tab_qs8[1], shift_value);
1353 const qint8x8_t C = vqrshl_s8(islog ? log_tab_qs8[2] : exp_tab_qs8[2], shift_value);
1354 const qint8x8_t D = vqrshl_s8(islog ? log_tab_qs8[3] : exp_tab_qs8[3], shift_value);
1362 template <
bool islog>
1365 const qint16x4_t shift_value = vdup_n_s16(-(15 - fixed_point_position));
1367 const qint16x4_t A = vqrshl_s16(islog ? log_tab_qs16[0] : exp_tab_qs16[0], islog ? vqadd_s16(shift_value, const_one) : shift_value);
1368 const qint16x4_t B = vqrshl_s16(islog ? log_tab_qs16[1] : exp_tab_qs16[1], shift_value);
1369 const qint16x4_t C = vqrshl_s16(islog ? log_tab_qs16[2] : exp_tab_qs16[2], shift_value);
1370 const qint16x4_t D = vqrshl_s16(islog ? log_tab_qs16[3] : exp_tab_qs16[3], shift_value);
1378 template <
bool islog>
1381 const qint8x16_t shift_value = vdupq_n_s8(-(7 - fixed_point_position));
1383 const qint8x16_t A = vrshlq_s8(islog ? log_tabq_qs8[0] : exp_tabq_qs8[0], islog ? vaddq_s8(shift_value, const_one) : shift_value);
1384 const qint8x16_t B = vrshlq_s8(islog ? log_tabq_qs8[1] : exp_tabq_qs8[1], shift_value);
1385 const qint8x16_t C = vrshlq_s8(islog ? log_tabq_qs8[2] : exp_tabq_qs8[2], shift_value);
1386 const qint8x16_t D = vrshlq_s8(islog ? log_tabq_qs8[3] : exp_tabq_qs8[3], shift_value);
1394 template <
bool islog>
1397 const qint16x8_t shift_value = vdupq_n_s16(-(15 - fixed_point_position));
1399 const qint16x8_t A = vrshlq_s16(islog ? log_tabq_qs16[0] : exp_tabq_qs16[0], islog ? vaddq_s16(shift_value, const_one) : shift_value);
1400 const qint16x8_t B = vrshlq_s16(islog ? log_tabq_qs16[1] : exp_tabq_qs16[1], shift_value);
1401 const qint16x8_t C = vrshlq_s16(islog ? log_tabq_qs16[2] : exp_tabq_qs16[2], shift_value);
1402 const qint16x8_t D = vrshlq_s16(islog ? log_tabq_qs16[3] : exp_tabq_qs16[3], shift_value);
1410 template <
bool islog>
1413 const qint8x16_t shift_value = vdupq_n_s8(-(7 - fixed_point_position));
1415 const qint8x16_t A = vqrshlq_s8(islog ? log_tabq_qs8[0] : exp_tabq_qs8[0], islog ? vqaddq_s8(shift_value, const_one) : shift_value);
1416 const qint8x16_t B = vqrshlq_s8(islog ? log_tabq_qs8[1] : exp_tabq_qs8[1], shift_value);
1417 const qint8x16_t C = vqrshlq_s8(islog ? log_tabq_qs8[2] : exp_tabq_qs8[2], shift_value);
1418 const qint8x16_t D = vqrshlq_s8(islog ? log_tabq_qs8[3] : exp_tabq_qs8[3], shift_value);
1426 template <
bool islog>
1429 const qint16x8_t shift_value = vdupq_n_s16(-(15 - fixed_point_position));
1431 const qint16x8_t A = vqrshlq_s16(islog ? log_tabq_qs16[0] : exp_tabq_qs16[0], islog ? vqaddq_s16(shift_value, const_one) : shift_value);
1432 const qint16x8_t B = vqrshlq_s16(islog ? log_tabq_qs16[1] : exp_tabq_qs16[1], shift_value);
1433 const qint16x8_t C = vqrshlq_s16(islog ? log_tabq_qs16[2] : exp_tabq_qs16[2], shift_value);
1434 const qint16x8_t D = vqrshlq_s16(islog ? log_tabq_qs16[3] : exp_tabq_qs16[3], shift_value);
1444 const qint8x8_t shift_value = vdup_n_s8(fixed_point_position - 7);
1445 const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position);
1446 const qint8x8_t const_ln2 = vqrshl_s8(vdup_n_s8(0x58), shift_value);
1447 const qint8x8_t const_inv_ln2 = vorr_s8(vqrshl_s8(vdup_n_s8(0x38), shift_value), const_one);
1453 const qint8x8_t dec_m = vqshl_s8(m, vdup_n_s8(-fixed_point_position));
1455 qint8x8_t alpha =
vqmul_qs8(vqshl_s8(dec_m, vdup_n_s8(fixed_point_position)), const_ln2, fixed_point_position);
1459 qint8x8_t poly = vqtaylor_poly_qs8<false>(
alpha, fixed_point_position);
1460 poly = vqadd_s8(poly, const_one);
1463 poly = vqshl_s8(poly, dec_m);
1470 const qint16x4_t shift_value = vdup_n_s16(fixed_point_position - 15);
1471 const qint16x4_t const_one = vdup_n_s16(1 << fixed_point_position);
1472 const qint16x4_t const_ln2 = vqrshl_s16(vdup_n_s16(0x58B9), shift_value);
1473 const qint16x4_t const_inv_ln2 = vorr_s16(vqrshl_s16(vdup_n_s16(0x38AA), shift_value), const_one);
1479 const qint16x4_t dec_m = vqshl_s16(m, vdup_n_s16(-fixed_point_position));
1485 qint16x4_t poly = vqtaylor_poly_qs16<false>(
alpha, fixed_point_position);
1486 poly = vqadd_s16(poly, const_one);
1489 poly = vqshl_s16(poly, dec_m);
1496 const qint8x16_t shift_value = vdupq_n_s8(fixed_point_position - 7);
1497 const qint8x16_t const_one = vdupq_n_s8(1 << fixed_point_position);
1498 const qint8x16_t const_ln2 = vqrshlq_s8(vdupq_n_s8(0x58), shift_value);
1499 const qint8x16_t const_inv_ln2 = vorrq_s8(vqrshlq_s8(vdupq_n_s8(0x38), shift_value), const_one);
1505 const qint8x16_t dec_m = vqshlq_s8(m, vdupq_n_s8(-fixed_point_position));
1511 qint8x16_t poly = vqtaylor_polyq_qs8<false>(
alpha, fixed_point_position);
1512 poly = vqaddq_s8(poly, const_one);
1515 poly = vqshlq_s8(poly, dec_m);
1522 const qint16x8_t shift_value = vdupq_n_s16(fixed_point_position - 15);
1523 const qint16x8_t const_one = vdupq_n_s16(1 << fixed_point_position);
1524 const qint16x8_t const_ln2 = vqrshlq_s16(vdupq_n_s16(0x58B9), shift_value);
1525 const qint16x8_t const_inv_ln2 = vorrq_s16(vqrshlq_s16(vdupq_n_s16(0x38AA), shift_value), const_one);
1531 const qint16x8_t dec_m = vqshlq_s16(m, vdupq_n_s16(-fixed_point_position));
1537 qint16x8_t poly = vqtaylor_polyq_qs16<false>(
alpha, fixed_point_position);
1538 poly = vqaddq_s16(poly, const_one);
1541 poly = vqshlq_s16(poly, dec_m);
1548 const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position);
1549 const qint8x8_t const_seven_dec = vdup_n_s8(7);
1550 const qint8x8_t const_ln2 = vdup_n_s8(0x58 >> (7 - fixed_point_position));
1553 uint8x8_t calc_reciprocal = vclt_s8(a, const_one);
1555 recip = vbsl_s8(calc_reciprocal, recip, a);
1558 recip =
vrecip_qs8(recip, fixed_point_position);
1559 a = vbsl_s8(calc_reciprocal, recip, a);
1562 qint8x8_t shift_value = vdup_n_s8(-fixed_point_position);
1563 qint8x8_t dec_a = vshl_s8(a, shift_value);
1566 shift_value = vsub_s8(const_seven_dec, vclz_s8(dec_a));
1569 const qint8x8_t shift_value_neg = vneg_s8(shift_value);
1570 const qint8x8_t temp = vsub_s8(vrshl_s8(a, shift_value_neg), const_one);
1574 qint8x8_t poly = vtaylor_poly_qs8<true>(temp, fixed_point_position);
1577 poly =
vmul_qs8(vadd_s8(poly, sum), const_ln2, fixed_point_position);
1580 poly = vbsl_s8(calc_reciprocal, vneg_s8(poly), poly);
1587 const qint16x4_t const_one = vdup_n_s16(1 << fixed_point_position);
1588 const qint16x4_t const_fifteen_dec = vdup_n_s16(15);
1589 const qint16x4_t const_ln2 = vdup_n_s16(0x58B9 >> (15 - fixed_point_position));
1592 uint16x4_t calc_reciprocal = vclt_s16(a, const_one);
1594 recip = vbsl_s16(calc_reciprocal, recip, a);
1598 a = vbsl_s16(calc_reciprocal, recip, a);
1601 qint16x4_t shift_value = vdup_n_s16(-fixed_point_position);
1605 shift_value = vsub_s16(const_fifteen_dec, vclz_s16(dec_a));
1608 const qint16x4_t shift_value_neg = vneg_s16(shift_value);
1609 const qint16x4_t temp = vsub_s16(vrshl_s16(a, shift_value_neg), const_one);
1613 qint16x4_t poly = vtaylor_poly_qs16<true>(temp, fixed_point_position);
1616 poly =
vmul_qs16(vadd_s16(poly, sum), const_ln2, fixed_point_position);
1619 poly = vbsl_s16(calc_reciprocal, vneg_s16(poly), poly);
1626 const qint8x16_t const_one = vdupq_n_s8(1 << fixed_point_position);
1627 const qint8x16_t const_seven_dec = vdupq_n_s8(7);
1628 const qint8x16_t const_ln2 = vdupq_n_s8(0x58 >> (7 - fixed_point_position));
1631 uint8x16_t calc_reciprocal = vcltq_s8(a, const_one);
1633 recip = vbslq_s8(calc_reciprocal, a, recip);
1637 a = vbslq_s8(calc_reciprocal, recip, a);
1640 qint8x16_t shift_value = vdupq_n_s8(-fixed_point_position);
1644 shift_value = vsubq_s8(const_seven_dec, vclzq_s8(dec_a));
1647 const qint8x16_t shift_value_neg = vnegq_s8(shift_value);
1648 const qint8x16_t temp = vsubq_s8(vrshlq_s8(a, shift_value_neg), const_one);
1652 qint8x16_t poly = vtaylor_polyq_qs8<true>(temp, fixed_point_position);
1655 poly =
vmulq_qs8(vaddq_s8(poly, sum), const_ln2, fixed_point_position);
1658 poly = vbslq_s8(calc_reciprocal, vnegq_s8(poly), poly);
1665 const qint16x8_t const_one = vdupq_n_s16(1 << fixed_point_position);
1666 const qint16x8_t const_fifteen_dec = vdupq_n_s16(15);
1667 const qint16x8_t const_ln2 = vdupq_n_s16(0x58B9 >> (15 - fixed_point_position));
1670 uint16x8_t calc_reciprocal = vcltq_s16(a, const_one);
1672 recip = vbslq_s16(calc_reciprocal, a, recip);
1676 a = vbslq_s16(calc_reciprocal, recip, a);
1679 qint16x8_t shift_value = vdupq_n_s16(-fixed_point_position);
1680 qint16x8_t dec_a = vshlq_s16(a, shift_value);
1683 shift_value = vqsubq_s16(const_fifteen_dec, vclzq_s16(dec_a));
1686 const qint16x8_t shift_value_neg = vnegq_s16(shift_value);
1687 const qint16x8_t temp = vqsubq_s16(vrshlq_s16(a, shift_value_neg), const_one);
1691 qint16x8_t poly = vtaylor_polyq_qs16<true>(temp, fixed_point_position);
1694 poly =
vqmulq_qs16(vqaddq_s16(poly, sum), const_ln2, fixed_point_position);
1697 poly = vbslq_s16(calc_reciprocal, vnegq_s16(poly), poly);
1704 const qint8x8_t const_three = vdup_n_s8(3 << fixed_point_position);
1707 qint8x8_t shift_value = vneg_s8(vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));
1710 qint8x8_t temp = vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position)));
1711 uint8x8_t temp_ltz = vclt_s8(temp,
vdup_n_qs8(0));
1712 temp = vbsl_s8(temp_ltz, vadd_s8(temp, vdup_n_s8(1)), temp);
1713 qint8x8_t shift_value2 = vneg_s8(vshr_n_s8(temp, 1));
1715 temp = vshl_s8(a, shift_value);
1722 x = vshr_n_s8(
vmul_qs8(x, vsub_s8(const_three,
vmul_qs8(temp,
vmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1723 x = vshr_n_s8(
vmul_qs8(x, vsub_s8(const_three,
vmul_qs8(temp,
vmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1724 x = vshr_n_s8(
vmul_qs8(x, vsub_s8(const_three,
vmul_qs8(temp,
vmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1726 return vshl_s8(x, shift_value2);
1731 const qint16x4_t const_three = vdup_n_s16(3 << fixed_point_position);
1734 qint16x4_t shift_value = vneg_s16(vsub_s16(vdup_n_s16(16), vadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));
1737 qint16x4_t temp = vsub_s16(vdup_n_s16(16), vadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position)));
1738 uint16x4_t temp_ltz = vclt_s16(temp,
vdup_n_qs16(0));
1739 temp = vbsl_s16(temp_ltz, vadd_s16(temp, vdup_n_s16(1)), temp);
1740 qint16x4_t shift_value2 = vneg_s16(vshr_n_s16(temp, 1));
1742 temp = vshl_s16(a, shift_value);
1749 x = vshr_n_s16(
vmul_qs16(x, vsub_s16(const_three,
vmul_qs16(temp,
vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1750 x = vshr_n_s16(
vmul_qs16(x, vsub_s16(const_three,
vmul_qs16(temp,
vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1751 x = vshr_n_s16(
vmul_qs16(x, vsub_s16(const_three,
vmul_qs16(temp,
vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1752 x = vshr_n_s16(
vmul_qs16(x, vsub_s16(const_three,
vmul_qs16(temp,
vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1753 x = vshr_n_s16(
vmul_qs16(x, vsub_s16(const_three,
vmul_qs16(temp,
vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1755 return vshl_s16(x, shift_value2);
1760 const qint8x8_t const_three = vdup_n_s8(3 << fixed_point_position);
1763 qint8x8_t shift_value = vqneg_s8(vqsub_s8(vdup_n_s8(8), vqadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));
1766 qint8x8_t temp = vqsub_s8(vdup_n_s8(8), vqadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position)));
1767 uint8x8_t temp_ltz = vclt_s8(temp,
vdup_n_qs8(0));
1768 temp = vbsl_s8(temp_ltz, vqadd_s8(temp, vdup_n_s8(1)), temp);
1769 qint8x8_t shift_value2 = vqneg_s8(vshr_n_s8(temp, 1));
1771 temp = vqshl_s8(a, shift_value);
1778 x = vshr_n_s8(
vqmul_qs8(x, vqsub_s8(const_three,
vqmul_qs8(temp,
vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1779 x = vshr_n_s8(
vqmul_qs8(x, vqsub_s8(const_three,
vqmul_qs8(temp,
vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1780 x = vshr_n_s8(
vqmul_qs8(x, vqsub_s8(const_three,
vqmul_qs8(temp,
vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1782 return vqshl_s8(x, shift_value2);
1787 const qint16x4_t const_three = vdup_n_s16(3 << fixed_point_position);
1790 qint16x4_t shift_value = vqneg_s16(vqsub_s16(vdup_n_s16(16), vqadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));
1793 qint16x4_t temp = vqsub_s16(vdup_n_s16(16), vqadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position)));
1794 uint16x4_t temp_ltz = vclt_s16(temp,
vdup_n_qs16(0));
1795 temp = vbsl_s16(temp_ltz, vqadd_s16(temp, vdup_n_s16(1)), temp);
1796 qint16x4_t shift_value2 = vqneg_s16(vshr_n_s16(temp, 1));
1798 temp = vqshl_s16(a, shift_value);
1805 x = vshr_n_s16(
vqmul_qs16(x, vqsub_s16(const_three,
vqmul_qs16(temp,
vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1806 x = vshr_n_s16(
vqmul_qs16(x, vqsub_s16(const_three,
vqmul_qs16(temp,
vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1807 x = vshr_n_s16(
vqmul_qs16(x, vqsub_s16(const_three,
vqmul_qs16(temp,
vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1808 x = vshr_n_s16(
vqmul_qs16(x, vqsub_s16(const_three,
vqmul_qs16(temp,
vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1809 x = vshr_n_s16(
vqmul_qs16(x, vqsub_s16(const_three,
vqmul_qs16(temp,
vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1811 return vqshl_s16(x, shift_value2);
1816 const qint8x16_t const_three = vdupq_n_s8(3 << fixed_point_position);
1819 qint8x16_t shift_value = vnegq_s8(vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));
1822 qint8x16_t temp = vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position)));
1823 uint8x16_t temp_ltz = vcltq_s8(temp,
vdupq_n_qs8(0));
1824 temp = vbslq_s8(temp_ltz, vaddq_s8(temp, vdupq_n_s8(1)), temp);
1825 qint8x16_t shift_value2 = vnegq_s8(vshrq_n_s8(temp, 1));
1827 temp = vshlq_s8(a, shift_value);
1834 x = vshrq_n_s8(
vmulq_qs8(x, vsubq_s8(const_three,
vmulq_qs8(temp,
vmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1835 x = vshrq_n_s8(
vmulq_qs8(x, vsubq_s8(const_three,
vmulq_qs8(temp,
vmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1836 x = vshrq_n_s8(
vmulq_qs8(x, vsubq_s8(const_three,
vmulq_qs8(temp,
vmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1838 return vshlq_s8(x, shift_value2);
1843 const qint16x8_t const_three = vdupq_n_s16(3 << fixed_point_position);
1846 qint16x8_t shift_value = vnegq_s16(vsubq_s16(vdupq_n_s16(16), vaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));
1849 qint16x8_t temp = vsubq_s16(vdupq_n_s16(16), vaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position)));
1850 uint16x8_t temp_ltz = vcltq_s16(temp,
vdupq_n_qs16(0));
1851 temp = vbslq_s16(temp_ltz, vaddq_s16(temp, vdupq_n_s16(1)), temp);
1852 qint16x8_t shift_value2 = vnegq_s16(vshrq_n_s16(temp, 1));
1854 temp = vshlq_s16(a, shift_value);
1861 x = vshrq_n_s16(
vmulq_qs16(x, vsubq_s16(const_three,
vmulq_qs16(temp,
vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1862 x = vshrq_n_s16(
vmulq_qs16(x, vsubq_s16(const_three,
vmulq_qs16(temp,
vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1863 x = vshrq_n_s16(
vmulq_qs16(x, vsubq_s16(const_three,
vmulq_qs16(temp,
vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1864 x = vshrq_n_s16(
vmulq_qs16(x, vsubq_s16(const_three,
vmulq_qs16(temp,
vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1865 x = vshrq_n_s16(
vmulq_qs16(x, vsubq_s16(const_three,
vmulq_qs16(temp,
vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1867 return vshlq_s16(x, shift_value2);
1872 const qint8x16_t const_three = vdupq_n_s8(3 << fixed_point_position);
1875 qint8x16_t shift_value = vqnegq_s8(vqsubq_s8(vdupq_n_s8(8), vqaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));
1878 qint8x16_t temp = vqsubq_s8(vdupq_n_s8(8), vqaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position)));
1879 uint8x16_t temp_ltz = vcltq_s8(temp,
vdupq_n_qs8(0));
1880 temp = vbslq_s8(temp_ltz, vqaddq_s8(temp, vdupq_n_s8(1)), temp);
1881 qint8x16_t shift_value2 = vqnegq_s8(vshrq_n_s8(temp, 1));
1883 temp = vqshlq_s8(a, shift_value);
1890 x = vshrq_n_s8(
vqmulq_qs8(x, vqsubq_s8(const_three,
vqmulq_qs8(temp,
vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1891 x = vshrq_n_s8(
vqmulq_qs8(x, vqsubq_s8(const_three,
vqmulq_qs8(temp,
vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1892 x = vshrq_n_s8(
vqmulq_qs8(x, vqsubq_s8(const_three,
vqmulq_qs8(temp,
vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1894 return vqshlq_s8(x, shift_value2);
1899 const qint16x8_t const_three = vdupq_n_s16(3 << fixed_point_position);
1902 qint16x8_t shift_value = vqnegq_s16(vqsubq_s16(vdupq_n_s16(16), vqaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));
1905 qint16x8_t temp = vqsubq_s16(vdupq_n_s16(16), vqaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position)));
1906 uint16x8_t temp_ltz = vcltq_s16(temp,
vdupq_n_qs16(0));
1907 temp = vbslq_s16(temp_ltz, vqaddq_s16(temp, vdupq_n_s16(1)), temp);
1908 qint16x8_t shift_value2 = vqnegq_s16(vshrq_n_s16(temp, 1));
1910 temp = vqshlq_s16(a, shift_value);
1917 x = vshrq_n_s16(
vqmulq_qs16(x, vqsubq_s16(const_three,
vqmulq_qs16(temp,
vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1918 x = vshrq_n_s16(
vqmulq_qs16(x, vqsubq_s16(const_three,
vqmulq_qs16(temp,
vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1919 x = vshrq_n_s16(
vqmulq_qs16(x, vqsubq_s16(const_three,
vqmulq_qs16(temp,
vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1920 x = vshrq_n_s16(
vqmulq_qs16(x, vqsubq_s16(const_three,
vqmulq_qs16(temp,
vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1921 x = vshrq_n_s16(
vqmulq_qs16(x, vqsubq_s16(const_three,
vqmulq_qs16(temp,
vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1923 return vqshlq_s16(x, shift_value2);
1928 const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position);
1929 const qint8x8_t const_two = vdup_n_s8(2 << fixed_point_position);
1941 const qint16x4_t const_one = vdup_n_s16(1 << fixed_point_position);
1942 const qint16x4_t const_two = vdup_n_s16(2 << fixed_point_position);
1954 const qint8x16_t const_one = vdupq_n_s8(1 << fixed_point_position);
1955 const qint8x16_t const_two = vdupq_n_s8(2 << fixed_point_position);
1967 const qint16x8_t const_one = vdupq_n_s16(1 << fixed_point_position);
1968 const qint16x8_t const_two = vdupq_n_s16(2 << fixed_point_position);
1993 vmaxq_f32(a.val[0], b.val[0]),
1994 vmaxq_f32(a.val[1], b.val[1])
qint16x8x2_t vld2q_qs16(qint16_t *addr)
Load two 16 bit fixed point vectors from memory (8x2 elements)
qint16x8_t vtaylor_polyq_qs16(qint16x8_t a, int fixed_point_position)
Perform a 4th degree polynomial approximation.
qint8x8_t vdiv_qs8(qint8x8_t a, int8x8_t b, int fixed_point_position)
Division fixed point 8bit (8 elements)
qint32x4_t vdupq_n_qs32(qint32_t a)
qint16x8_t vqsubq_qs16(qint16x8_t a, qint16x8_t b)
16 bit fixed point vector saturating subtraction (8 elements)
qint8x8_t vadd_qs8(qint8x8_t a, qint8x8_t b)
8 bit fixed point vector add (8 elements)
qint8x16_t vaddq_qs8(qint8x16_t a, qint8x16_t b)
8 bit fixed point vector add (16 elements)
qint16x4_t vsub_qs16(qint16x4_t a, qint16x4_t b)
16 bit fixed point vector subtraction (4 elements)
qint8x8_t vqabs_qs8(qint8x8_t a)
Saturating absolute value of 8 bit fixed point vector (8 elements)
qint8x8_t vld1_qs8(const qint8_t *addr)
Load a single 8 bit fixed point vector from memory (8 elements)
qint8x16_t vmlaq_qs8(qint8x16_t a, qint8x16_t b, qint8x16_t c, int fixed_point_position)
8 bit fixed point vector multiply-accumulate (16 elements).
qint16x4_t vqmul_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
16 bit fixed point vector saturating multiply (4 elements)
qint16x8_t vsubq_qs16(qint16x8_t a, qint16x8_t b)
16 bit fixed point vector subtraction (8 elements)
qint8x16_t vlogq_qs8(qint8x16_t a, int fixed_point_position)
Calculate logarithm fixed point 8 bit (16 elements)
qint16x8_t vrecipq_qs16(qint16x8_t a, int fixed_point_position)
Calculate reciprocal of a fixed point 16 bit number using the Newton-Raphson method.
int16x8x2_t qint16x8x2_t
16 bit fixed point vector with 8x2 elements
qint16x4_t vqtanh_qs16(qint16x4_t a, int fixed_point_position)
Calculate hyperbolic tangent for fixed point 16 bit (4 elements)
qint16x8_t vqpowq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
Calculate saturating n power for fixed point 16bit (8 elements).
qint16x4_t vqabs_qs16(qint16x4_t a)
Saturating absolute value of 16 bit fixed point vector (4 elements)
qint8x16_t vinvsqrtq_qs8(qint8x16_t a, int fixed_point_position)
Calculate inverse square root for fixed point 8bit using Newton-Raphson method (16 elements) ...
qint16x4_t vdiv_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
Division fixed point 16 bit (4 elements)
qint8x8_t vsub_qs8(qint8x8_t a, qint8x8_t b)
8 bit fixed point vector subtraction (8 elements)
qint16x8_t vqabsq_qs16(qint16x8_t a)
Saturating absolute value of 16 bit fixed point vector (8 elements)
qint8x8_t vqcvt_qs8_f32(const float32x4x2_t a, int fixed_point_position)
Convert a float vector with 4x2 elements to 8 bit fixed point vector with 8 elements.
qint16x4_t vld1_qs16(const qint16_t *addr)
Load a single 16 bit fixed point vector from memory (4 elements)
qint8x16_t vqcvtq_qs8_f32(const float32x4x4_t &a, int fixed_point_position)
Convert a float vector with 4x4 elements to 8 bit fixed point vector with 16 elements.
qint8x8_t vinvsqrt_qs8(qint8x8_t a, int fixed_point_position)
Calculate inverse square root for fixed point 8bit using Newton-Raphson method (8 elements) ...
void vst2q_qs16(qint16_t *addr, qint16x8x2_t b)
Store two 16 bit fixed point vector to memory (8x2 elements)
qint16x8_t vmlaq_qs16(qint16x8_t a, qint16x8_t b, qint16x8_t c, int fixed_point_position)
16 bit fixed point vector multiply-accumulate (8 elements).
qint16x4_t vadd_qs16(qint16x4_t a, qint16x4_t b)
16 bit fixed point vector add (4 elements)
qint16x4_t vget_high_qs16(qint16x8_t a)
Get the higher half of an 8 elements vector.
DATA_TYPE sum(__global const DATA_TYPE *input)
Calculate sum of a vector.
qint16x8_t vdupq_n_qs16(qint16_t a)
16 bit fixed point vector duplicate (8 elements)
qint16x4_t vpmin_qs16(qint16x4_t a, qint16x4_t b)
16 bit fixed point vector pairwise min (4 elements)
qint16x4_t vqsub_qs16(qint16x4_t a, qint16x4_t b)
16 bit fixed point vector saturating subtraction (4 elements)
qint8x8_t vpmax_qs8(qint8x8_t a, qint8x8_t b)
8 bit fixed point vector pairwise max (8 elements)
qint16x8_t vqmlal_qs8(qint16x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
8 bit fixed point vector saturating multiply-accumulate long (8 elements).
float32x4x2_t vcvtq_f32_qs16(qint16x8_t a, int fixed_point_position)
qint8x8_t vabs_qs8(qint8x8_t a)
Absolute value of 8 bit fixed point vector (8 elements)
qint8x8_t vqtanh_qs8(qint8x8_t a, int fixed_point_position)
Calculate hyperbolic tangent for fixed point 8bit (8 elements)
qint16x8_t vqmulq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
16 bit fixed point vector saturating multiply (8 elements)
qint8x8_t vqsub_qs8(qint8x8_t a, qint8x8_t b)
8 bit fixed point vector saturating subtraction (8 elements)
qint8x16_t vdivq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
Division fixed point 8bit (16 elements)
qint16x8_t vqrecipq_qs16(qint16x8_t a, int fixed_point_position)
int8x16_t qint8x16_t
8 bit fixed point vector with 16 elements
qint16x4_t vdup_n_qs16(qint16_t a)
16 bit fixed point vector duplicate (4 elements)
qint16x4_t vmla_qs16(qint16x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
16 bit fixed point vector multiply-accumulate (4 elements).
qint8x8_t vqmovn_qs16(qint16x8_t a)
qint8x16_t vdupq_n_qs8(qint8_t a)
8 bit fixed point vector duplicate (16 elements)
qint16x8_t vqinvsqrtq_qs16(qint16x8_t a, int fixed_point_position)
Calculate saturating inverse square root for fixed point 16 bit using Newton-Raphson method (8 elements) ...
qint16x8_t vminq_qs16(qint16x8_t a, qint16x8_t b)
16 bit fixed point vector min (8 elements)
int32x2_t qint32x2_t
32 bit fixed point vector with 2 elements
qint8x16_t vmaxq_qs8(qint8x16_t a, qint8x16_t b)
8 bit fixed point vector max (16 elements)
This file contains all available output stages for GEMMLowp on OpenCL.
qint8x8_t vqmul_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
8 bit fixed point vector saturating multiply (8 elements)
qint8x8_t vget_low_qs8(qint8x16_t a)
Get the lower half of a 16 elements vector.
int16x4_t vpaddl_qs8(qint8x8_t a)
8 bit fixed point vector saturating pairwise add (8 elements)
float32x4_t vcvt_f32_qs16(qint16x4_t a, int fixed_point_position)
Convert a 16 bit fixed point vector with 4 elements to a float vector with 4 elements.
qint8x16_t vtaylor_polyq_qs8(qint8x16_t a, int fixed_point_position)
Perform a 4th degree polynomial approximation.
qint16x4_t vmul_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
16 bit fixed point vector multiply (4 elements)
qint32x4_t vmull_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
16 bit fixed point vector long multiply (4 elements)
qint16x4_t vqcvt_qs16_f32(const float32x4_t a, int fixed_point_position)
Convert a float vector with 4 elements to 16 bit fixed point vector with 4 elements.
qint16x4_t vqadd_qs16(qint16x4_t a, qint16x4_t b)
16 bit fixed point vector saturating add (4 elements)
qint16x8_t vqmlaq_qs16(qint16x8_t a, qint16x8_t b, qint16x8_t c, int fixed_point_position)
16 bit fixed point vector saturating multiply-accumulate (8 elements).
qint8x8_t vdup_n_qs8(qint8_t a)
8 bit fixed point vector duplicate (8 elements)
qint8x8_t vmul_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
8 bit fixed point vector multiply (8 elements)
qint8x8_t vget_high_qs8(qint8x16_t a)
Get the higher half of a 16 elements vector.
qint8x16_t vqtaylor_polyq_qs8(qint8x16_t a, int fixed_point_position)
qint8x16_t vmulq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
8 bit fixed point vector multiply (16 elements)
qint8x8_t vqinvsqrt_qs8(qint8x8_t a, int fixed_point_position)
Calculate saturating inverse square root for fixed point 8bit using Newton-Raphson method (8 elements) ...
qint8x8_t vmax_qs8(qint8x8_t a, qint8x8_t b)
8 bit fixed point vector max (8 elements)
qint8x16_t vld1q_qs8(const qint8_t *addr)
Load a single 8 bit fixed point vector from memory (16 elements)
qint16x8_t vqcvtq_qs16_f32(const float32x4x2_t &a, int fixed_point_position)
Convert a float vector with 4x2 elements to 16 bit fixed point vector with 8 elements.
qint16x4_t vqexp_qs16(qint16x4_t a, int fixed_point_position)
Calculate saturating exponential fixed point 16 bit (4 elements)
qint8x16_t vqaddq_qs8(qint8x16_t a, qint8x16_t b)
8 bit fixed point vector saturating add (16 elements)
qint8x16_t vsubq_qs8(qint8x16_t a, qint8x16_t b)
8 bit fixed point vector subtraction (16 elements)
qint8x8_t vqrecip_qs8(qint8x8_t a, int fixed_point_position)
qint16x4_t vtaylor_poly_qs16(qint16x4_t a, int fixed_point_position)
Perform a 4th degree polynomial approximation.
float32x4x4_t vcvtq_f32_qs8(qint8x16_t a, int fixed_point_position)
qint16x4_t vqtaylor_poly_qs16(qint16x4_t a, int fixed_point_position)
qint16x8_t vld1q_qs16(const qint16_t *addr)
Load a single 16 bit fixed point vector from memory (8 elements)
qint16x8_t vmlal_qs8(qint16x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
8 bit fixed point vector multiply-accumulate long (8 elements).
qint8x8_t vtaylor_poly_qs8(qint8x8_t a, int fixed_point_position)
Perform a 4th degree polynomial approximation.
int32_t qint32_t
32 bit fixed point scalar value
qint8x8_t vrecip_qs8(qint8x8_t a, int fixed_point_position)
Calculate reciprocal of a fixed point 8bit number using the Newton-Raphson method.
int16_t qint16_t
16 bit fixed point scalar value
int8x8_t qint8x8_t
8 bit fixed point vector with 8 elements
qint16x8_t vqtaylor_polyq_qs16(qint16x8_t a, int fixed_point_position)
qint16x8_t vmaxq_qs16(qint16x8_t a, qint16x8_t b)
16 bit fixed point vector max (8 elements)
qint16x8_t vld1q_dup_qs16(const qint16_t *addr)
Load all lanes of 16 bit fixed point vector with same value from memory (8 elements) ...
qint16x4_t vqmla_qs16(qint16x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
16 bit fixed point vector saturating multiply-accumulate (4 elements).
qint8x16_t vqexpq_qs8(qint8x16_t a, int fixed_point_position)
Calculate saturating exponential fixed point 8bit (16 elements)
qint8x16_t vdupq_n_qs8_f32(float a, int fixed_point_position)
Duplicate a float and convert it to 8 bit fixed point vector (16 elements)
int16x8_t qint16x8_t
16 bit fixed point vector with 8 elements
qint16x4_t vget_low_qs16(qint16x8_t a)
Get the lower half of an 8 elements vector.
qint16x8_t vabsq_qs16(qint16x8_t a)
Absolute value of 16 bit fixed point vector (8 elements)
qint16x8_t vqexpq_qs16(qint16x8_t a, int fixed_point_position)
Calculate saturating exponential fixed point 16 bit (8 elements)
qint32x2_t vqadd_qs32(qint32x2_t a, qint32x2_t b)
qint8x8_t vmin_qs8(qint8x8_t a, qint8x8_t b)
8 bit fixed point vector min (8 elements)
qint8x16_t vabsq_qs8(qint8x16_t a)
Absolute value of 8 bit fixed point vector (16 elements)
void vst1q_qs16(qint16_t *addr, qint16x8_t b)
Store a single 16 bit fixed point vector to memory (8 elements)
int8_t qint8_t
8 bit fixed point scalar value
qint8x16_t vqabsq_qs8(qint8x16_t a)
Saturating absolute value of 8 bit fixed point vector (16 elements)
qint8x16_t vqmulq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
8 bit fixed point vector saturating multiply (16 elements)
qint16x8_t vqaddq_qs16(qint16x8_t a, qint16x8_t b)
16 bit fixed point vector saturating add (8 elements)
qint16x4_t vabs_qs16(qint16x4_t a)
Absolute value of 16 bit fixed point vector (4 elements)
qint8x8_t vqadd_qs8(qint8x8_t a, qint8x8_t b)
8 bit fixed point vector saturating add (8 elements)
qint8x8_t vpmin_qs8(qint8x8_t a, qint8x8_t b)
8 bit fixed point vector pairwise min (8 elements)
qint16x4_t vqmovn_qs32(qint32x4_t a)
qint16x4_t vlog_qs16(qint16x4_t a, int fixed_point_position)
Calculate logarithm fixed point 16 bit (4 elements)
void vst1q_qs8(qint8_t *addr, qint8x16_t b)
Store a single 8 bit fixed point vector to memory (16 elements)
qint8x8_t vlog_qs8(qint8x8_t a, int fixed_point_position)
Calculate logarithm fixed point 8 bit (8 elements)
qint16x4_t vmax_qs16(qint16x4_t a, qint16x4_t b)
16 bit fixed point vector max (4 elements)
qint16x4_t vqinvsqrt_qs16(qint16x4_t a, int fixed_point_position)
Calculate saturating inverse square root for fixed point 16 bit using Newton-Raphson method (4 elements) ...
qint32x4_t vqmlal_qs16(qint32x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
16 bit fixed point vector saturating multiply-accumulate long (4 elements).
qint16x4_t vld1_dup_qs16(const qint16_t *addr)
Load all lanes of 16 bit fixed point vector with same value from memory (4 elements) ...
int32x4_t qint32x4_t
32 bit fixed point vector with 4 elements
qint16x4_t vinvsqrt_qs16(qint16x4_t a, int fixed_point_position)
Calculate inverse square root for fixed point 16 bit using Newton-Raphson method (4 elements) ...
qint16x8_t vqtanhq_qs16(qint16x8_t a, int fixed_point_position)
Calculate hyperbolic tangent for fixed point 16bit (8 elements)
qint8x8_t vqtaylor_poly_qs8(qint8x8_t a, int fixed_point_position)
qint16x8_t vmull_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
8 bit fixed point vector long multiply (8 elements)
qint8x16_t vqinvsqrtq_qs8(qint8x16_t a, int fixed_point_position)
Calculate saturating inverse square root for fixed point 8bit using Newton-Raphson method (16 elements) ...
qint32x4_t vqaddq_qs32(qint32x4_t a, qint32x4_t b)
qint16x8_t vdivq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
Division fixed point 16 bit (8 elements)
fixed_point< T > max(fixed_point< T > x, fixed_point< T > y)
void vst1_qs16(qint16_t *addr, qint16x4_t b)
Store a single 16 bit fixed point vector to memory (4 elements)
fixed_point< T > tanh(fixed_point< T > x)
qint8x8_t vqmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
8 bit fixed point vector saturating multiply-accumulate (8 elements).
qint8x16_t vqsubq_qs8(qint8x16_t a, qint8x16_t b)
8 bit fixed point vector saturating subtraction (16 elements)
qint8x16_t vld1q_dup_qs8(const qint8_t *addr)
Load all lanes of 8 bit fixed point vector with same value from memory (16 elements) ...
qint16x4_t vpmax_qs16(qint16x4_t a, qint16x4_t b)
16 bit fixed point vector pairwise max (4 elements)
qint8x8_t vqexp_qs8(qint8x8_t a, int fixed_point_position)
Calculate saturating exponential fixed point 8bit (8 elements)
qint8x8_t vld1_dup_qs8(const qint8_t *addr)
Load all lanes of 8 bit fixed point vector with same value from memory (8 elements) ...
qint8x16_t vqrecipq_qs8(qint8x16_t a, int fixed_point_position)
qint8x8_t vmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
8 bit fixed point vector multiply-accumulate (8 elements).
float32x4x2_t vmax2q_f32(float32x4x2_t a, float32x4x2_t b)
Compute lane-by-lane maximum between elements of a float vector with 4x2 elements.
qint16x8_t vaddq_qs16(qint16x8_t a, qint16x8_t b)
16 bit fixed point vector add (8 elements)
qint16x4_t vmin_qs16(qint16x4_t a, qint16x4_t b)
16 bit fixed point vector min (4 elements)
qint16x8_t vdupq_n_qs16_f32(float a, int fixed_point_position)
Duplicate a float and convert it to 16 bit fixed point vector (8 elements)
qint8x16_t vqpowq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
Calculate saturating n power for fixed point 8bit (16 elements).
void vst1_qs8(qint8_t *addr, qint8x8_t b)
Store a single 8 bit fixed point vector to memory (8 elements)
qint16x4_t vqrecip_qs16(qint16x4_t a, int fixed_point_position)
qint16x8_t vlogq_qs16(qint16x8_t a, int fixed_point_position)
Calculate logarithm fixed point 16 bit (8 elements)
qint16x8_t vmulq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
16 bit fixed point vector multiply (8 elements)
qint32x4_t vmlal_qs16(qint32x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
16 bit fixed point vector multiply-accumulate long (4 elements).
qint8x16_t vminq_qs8(qint8x16_t a, qint8x16_t b)
8 bit fixed point vector min (16 elements)
qint8x16_t vrecipq_qs8(qint8x16_t a, int fixed_point_position)
Calculate reciprocal of a fixed point 8bit number using the Newton-Raphson method.
qint8x16_t vqmlaq_qs8(qint8x16_t a, qint8x16_t b, qint8x16_t c, int fixed_point_position)
8 bit fixed point vector saturating multiply-accumulate (16 elements).
float32x4x2_t vcvt_f32_qs8(qint8x8_t a, int fixed_point_position)
Convert a 8 bit fixed point vector with 8 elements to a float vector with 4x2 elements.
qint8x16_t vqtanhq_qs8(qint8x16_t a, int fixed_point_position)
Calculate hyperbolic tangent for fixed point 8bit (16 elements)
int16x4_t qint16x4_t
16 bit fixed point vector with 4 elements
qint16x4_t vrecip_qs16(qint16x4_t a, int fixed_point_position)
Calculate reciprocal of a fixed point 16 bit number using the Newton-Raphson method.
qint16x8_t vinvsqrtq_qs16(qint16x8_t a, int fixed_point_position)
Calculate inverse square root for fixed point 16 bit using Newton-Raphson method (8 elements) ...