/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef OPENCV_HAL_INTRIN_NEON_HPP
#define OPENCV_HAL_INTRIN_NEON_HPP

#include <algorithm>
#include "opencv2/core/utility.hpp"

namespace cv
{

//! @cond IGNORED

CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
#define CV_SIMD128 1
#if defined(__aarch64__)
#define CV_SIMD128_64F 1
#else
#define CV_SIMD128_64F 0
#endif

// Compile-time switch for the ARMv8.2 dot product instructions (vdotq_*); off by default
#define CV_NEON_DOT 0
//////////// Utils ////////////

#if CV_SIMD128_64F
#define OPENCV_HAL_IMPL_NEON_UNZIP(_Tpv, _Tpvx2, suffix) \
    inline void _v128_unzip(const _Tpv& a, const _Tpv& b, _Tpv& c, _Tpv& d) \
    { c = vuzp1q_##suffix(a, b); d = vuzp2q_##suffix(a, b); }
#define OPENCV_HAL_IMPL_NEON_UNZIP_L(_Tpv, _Tpvx2, suffix) \
    inline void _v128_unzip(const _Tpv& a, const _Tpv& b, _Tpv& c, _Tpv& d) \
    { c = vuzp1_##suffix(a, b); d = vuzp2_##suffix(a, b); }
#else
#define OPENCV_HAL_IMPL_NEON_UNZIP(_Tpv, _Tpvx2, suffix) \
    inline void _v128_unzip(const _Tpv& a, const _Tpv& b, _Tpv& c, _Tpv& d) \
    { _Tpvx2 ab = vuzpq_##suffix(a, b); c = ab.val[0]; d = ab.val[1]; }
#define OPENCV_HAL_IMPL_NEON_UNZIP_L(_Tpv, _Tpvx2, suffix) \
    inline void _v128_unzip(const _Tpv& a, const _Tpv& b, _Tpv& c, _Tpv& d) \
    { _Tpvx2 ab = vuzp_##suffix(a, b); c = ab.val[0]; d = ab.val[1]; }
#endif
#if CV_SIMD128_64F
#define OPENCV_HAL_IMPL_NEON_REINTERPRET(_Tpv, suffix) \
template <typename T> static inline \
_Tpv vreinterpretq_##suffix##_f64(T a) { return (_Tpv) a; } \
template <typename T> static inline \
float64x2_t vreinterpretq_f64_##suffix(T a) { return (float64x2_t) a; }
#else
#define OPENCV_HAL_IMPL_NEON_REINTERPRET(_Tpv, suffix)
#endif
#define OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(_Tpv, _Tpvl, suffix) \
    OPENCV_HAL_IMPL_NEON_UNZIP(_Tpv##_t, _Tpv##x2_t, suffix) \
    OPENCV_HAL_IMPL_NEON_UNZIP_L(_Tpvl##_t, _Tpvl##x2_t, suffix) \
    OPENCV_HAL_IMPL_NEON_REINTERPRET(_Tpv##_t, suffix)

#define OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX_I64(_Tpv, _Tpvl, suffix) \
    OPENCV_HAL_IMPL_NEON_REINTERPRET(_Tpv##_t, suffix)

#define OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX_F64(_Tpv, _Tpvl, suffix) \
    OPENCV_HAL_IMPL_NEON_UNZIP(_Tpv##_t, _Tpv##x2_t, suffix)
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(uint8x16, uint8x8, u8)
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(int8x16, int8x8, s8)
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(uint16x8, uint16x4, u16)
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(int16x8, int16x4, s16)
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(uint32x4, uint32x2, u32)
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(int32x4, int32x2, s32)
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(float32x4, float32x2, f32)
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX_I64(uint64x2, uint64x1, u64)
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX_I64(int64x2, int64x1, s64)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX_F64(float64x2, float64x1, f64)
#endif
//////////// Types ////////////

struct v_uint8x16
{
    typedef uchar lane_type;
    enum { nlanes = 16 };

    v_uint8x16() {}
    explicit v_uint8x16(uint8x16_t v) : val(v) {}
    v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
               uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
    {
        uchar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
        val = vld1q_u8(v);
    }
    uchar get0() const
    {
        return vgetq_lane_u8(val, 0);
    }

    uint8x16_t val;
};

struct v_int8x16
{
    typedef schar lane_type;
    enum { nlanes = 16 };

    v_int8x16() {}
    explicit v_int8x16(int8x16_t v) : val(v) {}
    v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
              schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
    {
        schar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
        val = vld1q_s8(v);
    }
    schar get0() const
    {
        return vgetq_lane_s8(val, 0);
    }

    int8x16_t val;
};

struct v_uint16x8
{
    typedef ushort lane_type;
    enum { nlanes = 8 };

    v_uint16x8() {}
    explicit v_uint16x8(uint16x8_t v) : val(v) {}
    v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
    {
        ushort v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
        val = vld1q_u16(v);
    }
    ushort get0() const
    {
        return vgetq_lane_u16(val, 0);
    }

    uint16x8_t val;
};

struct v_int16x8
{
    typedef short lane_type;
    enum { nlanes = 8 };

    v_int16x8() {}
    explicit v_int16x8(int16x8_t v) : val(v) {}
    v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
    {
        short v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
        val = vld1q_s16(v);
    }
    short get0() const
    {
        return vgetq_lane_s16(val, 0);
    }

    int16x8_t val;
};

struct v_uint32x4
{
    typedef unsigned lane_type;
    enum { nlanes = 4 };

    v_uint32x4() {}
    explicit v_uint32x4(uint32x4_t v) : val(v) {}
    v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
    {
        unsigned v[] = {v0, v1, v2, v3};
        val = vld1q_u32(v);
    }
    unsigned get0() const
    {
        return vgetq_lane_u32(val, 0);
    }

    uint32x4_t val;
};

struct v_int32x4
{
    typedef int lane_type;
    enum { nlanes = 4 };

    v_int32x4() {}
    explicit v_int32x4(int32x4_t v) : val(v) {}
    v_int32x4(int v0, int v1, int v2, int v3)
    {
        int v[] = {v0, v1, v2, v3};
        val = vld1q_s32(v);
    }
    int get0() const
    {
        return vgetq_lane_s32(val, 0);
    }

    int32x4_t val;
};

struct v_float32x4
{
    typedef float lane_type;
    enum { nlanes = 4 };

    v_float32x4() {}
    explicit v_float32x4(float32x4_t v) : val(v) {}
    v_float32x4(float v0, float v1, float v2, float v3)
    {
        float v[] = {v0, v1, v2, v3};
        val = vld1q_f32(v);
    }
    float get0() const
    {
        return vgetq_lane_f32(val, 0);
    }

    float32x4_t val;
};

struct v_uint64x2
{
    typedef uint64 lane_type;
    enum { nlanes = 2 };

    v_uint64x2() {}
    explicit v_uint64x2(uint64x2_t v) : val(v) {}
    v_uint64x2(uint64 v0, uint64 v1)
    {
        uint64 v[] = {v0, v1};
        val = vld1q_u64(v);
    }
    uint64 get0() const
    {
        return vgetq_lane_u64(val, 0);
    }

    uint64x2_t val;
};

struct v_int64x2
{
    typedef int64 lane_type;
    enum { nlanes = 2 };

    v_int64x2() {}
    explicit v_int64x2(int64x2_t v) : val(v) {}
    v_int64x2(int64 v0, int64 v1)
    {
        int64 v[] = {v0, v1};
        val = vld1q_s64(v);
    }
    int64 get0() const
    {
        return vgetq_lane_s64(val, 0);
    }

    int64x2_t val;
};

#if CV_SIMD128_64F
struct v_float64x2
{
    typedef double lane_type;
    enum { nlanes = 2 };

    v_float64x2() {}
    explicit v_float64x2(float64x2_t v) : val(v) {}
    v_float64x2(double v0, double v1)
    {
        double v[] = {v0, v1};
        val = vld1q_f64(v);
    }
    double get0() const
    {
        return vgetq_lane_f64(val, 0);
    }

    float64x2_t val;
};
#endif
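// Illustrative sketch (not part of the original header): each v_* type wraps a
// single 128-bit NEON register ('val'); the scalar constructor goes through a
// temporary array and vld1q, and get0() reads lane 0. The helper name below is
// hypothetical.
inline float example_neon_lane0()
{
    v_float32x4 v(1.0f, 2.0f, 3.0f, 4.0f); // {1,2,3,4} packed into a float32x4_t
    return v.get0();                       // == 1.0f
}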
#define OPENCV_HAL_IMPL_NEON_INIT(_Tpv, _Tp, suffix) \
inline v_##_Tpv v_setzero_##suffix() { return v_##_Tpv(vdupq_n_##suffix((_Tp)0)); } \
inline v_##_Tpv v_setall_##suffix(_Tp v) { return v_##_Tpv(vdupq_n_##suffix(v)); } \
inline _Tpv##_t vreinterpretq_##suffix##_##suffix(_Tpv##_t v) { return v; } \
inline v_uint8x16 v_reinterpret_as_u8(const v_##_Tpv& v) { return v_uint8x16(vreinterpretq_u8_##suffix(v.val)); } \
inline v_int8x16 v_reinterpret_as_s8(const v_##_Tpv& v) { return v_int8x16(vreinterpretq_s8_##suffix(v.val)); } \
inline v_uint16x8 v_reinterpret_as_u16(const v_##_Tpv& v) { return v_uint16x8(vreinterpretq_u16_##suffix(v.val)); } \
inline v_int16x8 v_reinterpret_as_s16(const v_##_Tpv& v) { return v_int16x8(vreinterpretq_s16_##suffix(v.val)); } \
inline v_uint32x4 v_reinterpret_as_u32(const v_##_Tpv& v) { return v_uint32x4(vreinterpretq_u32_##suffix(v.val)); } \
inline v_int32x4 v_reinterpret_as_s32(const v_##_Tpv& v) { return v_int32x4(vreinterpretq_s32_##suffix(v.val)); } \
inline v_uint64x2 v_reinterpret_as_u64(const v_##_Tpv& v) { return v_uint64x2(vreinterpretq_u64_##suffix(v.val)); } \
inline v_int64x2 v_reinterpret_as_s64(const v_##_Tpv& v) { return v_int64x2(vreinterpretq_s64_##suffix(v.val)); } \
inline v_float32x4 v_reinterpret_as_f32(const v_##_Tpv& v) { return v_float32x4(vreinterpretq_f32_##suffix(v.val)); }
OPENCV_HAL_IMPL_NEON_INIT(uint8x16, uchar, u8)
OPENCV_HAL_IMPL_NEON_INIT(int8x16, schar, s8)
OPENCV_HAL_IMPL_NEON_INIT(uint16x8, ushort, u16)
OPENCV_HAL_IMPL_NEON_INIT(int16x8, short, s16)
OPENCV_HAL_IMPL_NEON_INIT(uint32x4, unsigned, u32)
OPENCV_HAL_IMPL_NEON_INIT(int32x4, int, s32)
OPENCV_HAL_IMPL_NEON_INIT(uint64x2, uint64, u64)
OPENCV_HAL_IMPL_NEON_INIT(int64x2, int64, s64)
OPENCV_HAL_IMPL_NEON_INIT(float32x4, float, f32)
#if CV_SIMD128_64F
#define OPENCV_HAL_IMPL_NEON_INIT_64(_Tpv, suffix) \
inline v_float64x2 v_reinterpret_as_f64(const v_##_Tpv& v) { return v_float64x2(vreinterpretq_f64_##suffix(v.val)); }
OPENCV_HAL_IMPL_NEON_INIT(float64x2, double, f64)
OPENCV_HAL_IMPL_NEON_INIT_64(uint8x16, u8)
OPENCV_HAL_IMPL_NEON_INIT_64(int8x16, s8)
OPENCV_HAL_IMPL_NEON_INIT_64(uint16x8, u16)
OPENCV_HAL_IMPL_NEON_INIT_64(int16x8, s16)
OPENCV_HAL_IMPL_NEON_INIT_64(uint32x4, u32)
OPENCV_HAL_IMPL_NEON_INIT_64(int32x4, s32)
OPENCV_HAL_IMPL_NEON_INIT_64(uint64x2, u64)
OPENCV_HAL_IMPL_NEON_INIT_64(int64x2, s64)
OPENCV_HAL_IMPL_NEON_INIT_64(float32x4, f32)
OPENCV_HAL_IMPL_NEON_INIT_64(float64x2, f64)
#endif
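// Illustrative sketch (not part of the original header): v_setall_* broadcasts
// a scalar, v_setzero_* clears a register, and v_reinterpret_as_* is a free
// bit-level cast between lane types. The helper name is hypothetical.
inline v_int32x4 example_neon_sign_bits()
{
    v_uint32x4 s = v_setall_u32(0x80000000u); // MSB of every 32-bit lane
    return v_reinterpret_as_s32(s);           // same bits, viewed as signed
}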
#define OPENCV_HAL_IMPL_NEON_PACK(_Tpvec, _Tp, hreg, suffix, _Tpwvec, pack, mov, rshr) \
inline _Tpvec v_##pack(const _Tpwvec& a, const _Tpwvec& b) \
{ \
    hreg a1 = mov(a.val), b1 = mov(b.val); \
    return _Tpvec(vcombine_##suffix(a1, b1)); \
} \
inline void v_##pack##_store(_Tp* ptr, const _Tpwvec& a) \
{ \
    hreg a1 = mov(a.val); \
    vst1_##suffix(ptr, a1); \
} \
template<int n> inline \
_Tpvec v_rshr_##pack(const _Tpwvec& a, const _Tpwvec& b) \
{ \
    hreg a1 = rshr(a.val, n); \
    hreg b1 = rshr(b.val, n); \
    return _Tpvec(vcombine_##suffix(a1, b1)); \
} \
template<int n> inline \
void v_rshr_##pack##_store(_Tp* ptr, const _Tpwvec& a) \
{ \
    hreg a1 = rshr(a.val, n); \
    vst1_##suffix(ptr, a1); \
}

OPENCV_HAL_IMPL_NEON_PACK(v_uint8x16, uchar, uint8x8_t, u8, v_uint16x8, pack, vqmovn_u16, vqrshrn_n_u16)
OPENCV_HAL_IMPL_NEON_PACK(v_int8x16, schar, int8x8_t, s8, v_int16x8, pack, vqmovn_s16, vqrshrn_n_s16)
OPENCV_HAL_IMPL_NEON_PACK(v_uint16x8, ushort, uint16x4_t, u16, v_uint32x4, pack, vqmovn_u32, vqrshrn_n_u32)
OPENCV_HAL_IMPL_NEON_PACK(v_int16x8, short, int16x4_t, s16, v_int32x4, pack, vqmovn_s32, vqrshrn_n_s32)
OPENCV_HAL_IMPL_NEON_PACK(v_uint32x4, unsigned, uint32x2_t, u32, v_uint64x2, pack, vmovn_u64, vrshrn_n_u64)
OPENCV_HAL_IMPL_NEON_PACK(v_int32x4, int, int32x2_t, s32, v_int64x2, pack, vmovn_s64, vrshrn_n_s64)

OPENCV_HAL_IMPL_NEON_PACK(v_uint8x16, uchar, uint8x8_t, u8, v_int16x8, pack_u, vqmovun_s16, vqrshrun_n_s16)
OPENCV_HAL_IMPL_NEON_PACK(v_uint16x8, ushort, uint16x4_t, u16, v_int32x4, pack_u, vqmovun_s32, vqrshrun_n_s32)
// pack boolean
inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b)
{
    uint8x16_t ab = vcombine_u8(vmovn_u16(a.val), vmovn_u16(b.val));
    return v_uint8x16(ab);
}

inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b,
                           const v_uint32x4& c, const v_uint32x4& d)
{
    uint16x8_t nab = vcombine_u16(vmovn_u32(a.val), vmovn_u32(b.val));
    uint16x8_t ncd = vcombine_u16(vmovn_u32(c.val), vmovn_u32(d.val));
    return v_uint8x16(vcombine_u8(vmovn_u16(nab), vmovn_u16(ncd)));
}

inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c,
                           const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f,
                           const v_uint64x2& g, const v_uint64x2& h)
{
    uint32x4_t ab = vcombine_u32(vmovn_u64(a.val), vmovn_u64(b.val));
    uint32x4_t cd = vcombine_u32(vmovn_u64(c.val), vmovn_u64(d.val));
    uint32x4_t ef = vcombine_u32(vmovn_u64(e.val), vmovn_u64(f.val));
    uint32x4_t gh = vcombine_u32(vmovn_u64(g.val), vmovn_u64(h.val));

    uint16x8_t abcd = vcombine_u16(vmovn_u32(ab), vmovn_u32(cd));
    uint16x8_t efgh = vcombine_u16(vmovn_u32(ef), vmovn_u32(gh));
    return v_uint8x16(vcombine_u8(vmovn_u16(abcd), vmovn_u16(efgh)));
}
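// Illustrative sketch (not part of the original header): v_pack_b narrows
// per-lane boolean masks (lanes all-0 or all-1, e.g. comparison results) into
// a single byte mask. The helper name is hypothetical.
inline v_uint8x16 example_neon_pack_mask(const v_uint16x8& m0, const v_uint16x8& m1)
{
    return v_pack_b(m0, m1); // sixteen 8-bit lanes, each 0x00 or 0xFF
}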
inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
                            const v_float32x4& m1, const v_float32x4& m2,
                            const v_float32x4& m3)
{
    float32x2_t vl = vget_low_f32(v.val), vh = vget_high_f32(v.val);
    float32x4_t res = vmulq_lane_f32(m0.val, vl, 0);
    res = vmlaq_lane_f32(res, m1.val, vl, 1);
    res = vmlaq_lane_f32(res, m2.val, vh, 0);
    res = vmlaq_lane_f32(res, m3.val, vh, 1);
    return v_float32x4(res);
}

inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
                               const v_float32x4& m1, const v_float32x4& m2,
                               const v_float32x4& a)
{
    float32x2_t vl = vget_low_f32(v.val), vh = vget_high_f32(v.val);
    float32x4_t res = vmulq_lane_f32(m0.val, vl, 0);
    res = vmlaq_lane_f32(res, m1.val, vl, 1);
    res = vmlaq_lane_f32(res, m2.val, vh, 0);
    res = vaddq_f32(res, a.val);
    return v_float32x4(res);
}
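// Illustrative sketch (not part of the original header): v_matmul computes
// res = m0*v[0] + m1*v[1] + m2*v[2] + m3*v[3], i.e. a 4x4 matrix stored as
// four column vectors times a 4-element vector. The helper name is hypothetical.
inline v_float32x4 example_neon_transform_point(const v_float32x4& p,
                                                const v_float32x4& col0, const v_float32x4& col1,
                                                const v_float32x4& col2, const v_float32x4& col3)
{
    return v_matmul(p, col0, col1, col2, col3); // one vmulq_lane + three vmlaq_lane
}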
#define OPENCV_HAL_IMPL_NEON_BIN_OP(bin_op, _Tpvec, intrin) \
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(intrin(a.val, b.val)); \
} \
inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
{ \
    a.val = intrin(a.val, b.val); \
    return a; \
}

OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint8x16, vqaddq_u8)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint8x16, vqsubq_u8)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int8x16, vqaddq_s8)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int8x16, vqsubq_s8)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint16x8, vqaddq_u16)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint16x8, vqsubq_u16)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int16x8, vqaddq_s16)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int16x8, vqsubq_s16)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int32x4, vaddq_s32)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int32x4, vsubq_s32)
OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_int32x4, vmulq_s32)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint32x4, vaddq_u32)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint32x4, vsubq_u32)
OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_uint32x4, vmulq_u32)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_float32x4, vaddq_f32)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_float32x4, vsubq_f32)
OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_float32x4, vmulq_f32)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int64x2, vaddq_s64)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int64x2, vsubq_s64)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint64x2, vaddq_u64)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint64x2, vsubq_u64)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_NEON_BIN_OP(/, v_float32x4, vdivq_f32)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_float64x2, vaddq_f64)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_float64x2, vsubq_f64)
OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_float64x2, vmulq_f64)
OPENCV_HAL_IMPL_NEON_BIN_OP(/, v_float64x2, vdivq_f64)
#else
inline v_float32x4 operator / (const v_float32x4& a, const v_float32x4& b)
{
    // No vdivq_f32 on 32-bit ARM: take the reciprocal estimate and refine it
    // with two Newton-Raphson steps, then multiply.
    float32x4_t reciprocal = vrecpeq_f32(b.val);
    reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal);
    reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal);
    return v_float32x4(vmulq_f32(a.val, reciprocal));
}
inline v_float32x4& operator /= (v_float32x4& a, const v_float32x4& b)
{
    float32x4_t reciprocal = vrecpeq_f32(b.val);
    reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal);
    reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal);
    a.val = vmulq_f32(a.val, reciprocal);
    return a;
}
#endif
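// Illustrative sketch (not part of the original header): for 8- and 16-bit
// vectors the +/- operators map onto the saturating vqadd/vqsub intrinsics,
// so lanes clamp instead of wrapping. The helper name is hypothetical.
inline v_uint8x16 example_neon_saturating_add(const v_uint8x16& a, const v_uint8x16& b)
{
    return a + b; // e.g. lane 250 + 10 yields 255, not 4
}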
// saturating multiply 8-bit, 16-bit
#define OPENCV_HAL_IMPL_NEON_MUL_SAT(_Tpvec, _Tpwvec) \
inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \
{ \
    _Tpwvec c, d; \
    v_mul_expand(a, b, c, d); \
    return v_pack(c, d); \
} \
inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \
{ a = a * b; return a; }

OPENCV_HAL_IMPL_NEON_MUL_SAT(v_int8x16, v_int16x8)
OPENCV_HAL_IMPL_NEON_MUL_SAT(v_uint8x16, v_uint16x8)
OPENCV_HAL_IMPL_NEON_MUL_SAT(v_int16x8, v_int32x4)
OPENCV_HAL_IMPL_NEON_MUL_SAT(v_uint16x8, v_uint32x4)

// Multiply and expand
inline void v_mul_expand(const v_int8x16& a, const v_int8x16& b,
                         v_int16x8& c, v_int16x8& d)
{
    c.val = vmull_s8(vget_low_s8(a.val), vget_low_s8(b.val));
    d.val = vmull_s8(vget_high_s8(a.val), vget_high_s8(b.val));
}

inline void v_mul_expand(const v_uint8x16& a, const v_uint8x16& b,
                         v_uint16x8& c, v_uint16x8& d)
{
    c.val = vmull_u8(vget_low_u8(a.val), vget_low_u8(b.val));
    d.val = vmull_u8(vget_high_u8(a.val), vget_high_u8(b.val));
}

inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b,
                         v_int32x4& c, v_int32x4& d)
{
    c.val = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val));
    d.val = vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val));
}

inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b,
                         v_uint32x4& c, v_uint32x4& d)
{
    c.val = vmull_u16(vget_low_u16(a.val), vget_low_u16(b.val));
    d.val = vmull_u16(vget_high_u16(a.val), vget_high_u16(b.val));
}

inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b,
                         v_uint64x2& c, v_uint64x2& d)
{
    c.val = vmull_u32(vget_low_u32(a.val), vget_low_u32(b.val));
    d.val = vmull_u32(vget_high_u32(a.val), vget_high_u32(b.val));
}

inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b)
{
    return v_int16x8(vcombine_s16(
                     vshrn_n_s32(vmull_s16( vget_low_s16(a.val),  vget_low_s16(b.val)), 16),
                     vshrn_n_s32(vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val)), 16)
    ));
}
inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b)
{
    return v_uint16x8(vcombine_u16(
                      vshrn_n_u32(vmull_u16( vget_low_u16(a.val),  vget_low_u16(b.val)), 16),
                      vshrn_n_u32(vmull_u16(vget_high_u16(a.val), vget_high_u16(b.val)), 16)
    ));
}
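// Illustrative sketch (not part of the original header): v_mul_expand keeps
// full-precision products by widening the lanes, while v_mul_hi keeps only the
// top 16 bits of each 32-bit product. The helper name is hypothetical.
inline v_uint32x4 example_neon_square_low(const v_uint16x8& a)
{
    v_uint32x4 lo, hi;
    v_mul_expand(a, a, lo, hi); // widening multiply: no overflow for 16-bit inputs
    return lo;                  // 32-bit squares of the four low lanes
}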
//////// Dot Product ////////

// 16 >> 32
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
{
    int16x8_t uzp1, uzp2;
    _v128_unzip(a.val, b.val, uzp1, uzp2);
    int16x4_t a0 = vget_low_s16(uzp1);
    int16x4_t b0 = vget_high_s16(uzp1);
    int16x4_t a1 = vget_low_s16(uzp2);
    int16x4_t b1 = vget_high_s16(uzp2);
    int32x4_t p = vmull_s16(a0, b0);
    return v_int32x4(vmlal_s16(p, a1, b1));
}
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
{
    int16x8_t uzp1, uzp2;
    _v128_unzip(a.val, b.val, uzp1, uzp2);
    int16x4_t a0 = vget_low_s16(uzp1);
    int16x4_t b0 = vget_high_s16(uzp1);
    int16x4_t a1 = vget_low_s16(uzp2);
    int16x4_t b1 = vget_high_s16(uzp2);
    int32x4_t p = vmlal_s16(c.val, a0, b0);
    return v_int32x4(vmlal_s16(p, a1, b1));
}

// 32 >> 64
inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b)
{
    int32x4_t uzp1, uzp2;
    _v128_unzip(a.val, b.val, uzp1, uzp2);
    int32x2_t a0 = vget_low_s32(uzp1);
    int32x2_t b0 = vget_high_s32(uzp1);
    int32x2_t a1 = vget_low_s32(uzp2);
    int32x2_t b1 = vget_high_s32(uzp2);
    int64x2_t p = vmull_s32(a0, b0);
    return v_int64x2(vmlal_s32(p, a1, b1));
}
inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
{
    int32x4_t uzp1, uzp2;
    _v128_unzip(a.val, b.val, uzp1, uzp2);
    int32x2_t a0 = vget_low_s32(uzp1);
    int32x2_t b0 = vget_high_s32(uzp1);
    int32x2_t a1 = vget_low_s32(uzp2);
    int32x2_t b1 = vget_high_s32(uzp2);
    int64x2_t p = vmlal_s32(c.val, a0, b0);
    return v_int64x2(vmlal_s32(p, a1, b1));
}
// 8 >> 32
inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
{
#if CV_NEON_DOT
    return v_uint32x4(vdotq_u32(vdupq_n_u32(0), a.val, b.val));
#else
    const uint8x16_t zero   = vreinterpretq_u8_u32(vdupq_n_u32(0));
    const uint8x16_t mask   = vreinterpretq_u8_u32(vdupq_n_u32(0x00FF00FF));
    const uint16x8_t zero32 = vreinterpretq_u16_u32(vdupq_n_u32(0));
    const uint16x8_t mask32 = vreinterpretq_u16_u32(vdupq_n_u32(0x0000FFFF));

    uint16x8_t even = vmulq_u16(vreinterpretq_u16_u8(vbslq_u8(mask, a.val, zero)),
                                vreinterpretq_u16_u8(vbslq_u8(mask, b.val, zero)));
    uint16x8_t odd  = vmulq_u16(vshrq_n_u16(vreinterpretq_u16_u8(a.val), 8),
                                vshrq_n_u16(vreinterpretq_u16_u8(b.val), 8));

    uint32x4_t s0 = vaddq_u32(vreinterpretq_u32_u16(vbslq_u16(mask32, even, zero32)),
                              vreinterpretq_u32_u16(vbslq_u16(mask32, odd, zero32)));
    uint32x4_t s1 = vaddq_u32(vshrq_n_u32(vreinterpretq_u32_u16(even), 16),
                              vshrq_n_u32(vreinterpretq_u32_u16(odd), 16));
    return v_uint32x4(vaddq_u32(s0, s1));
#endif
}
inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b,
                                   const v_uint32x4& c)
{
#if CV_NEON_DOT
    return v_uint32x4(vdotq_u32(c.val, a.val, b.val));
#else
    return v_dotprod_expand(a, b) + c;
#endif
}

inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
{
#if CV_NEON_DOT
    return v_int32x4(vdotq_s32(vdupq_n_s32(0), a.val, b.val));
#else
    int16x8_t p0 = vmull_s8(vget_low_s8(a.val), vget_low_s8(b.val));
    int16x8_t p1 = vmull_s8(vget_high_s8(a.val), vget_high_s8(b.val));
    int16x8_t uzp1, uzp2;
    _v128_unzip(p0, p1, uzp1, uzp2);
    int16x8_t sum = vaddq_s16(uzp1, uzp2);
    int16x4_t uzpl1, uzpl2;
    _v128_unzip(vget_low_s16(sum), vget_high_s16(sum), uzpl1, uzpl2);
    return v_int32x4(vaddl_s16(uzpl1, uzpl2));
#endif
}
inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b,
                                  const v_int32x4& c)
{
#if CV_NEON_DOT
    return v_int32x4(vdotq_s32(c.val, a.val, b.val));
#else
    return v_dotprod_expand(a, b) + c;
#endif
}

// 16 >> 64
inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
{
    const uint16x8_t zero = vreinterpretq_u16_u32(vdupq_n_u32(0));
    const uint16x8_t mask = vreinterpretq_u16_u32(vdupq_n_u32(0x0000FFFF));

    uint32x4_t even = vmulq_u32(vreinterpretq_u32_u16(vbslq_u16(mask, a.val, zero)),
                                vreinterpretq_u32_u16(vbslq_u16(mask, b.val, zero)));
    uint32x4_t odd  = vmulq_u32(vshrq_n_u32(vreinterpretq_u32_u16(a.val), 16),
                                vshrq_n_u32(vreinterpretq_u32_u16(b.val), 16));
    uint32x4_t uzp1, uzp2;
    _v128_unzip(even, odd, uzp1, uzp2);
    uint64x2_t s0 = vaddl_u32(vget_low_u32(uzp1), vget_high_u32(uzp1));
    uint64x2_t s1 = vaddl_u32(vget_low_u32(uzp2), vget_high_u32(uzp2));
    return v_uint64x2(vaddq_u64(s0, s1));
}
inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
{ return v_dotprod_expand(a, b) + c; }

inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
{
    int32x4_t p0 = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val));
    int32x4_t p1 = vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val));

    int32x4_t uzp1, uzp2;
    _v128_unzip(p0, p1, uzp1, uzp2);
    int32x4_t sum = vaddq_s32(uzp1, uzp2);

    int32x2_t uzpl1, uzpl2;
    _v128_unzip(vget_low_s32(sum), vget_high_s32(sum), uzpl1, uzpl2);
    return v_int64x2(vaddl_s32(uzpl1, uzpl2));
}
inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b,
                                  const v_int64x2& c)
{ return v_dotprod_expand(a, b) + c; }

// 32 >> 64f
#if CV_SIMD128_64F
inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
{ return v_cvt_f64(v_dotprod(a, b)); }
inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b,
                                    const v_float64x2& c)
{ return v_dotprod_expand(a, b) + c; }
#endif
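// Illustrative sketch (not part of the original header): v_dotprod_expand
// widens 8-bit lanes all the way to 32 bits, so the products cannot overflow
// and the accumulator form can run over long arrays. Helper name is hypothetical.
inline v_uint32x4 example_neon_dot_u8(const v_uint8x16& a, const v_uint8x16& b,
                                      const v_uint32x4& acc)
{
    // each 32-bit lane receives the sum of four adjacent 8-bit products
    return v_dotprod_expand(a, b, acc);
}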
//////// Fast Dot Product ////////

// 16 >> 32
inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b)
{
    int16x4_t a0 = vget_low_s16(a.val);
    int16x4_t a1 = vget_high_s16(a.val);
    int16x4_t b0 = vget_low_s16(b.val);
    int16x4_t b1 = vget_high_s16(b.val);
    int32x4_t p = vmull_s16(a0, b0);
    return v_int32x4(vmlal_s16(p, a1, b1));
}
inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
{
    int16x4_t a0 = vget_low_s16(a.val);
    int16x4_t a1 = vget_high_s16(a.val);
    int16x4_t b0 = vget_low_s16(b.val);
    int16x4_t b1 = vget_high_s16(b.val);
    int32x4_t p = vmlal_s16(c.val, a0, b0);
    return v_int32x4(vmlal_s16(p, a1, b1));
}

// 32 >> 64
inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b)
{
    int32x2_t a0 = vget_low_s32(a.val);
    int32x2_t a1 = vget_high_s32(a.val);
    int32x2_t b0 = vget_low_s32(b.val);
    int32x2_t b1 = vget_high_s32(b.val);
    int64x2_t p = vmull_s32(a0, b0);
    return v_int64x2(vmlal_s32(p, a1, b1));
}
inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
{
    int32x2_t a0 = vget_low_s32(a.val);
    int32x2_t a1 = vget_high_s32(a.val);
    int32x2_t b0 = vget_low_s32(b.val);
    int32x2_t b1 = vget_high_s32(b.val);
    int64x2_t p = vmlal_s32(c.val, a0, b0);
    return v_int64x2(vmlal_s32(p, a1, b1));
}
// 8 >> 32
inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b)
{
#if CV_NEON_DOT
    return v_uint32x4(vdotq_u32(vdupq_n_u32(0), a.val, b.val));
#else
    uint16x8_t p0 = vmull_u8(vget_low_u8(a.val), vget_low_u8(b.val));
    uint16x8_t p1 = vmull_u8(vget_high_u8(a.val), vget_high_u8(b.val));
    uint32x4_t s0 = vaddl_u16(vget_low_u16(p0), vget_low_u16(p1));
    uint32x4_t s1 = vaddl_u16(vget_high_u16(p0), vget_high_u16(p1));
    return v_uint32x4(vaddq_u32(s0, s1));
#endif
}
inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
{
#if CV_NEON_DOT
    return v_uint32x4(vdotq_u32(c.val, a.val, b.val));
#else
    return v_dotprod_expand_fast(a, b) + c;
#endif
}

inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
{
#if CV_NEON_DOT
    return v_int32x4(vdotq_s32(vdupq_n_s32(0), a.val, b.val));
#else
    int16x8_t prod = vmull_s8(vget_low_s8(a.val), vget_low_s8(b.val));
    prod = vmlal_s8(prod, vget_high_s8(a.val), vget_high_s8(b.val));
    return v_int32x4(vaddl_s16(vget_low_s16(prod), vget_high_s16(prod)));
#endif
}
inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
{
#if CV_NEON_DOT
    return v_int32x4(vdotq_s32(c.val, a.val, b.val));
#else
    return v_dotprod_expand_fast(a, b) + c;
#endif
}

// 16 >> 64
inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b)
{
    uint32x4_t p0 = vmull_u16(vget_low_u16(a.val), vget_low_u16(b.val));
    uint32x4_t p1 = vmull_u16(vget_high_u16(a.val), vget_high_u16(b.val));
    uint64x2_t s0 = vaddl_u32(vget_low_u32(p0), vget_high_u32(p0));
    uint64x2_t s1 = vaddl_u32(vget_low_u32(p1), vget_high_u32(p1));
    return v_uint64x2(vaddq_u64(s0, s1));
}
inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
{ return v_dotprod_expand_fast(a, b) + c; }

inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
{
    int32x4_t prod = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val));
    prod = vmlal_s16(prod, vget_high_s16(a.val), vget_high_s16(b.val));
    return v_int64x2(vaddl_s32(vget_low_s32(prod), vget_high_s32(prod)));
}
inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
{ return v_dotprod_expand_fast(a, b) + c; }

// 32 >> 64f
#if CV_SIMD128_64F
inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b)
{ return v_cvt_f64(v_dotprod_fast(a, b)); }
inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
{ return v_dotprod_expand_fast(a, b) + c; }
#endif
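// Illustrative note (not part of the original header): the *_fast variants may
// pair lanes in a different order than v_dotprod, which is acceptable whenever
// the result is reduced to a scalar afterwards. Helper name is hypothetical.
inline v_int32x4 example_neon_dot_fast(const v_int16x8& a, const v_int16x8& b,
                                       const v_int32x4& acc)
{
    return v_dotprod_fast(a, b, acc); // lane order differs from v_dotprod; sums match
}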
#define OPENCV_HAL_IMPL_NEON_LOGIC_OP(_Tpvec, suffix) \
OPENCV_HAL_IMPL_NEON_BIN_OP(&, _Tpvec, vandq_##suffix) \
OPENCV_HAL_IMPL_NEON_BIN_OP(|, _Tpvec, vorrq_##suffix) \
OPENCV_HAL_IMPL_NEON_BIN_OP(^, _Tpvec, veorq_##suffix) \
inline _Tpvec operator ~ (const _Tpvec& a) \
{ \
    return _Tpvec(vreinterpretq_##suffix##_u8(vmvnq_u8(vreinterpretq_u8_##suffix(a.val)))); \
}

OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint8x16, u8)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int8x16, s8)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint16x8, u16)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int16x8, s16)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint32x4, u32)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int32x4, s32)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint64x2, u64)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int64x2, s64)

#define OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(bin_op, intrin) \
inline v_float32x4 operator bin_op (const v_float32x4& a, const v_float32x4& b) \
{ \
    return v_float32x4(vreinterpretq_f32_s32(intrin(vreinterpretq_s32_f32(a.val), vreinterpretq_s32_f32(b.val)))); \
} \
inline v_float32x4& operator bin_op##= (v_float32x4& a, const v_float32x4& b) \
{ \
    a.val = vreinterpretq_f32_s32(intrin(vreinterpretq_s32_f32(a.val), vreinterpretq_s32_f32(b.val))); \
    return a; \
}

OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(&, vandq_s32)
OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(|, vorrq_s32)
OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(^, veorq_s32)

inline v_float32x4 operator ~ (const v_float32x4& a)
{
    return v_float32x4(vreinterpretq_f32_s32(vmvnq_s32(vreinterpretq_s32_f32(a.val))));
}
#if CV_SIMD128_64F
inline v_float32x4 v_sqrt(const v_float32x4& x)
{
    return v_float32x4(vsqrtq_f32(x.val));
}

inline v_float32x4 v_invsqrt(const v_float32x4& x)
{
    v_float32x4 one = v_setall_f32(1.0f);
    return one / v_sqrt(x);
}
#else
inline v_float32x4 v_sqrt(const v_float32x4& x)
{
    // rsqrt estimate + two Newton-Raphson steps; the clamp to FLT_MIN keeps
    // sqrt(0) from becoming 0 * inf = NaN
    float32x4_t x1 = vmaxq_f32(x.val, vdupq_n_f32(FLT_MIN));
    float32x4_t e = vrsqrteq_f32(x1);
    e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x1, e), e), e);
    e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x1, e), e), e);
    return v_float32x4(vmulq_f32(x.val, e));
}

inline v_float32x4 v_invsqrt(const v_float32x4& x)
{
    float32x4_t e = vrsqrteq_f32(x.val);
    e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x.val, e), e), e);
    e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x.val, e), e), e);
    return v_float32x4(e);
}
#endif
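// Illustrative sketch (not part of the original header): a typical use of
// v_invsqrt is normalizing four 2D vectors at once. Helper name is hypothetical.
inline v_float32x4 example_neon_normalized_x(const v_float32x4& x, const v_float32x4& y)
{
    v_float32x4 inv_len = v_invsqrt(x * x + y * y); // exact vsqrtq on AArch64,
    return x * inv_len;                             // two NR steps elsewhere
}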
#define OPENCV_HAL_IMPL_NEON_ABS(_Tpuvec, _Tpsvec, usuffix, ssuffix) \
inline _Tpuvec v_abs(const _Tpsvec& a) { return v_reinterpret_as_##usuffix(_Tpsvec(vabsq_##ssuffix(a.val))); }

OPENCV_HAL_IMPL_NEON_ABS(v_uint8x16, v_int8x16, u8, s8)
OPENCV_HAL_IMPL_NEON_ABS(v_uint16x8, v_int16x8, u16, s16)
OPENCV_HAL_IMPL_NEON_ABS(v_uint32x4, v_int32x4, u32, s32)

inline v_float32x4 v_abs(v_float32x4 x)
{ return v_float32x4(vabsq_f32(x.val)); }
#if CV_SIMD128_64F
#define OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(bin_op, intrin) \
inline v_float64x2 operator bin_op (const v_float64x2& a, const v_float64x2& b) \
{ \
    return v_float64x2(vreinterpretq_f64_s64(intrin(vreinterpretq_s64_f64(a.val), vreinterpretq_s64_f64(b.val)))); \
} \
inline v_float64x2& operator bin_op##= (v_float64x2& a, const v_float64x2& b) \
{ \
    a.val = vreinterpretq_f64_s64(intrin(vreinterpretq_s64_f64(a.val), vreinterpretq_s64_f64(b.val))); \
    return a; \
}

OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(&, vandq_s64)
OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(|, vorrq_s64)
OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(^, veorq_s64)

inline v_float64x2 operator ~ (const v_float64x2& a)
{
    return v_float64x2(vreinterpretq_f64_s32(vmvnq_s32(vreinterpretq_s32_f64(a.val))));
}

inline v_float64x2 v_sqrt(const v_float64x2& x)
{
    return v_float64x2(vsqrtq_f64(x.val));
}

inline v_float64x2 v_invsqrt(const v_float64x2& x)
{
    v_float64x2 one = v_setall_f64(1.0);
    return one / v_sqrt(x);
}

inline v_float64x2 v_abs(v_float64x2 x)
{ return v_float64x2(vabsq_f64(x.val)); }
#endif
// TODO: exp, log, sin, cos

#define OPENCV_HAL_IMPL_NEON_BIN_FUNC(_Tpvec, func, intrin) \
inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(intrin(a.val, b.val)); \
}

OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_min, vminq_u8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_max, vmaxq_u8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_min, vminq_s8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_max, vmaxq_s8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_min, vminq_u16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_max, vmaxq_u16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_min, vminq_s16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_max, vmaxq_s16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_min, vminq_u32)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_max, vmaxq_u32)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int32x4, v_min, vminq_s32)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int32x4, v_max, vmaxq_s32)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_min, vminq_f32)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_max, vmaxq_f32)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float64x2, v_min, vminq_f64)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float64x2, v_max, vmaxq_f64)
#endif
#if CV_SIMD128_64F
inline int64x2_t vmvnq_s64(int64x2_t a)
{
    int64x2_t vx = vreinterpretq_s64_u32(vdupq_n_u32(0xFFFFFFFF));
    return veorq_s64(a, vx);
}
inline uint64x2_t vmvnq_u64(uint64x2_t a)
{
    uint64x2_t vx = vreinterpretq_u64_u32(vdupq_n_u32(0xFFFFFFFF));
    return veorq_u64(a, vx);
}
#endif
#define OPENCV_HAL_IMPL_NEON_INT_CMP_OP(_Tpvec, cast, suffix, not_suffix) \
inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(cast(vceqq_##suffix(a.val, b.val))); } \
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(cast(vmvnq_##not_suffix(vceqq_##suffix(a.val, b.val)))); } \
inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(cast(vcltq_##suffix(a.val, b.val))); } \
inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(cast(vcgtq_##suffix(a.val, b.val))); } \
inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(cast(vcleq_##suffix(a.val, b.val))); } \
inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(cast(vcgeq_##suffix(a.val, b.val))); }

OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint8x16, OPENCV_HAL_NOP, u8, u8)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int8x16, vreinterpretq_s8_u8, s8, u8)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint16x8, OPENCV_HAL_NOP, u16, u16)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int16x8, vreinterpretq_s16_u16, s16, u16)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint32x4, OPENCV_HAL_NOP, u32, u32)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int32x4, vreinterpretq_s32_u32, s32, u32)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_float32x4, vreinterpretq_f32_u32, f32, u32)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint64x2, OPENCV_HAL_NOP, u64, u64)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int64x2, vreinterpretq_s64_u64, s64, u64)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_float64x2, vreinterpretq_f64_u64, f64, u64)
#endif
inline v_float32x4 v_not_nan(const v_float32x4& a)
{ return v_float32x4(vreinterpretq_f32_u32(vceqq_f32(a.val, a.val))); }
#if CV_SIMD128_64F
inline v_float64x2 v_not_nan(const v_float64x2& a)
{ return v_float64x2(vreinterpretq_f64_u64(vceqq_f64(a.val, a.val))); }
#endif
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_add_wrap, vaddq_u8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_add_wrap, vaddq_s8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_add_wrap, vaddq_u16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_add_wrap, vaddq_s16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_sub_wrap, vsubq_u8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_sub_wrap, vsubq_s8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_sub_wrap, vsubq_u16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_sub_wrap, vsubq_s16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_mul_wrap, vmulq_u8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_mul_wrap, vmulq_s8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_mul_wrap, vmulq_u16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_mul_wrap, vmulq_s16)

OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_absdiff, vabdq_u8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_absdiff, vabdq_u16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_absdiff, vabdq_u32)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_absdiff, vabdq_f32)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float64x2, v_absdiff, vabdq_f64)
#endif

/** Saturating absolute difference **/
inline v_int8x16 v_absdiffs(const v_int8x16& a, const v_int8x16& b)
{ return v_int8x16(vqabsq_s8(vqsubq_s8(a.val, b.val))); }
inline v_int16x8 v_absdiffs(const v_int16x8& a, const v_int16x8& b)
{ return v_int16x8(vqabsq_s16(vqsubq_s16(a.val, b.val))); }
#define OPENCV_HAL_IMPL_NEON_BIN_FUNC2(_Tpvec, _Tpvec2, cast, func, intrin) \
inline _Tpvec2 func(const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec2(cast(intrin(a.val, b.val))); \
}

OPENCV_HAL_IMPL_NEON_BIN_FUNC2(v_int8x16, v_uint8x16, vreinterpretq_u8_s8, v_absdiff, vabdq_s8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC2(v_int16x8, v_uint16x8, vreinterpretq_u16_s16, v_absdiff, vabdq_s16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC2(v_int32x4, v_uint32x4, vreinterpretq_u32_s32, v_absdiff, vabdq_s32)

inline v_float32x4 v_magnitude(const v_float32x4& a, const v_float32x4& b)
{
    v_float32x4 x(vmlaq_f32(vmulq_f32(a.val, a.val), b.val, b.val));
    return v_sqrt(x);
}

inline v_float32x4 v_sqr_magnitude(const v_float32x4& a, const v_float32x4& b)
{
    return v_float32x4(vmlaq_f32(vmulq_f32(a.val, a.val), b.val, b.val));
}
inline v_float32x4 v_fma(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
{
#if CV_SIMD128_64F
    // ARMv8, which adds support for 64-bit floating-point (so CV_SIMD128_64F is defined),
    // also adds FMA support both for single- and double-precision floating-point vectors
    return v_float32x4(vfmaq_f32(c.val, a.val, b.val));
#else
    return v_float32x4(vmlaq_f32(c.val, a.val, b.val));
#endif
}

inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
{
    return v_int32x4(vmlaq_s32(c.val, a.val, b.val));
}

inline v_float32x4 v_muladd(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
{
    return v_fma(a, b, c);
}

inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
{
    return v_fma(a, b, c);
}

#if CV_SIMD128_64F
inline v_float64x2 v_magnitude(const v_float64x2& a, const v_float64x2& b)
{
    v_float64x2 x(vaddq_f64(vmulq_f64(a.val, a.val), vmulq_f64(b.val, b.val)));
    return v_sqrt(x);
}

inline v_float64x2 v_sqr_magnitude(const v_float64x2& a, const v_float64x2& b)
{
    return v_float64x2(vaddq_f64(vmulq_f64(a.val, a.val), vmulq_f64(b.val, b.val)));
}

inline v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
{
    return v_float64x2(vfmaq_f64(c.val, a.val, b.val));
}

inline v_float64x2 v_muladd(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
{
    return v_fma(a, b, c);
}
#endif
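// Illustrative sketch (not part of the original header): Horner evaluation of
// c2*x^2 + c1*x + c0 using two fused multiply-adds. The helper name and the
// coefficients are hypothetical.
inline v_float32x4 example_neon_horner(const v_float32x4& x)
{
    v_float32x4 c0 = v_setall_f32(1.0f), c1 = v_setall_f32(0.5f), c2 = v_setall_f32(0.25f);
    return v_fma(v_fma(c2, x, c1), x, c0); // ((c2*x + c1)*x + c0)
}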
// trade efficiency for convenience
#define OPENCV_HAL_IMPL_NEON_SHIFT_OP(_Tpvec, suffix, _Tps, ssuffix) \
inline _Tpvec operator << (const _Tpvec& a, int n) \
{ return _Tpvec(vshlq_##suffix(a.val, vdupq_n_##ssuffix((_Tps)n))); } \
inline _Tpvec operator >> (const _Tpvec& a, int n) \
{ return _Tpvec(vshlq_##suffix(a.val, vdupq_n_##ssuffix((_Tps)-n))); } \
template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
{ return _Tpvec(vshlq_n_##suffix(a.val, n)); } \
template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
{ return _Tpvec(vshrq_n_##suffix(a.val, n)); } \
template<int n> inline _Tpvec v_rshr(const _Tpvec& a) \
{ return _Tpvec(vrshrq_n_##suffix(a.val, n)); }

OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint8x16, u8, schar, s8)
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int8x16, s8, schar, s8)
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint16x8, u16, short, s16)
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int16x8, s16, short, s16)
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint32x4, u32, int, s32)
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int32x4, s32, int, s32)
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint64x2, u64, int64, s64)
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int64x2, s64, int64, s64)
#define OPENCV_HAL_IMPL_NEON_ROTATE_OP(_Tpvec, suffix) \
template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a) \
{ return _Tpvec(vextq_##suffix(a.val, vdupq_n_##suffix(0), n)); } \
template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a) \
{ return _Tpvec(vextq_##suffix(vdupq_n_##suffix(0), a.val, _Tpvec::nlanes - n)); } \
template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a) \
{ return a; } \
template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(vextq_##suffix(a.val, b.val, n)); } \
template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(vextq_##suffix(b.val, a.val, _Tpvec::nlanes - n)); } \
template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a, const _Tpvec& b) \
{ CV_UNUSED(b); return a; }

OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_uint8x16, u8)
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_int8x16, s8)
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_uint16x8, u16)
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_int16x8, s16)
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_uint32x4, u32)
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_int32x4, s32)
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_float32x4, f32)
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_uint64x2, u64)
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_int64x2, s64)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_float64x2, f64)
#endif
#if defined(__clang__) && defined(__aarch64__)
// avoid LD2 instruction. details: https://github.com/opencv/opencv/issues/14863
#define OPENCV_HAL_IMPL_NEON_LOAD_LOW_OP(_Tpvec, _Tp, suffix) \
inline _Tpvec v_load_low(const _Tp* ptr) \
{ \
typedef uint64 CV_DECL_ALIGNED(1) unaligned_uint64; \
uint64 v = *(unaligned_uint64*)ptr; \
return _Tpvec(v_reinterpret_as_##suffix(v_uint64x2(v, (uint64)123456))); \
}
#else
#define OPENCV_HAL_IMPL_NEON_LOAD_LOW_OP(_Tpvec, _Tp, suffix) \
inline _Tpvec v_load_low(const _Tp* ptr) \
{ return _Tpvec(vcombine_##suffix(vld1_##suffix(ptr), vdup_n_##suffix((_Tp)0))); }
#endif
#define OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(_Tpvec, _Tp, suffix) \
inline _Tpvec v_load(const _Tp* ptr) \
{ return _Tpvec(vld1q_##suffix(ptr)); } \
inline _Tpvec v_load_aligned(const _Tp* ptr) \
{ return _Tpvec(vld1q_##suffix(ptr)); } \
OPENCV_HAL_IMPL_NEON_LOAD_LOW_OP(_Tpvec, _Tp, suffix) \
inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
{ return _Tpvec(vcombine_##suffix(vld1_##suffix(ptr0), vld1_##suffix(ptr1))); } \
inline void v_store(_Tp* ptr, const _Tpvec& a) \
{ vst1q_##suffix(ptr, a.val); } \
inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
{ vst1q_##suffix(ptr, a.val); } \
inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
{ vst1q_##suffix(ptr, a.val); } \
inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/) \
{ vst1q_##suffix(ptr, a.val); } \
inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
{ vst1_##suffix(ptr, vget_low_##suffix(a.val)); } \
inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
{ vst1_##suffix(ptr, vget_high_##suffix(a.val)); }
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint8x16, uchar, u8)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int8x16, schar, s8)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint16x8, ushort, u16)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int16x8, short, s16)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint32x4, unsigned, u32)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int32x4, int, s32)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint64x2, uint64, u64)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int64x2, int64, s64)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float32x4, float, f32)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float64x2, double, f64)
#endif
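// Illustrative sketch (not part of the original header): the canonical
// load/process/store loop, with a scalar tail for lengths that are not a
// multiple of 16. The helper name is hypothetical.
inline void example_neon_add_saturate_u8(const uchar* a, const uchar* b,
                                         uchar* dst, int len)
{
    int i = 0;
    for (; i <= len - 16; i += 16)
    {
        v_uint8x16 va = v_load(a + i), vb = v_load(b + i);
        v_store(dst + i, va + vb); // saturating add of 16 lanes per iteration
    }
    for (; i < len; i++) // scalar tail
        dst[i] = (uchar)std::min(255, a[i] + b[i]);
}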
inline unsigned v_reduce_sum(const v_uint8x16& a)
{
    uint32x4_t t0 = vpaddlq_u16(vpaddlq_u8(a.val));
    uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
    return vget_lane_u32(vpadd_u32(t1, t1), 0);
}
inline int v_reduce_sum(const v_int8x16& a)
{
    int32x4_t t0 = vpaddlq_s16(vpaddlq_s8(a.val));
    int32x2_t t1 = vpadd_s32(vget_low_s32(t0), vget_high_s32(t0));
    return vget_lane_s32(vpadd_s32(t1, t1), 0);
}
inline unsigned v_reduce_sum(const v_uint16x8& a)
{
    uint32x4_t t0 = vpaddlq_u16(a.val);
    uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
    return vget_lane_u32(vpadd_u32(t1, t1), 0);
}
inline int v_reduce_sum(const v_int16x8& a)
{
    int32x4_t t0 = vpaddlq_s16(a.val);
    int32x2_t t1 = vpadd_s32(vget_low_s32(t0), vget_high_s32(t0));
    return vget_lane_s32(vpadd_s32(t1, t1), 0);
}
#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
inline scalartype v_reduce_##func(const _Tpvec& a) \
{ \
    _Tpnvec##_t a0 = vp##vectorfunc##_##suffix(vget_low_##suffix(a.val), vget_high_##suffix(a.val)); \
    a0 = vp##vectorfunc##_##suffix(a0, a0); \
    return (scalartype)vget_lane_##suffix(vp##vectorfunc##_##suffix(a0, a0), 0); \
}

OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_uint16x8, uint16x4, unsigned int, max, max, u16)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_uint16x8, uint16x4, unsigned int, min, min, u16)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_int16x8, int16x4, int, max, max, s16)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_int16x8, int16x4, int, min, min, s16)

#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
inline scalartype v_reduce_##func(const _Tpvec& a) \
{ \
    _Tpnvec##_t a0 = vp##vectorfunc##_##suffix(vget_low_##suffix(a.val), vget_high_##suffix(a.val)); \
    return (scalartype)vget_lane_##suffix(vp##vectorfunc##_##suffix(a0, vget_high_##suffix(a.val)), 0); \
}

OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, uint32x2, unsigned, sum, add, u32)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, uint32x2, unsigned, max, max, u32)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, uint32x2, unsigned, min, min, u32)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_int32x4, int32x2, int, sum, add, s32)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_int32x4, int32x2, int, max, max, s32)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_int32x4, int32x2, int, min, min, s32)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, sum, add, f32)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, max, max, f32)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, min, min, f32)
inline uint64 v_reduce_sum(const v_uint64x2& a)
{ return vget_lane_u64(vadd_u64(vget_low_u64(a.val), vget_high_u64(a.val)), 0); }
inline int64 v_reduce_sum(const v_int64x2& a)
{ return vget_lane_s64(vadd_s64(vget_low_s64(a.val), vget_high_s64(a.val)), 0); }
#if CV_SIMD128_64F
inline double v_reduce_sum(const v_float64x2& a)
{
    return vgetq_lane_f64(a.val, 0) + vgetq_lane_f64(a.val, 1);
}
#endif
inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
                                 const v_float32x4& c, const v_float32x4& d)
{
    float32x4x2_t ab = vtrnq_f32(a.val, b.val);
    float32x4x2_t cd = vtrnq_f32(c.val, d.val);

    float32x4_t u0 = vaddq_f32(ab.val[0], ab.val[1]); // a0+a1 b0+b1 a2+a3 b2+b3
    float32x4_t u1 = vaddq_f32(cd.val[0], cd.val[1]); // c0+c1 d0+d1 c2+c3 d2+d3

    float32x4_t v0 = vcombine_f32(vget_low_f32(u0), vget_low_f32(u1));
    float32x4_t v1 = vcombine_f32(vget_high_f32(u0), vget_high_f32(u1));

    return v_float32x4(vaddq_f32(v0, v1));
}
inline unsigned v_reduce_sad(const v_uint8x16& a, const v_uint8x16& b)
{
    uint32x4_t t0 = vpaddlq_u16(vpaddlq_u8(vabdq_u8(a.val, b.val)));
    uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
    return vget_lane_u32(vpadd_u32(t1, t1), 0);
}
inline unsigned v_reduce_sad(const v_int8x16& a, const v_int8x16& b)
{
    uint32x4_t t0 = vpaddlq_u16(vpaddlq_u8(vreinterpretq_u8_s8(vabdq_s8(a.val, b.val))));
    uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
    return vget_lane_u32(vpadd_u32(t1, t1), 0);
}
inline unsigned v_reduce_sad(const v_uint16x8& a, const v_uint16x8& b)
{
    uint32x4_t t0 = vpaddlq_u16(vabdq_u16(a.val, b.val));
    uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
    return vget_lane_u32(vpadd_u32(t1, t1), 0);
}
inline unsigned v_reduce_sad(const v_int16x8& a, const v_int16x8& b)
{
    uint32x4_t t0 = vpaddlq_u16(vreinterpretq_u16_s16(vabdq_s16(a.val, b.val)));
    uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
    return vget_lane_u32(vpadd_u32(t1, t1), 0);
}
inline unsigned v_reduce_sad(const v_uint32x4& a, const v_uint32x4& b)
{
    uint32x4_t t0 = vabdq_u32(a.val, b.val);
    uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
    return vget_lane_u32(vpadd_u32(t1, t1), 0);
}
inline unsigned v_reduce_sad(const v_int32x4& a, const v_int32x4& b)
{
    uint32x4_t t0 = vreinterpretq_u32_s32(vabdq_s32(a.val, b.val));
    uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
    return vget_lane_u32(vpadd_u32(t1, t1), 0);
}
inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b)
{
    float32x4_t t0 = vabdq_f32(a.val, b.val);
    float32x2_t t1 = vpadd_f32(vget_low_f32(t0), vget_high_f32(t0));
    return vget_lane_f32(vpadd_f32(t1, t1), 0);
}
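// Illustrative sketch (not part of the original header): v_reduce_sad gives
// the sum of absolute differences over all 16 byte lanes, e.g. one row of a
// 16x16 block-matching window. The helper name is hypothetical.
inline unsigned example_neon_row_sad(const uchar* a, const uchar* b)
{
    return v_reduce_sad(v_load(a), v_load(b)); // vabdq_u8 + horizontal sum
}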
inline v_uint8x16 v_popcount(const v_uint8x16& a)
{ return v_uint8x16(vcntq_u8(a.val)); }
inline v_uint8x16 v_popcount(const v_int8x16& a)
{ return v_uint8x16(vcntq_u8(vreinterpretq_u8_s8(a.val))); }
inline v_uint16x8 v_popcount(const v_uint16x8& a)
{ return v_uint16x8(vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u16(a.val)))); }
inline v_uint16x8 v_popcount(const v_int16x8& a)
{ return v_uint16x8(vpaddlq_u8(vcntq_u8(vreinterpretq_u8_s16(a.val)))); }
inline v_uint32x4 v_popcount(const v_uint32x4& a)
{ return v_uint32x4(vpaddlq_u16(vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u32(a.val))))); }
inline v_uint32x4 v_popcount(const v_int32x4& a)
{ return v_uint32x4(vpaddlq_u16(vpaddlq_u8(vcntq_u8(vreinterpretq_u8_s32(a.val))))); }
inline v_uint64x2 v_popcount(const v_uint64x2& a)
{ return v_uint64x2(vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u64(a.val)))))); }
inline v_uint64x2 v_popcount(const v_int64x2& a)
{ return v_uint64x2(vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vcntq_u8(vreinterpretq_u8_s64(a.val)))))); }
inline int v_signmask(const v_uint8x16& a)
{
    int8x8_t m0 = vcreate_s8(CV_BIG_UINT(0x0706050403020100));
    uint8x16_t v0 = vshlq_u8(vshrq_n_u8(a.val, 7), vcombine_s8(m0, m0));
    uint64x2_t v1 = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(v0)));
    return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 8);
}
inline int v_signmask(const v_int8x16& a)
{ return v_signmask(v_reinterpret_as_u8(a)); }

inline int v_signmask(const v_uint16x8& a)
{
    int16x4_t m0 = vcreate_s16(CV_BIG_UINT(0x0003000200010000));
    uint16x8_t v0 = vshlq_u16(vshrq_n_u16(a.val, 15), vcombine_s16(m0, m0));
    uint64x2_t v1 = vpaddlq_u32(vpaddlq_u16(v0));
    return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 4);
}
inline int v_signmask(const v_int16x8& a)
{ return v_signmask(v_reinterpret_as_u16(a)); }

inline int v_signmask(const v_uint32x4& a)
{
    int32x2_t m0 = vcreate_s32(CV_BIG_UINT(0x0000000100000000));
    uint32x4_t v0 = vshlq_u32(vshrq_n_u32(a.val, 31), vcombine_s32(m0, m0));
    uint64x2_t v1 = vpaddlq_u32(v0);
    return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 2);
}
inline int v_signmask(const v_int32x4& a)
{ return v_signmask(v_reinterpret_as_u32(a)); }
inline int v_signmask(const v_float32x4& a)
{ return v_signmask(v_reinterpret_as_u32(a)); }
inline int v_signmask(const v_uint64x2& a)
{
    int64x1_t m0 = vdup_n_s64(0);
    uint64x2_t v0 = vshlq_u64(vshrq_n_u64(a.val, 63), vcombine_s64(m0, m0));
    return (int)vgetq_lane_u64(v0, 0) + ((int)vgetq_lane_u64(v0, 1) << 1);
}
inline int v_signmask(const v_int64x2& a)
{ return v_signmask(v_reinterpret_as_u64(a)); }
#if CV_SIMD128_64F
inline int v_signmask(const v_float64x2& a)
{ return v_signmask(v_reinterpret_as_u64(a)); }
#endif
inline int v_scan_forward(const v_int8x16& a) { return trailingZeros32(v_signmask(a)); }
inline int v_scan_forward(const v_uint8x16& a) { return trailingZeros32(v_signmask(a)); }
inline int v_scan_forward(const v_int16x8& a) { return trailingZeros32(v_signmask(a)); }
inline int v_scan_forward(const v_uint16x8& a) { return trailingZeros32(v_signmask(a)); }
inline int v_scan_forward(const v_int32x4& a) { return trailingZeros32(v_signmask(a)); }
inline int v_scan_forward(const v_uint32x4& a) { return trailingZeros32(v_signmask(a)); }
inline int v_scan_forward(const v_float32x4& a) { return trailingZeros32(v_signmask(a)); }
inline int v_scan_forward(const v_int64x2& a) { return trailingZeros32(v_signmask(a)); }
inline int v_scan_forward(const v_uint64x2& a) { return trailingZeros32(v_signmask(a)); }
#if CV_SIMD128_64F
inline int v_scan_forward(const v_float64x2& a) { return trailingZeros32(v_signmask(a)); }
#endif
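// Illustrative sketch (not part of the original header): v_signmask packs the
// per-lane MSBs into an int, and v_scan_forward returns the index of the first
// set lane; check for an empty mask first. The helper name is hypothetical.
inline int example_neon_find_first_equal(const v_uint8x16& a, const v_uint8x16& b)
{
    v_uint8x16 m = (a == b);      // 0xFF where lanes match, 0 elsewhere
    if (v_signmask(m) == 0)
        return -1;                // no match in these 16 lanes
    return v_scan_forward(m);     // index of the first matching lane
}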
#define OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(_Tpvec, suffix, shift) \
inline bool v_check_all(const v_##_Tpvec& a) \
{ \
    _Tpvec##_t v0 = vshrq_n_##suffix(vmvnq_##suffix(a.val), shift); \
    uint64x2_t v1 = vreinterpretq_u64_##suffix(v0); \
    return (vgetq_lane_u64(v1, 0) | vgetq_lane_u64(v1, 1)) == 0; \
} \
inline bool v_check_any(const v_##_Tpvec& a) \
{ \
    _Tpvec##_t v0 = vshrq_n_##suffix(a.val, shift); \
    uint64x2_t v1 = vreinterpretq_u64_##suffix(v0); \
    return (vgetq_lane_u64(v1, 0) | vgetq_lane_u64(v1, 1)) != 0; \
}

OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint8x16, u8, 7)
OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint16x8, u16, 15)
OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint32x4, u32, 31)

inline bool v_check_all(const v_uint64x2& a)
{
    uint64x2_t v0 = vshrq_n_u64(a.val, 63);
    return (vgetq_lane_u64(v0, 0) & vgetq_lane_u64(v0, 1)) == 1;
}
inline bool v_check_any(const v_uint64x2& a)
{
    uint64x2_t v0 = vshrq_n_u64(a.val, 63);
    return (vgetq_lane_u64(v0, 0) | vgetq_lane_u64(v0, 1)) != 0;
}
inline bool v_check_all(const v_int8x16& a)
{ return v_check_all(v_reinterpret_as_u8(a)); }
inline bool v_check_all(const v_int16x8& a)
{ return v_check_all(v_reinterpret_as_u16(a)); }
inline bool v_check_all(const v_int32x4& a)
{ return v_check_all(v_reinterpret_as_u32(a)); }
inline bool v_check_all(const v_float32x4& a)
{ return v_check_all(v_reinterpret_as_u32(a)); }

inline bool v_check_any(const v_int8x16& a)
{ return v_check_any(v_reinterpret_as_u8(a)); }
inline bool v_check_any(const v_int16x8& a)
{ return v_check_any(v_reinterpret_as_u16(a)); }
inline bool v_check_any(const v_int32x4& a)
{ return v_check_any(v_reinterpret_as_u32(a)); }
inline bool v_check_any(const v_float32x4& a)
{ return v_check_any(v_reinterpret_as_u32(a)); }

inline bool v_check_all(const v_int64x2& a)
{ return v_check_all(v_reinterpret_as_u64(a)); }
inline bool v_check_any(const v_int64x2& a)
{ return v_check_any(v_reinterpret_as_u64(a)); }
#if CV_SIMD128_64F
inline bool v_check_all(const v_float64x2& a)
{ return v_check_all(v_reinterpret_as_u64(a)); }
inline bool v_check_any(const v_float64x2& a)
{ return v_check_any(v_reinterpret_as_u64(a)); }
#endif
#define OPENCV_HAL_IMPL_NEON_SELECT(_Tpvec, suffix, usuffix) \
inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(vbslq_##suffix(vreinterpretq_##usuffix##_##suffix(mask.val), a.val, b.val)); \
}

OPENCV_HAL_IMPL_NEON_SELECT(v_uint8x16, u8, u8)
OPENCV_HAL_IMPL_NEON_SELECT(v_int8x16, s8, u8)
OPENCV_HAL_IMPL_NEON_SELECT(v_uint16x8, u16, u16)
OPENCV_HAL_IMPL_NEON_SELECT(v_int16x8, s16, u16)
OPENCV_HAL_IMPL_NEON_SELECT(v_uint32x4, u32, u32)
OPENCV_HAL_IMPL_NEON_SELECT(v_int32x4, s32, u32)
OPENCV_HAL_IMPL_NEON_SELECT(v_float32x4, f32, u32)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_NEON_SELECT(v_float64x2, f64, u64)
#endif
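// Illustrative sketch (not part of the original header): v_select is the
// branchless per-lane ternary (vbslq). The helper name is hypothetical.
inline v_float32x4 example_neon_clamp_low(const v_float32x4& x, const v_float32x4& lo)
{
    return v_select(x < lo, lo, x); // where x < lo take lo, else keep x
}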
#define OPENCV_HAL_IMPL_NEON_EXPAND(_Tpvec, _Tpwvec, _Tp, suffix) \
inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
{ \
    b0.val = vmovl_##suffix(vget_low_##suffix(a.val)); \
    b1.val = vmovl_##suffix(vget_high_##suffix(a.val)); \
} \
inline _Tpwvec v_expand_low(const _Tpvec& a) \
{ \
    return _Tpwvec(vmovl_##suffix(vget_low_##suffix(a.val))); \
} \
inline _Tpwvec v_expand_high(const _Tpvec& a) \
{ \
    return _Tpwvec(vmovl_##suffix(vget_high_##suffix(a.val))); \
} \
inline _Tpwvec v_load_expand(const _Tp* ptr) \
{ \
    return _Tpwvec(vmovl_##suffix(vld1_##suffix(ptr))); \
}

OPENCV_HAL_IMPL_NEON_EXPAND(v_uint8x16, v_uint16x8, uchar, u8)
OPENCV_HAL_IMPL_NEON_EXPAND(v_int8x16, v_int16x8, schar, s8)
OPENCV_HAL_IMPL_NEON_EXPAND(v_uint16x8, v_uint32x4, ushort, u16)
OPENCV_HAL_IMPL_NEON_EXPAND(v_int16x8, v_int32x4, short, s16)
OPENCV_HAL_IMPL_NEON_EXPAND(v_uint32x4, v_uint64x2, uint, u32)
OPENCV_HAL_IMPL_NEON_EXPAND(v_int32x4, v_int64x2, int, s32)
inline v_uint32x4 v_load_expand_q(const uchar* ptr)
{
    typedef unsigned int CV_DECL_ALIGNED(1) unaligned_uint;
    uint8x8_t v0 = vcreate_u8(*(unaligned_uint*)ptr);
    uint16x4_t v1 = vget_low_u16(vmovl_u8(v0));
    return v_uint32x4(vmovl_u16(v1));
}

inline v_int32x4 v_load_expand_q(const schar* ptr)
{
    typedef unsigned int CV_DECL_ALIGNED(1) unaligned_uint;
    int8x8_t v0 = vcreate_s8(*(unaligned_uint*)ptr);
    int16x4_t v1 = vget_low_s16(vmovl_s8(v0));
    return v_int32x4(vmovl_s16(v1));
}
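// v_load_expand_q reads just 4 bytes and widens them to four 32-bit lanes via
// two vmovl steps; the CV_DECL_ALIGNED(1) typedef keeps the 4-byte load legal
// for unaligned pointers.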
#if defined(__aarch64__)
#define OPENCV_HAL_IMPL_NEON_UNPACKS(_Tpvec, suffix) \
inline void v_zip(const v_##_Tpvec& a0, const v_##_Tpvec& a1, v_##_Tpvec& b0, v_##_Tpvec& b1) \
{ \
    b0.val = vzip1q_##suffix(a0.val, a1.val); \
    b1.val = vzip2q_##suffix(a0.val, a1.val); \
} \
inline v_##_Tpvec v_combine_low(const v_##_Tpvec& a, const v_##_Tpvec& b) \
{ \
    return v_##_Tpvec(vcombine_##suffix(vget_low_##suffix(a.val), vget_low_##suffix(b.val))); \
} \
inline v_##_Tpvec v_combine_high(const v_##_Tpvec& a, const v_##_Tpvec& b) \
{ \
    return v_##_Tpvec(vcombine_##suffix(vget_high_##suffix(a.val), vget_high_##suffix(b.val))); \
} \
inline void v_recombine(const v_##_Tpvec& a, const v_##_Tpvec& b, v_##_Tpvec& c, v_##_Tpvec& d) \
{ \
    c.val = vcombine_##suffix(vget_low_##suffix(a.val), vget_low_##suffix(b.val)); \
    d.val = vcombine_##suffix(vget_high_##suffix(a.val), vget_high_##suffix(b.val)); \
}
#else
#define OPENCV_HAL_IMPL_NEON_UNPACKS(_Tpvec, suffix) \
inline void v_zip(const v_##_Tpvec& a0, const v_##_Tpvec& a1, v_##_Tpvec& b0, v_##_Tpvec& b1) \
{ \
    _Tpvec##x2_t p = vzipq_##suffix(a0.val, a1.val); \
    b0.val = p.val[0]; \
    b1.val = p.val[1]; \
} \
inline v_##_Tpvec v_combine_low(const v_##_Tpvec& a, const v_##_Tpvec& b) \
{ \
    return v_##_Tpvec(vcombine_##suffix(vget_low_##suffix(a.val), vget_low_##suffix(b.val))); \
} \
inline v_##_Tpvec v_combine_high(const v_##_Tpvec& a, const v_##_Tpvec& b) \
{ \
    return v_##_Tpvec(vcombine_##suffix(vget_high_##suffix(a.val), vget_high_##suffix(b.val))); \
} \
inline void v_recombine(const v_##_Tpvec& a, const v_##_Tpvec& b, v_##_Tpvec& c, v_##_Tpvec& d) \
{ \
    c.val = vcombine_##suffix(vget_low_##suffix(a.val), vget_low_##suffix(b.val)); \
    d.val = vcombine_##suffix(vget_high_##suffix(a.val), vget_high_##suffix(b.val)); \
}
#endif

OPENCV_HAL_IMPL_NEON_UNPACKS(uint8x16, u8)
OPENCV_HAL_IMPL_NEON_UNPACKS(int8x16, s8)
OPENCV_HAL_IMPL_NEON_UNPACKS(uint16x8, u16)
OPENCV_HAL_IMPL_NEON_UNPACKS(int16x8, s16)
OPENCV_HAL_IMPL_NEON_UNPACKS(uint32x4, u32)
OPENCV_HAL_IMPL_NEON_UNPACKS(int32x4, s32)
OPENCV_HAL_IMPL_NEON_UNPACKS(float32x4, f32)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_NEON_UNPACKS(float64x2, f64)
#endif
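// Illustrative sketch: v_zip interleaves two registers lane by lane.
//   a0 = {A0 A1 A2 A3}, a1 = {B0 B1 B2 B3}
//   v_zip(a0, a1, b0, b1);  // b0 = {A0 B0 A1 B1}, b1 = {A2 B2 A3 B3}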
inline v_uint8x16 v_reverse(const v_uint8x16 &a)
{
    uint8x16_t vec = vrev64q_u8(a.val);
    return v_uint8x16(vextq_u8(vec, vec, 8));
}

inline v_int8x16 v_reverse(const v_int8x16 &a)
{ return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); }

inline v_uint16x8 v_reverse(const v_uint16x8 &a)
{
    uint16x8_t vec = vrev64q_u16(a.val);
    return v_uint16x8(vextq_u16(vec, vec, 4));
}

inline v_int16x8 v_reverse(const v_int16x8 &a)
{ return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); }

inline v_uint32x4 v_reverse(const v_uint32x4 &a)
{
    uint32x4_t vec = vrev64q_u32(a.val);
    return v_uint32x4(vextq_u32(vec, vec, 2));
}

inline v_int32x4 v_reverse(const v_int32x4 &a)
{ return v_reinterpret_as_s32(v_reverse(v_reinterpret_as_u32(a))); }

inline v_float32x4 v_reverse(const v_float32x4 &a)
{ return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); }

inline v_uint64x2 v_reverse(const v_uint64x2 &a)
{
    uint64x2_t vec = a.val;
    uint64x1_t vec_lo = vget_low_u64(vec);
    uint64x1_t vec_hi = vget_high_u64(vec);
    return v_uint64x2(vcombine_u64(vec_hi, vec_lo));
}

inline v_int64x2 v_reverse(const v_int64x2 &a)
{ return v_reinterpret_as_s64(v_reverse(v_reinterpret_as_u64(a))); }

#if CV_SIMD128_64F
inline v_float64x2 v_reverse(const v_float64x2 &a)
{ return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); }
#endif
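// The sub-64-bit reversals work in two steps: vrev64q reverses lanes inside
// each 64-bit half, then vextq rotates by half the register to swap halves.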
#define OPENCV_HAL_IMPL_NEON_EXTRACT(_Tpvec, suffix) \
template <int s> \
inline v_##_Tpvec v_extract(const v_##_Tpvec& a, const v_##_Tpvec& b) \
{ \
    return v_##_Tpvec(vextq_##suffix(a.val, b.val, s)); \
}

OPENCV_HAL_IMPL_NEON_EXTRACT(uint8x16, u8)
OPENCV_HAL_IMPL_NEON_EXTRACT(int8x16, s8)
OPENCV_HAL_IMPL_NEON_EXTRACT(uint16x8, u16)
OPENCV_HAL_IMPL_NEON_EXTRACT(int16x8, s16)
OPENCV_HAL_IMPL_NEON_EXTRACT(uint32x4, u32)
OPENCV_HAL_IMPL_NEON_EXTRACT(int32x4, s32)
OPENCV_HAL_IMPL_NEON_EXTRACT(uint64x2, u64)
OPENCV_HAL_IMPL_NEON_EXTRACT(int64x2, s64)
OPENCV_HAL_IMPL_NEON_EXTRACT(float32x4, f32)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_NEON_EXTRACT(float64x2, f64)
#endif
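// Illustrative sketch: v_extract<s> concatenates a:b and takes one register's
// worth of lanes starting at lane s.
//   v_extract<1>({A0 A1 A2 A3}, {B0 B1 B2 B3}) == {A1 A2 A3 B0}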
#define OPENCV_HAL_IMPL_NEON_EXTRACT_N(_Tpvec, _Tp, suffix) \
template<int i> inline _Tp v_extract_n(_Tpvec v) { return vgetq_lane_##suffix(v.val, i); }

OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_uint8x16, uchar, u8)
OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_int8x16, schar, s8)
OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_uint16x8, ushort, u16)
OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_int16x8, short, s16)
OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_uint32x4, uint, u32)
OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_int32x4, int, s32)
OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_uint64x2, uint64, u64)
OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_int64x2, int64, s64)
OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_float32x4, float, f32)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_float64x2, double, f64)
#endif
#define OPENCV_HAL_IMPL_NEON_BROADCAST(_Tpvec, _Tp, suffix) \
template<int i> inline _Tpvec v_broadcast_element(_Tpvec v) { _Tp t = v_extract_n<i>(v); return v_setall_##suffix(t); }

OPENCV_HAL_IMPL_NEON_BROADCAST(v_uint8x16, uchar, u8)
OPENCV_HAL_IMPL_NEON_BROADCAST(v_int8x16, schar, s8)
OPENCV_HAL_IMPL_NEON_BROADCAST(v_uint16x8, ushort, u16)
OPENCV_HAL_IMPL_NEON_BROADCAST(v_int16x8, short, s16)
OPENCV_HAL_IMPL_NEON_BROADCAST(v_uint32x4, uint, u32)
OPENCV_HAL_IMPL_NEON_BROADCAST(v_int32x4, int, s32)
OPENCV_HAL_IMPL_NEON_BROADCAST(v_uint64x2, uint64, u64)
OPENCV_HAL_IMPL_NEON_BROADCAST(v_int64x2, int64, s64)
OPENCV_HAL_IMPL_NEON_BROADCAST(v_float32x4, float, f32)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_NEON_BROADCAST(v_float64x2, double, f64)
#endif
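// Illustrative sketch:
//   v_int32x4 v(10, 20, 30, 40);
//   int x = v_extract_n<2>(v);               // 30
//   v_int32x4 b = v_broadcast_element<2>(v); // {30, 30, 30, 30}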
#if CV_SIMD128_64F
inline v_int32x4 v_round(const v_float32x4& a)
{
    float32x4_t a_ = a.val;
    int32x4_t result;
    __asm__ ("fcvtns %0.4s, %1.4s"
             : "=w"(result)
             : "w"(a_)
             : /* No clobbers */);
    return v_int32x4(result);
}
#else
inline v_int32x4 v_round(const v_float32x4& a)
{
    // emulate round-to-nearest: add +/-0.5 with the sign of 'a', then truncate
    static const int32x4_t v_sign = vdupq_n_s32(1 << 31),
        v_05 = vreinterpretq_s32_f32(vdupq_n_f32(0.5f));

    int32x4_t v_addition = vorrq_s32(v_05, vandq_s32(v_sign, vreinterpretq_s32_f32(a.val)));
    return v_int32x4(vcvtq_s32_f32(vaddq_f32(a.val, vreinterpretq_f32_s32(v_addition))));
}
#endif

inline v_int32x4 v_floor(const v_float32x4& a)
{
    // truncate; adding the all-ones (-1) mask subtracts 1 where trunc(a) > a
    int32x4_t a1 = vcvtq_s32_f32(a.val);
    uint32x4_t mask = vcgtq_f32(vcvtq_f32_s32(a1), a.val);
    return v_int32x4(vaddq_s32(a1, vreinterpretq_s32_u32(mask)));
}

inline v_int32x4 v_ceil(const v_float32x4& a)
{
    // truncate; subtracting the all-ones (-1) mask adds 1 where a > trunc(a)
    int32x4_t a1 = vcvtq_s32_f32(a.val);
    uint32x4_t mask = vcgtq_f32(a.val, vcvtq_f32_s32(a1));
    return v_int32x4(vsubq_s32(a1, vreinterpretq_s32_u32(mask)));
}

inline v_int32x4 v_trunc(const v_float32x4& a)
{ return v_int32x4(vcvtq_s32_f32(a.val)); }
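// Illustrative sketch (values are examples only; tie-breaking of exact .5
// cases differs between the AArch64 fcvtns path and the fallback):
//   v_float32x4 v(1.5f, -1.5f, 2.7f, -2.7f);
//   v_round(v);  // {2, -2, 3, -3}
//   v_floor(v);  // {1, -2, 2, -3}
//   v_ceil(v);   // {2, -1, 3, -2}
//   v_trunc(v);  // {1, -1, 2, -2}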
#if CV_SIMD128_64F
inline v_int32x4 v_round(const v_float64x2& a)
{
    static const int32x2_t zero = vdup_n_s32(0);
    return v_int32x4(vcombine_s32(vmovn_s64(vcvtaq_s64_f64(a.val)), zero));
}

inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b)
{
    return v_int32x4(vcombine_s32(vmovn_s64(vcvtaq_s64_f64(a.val)), vmovn_s64(vcvtaq_s64_f64(b.val))));
}

inline v_int32x4 v_floor(const v_float64x2& a)
{
    static const int32x2_t zero = vdup_n_s32(0);
    int64x2_t a1 = vcvtq_s64_f64(a.val);
    uint64x2_t mask = vcgtq_f64(vcvtq_f64_s64(a1), a.val);
    a1 = vaddq_s64(a1, vreinterpretq_s64_u64(mask));
    return v_int32x4(vcombine_s32(vmovn_s64(a1), zero));
}

inline v_int32x4 v_ceil(const v_float64x2& a)
{
    static const int32x2_t zero = vdup_n_s32(0);
    int64x2_t a1 = vcvtq_s64_f64(a.val);
    uint64x2_t mask = vcgtq_f64(a.val, vcvtq_f64_s64(a1));
    a1 = vsubq_s64(a1, vreinterpretq_s64_u64(mask));
    return v_int32x4(vcombine_s32(vmovn_s64(a1), zero));
}

inline v_int32x4 v_trunc(const v_float64x2& a)
{
    static const int32x2_t zero = vdup_n_s32(0);
    return v_int32x4(vcombine_s32(vmovn_s64(vcvtq_s64_f64(a.val)), zero));
}
#endif
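// The f64 versions narrow to the low two s32 lanes and zero the high lanes;
// the two-argument v_round packs two double registers into one v_int32x4.
// vcvtaq_s64_f64 rounds to nearest (ties away from zero), while v_trunc's
// vcvtq_s64_f64 rounds toward zero.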
#define OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(_Tpvec, suffix) \
inline void v_transpose4x4(const v_##_Tpvec& a0, const v_##_Tpvec& a1, \
                         const v_##_Tpvec& a2, const v_##_Tpvec& a3, \
                         v_##_Tpvec& b0, v_##_Tpvec& b1, \
                         v_##_Tpvec& b2, v_##_Tpvec& b3) \
{ \
    /* m00 m01 m02 m03 */ \
    /* m10 m11 m12 m13 */ \
    /* m20 m21 m22 m23 */ \
    /* m30 m31 m32 m33 */ \
    _Tpvec##x2_t t0 = vtrnq_##suffix(a0.val, a1.val); \
    _Tpvec##x2_t t1 = vtrnq_##suffix(a2.val, a3.val); \
    /* m00 m10 m02 m12 */ \
    /* m01 m11 m03 m13 */ \
    /* m20 m30 m22 m32 */ \
    /* m21 m31 m23 m33 */ \
    b0.val = vcombine_##suffix(vget_low_##suffix(t0.val[0]), vget_low_##suffix(t1.val[0])); \
    b1.val = vcombine_##suffix(vget_low_##suffix(t0.val[1]), vget_low_##suffix(t1.val[1])); \
    b2.val = vcombine_##suffix(vget_high_##suffix(t0.val[0]), vget_high_##suffix(t1.val[0])); \
    b3.val = vcombine_##suffix(vget_high_##suffix(t0.val[1]), vget_high_##suffix(t1.val[1])); \
}

OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(uint32x4, u32)
OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(int32x4, s32)
OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(float32x4, f32)
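// Illustrative sketch: transposing a 4x4 float matrix held in four rows.
//   v_float32x4 r0, r1, r2, r3;  // rows in
//   v_float32x4 c0, c1, c2, c3;  // columns out
//   v_transpose4x4(r0, r1, r2, r3, c0, c1, c2, c3);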
#define OPENCV_HAL_IMPL_NEON_INTERLEAVED(_Tpvec, _Tp, suffix) \
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b) \
{ \
    _Tpvec##x2_t v = vld2q_##suffix(ptr); \
    a.val = v.val[0]; \
    b.val = v.val[1]; \
} \
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, v_##_Tpvec& c) \
{ \
    _Tpvec##x3_t v = vld3q_##suffix(ptr); \
    a.val = v.val[0]; \
    b.val = v.val[1]; \
    c.val = v.val[2]; \
} \
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, \
                                v_##_Tpvec& c, v_##_Tpvec& d) \
{ \
    _Tpvec##x4_t v = vld4q_##suffix(ptr); \
    a.val = v.val[0]; \
    b.val = v.val[1]; \
    c.val = v.val[2]; \
    d.val = v.val[3]; \
} \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
{ \
    _Tpvec##x2_t v; \
    v.val[0] = a.val; \
    v.val[1] = b.val; \
    vst2q_##suffix(ptr, v); \
} \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
                                const v_##_Tpvec& c, hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
{ \
    _Tpvec##x3_t v; \
    v.val[0] = a.val; \
    v.val[1] = b.val; \
    v.val[2] = c.val; \
    vst3q_##suffix(ptr, v); \
} \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
                                const v_##_Tpvec& c, const v_##_Tpvec& d, \
                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) \
{ \
    _Tpvec##x4_t v; \
    v.val[0] = a.val; \
    v.val[1] = b.val; \
    v.val[2] = c.val; \
    v.val[3] = d.val; \
    vst4q_##suffix(ptr, v); \
}
#define OPENCV_HAL_IMPL_NEON_INTERLEAVED_INT64(tp, suffix) \
inline void v_load_deinterleave( const tp* ptr, v_##tp##x2& a, v_##tp##x2& b ) \
{ \
    tp##x1_t a0 = vld1_##suffix(ptr); \
    tp##x1_t b0 = vld1_##suffix(ptr + 1); \
    tp##x1_t a1 = vld1_##suffix(ptr + 2); \
    tp##x1_t b1 = vld1_##suffix(ptr + 3); \
    a = v_##tp##x2(vcombine_##suffix(a0, a1)); \
    b = v_##tp##x2(vcombine_##suffix(b0, b1)); \
} \
inline void v_load_deinterleave( const tp* ptr, v_##tp##x2& a, \
                                 v_##tp##x2& b, v_##tp##x2& c ) \
{ \
    tp##x1_t a0 = vld1_##suffix(ptr); \
    tp##x1_t b0 = vld1_##suffix(ptr + 1); \
    tp##x1_t c0 = vld1_##suffix(ptr + 2); \
    tp##x1_t a1 = vld1_##suffix(ptr + 3); \
    tp##x1_t b1 = vld1_##suffix(ptr + 4); \
    tp##x1_t c1 = vld1_##suffix(ptr + 5); \
    a = v_##tp##x2(vcombine_##suffix(a0, a1)); \
    b = v_##tp##x2(vcombine_##suffix(b0, b1)); \
    c = v_##tp##x2(vcombine_##suffix(c0, c1)); \
} \
inline void v_load_deinterleave( const tp* ptr, v_##tp##x2& a, v_##tp##x2& b, \
                                 v_##tp##x2& c, v_##tp##x2& d ) \
{ \
    tp##x1_t a0 = vld1_##suffix(ptr); \
    tp##x1_t b0 = vld1_##suffix(ptr + 1); \
    tp##x1_t c0 = vld1_##suffix(ptr + 2); \
    tp##x1_t d0 = vld1_##suffix(ptr + 3); \
    tp##x1_t a1 = vld1_##suffix(ptr + 4); \
    tp##x1_t b1 = vld1_##suffix(ptr + 5); \
    tp##x1_t c1 = vld1_##suffix(ptr + 6); \
    tp##x1_t d1 = vld1_##suffix(ptr + 7); \
    a = v_##tp##x2(vcombine_##suffix(a0, a1)); \
    b = v_##tp##x2(vcombine_##suffix(b0, b1)); \
    c = v_##tp##x2(vcombine_##suffix(c0, c1)); \
    d = v_##tp##x2(vcombine_##suffix(d0, d1)); \
} \
inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, const v_##tp##x2& b, \
                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
{ \
    vst1_##suffix(ptr, vget_low_##suffix(a.val)); \
    vst1_##suffix(ptr + 1, vget_low_##suffix(b.val)); \
    vst1_##suffix(ptr + 2, vget_high_##suffix(a.val)); \
    vst1_##suffix(ptr + 3, vget_high_##suffix(b.val)); \
} \
inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, \
                                const v_##tp##x2& b, const v_##tp##x2& c, \
                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
{ \
    vst1_##suffix(ptr, vget_low_##suffix(a.val)); \
    vst1_##suffix(ptr + 1, vget_low_##suffix(b.val)); \
    vst1_##suffix(ptr + 2, vget_low_##suffix(c.val)); \
    vst1_##suffix(ptr + 3, vget_high_##suffix(a.val)); \
    vst1_##suffix(ptr + 4, vget_high_##suffix(b.val)); \
    vst1_##suffix(ptr + 5, vget_high_##suffix(c.val)); \
} \
inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, const v_##tp##x2& b, \
                                const v_##tp##x2& c, const v_##tp##x2& d, \
                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
{ \
    vst1_##suffix(ptr, vget_low_##suffix(a.val)); \
    vst1_##suffix(ptr + 1, vget_low_##suffix(b.val)); \
    vst1_##suffix(ptr + 2, vget_low_##suffix(c.val)); \
    vst1_##suffix(ptr + 3, vget_low_##suffix(d.val)); \
    vst1_##suffix(ptr + 4, vget_high_##suffix(a.val)); \
    vst1_##suffix(ptr + 5, vget_high_##suffix(b.val)); \
    vst1_##suffix(ptr + 6, vget_high_##suffix(c.val)); \
    vst1_##suffix(ptr + 7, vget_high_##suffix(d.val)); \
}
OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint8x16, uchar, u8)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(int8x16, schar, s8)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint16x8, ushort, u16)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(int16x8, short, s16)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint32x4, unsigned, u32)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(int32x4, int, s32)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(float32x4, float, f32)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_NEON_INTERLEAVED(float64x2, double, f64)
#endif

OPENCV_HAL_IMPL_NEON_INTERLEAVED_INT64(int64, s64)
OPENCV_HAL_IMPL_NEON_INTERLEAVED_INT64(uint64, u64)
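// Illustrative sketch: splitting packed BGR bytes into planes and back.
//   v_uint8x16 b, g, r;
//   v_load_deinterleave(bgr, b, g, r);  // vld3q under the hood
//   v_store_interleave(bgr, b, g, r);   // vst3q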
inline v_float32x4 v_cvt_f32(const v_int32x4& a)
{
    return v_float32x4(vcvtq_f32_s32(a.val));
}

#if CV_SIMD128_64F
inline v_float32x4 v_cvt_f32(const v_float64x2& a)
{
    float32x2_t zero = vdup_n_f32(0.0f);
    return v_float32x4(vcombine_f32(vcvt_f32_f64(a.val), zero));
}

inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b)
{
    return v_float32x4(vcombine_f32(vcvt_f32_f64(a.val), vcvt_f32_f64(b.val)));
}

inline v_float64x2 v_cvt_f64(const v_int32x4& a)
{
    return v_float64x2(vcvt_f64_f32(vcvt_f32_s32(vget_low_s32(a.val))));
}

inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
{
    return v_float64x2(vcvt_f64_f32(vcvt_f32_s32(vget_high_s32(a.val))));
}

inline v_float64x2 v_cvt_f64(const v_float32x4& a)
{
    return v_float64x2(vcvt_f64_f32(vget_low_f32(a.val)));
}

inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
{
    return v_float64x2(vcvt_f64_f32(vget_high_f32(a.val)));
}

inline v_float64x2 v_cvt_f64(const v_int64x2& a)
{ return v_float64x2(vcvtq_f64_s64(a.val)); }
#endif
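// Note: the one-argument v_cvt_f32(v_float64x2) fills the upper two f32 lanes
// with zeros; the two-argument overload packs both double registers instead.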
////////////// Lookup table access ////////////////////

inline v_int8x16 v_lut(const schar* tab, const int* idx)
{
    schar CV_DECL_ALIGNED(32) elems[16] =
    {
        tab[idx[ 0]], tab[idx[ 1]], tab[idx[ 2]], tab[idx[ 3]],
        tab[idx[ 4]], tab[idx[ 5]], tab[idx[ 6]], tab[idx[ 7]],
        tab[idx[ 8]], tab[idx[ 9]], tab[idx[10]], tab[idx[11]],
        tab[idx[12]], tab[idx[13]], tab[idx[14]], tab[idx[15]]
    };
    return v_int8x16(vld1q_s8(elems));
}
inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx)
{
    schar CV_DECL_ALIGNED(32) elems[16] =
    {
        tab[idx[0]], tab[idx[0]+1], tab[idx[1]], tab[idx[1]+1],
        tab[idx[2]], tab[idx[2]+1], tab[idx[3]], tab[idx[3]+1],
        tab[idx[4]], tab[idx[4]+1], tab[idx[5]], tab[idx[5]+1],
        tab[idx[6]], tab[idx[6]+1], tab[idx[7]], tab[idx[7]+1]
    };
    return v_int8x16(vld1q_s8(elems));
}
inline v_int8x16 v_lut_quads(const schar* tab, const int* idx)
{
    schar CV_DECL_ALIGNED(32) elems[16] =
    {
        tab[idx[0]], tab[idx[0]+1], tab[idx[0]+2], tab[idx[0]+3],
        tab[idx[1]], tab[idx[1]+1], tab[idx[1]+2], tab[idx[1]+3],
        tab[idx[2]], tab[idx[2]+1], tab[idx[2]+2], tab[idx[2]+3],
        tab[idx[3]], tab[idx[3]+1], tab[idx[3]+2], tab[idx[3]+3]
    };
    return v_int8x16(vld1q_s8(elems));
}
inline v_uint8x16 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut((schar*)tab, idx)); }
inline v_uint8x16 v_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_pairs((schar*)tab, idx)); }
inline v_uint8x16 v_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_quads((schar*)tab, idx)); }
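// Illustrative sketch of the gather semantics:
//   v_lut:       result[i] = tab[idx[i]]                    (16 indices)
//   v_lut_pairs: result[2i..2i+1] = tab[idx[i] .. idx[i]+1] (8 indices)
//   v_lut_quads: result[4i..4i+3] = tab[idx[i] .. idx[i]+3] (4 indices)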
inline v_int16x8 v_lut(const short* tab, const int* idx)
{
    short CV_DECL_ALIGNED(32) elems[8] =
    {
        tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]],
        tab[idx[4]], tab[idx[5]], tab[idx[6]], tab[idx[7]]
    };
    return v_int16x8(vld1q_s16(elems));
}
inline v_int16x8 v_lut_pairs(const short* tab, const int* idx)
{
    short CV_DECL_ALIGNED(32) elems[8] =
    {
        tab[idx[0]], tab[idx[0]+1], tab[idx[1]], tab[idx[1]+1],
        tab[idx[2]], tab[idx[2]+1], tab[idx[3]], tab[idx[3]+1]
    };
    return v_int16x8(vld1q_s16(elems));
}
inline v_int16x8 v_lut_quads(const short* tab, const int* idx)
{
    return v_int16x8(vcombine_s16(vld1_s16(tab + idx[0]), vld1_s16(tab + idx[1])));
}
inline v_uint16x8 v_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut((short*)tab, idx)); }
inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_pairs((short*)tab, idx)); }
inline v_uint16x8 v_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_quads((short*)tab, idx)); }
inline v_int32x4 v_lut(const int* tab, const int* idx)
{
    int CV_DECL_ALIGNED(32) elems[4] =
    {
        tab[idx[0]],
        tab[idx[1]],
        tab[idx[2]],
        tab[idx[3]]
    };
    return v_int32x4(vld1q_s32(elems));
}
inline v_int32x4 v_lut_pairs(const int* tab, const int* idx)
{
    return v_int32x4(vcombine_s32(vld1_s32(tab + idx[0]), vld1_s32(tab + idx[1])));
}
inline v_int32x4 v_lut_quads(const int* tab, const int* idx)
{
    return v_int32x4(vld1q_s32(tab + idx[0]));
}
inline v_uint32x4 v_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut((int*)tab, idx)); }
inline v_uint32x4 v_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_pairs((int*)tab, idx)); }
inline v_uint32x4 v_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_quads((int*)tab, idx)); }

inline v_int64x2 v_lut(const int64_t* tab, const int* idx)
{
    return v_int64x2(vcombine_s64(vcreate_s64(tab[idx[0]]), vcreate_s64(tab[idx[1]])));
}
inline v_int64x2 v_lut_pairs(const int64_t* tab, const int* idx)
{
    return v_int64x2(vld1q_s64(tab + idx[0]));
}
inline v_uint64x2 v_lut(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut((const int64_t *)tab, idx)); }
inline v_uint64x2 v_lut_pairs(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut_pairs((const int64_t *)tab, idx)); }
inline v_float32x4 v_lut(const float* tab, const int* idx)
{
    float CV_DECL_ALIGNED(32) elems[4] =
    {
        tab[idx[0]],
        tab[idx[1]],
        tab[idx[2]],
        tab[idx[3]]
    };
    return v_float32x4(vld1q_f32(elems));
}
inline v_float32x4 v_lut_pairs(const float* tab, const int* idx)
{
    uint64 CV_DECL_ALIGNED(32) elems[2] =
    {
        *(uint64*)(tab + idx[0]),
        *(uint64*)(tab + idx[1])
    };
    return v_float32x4(vreinterpretq_f32_u64(vld1q_u64(elems)));
}
inline v_float32x4 v_lut_quads(const float* tab, const int* idx)
{
    return v_float32x4(vld1q_f32(tab + idx[0]));
}
inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
{
    int CV_DECL_ALIGNED(32) elems[4] =
    {
        tab[vgetq_lane_s32(idxvec.val, 0)],
        tab[vgetq_lane_s32(idxvec.val, 1)],
        tab[vgetq_lane_s32(idxvec.val, 2)],
        tab[vgetq_lane_s32(idxvec.val, 3)]
    };
    return v_int32x4(vld1q_s32(elems));
}

inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec)
{
    unsigned CV_DECL_ALIGNED(32) elems[4] =
    {
        tab[vgetq_lane_s32(idxvec.val, 0)],
        tab[vgetq_lane_s32(idxvec.val, 1)],
        tab[vgetq_lane_s32(idxvec.val, 2)],
        tab[vgetq_lane_s32(idxvec.val, 3)]
    };
    return v_uint32x4(vld1q_u32(elems));
}

inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
{
    float CV_DECL_ALIGNED(32) elems[4] =
    {
        tab[vgetq_lane_s32(idxvec.val, 0)],
        tab[vgetq_lane_s32(idxvec.val, 1)],
        tab[vgetq_lane_s32(idxvec.val, 2)],
        tab[vgetq_lane_s32(idxvec.val, 3)]
    };
    return v_float32x4(vld1q_f32(elems));
}
inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y)
{
    /*int CV_DECL_ALIGNED(32) idx[4];
    v_store(idx, idxvec);

    float32x4_t xy02 = vcombine_f32(vld1_f32(tab + idx[0]), vld1_f32(tab + idx[2]));
    float32x4_t xy13 = vcombine_f32(vld1_f32(tab + idx[1]), vld1_f32(tab + idx[3]));

    float32x4x2_t xxyy = vuzpq_f32(xy02, xy13);
    x = v_float32x4(xxyy.val[0]);
    y = v_float32x4(xxyy.val[1]);*/
    int CV_DECL_ALIGNED(32) idx[4];
    v_store_aligned(idx, idxvec);

    x = v_float32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]);
    y = v_float32x4(tab[idx[0]+1], tab[idx[1]+1], tab[idx[2]+1], tab[idx[3]+1]);
}
inline v_int8x16 v_interleave_pairs(const v_int8x16& vec)
{
    return v_int8x16(vcombine_s8(vtbl1_s8(vget_low_s8(vec.val), vcreate_s8(0x0705060403010200)), vtbl1_s8(vget_high_s8(vec.val), vcreate_s8(0x0705060403010200))));
}
inline v_uint8x16 v_interleave_pairs(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_interleave_pairs(v_reinterpret_as_s8(vec))); }
inline v_int8x16 v_interleave_quads(const v_int8x16& vec)
{
    return v_int8x16(vcombine_s8(vtbl1_s8(vget_low_s8(vec.val), vcreate_s8(0x0703060205010400)), vtbl1_s8(vget_high_s8(vec.val), vcreate_s8(0x0703060205010400))));
}
inline v_uint8x16 v_interleave_quads(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_interleave_quads(v_reinterpret_as_s8(vec))); }
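// The vcreate_s8 constants are vtbl1 byte-shuffle masks written little-endian;
// e.g. 0x0705060403010200 picks source bytes 0,2,1,3,4,6,5,7 per 8-byte half.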
inline v_int16x8 v_interleave_pairs(const v_int16x8& vec)
{
    return v_int16x8(vreinterpretq_s16_s8(vcombine_s8(vtbl1_s8(vget_low_s8(vreinterpretq_s8_s16(vec.val)), vcreate_s8(0x0706030205040100)), vtbl1_s8(vget_high_s8(vreinterpretq_s8_s16(vec.val)), vcreate_s8(0x0706030205040100)))));
}
inline v_uint16x8 v_interleave_pairs(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); }
inline v_int16x8 v_interleave_quads(const v_int16x8& vec)
{
    int16x4x2_t res = vzip_s16(vget_low_s16(vec.val), vget_high_s16(vec.val));
    return v_int16x8(vcombine_s16(res.val[0], res.val[1]));
}
inline v_uint16x8 v_interleave_quads(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); }

inline v_int32x4 v_interleave_pairs(const v_int32x4& vec)
{
    int32x2x2_t res = vzip_s32(vget_low_s32(vec.val), vget_high_s32(vec.val));
    return v_int32x4(vcombine_s32(res.val[0], res.val[1]));
}
inline v_uint32x4 v_interleave_pairs(const v_uint32x4& vec) { return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
inline v_float32x4 v_interleave_pairs(const v_float32x4& vec) { return v_reinterpret_as_f32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
inline v_int8x16 v_pack_triplets(const v_int8x16& vec)
{
    return v_int8x16(vextq_s8(vcombine_s8(vtbl1_s8(vget_low_s8(vec.val), vcreate_s8(0x0605040201000000)), vtbl1_s8(vget_high_s8(vec.val), vcreate_s8(0x0807060504020100))), vdupq_n_s8(0), 2));
}
inline v_uint8x16 v_pack_triplets(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); }

inline v_int16x8 v_pack_triplets(const v_int16x8& vec)
{
    return v_int16x8(vreinterpretq_s16_s8(vextq_s8(vcombine_s8(vtbl1_s8(vget_low_s8(vreinterpretq_s8_s16(vec.val)), vcreate_s8(0x0504030201000000)), vget_high_s8(vreinterpretq_s8_s16(vec.val))), vdupq_n_s8(0), 2)));
}
inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); }

inline v_int32x4 v_pack_triplets(const v_int32x4& vec) { return vec; }
inline v_uint32x4 v_pack_triplets(const v_uint32x4& vec) { return vec; }
inline v_float32x4 v_pack_triplets(const v_float32x4& vec) { return vec; }
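// v_pack_triplets drops the 4th element of each 4-element group, compacting
// {a0 a1 a2 _ a4 a5 a6 _ ...} toward the front; for 32-bit types a register
// holds a single triplet already in place, so the operation is an identity.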
#if CV_SIMD128_64F
inline v_float64x2 v_lut(const double* tab, const int* idx)
{
    double CV_DECL_ALIGNED(32) elems[2] =
    {
        tab[idx[0]],
        tab[idx[1]]
    };
    return v_float64x2(vld1q_f64(elems));
}

inline v_float64x2 v_lut_pairs(const double* tab, const int* idx)
{
    return v_float64x2(vld1q_f64(tab + idx[0]));
}

inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
{
    double CV_DECL_ALIGNED(32) elems[2] =
    {
        tab[vgetq_lane_s32(idxvec.val, 0)],
        tab[vgetq_lane_s32(idxvec.val, 1)],
    };
    return v_float64x2(vld1q_f64(elems));
}

inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y)
{
    int CV_DECL_ALIGNED(32) idx[4];
    v_store_aligned(idx, idxvec);

    x = v_float64x2(tab[idx[0]], tab[idx[1]]);
    y = v_float64x2(tab[idx[0]+1], tab[idx[1]+1]);
}
#endif
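// v_lut_deinterleave treats tab as interleaved (x, y) pairs: x lanes come
// from tab[idx[i]] and y lanes from tab[idx[i]+1].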
////// FP16 support ///////

#if CV_FP16
inline v_float32x4 v_load_expand(const float16_t* ptr)
{
    float16x4_t v =
#ifndef vld1_f16 // APPLE compiler defines vld1_f16 as macro
        (float16x4_t)vld1_s16((const short*)ptr);
#else
        vld1_f16((const __fp16*)ptr);
#endif
    return v_float32x4(vcvt_f32_f16(v));
}

inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
{
    float16x4_t hv = vcvt_f16_f32(v.val);

#ifndef vst1_f16 // APPLE compiler defines vst1_f16 as macro
    vst1_s16((short*)ptr, (int16x4_t)hv);
#else
    vst1_f16((__fp16*)ptr, hv);
#endif
}
#else
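// Without native FP16 instructions, fall back to a scalar loop through
// float16_t's software float<->half conversion.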
inline v_float32x4 v_load_expand(const float16_t* ptr)
{
    const int N = 4;
    float buf[N];
    for( int i = 0; i < N; i++ ) buf[i] = (float)ptr[i];
    return v_load(buf);
}

inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
{
    const int N = 4;
    float buf[N];
    v_store(buf, v);
    for( int i = 0; i < N; i++ ) ptr[i] = float16_t(buf[i]);
}
#endif
inline void v_cleanup() {}

CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END

//! @endcond

}

#endif