/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef OPENCV_HAL_INTRIN_NEON_HPP
#define OPENCV_HAL_INTRIN_NEON_HPP

#include <algorithm>
#include <float.h> // FLT_MIN, used by the fallback v_sqrt below
#include "opencv2/core/utility.hpp"

namespace cv
{

//! @cond IGNORED

CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN

#define CV_SIMD128 1
#if defined(__aarch64__)
#define CV_SIMD128_64F 1
#else
#define CV_SIMD128_64F 0
#endif
#if CV_SIMD128_64F
#define OPENCV_HAL_IMPL_NEON_REINTERPRET(_Tpv, suffix) \
template <typename T> static inline \
_Tpv vreinterpretq_##suffix##_f64(T a) { return (_Tpv) a; } \
template <typename T> static inline \
float64x2_t vreinterpretq_f64_##suffix(T a) { return (float64x2_t) a; }
OPENCV_HAL_IMPL_NEON_REINTERPRET(uint8x16_t, u8)
OPENCV_HAL_IMPL_NEON_REINTERPRET(int8x16_t, s8)
OPENCV_HAL_IMPL_NEON_REINTERPRET(uint16x8_t, u16)
OPENCV_HAL_IMPL_NEON_REINTERPRET(int16x8_t, s16)
OPENCV_HAL_IMPL_NEON_REINTERPRET(uint32x4_t, u32)
OPENCV_HAL_IMPL_NEON_REINTERPRET(int32x4_t, s32)
OPENCV_HAL_IMPL_NEON_REINTERPRET(uint64x2_t, u64)
OPENCV_HAL_IMPL_NEON_REINTERPRET(int64x2_t, s64)
OPENCV_HAL_IMPL_NEON_REINTERPRET(float32x4_t, f32)
#endif
struct v_uint8x16
{
    typedef uchar lane_type;
    enum { nlanes = 16 };

    v_uint8x16() {}
    explicit v_uint8x16(uint8x16_t v) : val(v) {}
    v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
               uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
    {
        uchar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
        val = vld1q_u8(v);
    }
    uchar get0() const
    {
        return vgetq_lane_u8(val, 0);
    }

    uint8x16_t val;
};

struct v_int8x16
{
    typedef schar lane_type;
    enum { nlanes = 16 };

    v_int8x16() {}
    explicit v_int8x16(int8x16_t v) : val(v) {}
    v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
              schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
    {
        schar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
        val = vld1q_s8(v);
    }
    schar get0() const
    {
        return vgetq_lane_s8(val, 0);
    }

    int8x16_t val;
};
struct v_uint16x8
{
    typedef ushort lane_type;
    enum { nlanes = 8 };

    v_uint16x8() {}
    explicit v_uint16x8(uint16x8_t v) : val(v) {}
    v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
    {
        ushort v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
        val = vld1q_u16(v);
    }
    ushort get0() const
    {
        return vgetq_lane_u16(val, 0);
    }

    uint16x8_t val;
};

struct v_int16x8
{
    typedef short lane_type;
    enum { nlanes = 8 };

    v_int16x8() {}
    explicit v_int16x8(int16x8_t v) : val(v) {}
    v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
    {
        short v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
        val = vld1q_s16(v);
    }
    short get0() const
    {
        return vgetq_lane_s16(val, 0);
    }

    int16x8_t val;
};
struct v_uint32x4
{
    typedef unsigned lane_type;
    enum { nlanes = 4 };

    v_uint32x4() {}
    explicit v_uint32x4(uint32x4_t v) : val(v) {}
    v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
    {
        unsigned v[] = {v0, v1, v2, v3};
        val = vld1q_u32(v);
    }
    unsigned get0() const
    {
        return vgetq_lane_u32(val, 0);
    }

    uint32x4_t val;
};

struct v_int32x4
{
    typedef int lane_type;
    enum { nlanes = 4 };

    v_int32x4() {}
    explicit v_int32x4(int32x4_t v) : val(v) {}
    v_int32x4(int v0, int v1, int v2, int v3)
    {
        int v[] = {v0, v1, v2, v3};
        val = vld1q_s32(v);
    }
    int get0() const
    {
        return vgetq_lane_s32(val, 0);
    }

    int32x4_t val;
};

struct v_float32x4
{
    typedef float lane_type;
    enum { nlanes = 4 };

    v_float32x4() {}
    explicit v_float32x4(float32x4_t v) : val(v) {}
    v_float32x4(float v0, float v1, float v2, float v3)
    {
        float v[] = {v0, v1, v2, v3};
        val = vld1q_f32(v);
    }
    float get0() const
    {
        return vgetq_lane_f32(val, 0);
    }

    float32x4_t val;
};
struct v_uint64x2
{
    typedef uint64 lane_type;
    enum { nlanes = 2 };

    v_uint64x2() {}
    explicit v_uint64x2(uint64x2_t v) : val(v) {}
    v_uint64x2(uint64 v0, uint64 v1)
    {
        uint64 v[] = {v0, v1};
        val = vld1q_u64(v);
    }
    uint64 get0() const
    {
        return vgetq_lane_u64(val, 0);
    }

    uint64x2_t val;
};

struct v_int64x2
{
    typedef int64 lane_type;
    enum { nlanes = 2 };

    v_int64x2() {}
    explicit v_int64x2(int64x2_t v) : val(v) {}
    v_int64x2(int64 v0, int64 v1)
    {
        int64 v[] = {v0, v1};
        val = vld1q_s64(v);
    }
    int64 get0() const
    {
        return vgetq_lane_s64(val, 0);
    }

    int64x2_t val;
};

#if CV_SIMD128_64F
struct v_float64x2
{
    typedef double lane_type;
    enum { nlanes = 2 };

    v_float64x2() {}
    explicit v_float64x2(float64x2_t v) : val(v) {}
    v_float64x2(double v0, double v1)
    {
        double v[] = {v0, v1};
        val = vld1q_f64(v);
    }
    double get0() const
    {
        return vgetq_lane_f64(val, 0);
    }

    float64x2_t val;
};
#endif
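
// Usage sketch (illustrative only, not part of this header's API): each v_*
// type wraps one 128-bit NEON register; the multi-argument constructor loads
// the given lanes and get0() reads lane 0.
//
//     v_float32x4 v(1.f, 2.f, 3.f, 4.f);
//     float first = v.get0(); // 1.f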
#if CV_FP16
// Workaround for old compilers
static inline int16x8_t vreinterpretq_s16_f16(float16x8_t a) { return (int16x8_t)a; }
static inline float16x8_t vreinterpretq_f16_s16(int16x8_t a) { return (float16x8_t)a; }
static inline int16x4_t vreinterpret_s16_f16(float16x4_t a) { return (int16x4_t)a; }
static inline float16x4_t vreinterpret_f16_s16(int16x4_t a) { return (float16x4_t)a; }

static inline float16x8_t cv_vld1q_f16(const void* ptr)
{
#ifndef vld1q_f16 // APPLE compiler defines vld1q_f16 as macro
    return vreinterpretq_f16_s16(vld1q_s16((const short*)ptr));
#else
    return vld1q_f16((const __fp16*)ptr);
#endif
}
static inline void cv_vst1q_f16(void* ptr, float16x8_t a)
{
#ifndef vst1q_f16 // APPLE compiler defines vst1q_f16 as macro
    vst1q_s16((short*)ptr, vreinterpretq_s16_f16(a));
#else
    vst1q_f16((__fp16*)ptr, a);
#endif
}

static inline float16x4_t cv_vld1_f16(const void* ptr)
{
#ifndef vld1_f16 // APPLE compiler defines vld1_f16 as macro
    return vreinterpret_f16_s16(vld1_s16((const short*)ptr));
#else
    return vld1_f16((const __fp16*)ptr);
#endif
}
static inline void cv_vst1_f16(void* ptr, float16x4_t a)
{
#ifndef vst1_f16 // APPLE compiler defines vst1_f16 as macro
    vst1_s16((short*)ptr, vreinterpret_s16_f16(a));
#else
    vst1_f16((__fp16*)ptr, a);
#endif
}
struct v_float16x8
{
    typedef short lane_type;
    enum { nlanes = 8 };

    v_float16x8() {}
    explicit v_float16x8(float16x8_t v) : val(v) {}
    v_float16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
    {
        short v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
        val = cv_vld1q_f16(v);
    }
    short get0() const
    {
        return vgetq_lane_s16(vreinterpretq_s16_f16(val), 0);
    }

    float16x8_t val;
};

inline v_float16x8 v_setzero_f16() { return v_float16x8(vreinterpretq_f16_s16(vdupq_n_s16((short)0))); }
inline v_float16x8 v_setall_f16(short v) { return v_float16x8(vreinterpretq_f16_s16(vdupq_n_s16(v))); }
#endif // CV_FP16
#define OPENCV_HAL_IMPL_NEON_INIT(_Tpv, _Tp, suffix) \
inline v_##_Tpv v_setzero_##suffix() { return v_##_Tpv(vdupq_n_##suffix((_Tp)0)); } \
inline v_##_Tpv v_setall_##suffix(_Tp v) { return v_##_Tpv(vdupq_n_##suffix(v)); } \
inline _Tpv##_t vreinterpretq_##suffix##_##suffix(_Tpv##_t v) { return v; } \
inline v_uint8x16 v_reinterpret_as_u8(const v_##_Tpv& v) { return v_uint8x16(vreinterpretq_u8_##suffix(v.val)); } \
inline v_int8x16 v_reinterpret_as_s8(const v_##_Tpv& v) { return v_int8x16(vreinterpretq_s8_##suffix(v.val)); } \
inline v_uint16x8 v_reinterpret_as_u16(const v_##_Tpv& v) { return v_uint16x8(vreinterpretq_u16_##suffix(v.val)); } \
inline v_int16x8 v_reinterpret_as_s16(const v_##_Tpv& v) { return v_int16x8(vreinterpretq_s16_##suffix(v.val)); } \
inline v_uint32x4 v_reinterpret_as_u32(const v_##_Tpv& v) { return v_uint32x4(vreinterpretq_u32_##suffix(v.val)); } \
inline v_int32x4 v_reinterpret_as_s32(const v_##_Tpv& v) { return v_int32x4(vreinterpretq_s32_##suffix(v.val)); } \
inline v_uint64x2 v_reinterpret_as_u64(const v_##_Tpv& v) { return v_uint64x2(vreinterpretq_u64_##suffix(v.val)); } \
inline v_int64x2 v_reinterpret_as_s64(const v_##_Tpv& v) { return v_int64x2(vreinterpretq_s64_##suffix(v.val)); } \
inline v_float32x4 v_reinterpret_as_f32(const v_##_Tpv& v) { return v_float32x4(vreinterpretq_f32_##suffix(v.val)); }
OPENCV_HAL_IMPL_NEON_INIT(uint8x16, uchar, u8)
OPENCV_HAL_IMPL_NEON_INIT(int8x16, schar, s8)
OPENCV_HAL_IMPL_NEON_INIT(uint16x8, ushort, u16)
OPENCV_HAL_IMPL_NEON_INIT(int16x8, short, s16)
OPENCV_HAL_IMPL_NEON_INIT(uint32x4, unsigned, u32)
OPENCV_HAL_IMPL_NEON_INIT(int32x4, int, s32)
OPENCV_HAL_IMPL_NEON_INIT(uint64x2, uint64, u64)
OPENCV_HAL_IMPL_NEON_INIT(int64x2, int64, s64)
OPENCV_HAL_IMPL_NEON_INIT(float32x4, float, f32)
#if CV_SIMD128_64F
#define OPENCV_HAL_IMPL_NEON_INIT_64(_Tpv, suffix) \
inline v_float64x2 v_reinterpret_as_f64(const v_##_Tpv& v) { return v_float64x2(vreinterpretq_f64_##suffix(v.val)); }
OPENCV_HAL_IMPL_NEON_INIT(float64x2, double, f64)
OPENCV_HAL_IMPL_NEON_INIT_64(uint8x16, u8)
OPENCV_HAL_IMPL_NEON_INIT_64(int8x16, s8)
OPENCV_HAL_IMPL_NEON_INIT_64(uint16x8, u16)
OPENCV_HAL_IMPL_NEON_INIT_64(int16x8, s16)
OPENCV_HAL_IMPL_NEON_INIT_64(uint32x4, u32)
OPENCV_HAL_IMPL_NEON_INIT_64(int32x4, s32)
OPENCV_HAL_IMPL_NEON_INIT_64(uint64x2, u64)
OPENCV_HAL_IMPL_NEON_INIT_64(int64x2, s64)
OPENCV_HAL_IMPL_NEON_INIT_64(float32x4, f32)
OPENCV_HAL_IMPL_NEON_INIT_64(float64x2, f64)
#endif
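
// Usage sketch (illustrative only): v_setall_*/v_setzero_* broadcast a
// scalar, and v_reinterpret_as_* gives a bitwise view in another lane type.
//
//     v_int32x4 z = v_setzero_s32();                         // {0, 0, 0, 0}
//     v_uint32x4 m = v_reinterpret_as_u32(v_setall_s32(-1)); // {0xFFFFFFFF, ...}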
#define OPENCV_HAL_IMPL_NEON_PACK(_Tpvec, _Tp, hreg, suffix, _Tpwvec, pack, mov, rshr) \
inline _Tpvec v_##pack(const _Tpwvec& a, const _Tpwvec& b) \
{ \
    hreg a1 = mov(a.val), b1 = mov(b.val); \
    return _Tpvec(vcombine_##suffix(a1, b1)); \
} \
inline void v_##pack##_store(_Tp* ptr, const _Tpwvec& a) \
{ \
    hreg a1 = mov(a.val); \
    vst1_##suffix(ptr, a1); \
} \
template<int n> inline \
_Tpvec v_rshr_##pack(const _Tpwvec& a, const _Tpwvec& b) \
{ \
    hreg a1 = rshr(a.val, n); \
    hreg b1 = rshr(b.val, n); \
    return _Tpvec(vcombine_##suffix(a1, b1)); \
} \
template<int n> inline \
void v_rshr_##pack##_store(_Tp* ptr, const _Tpwvec& a) \
{ \
    hreg a1 = rshr(a.val, n); \
    vst1_##suffix(ptr, a1); \
}
OPENCV_HAL_IMPL_NEON_PACK(v_uint8x16, uchar, uint8x8_t, u8, v_uint16x8, pack, vqmovn_u16, vqrshrn_n_u16)
OPENCV_HAL_IMPL_NEON_PACK(v_int8x16, schar, int8x8_t, s8, v_int16x8, pack, vqmovn_s16, vqrshrn_n_s16)
OPENCV_HAL_IMPL_NEON_PACK(v_uint16x8, ushort, uint16x4_t, u16, v_uint32x4, pack, vqmovn_u32, vqrshrn_n_u32)
OPENCV_HAL_IMPL_NEON_PACK(v_int16x8, short, int16x4_t, s16, v_int32x4, pack, vqmovn_s32, vqrshrn_n_s32)
OPENCV_HAL_IMPL_NEON_PACK(v_uint32x4, unsigned, uint32x2_t, u32, v_uint64x2, pack, vmovn_u64, vrshrn_n_u64)
OPENCV_HAL_IMPL_NEON_PACK(v_int32x4, int, int32x2_t, s32, v_int64x2, pack, vmovn_s64, vrshrn_n_s64)

OPENCV_HAL_IMPL_NEON_PACK(v_uint8x16, uchar, uint8x8_t, u8, v_int16x8, pack_u, vqmovun_s16, vqrshrun_n_s16)
OPENCV_HAL_IMPL_NEON_PACK(v_uint16x8, ushort, uint16x4_t, u16, v_int32x4, pack_u, vqmovun_s32, vqrshrun_n_s32)
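
// Usage sketch (illustrative only): v_pack narrows with saturation, while
// v_rshr_pack<n> adds the rounding constant 1 << (n-1), shifts right by n,
// then saturates to the narrower type.
//
//     v_uint16x8 a = v_setall_u16(300), b = v_setall_u16(5);
//     v_uint8x16 p = v_pack(a, b);         // 300 -> 255 (saturated), 5 -> 5
//     v_uint8x16 r = v_rshr_pack<2>(a, b); // (300 + 2) >> 2 = 75, (5 + 2) >> 2 = 1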
inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
                            const v_float32x4& m1, const v_float32x4& m2,
                            const v_float32x4& m3)
{
    float32x2_t vl = vget_low_f32(v.val), vh = vget_high_f32(v.val);
    float32x4_t res = vmulq_lane_f32(m0.val, vl, 0);
    res = vmlaq_lane_f32(res, m1.val, vl, 1);
    res = vmlaq_lane_f32(res, m2.val, vh, 0);
    res = vmlaq_lane_f32(res, m3.val, vh, 1);
    return v_float32x4(res);
}

inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
                               const v_float32x4& m1, const v_float32x4& m2,
                               const v_float32x4& a)
{
    float32x2_t vl = vget_low_f32(v.val), vh = vget_high_f32(v.val);
    float32x4_t res = vmulq_lane_f32(m0.val, vl, 0);
    res = vmlaq_lane_f32(res, m1.val, vl, 1);
    res = vmlaq_lane_f32(res, m2.val, vh, 0);
    res = vaddq_f32(res, a.val);
    return v_float32x4(res);
}
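
// Note (illustrative reading of the code above): m0..m3 act as matrix
// columns, so v_matmul computes m0*v0 + m1*v1 + m2*v2 + m3*v3, i.e. M * v
// for a 4x4 matrix stored column-wise; v_matmuladd replaces the last column
// term with a plain additive vector, which is an affine transform.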
#define OPENCV_HAL_IMPL_NEON_BIN_OP(bin_op, _Tpvec, intrin) \
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(intrin(a.val, b.val)); \
} \
inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
{ \
    a.val = intrin(a.val, b.val); \
    return a; \
}
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint8x16, vqaddq_u8)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint8x16, vqsubq_u8)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int8x16, vqaddq_s8)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int8x16, vqsubq_s8)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint16x8, vqaddq_u16)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint16x8, vqsubq_u16)
OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_uint16x8, vmulq_u16)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int16x8, vqaddq_s16)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int16x8, vqsubq_s16)
OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_int16x8, vmulq_s16)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int32x4, vaddq_s32)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int32x4, vsubq_s32)
OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_int32x4, vmulq_s32)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint32x4, vaddq_u32)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint32x4, vsubq_u32)
OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_uint32x4, vmulq_u32)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_float32x4, vaddq_f32)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_float32x4, vsubq_f32)
OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_float32x4, vmulq_f32)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int64x2, vaddq_s64)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int64x2, vsubq_s64)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint64x2, vaddq_u64)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint64x2, vsubq_u64)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_NEON_BIN_OP(/, v_float32x4, vdivq_f32)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_float64x2, vaddq_f64)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_float64x2, vsubq_f64)
OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_float64x2, vmulq_f64)
OPENCV_HAL_IMPL_NEON_BIN_OP(/, v_float64x2, vdivq_f64)
#else
inline v_float32x4 operator / (const v_float32x4& a, const v_float32x4& b)
{
    float32x4_t reciprocal = vrecpeq_f32(b.val);
    reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal);
    reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal);
    return v_float32x4(vmulq_f32(a.val, reciprocal));
}
inline v_float32x4& operator /= (v_float32x4& a, const v_float32x4& b)
{
    float32x4_t reciprocal = vrecpeq_f32(b.val);
    reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal);
    reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal);
    a.val = vmulq_f32(a.val, reciprocal);
    return a;
}
#endif
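
// The two vrecpsq_f32 steps in the fallback above are Newton-Raphson
// refinements: vrecpsq_f32(b, r) computes (2 - b*r), so r' = r*(2 - b*r)
// roughly doubles the correct bits of the ~8-bit vrecpeq_f32 estimate;
// two steps bring the quotient close to full single precision.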
inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b,
                         v_int32x4& c, v_int32x4& d)
{
    c.val = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val));
    d.val = vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val));
}

inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b,
                         v_uint32x4& c, v_uint32x4& d)
{
    c.val = vmull_u16(vget_low_u16(a.val), vget_low_u16(b.val));
    d.val = vmull_u16(vget_high_u16(a.val), vget_high_u16(b.val));
}

inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b,
                         v_uint64x2& c, v_uint64x2& d)
{
    c.val = vmull_u32(vget_low_u32(a.val), vget_low_u32(b.val));
    d.val = vmull_u32(vget_high_u32(a.val), vget_high_u32(b.val));
}

inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
{
    int32x4_t c = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val));
    int32x4_t d = vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val));
    int32x4x2_t cd = vuzpq_s32(c, d);
    return v_int32x4(vaddq_s32(cd.val[0], cd.val[1]));
}

inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
{
    v_int32x4 s = v_dotprod(a, b);
    return v_int32x4(vaddq_s32(s.val, c.val));
}
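
// Usage sketch (illustrative only): v_dotprod multiplies adjacent 16-bit
// pairs and sums each pair into a 32-bit lane.
//
//     v_int16x8 a = v_setall_s16(2), b = v_setall_s16(3);
//     v_int32x4 d = v_dotprod(a, b); // every lane: 2*3 + 2*3 = 12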
#define OPENCV_HAL_IMPL_NEON_LOGIC_OP(_Tpvec, suffix) \
OPENCV_HAL_IMPL_NEON_BIN_OP(&, _Tpvec, vandq_##suffix) \
OPENCV_HAL_IMPL_NEON_BIN_OP(|, _Tpvec, vorrq_##suffix) \
OPENCV_HAL_IMPL_NEON_BIN_OP(^, _Tpvec, veorq_##suffix) \
inline _Tpvec operator ~ (const _Tpvec& a) \
{ \
    return _Tpvec(vreinterpretq_##suffix##_u8(vmvnq_u8(vreinterpretq_u8_##suffix(a.val)))); \
}

OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint8x16, u8)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int8x16, s8)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint16x8, u16)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int16x8, s16)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint32x4, u32)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int32x4, s32)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint64x2, u64)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int64x2, s64)

#define OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(bin_op, intrin) \
inline v_float32x4 operator bin_op (const v_float32x4& a, const v_float32x4& b) \
{ \
    return v_float32x4(vreinterpretq_f32_s32(intrin(vreinterpretq_s32_f32(a.val), vreinterpretq_s32_f32(b.val)))); \
} \
inline v_float32x4& operator bin_op##= (v_float32x4& a, const v_float32x4& b) \
{ \
    a.val = vreinterpretq_f32_s32(intrin(vreinterpretq_s32_f32(a.val), vreinterpretq_s32_f32(b.val))); \
    return a; \
}

OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(&, vandq_s32)
OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(|, vorrq_s32)
OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(^, veorq_s32)

inline v_float32x4 operator ~ (const v_float32x4& a)
{
    return v_float32x4(vreinterpretq_f32_s32(vmvnq_s32(vreinterpretq_s32_f32(a.val))));
}
#if CV_SIMD128_64F
inline v_float32x4 v_sqrt(const v_float32x4& x)
{
    return v_float32x4(vsqrtq_f32(x.val));
}

inline v_float32x4 v_invsqrt(const v_float32x4& x)
{
    v_float32x4 one = v_setall_f32(1.0f);
    return one / v_sqrt(x);
}
#else
inline v_float32x4 v_sqrt(const v_float32x4& x)
{
    float32x4_t x1 = vmaxq_f32(x.val, vdupq_n_f32(FLT_MIN));
    float32x4_t e = vrsqrteq_f32(x1);
    e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x1, e), e), e);
    e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x1, e), e), e);
    return v_float32x4(vmulq_f32(x.val, e));
}

inline v_float32x4 v_invsqrt(const v_float32x4& x)
{
    float32x4_t e = vrsqrteq_f32(x.val);
    e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x.val, e), e), e);
    e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x.val, e), e), e);
    return v_float32x4(e);
}
#endif
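
// As with the division fallback, vrsqrteq_f32 yields only an ~8-bit estimate;
// each vrsqrtsq_f32(x*e, e) step computes (3 - x*e*e)/2, i.e. one
// Newton-Raphson refinement e' = e*(3 - x*e*e)/2, so two steps reach roughly
// single precision. Clamping x to FLT_MIN keeps v_sqrt(0) equal to 0 instead
// of 0 * inf = NaN.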
#define OPENCV_HAL_IMPL_NEON_ABS(_Tpuvec, _Tpsvec, usuffix, ssuffix) \
inline _Tpuvec v_abs(const _Tpsvec& a) { return v_reinterpret_as_##usuffix(_Tpsvec(vabsq_##ssuffix(a.val))); }

OPENCV_HAL_IMPL_NEON_ABS(v_uint8x16, v_int8x16, u8, s8)
OPENCV_HAL_IMPL_NEON_ABS(v_uint16x8, v_int16x8, u16, s16)
OPENCV_HAL_IMPL_NEON_ABS(v_uint32x4, v_int32x4, u32, s32)

inline v_float32x4 v_abs(v_float32x4 x)
{ return v_float32x4(vabsq_f32(x.val)); }
#if CV_SIMD128_64F
#define OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(bin_op, intrin) \
inline v_float64x2 operator bin_op (const v_float64x2& a, const v_float64x2& b) \
{ \
    return v_float64x2(vreinterpretq_f64_s64(intrin(vreinterpretq_s64_f64(a.val), vreinterpretq_s64_f64(b.val)))); \
} \
inline v_float64x2& operator bin_op##= (v_float64x2& a, const v_float64x2& b) \
{ \
    a.val = vreinterpretq_f64_s64(intrin(vreinterpretq_s64_f64(a.val), vreinterpretq_s64_f64(b.val))); \
    return a; \
}

OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(&, vandq_s64)
OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(|, vorrq_s64)
OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(^, veorq_s64)

inline v_float64x2 operator ~ (const v_float64x2& a)
{
    return v_float64x2(vreinterpretq_f64_s32(vmvnq_s32(vreinterpretq_s32_f64(a.val))));
}

inline v_float64x2 v_sqrt(const v_float64x2& x)
{
    return v_float64x2(vsqrtq_f64(x.val));
}

inline v_float64x2 v_invsqrt(const v_float64x2& x)
{
    v_float64x2 one = v_setall_f64(1.0);
    return one / v_sqrt(x);
}

inline v_float64x2 v_abs(v_float64x2 x)
{ return v_float64x2(vabsq_f64(x.val)); }
#endif
// TODO: exp, log, sin, cos

#define OPENCV_HAL_IMPL_NEON_BIN_FUNC(_Tpvec, func, intrin) \
inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(intrin(a.val, b.val)); \
}

OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_min, vminq_u8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_max, vmaxq_u8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_min, vminq_s8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_max, vmaxq_s8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_min, vminq_u16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_max, vmaxq_u16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_min, vminq_s16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_max, vmaxq_s16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_min, vminq_u32)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_max, vmaxq_u32)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int32x4, v_min, vminq_s32)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int32x4, v_max, vmaxq_s32)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_min, vminq_f32)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_max, vmaxq_f32)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float64x2, v_min, vminq_f64)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float64x2, v_max, vmaxq_f64)
#endif
#if CV_SIMD128_64F
inline int64x2_t vmvnq_s64(int64x2_t a)
{
    int64x2_t vx = vreinterpretq_s64_u32(vdupq_n_u32(0xFFFFFFFF));
    return veorq_s64(a, vx);
}
inline uint64x2_t vmvnq_u64(uint64x2_t a)
{
    uint64x2_t vx = vreinterpretq_u64_u32(vdupq_n_u32(0xFFFFFFFF));
    return veorq_u64(a, vx);
}
#endif
#define OPENCV_HAL_IMPL_NEON_INT_CMP_OP(_Tpvec, cast, suffix, not_suffix) \
inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(cast(vceqq_##suffix(a.val, b.val))); } \
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(cast(vmvnq_##not_suffix(vceqq_##suffix(a.val, b.val)))); } \
inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(cast(vcltq_##suffix(a.val, b.val))); } \
inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(cast(vcgtq_##suffix(a.val, b.val))); } \
inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(cast(vcleq_##suffix(a.val, b.val))); } \
inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(cast(vcgeq_##suffix(a.val, b.val))); }
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint8x16, OPENCV_HAL_NOP, u8, u8)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int8x16, vreinterpretq_s8_u8, s8, u8)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint16x8, OPENCV_HAL_NOP, u16, u16)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int16x8, vreinterpretq_s16_u16, s16, u16)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint32x4, OPENCV_HAL_NOP, u32, u32)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int32x4, vreinterpretq_s32_u32, s32, u32)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_float32x4, vreinterpretq_f32_u32, f32, u32)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint64x2, OPENCV_HAL_NOP, u64, u64)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int64x2, vreinterpretq_s64_u64, s64, u64)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_float64x2, vreinterpretq_f64_u64, f64, u64)
#endif
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_add_wrap, vaddq_u8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_add_wrap, vaddq_s8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_add_wrap, vaddq_u16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_add_wrap, vaddq_s16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_sub_wrap, vsubq_u8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_sub_wrap, vsubq_s8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_sub_wrap, vsubq_u16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_sub_wrap, vsubq_s16)
// TODO: absdiff for signed integers
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_absdiff, vabdq_u8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_absdiff, vabdq_u16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_absdiff, vabdq_u32)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_absdiff, vabdq_f32)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float64x2, v_absdiff, vabdq_f64)
#endif
#define OPENCV_HAL_IMPL_NEON_BIN_FUNC2(_Tpvec, _Tpvec2, cast, func, intrin) \
inline _Tpvec2 func(const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec2(cast(intrin(a.val, b.val))); \
}

OPENCV_HAL_IMPL_NEON_BIN_FUNC2(v_int8x16, v_uint8x16, vreinterpretq_u8_s8, v_absdiff, vabdq_s8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC2(v_int16x8, v_uint16x8, vreinterpretq_u16_s16, v_absdiff, vabdq_s16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC2(v_int32x4, v_uint32x4, vreinterpretq_u32_s32, v_absdiff, vabdq_s32)
inline v_float32x4 v_magnitude(const v_float32x4& a, const v_float32x4& b)
{
    v_float32x4 x(vmlaq_f32(vmulq_f32(a.val, a.val), b.val, b.val));
    return v_sqrt(x);
}

inline v_float32x4 v_sqr_magnitude(const v_float32x4& a, const v_float32x4& b)
{
    return v_float32x4(vmlaq_f32(vmulq_f32(a.val, a.val), b.val, b.val));
}

inline v_float32x4 v_fma(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
{
#if CV_SIMD128_64F
    // ARMv8, which adds support for 64-bit floating-point (so CV_SIMD128_64F is defined),
    // also adds FMA support both for single- and double-precision floating-point vectors
    return v_float32x4(vfmaq_f32(c.val, a.val, b.val));
#else
    return v_float32x4(vmlaq_f32(c.val, a.val, b.val));
#endif
}

inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
{
    return v_int32x4(vmlaq_s32(c.val, a.val, b.val));
}

inline v_float32x4 v_muladd(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
{
    return v_fma(a, b, c);
}

inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
{
    return v_fma(a, b, c);
}
#if CV_SIMD128_64F
inline v_float64x2 v_magnitude(const v_float64x2& a, const v_float64x2& b)
{
    v_float64x2 x(vaddq_f64(vmulq_f64(a.val, a.val), vmulq_f64(b.val, b.val)));
    return v_sqrt(x);
}

inline v_float64x2 v_sqr_magnitude(const v_float64x2& a, const v_float64x2& b)
{
    return v_float64x2(vaddq_f64(vmulq_f64(a.val, a.val), vmulq_f64(b.val, b.val)));
}

inline v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
{
    return v_float64x2(vfmaq_f64(c.val, a.val, b.val));
}

inline v_float64x2 v_muladd(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
{
    return v_fma(a, b, c);
}
#endif
// trade efficiency for convenience
#define OPENCV_HAL_IMPL_NEON_SHIFT_OP(_Tpvec, suffix, _Tps, ssuffix) \
inline _Tpvec operator << (const _Tpvec& a, int n) \
{ return _Tpvec(vshlq_##suffix(a.val, vdupq_n_##ssuffix((_Tps)n))); } \
inline _Tpvec operator >> (const _Tpvec& a, int n) \
{ return _Tpvec(vshlq_##suffix(a.val, vdupq_n_##ssuffix((_Tps)-n))); } \
template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
{ return _Tpvec(vshlq_n_##suffix(a.val, n)); } \
template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
{ return _Tpvec(vshrq_n_##suffix(a.val, n)); } \
template<int n> inline _Tpvec v_rshr(const _Tpvec& a) \
{ return _Tpvec(vrshrq_n_##suffix(a.val, n)); }

OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint8x16, u8, schar, s8)
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int8x16, s8, schar, s8)
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint16x8, u16, short, s16)
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int16x8, s16, short, s16)
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint32x4, u32, int, s32)
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int32x4, s32, int, s32)
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint64x2, u64, int64, s64)
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int64x2, s64, int64, s64)
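
// Usage sketch (illustrative only): v_shr truncates, while v_rshr adds the
// rounding constant 1 << (n-1) before shifting.
//
//     v_int32x4 a = v_setall_s32(7);
//     v_shr<2>(a);  // 7 >> 2 = 1
//     v_rshr<2>(a); // (7 + 2) >> 2 = 2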
#define OPENCV_HAL_IMPL_NEON_ROTATE_OP(_Tpvec, suffix) \
template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a) \
{ return _Tpvec(vextq_##suffix(a.val, vdupq_n_##suffix(0), n)); } \
template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a) \
{ return _Tpvec(vextq_##suffix(vdupq_n_##suffix(0), a.val, _Tpvec::nlanes - n)); } \
template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a) \
{ return a; } \
template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(vextq_##suffix(a.val, b.val, n)); } \
template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(vextq_##suffix(b.val, a.val, _Tpvec::nlanes - n)); } \
template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a, const _Tpvec& b) \
{ CV_UNUSED(b); return a; }
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_uint8x16, u8)
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_int8x16, s8)
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_uint16x8, u16)
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_int16x8, s16)
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_uint32x4, u32)
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_int32x4, s32)
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_float32x4, f32)
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_uint64x2, u64)
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_int64x2, s64)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_float64x2, f64)
#endif
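
// Usage sketch (illustrative only): rotates shift whole lanes, filling with
// zeros (single-argument form) or with lanes of a second vector.
//
//     v_int32x4 a(0, 1, 2, 3), b(4, 5, 6, 7);
//     v_rotate_right<1>(a);    // {1, 2, 3, 0}
//     v_rotate_right<1>(a, b); // {1, 2, 3, 4}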
#define OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(_Tpvec, _Tp, suffix) \
inline _Tpvec v_load(const _Tp* ptr) \
{ return _Tpvec(vld1q_##suffix(ptr)); } \
inline _Tpvec v_load_aligned(const _Tp* ptr) \
{ return _Tpvec(vld1q_##suffix(ptr)); } \
inline _Tpvec v_load_low(const _Tp* ptr) \
{ return _Tpvec(vcombine_##suffix(vld1_##suffix(ptr), vdup_n_##suffix((_Tp)0))); } \
inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
{ return _Tpvec(vcombine_##suffix(vld1_##suffix(ptr0), vld1_##suffix(ptr1))); } \
inline void v_store(_Tp* ptr, const _Tpvec& a) \
{ vst1q_##suffix(ptr, a.val); } \
inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
{ vst1q_##suffix(ptr, a.val); } \
inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
{ vst1q_##suffix(ptr, a.val); } \
inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/) \
{ vst1q_##suffix(ptr, a.val); } \
inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
{ vst1_##suffix(ptr, vget_low_##suffix(a.val)); } \
inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
{ vst1_##suffix(ptr, vget_high_##suffix(a.val)); }
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint8x16, uchar, u8)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int8x16, schar, s8)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint16x8, ushort, u16)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int16x8, short, s16)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint32x4, unsigned, u32)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int32x4, int, s32)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint64x2, uint64, u64)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int64x2, int64, s64)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float32x4, float, f32)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float64x2, double, f64)
#endif
#if CV_FP16
// Workaround for old compilers
inline v_float16x8 v_load_f16(const short* ptr)
{ return v_float16x8(cv_vld1q_f16(ptr)); }
inline v_float16x8 v_load_f16_aligned(const short* ptr)
{ return v_float16x8(cv_vld1q_f16(ptr)); }

inline void v_store(short* ptr, const v_float16x8& a)
{ cv_vst1q_f16(ptr, a.val); }
inline void v_store_aligned(short* ptr, const v_float16x8& a)
{ cv_vst1q_f16(ptr, a.val); }
#endif
#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
inline scalartype v_reduce_##func(const _Tpvec& a) \
{ \
    _Tpnvec##_t a0 = vp##vectorfunc##_##suffix(vget_low_##suffix(a.val), vget_high_##suffix(a.val)); \
    a0 = vp##vectorfunc##_##suffix(a0, a0); \
    return (scalartype)vget_lane_##suffix(vp##vectorfunc##_##suffix(a0, a0), 0); \
}

OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_uint16x8, uint16x4, unsigned short, sum, add, u16)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_uint16x8, uint16x4, unsigned short, max, max, u16)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_uint16x8, uint16x4, unsigned short, min, min, u16)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_int16x8, int16x4, short, sum, add, s16)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_int16x8, int16x4, short, max, max, s16)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_int16x8, int16x4, short, min, min, s16)
#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
inline scalartype v_reduce_##func(const _Tpvec& a) \
{ \
    _Tpnvec##_t a0 = vp##vectorfunc##_##suffix(vget_low_##suffix(a.val), vget_high_##suffix(a.val)); \
    return (scalartype)vget_lane_##suffix(vp##vectorfunc##_##suffix(a0, vget_high_##suffix(a.val)), 0); \
}

OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, uint32x2, unsigned, sum, add, u32)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, uint32x2, unsigned, max, max, u32)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, uint32x2, unsigned, min, min, u32)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_int32x4, int32x2, int, sum, add, s32)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_int32x4, int32x2, int, max, max, s32)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_int32x4, int32x2, int, min, min, s32)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, sum, add, f32)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, max, max, f32)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, min, min, f32)
inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
                                 const v_float32x4& c, const v_float32x4& d)
{
    float32x4x2_t ab = vtrnq_f32(a.val, b.val);
    float32x4x2_t cd = vtrnq_f32(c.val, d.val);

    float32x4_t u0 = vaddq_f32(ab.val[0], ab.val[1]); // a0+a1 b0+b1 a2+a3 b2+b3
    float32x4_t u1 = vaddq_f32(cd.val[0], cd.val[1]); // c0+c1 d0+d1 c2+c3 d2+d3

    float32x4_t v0 = vcombine_f32(vget_low_f32(u0), vget_low_f32(u1));
    float32x4_t v1 = vcombine_f32(vget_high_f32(u0), vget_high_f32(u1));

    return v_float32x4(vaddq_f32(v0, v1));
}
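
// Usage sketch (illustrative only):
//
//     v_float32x4 a(1.f, 2.f, 3.f, 4.f);
//     float s = v_reduce_sum(a); // 10.f
//     // v_reduce_sum4(a, b, c, d) returns {sum(a), sum(b), sum(c), sum(d)}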
#define OPENCV_HAL_IMPL_NEON_POPCOUNT(_Tpvec, cast) \
inline v_uint32x4 v_popcount(const _Tpvec& a) \
{ \
    uint8x16_t t = vcntq_u8(cast(a.val)); \
    uint16x8_t t0 = vpaddlq_u8(t);  /* 16 -> 8 */ \
    uint32x4_t t1 = vpaddlq_u16(t0); /* 8 -> 4 */ \
    return v_uint32x4(t1); \
}

OPENCV_HAL_IMPL_NEON_POPCOUNT(v_uint8x16, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_NEON_POPCOUNT(v_uint16x8, vreinterpretq_u8_u16)
OPENCV_HAL_IMPL_NEON_POPCOUNT(v_uint32x4, vreinterpretq_u8_u32)
OPENCV_HAL_IMPL_NEON_POPCOUNT(v_int8x16, vreinterpretq_u8_s8)
OPENCV_HAL_IMPL_NEON_POPCOUNT(v_int16x8, vreinterpretq_u8_s16)
OPENCV_HAL_IMPL_NEON_POPCOUNT(v_int32x4, vreinterpretq_u8_s32)
inline int v_signmask(const v_uint8x16& a)
{
    int8x8_t m0 = vcreate_s8(CV_BIG_UINT(0x0706050403020100));
    uint8x16_t v0 = vshlq_u8(vshrq_n_u8(a.val, 7), vcombine_s8(m0, m0));
    uint64x2_t v1 = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(v0)));
    return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 8);
}
inline int v_signmask(const v_int8x16& a)
{ return v_signmask(v_reinterpret_as_u8(a)); }

inline int v_signmask(const v_uint16x8& a)
{
    int16x4_t m0 = vcreate_s16(CV_BIG_UINT(0x0003000200010000));
    uint16x8_t v0 = vshlq_u16(vshrq_n_u16(a.val, 15), vcombine_s16(m0, m0));
    uint64x2_t v1 = vpaddlq_u32(vpaddlq_u16(v0));
    return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 4);
}
inline int v_signmask(const v_int16x8& a)
{ return v_signmask(v_reinterpret_as_u16(a)); }

inline int v_signmask(const v_uint32x4& a)
{
    int32x2_t m0 = vcreate_s32(CV_BIG_UINT(0x0000000100000000));
    uint32x4_t v0 = vshlq_u32(vshrq_n_u32(a.val, 31), vcombine_s32(m0, m0));
    uint64x2_t v1 = vpaddlq_u32(v0);
    return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 2);
}
inline int v_signmask(const v_int32x4& a)
{ return v_signmask(v_reinterpret_as_u32(a)); }
inline int v_signmask(const v_float32x4& a)
{ return v_signmask(v_reinterpret_as_u32(a)); }
#if CV_SIMD128_64F
inline int v_signmask(const v_uint64x2& a)
{
    int64x1_t m0 = vdup_n_s64(0);
    uint64x2_t v0 = vshlq_u64(vshrq_n_u64(a.val, 63), vcombine_s64(m0, m0));
    return (int)vgetq_lane_u64(v0, 0) + ((int)vgetq_lane_u64(v0, 1) << 1);
}
inline int v_signmask(const v_float64x2& a)
{ return v_signmask(v_reinterpret_as_u64(a)); }
#endif
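
// Usage sketch (illustrative only): v_signmask packs the sign (top) bit of
// every lane into the low bits of an int, lane 0 in bit 0.
//
//     v_int32x4 m = (v_setall_s32(-1) == v_setall_s32(-1)); // all lanes true
//     int bits = v_signmask(m);                             // 0xF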
#define OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(_Tpvec, suffix, shift) \
inline bool v_check_all(const v_##_Tpvec& a) \
{ \
    _Tpvec##_t v0 = vshrq_n_##suffix(vmvnq_##suffix(a.val), shift); \
    uint64x2_t v1 = vreinterpretq_u64_##suffix(v0); \
    return (vgetq_lane_u64(v1, 0) | vgetq_lane_u64(v1, 1)) == 0; \
} \
inline bool v_check_any(const v_##_Tpvec& a) \
{ \
    _Tpvec##_t v0 = vshrq_n_##suffix(a.val, shift); \
    uint64x2_t v1 = vreinterpretq_u64_##suffix(v0); \
    return (vgetq_lane_u64(v1, 0) | vgetq_lane_u64(v1, 1)) != 0; \
}

OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint8x16, u8, 7)
OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint16x8, u16, 15)
OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint32x4, u32, 31)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint64x2, u64, 63)
#endif
inline bool v_check_all(const v_int8x16& a)
{ return v_check_all(v_reinterpret_as_u8(a)); }
inline bool v_check_all(const v_int16x8& a)
{ return v_check_all(v_reinterpret_as_u16(a)); }
inline bool v_check_all(const v_int32x4& a)
{ return v_check_all(v_reinterpret_as_u32(a)); }
inline bool v_check_all(const v_float32x4& a)
{ return v_check_all(v_reinterpret_as_u32(a)); }

inline bool v_check_any(const v_int8x16& a)
{ return v_check_any(v_reinterpret_as_u8(a)); }
inline bool v_check_any(const v_int16x8& a)
{ return v_check_any(v_reinterpret_as_u16(a)); }
inline bool v_check_any(const v_int32x4& a)
{ return v_check_any(v_reinterpret_as_u32(a)); }
inline bool v_check_any(const v_float32x4& a)
{ return v_check_any(v_reinterpret_as_u32(a)); }
#if CV_SIMD128_64F
inline bool v_check_all(const v_int64x2& a)
{ return v_check_all(v_reinterpret_as_u64(a)); }
inline bool v_check_all(const v_float64x2& a)
{ return v_check_all(v_reinterpret_as_u64(a)); }
inline bool v_check_any(const v_int64x2& a)
{ return v_check_any(v_reinterpret_as_u64(a)); }
inline bool v_check_any(const v_float64x2& a)
{ return v_check_any(v_reinterpret_as_u64(a)); }
#endif
#define OPENCV_HAL_IMPL_NEON_SELECT(_Tpvec, suffix, usuffix) \
inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(vbslq_##suffix(vreinterpretq_##usuffix##_##suffix(mask.val), a.val, b.val)); \
}

OPENCV_HAL_IMPL_NEON_SELECT(v_uint8x16, u8, u8)
OPENCV_HAL_IMPL_NEON_SELECT(v_int8x16, s8, u8)
OPENCV_HAL_IMPL_NEON_SELECT(v_uint16x8, u16, u16)
OPENCV_HAL_IMPL_NEON_SELECT(v_int16x8, s16, u16)
OPENCV_HAL_IMPL_NEON_SELECT(v_uint32x4, u32, u32)
OPENCV_HAL_IMPL_NEON_SELECT(v_int32x4, s32, u32)
OPENCV_HAL_IMPL_NEON_SELECT(v_float32x4, f32, u32)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_NEON_SELECT(v_float64x2, f64, u64)
#endif
#define OPENCV_HAL_IMPL_NEON_EXPAND(_Tpvec, _Tpwvec, _Tp, suffix) \
inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
{ \
    b0.val = vmovl_##suffix(vget_low_##suffix(a.val)); \
    b1.val = vmovl_##suffix(vget_high_##suffix(a.val)); \
} \
inline _Tpwvec v_load_expand(const _Tp* ptr) \
{ \
    return _Tpwvec(vmovl_##suffix(vld1_##suffix(ptr))); \
}

OPENCV_HAL_IMPL_NEON_EXPAND(v_uint8x16, v_uint16x8, uchar, u8)
OPENCV_HAL_IMPL_NEON_EXPAND(v_int8x16, v_int16x8, schar, s8)
OPENCV_HAL_IMPL_NEON_EXPAND(v_uint16x8, v_uint32x4, ushort, u16)
OPENCV_HAL_IMPL_NEON_EXPAND(v_int16x8, v_int32x4, short, s16)
OPENCV_HAL_IMPL_NEON_EXPAND(v_uint32x4, v_uint64x2, uint, u32)
OPENCV_HAL_IMPL_NEON_EXPAND(v_int32x4, v_int64x2, int, s32)
inline v_uint32x4 v_load_expand_q(const uchar* ptr)
{
    uint8x8_t v0 = vcreate_u8(*(unsigned*)ptr);
    uint16x4_t v1 = vget_low_u16(vmovl_u8(v0));
    return v_uint32x4(vmovl_u16(v1));
}

inline v_int32x4 v_load_expand_q(const schar* ptr)
{
    int8x8_t v0 = vcreate_s8(*(unsigned*)ptr);
    int16x4_t v1 = vget_low_s16(vmovl_s8(v0));
    return v_int32x4(vmovl_s16(v1));
}
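
// Usage sketch (illustrative only): v_load_expand_q widens the first four
// 8-bit values to a full 32-bit vector.
//
//     uchar buf[4] = {1, 2, 3, 4};
//     v_uint32x4 w = v_load_expand_q(buf); // {1, 2, 3, 4}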
#if defined(__aarch64__)
#define OPENCV_HAL_IMPL_NEON_UNPACKS(_Tpvec, suffix) \
inline void v_zip(const v_##_Tpvec& a0, const v_##_Tpvec& a1, v_##_Tpvec& b0, v_##_Tpvec& b1) \
{ \
    b0.val = vzip1q_##suffix(a0.val, a1.val); \
    b1.val = vzip2q_##suffix(a0.val, a1.val); \
} \
inline v_##_Tpvec v_combine_low(const v_##_Tpvec& a, const v_##_Tpvec& b) \
{ \
    return v_##_Tpvec(vcombine_##suffix(vget_low_##suffix(a.val), vget_low_##suffix(b.val))); \
} \
inline v_##_Tpvec v_combine_high(const v_##_Tpvec& a, const v_##_Tpvec& b) \
{ \
    return v_##_Tpvec(vcombine_##suffix(vget_high_##suffix(a.val), vget_high_##suffix(b.val))); \
} \
inline void v_recombine(const v_##_Tpvec& a, const v_##_Tpvec& b, v_##_Tpvec& c, v_##_Tpvec& d) \
{ \
    c.val = vcombine_##suffix(vget_low_##suffix(a.val), vget_low_##suffix(b.val)); \
    d.val = vcombine_##suffix(vget_high_##suffix(a.val), vget_high_##suffix(b.val)); \
}
#else
#define OPENCV_HAL_IMPL_NEON_UNPACKS(_Tpvec, suffix) \
inline void v_zip(const v_##_Tpvec& a0, const v_##_Tpvec& a1, v_##_Tpvec& b0, v_##_Tpvec& b1) \
{ \
    _Tpvec##x2_t p = vzipq_##suffix(a0.val, a1.val); \
    b0.val = p.val[0]; \
    b1.val = p.val[1]; \
} \
inline v_##_Tpvec v_combine_low(const v_##_Tpvec& a, const v_##_Tpvec& b) \
{ \
    return v_##_Tpvec(vcombine_##suffix(vget_low_##suffix(a.val), vget_low_##suffix(b.val))); \
} \
inline v_##_Tpvec v_combine_high(const v_##_Tpvec& a, const v_##_Tpvec& b) \
{ \
    return v_##_Tpvec(vcombine_##suffix(vget_high_##suffix(a.val), vget_high_##suffix(b.val))); \
} \
inline void v_recombine(const v_##_Tpvec& a, const v_##_Tpvec& b, v_##_Tpvec& c, v_##_Tpvec& d) \
{ \
    c.val = vcombine_##suffix(vget_low_##suffix(a.val), vget_low_##suffix(b.val)); \
    d.val = vcombine_##suffix(vget_high_##suffix(a.val), vget_high_##suffix(b.val)); \
}
#endif
OPENCV_HAL_IMPL_NEON_UNPACKS(uint8x16, u8)
OPENCV_HAL_IMPL_NEON_UNPACKS(int8x16, s8)
OPENCV_HAL_IMPL_NEON_UNPACKS(uint16x8, u16)
OPENCV_HAL_IMPL_NEON_UNPACKS(int16x8, s16)
OPENCV_HAL_IMPL_NEON_UNPACKS(uint32x4, u32)
OPENCV_HAL_IMPL_NEON_UNPACKS(int32x4, s32)
OPENCV_HAL_IMPL_NEON_UNPACKS(float32x4, f32)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_NEON_UNPACKS(float64x2, f64)
#endif
#define OPENCV_HAL_IMPL_NEON_EXTRACT(_Tpvec, suffix) \
template <int s> \
inline v_##_Tpvec v_extract(const v_##_Tpvec& a, const v_##_Tpvec& b) \
{ \
    return v_##_Tpvec(vextq_##suffix(a.val, b.val, s)); \
}

OPENCV_HAL_IMPL_NEON_EXTRACT(uint8x16, u8)
OPENCV_HAL_IMPL_NEON_EXTRACT(int8x16, s8)
OPENCV_HAL_IMPL_NEON_EXTRACT(uint16x8, u16)
OPENCV_HAL_IMPL_NEON_EXTRACT(int16x8, s16)
OPENCV_HAL_IMPL_NEON_EXTRACT(uint32x4, u32)
OPENCV_HAL_IMPL_NEON_EXTRACT(int32x4, s32)
OPENCV_HAL_IMPL_NEON_EXTRACT(uint64x2, u64)
OPENCV_HAL_IMPL_NEON_EXTRACT(int64x2, s64)
OPENCV_HAL_IMPL_NEON_EXTRACT(float32x4, f32)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_NEON_EXTRACT(float64x2, f64)
#endif
#if CV_SIMD128_64F
inline v_int32x4 v_round(const v_float32x4& a)
{
    float32x4_t a_ = a.val;
    int32x4_t result;
    __asm__ ("fcvtns %0.4s, %1.4s"
             : "=w"(result)
             : "w"(a_)
             : /* No clobbers */);
    return v_int32x4(result);
}
#else
inline v_int32x4 v_round(const v_float32x4& a)
{
    static const int32x4_t v_sign = vdupq_n_s32(1 << 31),
        v_05 = vreinterpretq_s32_f32(vdupq_n_f32(0.5f));

    int32x4_t v_addition = vorrq_s32(v_05, vandq_s32(v_sign, vreinterpretq_s32_f32(a.val)));
    return v_int32x4(vcvtq_s32_f32(vaddq_f32(a.val, vreinterpretq_f32_s32(v_addition))));
}
#endif
inline v_int32x4 v_floor(const v_float32x4& a)
{
    int32x4_t a1 = vcvtq_s32_f32(a.val);
    uint32x4_t mask = vcgtq_f32(vcvtq_f32_s32(a1), a.val);
    return v_int32x4(vaddq_s32(a1, vreinterpretq_s32_u32(mask)));
}

inline v_int32x4 v_ceil(const v_float32x4& a)
{
    int32x4_t a1 = vcvtq_s32_f32(a.val);
    uint32x4_t mask = vcgtq_f32(a.val, vcvtq_f32_s32(a1));
    return v_int32x4(vsubq_s32(a1, vreinterpretq_s32_u32(mask)));
}

inline v_int32x4 v_trunc(const v_float32x4& a)
{ return v_int32x4(vcvtq_s32_f32(a.val)); }
#if CV_SIMD128_64F
inline v_int32x4 v_round(const v_float64x2& a)
{
    static const int32x2_t zero = vdup_n_s32(0);
    return v_int32x4(vcombine_s32(vmovn_s64(vcvtaq_s64_f64(a.val)), zero));
}

inline v_int32x4 v_floor(const v_float64x2& a)
{
    static const int32x2_t zero = vdup_n_s32(0);
    int64x2_t a1 = vcvtq_s64_f64(a.val);
    uint64x2_t mask = vcgtq_f64(vcvtq_f64_s64(a1), a.val);
    a1 = vaddq_s64(a1, vreinterpretq_s64_u64(mask));
    return v_int32x4(vcombine_s32(vmovn_s64(a1), zero));
}

inline v_int32x4 v_ceil(const v_float64x2& a)
{
    static const int32x2_t zero = vdup_n_s32(0);
    int64x2_t a1 = vcvtq_s64_f64(a.val);
    uint64x2_t mask = vcgtq_f64(a.val, vcvtq_f64_s64(a1));
    a1 = vsubq_s64(a1, vreinterpretq_s64_u64(mask));
    return v_int32x4(vcombine_s32(vmovn_s64(a1), zero));
}

inline v_int32x4 v_trunc(const v_float64x2& a)
{
    static const int32x2_t zero = vdup_n_s32(0);
    // vcvtq_s64_f64 rounds toward zero, which is what truncation requires
    // (vcvtaq_s64_f64 rounds to nearest, as used by v_round above)
    return v_int32x4(vcombine_s32(vmovn_s64(vcvtq_s64_f64(a.val)), zero));
}
#endif
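
// Usage sketch (illustrative only) of the four rounding modes; the double
// versions place the two results in the low lanes, high lanes zero.
//
//     v_float32x4 a(1.2f, -1.2f, 1.8f, -1.8f);
//     v_round(a); // { 1, -1,  2, -2}
//     v_floor(a); // { 1, -2,  1, -2}
//     v_ceil(a);  // { 2, -1,  2, -1}
//     v_trunc(a); // { 1, -1,  1, -1}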
#define OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(_Tpvec, suffix) \
inline void v_transpose4x4(const v_##_Tpvec& a0, const v_##_Tpvec& a1, \
                           const v_##_Tpvec& a2, const v_##_Tpvec& a3, \
                           v_##_Tpvec& b0, v_##_Tpvec& b1, \
                           v_##_Tpvec& b2, v_##_Tpvec& b3) \
{ \
    /* m00 m01 m02 m03 */ \
    /* m10 m11 m12 m13 */ \
    /* m20 m21 m22 m23 */ \
    /* m30 m31 m32 m33 */ \
    _Tpvec##x2_t t0 = vtrnq_##suffix(a0.val, a1.val); \
    _Tpvec##x2_t t1 = vtrnq_##suffix(a2.val, a3.val); \
    /* m00 m10 m02 m12 */ \
    /* m01 m11 m03 m13 */ \
    /* m20 m30 m22 m32 */ \
    /* m21 m31 m23 m33 */ \
    b0.val = vcombine_##suffix(vget_low_##suffix(t0.val[0]), vget_low_##suffix(t1.val[0])); \
    b1.val = vcombine_##suffix(vget_low_##suffix(t0.val[1]), vget_low_##suffix(t1.val[1])); \
    b2.val = vcombine_##suffix(vget_high_##suffix(t0.val[0]), vget_high_##suffix(t1.val[0])); \
    b3.val = vcombine_##suffix(vget_high_##suffix(t0.val[1]), vget_high_##suffix(t1.val[1])); \
}

OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(uint32x4, u32)
OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(int32x4, s32)
OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(float32x4, f32)
#define OPENCV_HAL_IMPL_NEON_INTERLEAVED(_Tpvec, _Tp, suffix) \
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b) \
{ \
    _Tpvec##x2_t v = vld2q_##suffix(ptr); \
    a.val = v.val[0]; \
    b.val = v.val[1]; \
} \
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, v_##_Tpvec& c) \
{ \
    _Tpvec##x3_t v = vld3q_##suffix(ptr); \
    a.val = v.val[0]; \
    b.val = v.val[1]; \
    c.val = v.val[2]; \
} \
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, \
                                v_##_Tpvec& c, v_##_Tpvec& d) \
{ \
    _Tpvec##x4_t v = vld4q_##suffix(ptr); \
    a.val = v.val[0]; \
    b.val = v.val[1]; \
    c.val = v.val[2]; \
    d.val = v.val[3]; \
} \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
{ \
    _Tpvec##x2_t v; \
    v.val[0] = a.val; \
    v.val[1] = b.val; \
    vst2q_##suffix(ptr, v); \
} \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
                                const v_##_Tpvec& c, hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
{ \
    _Tpvec##x3_t v; \
    v.val[0] = a.val; \
    v.val[1] = b.val; \
    v.val[2] = c.val; \
    vst3q_##suffix(ptr, v); \
} \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
                                const v_##_Tpvec& c, const v_##_Tpvec& d, \
                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) \
{ \
    _Tpvec##x4_t v; \
    v.val[0] = a.val; \
    v.val[1] = b.val; \
    v.val[2] = c.val; \
    v.val[3] = d.val; \
    vst4q_##suffix(ptr, v); \
}
#define OPENCV_HAL_IMPL_NEON_INTERLEAVED_INT64(tp, suffix) \
inline void v_load_deinterleave( const tp* ptr, v_##tp##x2& a, v_##tp##x2& b ) \
{ \
    tp##x1_t a0 = vld1_##suffix(ptr); \
    tp##x1_t b0 = vld1_##suffix(ptr + 1); \
    tp##x1_t a1 = vld1_##suffix(ptr + 2); \
    tp##x1_t b1 = vld1_##suffix(ptr + 3); \
    a = v_##tp##x2(vcombine_##suffix(a0, a1)); \
    b = v_##tp##x2(vcombine_##suffix(b0, b1)); \
} \
\
inline void v_load_deinterleave( const tp* ptr, v_##tp##x2& a, \
                                 v_##tp##x2& b, v_##tp##x2& c ) \
{ \
    tp##x1_t a0 = vld1_##suffix(ptr); \
    tp##x1_t b0 = vld1_##suffix(ptr + 1); \
    tp##x1_t c0 = vld1_##suffix(ptr + 2); \
    tp##x1_t a1 = vld1_##suffix(ptr + 3); \
    tp##x1_t b1 = vld1_##suffix(ptr + 4); \
    tp##x1_t c1 = vld1_##suffix(ptr + 5); \
    a = v_##tp##x2(vcombine_##suffix(a0, a1)); \
    b = v_##tp##x2(vcombine_##suffix(b0, b1)); \
    c = v_##tp##x2(vcombine_##suffix(c0, c1)); \
} \
\
inline void v_load_deinterleave( const tp* ptr, v_##tp##x2& a, v_##tp##x2& b, \
                                 v_##tp##x2& c, v_##tp##x2& d ) \
{ \
    tp##x1_t a0 = vld1_##suffix(ptr); \
    tp##x1_t b0 = vld1_##suffix(ptr + 1); \
    tp##x1_t c0 = vld1_##suffix(ptr + 2); \
    tp##x1_t d0 = vld1_##suffix(ptr + 3); \
    tp##x1_t a1 = vld1_##suffix(ptr + 4); \
    tp##x1_t b1 = vld1_##suffix(ptr + 5); \
    tp##x1_t c1 = vld1_##suffix(ptr + 6); \
    tp##x1_t d1 = vld1_##suffix(ptr + 7); \
    a = v_##tp##x2(vcombine_##suffix(a0, a1)); \
    b = v_##tp##x2(vcombine_##suffix(b0, b1)); \
    c = v_##tp##x2(vcombine_##suffix(c0, c1)); \
    d = v_##tp##x2(vcombine_##suffix(d0, d1)); \
} \
\
inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, const v_##tp##x2& b, \
                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
{ \
    vst1_##suffix(ptr, vget_low_##suffix(a.val)); \
    vst1_##suffix(ptr + 1, vget_low_##suffix(b.val)); \
    vst1_##suffix(ptr + 2, vget_high_##suffix(a.val)); \
    vst1_##suffix(ptr + 3, vget_high_##suffix(b.val)); \
} \
\
inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, \
                                const v_##tp##x2& b, const v_##tp##x2& c, \
                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
{ \
    vst1_##suffix(ptr, vget_low_##suffix(a.val)); \
    vst1_##suffix(ptr + 1, vget_low_##suffix(b.val)); \
    vst1_##suffix(ptr + 2, vget_low_##suffix(c.val)); \
    vst1_##suffix(ptr + 3, vget_high_##suffix(a.val)); \
    vst1_##suffix(ptr + 4, vget_high_##suffix(b.val)); \
    vst1_##suffix(ptr + 5, vget_high_##suffix(c.val)); \
} \
\
inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, const v_##tp##x2& b, \
                                const v_##tp##x2& c, const v_##tp##x2& d, \
                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
{ \
    vst1_##suffix(ptr, vget_low_##suffix(a.val)); \
    vst1_##suffix(ptr + 1, vget_low_##suffix(b.val)); \
    vst1_##suffix(ptr + 2, vget_low_##suffix(c.val)); \
    vst1_##suffix(ptr + 3, vget_low_##suffix(d.val)); \
    vst1_##suffix(ptr + 4, vget_high_##suffix(a.val)); \
    vst1_##suffix(ptr + 5, vget_high_##suffix(b.val)); \
    vst1_##suffix(ptr + 6, vget_high_##suffix(c.val)); \
    vst1_##suffix(ptr + 7, vget_high_##suffix(d.val)); \
}
OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint8x16, uchar, u8)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(int8x16, schar, s8)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint16x8, ushort, u16)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(int16x8, short, s16)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint32x4, unsigned, u32)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(int32x4, int, s32)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(float32x4, float, f32)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_NEON_INTERLEAVED(float64x2, double, f64)
#endif

OPENCV_HAL_IMPL_NEON_INTERLEAVED_INT64(int64, s64)
OPENCV_HAL_IMPL_NEON_INTERLEAVED_INT64(uint64, u64)
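
// Usage sketch (illustrative only; `rgb` is a hypothetical buffer of 48
// interleaved bytes): split packed RGB into planes and merge it back.
//
//     v_uint8x16 r, g, b;
//     v_load_deinterleave(rgb, r, g, b);
//     v_store_interleave(rgb, r, g, b);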
inline v_float32x4 v_cvt_f32(const v_int32x4& a)
{
    return v_float32x4(vcvtq_f32_s32(a.val));
}
#if CV_SIMD128_64F
inline v_float32x4 v_cvt_f32(const v_float64x2& a)
{
    float32x2_t zero = vdup_n_f32(0.0f);
    return v_float32x4(vcombine_f32(vcvt_f32_f64(a.val), zero));
}

inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b)
{
    return v_float32x4(vcombine_f32(vcvt_f32_f64(a.val), vcvt_f32_f64(b.val)));
}

inline v_float64x2 v_cvt_f64(const v_int32x4& a)
{
    return v_float64x2(vcvt_f64_f32(vcvt_f32_s32(vget_low_s32(a.val))));
}

inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
{
    return v_float64x2(vcvt_f64_f32(vcvt_f32_s32(vget_high_s32(a.val))));
}

inline v_float64x2 v_cvt_f64(const v_float32x4& a)
{
    return v_float64x2(vcvt_f64_f32(vget_low_f32(a.val)));
}

inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
{
    return v_float64x2(vcvt_f64_f32(vget_high_f32(a.val)));
}
#endif
#if CV_FP16
inline v_float32x4 v_cvt_f32(const v_float16x8& a)
{
    return v_float32x4(vcvt_f32_f16(vget_low_f16(a.val)));
}
inline v_float32x4 v_cvt_f32_high(const v_float16x8& a)
{
    return v_float32x4(vcvt_f32_f16(vget_high_f16(a.val)));
}

inline v_float16x8 v_cvt_f16(const v_float32x4& a, const v_float32x4& b)
{
    return v_float16x8(vcombine_f16(vcvt_f16_f32(a.val), vcvt_f16_f32(b.val)));
}
#endif
////////////// Lookup table access ////////////////////

inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
{
    int CV_DECL_ALIGNED(32) elems[4] =
    {
        tab[vgetq_lane_s32(idxvec.val, 0)],
        tab[vgetq_lane_s32(idxvec.val, 1)],
        tab[vgetq_lane_s32(idxvec.val, 2)],
        tab[vgetq_lane_s32(idxvec.val, 3)]
    };
    return v_int32x4(vld1q_s32(elems));
}

inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
{
    float CV_DECL_ALIGNED(32) elems[4] =
    {
        tab[vgetq_lane_s32(idxvec.val, 0)],
        tab[vgetq_lane_s32(idxvec.val, 1)],
        tab[vgetq_lane_s32(idxvec.val, 2)],
        tab[vgetq_lane_s32(idxvec.val, 3)]
    };
    return v_float32x4(vld1q_f32(elems));
}

inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y)
{
    /*int CV_DECL_ALIGNED(32) idx[4];
    v_store(idx, idxvec);

    float32x4_t xy02 = vcombine_f32(vld1_f32(tab + idx[0]), vld1_f32(tab + idx[2]));
    float32x4_t xy13 = vcombine_f32(vld1_f32(tab + idx[1]), vld1_f32(tab + idx[3]));

    float32x4x2_t xxyy = vuzpq_f32(xy02, xy13);
    x = v_float32x4(xxyy.val[0]);
    y = v_float32x4(xxyy.val[1]);*/
    int CV_DECL_ALIGNED(32) idx[4];
    v_store_aligned(idx, idxvec);

    x = v_float32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]);
    y = v_float32x4(tab[idx[0]+1], tab[idx[1]+1], tab[idx[2]+1], tab[idx[3]+1]);
}
#if CV_SIMD128_64F
inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
{
    double CV_DECL_ALIGNED(32) elems[2] =
    {
        tab[vgetq_lane_s32(idxvec.val, 0)],
        tab[vgetq_lane_s32(idxvec.val, 1)]
    };
    return v_float64x2(vld1q_f64(elems));
}

inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y)
{
    int CV_DECL_ALIGNED(32) idx[4];
    v_store_aligned(idx, idxvec);

    x = v_float64x2(tab[idx[0]], tab[idx[1]]);
    y = v_float64x2(tab[idx[0]+1], tab[idx[1]+1]);
}
#endif
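
// Usage sketch (illustrative only; `tab` is any sufficiently large table):
//
//     v_int32x4 idx(0, 2, 4, 6);
//     v_int32x4 g = v_lut(tab, idx); // {tab[0], tab[2], tab[4], tab[6]}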
inline void v_cleanup() {}

//! @name Check SIMD support
//! @{
//! @brief Check CPU capability of SIMD operation
static inline bool hasSIMD128()
{
    return (CV_CPU_HAS_SUPPORT_NEON) ? true : false;
}

//! @}

CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END

//! @endcond

}

#endif