1 /*M///////////////////////////////////////////////////////////////////////////////////////
3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
5 // By downloading, copying, installing or using the software you agree to this license.
6 // If you do not agree to this license, do not download, install,
7 // copy or use the software.
11 // For Open Source Computer Vision Library
13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
15 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
16 // Copyright (C) 2015, Itseez Inc., all rights reserved.
17 // Third party copyrights are property of their respective owners.
19 // Redistribution and use in source and binary forms, with or without modification,
20 // are permitted provided that the following conditions are met:
22 // * Redistribution's of source code must retain the above copyright notice,
23 // this list of conditions and the following disclaimer.
25 // * Redistribution's in binary form must reproduce the above copyright notice,
26 // this list of conditions and the following disclaimer in the documentation
27 // and/or other materials provided with the distribution.
29 // * The name of the copyright holders may not be used to endorse or promote products
30 // derived from this software without specific prior written permission.
32 // This software is provided by the copyright holders and contributors "as is" and
33 // any express or implied warranties, including, but not limited to, the implied
34 // warranties of merchantability and fitness for a particular purpose are disclaimed.
35 // In no event shall the Intel Corporation or contributors be liable for any direct,
36 // indirect, incidental, special, exemplary, or consequential damages
37 // (including, but not limited to, procurement of substitute goods or services;
38 // loss of use, data, or profits; or business interruption) however caused
39 // and on any theory of liability, whether in contract, strict liability,
40 // or tort (including negligence or otherwise) arising in any way out of
41 // the use of this software, even if advised of the possibility of such damage.
#ifndef OPENCV_HAL_INTRIN_NEON_HPP
#define OPENCV_HAL_INTRIN_NEON_HPP

#include "opencv2/core/utility.hpp"

CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN

// CV_SIMD128_64F gates all float64x2 (double-precision) vector code below:
// native 128-bit double SIMD only exists on AArch64.
#if defined(__aarch64__)
#define CV_SIMD128_64F 1
#else
#define CV_SIMD128_64F 0
#endif
#if CV_SIMD128_64F
// Workaround: some toolchains lack the vreinterpretq_*_f64 / vreinterpretq_f64_*
// intrinsics, so provide template fallbacks compiling to plain bit-casts.
// Only meaningful on AArch64, where float64x2_t exists.
#define OPENCV_HAL_IMPL_NEON_REINTERPRET(_Tpv, suffix) \
template <typename T> static inline \
_Tpv vreinterpretq_##suffix##_f64(T a) { return (_Tpv) a; } \
template <typename T> static inline \
float64x2_t vreinterpretq_f64_##suffix(T a) { return (float64x2_t) a; }

OPENCV_HAL_IMPL_NEON_REINTERPRET(uint8x16_t, u8)
OPENCV_HAL_IMPL_NEON_REINTERPRET(int8x16_t, s8)
OPENCV_HAL_IMPL_NEON_REINTERPRET(uint16x8_t, u16)
OPENCV_HAL_IMPL_NEON_REINTERPRET(int16x8_t, s16)
OPENCV_HAL_IMPL_NEON_REINTERPRET(uint32x4_t, u32)
OPENCV_HAL_IMPL_NEON_REINTERPRET(int32x4_t, s32)
OPENCV_HAL_IMPL_NEON_REINTERPRET(uint64x2_t, u64)
OPENCV_HAL_IMPL_NEON_REINTERPRET(int64x2_t, s64)
OPENCV_HAL_IMPL_NEON_REINTERPRET(float32x4_t, f32)
#endif
//////////// 128-bit register wrapper types ////////////
// Each v_* struct wraps one NEON register (`val`) and provides:
//   - lane_type / nlanes : element type and lane count
//   - default ctor (uninitialized) and explicit ctor from a raw register
//   - element-wise ctor  : loads the given scalars via vld1q_*
//   - get0()             : extracts lane 0

struct v_uint8x16
{
    typedef uchar lane_type;
    enum { nlanes = 16 };

    v_uint8x16() {}
    explicit v_uint8x16(uint8x16_t v) : val(v) {}
    v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
               uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
    {
        uchar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
        val = vld1q_u8(v);
    }
    uchar get0() const
    {
        return vgetq_lane_u8(val, 0);
    }

    uint8x16_t val;
};

struct v_int8x16
{
    typedef schar lane_type;
    enum { nlanes = 16 };

    v_int8x16() {}
    explicit v_int8x16(int8x16_t v) : val(v) {}
    v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
              schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
    {
        schar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
        val = vld1q_s8(v);
    }
    schar get0() const
    {
        return vgetq_lane_s8(val, 0);
    }

    int8x16_t val;
};

struct v_uint16x8
{
    typedef ushort lane_type;
    enum { nlanes = 8 };

    v_uint16x8() {}
    explicit v_uint16x8(uint16x8_t v) : val(v) {}
    v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
    {
        ushort v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
        val = vld1q_u16(v);
    }
    ushort get0() const
    {
        return vgetq_lane_u16(val, 0);
    }

    uint16x8_t val;
};

struct v_int16x8
{
    typedef short lane_type;
    enum { nlanes = 8 };

    v_int16x8() {}
    explicit v_int16x8(int16x8_t v) : val(v) {}
    v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
    {
        short v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
        val = vld1q_s16(v);
    }
    short get0() const
    {
        return vgetq_lane_s16(val, 0);
    }

    int16x8_t val;
};

struct v_uint32x4
{
    typedef unsigned lane_type;
    enum { nlanes = 4 };

    v_uint32x4() {}
    explicit v_uint32x4(uint32x4_t v) : val(v) {}
    v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
    {
        unsigned v[] = {v0, v1, v2, v3};
        val = vld1q_u32(v);
    }
    unsigned get0() const
    {
        return vgetq_lane_u32(val, 0);
    }

    uint32x4_t val;
};

struct v_int32x4
{
    typedef int lane_type;
    enum { nlanes = 4 };

    v_int32x4() {}
    explicit v_int32x4(int32x4_t v) : val(v) {}
    v_int32x4(int v0, int v1, int v2, int v3)
    {
        int v[] = {v0, v1, v2, v3};
        val = vld1q_s32(v);
    }
    int get0() const
    {
        return vgetq_lane_s32(val, 0);
    }

    int32x4_t val;
};

struct v_float32x4
{
    typedef float lane_type;
    enum { nlanes = 4 };

    v_float32x4() {}
    explicit v_float32x4(float32x4_t v) : val(v) {}
    v_float32x4(float v0, float v1, float v2, float v3)
    {
        float v[] = {v0, v1, v2, v3};
        val = vld1q_f32(v);
    }
    float get0() const
    {
        return vgetq_lane_f32(val, 0);
    }

    float32x4_t val;
};

struct v_uint64x2
{
    typedef uint64 lane_type;
    enum { nlanes = 2 };

    v_uint64x2() {}
    explicit v_uint64x2(uint64x2_t v) : val(v) {}
    v_uint64x2(uint64 v0, uint64 v1)
    {
        uint64 v[] = {v0, v1};
        val = vld1q_u64(v);
    }
    uint64 get0() const
    {
        return vgetq_lane_u64(val, 0);
    }

    uint64x2_t val;
};

struct v_int64x2
{
    typedef int64 lane_type;
    enum { nlanes = 2 };

    v_int64x2() {}
    explicit v_int64x2(int64x2_t v) : val(v) {}
    v_int64x2(int64 v0, int64 v1)
    {
        int64 v[] = {v0, v1};
        val = vld1q_s64(v);
    }
    int64 get0() const
    {
        return vgetq_lane_s64(val, 0);
    }

    int64x2_t val;
};

#if CV_SIMD128_64F
struct v_float64x2
{
    typedef double lane_type;
    enum { nlanes = 2 };

    v_float64x2() {}
    explicit v_float64x2(float64x2_t v) : val(v) {}
    v_float64x2(double v0, double v1)
    {
        double v[] = {v0, v1};
        val = vld1q_f64(v);
    }
    double get0() const
    {
        return vgetq_lane_f64(val, 0);
    }

    float64x2_t val;
};
#endif
#if CV_FP16
// Workaround for old compilers that lack the f16 reinterpret intrinsics:
// provide them as plain bit-casts.
static inline int16x8_t vreinterpretq_s16_f16(float16x8_t a) { return (int16x8_t)a; }
static inline float16x8_t vreinterpretq_f16_s16(int16x8_t a) { return (float16x8_t)a; }
static inline int16x4_t vreinterpret_s16_f16(float16x4_t a) { return (int16x4_t)a; }
static inline float16x4_t vreinterpret_f16_s16(int16x4_t a) { return (float16x4_t)a; }

// Portable f16 load/store wrappers: go through s16 when the toolchain does
// not expose vld1q_f16/vst1q_f16 as macros.
static inline float16x8_t cv_vld1q_f16(const void* ptr)
{
#ifndef vld1q_f16 // APPLE compiler defines vld1_f16 as macro
    return vreinterpretq_f16_s16(vld1q_s16((const short*)ptr));
#else
    return vld1q_f16((const __fp16*)ptr);
#endif
}
static inline void cv_vst1q_f16(void* ptr, float16x8_t a)
{
#ifndef vst1q_f16 // APPLE compiler defines vst1_f16 as macro
    vst1q_s16((short*)ptr, vreinterpretq_s16_f16(a));
#else
    vst1q_f16((__fp16*)ptr, a);
#endif
}

static inline float16x4_t cv_vld1_f16(const void* ptr)
{
#ifndef vld1_f16 // APPLE compiler defines vld1_f16 as macro
    return vreinterpret_f16_s16(vld1_s16((const short*)ptr));
#else
    return vld1_f16((const __fp16*)ptr);
#endif
}
static inline void cv_vst1_f16(void* ptr, float16x4_t a)
{
#ifndef vst1_f16 // APPLE compiler defines vst1_f16 as macro
    vst1_s16((short*)ptr, vreinterpret_s16_f16(a));
#else
    vst1_f16((__fp16*)ptr, a);
#endif
}

// Half-precision register wrapper. Lanes are handled as raw IEEE-754
// binary16 bit patterns stored in `short` on the scalar side.
struct v_float16x8
{
    typedef short lane_type;
    enum { nlanes = 8 };

    v_float16x8() {}
    explicit v_float16x8(float16x8_t v) : val(v) {}
    v_float16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
    {
        short v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
        val = cv_vld1q_f16(v);
    }
    short get0() const
    {
        return vgetq_lane_s16(vreinterpretq_s16_f16(val), 0);
    }

    float16x8_t val;
};

inline v_float16x8 v_setzero_f16() { return v_float16x8(vreinterpretq_f16_s16(vdupq_n_s16((short)0))); }
inline v_float16x8 v_setall_f16(short v) { return v_float16x8(vreinterpretq_f16_s16(vdupq_n_s16(v))); }
#endif
// For each vector type: v_setzero_*/v_setall_* constructors, an identity
// vreinterpretq_<suffix>_<suffix> helper, and v_reinterpret_as_* bit-casts
// to every other integer/f32 vector type.
#define OPENCV_HAL_IMPL_NEON_INIT(_Tpv, _Tp, suffix) \
inline v_##_Tpv v_setzero_##suffix() { return v_##_Tpv(vdupq_n_##suffix((_Tp)0)); } \
inline v_##_Tpv v_setall_##suffix(_Tp v) { return v_##_Tpv(vdupq_n_##suffix(v)); } \
inline _Tpv##_t vreinterpretq_##suffix##_##suffix(_Tpv##_t v) { return v; } \
inline v_uint8x16 v_reinterpret_as_u8(const v_##_Tpv& v) { return v_uint8x16(vreinterpretq_u8_##suffix(v.val)); } \
inline v_int8x16 v_reinterpret_as_s8(const v_##_Tpv& v) { return v_int8x16(vreinterpretq_s8_##suffix(v.val)); } \
inline v_uint16x8 v_reinterpret_as_u16(const v_##_Tpv& v) { return v_uint16x8(vreinterpretq_u16_##suffix(v.val)); } \
inline v_int16x8 v_reinterpret_as_s16(const v_##_Tpv& v) { return v_int16x8(vreinterpretq_s16_##suffix(v.val)); } \
inline v_uint32x4 v_reinterpret_as_u32(const v_##_Tpv& v) { return v_uint32x4(vreinterpretq_u32_##suffix(v.val)); } \
inline v_int32x4 v_reinterpret_as_s32(const v_##_Tpv& v) { return v_int32x4(vreinterpretq_s32_##suffix(v.val)); } \
inline v_uint64x2 v_reinterpret_as_u64(const v_##_Tpv& v) { return v_uint64x2(vreinterpretq_u64_##suffix(v.val)); } \
inline v_int64x2 v_reinterpret_as_s64(const v_##_Tpv& v) { return v_int64x2(vreinterpretq_s64_##suffix(v.val)); } \
inline v_float32x4 v_reinterpret_as_f32(const v_##_Tpv& v) { return v_float32x4(vreinterpretq_f32_##suffix(v.val)); }

// Instantiate for all lane types.
OPENCV_HAL_IMPL_NEON_INIT(uint8x16, uchar, u8)
OPENCV_HAL_IMPL_NEON_INIT(int8x16, schar, s8)
OPENCV_HAL_IMPL_NEON_INIT(uint16x8, ushort, u16)
OPENCV_HAL_IMPL_NEON_INIT(int16x8, short, s16)
OPENCV_HAL_IMPL_NEON_INIT(uint32x4, unsigned, u32)
OPENCV_HAL_IMPL_NEON_INIT(int32x4, int, s32)
OPENCV_HAL_IMPL_NEON_INIT(uint64x2, uint64, u64)
OPENCV_HAL_IMPL_NEON_INIT(int64x2, int64, s64)
OPENCV_HAL_IMPL_NEON_INIT(float32x4, float, f32)
#if CV_SIMD128_64F
// v_reinterpret_as_f64 for every vector type, plus init/reinterpret support
// for v_float64x2 itself (AArch64 only).
#define OPENCV_HAL_IMPL_NEON_INIT_64(_Tpv, suffix) \
inline v_float64x2 v_reinterpret_as_f64(const v_##_Tpv& v) { return v_float64x2(vreinterpretq_f64_##suffix(v.val)); }
OPENCV_HAL_IMPL_NEON_INIT(float64x2, double, f64)
OPENCV_HAL_IMPL_NEON_INIT_64(uint8x16, u8)
OPENCV_HAL_IMPL_NEON_INIT_64(int8x16, s8)
OPENCV_HAL_IMPL_NEON_INIT_64(uint16x8, u16)
OPENCV_HAL_IMPL_NEON_INIT_64(int16x8, s16)
OPENCV_HAL_IMPL_NEON_INIT_64(uint32x4, u32)
OPENCV_HAL_IMPL_NEON_INIT_64(int32x4, s32)
OPENCV_HAL_IMPL_NEON_INIT_64(uint64x2, u64)
OPENCV_HAL_IMPL_NEON_INIT_64(int64x2, s64)
OPENCV_HAL_IMPL_NEON_INIT_64(float32x4, f32)
OPENCV_HAL_IMPL_NEON_INIT_64(float64x2, f64)
#endif
// Narrowing pack operations:
//   v_<pack>(a, b)           : narrow two wide vectors into one (saturating
//                              when `mov` is a vqmovn/vqmovun variant)
//   v_<pack>_store(ptr, a)   : narrow one wide vector and store the half-register
//   v_rshr_<pack>[_store]    : same, but with rounding right-shift by n first
#define OPENCV_HAL_IMPL_NEON_PACK(_Tpvec, _Tp, hreg, suffix, _Tpwvec, pack, mov, rshr) \
inline _Tpvec v_##pack(const _Tpwvec& a, const _Tpwvec& b) \
{ \
    hreg a1 = mov(a.val), b1 = mov(b.val); \
    return _Tpvec(vcombine_##suffix(a1, b1)); \
} \
inline void v_##pack##_store(_Tp* ptr, const _Tpwvec& a) \
{ \
    hreg a1 = mov(a.val); \
    vst1_##suffix(ptr, a1); \
} \
template<int n> inline \
_Tpvec v_rshr_##pack(const _Tpwvec& a, const _Tpwvec& b) \
{ \
    hreg a1 = rshr(a.val, n); \
    hreg b1 = rshr(b.val, n); \
    return _Tpvec(vcombine_##suffix(a1, b1)); \
} \
template<int n> inline \
void v_rshr_##pack##_store(_Tp* ptr, const _Tpwvec& a) \
{ \
    hreg a1 = rshr(a.val, n); \
    vst1_##suffix(ptr, a1); \
}

OPENCV_HAL_IMPL_NEON_PACK(v_uint8x16, uchar, uint8x8_t, u8, v_uint16x8, pack, vqmovn_u16, vqrshrn_n_u16)
OPENCV_HAL_IMPL_NEON_PACK(v_int8x16, schar, int8x8_t, s8, v_int16x8, pack, vqmovn_s16, vqrshrn_n_s16)
OPENCV_HAL_IMPL_NEON_PACK(v_uint16x8, ushort, uint16x4_t, u16, v_uint32x4, pack, vqmovn_u32, vqrshrn_n_u32)
OPENCV_HAL_IMPL_NEON_PACK(v_int16x8, short, int16x4_t, s16, v_int32x4, pack, vqmovn_s32, vqrshrn_n_s32)
OPENCV_HAL_IMPL_NEON_PACK(v_uint32x4, unsigned, uint32x2_t, u32, v_uint64x2, pack, vmovn_u64, vrshrn_n_u64)
OPENCV_HAL_IMPL_NEON_PACK(v_int32x4, int, int32x2_t, s32, v_int64x2, pack, vmovn_s64, vrshrn_n_s64)

// pack_u: signed wide -> unsigned narrow with unsigned saturation
OPENCV_HAL_IMPL_NEON_PACK(v_uint8x16, uchar, uint8x8_t, u8, v_int16x8, pack_u, vqmovun_s16, vqrshrun_n_s16)
OPENCV_HAL_IMPL_NEON_PACK(v_uint16x8, ushort, uint16x4_t, u16, v_int32x4, pack_u, vqmovun_s32, vqrshrun_n_s32)
// 4x4 matrix * vector: res = m0*v[0] + m1*v[1] + m2*v[2] + m3*v[3],
// where m0..m3 are the matrix columns. Uses lane-broadcast multiply-accumulate.
inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
                            const v_float32x4& m1, const v_float32x4& m2,
                            const v_float32x4& m3)
{
    float32x2_t vl = vget_low_f32(v.val), vh = vget_high_f32(v.val);
    float32x4_t res = vmulq_lane_f32(m0.val, vl, 0);
    res = vmlaq_lane_f32(res, m1.val, vl, 1);
    res = vmlaq_lane_f32(res, m2.val, vh, 0);
    res = vmlaq_lane_f32(res, m3.val, vh, 1);
    return v_float32x4(res);
}
// Affine transform: res = m0*v[0] + m1*v[1] + m2*v[2] + a
// (like v_matmul but the 4th column is an additive bias, ignoring v[3]).
inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
                               const v_float32x4& m1, const v_float32x4& m2,
                               const v_float32x4& a)
{
    float32x2_t vl = vget_low_f32(v.val), vh = vget_high_f32(v.val);
    float32x4_t res = vmulq_lane_f32(m0.val, vl, 0);
    res = vmlaq_lane_f32(res, m1.val, vl, 1);
    res = vmlaq_lane_f32(res, m2.val, vh, 0);
    res = vaddq_f32(res, a.val);
    return v_float32x4(res);
}
// Element-wise binary operator plus its compound-assignment form.
// Note: 8/16-bit +/- map to saturating vqadd/vqsub; 32/64-bit wrap.
#define OPENCV_HAL_IMPL_NEON_BIN_OP(bin_op, _Tpvec, intrin) \
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(intrin(a.val, b.val)); \
} \
inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
{ \
    a.val = intrin(a.val, b.val); \
    return a; \
}

OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint8x16, vqaddq_u8)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint8x16, vqsubq_u8)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int8x16, vqaddq_s8)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int8x16, vqsubq_s8)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint16x8, vqaddq_u16)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint16x8, vqsubq_u16)
OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_uint16x8, vmulq_u16)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int16x8, vqaddq_s16)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int16x8, vqsubq_s16)
OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_int16x8, vmulq_s16)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int32x4, vaddq_s32)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int32x4, vsubq_s32)
OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_int32x4, vmulq_s32)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint32x4, vaddq_u32)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint32x4, vsubq_u32)
OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_uint32x4, vmulq_u32)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_float32x4, vaddq_f32)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_float32x4, vsubq_f32)
OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_float32x4, vmulq_f32)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int64x2, vaddq_s64)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int64x2, vsubq_s64)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint64x2, vaddq_u64)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint64x2, vsubq_u64)
#if CV_SIMD128_64F
// AArch64 has hardware vector division; also enable the f64 arithmetic ops.
OPENCV_HAL_IMPL_NEON_BIN_OP(/, v_float32x4, vdivq_f32)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_float64x2, vaddq_f64)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_float64x2, vsubq_f64)
OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_float64x2, vmulq_f64)
OPENCV_HAL_IMPL_NEON_BIN_OP(/, v_float64x2, vdivq_f64)
#else
// ARMv7 has no vector divide: approximate 1/b with vrecpe and refine with
// two Newton-Raphson steps (vrecps), then multiply by a.
inline v_float32x4 operator / (const v_float32x4& a, const v_float32x4& b)
{
    float32x4_t reciprocal = vrecpeq_f32(b.val);
    reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal);
    reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal);
    return v_float32x4(vmulq_f32(a.val, reciprocal));
}
inline v_float32x4& operator /= (v_float32x4& a, const v_float32x4& b)
{
    float32x4_t reciprocal = vrecpeq_f32(b.val);
    reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal);
    reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal);
    a.val = vmulq_f32(a.val, reciprocal);
    return a;
}
#endif
// Widening multiply: c receives the products of the low halves of a and b,
// d the products of the high halves (no overflow possible).
inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b,
                         v_int32x4& c, v_int32x4& d)
{
    c.val = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val));
    d.val = vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val));
}

inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b,
                         v_uint32x4& c, v_uint32x4& d)
{
    c.val = vmull_u16(vget_low_u16(a.val), vget_low_u16(b.val));
    d.val = vmull_u16(vget_high_u16(a.val), vget_high_u16(b.val));
}

inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b,
                         v_uint64x2& c, v_uint64x2& d)
{
    c.val = vmull_u32(vget_low_u32(a.val), vget_low_u32(b.val));
    d.val = vmull_u32(vget_high_u32(a.val), vget_high_u32(b.val));
}
// Dot product of adjacent 16-bit lane pairs:
// result[i] = a[2i]*b[2i] + a[2i+1]*b[2i+1]. The vuzp de-interleave pairs
// the even/odd products so one vector add yields all four sums.
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
{
    int32x4_t c = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val));
    int32x4_t d = vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val));
    int32x4x2_t cd = vuzpq_s32(c, d);
    return v_int32x4(vaddq_s32(cd.val[0], cd.val[1]));
}

// Same as above with an additional accumulator c added to the result.
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
{
    v_int32x4 s = v_dotprod(a, b);
    return v_int32x4(vaddq_s32(s.val , c.val));
}
// Bitwise &, |, ^ (with compound forms) and ~ for integer vector types.
// ~ is implemented via u8 reinterpret since vmvnq exists only up to 32-bit lanes.
#define OPENCV_HAL_IMPL_NEON_LOGIC_OP(_Tpvec, suffix) \
OPENCV_HAL_IMPL_NEON_BIN_OP(&, _Tpvec, vandq_##suffix) \
OPENCV_HAL_IMPL_NEON_BIN_OP(|, _Tpvec, vorrq_##suffix) \
OPENCV_HAL_IMPL_NEON_BIN_OP(^, _Tpvec, veorq_##suffix) \
inline _Tpvec operator ~ (const _Tpvec& a) \
{ \
    return _Tpvec(vreinterpretq_##suffix##_u8(vmvnq_u8(vreinterpretq_u8_##suffix(a.val)))); \
}

OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint8x16, u8)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int8x16, s8)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint16x8, u16)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int16x8, s16)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint32x4, u32)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int32x4, s32)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint64x2, u64)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int64x2, s64)
// Bitwise ops on f32 vectors: NEON has no float bitwise intrinsics, so
// round-trip through s32 reinterprets (pure bit manipulation, no conversion).
#define OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(bin_op, intrin) \
inline v_float32x4 operator bin_op (const v_float32x4& a, const v_float32x4& b) \
{ \
    return v_float32x4(vreinterpretq_f32_s32(intrin(vreinterpretq_s32_f32(a.val), vreinterpretq_s32_f32(b.val)))); \
} \
inline v_float32x4& operator bin_op##= (v_float32x4& a, const v_float32x4& b) \
{ \
    a.val = vreinterpretq_f32_s32(intrin(vreinterpretq_s32_f32(a.val), vreinterpretq_s32_f32(b.val))); \
    return a; \
}

OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(&, vandq_s32)
OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(|, vorrq_s32)
OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(^, veorq_s32)

inline v_float32x4 operator ~ (const v_float32x4& a)
{
    return v_float32x4(vreinterpretq_f32_s32(vmvnq_s32(vreinterpretq_s32_f32(a.val))));
}
#if CV_SIMD128_64F
// AArch64: hardware vector sqrt.
inline v_float32x4 v_sqrt(const v_float32x4& x)
{
    return v_float32x4(vsqrtq_f32(x.val));
}

inline v_float32x4 v_invsqrt(const v_float32x4& x)
{
    v_float32x4 one = v_setall_f32(1.0f);
    return one / v_sqrt(x);
}
#else
// ARMv7: approximate rsqrt (vrsqrte) refined by two Newton-Raphson steps
// (vrsqrts); sqrt(x) = x * rsqrt(x). Inputs are clamped to FLT_MIN so that
// rsqrt(0) = inf does not produce NaN in the final multiply.
inline v_float32x4 v_sqrt(const v_float32x4& x)
{
    float32x4_t x1 = vmaxq_f32(x.val, vdupq_n_f32(FLT_MIN));
    float32x4_t e = vrsqrteq_f32(x1);
    e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x1, e), e), e);
    e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x1, e), e), e);
    return v_float32x4(vmulq_f32(x.val, e));
}

inline v_float32x4 v_invsqrt(const v_float32x4& x)
{
    float32x4_t e = vrsqrteq_f32(x.val);
    e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x.val, e), e), e);
    e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x.val, e), e), e);
    return v_float32x4(e);
}
#endif
// Absolute value of a signed vector, returned as the unsigned counterpart
// type (|INT_MIN|-style lane values fit once reinterpreted as unsigned).
#define OPENCV_HAL_IMPL_NEON_ABS(_Tpuvec, _Tpsvec, usuffix, ssuffix) \
inline _Tpuvec v_abs(const _Tpsvec& a) { return v_reinterpret_as_##usuffix(_Tpsvec(vabsq_##ssuffix(a.val))); }

OPENCV_HAL_IMPL_NEON_ABS(v_uint8x16, v_int8x16, u8, s8)
OPENCV_HAL_IMPL_NEON_ABS(v_uint16x8, v_int16x8, u16, s16)
OPENCV_HAL_IMPL_NEON_ABS(v_uint32x4, v_int32x4, u32, s32)

// Float absolute value (clears the sign bit lane-wise).
inline v_float32x4 v_abs(v_float32x4 x)
{ return v_float32x4(vabsq_f32(x.val)); }
#if CV_SIMD128_64F
// Bitwise ops on f64 vectors via s64 reinterprets (AArch64 only).
#define OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(bin_op, intrin) \
inline v_float64x2 operator bin_op (const v_float64x2& a, const v_float64x2& b) \
{ \
    return v_float64x2(vreinterpretq_f64_s64(intrin(vreinterpretq_s64_f64(a.val), vreinterpretq_s64_f64(b.val)))); \
} \
inline v_float64x2& operator bin_op##= (v_float64x2& a, const v_float64x2& b) \
{ \
    a.val = vreinterpretq_f64_s64(intrin(vreinterpretq_s64_f64(a.val), vreinterpretq_s64_f64(b.val))); \
    return a; \
}

OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(&, vandq_s64)
OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(|, vorrq_s64)
OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(^, veorq_s64)

inline v_float64x2 operator ~ (const v_float64x2& a)
{
    return v_float64x2(vreinterpretq_f64_s32(vmvnq_s32(vreinterpretq_s32_f64(a.val))));
}

inline v_float64x2 v_sqrt(const v_float64x2& x)
{
    return v_float64x2(vsqrtq_f64(x.val));
}

inline v_float64x2 v_invsqrt(const v_float64x2& x)
{
    v_float64x2 one = v_setall_f64(1.0);
    return one / v_sqrt(x);
}

inline v_float64x2 v_abs(v_float64x2 x)
{ return v_float64x2(vabsq_f64(x.val)); }
#endif
651 // TODO: exp, log, sin, cos
// Generic element-wise binary function wrapper (min/max/absdiff/...).
#define OPENCV_HAL_IMPL_NEON_BIN_FUNC(_Tpvec, func, intrin) \
inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(intrin(a.val, b.val)); \
}

OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_min, vminq_u8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_max, vmaxq_u8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_min, vminq_s8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_max, vmaxq_s8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_min, vminq_u16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_max, vmaxq_u16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_min, vminq_s16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_max, vmaxq_s16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_min, vminq_u32)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_max, vmaxq_u32)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int32x4, v_min, vminq_s32)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int32x4, v_max, vmaxq_s32)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_min, vminq_f32)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_max, vmaxq_f32)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float64x2, v_min, vminq_f64)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float64x2, v_max, vmaxq_f64)
#endif
#if CV_SIMD128_64F
// NEON provides no 64-bit vmvnq; emulate bitwise NOT by XOR-ing with
// an all-ones register (used by the != comparison operators below).
inline int64x2_t vmvnq_s64(int64x2_t a)
{
    int64x2_t vx = vreinterpretq_s64_u32(vdupq_n_u32(0xFFFFFFFF));
    return veorq_s64(a, vx);
}
inline uint64x2_t vmvnq_u64(uint64x2_t a)
{
    uint64x2_t vx = vreinterpretq_u64_u32(vdupq_n_u32(0xFFFFFFFF));
    return veorq_u64(a, vx);
}
#endif
// Comparison operators: lanes become all-ones on true, all-zeros on false.
// `cast` converts the unsigned mask produced by NEON back to the operand type;
// `not_suffix` selects the vmvnq variant used to negate == into !=.
#define OPENCV_HAL_IMPL_NEON_INT_CMP_OP(_Tpvec, cast, suffix, not_suffix) \
inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(cast(vceqq_##suffix(a.val, b.val))); } \
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(cast(vmvnq_##not_suffix(vceqq_##suffix(a.val, b.val)))); } \
inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(cast(vcltq_##suffix(a.val, b.val))); } \
inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(cast(vcgtq_##suffix(a.val, b.val))); } \
inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(cast(vcleq_##suffix(a.val, b.val))); } \
inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(cast(vcgeq_##suffix(a.val, b.val))); }

OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint8x16, OPENCV_HAL_NOP, u8, u8)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int8x16, vreinterpretq_s8_u8, s8, u8)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint16x8, OPENCV_HAL_NOP, u16, u16)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int16x8, vreinterpretq_s16_u16, s16, u16)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint32x4, OPENCV_HAL_NOP, u32, u32)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int32x4, vreinterpretq_s32_u32, s32, u32)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_float32x4, vreinterpretq_f32_u32, f32, u32)
#if CV_SIMD128_64F
// 64-bit compares (vceqq_u64 etc.) and f64 exist only on AArch64.
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint64x2, OPENCV_HAL_NOP, u64, u64)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int64x2, vreinterpretq_s64_u64, s64, u64)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_float64x2, vreinterpretq_f64_u64, f64, u64)
#endif
// Wrap-around (modulo 2^N) add/sub: unlike operator+/- above, which saturate
// for 8/16-bit lanes, these use the plain non-saturating NEON add/sub.
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_add_wrap, vaddq_u8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_add_wrap, vaddq_s8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_add_wrap, vaddq_u16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_add_wrap, vaddq_s16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_sub_wrap, vsubq_u8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_sub_wrap, vsubq_s8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_sub_wrap, vsubq_u16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_sub_wrap, vsubq_s16)
// Absolute difference |a - b| for unsigned and float vectors.
// TODO: absdiff for signed integers
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_absdiff, vabdq_u8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_absdiff, vabdq_u16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_absdiff, vabdq_u32)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_absdiff, vabdq_f32)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float64x2, v_absdiff, vabdq_f64)
#endif
// Binary function whose result type differs from its operand type
// (signed absdiff returns the unsigned counterpart via `cast`).
#define OPENCV_HAL_IMPL_NEON_BIN_FUNC2(_Tpvec, _Tpvec2, cast, func, intrin) \
inline _Tpvec2 func(const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec2(cast(intrin(a.val, b.val))); \
}

OPENCV_HAL_IMPL_NEON_BIN_FUNC2(v_int8x16, v_uint8x16, vreinterpretq_u8_s8, v_absdiff, vabdq_s8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC2(v_int16x8, v_uint16x8, vreinterpretq_u16_s16, v_absdiff, vabdq_s16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC2(v_int32x4, v_uint32x4, vreinterpretq_u32_s32, v_absdiff, vabdq_s32)
// sqrt(a^2 + b^2), computed with a multiply-accumulate then v_sqrt.
inline v_float32x4 v_magnitude(const v_float32x4& a, const v_float32x4& b)
{
    v_float32x4 x(vmlaq_f32(vmulq_f32(a.val, a.val), b.val, b.val));
    return v_sqrt(x);
}

// a^2 + b^2 (no square root).
inline v_float32x4 v_sqr_magnitude(const v_float32x4& a, const v_float32x4& b)
{
    return v_float32x4(vmlaq_f32(vmulq_f32(a.val, a.val), b.val, b.val));
}
// Fused (or fastest available) multiply-add: a*b + c.
inline v_float32x4 v_fma(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
{
#if CV_SIMD128_64F
    // ARMv8, which adds support for 64-bit floating-point (so CV_SIMD128_64F is defined),
    // also adds FMA support both for single- and double-precision floating-point vectors
    return v_float32x4(vfmaq_f32(c.val, a.val, b.val));
#else
    return v_float32x4(vmlaq_f32(c.val, a.val, b.val));
#endif
}

inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
{
    return v_int32x4(vmlaq_s32(c.val, a.val, b.val));
}

inline v_float32x4 v_muladd(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
{
    return v_fma(a, b, c);
}

inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
{
    return v_fma(a, b, c);
}
#if CV_SIMD128_64F
// Double-precision magnitude / FMA counterparts (AArch64 only).
inline v_float64x2 v_magnitude(const v_float64x2& a, const v_float64x2& b)
{
    v_float64x2 x(vaddq_f64(vmulq_f64(a.val, a.val), vmulq_f64(b.val, b.val)));
    return v_sqrt(x);
}

inline v_float64x2 v_sqr_magnitude(const v_float64x2& a, const v_float64x2& b)
{
    return v_float64x2(vaddq_f64(vmulq_f64(a.val, a.val), vmulq_f64(b.val, b.val)));
}

inline v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
{
    return v_float64x2(vfmaq_f64(c.val, a.val, b.val));
}

inline v_float64x2 v_muladd(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
{
    return v_fma(a, b, c);
}
#endif
// Shift operators. The runtime-count << and >> trade efficiency for
// convenience: NEON only shifts by immediates or per-lane register counts,
// so the scalar count is broadcast, and right shift uses vshlq with a
// negated count. The template v_shl/v_shr/v_rshr forms use immediate-shift
// intrinsics (v_rshr rounds toward nearest).
#define OPENCV_HAL_IMPL_NEON_SHIFT_OP(_Tpvec, suffix, _Tps, ssuffix) \
inline _Tpvec operator << (const _Tpvec& a, int n) \
{ return _Tpvec(vshlq_##suffix(a.val, vdupq_n_##ssuffix((_Tps)n))); } \
inline _Tpvec operator >> (const _Tpvec& a, int n) \
{ return _Tpvec(vshlq_##suffix(a.val, vdupq_n_##ssuffix((_Tps)-n))); } \
template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
{ return _Tpvec(vshlq_n_##suffix(a.val, n)); } \
template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
{ return _Tpvec(vshrq_n_##suffix(a.val, n)); } \
template<int n> inline _Tpvec v_rshr(const _Tpvec& a) \
{ return _Tpvec(vrshrq_n_##suffix(a.val, n)); }

OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint8x16, u8, schar, s8)
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int8x16, s8, schar, s8)
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint16x8, u16, short, s16)
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int16x8, s16, short, s16)
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint32x4, u32, int, s32)
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int32x4, s32, int, s32)
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint64x2, u64, int64, s64)
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int64x2, s64, int64, s64)
// Lane rotations built on vextq. Single-vector forms shift zeros in;
// two-vector forms shift lanes of the second vector in. The n == 0
// specializations exist because vextq requires 0 < imm < nlanes.
#define OPENCV_HAL_IMPL_NEON_ROTATE_OP(_Tpvec, suffix) \
template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a) \
{ return _Tpvec(vextq_##suffix(a.val, vdupq_n_##suffix(0), n)); } \
template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a) \
{ return _Tpvec(vextq_##suffix(vdupq_n_##suffix(0), a.val, _Tpvec::nlanes - n)); } \
template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a) \
{ return a; } \
template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(vextq_##suffix(a.val, b.val, n)); } \
template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(vextq_##suffix(b.val, a.val, _Tpvec::nlanes - n)); } \
template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a, const _Tpvec& b) \
{ CV_UNUSED(b); return a; }

OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_uint8x16, u8)
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_int8x16, s8)
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_uint16x8, u16)
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_int16x8, s16)
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_uint32x4, u32)
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_int32x4, s32)
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_float32x4, f32)
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_uint64x2, u64)
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_int64x2, s64)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_float64x2, f64)
#endif
// Load/store: NEON vld1q/vst1q have no alignment requirement, so the
// "aligned" variants are identical to the unaligned ones.
#define OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(_Tpvec, _Tp, suffix) \
inline _Tpvec v_load(const _Tp* ptr) \
{ return _Tpvec(vld1q_##suffix(ptr)); } \
inline _Tpvec v_load_aligned(const _Tp* ptr) \
{ return _Tpvec(vld1q_##suffix(ptr)); } \
inline _Tpvec v_load_low(const _Tp* ptr) \
{ return _Tpvec(vcombine_##suffix(vld1_##suffix(ptr), vdup_n_##suffix((_Tp)0))); } \
inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
{ return _Tpvec(vcombine_##suffix(vld1_##suffix(ptr0), vld1_##suffix(ptr1))); } \
inline void v_store(_Tp* ptr, const _Tpvec& a) \
{ vst1q_##suffix(ptr, a.val); } \
inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
{ vst1q_##suffix(ptr, a.val); } \
inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
{ vst1_##suffix(ptr, vget_low_##suffix(a.val)); } \
inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
{ vst1_##suffix(ptr, vget_high_##suffix(a.val)); }

OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint8x16, uchar, u8)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int8x16, schar, s8)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint16x8, ushort, u16)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int16x8, short, s16)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint32x4, unsigned, u32)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int32x4, int, s32)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint64x2, uint64, u64)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int64x2, int64, s64)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float32x4, float, f32)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float64x2, double, f64)
#endif
#if CV_FP16
// Workaround for old compilers: half-precision load/store via the
// cv_vld1q_f16/cv_vst1q_f16 wrappers (scalar side uses raw binary16
// bit patterns in short).
inline v_float16x8 v_load_f16(const short* ptr)
{ return v_float16x8(cv_vld1q_f16(ptr)); }
inline v_float16x8 v_load_f16_aligned(const short* ptr)
{ return v_float16x8(cv_vld1q_f16(ptr)); }

inline void v_store(short* ptr, const v_float16x8& a)
{ cv_vst1q_f16(ptr, a.val); }
inline void v_store_aligned(short* ptr, const v_float16x8& a)
{ cv_vst1q_f16(ptr, a.val); }
#endif
// Horizontal reduction over 8 lanes: pairwise-fold the two half-registers,
// then fold twice more (8 -> 4 -> 2 -> 1) and extract lane 0.
#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
inline scalartype v_reduce_##func(const _Tpvec& a) \
{ \
    _Tpnvec##_t a0 = vp##vectorfunc##_##suffix(vget_low_##suffix(a.val), vget_high_##suffix(a.val)); \
    a0 = vp##vectorfunc##_##suffix(a0, a0); \
    return (scalartype)vget_lane_##suffix(vp##vectorfunc##_##suffix(a0, a0),0); \
}

OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_uint16x8, uint16x4, unsigned short, sum, add, u16)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_uint16x8, uint16x4, unsigned short, max, max, u16)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_uint16x8, uint16x4, unsigned short, min, min, u16)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_int16x8, int16x4, short, sum, add, s16)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_int16x8, int16x4, short, max, max, s16)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_int16x8, int16x4, short, min, min, s16)
// Horizontal reduction over 4 lanes: one pairwise fold of the two halves
// (4 -> 2), then a final pairwise fold of a0 with itself; lane 0 holds the
// result. (The second argument of the final vp-op does not affect lane 0,
// so a0 is folded with a0 rather than re-reading the input.)
#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
inline scalartype v_reduce_##func(const _Tpvec& a) \
{ \
    _Tpnvec##_t a0 = vp##vectorfunc##_##suffix(vget_low_##suffix(a.val), vget_high_##suffix(a.val)); \
    return (scalartype)vget_lane_##suffix(vp##vectorfunc##_##suffix(a0, a0),0); \
}

OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, uint32x2, unsigned, sum, add, u32)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, uint32x2, unsigned, max, max, u32)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, uint32x2, unsigned, min, min, u32)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_int32x4, int32x2, int, sum, add, s32)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_int32x4, int32x2, int, max, max, s32)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_int32x4, int32x2, int, min, min, s32)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, sum, add, f32)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, max, max, f32)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, min, min, f32)
// v_reduce_sum4: returns { sum(a), sum(b), sum(c), sum(d) } in one vector.
// vtrnq interleaves lane pairs of two inputs so a single vaddq yields all the
// intra-pair sums; recombining low/high halves and adding once more completes
// the 4-lane reduction for all four inputs simultaneously.
// NOTE(review): the function's brace lines appear lost in this extract.
930 inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
931 const v_float32x4& c, const v_float32x4& d)
933 float32x4x2_t ab = vtrnq_f32(a.val, b.val);
934 float32x4x2_t cd = vtrnq_f32(c.val, d.val);
936 float32x4_t u0 = vaddq_f32(ab.val[0], ab.val[1]); // a0+a1 b0+b1 a2+a3 b2+b3
937 float32x4_t u1 = vaddq_f32(cd.val[0], cd.val[1]); // c0+c1 d0+d1 c2+c3 d2+d3
939 float32x4_t v0 = vcombine_f32(vget_low_f32(u0), vget_low_f32(u1));
940 float32x4_t v1 = vcombine_f32(vget_high_f32(u0), vget_high_f32(u1));
942 return v_float32x4(vaddq_f32(v0, v1));
// Per-32-bit-lane population count: vcntq_u8 counts bits per byte, then two
// pairwise widening adds (vpaddlq) accumulate bytes into 16-bit and then
// 32-bit lanes. The cast argument reinterprets the source type as bytes.
945 #define OPENCV_HAL_IMPL_NEON_POPCOUNT(_Tpvec, cast) \
946 inline v_uint32x4 v_popcount(const _Tpvec& a) \
948 uint8x16_t t = vcntq_u8(cast(a.val)); \
949 uint16x8_t t0 = vpaddlq_u8(t); /* 16 -> 8 */ \
950 uint32x4_t t1 = vpaddlq_u16(t0); /* 8 -> 4 */ \
951 return v_uint32x4(t1); \
954 OPENCV_HAL_IMPL_NEON_POPCOUNT(v_uint8x16, OPENCV_HAL_NOP)
955 OPENCV_HAL_IMPL_NEON_POPCOUNT(v_uint16x8, vreinterpretq_u8_u16)
956 OPENCV_HAL_IMPL_NEON_POPCOUNT(v_uint32x4, vreinterpretq_u8_u32)
957 OPENCV_HAL_IMPL_NEON_POPCOUNT(v_int8x16, vreinterpretq_u8_s8)
958 OPENCV_HAL_IMPL_NEON_POPCOUNT(v_int16x8, vreinterpretq_u8_s16)
959 OPENCV_HAL_IMPL_NEON_POPCOUNT(v_int32x4, vreinterpretq_u8_s32)
// v_signmask: packs the most significant bit of every lane into an int
// bitmask (lane i -> bit i). Pattern: shift MSB down to bit 0, shift each
// lane left by its index (per-lane variable shift via vshlq + an index
// vector built with vcreate), then widening pairwise adds fold the lanes.
961 inline int v_signmask(const v_uint8x16& a)
963 int8x8_t m0 = vcreate_s8(CV_BIG_UINT(0x0706050403020100));
964 uint8x16_t v0 = vshlq_u8(vshrq_n_u8(a.val, 7), vcombine_s8(m0, m0));
965 uint64x2_t v1 = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(v0)));
// Low half supplies bits 0..7, high half bits 8..15.
966 return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 8);
// Signed/float variants reuse the unsigned implementation bit-for-bit.
968 inline int v_signmask(const v_int8x16& a)
969 { return v_signmask(v_reinterpret_as_u8(a)); }
971 inline int v_signmask(const v_uint16x8& a)
973 int16x4_t m0 = vcreate_s16(CV_BIG_UINT(0x0003000200010000));
974 uint16x8_t v0 = vshlq_u16(vshrq_n_u16(a.val, 15), vcombine_s16(m0, m0));
975 uint64x2_t v1 = vpaddlq_u32(vpaddlq_u16(v0));
976 return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 4);
978 inline int v_signmask(const v_int16x8& a)
979 { return v_signmask(v_reinterpret_as_u16(a)); }
981 inline int v_signmask(const v_uint32x4& a)
983 int32x2_t m0 = vcreate_s32(CV_BIG_UINT(0x0000000100000000));
984 uint32x4_t v0 = vshlq_u32(vshrq_n_u32(a.val, 31), vcombine_s32(m0, m0));
985 uint64x2_t v1 = vpaddlq_u32(v0);
986 return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 2);
988 inline int v_signmask(const v_int32x4& a)
989 { return v_signmask(v_reinterpret_as_u32(a)); }
990 inline int v_signmask(const v_float32x4& a)
991 { return v_signmask(v_reinterpret_as_u32(a)); }
// 2-lane case: lane indices are 0 and 0, so the shift vector is all-zero and
// only the final recombination shifts lane 1 into bit 1.
993 inline int v_signmask(const v_uint64x2& a)
995 int64x1_t m0 = vdup_n_s64(0);
996 uint64x2_t v0 = vshlq_u64(vshrq_n_u64(a.val, 63), vcombine_s64(m0, m0));
997 return (int)vgetq_lane_u64(v0, 0) + ((int)vgetq_lane_u64(v0, 1) << 1);
999 inline int v_signmask(const v_float64x2& a)
1000 { return v_signmask(v_reinterpret_as_u64(a)); }
// v_check_all / v_check_any: test the sign bit (bit `shift`) of every lane.
// check_all: invert, keep only former sign bits, OR the two 64-bit halves —
//            zero iff every lane had its sign bit set.
// check_any: keep sign bits directly — non-zero iff any lane had it set.
// NOTE(review): macro braces and the #if CV_SIMD128_64F guard around the
// uint64x2 instantiation appear lost in this extract; also vmvnq has no u64
// form in ACLE, so the u64 instantiation likely had special handling upstream
// — confirm against the original header.
1003 #define OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(_Tpvec, suffix, shift) \
1004 inline bool v_check_all(const v_##_Tpvec& a) \
1006 _Tpvec##_t v0 = vshrq_n_##suffix(vmvnq_##suffix(a.val), shift); \
1007 uint64x2_t v1 = vreinterpretq_u64_##suffix(v0); \
1008 return (vgetq_lane_u64(v1, 0) | vgetq_lane_u64(v1, 1)) == 0; \
1010 inline bool v_check_any(const v_##_Tpvec& a) \
1012 _Tpvec##_t v0 = vshrq_n_##suffix(a.val, shift); \
1013 uint64x2_t v1 = vreinterpretq_u64_##suffix(v0); \
1014 return (vgetq_lane_u64(v1, 0) | vgetq_lane_u64(v1, 1)) != 0; \
1017 OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint8x16, u8, 7)
1018 OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint16x8, u16, 15)
1019 OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint32x4, u32, 31)
1021 OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint64x2, u64, 63)
1024 inline bool v_check_all(const v_int8x16& a)
1025 { return v_check_all(v_reinterpret_as_u8(a)); }
1026 inline bool v_check_all(const v_int16x8& a)
1027 { return v_check_all(v_reinterpret_as_u16(a)); }
1028 inline bool v_check_all(const v_int32x4& a)
1029 { return v_check_all(v_reinterpret_as_u32(a)); }
1030 inline bool v_check_all(const v_float32x4& a)
1031 { return v_check_all(v_reinterpret_as_u32(a)); }
1033 inline bool v_check_any(const v_int8x16& a)
1034 { return v_check_any(v_reinterpret_as_u8(a)); }
1035 inline bool v_check_any(const v_int16x8& a)
1036 { return v_check_any(v_reinterpret_as_u16(a)); }
1037 inline bool v_check_any(const v_int32x4& a)
1038 { return v_check_any(v_reinterpret_as_u32(a)); }
1039 inline bool v_check_any(const v_float32x4& a)
1040 { return v_check_any(v_reinterpret_as_u32(a)); }
1043 inline bool v_check_all(const v_int64x2& a)
1044 { return v_check_all(v_reinterpret_as_u64(a)); }
1045 inline bool v_check_all(const v_float64x2& a)
1046 { return v_check_all(v_reinterpret_as_u64(a)); }
1047 inline bool v_check_any(const v_int64x2& a)
1048 { return v_check_any(v_reinterpret_as_u64(a)); }
1049 inline bool v_check_any(const v_float64x2& a)
1050 { return v_check_any(v_reinterpret_as_u64(a)); }
// v_select: per-bit blend — vbslq picks bits from `a` where the mask bit is
// set, from `b` otherwise. The mask is reinterpreted to the unsigned type
// vbslq expects for its selector operand.
1053 #define OPENCV_HAL_IMPL_NEON_SELECT(_Tpvec, suffix, usuffix) \
1054 inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
1056 return _Tpvec(vbslq_##suffix(vreinterpretq_##usuffix##_##suffix(mask.val), a.val, b.val)); \
1059 OPENCV_HAL_IMPL_NEON_SELECT(v_uint8x16, u8, u8)
1060 OPENCV_HAL_IMPL_NEON_SELECT(v_int8x16, s8, u8)
1061 OPENCV_HAL_IMPL_NEON_SELECT(v_uint16x8, u16, u16)
1062 OPENCV_HAL_IMPL_NEON_SELECT(v_int16x8, s16, u16)
1063 OPENCV_HAL_IMPL_NEON_SELECT(v_uint32x4, u32, u32)
1064 OPENCV_HAL_IMPL_NEON_SELECT(v_int32x4, s32, u32)
1065 OPENCV_HAL_IMPL_NEON_SELECT(v_float32x4, f32, u32)
// NOTE(review): the f64 instantiation is presumably CV_SIMD128_64F-guarded
// upstream; the guard lines appear missing from this extract.
1067 OPENCV_HAL_IMPL_NEON_SELECT(v_float64x2, f64, u64)
// v_expand / v_load_expand: widen each lane to the next larger element type
// (vmovl doubles the element width; low and high halves go to b0 and b1).
1070 #define OPENCV_HAL_IMPL_NEON_EXPAND(_Tpvec, _Tpwvec, _Tp, suffix) \
1071 inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
1073 b0.val = vmovl_##suffix(vget_low_##suffix(a.val)); \
1074 b1.val = vmovl_##suffix(vget_high_##suffix(a.val)); \
1076 inline _Tpwvec v_load_expand(const _Tp* ptr) \
1078 return _Tpwvec(vmovl_##suffix(vld1_##suffix(ptr))); \
1081 OPENCV_HAL_IMPL_NEON_EXPAND(v_uint8x16, v_uint16x8, uchar, u8)
1082 OPENCV_HAL_IMPL_NEON_EXPAND(v_int8x16, v_int16x8, schar, s8)
1083 OPENCV_HAL_IMPL_NEON_EXPAND(v_uint16x8, v_uint32x4, ushort, u16)
1084 OPENCV_HAL_IMPL_NEON_EXPAND(v_int16x8, v_int32x4, short, s16)
1085 OPENCV_HAL_IMPL_NEON_EXPAND(v_uint32x4, v_uint64x2, uint, u32)
1086 OPENCV_HAL_IMPL_NEON_EXPAND(v_int32x4, v_int64x2, int, s32)
// v_load_expand_q: load 4 bytes and widen them twice (8 -> 16 -> 32 bit).
// NOTE(review): reading the 4 bytes via *(unsigned*)ptr is a strict-aliasing
// and potential alignment violation on paper; a memcpy-based load would be
// safer. Left as-is here to avoid behavior changes in a damaged extract.
1088 inline v_uint32x4 v_load_expand_q(const uchar* ptr)
1090 uint8x8_t v0 = vcreate_u8(*(unsigned*)ptr);
1091 uint16x4_t v1 = vget_low_u16(vmovl_u8(v0));
1092 return v_uint32x4(vmovl_u16(v1));
1095 inline v_int32x4 v_load_expand_q(const schar* ptr)
1097 int8x8_t v0 = vcreate_s8(*(unsigned*)ptr);
1098 int16x4_t v1 = vget_low_s16(vmovl_s8(v0));
1099 return v_int32x4(vmovl_s16(v1));
// v_zip / v_combine_low / v_combine_high / v_recombine.
// Two alternative macro bodies: on AArch64 the dedicated vzip1q/vzip2q
// instructions are used; the second body (AArch32 fallback) uses the legacy
// vzipq which returns both halves in a x2_t struct.
// NOTE(review): the #else / #endif lines separating the two #define bodies
// appear to have been lost in this extract — as written the macro is defined
// twice; restore the conditional from the original header.
1102 #if defined(__aarch64__)
1103 #define OPENCV_HAL_IMPL_NEON_UNPACKS(_Tpvec, suffix) \
1104 inline void v_zip(const v_##_Tpvec& a0, const v_##_Tpvec& a1, v_##_Tpvec& b0, v_##_Tpvec& b1) \
1106 b0.val = vzip1q_##suffix(a0.val, a1.val); \
1107 b1.val = vzip2q_##suffix(a0.val, a1.val); \
1109 inline v_##_Tpvec v_combine_low(const v_##_Tpvec& a, const v_##_Tpvec& b) \
1111 return v_##_Tpvec(vcombine_##suffix(vget_low_##suffix(a.val), vget_low_##suffix(b.val))); \
1113 inline v_##_Tpvec v_combine_high(const v_##_Tpvec& a, const v_##_Tpvec& b) \
1115 return v_##_Tpvec(vcombine_##suffix(vget_high_##suffix(a.val), vget_high_##suffix(b.val))); \
1117 inline void v_recombine(const v_##_Tpvec& a, const v_##_Tpvec& b, v_##_Tpvec& c, v_##_Tpvec& d) \
1119 c.val = vcombine_##suffix(vget_low_##suffix(a.val), vget_low_##suffix(b.val)); \
1120 d.val = vcombine_##suffix(vget_high_##suffix(a.val), vget_high_##suffix(b.val)); \
1123 #define OPENCV_HAL_IMPL_NEON_UNPACKS(_Tpvec, suffix) \
1124 inline void v_zip(const v_##_Tpvec& a0, const v_##_Tpvec& a1, v_##_Tpvec& b0, v_##_Tpvec& b1) \
1126 _Tpvec##x2_t p = vzipq_##suffix(a0.val, a1.val); \
1127 b0.val = p.val[0]; \
1128 b1.val = p.val[1]; \
1130 inline v_##_Tpvec v_combine_low(const v_##_Tpvec& a, const v_##_Tpvec& b) \
1132 return v_##_Tpvec(vcombine_##suffix(vget_low_##suffix(a.val), vget_low_##suffix(b.val))); \
1134 inline v_##_Tpvec v_combine_high(const v_##_Tpvec& a, const v_##_Tpvec& b) \
1136 return v_##_Tpvec(vcombine_##suffix(vget_high_##suffix(a.val), vget_high_##suffix(b.val))); \
1138 inline void v_recombine(const v_##_Tpvec& a, const v_##_Tpvec& b, v_##_Tpvec& c, v_##_Tpvec& d) \
1140 c.val = vcombine_##suffix(vget_low_##suffix(a.val), vget_low_##suffix(b.val)); \
1141 d.val = vcombine_##suffix(vget_high_##suffix(a.val), vget_high_##suffix(b.val)); \
1145 OPENCV_HAL_IMPL_NEON_UNPACKS(uint8x16, u8)
1146 OPENCV_HAL_IMPL_NEON_UNPACKS(int8x16, s8)
1147 OPENCV_HAL_IMPL_NEON_UNPACKS(uint16x8, u16)
1148 OPENCV_HAL_IMPL_NEON_UNPACKS(int16x8, s16)
1149 OPENCV_HAL_IMPL_NEON_UNPACKS(uint32x4, u32)
1150 OPENCV_HAL_IMPL_NEON_UNPACKS(int32x4, s32)
1151 OPENCV_HAL_IMPL_NEON_UNPACKS(float32x4, f32)
1153 OPENCV_HAL_IMPL_NEON_UNPACKS(float64x2, f64)
// v_extract<s>: concatenates a and b and returns 16 bytes starting at lane s
// (vextq). NOTE(review): the body references a non-type parameter `s`, so the
// original must declare `template<int s>` on a line missing from this extract.
1156 #define OPENCV_HAL_IMPL_NEON_EXTRACT(_Tpvec, suffix) \
1158 inline v_##_Tpvec v_extract(const v_##_Tpvec& a, const v_##_Tpvec& b) \
1160 return v_##_Tpvec(vextq_##suffix(a.val, b.val, s)); \
1163 OPENCV_HAL_IMPL_NEON_EXTRACT(uint8x16, u8)
1164 OPENCV_HAL_IMPL_NEON_EXTRACT(int8x16, s8)
1165 OPENCV_HAL_IMPL_NEON_EXTRACT(uint16x8, u16)
1166 OPENCV_HAL_IMPL_NEON_EXTRACT(int16x8, s16)
1167 OPENCV_HAL_IMPL_NEON_EXTRACT(uint32x4, u32)
1168 OPENCV_HAL_IMPL_NEON_EXTRACT(int32x4, s32)
1169 OPENCV_HAL_IMPL_NEON_EXTRACT(uint64x2, u64)
1170 OPENCV_HAL_IMPL_NEON_EXTRACT(int64x2, s64)
1171 OPENCV_HAL_IMPL_NEON_EXTRACT(float32x4, f32)
1173 OPENCV_HAL_IMPL_NEON_EXTRACT(float64x2, f64)
// AArch64 path: FCVTNS converts float -> int32 rounding to nearest, ties to
// even, in a single instruction.
// NOTE(review): the declaration of `result` and the asm output/input operand
// lines appear to have been lost in this extract (only the clobber line
// remains) — restore from the original header; also the #if guard selecting
// this version over the fallback below seems missing.
1177 inline v_int32x4 v_round(const v_float32x4& a)
1179 float32x4_t a_ = a.val;
1181 __asm__ ("fcvtns %0.4s, %1.4s"
1184 : /* No clobbers */);
1185 return v_int32x4(result);
1188 inline v_int32x4 v_round(const v_float32x4& a)
1190 static const int32x4_t v_sign = vdupq_n_s32(1 << 31),
1191 v_05 = vreinterpretq_s32_f32(vdupq_n_f32(0.5f));
1193 int32x4_t v_addition = vorrq_s32(v_05, vandq_s32(v_sign, vreinterpretq_s32_f32(a.val)));
1194 return v_int32x4(vcvtq_s32_f32(vaddq_f32(a.val, vreinterpretq_f32_s32(v_addition))));
// v_floor: truncate, then subtract 1 wherever truncation rounded up
// (the all-ones comparison mask reinterpreted as signed is -1 per lane).
1197 inline v_int32x4 v_floor(const v_float32x4& a)
1199 int32x4_t a1 = vcvtq_s32_f32(a.val);
1200 uint32x4_t mask = vcgtq_f32(vcvtq_f32_s32(a1), a.val);
1201 return v_int32x4(vaddq_s32(a1, vreinterpretq_s32_u32(mask)));
// v_ceil: truncate, then add 1 wherever truncation rounded down
// (subtracting the -1 mask).
1204 inline v_int32x4 v_ceil(const v_float32x4& a)
1206 int32x4_t a1 = vcvtq_s32_f32(a.val);
1207 uint32x4_t mask = vcgtq_f32(a.val, vcvtq_f32_s32(a1));
1208 return v_int32x4(vsubq_s32(a1, vreinterpretq_s32_u32(mask)));
// v_trunc: vcvtq_s32_f32 already converts toward zero.
1211 inline v_int32x4 v_trunc(const v_float32x4& a)
1212 { return v_int32x4(vcvtq_s32_f32(a.val)); }
// float64x2 -> int32x4 conversions (AArch64/CV_SIMD128_64F path; guard lines
// appear missing from this extract). Only lanes 0-1 are meaningful; the upper
// two lanes of the result are zero-filled.
// v_round: vcvtaq_s64_f64 (FCVTAS) rounds to nearest, ties away from zero.
1215 inline v_int32x4 v_round(const v_float64x2& a)
1217 static const int32x2_t zero = vdup_n_s32(0);
1218 return v_int32x4(vcombine_s32(vmovn_s64(vcvtaq_s64_f64(a.val)), zero));
// v_floor: truncate, then add the -1 comparison mask where truncation
// rounded up (same scheme as the float32 version above).
1221 inline v_int32x4 v_floor(const v_float64x2& a)
1223 static const int32x2_t zero = vdup_n_s32(0);
1224 int64x2_t a1 = vcvtq_s64_f64(a.val);
1225 uint64x2_t mask = vcgtq_f64(vcvtq_f64_s64(a1), a.val);
1226 a1 = vaddq_s64(a1, vreinterpretq_s64_u64(mask));
1227 return v_int32x4(vcombine_s32(vmovn_s64(a1), zero));
// v_ceil: truncate, then subtract the -1 mask (i.e. add 1) where truncation
// rounded down.
1230 inline v_int32x4 v_ceil(const v_float64x2& a)
1232 static const int32x2_t zero = vdup_n_s32(0);
1233 int64x2_t a1 = vcvtq_s64_f64(a.val);
1234 uint64x2_t mask = vcgtq_f64(a.val, vcvtq_f64_s64(a1));
1235 a1 = vsubq_s64(a1, vreinterpretq_s64_u64(mask));
1236 return v_int32x4(vcombine_s32(vmovn_s64(a1), zero));
1239 inline v_int32x4 v_trunc(const v_float64x2& a)
1241 static const int32x2_t zero = vdup_n_s32(0);
1242 return v_int32x4(vcombine_s32(vmovn_s64(vcvtaq_s64_f64(a.val)), zero));
// 4x4 matrix transpose of four vectors: two vtrnq passes swap within lane
// pairs, then vcombine of low/high halves completes the transpose (the
// in-macro comments trace the lane layout after each step).
1246 #define OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(_Tpvec, suffix) \
1247 inline void v_transpose4x4(const v_##_Tpvec& a0, const v_##_Tpvec& a1, \
1248 const v_##_Tpvec& a2, const v_##_Tpvec& a3, \
1249 v_##_Tpvec& b0, v_##_Tpvec& b1, \
1250 v_##_Tpvec& b2, v_##_Tpvec& b3) \
1252 /* m00 m01 m02 m03 */ \
1253 /* m10 m11 m12 m13 */ \
1254 /* m20 m21 m22 m23 */ \
1255 /* m30 m31 m32 m33 */ \
1256 _Tpvec##x2_t t0 = vtrnq_##suffix(a0.val, a1.val); \
1257 _Tpvec##x2_t t1 = vtrnq_##suffix(a2.val, a3.val); \
1258 /* m00 m10 m02 m12 */ \
1259 /* m01 m11 m03 m13 */ \
1260 /* m20 m30 m22 m32 */ \
1261 /* m21 m31 m23 m33 */ \
1262 b0.val = vcombine_##suffix(vget_low_##suffix(t0.val[0]), vget_low_##suffix(t1.val[0])); \
1263 b1.val = vcombine_##suffix(vget_low_##suffix(t0.val[1]), vget_low_##suffix(t1.val[1])); \
1264 b2.val = vcombine_##suffix(vget_high_##suffix(t0.val[0]), vget_high_##suffix(t1.val[0])); \
1265 b3.val = vcombine_##suffix(vget_high_##suffix(t0.val[1]), vget_high_##suffix(t1.val[1])); \
1268 OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(uint32x4, u32)
1269 OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(int32x4, s32)
1270 OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(float32x4, f32)
// De-interleaving loads (vld2q/vld3q/vld4q) and interleaving stores
// (vst2q/vst3q/vst4q) for 2-, 3- and 4-channel data.
// NOTE(review): this extract is heavily damaged here — the statements that
// copy v.val[i] into the output arguments, and the ones building the xN_t
// struct before each store, are missing (original lines 1276-1278, 1282-1284,
// 1290-1293, 1296-1299, 1303-1307, 1312-1317), as are the macro braces.
// Restore the bodies from the original header before compiling.
1272 #define OPENCV_HAL_IMPL_NEON_INTERLEAVED(_Tpvec, _Tp, suffix) \
1273 inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b) \
1275 _Tpvec##x2_t v = vld2q_##suffix(ptr); \
1279 inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, v_##_Tpvec& c) \
1281 _Tpvec##x3_t v = vld3q_##suffix(ptr); \
1286 inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, \
1287 v_##_Tpvec& c, v_##_Tpvec& d) \
1289 _Tpvec##x4_t v = vld4q_##suffix(ptr); \
1295 inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b) \
1300 vst2q_##suffix(ptr, v); \
1302 inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, const v_##_Tpvec& c) \
1308 vst3q_##suffix(ptr, v); \
1310 inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
1311 const v_##_Tpvec& c, const v_##_Tpvec& d) \
1318 vst4q_##suffix(ptr, v); \
1321 OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint8x16, uchar, u8)
1322 OPENCV_HAL_IMPL_NEON_INTERLEAVED(int8x16, schar, s8)
1323 OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint16x8, ushort, u16)
1324 OPENCV_HAL_IMPL_NEON_INTERLEAVED(int16x8, short, s16)
1325 OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint32x4, unsigned, u32)
1326 OPENCV_HAL_IMPL_NEON_INTERLEAVED(int32x4, int, s32)
1327 OPENCV_HAL_IMPL_NEON_INTERLEAVED(float32x4, float, f32)
1329 OPENCV_HAL_IMPL_NEON_INTERLEAVED(float64x2, double, f64)
// Type conversions between int32 / float32 / float64 / float16 vectors.
1332 inline v_float32x4 v_cvt_f32(const v_int32x4& a)
1334 return v_float32x4(vcvtq_f32_s32(a.val));
// f64 -> f32: the two doubles land in lanes 0-1; lanes 2-3 are zero-filled.
1338 inline v_float32x4 v_cvt_f32(const v_float64x2& a)
1340 float32x2_t zero = vdup_n_f32(0.0f);
1341 return v_float32x4(vcombine_f32(vcvt_f32_f64(a.val), zero));
1344 inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b)
1346 return v_float32x4(vcombine_f32(vcvt_f32_f64(a.val), vcvt_f32_f64(b.val)));
// int32 -> f64 (low / high halves).
// NOTE(review): the conversion goes through float32 (vcvt_f32_s32 then
// vcvt_f64_f32), so integers with magnitude above 2^24 lose precision —
// confirm whether this matches upstream intent before relying on exactness.
1349 inline v_float64x2 v_cvt_f64(const v_int32x4& a)
1351 return v_float64x2(vcvt_f64_f32(vcvt_f32_s32(vget_low_s32(a.val))));
1354 inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
1356 return v_float64x2(vcvt_f64_f32(vcvt_f32_s32(vget_high_s32(a.val))));
// f32 -> f64 (low / high halves): exact widening.
1359 inline v_float64x2 v_cvt_f64(const v_float32x4& a)
1361 return v_float64x2(vcvt_f64_f32(vget_low_f32(a.val)));
1364 inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
1366 return v_float64x2(vcvt_f64_f32(vget_high_f32(a.val)));
// f16 <-> f32 (FP16 extension; guard lines appear missing from this extract).
1371 inline v_float32x4 v_cvt_f32(const v_float16x8& a)
1373 return v_float32x4(vcvt_f32_f16(vget_low_f16(a.val)));
1375 inline v_float32x4 v_cvt_f32_high(const v_float16x8& a)
1377 return v_float32x4(vcvt_f32_f16(vget_high_f16(a.val)));
1380 inline v_float16x8 v_cvt_f16(const v_float32x4& a, const v_float32x4& b)
1382 return v_float16x8(vcombine_f16(vcvt_f16_f32(a.val), vcvt_f16_f32(b.val)));
1386 ////////////// Lookup table access ////////////////////
// Gather tab[idx] for each of the four int32 indices into a vector.
// Indices are extracted lane-by-lane (NEON has no gather instruction);
// the aligned scratch array lets vld1q reload the result in one go.
// Caller must guarantee every index is in range for `tab`.
1388 inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
1390 int CV_DECL_ALIGNED(32) elems[4] =
1392 tab[vgetq_lane_s32(idxvec.val, 0)],
1393 tab[vgetq_lane_s32(idxvec.val, 1)],
1394 tab[vgetq_lane_s32(idxvec.val, 2)],
1395 tab[vgetq_lane_s32(idxvec.val, 3)]
1397 return v_int32x4(vld1q_s32(elems))
// Same gather for a float table.
1400 inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
1402 float CV_DECL_ALIGNED(32) elems[4] =
1404 tab[vgetq_lane_s32(idxvec.val, 0)],
1405 tab[vgetq_lane_s32(idxvec.val, 1)],
1406 tab[vgetq_lane_s32(idxvec.val, 2)],
1407 tab[vgetq_lane_s32(idxvec.val, 3)]
1409 return v_float32x4(vld1q_f32(elems));
// Gather interleaved (x, y) pairs: x[i] = tab[idx[i]], y[i] = tab[idx[i]+1].
// The commented-out variant below is an earlier vuzpq-based implementation
// kept for reference; the active code does a scalar gather via a spilled
// index array.
1412 inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y)
1414 /*int CV_DECL_ALIGNED(32) idx[4];
1415 v_store(idx, idxvec);
1417 float32x4_t xy02 = vcombine_f32(vld1_f32(tab + idx[0]), vld1_f32(tab + idx[2]));
1418 float32x4_t xy13 = vcombine_f32(vld1_f32(tab + idx[1]), vld1_f32(tab + idx[3]));
1420 float32x4x2_t xxyy = vuzpq_f32(xy02, xy13);
1421 x = v_float32x4(xxyy.val[0]);
1422 y = v_float32x4(xxyy.val[1]);*/
1423 int CV_DECL_ALIGNED(32) idx[4];
1424 v_store_aligned(idx, idxvec);
1426 x = v_float32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]);
1427 y = v_float32x4(tab[idx[0]+1], tab[idx[1]+1], tab[idx[2]+1], tab[idx[3]+1]);
// Double-precision variants: only the first two indices are used.
1431 inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
1433 double CV_DECL_ALIGNED(32) elems[2] =
1435 tab[vgetq_lane_s32(idxvec.val, 0)],
1436 tab[vgetq_lane_s32(idxvec.val, 1)],
1438 return v_float64x2(vld1q_f64(elems));
1441 inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y)
1443 int CV_DECL_ALIGNED(32) idx[4];
1444 v_store_aligned(idx, idxvec);
1446 x = v_float64x2(tab[idx[0]], tab[idx[1]]);
1447 y = v_float64x2(tab[idx[0]+1], tab[idx[1]+1]);
// No-op on NEON: there is no SIMD state to reset; present so generic code can
// call v_cleanup() uniformly across backends.
1451 inline void v_cleanup() {}
1453 //! @name Check SIMD support
1455 //! @brief Check CPU capability of SIMD operation
// Runtime check that the CPU supports NEON (via the CV_CPU_HAS_SUPPORT_NEON
// dispatch macro); returns true when this 128-bit backend is usable.
1456 static inline bool hasSIMD128()
1458 return (CV_CPU_HAS_SUPPORT_NEON) ? true : false;
1463 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END