/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#ifndef OPENCV_HAL_INTRIN_NEON_HPP
#define OPENCV_HAL_INTRIN_NEON_HPP

#include <algorithm>
#include "opencv2/core/utility.hpp"

namespace cv
{

//! @cond IGNORED

CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN

#define CV_SIMD128 1
#if defined(__aarch64__)
#define CV_SIMD128_64F 1
#else
#define CV_SIMD128_64F 0
#endif

// TODO
#define CV_NEON_DOT 0

//////////// Utils ////////////

#if CV_SIMD128_64F
#define OPENCV_HAL_IMPL_NEON_UNZIP(_Tpv, _Tpvx2, suffix) \
    inline void _v128_unzip(const _Tpv& a, const _Tpv& b, _Tpv& c, _Tpv& d) \
    { c = vuzp1q_##suffix(a, b); d = vuzp2q_##suffix(a, b); }
#define OPENCV_HAL_IMPL_NEON_UNZIP_L(_Tpv, _Tpvx2, suffix) \
    inline void _v128_unzip(const _Tpv& a, const _Tpv& b, _Tpv& c, _Tpv& d) \
    { c = vuzp1_##suffix(a, b); d = vuzp2_##suffix(a, b); }
#else
#define OPENCV_HAL_IMPL_NEON_UNZIP(_Tpv, _Tpvx2, suffix) \
    inline void _v128_unzip(const _Tpv& a, const _Tpv& b, _Tpv& c, _Tpv& d) \
    { _Tpvx2 ab = vuzpq_##suffix(a, b); c = ab.val[0]; d = ab.val[1]; }
#define OPENCV_HAL_IMPL_NEON_UNZIP_L(_Tpv, _Tpvx2, suffix) \
    inline void _v128_unzip(const _Tpv& a, const _Tpv& b, _Tpv& c, _Tpv& d) \
    { _Tpvx2 ab = vuzp_##suffix(a, b); c = ab.val[0]; d = ab.val[1]; }
#endif

#if CV_SIMD128_64F
#define OPENCV_HAL_IMPL_NEON_REINTERPRET(_Tpv, suffix) \
    template <typename T> static inline \
    _Tpv vreinterpretq_##suffix##_f64(T a) { return (_Tpv) a; } \
    template <typename T> static inline \
    float64x2_t vreinterpretq_f64_##suffix(T a) { return (float64x2_t) a; }
#else
#define OPENCV_HAL_IMPL_NEON_REINTERPRET(_Tpv, suffix)
#endif

#define OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(_Tpv, _Tpvl, suffix) \
    OPENCV_HAL_IMPL_NEON_UNZIP(_Tpv##_t, _Tpv##x2_t, suffix) \
    OPENCV_HAL_IMPL_NEON_UNZIP_L(_Tpvl##_t, _Tpvl##x2_t, suffix) \
    OPENCV_HAL_IMPL_NEON_REINTERPRET(_Tpv##_t, suffix)

#define OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX_I64(_Tpv, _Tpvl, suffix) \
    OPENCV_HAL_IMPL_NEON_REINTERPRET(_Tpv##_t, suffix)

#define OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX_F64(_Tpv, _Tpvl, suffix) \
    OPENCV_HAL_IMPL_NEON_UNZIP(_Tpv##_t, _Tpv##x2_t, suffix)

OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(uint8x16, uint8x8,  u8)
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(int8x16,  int8x8,   s8)
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(uint16x8, uint16x4, u16)
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(int16x8,  int16x4,  s16)
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(uint32x4, uint32x2, u32)
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(int32x4,  int32x2,  s32)
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(float32x4, float32x2, f32)
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX_I64(uint64x2, uint64x1, u64)
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX_I64(int64x2,  int64x1,  s64)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX_F64(float64x2, float64x1,f64)
#endif
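
// Usage sketch (illustrative only, not part of the upstream header; example_* names
// are hypothetical): _v128_unzip deinterleaves even/odd lanes of two registers, e.g.
// splitting interleaved (x, y) pairs back into an x-register and a y-register.
inline void example_unzip_xy_s32(const int* interleaved /* x0,y0,x1,y1,x2,y2,x3,y3 */,
                                 int32x4_t& xs, int32x4_t& ys)
{
    int32x4_t a = vld1q_s32(interleaved);      // x0 y0 x1 y1
    int32x4_t b = vld1q_s32(interleaved + 4);  // x2 y2 x3 y3
    _v128_unzip(a, b, xs, ys);                 // xs = {x0,x1,x2,x3}, ys = {y0,y1,y2,y3}
}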

//////////// Types ////////////

struct v_uint8x16
{
    typedef uchar lane_type;
    enum { nlanes = 16 };

    v_uint8x16() {}
    explicit v_uint8x16(uint8x16_t v) : val(v) {}
    v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
               uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
    {
        uchar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
        val = vld1q_u8(v);
    }
    uchar get0() const
    {
        return vgetq_lane_u8(val, 0);
    }

    uint8x16_t val;
};

struct v_int8x16
{
    typedef schar lane_type;
    enum { nlanes = 16 };

    v_int8x16() {}
    explicit v_int8x16(int8x16_t v) : val(v) {}
    v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
              schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
    {
        schar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
        val = vld1q_s8(v);
    }
    schar get0() const
    {
        return vgetq_lane_s8(val, 0);
    }

    int8x16_t val;
};

struct v_uint16x8
{
    typedef ushort lane_type;
    enum { nlanes = 8 };

    v_uint16x8() {}
    explicit v_uint16x8(uint16x8_t v) : val(v) {}
    v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
    {
        ushort v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
        val = vld1q_u16(v);
    }
    ushort get0() const
    {
        return vgetq_lane_u16(val, 0);
    }

    uint16x8_t val;
};

struct v_int16x8
{
    typedef short lane_type;
    enum { nlanes = 8 };

    v_int16x8() {}
    explicit v_int16x8(int16x8_t v) : val(v) {}
    v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
    {
        short v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
        val = vld1q_s16(v);
    }
    short get0() const
    {
        return vgetq_lane_s16(val, 0);
    }

    int16x8_t val;
};

struct v_uint32x4
{
    typedef unsigned lane_type;
    enum { nlanes = 4 };

    v_uint32x4() {}
    explicit v_uint32x4(uint32x4_t v) : val(v) {}
    v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
    {
        unsigned v[] = {v0, v1, v2, v3};
        val = vld1q_u32(v);
    }
    unsigned get0() const
    {
        return vgetq_lane_u32(val, 0);
    }

    uint32x4_t val;
};

struct v_int32x4
{
    typedef int lane_type;
    enum { nlanes = 4 };

    v_int32x4() {}
    explicit v_int32x4(int32x4_t v) : val(v) {}
    v_int32x4(int v0, int v1, int v2, int v3)
    {
        int v[] = {v0, v1, v2, v3};
        val = vld1q_s32(v);
    }
    int get0() const
    {
        return vgetq_lane_s32(val, 0);
    }
    int32x4_t val;
};

struct v_float32x4
{
    typedef float lane_type;
    enum { nlanes = 4 };

    v_float32x4() {}
    explicit v_float32x4(float32x4_t v) : val(v) {}
    v_float32x4(float v0, float v1, float v2, float v3)
    {
        float v[] = {v0, v1, v2, v3};
        val = vld1q_f32(v);
    }
    float get0() const
    {
        return vgetq_lane_f32(val, 0);
    }
    float32x4_t val;
};

struct v_uint64x2
{
    typedef uint64 lane_type;
    enum { nlanes = 2 };

    v_uint64x2() {}
    explicit v_uint64x2(uint64x2_t v) : val(v) {}
    v_uint64x2(uint64 v0, uint64 v1)
    {
        uint64 v[] = {v0, v1};
        val = vld1q_u64(v);
    }
    uint64 get0() const
    {
        return vgetq_lane_u64(val, 0);
    }
    uint64x2_t val;
};

struct v_int64x2
{
    typedef int64 lane_type;
    enum { nlanes = 2 };

    v_int64x2() {}
    explicit v_int64x2(int64x2_t v) : val(v) {}
    v_int64x2(int64 v0, int64 v1)
    {
        int64 v[] = {v0, v1};
        val = vld1q_s64(v);
    }
    int64 get0() const
    {
        return vgetq_lane_s64(val, 0);
    }
    int64x2_t val;
};

#if CV_SIMD128_64F
struct v_float64x2
{
    typedef double lane_type;
    enum { nlanes = 2 };

    v_float64x2() {}
    explicit v_float64x2(float64x2_t v) : val(v) {}
    v_float64x2(double v0, double v1)
    {
        double v[] = {v0, v1};
        val = vld1q_f64(v);
    }
    double get0() const
    {
        return vgetq_lane_f64(val, 0);
    }
    float64x2_t val;
};
#endif

#define OPENCV_HAL_IMPL_NEON_INIT(_Tpv, _Tp, suffix) \
inline v_##_Tpv v_setzero_##suffix() { return v_##_Tpv(vdupq_n_##suffix((_Tp)0)); } \
inline v_##_Tpv v_setall_##suffix(_Tp v) { return v_##_Tpv(vdupq_n_##suffix(v)); } \
inline _Tpv##_t vreinterpretq_##suffix##_##suffix(_Tpv##_t v) { return v; } \
inline v_uint8x16 v_reinterpret_as_u8(const v_##_Tpv& v) { return v_uint8x16(vreinterpretq_u8_##suffix(v.val)); } \
inline v_int8x16 v_reinterpret_as_s8(const v_##_Tpv& v) { return v_int8x16(vreinterpretq_s8_##suffix(v.val)); } \
inline v_uint16x8 v_reinterpret_as_u16(const v_##_Tpv& v) { return v_uint16x8(vreinterpretq_u16_##suffix(v.val)); } \
inline v_int16x8 v_reinterpret_as_s16(const v_##_Tpv& v) { return v_int16x8(vreinterpretq_s16_##suffix(v.val)); } \
inline v_uint32x4 v_reinterpret_as_u32(const v_##_Tpv& v) { return v_uint32x4(vreinterpretq_u32_##suffix(v.val)); } \
inline v_int32x4 v_reinterpret_as_s32(const v_##_Tpv& v) { return v_int32x4(vreinterpretq_s32_##suffix(v.val)); } \
inline v_uint64x2 v_reinterpret_as_u64(const v_##_Tpv& v) { return v_uint64x2(vreinterpretq_u64_##suffix(v.val)); } \
inline v_int64x2 v_reinterpret_as_s64(const v_##_Tpv& v) { return v_int64x2(vreinterpretq_s64_##suffix(v.val)); } \
inline v_float32x4 v_reinterpret_as_f32(const v_##_Tpv& v) { return v_float32x4(vreinterpretq_f32_##suffix(v.val)); }

OPENCV_HAL_IMPL_NEON_INIT(uint8x16, uchar, u8)
OPENCV_HAL_IMPL_NEON_INIT(int8x16, schar, s8)
OPENCV_HAL_IMPL_NEON_INIT(uint16x8, ushort, u16)
OPENCV_HAL_IMPL_NEON_INIT(int16x8, short, s16)
OPENCV_HAL_IMPL_NEON_INIT(uint32x4, unsigned, u32)
OPENCV_HAL_IMPL_NEON_INIT(int32x4, int, s32)
OPENCV_HAL_IMPL_NEON_INIT(uint64x2, uint64, u64)
OPENCV_HAL_IMPL_NEON_INIT(int64x2, int64, s64)
OPENCV_HAL_IMPL_NEON_INIT(float32x4, float, f32)
#if CV_SIMD128_64F
#define OPENCV_HAL_IMPL_NEON_INIT_64(_Tpv, suffix) \
inline v_float64x2 v_reinterpret_as_f64(const v_##_Tpv& v) { return v_float64x2(vreinterpretq_f64_##suffix(v.val)); }
OPENCV_HAL_IMPL_NEON_INIT(float64x2, double, f64)
OPENCV_HAL_IMPL_NEON_INIT_64(uint8x16, u8)
OPENCV_HAL_IMPL_NEON_INIT_64(int8x16, s8)
OPENCV_HAL_IMPL_NEON_INIT_64(uint16x8, u16)
OPENCV_HAL_IMPL_NEON_INIT_64(int16x8, s16)
OPENCV_HAL_IMPL_NEON_INIT_64(uint32x4, u32)
OPENCV_HAL_IMPL_NEON_INIT_64(int32x4, s32)
OPENCV_HAL_IMPL_NEON_INIT_64(uint64x2, u64)
OPENCV_HAL_IMPL_NEON_INIT_64(int64x2, s64)
OPENCV_HAL_IMPL_NEON_INIT_64(float32x4, f32)
OPENCV_HAL_IMPL_NEON_INIT_64(float64x2, f64)
#endif
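
// Usage sketch (illustrative only, not part of the upstream header; example_* names
// are hypothetical): v_setall_* broadcasts a scalar, and v_reinterpret_as_* reuses
// the same 128 bits under a different lane type without conversion.
inline unsigned example_float_bits(float x)
{
    v_float32x4 vf = v_setall_f32(x);           // {x, x, x, x}
    v_uint32x4  vu = v_reinterpret_as_u32(vf);  // same bits, viewed as u32 lanes
    return vu.get0();                           // IEEE-754 bit pattern of x
}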

#define OPENCV_HAL_IMPL_NEON_PACK(_Tpvec, _Tp, hreg, suffix, _Tpwvec, pack, mov, rshr) \
inline _Tpvec v_##pack(const _Tpwvec& a, const _Tpwvec& b) \
{ \
    hreg a1 = mov(a.val), b1 = mov(b.val); \
    return _Tpvec(vcombine_##suffix(a1, b1)); \
} \
inline void v_##pack##_store(_Tp* ptr, const _Tpwvec& a) \
{ \
    hreg a1 = mov(a.val); \
    vst1_##suffix(ptr, a1); \
} \
template<int n> inline \
_Tpvec v_rshr_##pack(const _Tpwvec& a, const _Tpwvec& b) \
{ \
    hreg a1 = rshr(a.val, n); \
    hreg b1 = rshr(b.val, n); \
    return _Tpvec(vcombine_##suffix(a1, b1)); \
} \
template<int n> inline \
void v_rshr_##pack##_store(_Tp* ptr, const _Tpwvec& a) \
{ \
    hreg a1 = rshr(a.val, n); \
    vst1_##suffix(ptr, a1); \
}

OPENCV_HAL_IMPL_NEON_PACK(v_uint8x16, uchar, uint8x8_t, u8, v_uint16x8, pack, vqmovn_u16, vqrshrn_n_u16)
OPENCV_HAL_IMPL_NEON_PACK(v_int8x16, schar, int8x8_t, s8, v_int16x8, pack, vqmovn_s16, vqrshrn_n_s16)
OPENCV_HAL_IMPL_NEON_PACK(v_uint16x8, ushort, uint16x4_t, u16, v_uint32x4, pack, vqmovn_u32, vqrshrn_n_u32)
OPENCV_HAL_IMPL_NEON_PACK(v_int16x8, short, int16x4_t, s16, v_int32x4, pack, vqmovn_s32, vqrshrn_n_s32)
OPENCV_HAL_IMPL_NEON_PACK(v_uint32x4, unsigned, uint32x2_t, u32, v_uint64x2, pack, vmovn_u64, vrshrn_n_u64)
OPENCV_HAL_IMPL_NEON_PACK(v_int32x4, int, int32x2_t, s32, v_int64x2, pack, vmovn_s64, vrshrn_n_s64)

OPENCV_HAL_IMPL_NEON_PACK(v_uint8x16, uchar, uint8x8_t, u8, v_int16x8, pack_u, vqmovun_s16, vqrshrun_n_s16)
OPENCV_HAL_IMPL_NEON_PACK(v_uint16x8, ushort, uint16x4_t, u16, v_int32x4, pack_u, vqmovun_s32, vqrshrun_n_s32)

// pack boolean
inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b)
{
    uint8x16_t ab = vcombine_u8(vmovn_u16(a.val), vmovn_u16(b.val));
    return v_uint8x16(ab);
}

inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b,
                           const v_uint32x4& c, const v_uint32x4& d)
{
    uint16x8_t nab = vcombine_u16(vmovn_u32(a.val), vmovn_u32(b.val));
    uint16x8_t ncd = vcombine_u16(vmovn_u32(c.val), vmovn_u32(d.val));
    return v_uint8x16(vcombine_u8(vmovn_u16(nab), vmovn_u16(ncd)));
}

inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c,
                           const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f,
                           const v_uint64x2& g, const v_uint64x2& h)
{
    uint32x4_t ab = vcombine_u32(vmovn_u64(a.val), vmovn_u64(b.val));
    uint32x4_t cd = vcombine_u32(vmovn_u64(c.val), vmovn_u64(d.val));
    uint32x4_t ef = vcombine_u32(vmovn_u64(e.val), vmovn_u64(f.val));
    uint32x4_t gh = vcombine_u32(vmovn_u64(g.val), vmovn_u64(h.val));

    uint16x8_t abcd = vcombine_u16(vmovn_u32(ab), vmovn_u32(cd));
    uint16x8_t efgh = vcombine_u16(vmovn_u32(ef), vmovn_u32(gh));
    return v_uint8x16(vcombine_u8(vmovn_u16(abcd), vmovn_u16(efgh)));
}
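
// Usage sketch (illustrative only, not part of the upstream header; example_* names
// are hypothetical): v_pack_b collapses wide 0 / all-ones lane masks (such as those
// produced by the comparison operators defined further below) into one byte mask.
inline v_uint8x16 example_pack_mask32(const v_uint32x4& m0, const v_uint32x4& m1,
                                      const v_uint32x4& m2, const v_uint32x4& m3)
{
    // Each input lane is expected to be 0 or 0xFFFFFFFF; the result holds 16 bytes
    // that are 0 or 0xFF, in the same lane order.
    return v_pack_b(m0, m1, m2, m3);
}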

inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
                            const v_float32x4& m1, const v_float32x4& m2,
                            const v_float32x4& m3)
{
    float32x2_t vl = vget_low_f32(v.val), vh = vget_high_f32(v.val);
    float32x4_t res = vmulq_lane_f32(m0.val, vl, 0);
    res = vmlaq_lane_f32(res, m1.val, vl, 1);
    res = vmlaq_lane_f32(res, m2.val, vh, 0);
    res = vmlaq_lane_f32(res, m3.val, vh, 1);
    return v_float32x4(res);
}

inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
                               const v_float32x4& m1, const v_float32x4& m2,
                               const v_float32x4& a)
{
    float32x2_t vl = vget_low_f32(v.val), vh = vget_high_f32(v.val);
    float32x4_t res = vmulq_lane_f32(m0.val, vl, 0);
    res = vmlaq_lane_f32(res, m1.val, vl, 1);
    res = vmlaq_lane_f32(res, m2.val, vh, 0);
    res = vaddq_f32(res, a.val);
    return v_float32x4(res);
}
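
// Usage sketch (illustrative only, not part of the upstream header; example_* names
// and the column-major layout of mat_cols are assumptions for the example):
// v_matmul computes m0*v[0] + m1*v[1] + m2*v[2] + m3*v[3], so passing the matrix
// columns as m0..m3 yields the 4x4 matrix-vector product M*v.
inline v_float32x4 example_mat4x4_mul(const float* mat_cols /* 16 floats, column-major */,
                                      const v_float32x4& v)
{
    v_float32x4 c0(mat_cols[0],  mat_cols[1],  mat_cols[2],  mat_cols[3]);
    v_float32x4 c1(mat_cols[4],  mat_cols[5],  mat_cols[6],  mat_cols[7]);
    v_float32x4 c2(mat_cols[8],  mat_cols[9],  mat_cols[10], mat_cols[11]);
    v_float32x4 c3(mat_cols[12], mat_cols[13], mat_cols[14], mat_cols[15]);
    return v_matmul(v, c0, c1, c2, c3);
}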

#define OPENCV_HAL_IMPL_NEON_BIN_OP(bin_op, _Tpvec, intrin) \
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(intrin(a.val, b.val)); \
} \
inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
{ \
    a.val = intrin(a.val, b.val); \
    return a; \
}

OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint8x16, vqaddq_u8)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint8x16, vqsubq_u8)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int8x16, vqaddq_s8)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int8x16, vqsubq_s8)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint16x8, vqaddq_u16)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint16x8, vqsubq_u16)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int16x8, vqaddq_s16)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int16x8, vqsubq_s16)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int32x4, vaddq_s32)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int32x4, vsubq_s32)
OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_int32x4, vmulq_s32)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint32x4, vaddq_u32)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint32x4, vsubq_u32)
OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_uint32x4, vmulq_u32)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_float32x4, vaddq_f32)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_float32x4, vsubq_f32)
OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_float32x4, vmulq_f32)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int64x2, vaddq_s64)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int64x2, vsubq_s64)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint64x2, vaddq_u64)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint64x2, vsubq_u64)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_NEON_BIN_OP(/, v_float32x4, vdivq_f32)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_float64x2, vaddq_f64)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_float64x2, vsubq_f64)
OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_float64x2, vmulq_f64)
OPENCV_HAL_IMPL_NEON_BIN_OP(/, v_float64x2, vdivq_f64)
#else
inline v_float32x4 operator / (const v_float32x4& a, const v_float32x4& b)
{
    float32x4_t reciprocal = vrecpeq_f32(b.val);
    reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal);
    reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal);
    return v_float32x4(vmulq_f32(a.val, reciprocal));
}
inline v_float32x4& operator /= (v_float32x4& a, const v_float32x4& b)
{
    float32x4_t reciprocal = vrecpeq_f32(b.val);
    reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal);
    reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal);
    a.val = vmulq_f32(a.val, reciprocal);
    return a;
}
#endif
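
// Note on the non-AArch64 division above (illustrative explanation, not upstream text):
// vrecpeq_f32 gives a low-precision estimate r0 ~= 1/b, and each
//     r_{n+1} = r_n * vrecpsq_f32(b, r_n) = r_n * (2 - b * r_n)
// is one Newton-Raphson step, roughly doubling the number of correct bits.
// Two steps bring the estimate close to full float precision before the final
// a * r multiplication; AArch64 uses the exact vdivq_f32 instruction instead.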

// saturating multiply 8-bit, 16-bit
#define OPENCV_HAL_IMPL_NEON_MUL_SAT(_Tpvec, _Tpwvec)            \
    inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b)  \
    {                                                            \
        _Tpwvec c, d;                                            \
        v_mul_expand(a, b, c, d);                                \
        return v_pack(c, d);                                     \
    }                                                            \
    inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b)      \
    { a = a * b; return a; }

OPENCV_HAL_IMPL_NEON_MUL_SAT(v_int8x16,  v_int16x8)
OPENCV_HAL_IMPL_NEON_MUL_SAT(v_uint8x16, v_uint16x8)
OPENCV_HAL_IMPL_NEON_MUL_SAT(v_int16x8,  v_int32x4)
OPENCV_HAL_IMPL_NEON_MUL_SAT(v_uint16x8, v_uint32x4)

// Multiply and expand
inline void v_mul_expand(const v_int8x16& a, const v_int8x16& b,
                         v_int16x8& c, v_int16x8& d)
{
    c.val = vmull_s8(vget_low_s8(a.val), vget_low_s8(b.val));
    d.val = vmull_s8(vget_high_s8(a.val), vget_high_s8(b.val));
}

inline void v_mul_expand(const v_uint8x16& a, const v_uint8x16& b,
                         v_uint16x8& c, v_uint16x8& d)
{
    c.val = vmull_u8(vget_low_u8(a.val), vget_low_u8(b.val));
    d.val = vmull_u8(vget_high_u8(a.val), vget_high_u8(b.val));
}

inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b,
                         v_int32x4& c, v_int32x4& d)
{
    c.val = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val));
    d.val = vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val));
}

inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b,
                         v_uint32x4& c, v_uint32x4& d)
{
    c.val = vmull_u16(vget_low_u16(a.val), vget_low_u16(b.val));
    d.val = vmull_u16(vget_high_u16(a.val), vget_high_u16(b.val));
}

inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b,
                         v_uint64x2& c, v_uint64x2& d)
{
    c.val = vmull_u32(vget_low_u32(a.val), vget_low_u32(b.val));
    d.val = vmull_u32(vget_high_u32(a.val), vget_high_u32(b.val));
}

inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b)
{
    return v_int16x8(vcombine_s16(
                                  vshrn_n_s32(vmull_s16( vget_low_s16(a.val),  vget_low_s16(b.val)), 16),
                                  vshrn_n_s32(vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val)), 16)
                                 ));
}
inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b)
{
    return v_uint16x8(vcombine_u16(
                                   vshrn_n_u32(vmull_u16( vget_low_u16(a.val),  vget_low_u16(b.val)), 16),
                                   vshrn_n_u32(vmull_u16(vget_high_u16(a.val), vget_high_u16(b.val)), 16)
                                  ));
}
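
// Usage sketch (illustrative only, not part of the upstream header; example_* names
// are hypothetical): v_mul_expand keeps full precision by widening the lane type,
// with the low half of the inputs going to c and the high half to d.
inline v_uint32x4 example_widen_mul_low(const v_uint16x8& a, const v_uint16x8& b)
{
    v_uint32x4 lo, hi;
    v_mul_expand(a, b, lo, hi);   // lo = {a0*b0..a3*b3}, hi = {a4*b4..a7*b7}, no overflow
    return lo;
}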

//////// Dot Product ////////

// 16 >> 32
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
{
    int16x8_t uzp1, uzp2;
    _v128_unzip(a.val, b.val, uzp1, uzp2);
    int16x4_t a0 = vget_low_s16(uzp1);
    int16x4_t b0 = vget_high_s16(uzp1);
    int16x4_t a1 = vget_low_s16(uzp2);
    int16x4_t b1 = vget_high_s16(uzp2);
    int32x4_t p = vmull_s16(a0, b0);
    return v_int32x4(vmlal_s16(p, a1, b1));
}
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
{
    int16x8_t uzp1, uzp2;
    _v128_unzip(a.val, b.val, uzp1, uzp2);
    int16x4_t a0 = vget_low_s16(uzp1);
    int16x4_t b0 = vget_high_s16(uzp1);
    int16x4_t a1 = vget_low_s16(uzp2);
    int16x4_t b1 = vget_high_s16(uzp2);
    int32x4_t p = vmlal_s16(c.val, a0, b0);
    return v_int32x4(vmlal_s16(p, a1, b1));
}

// 32 >> 64
inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b)
{
    int32x4_t uzp1, uzp2;
    _v128_unzip(a.val, b.val, uzp1, uzp2);
    int32x2_t a0 = vget_low_s32(uzp1);
    int32x2_t b0 = vget_high_s32(uzp1);
    int32x2_t a1 = vget_low_s32(uzp2);
    int32x2_t b1 = vget_high_s32(uzp2);
    int64x2_t p = vmull_s32(a0, b0);
    return v_int64x2(vmlal_s32(p, a1, b1));
}
inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
{
    int32x4_t uzp1, uzp2;
    _v128_unzip(a.val, b.val, uzp1, uzp2);
    int32x2_t a0 = vget_low_s32(uzp1);
    int32x2_t b0 = vget_high_s32(uzp1);
    int32x2_t a1 = vget_low_s32(uzp2);
    int32x2_t b1 = vget_high_s32(uzp2);
    int64x2_t p = vmlal_s32(c.val, a0, b0);
    return v_int64x2(vmlal_s32(p, a1, b1));
}

// 8 >> 32
inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
{
#if CV_NEON_DOT
    return v_uint32x4(vdotq_u32(vdupq_n_u32(0), a.val, b.val));
#else
    const uint8x16_t zero   = vreinterpretq_u8_u32(vdupq_n_u32(0));
    const uint8x16_t mask   = vreinterpretq_u8_u32(vdupq_n_u32(0x00FF00FF));
    const uint16x8_t zero32 = vreinterpretq_u16_u32(vdupq_n_u32(0));
    const uint16x8_t mask32 = vreinterpretq_u16_u32(vdupq_n_u32(0x0000FFFF));

    uint16x8_t even = vmulq_u16(vreinterpretq_u16_u8(vbslq_u8(mask, a.val, zero)),
                                vreinterpretq_u16_u8(vbslq_u8(mask, b.val, zero)));
    uint16x8_t odd  = vmulq_u16(vshrq_n_u16(vreinterpretq_u16_u8(a.val), 8),
                                vshrq_n_u16(vreinterpretq_u16_u8(b.val), 8));

    uint32x4_t s0 = vaddq_u32(vreinterpretq_u32_u16(vbslq_u16(mask32, even, zero32)),
                              vreinterpretq_u32_u16(vbslq_u16(mask32, odd,  zero32)));
    uint32x4_t s1 = vaddq_u32(vshrq_n_u32(vreinterpretq_u32_u16(even), 16),
                              vshrq_n_u32(vreinterpretq_u32_u16(odd),  16));
    return v_uint32x4(vaddq_u32(s0, s1));
#endif
}
inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b,
                                   const v_uint32x4& c)
{
#if CV_NEON_DOT
    return v_uint32x4(vdotq_u32(c.val, a.val, b.val));
#else
    return v_dotprod_expand(a, b) + c;
#endif
}

inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
{
#if CV_NEON_DOT
    return v_int32x4(vdotq_s32(vdupq_n_s32(0), a.val, b.val));
#else
    int16x8_t p0  = vmull_s8(vget_low_s8(a.val), vget_low_s8(b.val));
    int16x8_t p1  = vmull_s8(vget_high_s8(a.val), vget_high_s8(b.val));
    int16x8_t uzp1, uzp2;
    _v128_unzip(p0, p1, uzp1, uzp2);
    int16x8_t sum = vaddq_s16(uzp1, uzp2);
    int16x4_t uzpl1, uzpl2;
    _v128_unzip(vget_low_s16(sum), vget_high_s16(sum), uzpl1, uzpl2);
    return v_int32x4(vaddl_s16(uzpl1, uzpl2));
#endif
}
inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b,
                                  const v_int32x4& c)
{
#if CV_NEON_DOT
    return v_int32x4(vdotq_s32(c.val, a.val, b.val));
#else
    return v_dotprod_expand(a, b) + c;
#endif
}

// 16 >> 64
inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
{
    const uint16x8_t zero = vreinterpretq_u16_u32(vdupq_n_u32(0));
    const uint16x8_t mask = vreinterpretq_u16_u32(vdupq_n_u32(0x0000FFFF));

    uint32x4_t even = vmulq_u32(vreinterpretq_u32_u16(vbslq_u16(mask, a.val, zero)),
                                vreinterpretq_u32_u16(vbslq_u16(mask, b.val, zero)));
    uint32x4_t odd  = vmulq_u32(vshrq_n_u32(vreinterpretq_u32_u16(a.val), 16),
                                vshrq_n_u32(vreinterpretq_u32_u16(b.val), 16));
    uint32x4_t uzp1, uzp2;
    _v128_unzip(even, odd, uzp1, uzp2);
    uint64x2_t s0  = vaddl_u32(vget_low_u32(uzp1), vget_high_u32(uzp1));
    uint64x2_t s1  = vaddl_u32(vget_low_u32(uzp2), vget_high_u32(uzp2));
    return v_uint64x2(vaddq_u64(s0, s1));
}
inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
{ return v_dotprod_expand(a, b) + c; }

inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
{
    int32x4_t p0  = vmull_s16(vget_low_s16(a.val),  vget_low_s16(b.val));
    int32x4_t p1  = vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val));

    int32x4_t uzp1, uzp2;
    _v128_unzip(p0, p1, uzp1, uzp2);
    int32x4_t sum = vaddq_s32(uzp1, uzp2);

    int32x2_t uzpl1, uzpl2;
    _v128_unzip(vget_low_s32(sum), vget_high_s32(sum), uzpl1, uzpl2);
    return v_int64x2(vaddl_s32(uzpl1, uzpl2));
}
inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b,
                                  const v_int64x2& c)
{ return v_dotprod_expand(a, b) + c; }

// 32 >> 64f
#if CV_SIMD128_64F
inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
{ return v_cvt_f64(v_dotprod(a, b)); }
inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b,
                                    const v_float64x2& c)
{ return v_dotprod_expand(a, b) + c; }
#endif
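
// Usage sketch (illustrative only, not part of the upstream header; example_* names
// are hypothetical): v_dotprod combines adjacent lane pairs, i.e. result lane i is
// a[2i]*b[2i] + a[2i+1]*b[2i+1], and the three-argument form adds that to c.
inline v_int32x4 example_dotprod_accumulate(const v_int16x8& a, const v_int16x8& b,
                                            const v_int32x4& acc)
{
    return v_dotprod(a, b, acc);  // acc + pairwise products, element order preserved
}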

//////// Fast Dot Product ////////

// 16 >> 32
inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b)
{
    int16x4_t a0 = vget_low_s16(a.val);
    int16x4_t a1 = vget_high_s16(a.val);
    int16x4_t b0 = vget_low_s16(b.val);
    int16x4_t b1 = vget_high_s16(b.val);
    int32x4_t p = vmull_s16(a0, b0);
    return v_int32x4(vmlal_s16(p, a1, b1));
}
inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
{
    int16x4_t a0 = vget_low_s16(a.val);
    int16x4_t a1 = vget_high_s16(a.val);
    int16x4_t b0 = vget_low_s16(b.val);
    int16x4_t b1 = vget_high_s16(b.val);
    int32x4_t p = vmlal_s16(c.val, a0, b0);
    return v_int32x4(vmlal_s16(p, a1, b1));
}

// 32 >> 64
inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b)
{
    int32x2_t a0 = vget_low_s32(a.val);
    int32x2_t a1 = vget_high_s32(a.val);
    int32x2_t b0 = vget_low_s32(b.val);
    int32x2_t b1 = vget_high_s32(b.val);
    int64x2_t p = vmull_s32(a0, b0);
    return v_int64x2(vmlal_s32(p, a1, b1));
}
inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
{
    int32x2_t a0 = vget_low_s32(a.val);
    int32x2_t a1 = vget_high_s32(a.val);
    int32x2_t b0 = vget_low_s32(b.val);
    int32x2_t b1 = vget_high_s32(b.val);
    int64x2_t p = vmlal_s32(c.val, a0, b0);
    return v_int64x2(vmlal_s32(p, a1, b1));
}

// 8 >> 32
inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b)
{
#if CV_NEON_DOT
    return v_uint32x4(vdotq_u32(vdupq_n_u32(0), a.val, b.val));
#else
    uint16x8_t p0 = vmull_u8(vget_low_u8(a.val), vget_low_u8(b.val));
    uint16x8_t p1 = vmull_u8(vget_high_u8(a.val), vget_high_u8(b.val));
    uint32x4_t s0 = vaddl_u16(vget_low_u16(p0), vget_low_u16(p1));
    uint32x4_t s1 = vaddl_u16(vget_high_u16(p0), vget_high_u16(p1));
    return v_uint32x4(vaddq_u32(s0, s1));
#endif
}
inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
{
#if CV_NEON_DOT
    return v_uint32x4(vdotq_u32(c.val, a.val, b.val));
#else
    return v_dotprod_expand_fast(a, b) + c;
#endif
}

inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
{
#if CV_NEON_DOT
    return v_int32x4(vdotq_s32(vdupq_n_s32(0), a.val, b.val));
#else
    int16x8_t prod = vmull_s8(vget_low_s8(a.val), vget_low_s8(b.val));
    prod = vmlal_s8(prod, vget_high_s8(a.val), vget_high_s8(b.val));
    return v_int32x4(vaddl_s16(vget_low_s16(prod), vget_high_s16(prod)));
#endif
}
inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
{
#if CV_NEON_DOT
    return v_int32x4(vdotq_s32(c.val, a.val, b.val));
#else
    return v_dotprod_expand_fast(a, b) + c;
#endif
}

// 16 >> 64
inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b)
{
    uint32x4_t p0  = vmull_u16(vget_low_u16(a.val),  vget_low_u16(b.val));
    uint32x4_t p1  = vmull_u16(vget_high_u16(a.val), vget_high_u16(b.val));
    uint64x2_t s0  = vaddl_u32(vget_low_u32(p0), vget_high_u32(p0));
    uint64x2_t s1  = vaddl_u32(vget_low_u32(p1), vget_high_u32(p1));
    return v_uint64x2(vaddq_u64(s0, s1));
}
inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
{ return v_dotprod_expand_fast(a, b) + c; }

inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
{
    int32x4_t prod = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val));
    prod = vmlal_s16(prod, vget_high_s16(a.val), vget_high_s16(b.val));
    return v_int64x2(vaddl_s32(vget_low_s32(prod), vget_high_s32(prod)));
}
inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
{ return v_dotprod_expand_fast(a, b) + c; }

// 32 >> 64f
#if CV_SIMD128_64F
inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b)
{ return v_cvt_f64(v_dotprod_fast(a, b)); }
inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
{ return v_dotprod_expand_fast(a, b) + c; }
#endif


#define OPENCV_HAL_IMPL_NEON_LOGIC_OP(_Tpvec, suffix) \
    OPENCV_HAL_IMPL_NEON_BIN_OP(&, _Tpvec, vandq_##suffix) \
    OPENCV_HAL_IMPL_NEON_BIN_OP(|, _Tpvec, vorrq_##suffix) \
    OPENCV_HAL_IMPL_NEON_BIN_OP(^, _Tpvec, veorq_##suffix) \
    inline _Tpvec operator ~ (const _Tpvec& a) \
    { \
        return _Tpvec(vreinterpretq_##suffix##_u8(vmvnq_u8(vreinterpretq_u8_##suffix(a.val)))); \
    }

OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint8x16, u8)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int8x16, s8)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint16x8, u16)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int16x8, s16)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint32x4, u32)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int32x4, s32)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint64x2, u64)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int64x2, s64)

#define OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(bin_op, intrin) \
inline v_float32x4 operator bin_op (const v_float32x4& a, const v_float32x4& b) \
{ \
    return v_float32x4(vreinterpretq_f32_s32(intrin(vreinterpretq_s32_f32(a.val), vreinterpretq_s32_f32(b.val)))); \
} \
inline v_float32x4& operator bin_op##= (v_float32x4& a, const v_float32x4& b) \
{ \
    a.val = vreinterpretq_f32_s32(intrin(vreinterpretq_s32_f32(a.val), vreinterpretq_s32_f32(b.val))); \
    return a; \
}

OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(&, vandq_s32)
OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(|, vorrq_s32)
OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(^, veorq_s32)

inline v_float32x4 operator ~ (const v_float32x4& a)
{
    return v_float32x4(vreinterpretq_f32_s32(vmvnq_s32(vreinterpretq_s32_f32(a.val))));
}

#if CV_SIMD128_64F
inline v_float32x4 v_sqrt(const v_float32x4& x)
{
    return v_float32x4(vsqrtq_f32(x.val));
}

inline v_float32x4 v_invsqrt(const v_float32x4& x)
{
    v_float32x4 one = v_setall_f32(1.0f);
    return one / v_sqrt(x);
}
#else
inline v_float32x4 v_sqrt(const v_float32x4& x)
{
    float32x4_t x1 = vmaxq_f32(x.val, vdupq_n_f32(FLT_MIN));
    float32x4_t e = vrsqrteq_f32(x1);
    e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x1, e), e), e);
    e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x1, e), e), e);
    return v_float32x4(vmulq_f32(x.val, e));
}

inline v_float32x4 v_invsqrt(const v_float32x4& x)
{
    float32x4_t e = vrsqrteq_f32(x.val);
    e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x.val, e), e), e);
    e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x.val, e), e), e);
    return v_float32x4(e);
}
#endif

#define OPENCV_HAL_IMPL_NEON_ABS(_Tpuvec, _Tpsvec, usuffix, ssuffix) \
inline _Tpuvec v_abs(const _Tpsvec& a) { return v_reinterpret_as_##usuffix(_Tpsvec(vabsq_##ssuffix(a.val))); }

OPENCV_HAL_IMPL_NEON_ABS(v_uint8x16, v_int8x16, u8, s8)
OPENCV_HAL_IMPL_NEON_ABS(v_uint16x8, v_int16x8, u16, s16)
OPENCV_HAL_IMPL_NEON_ABS(v_uint32x4, v_int32x4, u32, s32)

inline v_float32x4 v_abs(v_float32x4 x)
{ return v_float32x4(vabsq_f32(x.val)); }

#if CV_SIMD128_64F
#define OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(bin_op, intrin) \
inline v_float64x2 operator bin_op (const v_float64x2& a, const v_float64x2& b) \
{ \
    return v_float64x2(vreinterpretq_f64_s64(intrin(vreinterpretq_s64_f64(a.val), vreinterpretq_s64_f64(b.val)))); \
} \
inline v_float64x2& operator bin_op##= (v_float64x2& a, const v_float64x2& b) \
{ \
    a.val = vreinterpretq_f64_s64(intrin(vreinterpretq_s64_f64(a.val), vreinterpretq_s64_f64(b.val))); \
    return a; \
}

OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(&, vandq_s64)
OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(|, vorrq_s64)
OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(^, veorq_s64)

inline v_float64x2 operator ~ (const v_float64x2& a)
{
    return v_float64x2(vreinterpretq_f64_s32(vmvnq_s32(vreinterpretq_s32_f64(a.val))));
}

inline v_float64x2 v_sqrt(const v_float64x2& x)
{
    return v_float64x2(vsqrtq_f64(x.val));
}

inline v_float64x2 v_invsqrt(const v_float64x2& x)
{
    v_float64x2 one = v_setall_f64(1.0);
    return one / v_sqrt(x);
}

inline v_float64x2 v_abs(v_float64x2 x)
{ return v_float64x2(vabsq_f64(x.val)); }
#endif

// TODO: exp, log, sin, cos

#define OPENCV_HAL_IMPL_NEON_BIN_FUNC(_Tpvec, func, intrin) \
inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(intrin(a.val, b.val)); \
}

OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_min, vminq_u8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_max, vmaxq_u8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_min, vminq_s8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_max, vmaxq_s8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_min, vminq_u16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_max, vmaxq_u16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_min, vminq_s16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_max, vmaxq_s16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_min, vminq_u32)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_max, vmaxq_u32)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int32x4, v_min, vminq_s32)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int32x4, v_max, vmaxq_s32)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_min, vminq_f32)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_max, vmaxq_f32)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float64x2, v_min, vminq_f64)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float64x2, v_max, vmaxq_f64)
#endif

#if CV_SIMD128_64F
inline int64x2_t vmvnq_s64(int64x2_t a)
{
    int64x2_t vx = vreinterpretq_s64_u32(vdupq_n_u32(0xFFFFFFFF));
    return veorq_s64(a, vx);
}
inline uint64x2_t vmvnq_u64(uint64x2_t a)
{
    uint64x2_t vx = vreinterpretq_u64_u32(vdupq_n_u32(0xFFFFFFFF));
    return veorq_u64(a, vx);
}
#endif
#define OPENCV_HAL_IMPL_NEON_INT_CMP_OP(_Tpvec, cast, suffix, not_suffix) \
inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(cast(vceqq_##suffix(a.val, b.val))); } \
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(cast(vmvnq_##not_suffix(vceqq_##suffix(a.val, b.val)))); } \
inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(cast(vcltq_##suffix(a.val, b.val))); } \
inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(cast(vcgtq_##suffix(a.val, b.val))); } \
inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(cast(vcleq_##suffix(a.val, b.val))); } \
inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(cast(vcgeq_##suffix(a.val, b.val))); }

OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint8x16, OPENCV_HAL_NOP, u8, u8)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int8x16, vreinterpretq_s8_u8, s8, u8)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint16x8, OPENCV_HAL_NOP, u16, u16)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int16x8, vreinterpretq_s16_u16, s16, u16)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint32x4, OPENCV_HAL_NOP, u32, u32)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int32x4, vreinterpretq_s32_u32, s32, u32)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_float32x4, vreinterpretq_f32_u32, f32, u32)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint64x2, OPENCV_HAL_NOP, u64, u64)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int64x2, vreinterpretq_s64_u64, s64, u64)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_float64x2, vreinterpretq_f64_u64, f64, u64)
#endif

inline v_float32x4 v_not_nan(const v_float32x4& a)
{ return v_float32x4(vreinterpretq_f32_u32(vceqq_f32(a.val, a.val))); }
#if CV_SIMD128_64F
inline v_float64x2 v_not_nan(const v_float64x2& a)
{ return v_float64x2(vreinterpretq_f64_u64(vceqq_f64(a.val, a.val))); }
#endif

OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_add_wrap, vaddq_u8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_add_wrap, vaddq_s8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_add_wrap, vaddq_u16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_add_wrap, vaddq_s16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_sub_wrap, vsubq_u8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_sub_wrap, vsubq_s8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_sub_wrap, vsubq_u16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_sub_wrap, vsubq_s16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_mul_wrap, vmulq_u8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_mul_wrap, vmulq_s8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_mul_wrap, vmulq_u16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_mul_wrap, vmulq_s16)

OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_absdiff, vabdq_u8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_absdiff, vabdq_u16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_absdiff, vabdq_u32)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_absdiff, vabdq_f32)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float64x2, v_absdiff, vabdq_f64)
#endif

/** Saturating absolute difference **/
inline v_int8x16 v_absdiffs(const v_int8x16& a, const v_int8x16& b)
{ return v_int8x16(vqabsq_s8(vqsubq_s8(a.val, b.val))); }
inline v_int16x8 v_absdiffs(const v_int16x8& a, const v_int16x8& b)
{ return v_int16x8(vqabsq_s16(vqsubq_s16(a.val, b.val))); }

#define OPENCV_HAL_IMPL_NEON_BIN_FUNC2(_Tpvec, _Tpvec2, cast, func, intrin) \
inline _Tpvec2 func(const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec2(cast(intrin(a.val, b.val))); \
}

OPENCV_HAL_IMPL_NEON_BIN_FUNC2(v_int8x16, v_uint8x16, vreinterpretq_u8_s8, v_absdiff, vabdq_s8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC2(v_int16x8, v_uint16x8, vreinterpretq_u16_s16, v_absdiff, vabdq_s16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC2(v_int32x4, v_uint32x4, vreinterpretq_u32_s32, v_absdiff, vabdq_s32)

inline v_float32x4 v_magnitude(const v_float32x4& a, const v_float32x4& b)
{
    v_float32x4 x(vmlaq_f32(vmulq_f32(a.val, a.val), b.val, b.val));
    return v_sqrt(x);
}

inline v_float32x4 v_sqr_magnitude(const v_float32x4& a, const v_float32x4& b)
{
    return v_float32x4(vmlaq_f32(vmulq_f32(a.val, a.val), b.val, b.val));
}

inline v_float32x4 v_fma(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
{
#if CV_SIMD128_64F
    // ARMv8, which adds support for 64-bit floating-point (so CV_SIMD128_64F is defined),
    // also adds FMA support both for single- and double-precision floating-point vectors
    return v_float32x4(vfmaq_f32(c.val, a.val, b.val));
#else
    return v_float32x4(vmlaq_f32(c.val, a.val, b.val));
#endif
}

inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
{
    return v_int32x4(vmlaq_s32(c.val, a.val, b.val));
}

inline v_float32x4 v_muladd(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
{
    return v_fma(a, b, c);
}

inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
{
    return v_fma(a, b, c);
}

#if CV_SIMD128_64F
inline v_float64x2 v_magnitude(const v_float64x2& a, const v_float64x2& b)
{
    v_float64x2 x(vaddq_f64(vmulq_f64(a.val, a.val), vmulq_f64(b.val, b.val)));
    return v_sqrt(x);
}

inline v_float64x2 v_sqr_magnitude(const v_float64x2& a, const v_float64x2& b)
{
    return v_float64x2(vaddq_f64(vmulq_f64(a.val, a.val), vmulq_f64(b.val, b.val)));
}

inline v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
{
    return v_float64x2(vfmaq_f64(c.val, a.val, b.val));
}

inline v_float64x2 v_muladd(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
{
    return v_fma(a, b, c);
}
#endif
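
// Usage sketch (illustrative only, not part of the upstream header; example_* names
// are hypothetical): chained v_muladd calls implement Horner evaluation of a
// polynomial, c0 + c1*x + c2*x^2, one fused multiply-add per coefficient.
inline v_float32x4 example_poly2(const v_float32x4& x, float c0, float c1, float c2)
{
    v_float32x4 r = v_setall_f32(c2);
    r = v_muladd(r, x, v_setall_f32(c1));   // c2*x + c1
    r = v_muladd(r, x, v_setall_f32(c0));   // (c2*x + c1)*x + c0
    return r;
}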

// trade efficiency for convenience
#define OPENCV_HAL_IMPL_NEON_SHIFT_OP(_Tpvec, suffix, _Tps, ssuffix) \
inline _Tpvec operator << (const _Tpvec& a, int n) \
{ return _Tpvec(vshlq_##suffix(a.val, vdupq_n_##ssuffix((_Tps)n))); } \
inline _Tpvec operator >> (const _Tpvec& a, int n) \
{ return _Tpvec(vshlq_##suffix(a.val, vdupq_n_##ssuffix((_Tps)-n))); } \
template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
{ return _Tpvec(vshlq_n_##suffix(a.val, n)); } \
template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
{ return _Tpvec(vshrq_n_##suffix(a.val, n)); } \
template<int n> inline _Tpvec v_rshr(const _Tpvec& a) \
{ return _Tpvec(vrshrq_n_##suffix(a.val, n)); }

OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint8x16, u8, schar, s8)
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int8x16, s8, schar, s8)
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint16x8, u16, short, s16)
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int16x8, s16, short, s16)
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint32x4, u32, int, s32)
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int32x4, s32, int, s32)
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint64x2, u64, int64, s64)
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int64x2, s64, int64, s64)
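
// Usage sketch (illustrative only, not part of the upstream header; example_* names
// are hypothetical): the templated shifts take the count at compile time and map to
// immediate-form instructions, e.g. a rounding divide by 16 of every u16 lane.
inline v_uint16x8 example_div16_round(const v_uint16x8& a)
{
    return v_rshr<4>(a);          // maps to vrshrq_n_u16(a.val, 4)
}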

#define OPENCV_HAL_IMPL_NEON_ROTATE_OP(_Tpvec, suffix) \
template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a) \
{ return _Tpvec(vextq_##suffix(a.val, vdupq_n_##suffix(0), n)); } \
template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a) \
{ return _Tpvec(vextq_##suffix(vdupq_n_##suffix(0), a.val, _Tpvec::nlanes - n)); } \
template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a) \
{ return a; } \
template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(vextq_##suffix(a.val, b.val, n)); } \
template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(vextq_##suffix(b.val, a.val, _Tpvec::nlanes - n)); } \
template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a, const _Tpvec& b) \
{ CV_UNUSED(b); return a; }

OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_uint8x16, u8)
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_int8x16, s8)
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_uint16x8, u16)
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_int16x8, s16)
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_uint32x4, u32)
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_int32x4, s32)
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_float32x4, f32)
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_uint64x2, u64)
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_int64x2, s64)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_float64x2, f64)
#endif
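
// Usage sketch (illustrative only, not part of the upstream header; example_* names
// are hypothetical): the two-register rotate slides a window across adjacent
// registers, here producing {a1, a2, a3, b0} from a and b.
inline v_float32x4 example_shift_window(const v_float32x4& a, const v_float32x4& b)
{
    return v_rotate_right<1>(a, b);   // maps to vextq_f32(a.val, b.val, 1)
}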

#if defined(__clang__) && defined(__aarch64__)
// avoid LD2 instruction. details: https://github.com/opencv/opencv/issues/14863
#define OPENCV_HAL_IMPL_NEON_LOAD_LOW_OP(_Tpvec, _Tp, suffix) \
inline _Tpvec v_load_low(const _Tp* ptr) \
{ \
typedef uint64 CV_DECL_ALIGNED(1) unaligned_uint64; \
uint64 v = *(unaligned_uint64*)ptr; \
return _Tpvec(v_reinterpret_as_##suffix(v_uint64x2(v, (uint64)123456))); \
}
#else
#define OPENCV_HAL_IMPL_NEON_LOAD_LOW_OP(_Tpvec, _Tp, suffix) \
inline _Tpvec v_load_low(const _Tp* ptr) \
{ return _Tpvec(vcombine_##suffix(vld1_##suffix(ptr), vdup_n_##suffix((_Tp)0))); }
#endif

#define OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(_Tpvec, _Tp, suffix) \
inline _Tpvec v_load(const _Tp* ptr) \
{ return _Tpvec(vld1q_##suffix(ptr)); } \
inline _Tpvec v_load_aligned(const _Tp* ptr) \
{ return _Tpvec(vld1q_##suffix(ptr)); } \
OPENCV_HAL_IMPL_NEON_LOAD_LOW_OP(_Tpvec, _Tp, suffix) \
inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
{ return _Tpvec(vcombine_##suffix(vld1_##suffix(ptr0), vld1_##suffix(ptr1))); } \
inline void v_store(_Tp* ptr, const _Tpvec& a) \
{ vst1q_##suffix(ptr, a.val); } \
inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
{ vst1q_##suffix(ptr, a.val); } \
inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
{ vst1q_##suffix(ptr, a.val); } \
inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/) \
{ vst1q_##suffix(ptr, a.val); } \
inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
{ vst1_##suffix(ptr, vget_low_##suffix(a.val)); } \
inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
{ vst1_##suffix(ptr, vget_high_##suffix(a.val)); }

OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint8x16, uchar, u8)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int8x16, schar, s8)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint16x8, ushort, u16)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int16x8, short, s16)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint32x4, unsigned, u32)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int32x4, int, s32)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint64x2, uint64, u64)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int64x2, int64, s64)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float32x4, float, f32)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float64x2, double, f64)
#endif
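
// Usage sketch (illustrative only, not part of the upstream header; example_* names
// are hypothetical): a typical load / operate / store loop. Assumes n is a multiple
// of 16; handling the scalar tail is out of scope here.
inline void example_add_saturate_u8(const uchar* a, const uchar* b, uchar* dst, int n)
{
    for (int i = 0; i < n; i += 16)
    {
        v_uint8x16 va = v_load(a + i);
        v_uint8x16 vb = v_load(b + i);
        v_store(dst + i, va + vb);    // operator+ maps to vqaddq_u8 (saturating)
    }
}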

inline unsigned v_reduce_sum(const v_uint8x16& a)
{
    uint32x4_t t0 = vpaddlq_u16(vpaddlq_u8(a.val));
    uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
    return vget_lane_u32(vpadd_u32(t1, t1), 0);
}
inline int v_reduce_sum(const v_int8x16& a)
{
    int32x4_t t0 = vpaddlq_s16(vpaddlq_s8(a.val));
    int32x2_t t1 = vpadd_s32(vget_low_s32(t0), vget_high_s32(t0));
    return vget_lane_s32(vpadd_s32(t1, t1), 0);
}
inline unsigned v_reduce_sum(const v_uint16x8& a)
{
    uint32x4_t t0 = vpaddlq_u16(a.val);
    uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
    return vget_lane_u32(vpadd_u32(t1, t1), 0);
}
inline int v_reduce_sum(const v_int16x8& a)
{
    int32x4_t t0 = vpaddlq_s16(a.val);
    int32x2_t t1 = vpadd_s32(vget_low_s32(t0), vget_high_s32(t0));
    return vget_lane_s32(vpadd_s32(t1, t1), 0);
}

#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
inline scalartype v_reduce_##func(const _Tpvec& a) \
{ \
    _Tpnvec##_t a0 = vp##vectorfunc##_##suffix(vget_low_##suffix(a.val), vget_high_##suffix(a.val)); \
    a0 = vp##vectorfunc##_##suffix(a0, a0); \
    return (scalartype)vget_lane_##suffix(vp##vectorfunc##_##suffix(a0, a0),0); \
}

OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_uint16x8, uint16x4, unsigned int, max, max, u16)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_uint16x8, uint16x4, unsigned int, min, min, u16)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_int16x8, int16x4, int, max, max, s16)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_int16x8, int16x4, int, min, min, s16)

#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
inline scalartype v_reduce_##func(const _Tpvec& a) \
{ \
    _Tpnvec##_t a0 = vp##vectorfunc##_##suffix(vget_low_##suffix(a.val), vget_high_##suffix(a.val)); \
    return (scalartype)vget_lane_##suffix(vp##vectorfunc##_##suffix(a0, vget_high_##suffix(a.val)),0); \
}

OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, uint32x2, unsigned, sum, add, u32)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, uint32x2, unsigned, max, max, u32)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, uint32x2, unsigned, min, min, u32)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_int32x4, int32x2, int, sum, add, s32)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_int32x4, int32x2, int, max, max, s32)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_int32x4, int32x2, int, min, min, s32)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, sum, add, f32)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, max, max, f32)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, min, min, f32)

inline uint64 v_reduce_sum(const v_uint64x2& a)
{ return vget_lane_u64(vadd_u64(vget_low_u64(a.val), vget_high_u64(a.val)),0); }
inline int64 v_reduce_sum(const v_int64x2& a)
{ return vget_lane_s64(vadd_s64(vget_low_s64(a.val), vget_high_s64(a.val)),0); }
#if CV_SIMD128_64F
inline double v_reduce_sum(const v_float64x2& a)
{
    return vgetq_lane_f64(a.val, 0) + vgetq_lane_f64(a.val, 1);
}
#endif

inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
                                 const v_float32x4& c, const v_float32x4& d)
{
    float32x4x2_t ab = vtrnq_f32(a.val, b.val);
    float32x4x2_t cd = vtrnq_f32(c.val, d.val);

    float32x4_t u0 = vaddq_f32(ab.val[0], ab.val[1]); // a0+a1 b0+b1 a2+a3 b2+b3
    float32x4_t u1 = vaddq_f32(cd.val[0], cd.val[1]); // c0+c1 d0+d1 c2+c3 d2+d3

    float32x4_t v0 = vcombine_f32(vget_low_f32(u0), vget_low_f32(u1));
    float32x4_t v1 = vcombine_f32(vget_high_f32(u0), vget_high_f32(u1));

    return v_float32x4(vaddq_f32(v0, v1));
}
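
// Usage sketch (illustrative only, not part of the upstream header; example_* names
// are hypothetical): a full dot product of two int16 buffers, combining v_dotprod
// with the horizontal v_reduce_sum above. Assumes n is a multiple of 8 and that the
// running sum stays within int32 range.
inline int example_dot_s16(const short* a, const short* b, int n)
{
    v_int32x4 acc = v_setzero_s32();
    for (int i = 0; i < n; i += 8)
        acc = v_dotprod(v_load(a + i), v_load(b + i), acc);
    return v_reduce_sum(acc);
}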
1299
1300 inline unsigned v_reduce_sad(const v_uint8x16& a, const v_uint8x16& b)
1301 {
1302     uint32x4_t t0 = vpaddlq_u16(vpaddlq_u8(vabdq_u8(a.val, b.val)));
1303     uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
1304     return vget_lane_u32(vpadd_u32(t1, t1), 0);
1305 }
1306 inline unsigned v_reduce_sad(const v_int8x16& a, const v_int8x16& b)
1307 {
1308     uint32x4_t t0 = vpaddlq_u16(vpaddlq_u8(vreinterpretq_u8_s8(vabdq_s8(a.val, b.val))));
1309     uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
1310     return vget_lane_u32(vpadd_u32(t1, t1), 0);
1311 }
1312 inline unsigned v_reduce_sad(const v_uint16x8& a, const v_uint16x8& b)
1313 {
1314     uint32x4_t t0 = vpaddlq_u16(vabdq_u16(a.val, b.val));
1315     uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
1316     return vget_lane_u32(vpadd_u32(t1, t1), 0);
1317 }
1318 inline unsigned v_reduce_sad(const v_int16x8& a, const v_int16x8& b)
1319 {
1320     uint32x4_t t0 = vpaddlq_u16(vreinterpretq_u16_s16(vabdq_s16(a.val, b.val)));
1321     uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
1322     return vget_lane_u32(vpadd_u32(t1, t1), 0);
1323 }
1324 inline unsigned v_reduce_sad(const v_uint32x4& a, const v_uint32x4& b)
1325 {
1326     uint32x4_t t0 = vabdq_u32(a.val, b.val);
1327     uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
1328     return vget_lane_u32(vpadd_u32(t1, t1), 0);
1329 }
1330 inline unsigned v_reduce_sad(const v_int32x4& a, const v_int32x4& b)
1331 {
1332     uint32x4_t t0 = vreinterpretq_u32_s32(vabdq_s32(a.val, b.val));
1333     uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
1334     return vget_lane_u32(vpadd_u32(t1, t1), 0);
1335 }
1336 inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b)
1337 {
1338     float32x4_t t0 = vabdq_f32(a.val, b.val);
1339     float32x2_t t1 = vpadd_f32(vget_low_f32(t0), vget_high_f32(t0));
1340     return vget_lane_f32(vpadd_f32(t1, t1), 0);
1341 }
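// Illustrative sketch (hypothetical helper, not part of the HAL): sum of absolute
// differences between two byte buffers, as used in block matching. Assumes n is a
// multiple of 16.
inline unsigned v_example_sad(const uchar* a, const uchar* b, int n)
{
    unsigned s = 0;
    for (int i = 0; i < n; i += 16)
        s += v_reduce_sad(v_load(a + i), v_load(b + i));   // per-block SAD, reduced to a scalar
    return s;
}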
1342
1343 inline v_uint8x16 v_popcount(const v_uint8x16& a)
1344 { return v_uint8x16(vcntq_u8(a.val)); }
1345 inline v_uint8x16 v_popcount(const v_int8x16& a)
1346 { return v_uint8x16(vcntq_u8(vreinterpretq_u8_s8(a.val))); }
1347 inline v_uint16x8 v_popcount(const v_uint16x8& a)
1348 { return v_uint16x8(vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u16(a.val)))); }
1349 inline v_uint16x8 v_popcount(const v_int16x8& a)
1350 { return v_uint16x8(vpaddlq_u8(vcntq_u8(vreinterpretq_u8_s16(a.val)))); }
1351 inline v_uint32x4 v_popcount(const v_uint32x4& a)
1352 { return v_uint32x4(vpaddlq_u16(vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u32(a.val))))); }
1353 inline v_uint32x4 v_popcount(const v_int32x4& a)
1354 { return v_uint32x4(vpaddlq_u16(vpaddlq_u8(vcntq_u8(vreinterpretq_u8_s32(a.val))))); }
1355 inline v_uint64x2 v_popcount(const v_uint64x2& a)
1356 { return v_uint64x2(vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u64(a.val)))))); }
1357 inline v_uint64x2 v_popcount(const v_int64x2& a)
1358 { return v_uint64x2(vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vcntq_u8(vreinterpretq_u8_s64(a.val)))))); }
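// Illustrative sketch (hypothetical helper): number of set bits in a 16-byte block.
// Reinterpreting as two 64-bit lanes lets v_reduce_sum above finish the job.
inline uint64 v_example_popcount16(const uchar* ptr)
{
    return v_reduce_sum(v_popcount(v_reinterpret_as_u64(v_load(ptr))));
}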
1359
1360 inline int v_signmask(const v_uint8x16& a)
1361 {
1362     int8x8_t m0 = vcreate_s8(CV_BIG_UINT(0x0706050403020100));   // per-lane shift amounts 0..7
1363     uint8x16_t v0 = vshlq_u8(vshrq_n_u8(a.val, 7), vcombine_s8(m0, m0)); // isolate each sign bit and move it to its lane's bit position
1364     uint64x2_t v1 = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(v0)));    // fold each 8-byte half into one 8-bit mask
1365     return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 8); // low half -> bits 0..7, high half -> bits 8..15
1366 }
1367 inline int v_signmask(const v_int8x16& a)
1368 { return v_signmask(v_reinterpret_as_u8(a)); }
1369
1370 inline int v_signmask(const v_uint16x8& a)
1371 {
1372     int16x4_t m0 = vcreate_s16(CV_BIG_UINT(0x0003000200010000));
1373     uint16x8_t v0 = vshlq_u16(vshrq_n_u16(a.val, 15), vcombine_s16(m0, m0));
1374     uint64x2_t v1 = vpaddlq_u32(vpaddlq_u16(v0));
1375     return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 4);
1376 }
1377 inline int v_signmask(const v_int16x8& a)
1378 { return v_signmask(v_reinterpret_as_u16(a)); }
1379
1380 inline int v_signmask(const v_uint32x4& a)
1381 {
1382     int32x2_t m0 = vcreate_s32(CV_BIG_UINT(0x0000000100000000));
1383     uint32x4_t v0 = vshlq_u32(vshrq_n_u32(a.val, 31), vcombine_s32(m0, m0));
1384     uint64x2_t v1 = vpaddlq_u32(v0);
1385     return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 2);
1386 }
1387 inline int v_signmask(const v_int32x4& a)
1388 { return v_signmask(v_reinterpret_as_u32(a)); }
1389 inline int v_signmask(const v_float32x4& a)
1390 { return v_signmask(v_reinterpret_as_u32(a)); }
1391 inline int v_signmask(const v_uint64x2& a)
1392 {
1393     int64x1_t m0 = vdup_n_s64(0);
1394     uint64x2_t v0 = vshlq_u64(vshrq_n_u64(a.val, 63), vcombine_s64(m0, m0));
1395     return (int)vgetq_lane_u64(v0, 0) + ((int)vgetq_lane_u64(v0, 1) << 1);
1396 }
1397 inline int v_signmask(const v_int64x2& a)
1398 { return v_signmask(v_reinterpret_as_u64(a)); }
1399 #if CV_SIMD128_64F
1400 inline int v_signmask(const v_float64x2& a)
1401 { return v_signmask(v_reinterpret_as_u64(a)); }
1402 #endif
1403
1404 inline int v_scan_forward(const v_int8x16& a) { return trailingZeros32(v_signmask(a)); }
1405 inline int v_scan_forward(const v_uint8x16& a) { return trailingZeros32(v_signmask(a)); }
1406 inline int v_scan_forward(const v_int16x8& a) { return trailingZeros32(v_signmask(a)); }
1407 inline int v_scan_forward(const v_uint16x8& a) { return trailingZeros32(v_signmask(a)); }
1408 inline int v_scan_forward(const v_int32x4& a) { return trailingZeros32(v_signmask(a)); }
1409 inline int v_scan_forward(const v_uint32x4& a) { return trailingZeros32(v_signmask(a)); }
1410 inline int v_scan_forward(const v_float32x4& a) { return trailingZeros32(v_signmask(a)); }
1411 inline int v_scan_forward(const v_int64x2& a) { return trailingZeros32(v_signmask(a)); }
1412 inline int v_scan_forward(const v_uint64x2& a) { return trailingZeros32(v_signmask(a)); }
1413 #if CV_SIMD128_64F
1414 inline int v_scan_forward(const v_float64x2& a) { return trailingZeros32(v_signmask(a)); }
1415 #endif
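// Illustrative sketch (hypothetical helper): index of the first of four floats that
// exceeds a threshold, or -1 if none does, using v_signmask / v_scan_forward.
inline int v_example_first_above(const float* ptr, float thr)
{
    v_float32x4 gt = v_load(ptr) > v_setall_f32(thr);   // all-ones lanes where ptr[i] > thr
    if (v_signmask(gt) == 0)
        return -1;
    return v_scan_forward(gt);                          // position of the lowest set lane
}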
1416
1417 #define OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(_Tpvec, suffix, shift) \
1418 inline bool v_check_all(const v_##_Tpvec& a) \
1419 { \
1420     _Tpvec##_t v0 = vshrq_n_##suffix(vmvnq_##suffix(a.val), shift); \
1421     uint64x2_t v1 = vreinterpretq_u64_##suffix(v0); \
1422     return (vgetq_lane_u64(v1, 0) | vgetq_lane_u64(v1, 1)) == 0; \
1423 } \
1424 inline bool v_check_any(const v_##_Tpvec& a) \
1425 { \
1426     _Tpvec##_t v0 = vshrq_n_##suffix(a.val, shift); \
1427     uint64x2_t v1 = vreinterpretq_u64_##suffix(v0); \
1428     return (vgetq_lane_u64(v1, 0) | vgetq_lane_u64(v1, 1)) != 0; \
1429 }
1430
1431 OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint8x16, u8, 7)
1432 OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint16x8, u16, 15)
1433 OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint32x4, u32, 31)
1434
1435 inline bool v_check_all(const v_uint64x2& a)
1436 {
1437     uint64x2_t v0 = vshrq_n_u64(a.val, 63);
1438     return (vgetq_lane_u64(v0, 0) & vgetq_lane_u64(v0, 1)) == 1;
1439 }
1440 inline bool v_check_any(const v_uint64x2& a)
1441 {
1442     uint64x2_t v0 = vshrq_n_u64(a.val, 63);
1443     return (vgetq_lane_u64(v0, 0) | vgetq_lane_u64(v0, 1)) != 0;
1444 }
1445
1446 inline bool v_check_all(const v_int8x16& a)
1447 { return v_check_all(v_reinterpret_as_u8(a)); }
1448 inline bool v_check_all(const v_int16x8& a)
1449 { return v_check_all(v_reinterpret_as_u16(a)); }
1450 inline bool v_check_all(const v_int32x4& a)
1451 { return v_check_all(v_reinterpret_as_u32(a)); }
1452 inline bool v_check_all(const v_float32x4& a)
1453 { return v_check_all(v_reinterpret_as_u32(a)); }
1454
1455 inline bool v_check_any(const v_int8x16& a)
1456 { return v_check_any(v_reinterpret_as_u8(a)); }
1457 inline bool v_check_any(const v_int16x8& a)
1458 { return v_check_any(v_reinterpret_as_u16(a)); }
1459 inline bool v_check_any(const v_int32x4& a)
1460 { return v_check_any(v_reinterpret_as_u32(a)); }
1461 inline bool v_check_any(const v_float32x4& a)
1462 { return v_check_any(v_reinterpret_as_u32(a)); }
1463
1464 inline bool v_check_all(const v_int64x2& a)
1465 { return v_check_all(v_reinterpret_as_u64(a)); }
1466 inline bool v_check_any(const v_int64x2& a)
1467 { return v_check_any(v_reinterpret_as_u64(a)); }
1468 #if CV_SIMD128_64F
1469 inline bool v_check_all(const v_float64x2& a)
1470 { return v_check_all(v_reinterpret_as_u64(a)); }
1471 inline bool v_check_any(const v_float64x2& a)
1472 { return v_check_any(v_reinterpret_as_u64(a)); }
1473 #endif
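// Illustrative sketch (hypothetical helper): early-out test that all 16 bytes of a
// block are below a threshold, e.g. to skip uniform image regions.
inline bool v_example_all_below(const uchar* ptr, uchar thr)
{
    return v_check_all(v_load(ptr) < v_setall_u8(thr));
}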
1474
1475 #define OPENCV_HAL_IMPL_NEON_SELECT(_Tpvec, suffix, usuffix) \
1476 inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
1477 { \
1478     return _Tpvec(vbslq_##suffix(vreinterpretq_##usuffix##_##suffix(mask.val), a.val, b.val)); \
1479 }
1480
1481 OPENCV_HAL_IMPL_NEON_SELECT(v_uint8x16, u8, u8)
1482 OPENCV_HAL_IMPL_NEON_SELECT(v_int8x16, s8, u8)
1483 OPENCV_HAL_IMPL_NEON_SELECT(v_uint16x8, u16, u16)
1484 OPENCV_HAL_IMPL_NEON_SELECT(v_int16x8, s16, u16)
1485 OPENCV_HAL_IMPL_NEON_SELECT(v_uint32x4, u32, u32)
1486 OPENCV_HAL_IMPL_NEON_SELECT(v_int32x4, s32, u32)
1487 OPENCV_HAL_IMPL_NEON_SELECT(v_float32x4, f32, u32)
1488 #if CV_SIMD128_64F
1489 OPENCV_HAL_IMPL_NEON_SELECT(v_float64x2, f64, u64)
1490 #endif
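// Illustrative sketch (hypothetical helper): branch-free clipping of negative lanes
// to zero with v_select; the comparison supplies the selection mask.
inline v_float32x4 v_example_clip_negative(const v_float32x4& a)
{
    v_float32x4 z = v_setzero_f32();
    return v_select(a < z, z, a);   // lanes where a < 0 become 0, others pass through
}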
1491
1492 #define OPENCV_HAL_IMPL_NEON_EXPAND(_Tpvec, _Tpwvec, _Tp, suffix) \
1493 inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
1494 { \
1495     b0.val = vmovl_##suffix(vget_low_##suffix(a.val)); \
1496     b1.val = vmovl_##suffix(vget_high_##suffix(a.val)); \
1497 } \
1498 inline _Tpwvec v_expand_low(const _Tpvec& a) \
1499 { \
1500     return _Tpwvec(vmovl_##suffix(vget_low_##suffix(a.val))); \
1501 } \
1502 inline _Tpwvec v_expand_high(const _Tpvec& a) \
1503 { \
1504     return _Tpwvec(vmovl_##suffix(vget_high_##suffix(a.val))); \
1505 } \
1506 inline _Tpwvec v_load_expand(const _Tp* ptr) \
1507 { \
1508     return _Tpwvec(vmovl_##suffix(vld1_##suffix(ptr))); \
1509 }
1510
1511 OPENCV_HAL_IMPL_NEON_EXPAND(v_uint8x16, v_uint16x8, uchar, u8)
1512 OPENCV_HAL_IMPL_NEON_EXPAND(v_int8x16, v_int16x8, schar, s8)
1513 OPENCV_HAL_IMPL_NEON_EXPAND(v_uint16x8, v_uint32x4, ushort, u16)
1514 OPENCV_HAL_IMPL_NEON_EXPAND(v_int16x8, v_int32x4, short, s16)
1515 OPENCV_HAL_IMPL_NEON_EXPAND(v_uint32x4, v_uint64x2, uint, u32)
1516 OPENCV_HAL_IMPL_NEON_EXPAND(v_int32x4, v_int64x2, int, s32)
1517
1518 inline v_uint32x4 v_load_expand_q(const uchar* ptr)
1519 {
1520     typedef unsigned int CV_DECL_ALIGNED(1) unaligned_uint;
1521     uint8x8_t v0 = vcreate_u8(*(unaligned_uint*)ptr);
1522     uint16x4_t v1 = vget_low_u16(vmovl_u8(v0));
1523     return v_uint32x4(vmovl_u16(v1));
1524 }
1525
1526 inline v_int32x4 v_load_expand_q(const schar* ptr)
1527 {
1528     typedef unsigned int CV_DECL_ALIGNED(1) unaligned_uint;
1529     int8x8_t v0 = vcreate_s8(*(unaligned_uint*)ptr);
1530     int16x4_t v1 = vget_low_s16(vmovl_s8(v0));
1531     return v_int32x4(vmovl_s16(v1));
1532 }
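// Illustrative sketch (hypothetical helper): widen 16 bytes to 32-bit lanes before
// summing so the accumulation cannot overflow.
inline v_uint32x4 v_example_accum16(const uchar* ptr)
{
    v_uint16x8 w0, w1;
    v_expand(v_load(ptr), w0, w1);   // 16 x u8 -> 2 x (8 x u16)
    v_uint32x4 s0, s1, s2, s3;
    v_expand(w0, s0, s1);            // widen again to 32-bit lanes
    v_expand(w1, s2, s3);
    return s0 + s1 + s2 + s3;        // four partial sums of the 16 input bytes
}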
1533
1534 #if defined(__aarch64__)
1535 #define OPENCV_HAL_IMPL_NEON_UNPACKS(_Tpvec, suffix) \
1536 inline void v_zip(const v_##_Tpvec& a0, const v_##_Tpvec& a1, v_##_Tpvec& b0, v_##_Tpvec& b1) \
1537 { \
1538     b0.val = vzip1q_##suffix(a0.val, a1.val); \
1539     b1.val = vzip2q_##suffix(a0.val, a1.val); \
1540 } \
1541 inline v_##_Tpvec v_combine_low(const v_##_Tpvec& a, const v_##_Tpvec& b) \
1542 { \
1543     return v_##_Tpvec(vcombine_##suffix(vget_low_##suffix(a.val), vget_low_##suffix(b.val))); \
1544 } \
1545 inline v_##_Tpvec v_combine_high(const v_##_Tpvec& a, const v_##_Tpvec& b) \
1546 { \
1547     return v_##_Tpvec(vcombine_##suffix(vget_high_##suffix(a.val), vget_high_##suffix(b.val))); \
1548 } \
1549 inline void v_recombine(const v_##_Tpvec& a, const v_##_Tpvec& b, v_##_Tpvec& c, v_##_Tpvec& d) \
1550 { \
1551     c.val = vcombine_##suffix(vget_low_##suffix(a.val), vget_low_##suffix(b.val)); \
1552     d.val = vcombine_##suffix(vget_high_##suffix(a.val), vget_high_##suffix(b.val)); \
1553 }
1554 #else
1555 #define OPENCV_HAL_IMPL_NEON_UNPACKS(_Tpvec, suffix) \
1556 inline void v_zip(const v_##_Tpvec& a0, const v_##_Tpvec& a1, v_##_Tpvec& b0, v_##_Tpvec& b1) \
1557 { \
1558     _Tpvec##x2_t p = vzipq_##suffix(a0.val, a1.val); \
1559     b0.val = p.val[0]; \
1560     b1.val = p.val[1]; \
1561 } \
1562 inline v_##_Tpvec v_combine_low(const v_##_Tpvec& a, const v_##_Tpvec& b) \
1563 { \
1564     return v_##_Tpvec(vcombine_##suffix(vget_low_##suffix(a.val), vget_low_##suffix(b.val))); \
1565 } \
1566 inline v_##_Tpvec v_combine_high(const v_##_Tpvec& a, const v_##_Tpvec& b) \
1567 { \
1568     return v_##_Tpvec(vcombine_##suffix(vget_high_##suffix(a.val), vget_high_##suffix(b.val))); \
1569 } \
1570 inline void v_recombine(const v_##_Tpvec& a, const v_##_Tpvec& b, v_##_Tpvec& c, v_##_Tpvec& d) \
1571 { \
1572     c.val = vcombine_##suffix(vget_low_##suffix(a.val), vget_low_##suffix(b.val)); \
1573     d.val = vcombine_##suffix(vget_high_##suffix(a.val), vget_high_##suffix(b.val)); \
1574 }
1575 #endif
1576
1577 OPENCV_HAL_IMPL_NEON_UNPACKS(uint8x16, u8)
1578 OPENCV_HAL_IMPL_NEON_UNPACKS(int8x16, s8)
1579 OPENCV_HAL_IMPL_NEON_UNPACKS(uint16x8, u16)
1580 OPENCV_HAL_IMPL_NEON_UNPACKS(int16x8, s16)
1581 OPENCV_HAL_IMPL_NEON_UNPACKS(uint32x4, u32)
1582 OPENCV_HAL_IMPL_NEON_UNPACKS(int32x4, s32)
1583 OPENCV_HAL_IMPL_NEON_UNPACKS(float32x4, f32)
1584 #if CV_SIMD128_64F
1585 OPENCV_HAL_IMPL_NEON_UNPACKS(float64x2, f64)
1586 #endif
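// Illustrative sketch (hypothetical helper): interleave two 16-byte planes into
// x0 y0 x1 y1 ... with v_zip, e.g. when packing two channels.
inline void v_example_interleave2(const uchar* x, const uchar* y, uchar* xy)
{
    v_uint8x16 lo, hi;
    v_zip(v_load(x), v_load(y), lo, hi);
    v_store(xy, lo);
    v_store(xy + 16, hi);
}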
1587
1588 inline v_uint8x16 v_reverse(const v_uint8x16 &a)
1589 {
1590     uint8x16_t vec = vrev64q_u8(a.val);
1591     return v_uint8x16(vextq_u8(vec, vec, 8));
1592 }
1593
1594 inline v_int8x16 v_reverse(const v_int8x16 &a)
1595 { return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); }
1596
1597 inline v_uint16x8 v_reverse(const v_uint16x8 &a)
1598 {
1599     uint16x8_t vec = vrev64q_u16(a.val);
1600     return v_uint16x8(vextq_u16(vec, vec, 4));
1601 }
1602
1603 inline v_int16x8 v_reverse(const v_int16x8 &a)
1604 { return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); }
1605
1606 inline v_uint32x4 v_reverse(const v_uint32x4 &a)
1607 {
1608     uint32x4_t vec = vrev64q_u32(a.val);
1609     return v_uint32x4(vextq_u32(vec, vec, 2));
1610 }
1611
1612 inline v_int32x4 v_reverse(const v_int32x4 &a)
1613 { return v_reinterpret_as_s32(v_reverse(v_reinterpret_as_u32(a))); }
1614
1615 inline v_float32x4 v_reverse(const v_float32x4 &a)
1616 { return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); }
1617
1618 inline v_uint64x2 v_reverse(const v_uint64x2 &a)
1619 {
1620     uint64x2_t vec = a.val;
1621     uint64x1_t vec_lo = vget_low_u64(vec);
1622     uint64x1_t vec_hi = vget_high_u64(vec);
1623     return v_uint64x2(vcombine_u64(vec_hi, vec_lo));
1624 }
1625
1626 inline v_int64x2 v_reverse(const v_int64x2 &a)
1627 { return v_reinterpret_as_s64(v_reverse(v_reinterpret_as_u64(a))); }
1628
1629 #if CV_SIMD128_64F
1630 inline v_float64x2 v_reverse(const v_float64x2 &a)
1631 { return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); }
1632 #endif
1633
1634 #define OPENCV_HAL_IMPL_NEON_EXTRACT(_Tpvec, suffix) \
1635 template <int s> \
1636 inline v_##_Tpvec v_extract(const v_##_Tpvec& a, const v_##_Tpvec& b) \
1637 { \
1638     return v_##_Tpvec(vextq_##suffix(a.val, b.val, s)); \
1639 }
1640
1641 OPENCV_HAL_IMPL_NEON_EXTRACT(uint8x16, u8)
1642 OPENCV_HAL_IMPL_NEON_EXTRACT(int8x16, s8)
1643 OPENCV_HAL_IMPL_NEON_EXTRACT(uint16x8, u16)
1644 OPENCV_HAL_IMPL_NEON_EXTRACT(int16x8, s16)
1645 OPENCV_HAL_IMPL_NEON_EXTRACT(uint32x4, u32)
1646 OPENCV_HAL_IMPL_NEON_EXTRACT(int32x4, s32)
1647 OPENCV_HAL_IMPL_NEON_EXTRACT(uint64x2, u64)
1648 OPENCV_HAL_IMPL_NEON_EXTRACT(int64x2, s64)
1649 OPENCV_HAL_IMPL_NEON_EXTRACT(float32x4, f32)
1650 #if CV_SIMD128_64F
1651 OPENCV_HAL_IMPL_NEON_EXTRACT(float64x2, f64)
1652 #endif
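// Illustrative sketch (hypothetical helper): slide a 4-float window one element to
// the right across two adjacent registers, a common step in separable filters.
inline v_float32x4 v_example_slide1(const v_float32x4& cur, const v_float32x4& next)
{
    return v_extract<1>(cur, next);   // { cur[1], cur[2], cur[3], next[0] }
}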
1653
1654 #define OPENCV_HAL_IMPL_NEON_EXTRACT_N(_Tpvec, _Tp, suffix) \
1655 template<int i> inline _Tp v_extract_n(_Tpvec v) { return vgetq_lane_##suffix(v.val, i); }
1656
1657 OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_uint8x16, uchar, u8)
1658 OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_int8x16, schar, s8)
1659 OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_uint16x8, ushort, u16)
1660 OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_int16x8, short, s16)
1661 OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_uint32x4, uint, u32)
1662 OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_int32x4, int, s32)
1663 OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_uint64x2, uint64, u64)
1664 OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_int64x2, int64, s64)
1665 OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_float32x4, float, f32)
1666 #if CV_SIMD128_64F
1667 OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_float64x2, double, f64)
1668 #endif
1669
1670 #define OPENCV_HAL_IMPL_NEON_BROADCAST(_Tpvec, _Tp, suffix) \
1671 template<int i> inline _Tpvec v_broadcast_element(_Tpvec v) { _Tp t = v_extract_n<i>(v); return v_setall_##suffix(t); }
1672
1673 OPENCV_HAL_IMPL_NEON_BROADCAST(v_uint8x16, uchar, u8)
1674 OPENCV_HAL_IMPL_NEON_BROADCAST(v_int8x16, schar, s8)
1675 OPENCV_HAL_IMPL_NEON_BROADCAST(v_uint16x8, ushort, u16)
1676 OPENCV_HAL_IMPL_NEON_BROADCAST(v_int16x8, short, s16)
1677 OPENCV_HAL_IMPL_NEON_BROADCAST(v_uint32x4, uint, u32)
1678 OPENCV_HAL_IMPL_NEON_BROADCAST(v_int32x4, int, s32)
1679 OPENCV_HAL_IMPL_NEON_BROADCAST(v_uint64x2, uint64, u64)
1680 OPENCV_HAL_IMPL_NEON_BROADCAST(v_int64x2, int64, s64)
1681 OPENCV_HAL_IMPL_NEON_BROADCAST(v_float32x4, float, f32)
1682 #if CV_SIMD128_64F
1683 OPENCV_HAL_IMPL_NEON_BROADCAST(v_float64x2, double, f64)
1684 #endif
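// Illustrative sketch (hypothetical helper): make every lane relative to lane 0 by
// broadcasting that lane and subtracting it.
inline v_float32x4 v_example_relative_to_first(const v_float32x4& a)
{
    return a - v_broadcast_element<0>(a);
}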
1685
1686 #if CV_SIMD128_64F
1687 inline v_int32x4 v_round(const v_float32x4& a)
1688 {
1689     float32x4_t a_ = a.val;
1690     int32x4_t result;
1691     __asm__ ("fcvtns %0.4s, %1.4s"
1692              : "=w"(result)
1693              : "w"(a_)
1694              : /* No clobbers */);
1695     return v_int32x4(result);
1696 }
1697 #else
1698 inline v_int32x4 v_round(const v_float32x4& a)
1699 {
1700     static const int32x4_t v_sign = vdupq_n_s32(1 << 31), // sign-bit mask
1701         v_05 = vreinterpretq_s32_f32(vdupq_n_f32(0.5f));   // bit pattern of +0.5f
1702
1703     int32x4_t v_addition = vorrq_s32(v_05, vandq_s32(v_sign, vreinterpretq_s32_f32(a.val))); // +/-0.5 carrying the sign of each input lane
1704     return v_int32x4(vcvtq_s32_f32(vaddq_f32(a.val, vreinterpretq_f32_s32(v_addition)))); // add then truncate: rounds halfway cases away from zero
1705 }
1706 #endif
1707 inline v_int32x4 v_floor(const v_float32x4& a)
1708 {
1709     int32x4_t a1 = vcvtq_s32_f32(a.val);                          // truncate toward zero
1710     uint32x4_t mask = vcgtq_f32(vcvtq_f32_s32(a1), a.val);        // all-ones (== -1) where truncation rounded up (negative non-integers)
1711     return v_int32x4(vaddq_s32(a1, vreinterpretq_s32_u32(mask))); // adding the mask subtracts 1 there, giving floor
1712 }
1713
1714 inline v_int32x4 v_ceil(const v_float32x4& a)
1715 {
1716     int32x4_t a1 = vcvtq_s32_f32(a.val);                          // truncate toward zero
1717     uint32x4_t mask = vcgtq_f32(a.val, vcvtq_f32_s32(a1));        // all-ones (== -1) where truncation rounded down (positive non-integers)
1718     return v_int32x4(vsubq_s32(a1, vreinterpretq_s32_u32(mask))); // subtracting the mask adds 1 there, giving ceil
1719 }
1720
1721 inline v_int32x4 v_trunc(const v_float32x4& a)
1722 { return v_int32x4(vcvtq_s32_f32(a.val)); }
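// Illustrative sketch (hypothetical helper): convert floats to the nearest integers,
// 4 at a time. Assumes n is a multiple of 4; scalar tail handling is omitted.
inline void v_example_round_to_int(const float* src, int* dst, int n)
{
    for (int i = 0; i < n; i += 4)
        v_store(dst + i, v_round(v_load(src + i)));
}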
1723
1724 #if CV_SIMD128_64F
1725 inline v_int32x4 v_round(const v_float64x2& a)
1726 {
1727     static const int32x2_t zero = vdup_n_s32(0);
1728     return v_int32x4(vcombine_s32(vmovn_s64(vcvtaq_s64_f64(a.val)), zero));
1729 }
1730
1731 inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b)
1732 {
1733     return v_int32x4(vcombine_s32(vmovn_s64(vcvtaq_s64_f64(a.val)), vmovn_s64(vcvtaq_s64_f64(b.val))));
1734 }
1735
1736 inline v_int32x4 v_floor(const v_float64x2& a)
1737 {
1738     static const int32x2_t zero = vdup_n_s32(0);
1739     int64x2_t a1 = vcvtq_s64_f64(a.val);
1740     uint64x2_t mask = vcgtq_f64(vcvtq_f64_s64(a1), a.val);
1741     a1 = vaddq_s64(a1, vreinterpretq_s64_u64(mask));
1742     return v_int32x4(vcombine_s32(vmovn_s64(a1), zero));
1743 }
1744
1745 inline v_int32x4 v_ceil(const v_float64x2& a)
1746 {
1747     static const int32x2_t zero = vdup_n_s32(0);
1748     int64x2_t a1 = vcvtq_s64_f64(a.val);
1749     uint64x2_t mask = vcgtq_f64(a.val, vcvtq_f64_s64(a1));
1750     a1 = vsubq_s64(a1, vreinterpretq_s64_u64(mask));
1751     return v_int32x4(vcombine_s32(vmovn_s64(a1), zero));
1752 }
1753
1754 inline v_int32x4 v_trunc(const v_float64x2& a)
1755 {
1756     static const int32x2_t zero = vdup_n_s32(0);
1757     return v_int32x4(vcombine_s32(vmovn_s64(vcvtaq_s64_f64(a.val)), zero));
1758 }
1759 #endif
1760
1761 #define OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(_Tpvec, suffix) \
1762 inline void v_transpose4x4(const v_##_Tpvec& a0, const v_##_Tpvec& a1, \
1763                          const v_##_Tpvec& a2, const v_##_Tpvec& a3, \
1764                          v_##_Tpvec& b0, v_##_Tpvec& b1, \
1765                          v_##_Tpvec& b2, v_##_Tpvec& b3) \
1766 { \
1767     /* m00 m01 m02 m03 */ \
1768     /* m10 m11 m12 m13 */ \
1769     /* m20 m21 m22 m23 */ \
1770     /* m30 m31 m32 m33 */ \
1771     _Tpvec##x2_t t0 = vtrnq_##suffix(a0.val, a1.val); \
1772     _Tpvec##x2_t t1 = vtrnq_##suffix(a2.val, a3.val); \
1773     /* m00 m10 m02 m12 */ \
1774     /* m01 m11 m03 m13 */ \
1775     /* m20 m30 m22 m32 */ \
1776     /* m21 m31 m23 m33 */ \
1777     b0.val = vcombine_##suffix(vget_low_##suffix(t0.val[0]), vget_low_##suffix(t1.val[0])); \
1778     b1.val = vcombine_##suffix(vget_low_##suffix(t0.val[1]), vget_low_##suffix(t1.val[1])); \
1779     b2.val = vcombine_##suffix(vget_high_##suffix(t0.val[0]), vget_high_##suffix(t1.val[0])); \
1780     b3.val = vcombine_##suffix(vget_high_##suffix(t0.val[1]), vget_high_##suffix(t1.val[1])); \
1781 }
1782
1783 OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(uint32x4, u32)
1784 OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(int32x4, s32)
1785 OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(float32x4, f32)
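// Illustrative sketch (hypothetical helper): transpose a 4x4 float tile stored in
// row-major order, e.g. one step of a blockwise matrix transpose.
inline void v_example_transpose_tile(const float* src, float* dst)
{
    v_float32x4 a0 = v_load(src), a1 = v_load(src + 4), a2 = v_load(src + 8), a3 = v_load(src + 12);
    v_float32x4 b0, b1, b2, b3;
    v_transpose4x4(a0, a1, a2, a3, b0, b1, b2, b3);
    v_store(dst, b0); v_store(dst + 4, b1); v_store(dst + 8, b2); v_store(dst + 12, b3);
}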
1786
1787 #define OPENCV_HAL_IMPL_NEON_INTERLEAVED(_Tpvec, _Tp, suffix) \
1788 inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b) \
1789 { \
1790     _Tpvec##x2_t v = vld2q_##suffix(ptr); \
1791     a.val = v.val[0]; \
1792     b.val = v.val[1]; \
1793 } \
1794 inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, v_##_Tpvec& c) \
1795 { \
1796     _Tpvec##x3_t v = vld3q_##suffix(ptr); \
1797     a.val = v.val[0]; \
1798     b.val = v.val[1]; \
1799     c.val = v.val[2]; \
1800 } \
1801 inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, \
1802                                 v_##_Tpvec& c, v_##_Tpvec& d) \
1803 { \
1804     _Tpvec##x4_t v = vld4q_##suffix(ptr); \
1805     a.val = v.val[0]; \
1806     b.val = v.val[1]; \
1807     c.val = v.val[2]; \
1808     d.val = v.val[3]; \
1809 } \
1810 inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
1811                                 hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
1812 { \
1813     _Tpvec##x2_t v; \
1814     v.val[0] = a.val; \
1815     v.val[1] = b.val; \
1816     vst2q_##suffix(ptr, v); \
1817 } \
1818 inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
1819                                 const v_##_Tpvec& c, hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
1820 { \
1821     _Tpvec##x3_t v; \
1822     v.val[0] = a.val; \
1823     v.val[1] = b.val; \
1824     v.val[2] = c.val; \
1825     vst3q_##suffix(ptr, v); \
1826 } \
1827 inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
1828                                 const v_##_Tpvec& c, const v_##_Tpvec& d, \
1829                                 hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) \
1830 { \
1831     _Tpvec##x4_t v; \
1832     v.val[0] = a.val; \
1833     v.val[1] = b.val; \
1834     v.val[2] = c.val; \
1835     v.val[3] = d.val; \
1836     vst4q_##suffix(ptr, v); \
1837 }
1838
1839 #define OPENCV_HAL_IMPL_NEON_INTERLEAVED_INT64(tp, suffix) \
1840 inline void v_load_deinterleave( const tp* ptr, v_##tp##x2& a, v_##tp##x2& b ) \
1841 { \
1842     tp##x1_t a0 = vld1_##suffix(ptr); \
1843     tp##x1_t b0 = vld1_##suffix(ptr + 1); \
1844     tp##x1_t a1 = vld1_##suffix(ptr + 2); \
1845     tp##x1_t b1 = vld1_##suffix(ptr + 3); \
1846     a = v_##tp##x2(vcombine_##suffix(a0, a1)); \
1847     b = v_##tp##x2(vcombine_##suffix(b0, b1)); \
1848 } \
1849  \
1850 inline void v_load_deinterleave( const tp* ptr, v_##tp##x2& a, \
1851                                  v_##tp##x2& b, v_##tp##x2& c ) \
1852 { \
1853     tp##x1_t a0 = vld1_##suffix(ptr); \
1854     tp##x1_t b0 = vld1_##suffix(ptr + 1); \
1855     tp##x1_t c0 = vld1_##suffix(ptr + 2); \
1856     tp##x1_t a1 = vld1_##suffix(ptr + 3); \
1857     tp##x1_t b1 = vld1_##suffix(ptr + 4); \
1858     tp##x1_t c1 = vld1_##suffix(ptr + 5); \
1859     a = v_##tp##x2(vcombine_##suffix(a0, a1)); \
1860     b = v_##tp##x2(vcombine_##suffix(b0, b1)); \
1861     c = v_##tp##x2(vcombine_##suffix(c0, c1)); \
1862 } \
1863  \
1864 inline void v_load_deinterleave( const tp* ptr, v_##tp##x2& a, v_##tp##x2& b, \
1865                                  v_##tp##x2& c, v_##tp##x2& d ) \
1866 { \
1867     tp##x1_t a0 = vld1_##suffix(ptr); \
1868     tp##x1_t b0 = vld1_##suffix(ptr + 1); \
1869     tp##x1_t c0 = vld1_##suffix(ptr + 2); \
1870     tp##x1_t d0 = vld1_##suffix(ptr + 3); \
1871     tp##x1_t a1 = vld1_##suffix(ptr + 4); \
1872     tp##x1_t b1 = vld1_##suffix(ptr + 5); \
1873     tp##x1_t c1 = vld1_##suffix(ptr + 6); \
1874     tp##x1_t d1 = vld1_##suffix(ptr + 7); \
1875     a = v_##tp##x2(vcombine_##suffix(a0, a1)); \
1876     b = v_##tp##x2(vcombine_##suffix(b0, b1)); \
1877     c = v_##tp##x2(vcombine_##suffix(c0, c1)); \
1878     d = v_##tp##x2(vcombine_##suffix(d0, d1)); \
1879 } \
1880  \
1881 inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, const v_##tp##x2& b, \
1882                                 hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
1883 { \
1884     vst1_##suffix(ptr, vget_low_##suffix(a.val)); \
1885     vst1_##suffix(ptr + 1, vget_low_##suffix(b.val)); \
1886     vst1_##suffix(ptr + 2, vget_high_##suffix(a.val)); \
1887     vst1_##suffix(ptr + 3, vget_high_##suffix(b.val)); \
1888 } \
1889  \
1890 inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, \
1891                                 const v_##tp##x2& b, const v_##tp##x2& c, \
1892                                 hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
1893 { \
1894     vst1_##suffix(ptr, vget_low_##suffix(a.val)); \
1895     vst1_##suffix(ptr + 1, vget_low_##suffix(b.val)); \
1896     vst1_##suffix(ptr + 2, vget_low_##suffix(c.val)); \
1897     vst1_##suffix(ptr + 3, vget_high_##suffix(a.val)); \
1898     vst1_##suffix(ptr + 4, vget_high_##suffix(b.val)); \
1899     vst1_##suffix(ptr + 5, vget_high_##suffix(c.val)); \
1900 } \
1901  \
1902 inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, const v_##tp##x2& b, \
1903                                 const v_##tp##x2& c, const v_##tp##x2& d, \
1904                                 hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
1905 { \
1906     vst1_##suffix(ptr, vget_low_##suffix(a.val)); \
1907     vst1_##suffix(ptr + 1, vget_low_##suffix(b.val)); \
1908     vst1_##suffix(ptr + 2, vget_low_##suffix(c.val)); \
1909     vst1_##suffix(ptr + 3, vget_low_##suffix(d.val)); \
1910     vst1_##suffix(ptr + 4, vget_high_##suffix(a.val)); \
1911     vst1_##suffix(ptr + 5, vget_high_##suffix(b.val)); \
1912     vst1_##suffix(ptr + 6, vget_high_##suffix(c.val)); \
1913     vst1_##suffix(ptr + 7, vget_high_##suffix(d.val)); \
1914 }
1915
1916 OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint8x16, uchar, u8)
1917 OPENCV_HAL_IMPL_NEON_INTERLEAVED(int8x16, schar, s8)
1918 OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint16x8, ushort, u16)
1919 OPENCV_HAL_IMPL_NEON_INTERLEAVED(int16x8, short, s16)
1920 OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint32x4, unsigned, u32)
1921 OPENCV_HAL_IMPL_NEON_INTERLEAVED(int32x4, int, s32)
1922 OPENCV_HAL_IMPL_NEON_INTERLEAVED(float32x4, float, f32)
1923 #if CV_SIMD128_64F
1924 OPENCV_HAL_IMPL_NEON_INTERLEAVED(float64x2, double, f64)
1925 #endif
1926
1927 OPENCV_HAL_IMPL_NEON_INTERLEAVED_INT64(int64, s64)
1928 OPENCV_HAL_IMPL_NEON_INTERLEAVED_INT64(uint64, u64)
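// Illustrative sketch (hypothetical helper): split 16 interleaved BGR pixels into
// separate planes with the de-interleaving loads above.
inline void v_example_split_bgr(const uchar* bgr, uchar* b, uchar* g, uchar* r)
{
    v_uint8x16 vb, vg, vr;
    v_load_deinterleave(bgr, vb, vg, vr);
    v_store(b, vb);
    v_store(g, vg);
    v_store(r, vr);
}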
1929
1930 inline v_float32x4 v_cvt_f32(const v_int32x4& a)
1931 {
1932     return v_float32x4(vcvtq_f32_s32(a.val));
1933 }
1934
1935 #if CV_SIMD128_64F
1936 inline v_float32x4 v_cvt_f32(const v_float64x2& a)
1937 {
1938     float32x2_t zero = vdup_n_f32(0.0f);
1939     return v_float32x4(vcombine_f32(vcvt_f32_f64(a.val), zero));
1940 }
1941
1942 inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b)
1943 {
1944     return v_float32x4(vcombine_f32(vcvt_f32_f64(a.val), vcvt_f32_f64(b.val)));
1945 }
1946
1947 inline v_float64x2 v_cvt_f64(const v_int32x4& a)
1948 {
1949     return v_float64x2(vcvt_f64_f32(vcvt_f32_s32(vget_low_s32(a.val))));
1950 }
1951
1952 inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
1953 {
1954     return v_float64x2(vcvt_f64_f32(vcvt_f32_s32(vget_high_s32(a.val))));
1955 }
1956
1957 inline v_float64x2 v_cvt_f64(const v_float32x4& a)
1958 {
1959     return v_float64x2(vcvt_f64_f32(vget_low_f32(a.val)));
1960 }
1961
1962 inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
1963 {
1964     return v_float64x2(vcvt_f64_f32(vget_high_f32(a.val)));
1965 }
1966
1967 inline v_float64x2 v_cvt_f64(const v_int64x2& a)
1968 {  return v_float64x2(vcvtq_f64_s64(a.val)); }
1969
1970 #endif
1971
1972 ////////////// Lookup table access ////////////////////
1973
1974 inline v_int8x16 v_lut(const schar* tab, const int* idx)
1975 {
1976     schar CV_DECL_ALIGNED(32) elems[16] =
1977     {
1978         tab[idx[ 0]],
1979         tab[idx[ 1]],
1980         tab[idx[ 2]],
1981         tab[idx[ 3]],
1982         tab[idx[ 4]],
1983         tab[idx[ 5]],
1984         tab[idx[ 6]],
1985         tab[idx[ 7]],
1986         tab[idx[ 8]],
1987         tab[idx[ 9]],
1988         tab[idx[10]],
1989         tab[idx[11]],
1990         tab[idx[12]],
1991         tab[idx[13]],
1992         tab[idx[14]],
1993         tab[idx[15]]
1994     };
1995     return v_int8x16(vld1q_s8(elems));
1996 }
1997 inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx)
1998 {
1999     schar CV_DECL_ALIGNED(32) elems[16] =
2000     {
2001         tab[idx[0]],
2002         tab[idx[0] + 1],
2003         tab[idx[1]],
2004         tab[idx[1] + 1],
2005         tab[idx[2]],
2006         tab[idx[2] + 1],
2007         tab[idx[3]],
2008         tab[idx[3] + 1],
2009         tab[idx[4]],
2010         tab[idx[4] + 1],
2011         tab[idx[5]],
2012         tab[idx[5] + 1],
2013         tab[idx[6]],
2014         tab[idx[6] + 1],
2015         tab[idx[7]],
2016         tab[idx[7] + 1]
2017     };
2018     return v_int8x16(vld1q_s8(elems));
2019 }
2020 inline v_int8x16 v_lut_quads(const schar* tab, const int* idx)
2021 {
2022     schar CV_DECL_ALIGNED(32) elems[16] =
2023     {
2024         tab[idx[0]],
2025         tab[idx[0] + 1],
2026         tab[idx[0] + 2],
2027         tab[idx[0] + 3],
2028         tab[idx[1]],
2029         tab[idx[1] + 1],
2030         tab[idx[1] + 2],
2031         tab[idx[1] + 3],
2032         tab[idx[2]],
2033         tab[idx[2] + 1],
2034         tab[idx[2] + 2],
2035         tab[idx[2] + 3],
2036         tab[idx[3]],
2037         tab[idx[3] + 1],
2038         tab[idx[3] + 2],
2039         tab[idx[3] + 3]
2040     };
2041     return v_int8x16(vld1q_s8(elems));
2042 }
2043 inline v_uint8x16 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut((schar*)tab, idx)); }
2044 inline v_uint8x16 v_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_pairs((schar*)tab, idx)); }
2045 inline v_uint8x16 v_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_quads((schar*)tab, idx)); }
2046
2047 inline v_int16x8 v_lut(const short* tab, const int* idx)
2048 {
2049     short CV_DECL_ALIGNED(32) elems[8] =
2050     {
2051         tab[idx[0]],
2052         tab[idx[1]],
2053         tab[idx[2]],
2054         tab[idx[3]],
2055         tab[idx[4]],
2056         tab[idx[5]],
2057         tab[idx[6]],
2058         tab[idx[7]]
2059     };
2060     return v_int16x8(vld1q_s16(elems));
2061 }
2062 inline v_int16x8 v_lut_pairs(const short* tab, const int* idx)
2063 {
2064     short CV_DECL_ALIGNED(32) elems[8] =
2065     {
2066         tab[idx[0]],
2067         tab[idx[0] + 1],
2068         tab[idx[1]],
2069         tab[idx[1] + 1],
2070         tab[idx[2]],
2071         tab[idx[2] + 1],
2072         tab[idx[3]],
2073         tab[idx[3] + 1]
2074     };
2075     return v_int16x8(vld1q_s16(elems));
2076 }
2077 inline v_int16x8 v_lut_quads(const short* tab, const int* idx)
2078 {
2079     return v_int16x8(vcombine_s16(vld1_s16(tab + idx[0]), vld1_s16(tab + idx[1])));
2080 }
2081 inline v_uint16x8 v_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut((short*)tab, idx)); }
2082 inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_pairs((short*)tab, idx)); }
2083 inline v_uint16x8 v_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_quads((short*)tab, idx)); }
2084
2085 inline v_int32x4 v_lut(const int* tab, const int* idx)
2086 {
2087     int CV_DECL_ALIGNED(32) elems[4] =
2088     {
2089         tab[idx[0]],
2090         tab[idx[1]],
2091         tab[idx[2]],
2092         tab[idx[3]]
2093     };
2094     return v_int32x4(vld1q_s32(elems));
2095 }
2096 inline v_int32x4 v_lut_pairs(const int* tab, const int* idx)
2097 {
2098     return v_int32x4(vcombine_s32(vld1_s32(tab + idx[0]), vld1_s32(tab + idx[1])));
2099 }
2100 inline v_int32x4 v_lut_quads(const int* tab, const int* idx)
2101 {
2102     return v_int32x4(vld1q_s32(tab + idx[0]));
2103 }
2104 inline v_uint32x4 v_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut((int*)tab, idx)); }
2105 inline v_uint32x4 v_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_pairs((int*)tab, idx)); }
2106 inline v_uint32x4 v_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_quads((int*)tab, idx)); }
2107
2108 inline v_int64x2 v_lut(const int64_t* tab, const int* idx)
2109 {
2110     return v_int64x2(vcombine_s64(vcreate_s64(tab[idx[0]]), vcreate_s64(tab[idx[1]])));
2111 }
2112 inline v_int64x2 v_lut_pairs(const int64_t* tab, const int* idx)
2113 {
2114     return v_int64x2(vld1q_s64(tab + idx[0]));
2115 }
2116 inline v_uint64x2 v_lut(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut((const int64_t *)tab, idx)); }
2117 inline v_uint64x2 v_lut_pairs(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut_pairs((const int64_t *)tab, idx)); }
2118
2119 inline v_float32x4 v_lut(const float* tab, const int* idx)
2120 {
2121     float CV_DECL_ALIGNED(32) elems[4] =
2122     {
2123         tab[idx[0]],
2124         tab[idx[1]],
2125         tab[idx[2]],
2126         tab[idx[3]]
2127     };
2128     return v_float32x4(vld1q_f32(elems));
2129 }
2130 inline v_float32x4 v_lut_pairs(const float* tab, const int* idx)
2131 {
2132     uint64 CV_DECL_ALIGNED(32) elems[2] =
2133     {
2134         *(uint64*)(tab + idx[0]),
2135         *(uint64*)(tab + idx[1])
2136     };
2137     return v_float32x4(vreinterpretq_f32_u64(vld1q_u64(elems)));
2138 }
2139 inline v_float32x4 v_lut_quads(const float* tab, const int* idx)
2140 {
2141     return v_float32x4(vld1q_f32(tab + idx[0]));
2142 }
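// Illustrative sketch (hypothetical helper): gather table entries through an index
// array, e.g. applying a float LUT. Assumes n is a multiple of 4 and every index is
// a valid position in tab.
inline void v_example_apply_lut(const float* tab, const int* idx, float* dst, int n)
{
    for (int i = 0; i < n; i += 4)
        v_store(dst + i, v_lut(tab, idx + i));
}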
2143
2144 inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
2145 {
2146     int CV_DECL_ALIGNED(32) elems[4] =
2147     {
2148         tab[vgetq_lane_s32(idxvec.val, 0)],
2149         tab[vgetq_lane_s32(idxvec.val, 1)],
2150         tab[vgetq_lane_s32(idxvec.val, 2)],
2151         tab[vgetq_lane_s32(idxvec.val, 3)]
2152     };
2153     return v_int32x4(vld1q_s32(elems));
2154 }
2155
2156 inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec)
2157 {
2158     unsigned CV_DECL_ALIGNED(32) elems[4] =
2159     {
2160         tab[vgetq_lane_s32(idxvec.val, 0)],
2161         tab[vgetq_lane_s32(idxvec.val, 1)],
2162         tab[vgetq_lane_s32(idxvec.val, 2)],
2163         tab[vgetq_lane_s32(idxvec.val, 3)]
2164     };
2165     return v_uint32x4(vld1q_u32(elems));
2166 }
2167
2168 inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
2169 {
2170     float CV_DECL_ALIGNED(32) elems[4] =
2171     {
2172         tab[vgetq_lane_s32(idxvec.val, 0)],
2173         tab[vgetq_lane_s32(idxvec.val, 1)],
2174         tab[vgetq_lane_s32(idxvec.val, 2)],
2175         tab[vgetq_lane_s32(idxvec.val, 3)]
2176     };
2177     return v_float32x4(vld1q_f32(elems));
2178 }
2179
2180 inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y)
2181 {
2182     /*int CV_DECL_ALIGNED(32) idx[4];
2183     v_store(idx, idxvec);
2184
2185     float32x4_t xy02 = vcombine_f32(vld1_f32(tab + idx[0]), vld1_f32(tab + idx[2]));
2186     float32x4_t xy13 = vcombine_f32(vld1_f32(tab + idx[1]), vld1_f32(tab + idx[3]));
2187
2188     float32x4x2_t xxyy = vuzpq_f32(xy02, xy13);
2189     x = v_float32x4(xxyy.val[0]);
2190     y = v_float32x4(xxyy.val[1]);*/
2191     int CV_DECL_ALIGNED(32) idx[4];
2192     v_store_aligned(idx, idxvec);
2193
2194     x = v_float32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]);
2195     y = v_float32x4(tab[idx[0]+1], tab[idx[1]+1], tab[idx[2]+1], tab[idx[3]+1]);
2196 }
2197
2198 inline v_int8x16 v_interleave_pairs(const v_int8x16& vec)
2199 {
2200     return v_int8x16(vcombine_s8(vtbl1_s8(vget_low_s8(vec.val), vcreate_s8(0x0705060403010200)), vtbl1_s8(vget_high_s8(vec.val), vcreate_s8(0x0705060403010200))));
2201 }
2202 inline v_uint8x16 v_interleave_pairs(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_interleave_pairs(v_reinterpret_as_s8(vec))); }
2203 inline v_int8x16 v_interleave_quads(const v_int8x16& vec)
2204 {
2205     return v_int8x16(vcombine_s8(vtbl1_s8(vget_low_s8(vec.val), vcreate_s8(0x0703060205010400)), vtbl1_s8(vget_high_s8(vec.val), vcreate_s8(0x0703060205010400))));
2206 }
2207 inline v_uint8x16 v_interleave_quads(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_interleave_quads(v_reinterpret_as_s8(vec))); }
2208
2209 inline v_int16x8 v_interleave_pairs(const v_int16x8& vec)
2210 {
2211     return v_int16x8(vreinterpretq_s16_s8(vcombine_s8(vtbl1_s8(vget_low_s8(vreinterpretq_s8_s16(vec.val)), vcreate_s8(0x0706030205040100)), vtbl1_s8(vget_high_s8(vreinterpretq_s8_s16(vec.val)), vcreate_s8(0x0706030205040100)))));
2212 }
2213 inline v_uint16x8 v_interleave_pairs(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); }
2214 inline v_int16x8 v_interleave_quads(const v_int16x8& vec)
2215 {
2216     int16x4x2_t res = vzip_s16(vget_low_s16(vec.val), vget_high_s16(vec.val));
2217     return v_int16x8(vcombine_s16(res.val[0], res.val[1]));
2218 }
2219 inline v_uint16x8 v_interleave_quads(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); }
2220
2221 inline v_int32x4 v_interleave_pairs(const v_int32x4& vec)
2222 {
2223     int32x2x2_t res = vzip_s32(vget_low_s32(vec.val), vget_high_s32(vec.val));
2224     return v_int32x4(vcombine_s32(res.val[0], res.val[1]));
2225 }
2226 inline v_uint32x4 v_interleave_pairs(const v_uint32x4& vec) { return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
2227 inline v_float32x4 v_interleave_pairs(const v_float32x4& vec) { return v_reinterpret_as_f32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
2228
2229 inline v_int8x16 v_pack_triplets(const v_int8x16& vec)
2230 {
2231     return v_int8x16(vextq_s8(vcombine_s8(vtbl1_s8(vget_low_s8(vec.val), vcreate_s8(0x0605040201000000)), vtbl1_s8(vget_high_s8(vec.val), vcreate_s8(0x0807060504020100))), vdupq_n_s8(0), 2));
2232 }
2233 inline v_uint8x16 v_pack_triplets(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); }
2234
2235 inline v_int16x8 v_pack_triplets(const v_int16x8& vec)
2236 {
2237     return v_int16x8(vreinterpretq_s16_s8(vextq_s8(vcombine_s8(vtbl1_s8(vget_low_s8(vreinterpretq_s8_s16(vec.val)), vcreate_s8(0x0504030201000000)), vget_high_s8(vreinterpretq_s8_s16(vec.val))), vdupq_n_s8(0), 2)));
2238 }
2239 inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); }
2240
2241 inline v_int32x4 v_pack_triplets(const v_int32x4& vec) { return vec; }
2242 inline v_uint32x4 v_pack_triplets(const v_uint32x4& vec) { return vec; }
2243 inline v_float32x4 v_pack_triplets(const v_float32x4& vec) { return vec; }
2244
2245 #if CV_SIMD128_64F
2246 inline v_float64x2 v_lut(const double* tab, const int* idx)
2247 {
2248     double CV_DECL_ALIGNED(32) elems[2] =
2249     {
2250         tab[idx[0]],
2251         tab[idx[1]]
2252     };
2253     return v_float64x2(vld1q_f64(elems));
2254 }
2255
2256 inline v_float64x2 v_lut_pairs(const double* tab, const int* idx)
2257 {
2258     return v_float64x2(vld1q_f64(tab + idx[0]));
2259 }
2260
2261 inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
2262 {
2263     double CV_DECL_ALIGNED(32) elems[2] =
2264     {
2265         tab[vgetq_lane_s32(idxvec.val, 0)],
2266         tab[vgetq_lane_s32(idxvec.val, 1)],
2267     };
2268     return v_float64x2(vld1q_f64(elems));
2269 }
2270
2271 inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y)
2272 {
2273     int CV_DECL_ALIGNED(32) idx[4];
2274     v_store_aligned(idx, idxvec);
2275
2276     x = v_float64x2(tab[idx[0]], tab[idx[1]]);
2277     y = v_float64x2(tab[idx[0]+1], tab[idx[1]+1]);
2278 }
2279 #endif
2280
2281 ////// FP16 support ///////
2282 #if CV_FP16
2283 inline v_float32x4 v_load_expand(const float16_t* ptr)
2284 {
2285     float16x4_t v =
2286     #ifndef vld1_f16 // APPLE compiler defines vld1_f16 as macro
2287         (float16x4_t)vld1_s16((const short*)ptr);
2288     #else
2289         vld1_f16((const __fp16*)ptr);
2290     #endif
2291     return v_float32x4(vcvt_f32_f16(v));
2292 }
2293
2294 inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
2295 {
2296     float16x4_t hv = vcvt_f16_f32(v.val);
2297
2298     #ifndef vst1_f16 // APPLE compiler defines vst1_f16 as macro
2299         vst1_s16((short*)ptr, (int16x4_t)hv);
2300     #else
2301         vst1_f16((__fp16*)ptr, hv);
2302     #endif
2303 }
2304 #else
2305 inline v_float32x4 v_load_expand(const float16_t* ptr)
2306 {
2307     const int N = 4;
2308     float buf[N];
2309     for( int i = 0; i < N; i++ ) buf[i] = (float)ptr[i];
2310     return v_load(buf);
2311 }
2312
2313 inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
2314 {
2315     const int N = 4;
2316     float buf[N];
2317     v_store(buf, v);
2318     for( int i = 0; i < N; i++ ) ptr[i] = float16_t(buf[i]);
2319 }
2320 #endif
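// Illustrative sketch (hypothetical helper): scale half-precision samples by widening
// to float and packing back. Assumes n is a multiple of 4.
inline void v_example_scale_fp16(const float16_t* src, float16_t* dst, float scale, int n)
{
    for (int i = 0; i < n; i += 4)
        v_pack_store(dst + i, v_load_expand(src + i) * v_setall_f32(scale));
}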
2321
2322 inline void v_cleanup() {}
2323
2324 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
2325
2326 //! @endcond
2327
2328 }
2329
2330 #endif