1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 //  By downloading, copying, installing or using the software you agree to this license.
6 //  If you do not agree to this license, do not download, install,
7 //  copy or use the software.
8 //
9 //
10 //                          License Agreement
11 //                For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
15 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
16 // Copyright (C) 2015, Itseez Inc., all rights reserved.
17 // Third party copyrights are property of their respective owners.
18 //
19 // Redistribution and use in source and binary forms, with or without modification,
20 // are permitted provided that the following conditions are met:
21 //
22 //   * Redistribution's of source code must retain the above copyright notice,
23 //     this list of conditions and the following disclaimer.
24 //
25 //   * Redistribution's in binary form must reproduce the above copyright notice,
26 //     this list of conditions and the following disclaimer in the documentation
27 //     and/or other materials provided with the distribution.
28 //
29 //   * The name of the copyright holders may not be used to endorse or promote products
30 //     derived from this software without specific prior written permission.
31 //
32 // This software is provided by the copyright holders and contributors "as is" and
33 // any express or implied warranties, including, but not limited to, the implied
34 // warranties of merchantability and fitness for a particular purpose are disclaimed.
35 // In no event shall the Intel Corporation or contributors be liable for any direct,
36 // indirect, incidental, special, exemplary, or consequential damages
37 // (including, but not limited to, procurement of substitute goods or services;
38 // loss of use, data, or profits; or business interruption) however caused
39 // and on any theory of liability, whether in contract, strict liability,
40 // or tort (including negligence or otherwise) arising in any way out of
41 // the use of this software, even if advised of the possibility of such damage.
42 //
43 //M*/
44
45 #ifndef OPENCV_HAL_INTRIN_NEON_HPP
46 #define OPENCV_HAL_INTRIN_NEON_HPP
47
48 #include <algorithm>
49 #include "opencv2/core/utility.hpp"
50
51 namespace cv
52 {
53
54 //! @cond IGNORED
55
56 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
57
58 #define CV_SIMD128 1
59 #if defined(__aarch64__)
60 #define CV_SIMD128_64F 1
61 #else
62 #define CV_SIMD128_64F 0
63 #endif
64
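// Reinterpret helpers to and from float64x2_t, implemented as plain casts.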
65 #if CV_SIMD128_64F
66 #define OPENCV_HAL_IMPL_NEON_REINTERPRET(_Tpv, suffix) \
67 template <typename T> static inline \
68 _Tpv vreinterpretq_##suffix##_f64(T a) { return (_Tpv) a; } \
69 template <typename T> static inline \
70 float64x2_t vreinterpretq_f64_##suffix(T a) { return (float64x2_t) a; }
71 OPENCV_HAL_IMPL_NEON_REINTERPRET(uint8x16_t, u8)
72 OPENCV_HAL_IMPL_NEON_REINTERPRET(int8x16_t, s8)
73 OPENCV_HAL_IMPL_NEON_REINTERPRET(uint16x8_t, u16)
74 OPENCV_HAL_IMPL_NEON_REINTERPRET(int16x8_t, s16)
75 OPENCV_HAL_IMPL_NEON_REINTERPRET(uint32x4_t, u32)
76 OPENCV_HAL_IMPL_NEON_REINTERPRET(int32x4_t, s32)
77 OPENCV_HAL_IMPL_NEON_REINTERPRET(uint64x2_t, u64)
78 OPENCV_HAL_IMPL_NEON_REINTERPRET(int64x2_t, s64)
79 OPENCV_HAL_IMPL_NEON_REINTERPRET(float32x4_t, f32)
80 #endif
81
82 struct v_uint8x16
83 {
84     typedef uchar lane_type;
85     enum { nlanes = 16 };
86
87     v_uint8x16() {}
88     explicit v_uint8x16(uint8x16_t v) : val(v) {}
89     v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
90                uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
91     {
92         uchar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
93         val = vld1q_u8(v);
94     }
95     uchar get0() const
96     {
97         return vgetq_lane_u8(val, 0);
98     }
99
100     uint8x16_t val;
101 };
102
103 struct v_int8x16
104 {
105     typedef schar lane_type;
106     enum { nlanes = 16 };
107
108     v_int8x16() {}
109     explicit v_int8x16(int8x16_t v) : val(v) {}
110     v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
111                schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
112     {
113         schar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
114         val = vld1q_s8(v);
115     }
116     schar get0() const
117     {
118         return vgetq_lane_s8(val, 0);
119     }
120
121     int8x16_t val;
122 };
123
124 struct v_uint16x8
125 {
126     typedef ushort lane_type;
127     enum { nlanes = 8 };
128
129     v_uint16x8() {}
130     explicit v_uint16x8(uint16x8_t v) : val(v) {}
131     v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
132     {
133         ushort v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
134         val = vld1q_u16(v);
135     }
136     ushort get0() const
137     {
138         return vgetq_lane_u16(val, 0);
139     }
140
141     uint16x8_t val;
142 };
143
144 struct v_int16x8
145 {
146     typedef short lane_type;
147     enum { nlanes = 8 };
148
149     v_int16x8() {}
150     explicit v_int16x8(int16x8_t v) : val(v) {}
151     v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
152     {
153         short v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
154         val = vld1q_s16(v);
155     }
156     short get0() const
157     {
158         return vgetq_lane_s16(val, 0);
159     }
160
161     int16x8_t val;
162 };
163
164 struct v_uint32x4
165 {
166     typedef unsigned lane_type;
167     enum { nlanes = 4 };
168
169     v_uint32x4() {}
170     explicit v_uint32x4(uint32x4_t v) : val(v) {}
171     v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
172     {
173         unsigned v[] = {v0, v1, v2, v3};
174         val = vld1q_u32(v);
175     }
176     unsigned get0() const
177     {
178         return vgetq_lane_u32(val, 0);
179     }
180
181     uint32x4_t val;
182 };
183
184 struct v_int32x4
185 {
186     typedef int lane_type;
187     enum { nlanes = 4 };
188
189     v_int32x4() {}
190     explicit v_int32x4(int32x4_t v) : val(v) {}
191     v_int32x4(int v0, int v1, int v2, int v3)
192     {
193         int v[] = {v0, v1, v2, v3};
194         val = vld1q_s32(v);
195     }
196     int get0() const
197     {
198         return vgetq_lane_s32(val, 0);
199     }
200     int32x4_t val;
201 };
202
203 struct v_float32x4
204 {
205     typedef float lane_type;
206     enum { nlanes = 4 };
207
208     v_float32x4() {}
209     explicit v_float32x4(float32x4_t v) : val(v) {}
210     v_float32x4(float v0, float v1, float v2, float v3)
211     {
212         float v[] = {v0, v1, v2, v3};
213         val = vld1q_f32(v);
214     }
215     float get0() const
216     {
217         return vgetq_lane_f32(val, 0);
218     }
219     float32x4_t val;
220 };
221
222 struct v_uint64x2
223 {
224     typedef uint64 lane_type;
225     enum { nlanes = 2 };
226
227     v_uint64x2() {}
228     explicit v_uint64x2(uint64x2_t v) : val(v) {}
229     v_uint64x2(uint64 v0, uint64 v1)
230     {
231         uint64 v[] = {v0, v1};
232         val = vld1q_u64(v);
233     }
234     uint64 get0() const
235     {
236         return vgetq_lane_u64(val, 0);
237     }
238     uint64x2_t val;
239 };
240
241 struct v_int64x2
242 {
243     typedef int64 lane_type;
244     enum { nlanes = 2 };
245
246     v_int64x2() {}
247     explicit v_int64x2(int64x2_t v) : val(v) {}
248     v_int64x2(int64 v0, int64 v1)
249     {
250         int64 v[] = {v0, v1};
251         val = vld1q_s64(v);
252     }
253     int64 get0() const
254     {
255         return vgetq_lane_s64(val, 0);
256     }
257     int64x2_t val;
258 };
259
260 #if CV_SIMD128_64F
261 struct v_float64x2
262 {
263     typedef double lane_type;
264     enum { nlanes = 2 };
265
266     v_float64x2() {}
267     explicit v_float64x2(float64x2_t v) : val(v) {}
268     v_float64x2(double v0, double v1)
269     {
270         double v[] = {v0, v1};
271         val = vld1q_f64(v);
272     }
273     double get0() const
274     {
275         return vgetq_lane_f64(val, 0);
276     }
277     float64x2_t val;
278 };
279 #endif
280
281 #if CV_FP16
282 // Workaround for old compilers
283 static inline int16x8_t vreinterpretq_s16_f16(float16x8_t a) { return (int16x8_t)a; }
284 static inline float16x8_t vreinterpretq_f16_s16(int16x8_t a) { return (float16x8_t)a; }
285 static inline int16x4_t vreinterpret_s16_f16(float16x4_t a) { return (int16x4_t)a; }
286 static inline float16x4_t vreinterpret_f16_s16(int16x4_t a) { return (float16x4_t)a; }
287
288 static inline float16x8_t cv_vld1q_f16(const void* ptr)
289 {
290 #ifndef vld1q_f16 // APPLE compiler defines vld1q_f16 as macro
291     return vreinterpretq_f16_s16(vld1q_s16((const short*)ptr));
292 #else
293     return vld1q_f16((const __fp16*)ptr);
294 #endif
295 }
296 static inline void cv_vst1q_f16(void* ptr, float16x8_t a)
297 {
298 #ifndef vst1q_f16 // APPLE compiler defines vst1q_f16 as macro
299     vst1q_s16((short*)ptr, vreinterpretq_s16_f16(a));
300 #else
301     vst1q_f16((__fp16*)ptr, a);
302 #endif
303 }
304
305 static inline float16x4_t cv_vld1_f16(const void* ptr)
306 {
307 #ifndef vld1_f16 // APPLE compiler defines vld1_f16 as macro
308     return vreinterpret_f16_s16(vld1_s16((const short*)ptr));
309 #else
310     return vld1_f16((const __fp16*)ptr);
311 #endif
312 }
313 static inline void cv_vst1_f16(void* ptr, float16x4_t a)
314 {
315 #ifndef vst1_f16 // APPLE compiler defines vst1_f16 as macro
316     vst1_s16((short*)ptr, vreinterpret_s16_f16(a));
317 #else
318     vst1_f16((__fp16*)ptr, a);
319 #endif
320 }
321
322
323 struct v_float16x8
324 {
325     typedef short lane_type;
326     enum { nlanes = 8 };
327
328     v_float16x8() {}
329     explicit v_float16x8(float16x8_t v) : val(v) {}
330     v_float16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
331     {
332         short v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
333         val = cv_vld1q_f16(v);
334     }
335     short get0() const
336     {
337         return vgetq_lane_s16(vreinterpretq_s16_f16(val), 0);
338     }
339     float16x8_t val;
340 };
341
342 inline v_float16x8 v_setzero_f16() { return v_float16x8(vreinterpretq_f16_s16(vdupq_n_s16((short)0))); }
343 inline v_float16x8 v_setall_f16(short v) { return v_float16x8(vreinterpretq_f16_s16(vdupq_n_s16(v))); }
344 #endif
345
346 #define OPENCV_HAL_IMPL_NEON_INIT(_Tpv, _Tp, suffix) \
347 inline v_##_Tpv v_setzero_##suffix() { return v_##_Tpv(vdupq_n_##suffix((_Tp)0)); } \
348 inline v_##_Tpv v_setall_##suffix(_Tp v) { return v_##_Tpv(vdupq_n_##suffix(v)); } \
349 inline _Tpv##_t vreinterpretq_##suffix##_##suffix(_Tpv##_t v) { return v; } \
350 inline v_uint8x16 v_reinterpret_as_u8(const v_##_Tpv& v) { return v_uint8x16(vreinterpretq_u8_##suffix(v.val)); } \
351 inline v_int8x16 v_reinterpret_as_s8(const v_##_Tpv& v) { return v_int8x16(vreinterpretq_s8_##suffix(v.val)); } \
352 inline v_uint16x8 v_reinterpret_as_u16(const v_##_Tpv& v) { return v_uint16x8(vreinterpretq_u16_##suffix(v.val)); } \
353 inline v_int16x8 v_reinterpret_as_s16(const v_##_Tpv& v) { return v_int16x8(vreinterpretq_s16_##suffix(v.val)); } \
354 inline v_uint32x4 v_reinterpret_as_u32(const v_##_Tpv& v) { return v_uint32x4(vreinterpretq_u32_##suffix(v.val)); } \
355 inline v_int32x4 v_reinterpret_as_s32(const v_##_Tpv& v) { return v_int32x4(vreinterpretq_s32_##suffix(v.val)); } \
356 inline v_uint64x2 v_reinterpret_as_u64(const v_##_Tpv& v) { return v_uint64x2(vreinterpretq_u64_##suffix(v.val)); } \
357 inline v_int64x2 v_reinterpret_as_s64(const v_##_Tpv& v) { return v_int64x2(vreinterpretq_s64_##suffix(v.val)); } \
358 inline v_float32x4 v_reinterpret_as_f32(const v_##_Tpv& v) { return v_float32x4(vreinterpretq_f32_##suffix(v.val)); }
359
360 OPENCV_HAL_IMPL_NEON_INIT(uint8x16, uchar, u8)
361 OPENCV_HAL_IMPL_NEON_INIT(int8x16, schar, s8)
362 OPENCV_HAL_IMPL_NEON_INIT(uint16x8, ushort, u16)
363 OPENCV_HAL_IMPL_NEON_INIT(int16x8, short, s16)
364 OPENCV_HAL_IMPL_NEON_INIT(uint32x4, unsigned, u32)
365 OPENCV_HAL_IMPL_NEON_INIT(int32x4, int, s32)
366 OPENCV_HAL_IMPL_NEON_INIT(uint64x2, uint64, u64)
367 OPENCV_HAL_IMPL_NEON_INIT(int64x2, int64, s64)
368 OPENCV_HAL_IMPL_NEON_INIT(float32x4, float, f32)
369 #if CV_SIMD128_64F
370 #define OPENCV_HAL_IMPL_NEON_INIT_64(_Tpv, suffix) \
371 inline v_float64x2 v_reinterpret_as_f64(const v_##_Tpv& v) { return v_float64x2(vreinterpretq_f64_##suffix(v.val)); }
372 OPENCV_HAL_IMPL_NEON_INIT(float64x2, double, f64)
373 OPENCV_HAL_IMPL_NEON_INIT_64(uint8x16, u8)
374 OPENCV_HAL_IMPL_NEON_INIT_64(int8x16, s8)
375 OPENCV_HAL_IMPL_NEON_INIT_64(uint16x8, u16)
376 OPENCV_HAL_IMPL_NEON_INIT_64(int16x8, s16)
377 OPENCV_HAL_IMPL_NEON_INIT_64(uint32x4, u32)
378 OPENCV_HAL_IMPL_NEON_INIT_64(int32x4, s32)
379 OPENCV_HAL_IMPL_NEON_INIT_64(uint64x2, u64)
380 OPENCV_HAL_IMPL_NEON_INIT_64(int64x2, s64)
381 OPENCV_HAL_IMPL_NEON_INIT_64(float32x4, f32)
382 OPENCV_HAL_IMPL_NEON_INIT_64(float64x2, f64)
383 #endif
384
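// Narrowing "pack" operations: v_pack/v_pack_u combine two wide vectors into one
// narrower vector via the narrowing intrinsic passed as 'mov' (saturating for most
// types), and the v_rshr_* forms apply a rounding right shift by n ('rshr') before
// narrowing. The *_store variants narrow a single vector and store the half-width result.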
385 #define OPENCV_HAL_IMPL_NEON_PACK(_Tpvec, _Tp, hreg, suffix, _Tpwvec, pack, mov, rshr) \
386 inline _Tpvec v_##pack(const _Tpwvec& a, const _Tpwvec& b) \
387 { \
388     hreg a1 = mov(a.val), b1 = mov(b.val); \
389     return _Tpvec(vcombine_##suffix(a1, b1)); \
390 } \
391 inline void v_##pack##_store(_Tp* ptr, const _Tpwvec& a) \
392 { \
393     hreg a1 = mov(a.val); \
394     vst1_##suffix(ptr, a1); \
395 } \
396 template<int n> inline \
397 _Tpvec v_rshr_##pack(const _Tpwvec& a, const _Tpwvec& b) \
398 { \
399     hreg a1 = rshr(a.val, n); \
400     hreg b1 = rshr(b.val, n); \
401     return _Tpvec(vcombine_##suffix(a1, b1)); \
402 } \
403 template<int n> inline \
404 void v_rshr_##pack##_store(_Tp* ptr, const _Tpwvec& a) \
405 { \
406     hreg a1 = rshr(a.val, n); \
407     vst1_##suffix(ptr, a1); \
408 }
409
410 OPENCV_HAL_IMPL_NEON_PACK(v_uint8x16, uchar, uint8x8_t, u8, v_uint16x8, pack, vqmovn_u16, vqrshrn_n_u16)
411 OPENCV_HAL_IMPL_NEON_PACK(v_int8x16, schar, int8x8_t, s8, v_int16x8, pack, vqmovn_s16, vqrshrn_n_s16)
412 OPENCV_HAL_IMPL_NEON_PACK(v_uint16x8, ushort, uint16x4_t, u16, v_uint32x4, pack, vqmovn_u32, vqrshrn_n_u32)
413 OPENCV_HAL_IMPL_NEON_PACK(v_int16x8, short, int16x4_t, s16, v_int32x4, pack, vqmovn_s32, vqrshrn_n_s32)
414 OPENCV_HAL_IMPL_NEON_PACK(v_uint32x4, unsigned, uint32x2_t, u32, v_uint64x2, pack, vmovn_u64, vrshrn_n_u64)
415 OPENCV_HAL_IMPL_NEON_PACK(v_int32x4, int, int32x2_t, s32, v_int64x2, pack, vmovn_s64, vrshrn_n_s64)
416
417 OPENCV_HAL_IMPL_NEON_PACK(v_uint8x16, uchar, uint8x8_t, u8, v_int16x8, pack_u, vqmovun_s16, vqrshrun_n_s16)
418 OPENCV_HAL_IMPL_NEON_PACK(v_uint16x8, ushort, uint16x4_t, u16, v_int32x4, pack_u, vqmovun_s32, vqrshrun_n_s32)
419
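// Matrix product helpers: the result is the linear combination
// m0*v[0] + m1*v[1] + m2*v[2] + m3*v[3], built with lane-broadcast
// multiply-accumulate (vmlaq_lane_f32); v_matmuladd replaces the m3 term
// with the additive vector 'a'.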
420 inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
421                             const v_float32x4& m1, const v_float32x4& m2,
422                             const v_float32x4& m3)
423 {
424     float32x2_t vl = vget_low_f32(v.val), vh = vget_high_f32(v.val);
425     float32x4_t res = vmulq_lane_f32(m0.val, vl, 0);
426     res = vmlaq_lane_f32(res, m1.val, vl, 1);
427     res = vmlaq_lane_f32(res, m2.val, vh, 0);
428     res = vmlaq_lane_f32(res, m3.val, vh, 1);
429     return v_float32x4(res);
430 }
431
432 inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
433                                const v_float32x4& m1, const v_float32x4& m2,
434                                const v_float32x4& a)
435 {
436     float32x2_t vl = vget_low_f32(v.val), vh = vget_high_f32(v.val);
437     float32x4_t res = vmulq_lane_f32(m0.val, vl, 0);
438     res = vmlaq_lane_f32(res, m1.val, vl, 1);
439     res = vmlaq_lane_f32(res, m2.val, vh, 0);
440     res = vaddq_f32(res, a.val);
441     return v_float32x4(res);
442 }
443
444 #define OPENCV_HAL_IMPL_NEON_BIN_OP(bin_op, _Tpvec, intrin) \
445 inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
446 { \
447     return _Tpvec(intrin(a.val, b.val)); \
448 } \
449 inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
450 { \
451     a.val = intrin(a.val, b.val); \
452     return a; \
453 }
454
455 OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint8x16, vqaddq_u8)
456 OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint8x16, vqsubq_u8)
457 OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int8x16, vqaddq_s8)
458 OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int8x16, vqsubq_s8)
459 OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint16x8, vqaddq_u16)
460 OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint16x8, vqsubq_u16)
461 OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_uint16x8, vmulq_u16)
462 OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int16x8, vqaddq_s16)
463 OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int16x8, vqsubq_s16)
464 OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_int16x8, vmulq_s16)
465 OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int32x4, vaddq_s32)
466 OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int32x4, vsubq_s32)
467 OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_int32x4, vmulq_s32)
468 OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint32x4, vaddq_u32)
469 OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint32x4, vsubq_u32)
470 OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_uint32x4, vmulq_u32)
471 OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_float32x4, vaddq_f32)
472 OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_float32x4, vsubq_f32)
473 OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_float32x4, vmulq_f32)
474 OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int64x2, vaddq_s64)
475 OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int64x2, vsubq_s64)
476 OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint64x2, vaddq_u64)
477 OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint64x2, vsubq_u64)
478 #if CV_SIMD128_64F
479 OPENCV_HAL_IMPL_NEON_BIN_OP(/, v_float32x4, vdivq_f32)
480 OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_float64x2, vaddq_f64)
481 OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_float64x2, vsubq_f64)
482 OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_float64x2, vmulq_f64)
483 OPENCV_HAL_IMPL_NEON_BIN_OP(/, v_float64x2, vdivq_f64)
484 #else
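// Without vdivq_f32 (32-bit ARM), approximate 1/b with vrecpeq_f32, refine the
// estimate with two Newton-Raphson steps (vrecpsq_f32), and multiply by a.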
485 inline v_float32x4 operator / (const v_float32x4& a, const v_float32x4& b)
486 {
487     float32x4_t reciprocal = vrecpeq_f32(b.val);
488     reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal);
489     reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal);
490     return v_float32x4(vmulq_f32(a.val, reciprocal));
491 }
492 inline v_float32x4& operator /= (v_float32x4& a, const v_float32x4& b)
493 {
494     float32x4_t reciprocal = vrecpeq_f32(b.val);
495     reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal);
496     reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal);
497     a.val = vmulq_f32(a.val, reciprocal);
498     return a;
499 }
500 #endif
501
502 inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b,
503                          v_int32x4& c, v_int32x4& d)
504 {
505     c.val = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val));
506     d.val = vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val));
507 }
508
509 inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b,
510                          v_uint32x4& c, v_uint32x4& d)
511 {
512     c.val = vmull_u16(vget_low_u16(a.val), vget_low_u16(b.val));
513     d.val = vmull_u16(vget_high_u16(a.val), vget_high_u16(b.val));
514 }
515
516 inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b,
517                          v_uint64x2& c, v_uint64x2& d)
518 {
519     c.val = vmull_u32(vget_low_u32(a.val), vget_low_u32(b.val));
520     d.val = vmull_u32(vget_high_u32(a.val), vget_high_u32(b.val));
521 }
522
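// Dot product of adjacent 16-bit pairs: widen-multiply the low and high halves,
// de-interleave the 32-bit products with vuzpq_s32 and add them, so each output
// lane holds a[2i]*b[2i] + a[2i+1]*b[2i+1]; the three-operand form adds 'c' on top.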
523 inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
524 {
525     int32x4_t c = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val));
526     int32x4_t d = vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val));
527     int32x4x2_t cd = vuzpq_s32(c, d);
528     return v_int32x4(vaddq_s32(cd.val[0], cd.val[1]));
529 }
530
531 inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
532 {
533     v_int32x4 s = v_dotprod(a, b);
534     return v_int32x4(vaddq_s32(s.val , c.val));
535 }
536
537 #define OPENCV_HAL_IMPL_NEON_LOGIC_OP(_Tpvec, suffix) \
538     OPENCV_HAL_IMPL_NEON_BIN_OP(&, _Tpvec, vandq_##suffix) \
539     OPENCV_HAL_IMPL_NEON_BIN_OP(|, _Tpvec, vorrq_##suffix) \
540     OPENCV_HAL_IMPL_NEON_BIN_OP(^, _Tpvec, veorq_##suffix) \
541     inline _Tpvec operator ~ (const _Tpvec& a) \
542     { \
543         return _Tpvec(vreinterpretq_##suffix##_u8(vmvnq_u8(vreinterpretq_u8_##suffix(a.val)))); \
544     }
545
546 OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint8x16, u8)
547 OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int8x16, s8)
548 OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint16x8, u16)
549 OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int16x8, s16)
550 OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint32x4, u32)
551 OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int32x4, s32)
552 OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint64x2, u64)
553 OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int64x2, s64)
554
555 #define OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(bin_op, intrin) \
556 inline v_float32x4 operator bin_op (const v_float32x4& a, const v_float32x4& b) \
557 { \
558     return v_float32x4(vreinterpretq_f32_s32(intrin(vreinterpretq_s32_f32(a.val), vreinterpretq_s32_f32(b.val)))); \
559 } \
560 inline v_float32x4& operator bin_op##= (v_float32x4& a, const v_float32x4& b) \
561 { \
562     a.val = vreinterpretq_f32_s32(intrin(vreinterpretq_s32_f32(a.val), vreinterpretq_s32_f32(b.val))); \
563     return a; \
564 }
565
566 OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(&, vandq_s32)
567 OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(|, vorrq_s32)
568 OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(^, veorq_s32)
569
570 inline v_float32x4 operator ~ (const v_float32x4& a)
571 {
572     return v_float32x4(vreinterpretq_f32_s32(vmvnq_s32(vreinterpretq_s32_f32(a.val))));
573 }
574
575 #if CV_SIMD128_64F
576 inline v_float32x4 v_sqrt(const v_float32x4& x)
577 {
578     return v_float32x4(vsqrtq_f32(x.val));
579 }
580
581 inline v_float32x4 v_invsqrt(const v_float32x4& x)
582 {
583     v_float32x4 one = v_setall_f32(1.0f);
584     return one / v_sqrt(x);
585 }
586 #else
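// Without vsqrtq_f32 (32-bit ARM), use the reciprocal square root estimate
// (vrsqrteq_f32) refined by two Newton-Raphson steps (vrsqrtsq_f32); the input is
// clamped to FLT_MIN so that a zero input does not turn into NaN.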
587 inline v_float32x4 v_sqrt(const v_float32x4& x)
588 {
589     float32x4_t x1 = vmaxq_f32(x.val, vdupq_n_f32(FLT_MIN));
590     float32x4_t e = vrsqrteq_f32(x1);
591     e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x1, e), e), e);
592     e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x1, e), e), e);
593     return v_float32x4(vmulq_f32(x.val, e));
594 }
595
596 inline v_float32x4 v_invsqrt(const v_float32x4& x)
597 {
598     float32x4_t e = vrsqrteq_f32(x.val);
599     e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x.val, e), e), e);
600     e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x.val, e), e), e);
601     return v_float32x4(e);
602 }
603 #endif
604
605 #define OPENCV_HAL_IMPL_NEON_ABS(_Tpuvec, _Tpsvec, usuffix, ssuffix) \
606 inline _Tpuvec v_abs(const _Tpsvec& a) { return v_reinterpret_as_##usuffix(_Tpsvec(vabsq_##ssuffix(a.val))); }
607
608 OPENCV_HAL_IMPL_NEON_ABS(v_uint8x16, v_int8x16, u8, s8)
609 OPENCV_HAL_IMPL_NEON_ABS(v_uint16x8, v_int16x8, u16, s16)
610 OPENCV_HAL_IMPL_NEON_ABS(v_uint32x4, v_int32x4, u32, s32)
611
612 inline v_float32x4 v_abs(v_float32x4 x)
613 { return v_float32x4(vabsq_f32(x.val)); }
614
615 #if CV_SIMD128_64F
616 #define OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(bin_op, intrin) \
617 inline v_float64x2 operator bin_op (const v_float64x2& a, const v_float64x2& b) \
618 { \
619     return v_float64x2(vreinterpretq_f64_s64(intrin(vreinterpretq_s64_f64(a.val), vreinterpretq_s64_f64(b.val)))); \
620 } \
621 inline v_float64x2& operator bin_op##= (v_float64x2& a, const v_float64x2& b) \
622 { \
623     a.val = vreinterpretq_f64_s64(intrin(vreinterpretq_s64_f64(a.val), vreinterpretq_s64_f64(b.val))); \
624     return a; \
625 }
626
627 OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(&, vandq_s64)
628 OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(|, vorrq_s64)
629 OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(^, veorq_s64)
630
631 inline v_float64x2 operator ~ (const v_float64x2& a)
632 {
633     return v_float64x2(vreinterpretq_f64_s32(vmvnq_s32(vreinterpretq_s32_f64(a.val))));
634 }
635
636 inline v_float64x2 v_sqrt(const v_float64x2& x)
637 {
638     return v_float64x2(vsqrtq_f64(x.val));
639 }
640
641 inline v_float64x2 v_invsqrt(const v_float64x2& x)
642 {
643     v_float64x2 one = v_setall_f64(1.0f);
644     return one / v_sqrt(x);
645 }
646
647 inline v_float64x2 v_abs(v_float64x2 x)
648 { return v_float64x2(vabsq_f64(x.val)); }
649 #endif
650
651 // TODO: exp, log, sin, cos
652
653 #define OPENCV_HAL_IMPL_NEON_BIN_FUNC(_Tpvec, func, intrin) \
654 inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
655 { \
656     return _Tpvec(intrin(a.val, b.val)); \
657 }
658
659 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_min, vminq_u8)
660 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_max, vmaxq_u8)
661 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_min, vminq_s8)
662 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_max, vmaxq_s8)
663 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_min, vminq_u16)
664 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_max, vmaxq_u16)
665 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_min, vminq_s16)
666 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_max, vmaxq_s16)
667 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_min, vminq_u32)
668 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_max, vmaxq_u32)
669 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int32x4, v_min, vminq_s32)
670 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int32x4, v_max, vmaxq_s32)
671 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_min, vminq_f32)
672 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_max, vmaxq_f32)
673 #if CV_SIMD128_64F
674 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float64x2, v_min, vminq_f64)
675 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float64x2, v_max, vmaxq_f64)
676 #endif
677
678 #if CV_SIMD128_64F
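// NEON provides no 64-bit vmvnq (bitwise NOT), so emulate it for the 64-bit
// comparison operators below by XOR-ing with an all-ones constant.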
679 inline int64x2_t vmvnq_s64(int64x2_t a)
680 {
681     int64x2_t vx = vreinterpretq_s64_u32(vdupq_n_u32(0xFFFFFFFF));
682     return veorq_s64(a, vx);
683 }
684 inline uint64x2_t vmvnq_u64(uint64x2_t a)
685 {
686     uint64x2_t vx = vreinterpretq_u64_u32(vdupq_n_u32(0xFFFFFFFF));
687     return veorq_u64(a, vx);
688 }
689 #endif
690 #define OPENCV_HAL_IMPL_NEON_INT_CMP_OP(_Tpvec, cast, suffix, not_suffix) \
691 inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
692 { return _Tpvec(cast(vceqq_##suffix(a.val, b.val))); } \
693 inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
694 { return _Tpvec(cast(vmvnq_##not_suffix(vceqq_##suffix(a.val, b.val)))); } \
695 inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
696 { return _Tpvec(cast(vcltq_##suffix(a.val, b.val))); } \
697 inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
698 { return _Tpvec(cast(vcgtq_##suffix(a.val, b.val))); } \
699 inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
700 { return _Tpvec(cast(vcleq_##suffix(a.val, b.val))); } \
701 inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
702 { return _Tpvec(cast(vcgeq_##suffix(a.val, b.val))); }
703
704 OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint8x16, OPENCV_HAL_NOP, u8, u8)
705 OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int8x16, vreinterpretq_s8_u8, s8, u8)
706 OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint16x8, OPENCV_HAL_NOP, u16, u16)
707 OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int16x8, vreinterpretq_s16_u16, s16, u16)
708 OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint32x4, OPENCV_HAL_NOP, u32, u32)
709 OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int32x4, vreinterpretq_s32_u32, s32, u32)
710 OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_float32x4, vreinterpretq_f32_u32, f32, u32)
711 #if CV_SIMD128_64F
712 OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint64x2, OPENCV_HAL_NOP, u64, u64)
713 OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int64x2, vreinterpretq_s64_u64, s64, u64)
714 OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_float64x2, vreinterpretq_f64_u64, f64, u64)
715 #endif
716
717 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_add_wrap, vaddq_u8)
718 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_add_wrap, vaddq_s8)
719 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_add_wrap, vaddq_u16)
720 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_add_wrap, vaddq_s16)
721 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_sub_wrap, vsubq_u8)
722 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_sub_wrap, vsubq_s8)
723 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_sub_wrap, vsubq_u16)
724 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_sub_wrap, vsubq_s16)
725
726 // TODO: absdiff for signed integers
727 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_absdiff, vabdq_u8)
728 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_absdiff, vabdq_u16)
729 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_absdiff, vabdq_u32)
730 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_absdiff, vabdq_f32)
731 #if CV_SIMD128_64F
732 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float64x2, v_absdiff, vabdq_f64)
733 #endif
734
735 #define OPENCV_HAL_IMPL_NEON_BIN_FUNC2(_Tpvec, _Tpvec2, cast, func, intrin) \
736 inline _Tpvec2 func(const _Tpvec& a, const _Tpvec& b) \
737 { \
738     return _Tpvec2(cast(intrin(a.val, b.val))); \
739 }
740
741 OPENCV_HAL_IMPL_NEON_BIN_FUNC2(v_int8x16, v_uint8x16, vreinterpretq_u8_s8, v_absdiff, vabdq_s8)
742 OPENCV_HAL_IMPL_NEON_BIN_FUNC2(v_int16x8, v_uint16x8, vreinterpretq_u16_s16, v_absdiff, vabdq_s16)
743 OPENCV_HAL_IMPL_NEON_BIN_FUNC2(v_int32x4, v_uint32x4, vreinterpretq_u32_s32, v_absdiff, vabdq_s32)
744
745 inline v_float32x4 v_magnitude(const v_float32x4& a, const v_float32x4& b)
746 {
747     v_float32x4 x(vmlaq_f32(vmulq_f32(a.val, a.val), b.val, b.val));
748     return v_sqrt(x);
749 }
750
751 inline v_float32x4 v_sqr_magnitude(const v_float32x4& a, const v_float32x4& b)
752 {
753     return v_float32x4(vmlaq_f32(vmulq_f32(a.val, a.val), b.val, b.val));
754 }
755
756 inline v_float32x4 v_fma(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
757 {
758 #if CV_SIMD128_64F
759     // ARMv8, which adds support for 64-bit floating-point (so CV_SIMD128_64F is defined),
760     // also adds FMA support both for single- and double-precision floating-point vectors
761     return v_float32x4(vfmaq_f32(c.val, a.val, b.val));
762 #else
763     return v_float32x4(vmlaq_f32(c.val, a.val, b.val));
764 #endif
765 }
766
767 inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
768 {
769     return v_int32x4(vmlaq_s32(c.val, a.val, b.val));
770 }
771
772 inline v_float32x4 v_muladd(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
773 {
774     return v_fma(a, b, c);
775 }
776
777 inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
778 {
779     return v_fma(a, b, c);
780 }
781
782 #if CV_SIMD128_64F
783 inline v_float64x2 v_magnitude(const v_float64x2& a, const v_float64x2& b)
784 {
785     v_float64x2 x(vaddq_f64(vmulq_f64(a.val, a.val), vmulq_f64(b.val, b.val)));
786     return v_sqrt(x);
787 }
788
789 inline v_float64x2 v_sqr_magnitude(const v_float64x2& a, const v_float64x2& b)
790 {
791     return v_float64x2(vaddq_f64(vmulq_f64(a.val, a.val), vmulq_f64(b.val, b.val)));
792 }
793
794 inline v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
795 {
796     return v_float64x2(vfmaq_f64(c.val, a.val, b.val));
797 }
798
799 inline v_float64x2 v_muladd(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
800 {
801     return v_fma(a, b, c);
802 }
803 #endif
804
805 // trade efficiency for convenience
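// Run-time shifts use vshlq with a broadcast shift count (negated for >>, since
// NEON shifts right via a negative left-shift amount); the v_shl/v_shr/v_rshr
// templates map to the immediate-shift intrinsics instead.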
806 #define OPENCV_HAL_IMPL_NEON_SHIFT_OP(_Tpvec, suffix, _Tps, ssuffix) \
807 inline _Tpvec operator << (const _Tpvec& a, int n) \
808 { return _Tpvec(vshlq_##suffix(a.val, vdupq_n_##ssuffix((_Tps)n))); } \
809 inline _Tpvec operator >> (const _Tpvec& a, int n) \
810 { return _Tpvec(vshlq_##suffix(a.val, vdupq_n_##ssuffix((_Tps)-n))); } \
811 template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
812 { return _Tpvec(vshlq_n_##suffix(a.val, n)); } \
813 template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
814 { return _Tpvec(vshrq_n_##suffix(a.val, n)); } \
815 template<int n> inline _Tpvec v_rshr(const _Tpvec& a) \
816 { return _Tpvec(vrshrq_n_##suffix(a.val, n)); }
817
818 OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint8x16, u8, schar, s8)
819 OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int8x16, s8, schar, s8)
820 OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint16x8, u16, short, s16)
821 OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int16x8, s16, short, s16)
822 OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint32x4, u32, int, s32)
823 OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int32x4, s32, int, s32)
824 OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint64x2, u64, int64, s64)
825 OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int64x2, s64, int64, s64)
826
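// Lane rotations are built on vextq, which extracts a full vector from the
// concatenation of two inputs; rotating zeros in uses vdupq_n_*(0) as the second
// operand, and the <0> specializations of v_rotate_left avoid an out-of-range
// vextq immediate.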
827 #define OPENCV_HAL_IMPL_NEON_ROTATE_OP(_Tpvec, suffix) \
828 template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a) \
829 { return _Tpvec(vextq_##suffix(a.val, vdupq_n_##suffix(0), n)); } \
830 template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a) \
831 { return _Tpvec(vextq_##suffix(vdupq_n_##suffix(0), a.val, _Tpvec::nlanes - n)); } \
832 template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a) \
833 { return a; } \
834 template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \
835 { return _Tpvec(vextq_##suffix(a.val, b.val, n)); } \
836 template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
837 { return _Tpvec(vextq_##suffix(b.val, a.val, _Tpvec::nlanes - n)); } \
838 template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a, const _Tpvec& b) \
839 { CV_UNUSED(b); return a; }
840
841 OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_uint8x16, u8)
842 OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_int8x16, s8)
843 OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_uint16x8, u16)
844 OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_int16x8, s16)
845 OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_uint32x4, u32)
846 OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_int32x4, s32)
847 OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_float32x4, f32)
848 OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_uint64x2, u64)
849 OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_int64x2, s64)
850 #if CV_SIMD128_64F
851 OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_float64x2, f64)
852 #endif
853
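// Loads and stores: the aligned and "nocache" variants map to the same
// vld1q/vst1q as the unaligned ones, since the NEON intrinsics used here expose
// no separate aligned or non-temporal forms.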
854 #define OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(_Tpvec, _Tp, suffix) \
855 inline _Tpvec v_load(const _Tp* ptr) \
856 { return _Tpvec(vld1q_##suffix(ptr)); } \
857 inline _Tpvec v_load_aligned(const _Tp* ptr) \
858 { return _Tpvec(vld1q_##suffix(ptr)); } \
859 inline _Tpvec v_load_low(const _Tp* ptr) \
860 { return _Tpvec(vcombine_##suffix(vld1_##suffix(ptr), vdup_n_##suffix((_Tp)0))); } \
861 inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
862 { return _Tpvec(vcombine_##suffix(vld1_##suffix(ptr0), vld1_##suffix(ptr1))); } \
863 inline void v_store(_Tp* ptr, const _Tpvec& a) \
864 { vst1q_##suffix(ptr, a.val); } \
865 inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
866 { vst1q_##suffix(ptr, a.val); } \
867 inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
868 { vst1q_##suffix(ptr, a.val); } \
869 inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/) \
870 { vst1q_##suffix(ptr, a.val); } \
871 inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
872 { vst1_##suffix(ptr, vget_low_##suffix(a.val)); } \
873 inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
874 { vst1_##suffix(ptr, vget_high_##suffix(a.val)); }
875
876 OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint8x16, uchar, u8)
877 OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int8x16, schar, s8)
878 OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint16x8, ushort, u16)
879 OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int16x8, short, s16)
880 OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint32x4, unsigned, u32)
881 OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int32x4, int, s32)
882 OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint64x2, uint64, u64)
883 OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int64x2, int64, s64)
884 OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float32x4, float, f32)
885 #if CV_SIMD128_64F
886 OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float64x2, double, f64)
887 #endif
888
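// Minimal usage sketch (illustrative only; user code normally reaches these
// wrappers through opencv2/core/hal/intrin.hpp):
//
//     float a[4] = {1, 2, 3, 4}, b[4] = {5, 6, 7, 8}, c[4];
//     v_float32x4 va = v_load(a), vb = v_load(b);
//     v_store(c, va + vb);   // c = {6, 8, 10, 12}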
889 #if CV_FP16
890 // Workaround for old compilers
891 inline v_float16x8 v_load_f16(const short* ptr)
892 { return v_float16x8(cv_vld1q_f16(ptr)); }
893 inline v_float16x8 v_load_f16_aligned(const short* ptr)
894 { return v_float16x8(cv_vld1q_f16(ptr)); }
895
896 inline void v_store(short* ptr, const v_float16x8& a)
897 { cv_vst1q_f16(ptr, a.val); }
898 inline void v_store_aligned(short* ptr, const v_float16x8& a)
899 { cv_vst1q_f16(ptr, a.val); }
900 #endif
901
902 #define OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
903 inline scalartype v_reduce_##func(const _Tpvec& a) \
904 { \
905     _Tpnvec##_t a0 = vp##vectorfunc##_##suffix(vget_low_##suffix(a.val), vget_high_##suffix(a.val)); \
906     a0 = vp##vectorfunc##_##suffix(a0, a0); \
907     return (scalartype)vget_lane_##suffix(vp##vectorfunc##_##suffix(a0, a0),0); \
908 }
909
910 OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_uint16x8, uint16x4, unsigned short, sum, add, u16)
911 OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_uint16x8, uint16x4, unsigned short, max, max, u16)
912 OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_uint16x8, uint16x4, unsigned short, min, min, u16)
913 OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_int16x8, int16x4, short, sum, add, s16)
914 OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_int16x8, int16x4, short, max, max, s16)
915 OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_int16x8, int16x4, short, min, min, s16)
916
917 #define OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
918 inline scalartype v_reduce_##func(const _Tpvec& a) \
919 { \
920     _Tpnvec##_t a0 = vp##vectorfunc##_##suffix(vget_low_##suffix(a.val), vget_high_##suffix(a.val)); \
921     return (scalartype)vget_lane_##suffix(vp##vectorfunc##_##suffix(a0, vget_high_##suffix(a.val)),0); \
922 }
923
924 OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, uint32x2, unsigned, sum, add, u32)
925 OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, uint32x2, unsigned, max, max, u32)
926 OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, uint32x2, unsigned, min, min, u32)
927 OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_int32x4, int32x2, int, sum, add, s32)
928 OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_int32x4, int32x2, int, max, max, s32)
929 OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_int32x4, int32x2, int, min, min, s32)
930 OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, sum, add, f32)
931 OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, max, max, f32)
932 OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, min, min, f32)
933
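// 4-way horizontal sum: transpose pairs with vtrnq_f32 and add, then recombine
// low/high halves so the result lanes are (sum(a), sum(b), sum(c), sum(d)).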
934 inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
935                                  const v_float32x4& c, const v_float32x4& d)
936 {
937     float32x4x2_t ab = vtrnq_f32(a.val, b.val);
938     float32x4x2_t cd = vtrnq_f32(c.val, d.val);
939
940     float32x4_t u0 = vaddq_f32(ab.val[0], ab.val[1]); // a0+a1 b0+b1 a2+a3 b2+b3
941     float32x4_t u1 = vaddq_f32(cd.val[0], cd.val[1]); // c0+c1 d0+d1 c2+c3 d2+d3
942
943     float32x4_t v0 = vcombine_f32(vget_low_f32(u0), vget_low_f32(u1));
944     float32x4_t v1 = vcombine_f32(vget_high_f32(u0), vget_high_f32(u1));
945
946     return v_float32x4(vaddq_f32(v0, v1));
947 }
948
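// Population count: count bits per byte with vcntq_u8, then widen the byte
// counts into 32-bit lanes with two pairwise add-long steps.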
949 #define OPENCV_HAL_IMPL_NEON_POPCOUNT(_Tpvec, cast) \
950 inline v_uint32x4 v_popcount(const _Tpvec& a) \
951 { \
952     uint8x16_t t = vcntq_u8(cast(a.val)); \
953     uint16x8_t t0 = vpaddlq_u8(t);  /* 16 -> 8 */ \
954     uint32x4_t t1 = vpaddlq_u16(t0); /* 8 -> 4 */ \
955     return v_uint32x4(t1); \
956 }
957
958 OPENCV_HAL_IMPL_NEON_POPCOUNT(v_uint8x16, OPENCV_HAL_NOP)
959 OPENCV_HAL_IMPL_NEON_POPCOUNT(v_uint16x8, vreinterpretq_u8_u16)
960 OPENCV_HAL_IMPL_NEON_POPCOUNT(v_uint32x4, vreinterpretq_u8_u32)
961 OPENCV_HAL_IMPL_NEON_POPCOUNT(v_int8x16, vreinterpretq_u8_s8)
962 OPENCV_HAL_IMPL_NEON_POPCOUNT(v_int16x8, vreinterpretq_u8_s16)
963 OPENCV_HAL_IMPL_NEON_POPCOUNT(v_int32x4, vreinterpretq_u8_s32)
964
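// v_signmask packs the most significant bit of every lane into an integer bit
// mask: each MSB is moved to its lane-index position with a per-lane vshlq,
// the lanes are summed with pairwise add-long, and the two 64-bit halves are
// combined in scalar code.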
965 inline int v_signmask(const v_uint8x16& a)
966 {
967     int8x8_t m0 = vcreate_s8(CV_BIG_UINT(0x0706050403020100));
968     uint8x16_t v0 = vshlq_u8(vshrq_n_u8(a.val, 7), vcombine_s8(m0, m0));
969     uint64x2_t v1 = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(v0)));
970     return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 8);
971 }
972 inline int v_signmask(const v_int8x16& a)
973 { return v_signmask(v_reinterpret_as_u8(a)); }
974
975 inline int v_signmask(const v_uint16x8& a)
976 {
977     int16x4_t m0 = vcreate_s16(CV_BIG_UINT(0x0003000200010000));
978     uint16x8_t v0 = vshlq_u16(vshrq_n_u16(a.val, 15), vcombine_s16(m0, m0));
979     uint64x2_t v1 = vpaddlq_u32(vpaddlq_u16(v0));
980     return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 4);
981 }
982 inline int v_signmask(const v_int16x8& a)
983 { return v_signmask(v_reinterpret_as_u16(a)); }
984
985 inline int v_signmask(const v_uint32x4& a)
986 {
987     int32x2_t m0 = vcreate_s32(CV_BIG_UINT(0x0000000100000000));
988     uint32x4_t v0 = vshlq_u32(vshrq_n_u32(a.val, 31), vcombine_s32(m0, m0));
989     uint64x2_t v1 = vpaddlq_u32(v0);
990     return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 2);
991 }
992 inline int v_signmask(const v_int32x4& a)
993 { return v_signmask(v_reinterpret_as_u32(a)); }
994 inline int v_signmask(const v_float32x4& a)
995 { return v_signmask(v_reinterpret_as_u32(a)); }
996 #if CV_SIMD128_64F
997 inline int v_signmask(const v_uint64x2& a)
998 {
999     int64x1_t m0 = vdup_n_s64(0);
1000     uint64x2_t v0 = vshlq_u64(vshrq_n_u64(a.val, 63), vcombine_s64(m0, m0));
1001     return (int)vgetq_lane_u64(v0, 0) + ((int)vgetq_lane_u64(v0, 1) << 1);
1002 }
1003 inline int v_signmask(const v_float64x2& a)
1004 { return v_signmask(v_reinterpret_as_u64(a)); }
1005 #endif
1006
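// v_check_all / v_check_any inspect the lane MSBs (the output of comparisons):
// check_all shifts the bitwise NOT right so any clear sign bit yields a non-zero
// lane, check_any shifts the value itself; the two 64-bit halves are then OR-ed
// and tested against zero.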
1007 #define OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(_Tpvec, suffix, shift) \
1008 inline bool v_check_all(const v_##_Tpvec& a) \
1009 { \
1010     _Tpvec##_t v0 = vshrq_n_##suffix(vmvnq_##suffix(a.val), shift); \
1011     uint64x2_t v1 = vreinterpretq_u64_##suffix(v0); \
1012     return (vgetq_lane_u64(v1, 0) | vgetq_lane_u64(v1, 1)) == 0; \
1013 } \
1014 inline bool v_check_any(const v_##_Tpvec& a) \
1015 { \
1016     _Tpvec##_t v0 = vshrq_n_##suffix(a.val, shift); \
1017     uint64x2_t v1 = vreinterpretq_u64_##suffix(v0); \
1018     return (vgetq_lane_u64(v1, 0) | vgetq_lane_u64(v1, 1)) != 0; \
1019 }
1020
1021 OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint8x16, u8, 7)
1022 OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint16x8, u16, 15)
1023 OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint32x4, u32, 31)
1024 #if CV_SIMD128_64F
1025 OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint64x2, u64, 63)
1026 #endif
1027
1028 inline bool v_check_all(const v_int8x16& a)
1029 { return v_check_all(v_reinterpret_as_u8(a)); }
1030 inline bool v_check_all(const v_int16x8& a)
1031 { return v_check_all(v_reinterpret_as_u16(a)); }
1032 inline bool v_check_all(const v_int32x4& a)
1033 { return v_check_all(v_reinterpret_as_u32(a)); }
1034 inline bool v_check_all(const v_float32x4& a)
1035 { return v_check_all(v_reinterpret_as_u32(a)); }
1036
1037 inline bool v_check_any(const v_int8x16& a)
1038 { return v_check_any(v_reinterpret_as_u8(a)); }
1039 inline bool v_check_any(const v_int16x8& a)
1040 { return v_check_any(v_reinterpret_as_u16(a)); }
1041 inline bool v_check_any(const v_int32x4& a)
1042 { return v_check_any(v_reinterpret_as_u32(a)); }
1043 inline bool v_check_any(const v_float32x4& a)
1044 { return v_check_any(v_reinterpret_as_u32(a)); }
1045
1046 #if CV_SIMD128_64F
1047 inline bool v_check_all(const v_int64x2& a)
1048 { return v_check_all(v_reinterpret_as_u64(a)); }
1049 inline bool v_check_all(const v_float64x2& a)
1050 { return v_check_all(v_reinterpret_as_u64(a)); }
1051 inline bool v_check_any(const v_int64x2& a)
1052 { return v_check_any(v_reinterpret_as_u64(a)); }
1053 inline bool v_check_any(const v_float64x2& a)
1054 { return v_check_any(v_reinterpret_as_u64(a)); }
1055 #endif
1056
1057 #define OPENCV_HAL_IMPL_NEON_SELECT(_Tpvec, suffix, usuffix) \
1058 inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
1059 { \
1060     return _Tpvec(vbslq_##suffix(vreinterpretq_##usuffix##_##suffix(mask.val), a.val, b.val)); \
1061 }
1062
1063 OPENCV_HAL_IMPL_NEON_SELECT(v_uint8x16, u8, u8)
1064 OPENCV_HAL_IMPL_NEON_SELECT(v_int8x16, s8, u8)
1065 OPENCV_HAL_IMPL_NEON_SELECT(v_uint16x8, u16, u16)
1066 OPENCV_HAL_IMPL_NEON_SELECT(v_int16x8, s16, u16)
1067 OPENCV_HAL_IMPL_NEON_SELECT(v_uint32x4, u32, u32)
1068 OPENCV_HAL_IMPL_NEON_SELECT(v_int32x4, s32, u32)
1069 OPENCV_HAL_IMPL_NEON_SELECT(v_float32x4, f32, u32)
1070 #if CV_SIMD128_64F
1071 OPENCV_HAL_IMPL_NEON_SELECT(v_float64x2, f64, u64)
1072 #endif
1073
1074 #define OPENCV_HAL_IMPL_NEON_EXPAND(_Tpvec, _Tpwvec, _Tp, suffix) \
1075 inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
1076 { \
1077     b0.val = vmovl_##suffix(vget_low_##suffix(a.val)); \
1078     b1.val = vmovl_##suffix(vget_high_##suffix(a.val)); \
1079 } \
1080 inline _Tpwvec v_load_expand(const _Tp* ptr) \
1081 { \
1082     return _Tpwvec(vmovl_##suffix(vld1_##suffix(ptr))); \
1083 }
1084
1085 OPENCV_HAL_IMPL_NEON_EXPAND(v_uint8x16, v_uint16x8, uchar, u8)
1086 OPENCV_HAL_IMPL_NEON_EXPAND(v_int8x16, v_int16x8, schar, s8)
1087 OPENCV_HAL_IMPL_NEON_EXPAND(v_uint16x8, v_uint32x4, ushort, u16)
1088 OPENCV_HAL_IMPL_NEON_EXPAND(v_int16x8, v_int32x4, short, s16)
1089 OPENCV_HAL_IMPL_NEON_EXPAND(v_uint32x4, v_uint64x2, uint, u32)
1090 OPENCV_HAL_IMPL_NEON_EXPAND(v_int32x4, v_int64x2, int, s32)
1091
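// v_load_expand_q loads four packed 8-bit values and widens them twice
// (8 -> 16 -> 32 bits) into a 4-lane vector.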
1092 inline v_uint32x4 v_load_expand_q(const uchar* ptr)
1093 {
1094     uint8x8_t v0 = vcreate_u8(*(unsigned*)ptr);
1095     uint16x4_t v1 = vget_low_u16(vmovl_u8(v0));
1096     return v_uint32x4(vmovl_u16(v1));
1097 }
1098
1099 inline v_int32x4 v_load_expand_q(const schar* ptr)
1100 {
1101     int8x8_t v0 = vcreate_s8(*(unsigned*)ptr);
1102     int16x4_t v1 = vget_low_s16(vmovl_s8(v0));
1103     return v_int32x4(vmovl_s16(v1));
1104 }
1105
1106 #if defined(__aarch64__)
1107 #define OPENCV_HAL_IMPL_NEON_UNPACKS(_Tpvec, suffix) \
1108 inline void v_zip(const v_##_Tpvec& a0, const v_##_Tpvec& a1, v_##_Tpvec& b0, v_##_Tpvec& b1) \
1109 { \
1110     b0.val = vzip1q_##suffix(a0.val, a1.val); \
1111     b1.val = vzip2q_##suffix(a0.val, a1.val); \
1112 } \
1113 inline v_##_Tpvec v_combine_low(const v_##_Tpvec& a, const v_##_Tpvec& b) \
1114 { \
1115     return v_##_Tpvec(vcombine_##suffix(vget_low_##suffix(a.val), vget_low_##suffix(b.val))); \
1116 } \
1117 inline v_##_Tpvec v_combine_high(const v_##_Tpvec& a, const v_##_Tpvec& b) \
1118 { \
1119     return v_##_Tpvec(vcombine_##suffix(vget_high_##suffix(a.val), vget_high_##suffix(b.val))); \
1120 } \
1121 inline void v_recombine(const v_##_Tpvec& a, const v_##_Tpvec& b, v_##_Tpvec& c, v_##_Tpvec& d) \
1122 { \
1123     c.val = vcombine_##suffix(vget_low_##suffix(a.val), vget_low_##suffix(b.val)); \
1124     d.val = vcombine_##suffix(vget_high_##suffix(a.val), vget_high_##suffix(b.val)); \
1125 }
1126 #else
1127 #define OPENCV_HAL_IMPL_NEON_UNPACKS(_Tpvec, suffix) \
1128 inline void v_zip(const v_##_Tpvec& a0, const v_##_Tpvec& a1, v_##_Tpvec& b0, v_##_Tpvec& b1) \
1129 { \
1130     _Tpvec##x2_t p = vzipq_##suffix(a0.val, a1.val); \
1131     b0.val = p.val[0]; \
1132     b1.val = p.val[1]; \
1133 } \
1134 inline v_##_Tpvec v_combine_low(const v_##_Tpvec& a, const v_##_Tpvec& b) \
1135 { \
1136     return v_##_Tpvec(vcombine_##suffix(vget_low_##suffix(a.val), vget_low_##suffix(b.val))); \
1137 } \
1138 inline v_##_Tpvec v_combine_high(const v_##_Tpvec& a, const v_##_Tpvec& b) \
1139 { \
1140     return v_##_Tpvec(vcombine_##suffix(vget_high_##suffix(a.val), vget_high_##suffix(b.val))); \
1141 } \
1142 inline void v_recombine(const v_##_Tpvec& a, const v_##_Tpvec& b, v_##_Tpvec& c, v_##_Tpvec& d) \
1143 { \
1144     c.val = vcombine_##suffix(vget_low_##suffix(a.val), vget_low_##suffix(b.val)); \
1145     d.val = vcombine_##suffix(vget_high_##suffix(a.val), vget_high_##suffix(b.val)); \
1146 }
1147 #endif
1148
1149 OPENCV_HAL_IMPL_NEON_UNPACKS(uint8x16, u8)
1150 OPENCV_HAL_IMPL_NEON_UNPACKS(int8x16, s8)
1151 OPENCV_HAL_IMPL_NEON_UNPACKS(uint16x8, u16)
1152 OPENCV_HAL_IMPL_NEON_UNPACKS(int16x8, s16)
1153 OPENCV_HAL_IMPL_NEON_UNPACKS(uint32x4, u32)
1154 OPENCV_HAL_IMPL_NEON_UNPACKS(int32x4, s32)
1155 OPENCV_HAL_IMPL_NEON_UNPACKS(float32x4, f32)
1156 #if CV_SIMD128_64F
1157 OPENCV_HAL_IMPL_NEON_UNPACKS(float64x2, f64)
1158 #endif
1159
1160 #define OPENCV_HAL_IMPL_NEON_EXTRACT(_Tpvec, suffix) \
1161 template <int s> \
1162 inline v_##_Tpvec v_extract(const v_##_Tpvec& a, const v_##_Tpvec& b) \
1163 { \
1164     return v_##_Tpvec(vextq_##suffix(a.val, b.val, s)); \
1165 }
1166
1167 OPENCV_HAL_IMPL_NEON_EXTRACT(uint8x16, u8)
1168 OPENCV_HAL_IMPL_NEON_EXTRACT(int8x16, s8)
1169 OPENCV_HAL_IMPL_NEON_EXTRACT(uint16x8, u16)
1170 OPENCV_HAL_IMPL_NEON_EXTRACT(int16x8, s16)
1171 OPENCV_HAL_IMPL_NEON_EXTRACT(uint32x4, u32)
1172 OPENCV_HAL_IMPL_NEON_EXTRACT(int32x4, s32)
1173 OPENCV_HAL_IMPL_NEON_EXTRACT(uint64x2, u64)
1174 OPENCV_HAL_IMPL_NEON_EXTRACT(int64x2, s64)
1175 OPENCV_HAL_IMPL_NEON_EXTRACT(float32x4, f32)
1176 #if CV_SIMD128_64F
1177 OPENCV_HAL_IMPL_NEON_EXTRACT(float64x2, f64)
1178 #endif
1179
1180 #if CV_SIMD128_64F
1181 inline v_int32x4 v_round(const v_float32x4& a)
1182 {
1183     float32x4_t a_ = a.val;
1184     int32x4_t result;
1185     __asm__ ("fcvtns %0.4s, %1.4s"
1186              : "=w"(result)
1187              : "w"(a_)
1188              : /* No clobbers */);
1189     return v_int32x4(result);
1190 }
1191 #else
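// Without the AArch64 round-to-nearest convert (fcvtns), emulate rounding by
// adding +/-0.5 with the sign taken from the input and then truncating with
// vcvtq_s32_f32 (i.e. round half away from zero).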
1192 inline v_int32x4 v_round(const v_float32x4& a)
1193 {
1194     static const int32x4_t v_sign = vdupq_n_s32(1 << 31),
1195         v_05 = vreinterpretq_s32_f32(vdupq_n_f32(0.5f));
1196
1197     int32x4_t v_addition = vorrq_s32(v_05, vandq_s32(v_sign, vreinterpretq_s32_f32(a.val)));
1198     return v_int32x4(vcvtq_s32_f32(vaddq_f32(a.val, vreinterpretq_f32_s32(v_addition))));
1199 }
1200 #endif
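// v_floor / v_ceil truncate with vcvtq_s32_f32 and then adjust by one using the
// comparison mask (all-ones == -1): floor adds the mask where truncation rounded
// up, ceil subtracts it where truncation rounded down.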
1201 inline v_int32x4 v_floor(const v_float32x4& a)
1202 {
1203     int32x4_t a1 = vcvtq_s32_f32(a.val);
1204     uint32x4_t mask = vcgtq_f32(vcvtq_f32_s32(a1), a.val);
1205     return v_int32x4(vaddq_s32(a1, vreinterpretq_s32_u32(mask)));
1206 }
1207
1208 inline v_int32x4 v_ceil(const v_float32x4& a)
1209 {
1210     int32x4_t a1 = vcvtq_s32_f32(a.val);
1211     uint32x4_t mask = vcgtq_f32(a.val, vcvtq_f32_s32(a1));
1212     return v_int32x4(vsubq_s32(a1, vreinterpretq_s32_u32(mask)));
1213 }
1214
1215 inline v_int32x4 v_trunc(const v_float32x4& a)
1216 { return v_int32x4(vcvtq_s32_f32(a.val)); }
1217
1218 #if CV_SIMD128_64F
1219 inline v_int32x4 v_round(const v_float64x2& a)
1220 {
1221     static const int32x2_t zero = vdup_n_s32(0);
1222     return v_int32x4(vcombine_s32(vmovn_s64(vcvtaq_s64_f64(a.val)), zero));
1223 }
1224
1225 inline v_int32x4 v_floor(const v_float64x2& a)
1226 {
1227     static const int32x2_t zero = vdup_n_s32(0);
1228     int64x2_t a1 = vcvtq_s64_f64(a.val);
1229     uint64x2_t mask = vcgtq_f64(vcvtq_f64_s64(a1), a.val);
1230     a1 = vaddq_s64(a1, vreinterpretq_s64_u64(mask));
1231     return v_int32x4(vcombine_s32(vmovn_s64(a1), zero));
1232 }
1233
1234 inline v_int32x4 v_ceil(const v_float64x2& a)
1235 {
1236     static const int32x2_t zero = vdup_n_s32(0);
1237     int64x2_t a1 = vcvtq_s64_f64(a.val);
1238     uint64x2_t mask = vcgtq_f64(a.val, vcvtq_f64_s64(a1));
1239     a1 = vsubq_s64(a1, vreinterpretq_s64_u64(mask));
1240     return v_int32x4(vcombine_s32(vmovn_s64(a1), zero));
1241 }
1242
1243 inline v_int32x4 v_trunc(const v_float64x2& a)
1244 {
1245     static const int32x2_t zero = vdup_n_s32(0);
1246     return v_int32x4(vcombine_s32(vmovn_s64(vcvtaq_s64_f64(a.val)), zero));
1247 }
1248 #endif
1249
1250 #define OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(_Tpvec, suffix) \
1251 inline void v_transpose4x4(const v_##_Tpvec& a0, const v_##_Tpvec& a1, \
1252                          const v_##_Tpvec& a2, const v_##_Tpvec& a3, \
1253                          v_##_Tpvec& b0, v_##_Tpvec& b1, \
1254                          v_##_Tpvec& b2, v_##_Tpvec& b3) \
1255 { \
1256     /* m00 m01 m02 m03 */ \
1257     /* m10 m11 m12 m13 */ \
1258     /* m20 m21 m22 m23 */ \
1259     /* m30 m31 m32 m33 */ \
1260     _Tpvec##x2_t t0 = vtrnq_##suffix(a0.val, a1.val); \
1261     _Tpvec##x2_t t1 = vtrnq_##suffix(a2.val, a3.val); \
1262     /* m00 m10 m02 m12 */ \
1263     /* m01 m11 m03 m13 */ \
1264     /* m20 m30 m22 m32 */ \
1265     /* m21 m31 m23 m33 */ \
1266     b0.val = vcombine_##suffix(vget_low_##suffix(t0.val[0]), vget_low_##suffix(t1.val[0])); \
1267     b1.val = vcombine_##suffix(vget_low_##suffix(t0.val[1]), vget_low_##suffix(t1.val[1])); \
1268     b2.val = vcombine_##suffix(vget_high_##suffix(t0.val[0]), vget_high_##suffix(t1.val[0])); \
1269     b3.val = vcombine_##suffix(vget_high_##suffix(t0.val[1]), vget_high_##suffix(t1.val[1])); \
1270 }
1271
1272 OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(uint32x4, u32)
1273 OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(int32x4, s32)
1274 OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(float32x4, f32)
1275
1276 #define OPENCV_HAL_IMPL_NEON_INTERLEAVED(_Tpvec, _Tp, suffix) \
1277 inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b) \
1278 { \
1279     _Tpvec##x2_t v = vld2q_##suffix(ptr); \
1280     a.val = v.val[0]; \
1281     b.val = v.val[1]; \
1282 } \
1283 inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, v_##_Tpvec& c) \
1284 { \
1285     _Tpvec##x3_t v = vld3q_##suffix(ptr); \
1286     a.val = v.val[0]; \
1287     b.val = v.val[1]; \
1288     c.val = v.val[2]; \
1289 } \
1290 inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, \
1291                                 v_##_Tpvec& c, v_##_Tpvec& d) \
1292 { \
1293     _Tpvec##x4_t v = vld4q_##suffix(ptr); \
1294     a.val = v.val[0]; \
1295     b.val = v.val[1]; \
1296     c.val = v.val[2]; \
1297     d.val = v.val[3]; \
1298 } \
1299 inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
1300                                 hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
1301 { \
1302     _Tpvec##x2_t v; \
1303     v.val[0] = a.val; \
1304     v.val[1] = b.val; \
1305     vst2q_##suffix(ptr, v); \
1306 } \
1307 inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
1308                                 const v_##_Tpvec& c, hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
1309 { \
1310     _Tpvec##x3_t v; \
1311     v.val[0] = a.val; \
1312     v.val[1] = b.val; \
1313     v.val[2] = c.val; \
1314     vst3q_##suffix(ptr, v); \
1315 } \
1316 inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
1317                                 const v_##_Tpvec& c, const v_##_Tpvec& d, \
1318                                 hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) \
1319 { \
1320     _Tpvec##x4_t v; \
1321     v.val[0] = a.val; \
1322     v.val[1] = b.val; \
1323     v.val[2] = c.val; \
1324     v.val[3] = d.val; \
1325     vst4q_##suffix(ptr, v); \
1326 }
1327
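// For 64-bit elements the interleaving vld2q/vst2q forms are not available on
// 32-bit ARM, so (de)interleaving is done with half-width vld1_/vst1_ accesses
// recombined by hand.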
1328 #define OPENCV_HAL_IMPL_NEON_INTERLEAVED_INT64(tp, suffix) \
1329 inline void v_load_deinterleave( const tp* ptr, v_##tp##x2& a, v_##tp##x2& b ) \
1330 { \
1331     tp##x1_t a0 = vld1_##suffix(ptr); \
1332     tp##x1_t b0 = vld1_##suffix(ptr + 1); \
1333     tp##x1_t a1 = vld1_##suffix(ptr + 2); \
1334     tp##x1_t b1 = vld1_##suffix(ptr + 3); \
1335     a = v_##tp##x2(vcombine_##suffix(a0, a1)); \
1336     b = v_##tp##x2(vcombine_##suffix(b0, b1)); \
1337 } \
1338  \
1339 inline void v_load_deinterleave( const tp* ptr, v_##tp##x2& a, \
1340                                  v_##tp##x2& b, v_##tp##x2& c ) \
1341 { \
1342     tp##x1_t a0 = vld1_##suffix(ptr); \
1343     tp##x1_t b0 = vld1_##suffix(ptr + 1); \
1344     tp##x1_t c0 = vld1_##suffix(ptr + 2); \
1345     tp##x1_t a1 = vld1_##suffix(ptr + 3); \
1346     tp##x1_t b1 = vld1_##suffix(ptr + 4); \
1347     tp##x1_t c1 = vld1_##suffix(ptr + 5); \
1348     a = v_##tp##x2(vcombine_##suffix(a0, a1)); \
1349     b = v_##tp##x2(vcombine_##suffix(b0, b1)); \
1350     c = v_##tp##x2(vcombine_##suffix(c0, c1)); \
1351 } \
1352  \
1353 inline void v_load_deinterleave( const tp* ptr, v_##tp##x2& a, v_##tp##x2& b, \
1354                                  v_##tp##x2& c, v_##tp##x2& d ) \
1355 { \
1356     tp##x1_t a0 = vld1_##suffix(ptr); \
1357     tp##x1_t b0 = vld1_##suffix(ptr + 1); \
1358     tp##x1_t c0 = vld1_##suffix(ptr + 2); \
1359     tp##x1_t d0 = vld1_##suffix(ptr + 3); \
1360     tp##x1_t a1 = vld1_##suffix(ptr + 4); \
1361     tp##x1_t b1 = vld1_##suffix(ptr + 5); \
1362     tp##x1_t c1 = vld1_##suffix(ptr + 6); \
1363     tp##x1_t d1 = vld1_##suffix(ptr + 7); \
1364     a = v_##tp##x2(vcombine_##suffix(a0, a1)); \
1365     b = v_##tp##x2(vcombine_##suffix(b0, b1)); \
1366     c = v_##tp##x2(vcombine_##suffix(c0, c1)); \
1367     d = v_##tp##x2(vcombine_##suffix(d0, d1)); \
1368 } \
1369  \
1370 inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, const v_##tp##x2& b, \
1371                                 hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
1372 { \
1373     vst1_##suffix(ptr, vget_low_##suffix(a.val)); \
1374     vst1_##suffix(ptr + 1, vget_low_##suffix(b.val)); \
1375     vst1_##suffix(ptr + 2, vget_high_##suffix(a.val)); \
1376     vst1_##suffix(ptr + 3, vget_high_##suffix(b.val)); \
1377 } \
1378  \
1379 inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, \
1380                                 const v_##tp##x2& b, const v_##tp##x2& c, \
1381                                 hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
1382 { \
1383     vst1_##suffix(ptr, vget_low_##suffix(a.val)); \
1384     vst1_##suffix(ptr + 1, vget_low_##suffix(b.val)); \
1385     vst1_##suffix(ptr + 2, vget_low_##suffix(c.val)); \
1386     vst1_##suffix(ptr + 3, vget_high_##suffix(a.val)); \
1387     vst1_##suffix(ptr + 4, vget_high_##suffix(b.val)); \
1388     vst1_##suffix(ptr + 5, vget_high_##suffix(c.val)); \
1389 } \
1390  \
1391 inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, const v_##tp##x2& b, \
1392                                 const v_##tp##x2& c, const v_##tp##x2& d, \
1393                                 hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
1394 { \
1395     vst1_##suffix(ptr, vget_low_##suffix(a.val)); \
1396     vst1_##suffix(ptr + 1, vget_low_##suffix(b.val)); \
1397     vst1_##suffix(ptr + 2, vget_low_##suffix(c.val)); \
1398     vst1_##suffix(ptr + 3, vget_low_##suffix(d.val)); \
1399     vst1_##suffix(ptr + 4, vget_high_##suffix(a.val)); \
1400     vst1_##suffix(ptr + 5, vget_high_##suffix(b.val)); \
1401     vst1_##suffix(ptr + 6, vget_high_##suffix(c.val)); \
1402     vst1_##suffix(ptr + 7, vget_high_##suffix(d.val)); \
1403 }
1404
1405 OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint8x16, uchar, u8)
1406 OPENCV_HAL_IMPL_NEON_INTERLEAVED(int8x16, schar, s8)
1407 OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint16x8, ushort, u16)
1408 OPENCV_HAL_IMPL_NEON_INTERLEAVED(int16x8, short, s16)
1409 OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint32x4, unsigned, u32)
1410 OPENCV_HAL_IMPL_NEON_INTERLEAVED(int32x4, int, s32)
1411 OPENCV_HAL_IMPL_NEON_INTERLEAVED(float32x4, float, f32)
1412 #if CV_SIMD128_64F
1413 OPENCV_HAL_IMPL_NEON_INTERLEAVED(float64x2, double, f64)
1414 #endif
1415
1416 OPENCV_HAL_IMPL_NEON_INTERLEAVED_INT64(int64, s64)
1417 OPENCV_HAL_IMPL_NEON_INTERLEAVED_INT64(uint64, u64)
1418
1419 inline v_float32x4 v_cvt_f32(const v_int32x4& a)
1420 {
1421     return v_float32x4(vcvtq_f32_s32(a.val));
1422 }
1423
1424 #if CV_SIMD128_64F
1425 inline v_float32x4 v_cvt_f32(const v_float64x2& a)
1426 {
1427     float32x2_t zero = vdup_n_f32(0.0f);
1428     return v_float32x4(vcombine_f32(vcvt_f32_f64(a.val), zero));
1429 }
1430
1431 inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b)
1432 {
1433     return v_float32x4(vcombine_f32(vcvt_f32_f64(a.val), vcvt_f32_f64(b.val)));
1434 }
1435
1436 inline v_float64x2 v_cvt_f64(const v_int32x4& a)
1437 {
1438     return v_float64x2(vcvt_f64_f32(vcvt_f32_s32(vget_low_s32(a.val))));
1439 }
1440
1441 inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
1442 {
1443     return v_float64x2(vcvt_f64_f32(vcvt_f32_s32(vget_high_s32(a.val))));
1444 }
1445
1446 inline v_float64x2 v_cvt_f64(const v_float32x4& a)
1447 {
1448     return v_float64x2(vcvt_f64_f32(vget_low_f32(a.val)));
1449 }
1450
1451 inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
1452 {
1453     return v_float64x2(vcvt_f64_f32(vget_high_f32(a.val)));
1454 }
1455 #endif
1456
1457 #if CV_FP16
1458 inline v_float32x4 v_cvt_f32(const v_float16x8& a)
1459 {
1460     return v_float32x4(vcvt_f32_f16(vget_low_f16(a.val)));
1461 }
1462 inline v_float32x4 v_cvt_f32_high(const v_float16x8& a)
1463 {
1464     return v_float32x4(vcvt_f32_f16(vget_high_f16(a.val)));
1465 }
1466
1467 inline v_float16x8 v_cvt_f16(const v_float32x4& a, const v_float32x4& b)
1468 {
1469     return v_float16x8(vcombine_f16(vcvt_f16_f32(a.val), vcvt_f16_f32(b.val)));
1470 }
1471 #endif
1472
1473 ////////////// Lookup table access ////////////////////
1474
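// NEON has no gather instruction, so the table look-ups read each element
// through a scalar index and reload the gathered values as a vector.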
1475 inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
1476 {
1477     int CV_DECL_ALIGNED(32) elems[4] =
1478     {
1479         tab[vgetq_lane_s32(idxvec.val, 0)],
1480         tab[vgetq_lane_s32(idxvec.val, 1)],
1481         tab[vgetq_lane_s32(idxvec.val, 2)],
1482         tab[vgetq_lane_s32(idxvec.val, 3)]
1483     };
1484     return v_int32x4(vld1q_s32(elems));
1485 }
1486
1487 inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
1488 {
1489     float CV_DECL_ALIGNED(32) elems[4] =
1490     {
1491         tab[vgetq_lane_s32(idxvec.val, 0)],
1492         tab[vgetq_lane_s32(idxvec.val, 1)],
1493         tab[vgetq_lane_s32(idxvec.val, 2)],
1494         tab[vgetq_lane_s32(idxvec.val, 3)]
1495     };
1496     return v_float32x4(vld1q_f32(elems));
1497 }
1498
1499 inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y)
1500 {
1501     /*int CV_DECL_ALIGNED(32) idx[4];
1502     v_store(idx, idxvec);
1503
1504     float32x4_t xy02 = vcombine_f32(vld1_f32(tab + idx[0]), vld1_f32(tab + idx[2]));
1505     float32x4_t xy13 = vcombine_f32(vld1_f32(tab + idx[1]), vld1_f32(tab + idx[3]));
1506
1507     float32x4x2_t xxyy = vuzpq_f32(xy02, xy13);
1508     x = v_float32x4(xxyy.val[0]);
1509     y = v_float32x4(xxyy.val[1]);*/
1510     int CV_DECL_ALIGNED(32) idx[4];
1511     v_store_aligned(idx, idxvec);
1512
1513     x = v_float32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]);
1514     y = v_float32x4(tab[idx[0]+1], tab[idx[1]+1], tab[idx[2]+1], tab[idx[3]+1]);
1515 }
1516
1517 #if CV_SIMD128_64F
1518 inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
1519 {
1520     double CV_DECL_ALIGNED(32) elems[2] =
1521     {
1522         tab[vgetq_lane_s32(idxvec.val, 0)],
1523         tab[vgetq_lane_s32(idxvec.val, 1)],
1524     };
1525     return v_float64x2(vld1q_f64(elems));
1526 }
1527
1528 inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y)
1529 {
1530     int CV_DECL_ALIGNED(32) idx[4];
1531     v_store_aligned(idx, idxvec);
1532
1533     x = v_float64x2(tab[idx[0]], tab[idx[1]]);
1534     y = v_float64x2(tab[idx[0]+1], tab[idx[1]+1]);
1535 }
1536 #endif
1537
1538 inline void v_cleanup() {}
1539
1540 //! @name Check SIMD support
1541 //! @{
1542 //! @brief Check CPU capability for SIMD operations
1543 static inline bool hasSIMD128()
1544 {
1545     return (CV_CPU_HAS_SUPPORT_NEON) ? true : false;
1546 }
1547
1548 //! @}
1549
1550 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
1551
1552 //! @endcond
1553
1554 }
1555
1556 #endif