modules/core/include/opencv2/core/hal/intrin_neon.hpp

   1 /*M///////////////////////////////////////////////////////////////////////////////////////
   2 //
   3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
   4 //
   5 //  By downloading, copying, installing or using the software you agree to this license.
   6 //  If you do not agree to this license, do not download, install,
   7 //  copy or use the software.
   8 //
   9 //
  10 //                          License Agreement
  11 //                For Open Source Computer Vision Library
  12 //
  13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
  14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
  15 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
  16 // Copyright (C) 2015, Itseez Inc., all rights reserved.
  17 // Third party copyrights are property of their respective owners.
  18 //
  19 // Redistribution and use in source and binary forms, with or without modification,
  20 // are permitted provided that the following conditions are met:
  21 //
  22 //   * Redistribution's of source code must retain the above copyright notice,
  23 //     this list of conditions and the following disclaimer.
  24 //
  25 //   * Redistribution's in binary form must reproduce the above copyright notice,
  26 //     this list of conditions and the following disclaimer in the documentation
  27 //     and/or other materials provided with the distribution.
  28 //
  29 //   * The name of the copyright holders may not be used to endorse or promote products
  30 //     derived from this software without specific prior written permission.
  31 //
  32 // This software is provided by the copyright holders and contributors "as is" and
  33 // any express or implied warranties, including, but not limited to, the implied
  34 // warranties of merchantability and fitness for a particular purpose are disclaimed.
  35 // In no event shall the Intel Corporation or contributors be liable for any direct,
  36 // indirect, incidental, special, exemplary, or consequential damages
  37 // (including, but not limited to, procurement of substitute goods or services;
  38 // loss of use, data, or profits; or business interruption) however caused
  39 // and on any theory of liability, whether in contract, strict liability,
  40 // or tort (including negligence or otherwise) arising in any way out of
  41 // the use of this software, even if advised of the possibility of such damage.
  42 //
  43 //M*/
  44
  45 #ifndef OPENCV_HAL_INTRIN_NEON_HPP
  46 #define OPENCV_HAL_INTRIN_NEON_HPP
  47
  48 #include <algorithm>
  49 #include "opencv2/core/utility.hpp"
  50
  51 namespace cv
  52 {
  53
  54 //! @cond IGNORED
  55
  56 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
  57
  58 #define CV_SIMD128 1 // this header provides the 128-bit vector types defined below
  59 #if defined(__aarch64__)
     // 64-bit float vectors (v_float64x2 and related code) are compiled only when __aarch64__ is defined.
  60 #define CV_SIMD128_64F 1
  61 #else
  62 #define CV_SIMD128_64F 0
  63 #endif
  64
  65 #if CV_SIMD128_64F
  66 #define OPENCV_HAL_IMPL_NEON_REINTERPRET(_Tpv, suffix) \
  67 template <typename T> static inline \
  68 _Tpv vreinterpretq_##suffix##_f64(T a) { return (_Tpv) a; } \
  69 template <typename T> static inline \
  70 float64x2_t vreinterpretq_f64_##suffix(T a) { return (float64x2_t) a; }
     // The macro above emits two function templates for the vector type _Tpv:
     //   vreinterpretq_<suffix>_f64(a) : C-style cast of a to _Tpv
     //   vreinterpretq_f64_<suffix>(a) : C-style cast of a to float64x2_t
     // NOTE(review): presumably these stand in for f64 reinterpret intrinsics that
     // some toolchains do not ship -- confirm.
  71 OPENCV_HAL_IMPL_NEON_REINTERPRET(uint8x16_t, u8)
  72 OPENCV_HAL_IMPL_NEON_REINTERPRET(int8x16_t, s8)
  73 OPENCV_HAL_IMPL_NEON_REINTERPRET(uint16x8_t, u16)
  74 OPENCV_HAL_IMPL_NEON_REINTERPRET(int16x8_t, s16)
  75 OPENCV_HAL_IMPL_NEON_REINTERPRET(uint32x4_t, u32)
  76 OPENCV_HAL_IMPL_NEON_REINTERPRET(int32x4_t, s32)
  77 OPENCV_HAL_IMPL_NEON_REINTERPRET(uint64x2_t, u64)
  78 OPENCV_HAL_IMPL_NEON_REINTERPRET(int64x2_t, s64)
  79 OPENCV_HAL_IMPL_NEON_REINTERPRET(float32x4_t, f32)
  80 #endif
  81
  82 struct v_uint8x16
  83 {
  84     typedef uchar lane_type;
  85     enum { nlanes = 16 };
  86
  87     v_uint8x16() {}
  88     explicit v_uint8x16(uint8x16_t v) : val(v) {}
  89     v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
  90                uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
  91     {
  92         uchar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
  93         val = vld1q_u8(v);
  94     }
  95     uchar get0() const
  96     {
  97         return vgetq_lane_u8(val, 0);
  98     }
  99
 100     uint8x16_t val;
 101 };
 102
 103 struct v_int8x16
 104 {
 105     typedef schar lane_type;
 106     enum { nlanes = 16 };
 107
 108     v_int8x16() {}
 109     explicit v_int8x16(int8x16_t v) : val(v) {}
 110     v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
 111                schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
 112     {
 113         schar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
 114         val = vld1q_s8(v);
 115     }
 116     schar get0() const
 117     {
 118         return vgetq_lane_s8(val, 0);
 119     }
 120
 121     int8x16_t val;
 122 };
 123
 124 struct v_uint16x8
 125 {
 126     typedef ushort lane_type;
 127     enum { nlanes = 8 };
 128
 129     v_uint16x8() {}
 130     explicit v_uint16x8(uint16x8_t v) : val(v) {}
 131     v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
 132     {
 133         ushort v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
 134         val = vld1q_u16(v);
 135     }
 136     ushort get0() const
 137     {
 138         return vgetq_lane_u16(val, 0);
 139     }
 140
 141     uint16x8_t val;
 142 };
 143
 144 struct v_int16x8
 145 {
 146     typedef short lane_type;
 147     enum { nlanes = 8 };
 148
 149     v_int16x8() {}
 150     explicit v_int16x8(int16x8_t v) : val(v) {}
 151     v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
 152     {
 153         short v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
 154         val = vld1q_s16(v);
 155     }
 156     short get0() const
 157     {
 158         return vgetq_lane_s16(val, 0);
 159     }
 160
 161     int16x8_t val;
 162 };
 163
 164 struct v_uint32x4
 165 {
 166     typedef unsigned lane_type;
 167     enum { nlanes = 4 };
 168
 169     v_uint32x4() {}
 170     explicit v_uint32x4(uint32x4_t v) : val(v) {}
 171     v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
 172     {
 173         unsigned v[] = {v0, v1, v2, v3};
 174         val = vld1q_u32(v);
 175     }
 176     unsigned get0() const
 177     {
 178         return vgetq_lane_u32(val, 0);
 179     }
 180
 181     uint32x4_t val;
 182 };
 183
 184 struct v_int32x4
 185 {
 186     typedef int lane_type;
 187     enum { nlanes = 4 };
 188
 189     v_int32x4() {}
 190     explicit v_int32x4(int32x4_t v) : val(v) {}
 191     v_int32x4(int v0, int v1, int v2, int v3)
 192     {
 193         int v[] = {v0, v1, v2, v3};
 194         val = vld1q_s32(v);
 195     }
 196     int get0() const
 197     {
 198         return vgetq_lane_s32(val, 0);
 199     }
 200     int32x4_t val;
 201 };
 202
 203 struct v_float32x4
 204 {
 205     typedef float lane_type;
 206     enum { nlanes = 4 };
 207
 208     v_float32x4() {}
 209     explicit v_float32x4(float32x4_t v) : val(v) {}
 210     v_float32x4(float v0, float v1, float v2, float v3)
 211     {
 212         float v[] = {v0, v1, v2, v3};
 213         val = vld1q_f32(v);
 214     }
 215     float get0() const
 216     {
 217         return vgetq_lane_f32(val, 0);
 218     }
 219     float32x4_t val;
 220 };
 221
 222 struct v_uint64x2
 223 {
 224     typedef uint64 lane_type;
 225     enum { nlanes = 2 };
 226
 227     v_uint64x2() {}
 228     explicit v_uint64x2(uint64x2_t v) : val(v) {}
 229     v_uint64x2(uint64 v0, uint64 v1)
 230     {
 231         uint64 v[] = {v0, v1};
 232         val = vld1q_u64(v);
 233     }
 234     uint64 get0() const
 235     {
 236         return vgetq_lane_u64(val, 0);
 237     }
 238     uint64x2_t val;
 239 };
 240
 241 struct v_int64x2
 242 {
 243     typedef int64 lane_type;
 244     enum { nlanes = 2 };
 245
 246     v_int64x2() {}
 247     explicit v_int64x2(int64x2_t v) : val(v) {}
 248     v_int64x2(int64 v0, int64 v1)
 249     {
 250         int64 v[] = {v0, v1};
 251         val = vld1q_s64(v);
 252     }
 253     int64 get0() const
 254     {
 255         return vgetq_lane_s64(val, 0);
 256     }
 257     int64x2_t val;
 258 };
 259
 260 #if CV_SIMD128_64F
 261 struct v_float64x2
 262 {
 263     typedef double lane_type;
 264     enum { nlanes = 2 };
 265
 266     v_float64x2() {}
 267     explicit v_float64x2(float64x2_t v) : val(v) {}
 268     v_float64x2(double v0, double v1)
 269     {
 270         double v[] = {v0, v1};
 271         val = vld1q_f64(v);
 272     }
 273     double get0() const
 274     {
 275         return vgetq_lane_f64(val, 0);
 276     }
 277     float64x2_t val;
 278 };
 279 #endif
 280
 281 #if CV_FP16
 282 // Workaround for old compilers: local f16 <-> s16 reinterpret helpers implemented as
     // C-style vector casts, for both the 128-bit (q) and 64-bit register forms.
 283 static inline int16x8_t vreinterpretq_s16_f16(float16x8_t a) { return (int16x8_t)a; }
 284 static inline float16x8_t vreinterpretq_f16_s16(int16x8_t a) { return (float16x8_t)a; }
 285 static inline int16x4_t vreinterpret_s16_f16(float16x4_t a) { return (int16x4_t)a; }
 286 static inline float16x4_t vreinterpret_f16_s16(int16x4_t a) { return (float16x4_t)a; }
 287
 288 static inline float16x8_t cv_vld1q_f16(const void* ptr)
 289 {
         // Loads a float16x8_t from ptr.
         // When vld1q_f16 is not a preprocessor macro, the data is loaded as eight
         // shorts and reinterpreted; otherwise vld1q_f16 is used directly.
 290 #ifndef vld1q_f16 // APPLE compiler defines vld1q_f16 as macro
 291     return vreinterpretq_f16_s16(vld1q_s16((const short*)ptr));
 292 #else
 293     return vld1q_f16((const __fp16*)ptr);
 294 #endif
 295 }
 296 static inline void cv_vst1q_f16(void* ptr, float16x8_t a)
 297 {
         // Stores the float16x8_t a to ptr.
         // When vst1q_f16 is not a preprocessor macro, a is reinterpreted as eight
         // shorts and stored with vst1q_s16; otherwise vst1q_f16 is used directly.
 298 #ifndef vst1q_f16 // APPLE compiler defines vst1q_f16 as macro
 299     vst1q_s16((short*)ptr, vreinterpretq_s16_f16(a));
 300 #else
 301     vst1q_f16((__fp16*)ptr, a);
 302 #endif
 303 }
 304
 305 static inline float16x4_t cv_vld1_f16(const void* ptr)
 306 {
         // Loads a float16x4_t from ptr.
         // When vld1_f16 is not a preprocessor macro, the data is loaded as four
         // shorts and reinterpreted; otherwise vld1_f16 is used directly.
 307 #ifndef vld1_f16 // APPLE compiler defines vld1_f16 as macro
 308     return vreinterpret_f16_s16(vld1_s16((const short*)ptr));
 309 #else
 310     return vld1_f16((const __fp16*)ptr);
 311 #endif
 312 }
 313 static inline void cv_vst1_f16(void* ptr, float16x4_t a)
 314 {
         // Stores the float16x4_t a to ptr.
         // When vst1_f16 is not a preprocessor macro, a is reinterpreted as four
         // shorts and stored with vst1_s16; otherwise vst1_f16 is used directly.
 315 #ifndef vst1_f16 // APPLE compiler defines vst1_f16 as macro
 316     vst1_s16((short*)ptr, vreinterpret_s16_f16(a));
 317 #else
 318     vst1_f16((__fp16*)ptr, a);
 319 #endif
 320 }
 321
 322
 323 struct v_float16x8
 324 {
 325     typedef short lane_type;
 326     enum { nlanes = 8 };
 327
 328     v_float16x8() {}
 329     explicit v_float16x8(float16x8_t v) : val(v) {}
 330     v_float16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
 331     {
 332         short v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
 333         val = cv_vld1q_f16(v);
 334     }
 335     short get0() const
 336     {
 337         return vgetq_lane_s16(vreinterpretq_s16_f16(val), 0);
 338     }
 339     float16x8_t val;
 340 };
 341
 342 inline v_float16x8 v_setzero_f16() { return v_float16x8(vreinterpretq_f16_s16(vdupq_n_s16((short)0))); } // all lanes hold the 16-bit pattern 0
 343 inline v_float16x8 v_setall_f16(short v) { return v_float16x8(vreinterpretq_f16_s16(vdupq_n_s16(v))); } // v is duplicated as a 16-bit pattern, not converted from a numeric value
 344 #endif
 345
 346 #define OPENCV_HAL_IMPL_NEON_INIT(_Tpv, _Tp, suffix) \
 347 inline v_##_Tpv v_setzero_##suffix() { return v_##_Tpv(vdupq_n_##suffix((_Tp)0)); } \
 348 inline v_##_Tpv v_setall_##suffix(_Tp v) { return v_##_Tpv(vdupq_n_##suffix(v)); } \
 349 inline _Tpv##_t vreinterpretq_##suffix##_##suffix(_Tpv##_t v) { return v; } \
 350 inline v_uint8x16 v_reinterpret_as_u8(const v_##_Tpv& v) { return v_uint8x16(vreinterpretq_u8_##suffix(v.val)); } \
 351 inline v_int8x16 v_reinterpret_as_s8(const v_##_Tpv& v) { return v_int8x16(vreinterpretq_s8_##suffix(v.val)); } \
 352 inline v_uint16x8 v_reinterpret_as_u16(const v_##_Tpv& v) { return v_uint16x8(vreinterpretq_u16_##suffix(v.val)); } \
 353 inline v_int16x8 v_reinterpret_as_s16(const v_##_Tpv& v) { return v_int16x8(vreinterpretq_s16_##suffix(v.val)); } \
 354 inline v_uint32x4 v_reinterpret_as_u32(const v_##_Tpv& v) { return v_uint32x4(vreinterpretq_u32_##suffix(v.val)); } \
 355 inline v_int32x4 v_reinterpret_as_s32(const v_##_Tpv& v) { return v_int32x4(vreinterpretq_s32_##suffix(v.val)); } \
 356 inline v_uint64x2 v_reinterpret_as_u64(const v_##_Tpv& v) { return v_uint64x2(vreinterpretq_u64_##suffix(v.val)); } \
 357 inline v_int64x2 v_reinterpret_as_s64(const v_##_Tpv& v) { return v_int64x2(vreinterpretq_s64_##suffix(v.val)); } \
 358 inline v_float32x4 v_reinterpret_as_f32(const v_##_Tpv& v) { return v_float32x4(vreinterpretq_f32_##suffix(v.val)); }
     // For the vector type v_<_Tpv> with lane type _Tp, the macro above generates:
     //   v_setzero_<suffix>()   : every lane set to (_Tp)0
     //   v_setall_<suffix>(v)   : every lane set to v
     //   vreinterpretq_<suffix>_<suffix>(v) : identity, so the same-type case of the
     //                            v_reinterpret_as_* overloads below has something to call
     //   v_reinterpret_as_{u8,s8,u16,s16,u32,s32,u64,s64,f32}(v) : wraps the matching
     //                            vreinterpretq_<dst>_<suffix> call on v.val
 359
 360 OPENCV_HAL_IMPL_NEON_INIT(uint8x16, uchar, u8)
 361 OPENCV_HAL_IMPL_NEON_INIT(int8x16, schar, s8)
 362 OPENCV_HAL_IMPL_NEON_INIT(uint16x8, ushort, u16)
 363 OPENCV_HAL_IMPL_NEON_INIT(int16x8, short, s16)
 364 OPENCV_HAL_IMPL_NEON_INIT(uint32x4, unsigned, u32)
 365 OPENCV_HAL_IMPL_NEON_INIT(int32x4, int, s32)
 366 OPENCV_HAL_IMPL_NEON_INIT(uint64x2, uint64, u64)
 367 OPENCV_HAL_IMPL_NEON_INIT(int64x2, int64, s64)
 368 OPENCV_HAL_IMPL_NEON_INIT(float32x4, float, f32)
 369 #if CV_SIMD128_64F
 370 #define OPENCV_HAL_IMPL_NEON_INIT_64(_Tpv, suffix) \
 371 inline v_float64x2 v_reinterpret_as_f64(const v_##_Tpv& v) { return v_float64x2(vreinterpretq_f64_##suffix(v.val)); }
     // The macro above adds the v_reinterpret_as_f64 overload for v_<_Tpv>.
     // OPENCV_HAL_IMPL_NEON_INIT(float64x2, ...) is expanded first; it defines the
     // identity vreinterpretq_f64_f64 that the (float64x2, f64) instantiation at the
     // end of this list calls.
 372 OPENCV_HAL_IMPL_NEON_INIT(float64x2, double, f64)
 373 OPENCV_HAL_IMPL_NEON_INIT_64(uint8x16, u8)
 374 OPENCV_HAL_IMPL_NEON_INIT_64(int8x16, s8)
 375 OPENCV_HAL_IMPL_NEON_INIT_64(uint16x8, u16)
 376 OPENCV_HAL_IMPL_NEON_INIT_64(int16x8, s16)
 377 OPENCV_HAL_IMPL_NEON_INIT_64(uint32x4, u32)
 378 OPENCV_HAL_IMPL_NEON_INIT_64(int32x4, s32)
 379 OPENCV_HAL_IMPL_NEON_INIT_64(uint64x2, u64)
 380 OPENCV_HAL_IMPL_NEON_INIT_64(int64x2, s64)
 381 OPENCV_HAL_IMPL_NEON_INIT_64(float32x4, f32)
 382 OPENCV_HAL_IMPL_NEON_INIT_64(float64x2, f64)
 383 #endif
 384
 385 #define OPENCV_HAL_IMPL_NEON_PACK(_Tpvec, _Tp, hreg, suffix, _Tpwvec, pack, mov, rshr) \
 386 inline _Tpvec v_##pack(const _Tpwvec& a, const _Tpwvec& b) \
 387 { \
 388     hreg a1 = mov(a.val), b1 = mov(b.val); \
 389     return _Tpvec(vcombine_##suffix(a1, b1)); \
 390 } \
 391 inline void v_##pack##_store(_Tp* ptr, const _Tpwvec& a) \
 392 { \
 393     hreg a1 = mov(a.val); \
 394     vst1_##suffix(ptr, a1); \
 395 } \
 396 template<int n> inline \
 397 _Tpvec v_rshr_##pack(const _Tpwvec& a, const _Tpwvec& b) \
 398 { \
 399     hreg a1 = rshr(a.val, n); \
 400     hreg b1 = rshr(b.val, n); \
 401     return _Tpvec(vcombine_##suffix(a1, b1)); \
 402 } \
 403 template<int n> inline \
 404 void v_rshr_##pack##_store(_Tp* ptr, const _Tpwvec& a) \
 405 { \
 406     hreg a1 = rshr(a.val, n); \
 407     vst1_##suffix(ptr, a1); \
 408 }
     // Narrowing ("pack") operations from the wide vector _Tpwvec to _Tpvec.
     // For pack name <pack> the macro above generates:
     //   v_<pack>(a, b)               : narrows a and b with `mov` and joins the two
     //                                  halves with vcombine (a first, then b)
     //   v_<pack>_store(ptr, a)       : narrows a with `mov` and stores the half register
     //   v_rshr_<pack><n>(a, b)       : as v_<pack>, narrowing with `rshr(x, n)` instead
     //   v_rshr_<pack>_store<n>(ptr, a): as v_<pack>_store, narrowing with `rshr(x, n)`
     // Macro parameters: hreg is the half-width register type, suffix selects the
     // vcombine_/vst1_ variant, mov/rshr are the narrowing intrinsics to use.
     // The 64->32 bit instantiations below pass vmovn/vrshrn, while every other one
     // passes a vq* intrinsic (the saturating family in NEON naming).
 409
 410 OPENCV_HAL_IMPL_NEON_PACK(v_uint8x16, uchar, uint8x8_t, u8, v_uint16x8, pack, vqmovn_u16, vqrshrn_n_u16)
 411 OPENCV_HAL_IMPL_NEON_PACK(v_int8x16, schar, int8x8_t, s8, v_int16x8, pack, vqmovn_s16, vqrshrn_n_s16)
 412 OPENCV_HAL_IMPL_NEON_PACK(v_uint16x8, ushort, uint16x4_t, u16, v_uint32x4, pack, vqmovn_u32, vqrshrn_n_u32)
 413 OPENCV_HAL_IMPL_NEON_PACK(v_int16x8, short, int16x4_t, s16, v_int32x4, pack, vqmovn_s32, vqrshrn_n_s32)
 414 OPENCV_HAL_IMPL_NEON_PACK(v_uint32x4, unsigned, uint32x2_t, u32, v_uint64x2, pack, vmovn_u64, vrshrn_n_u64)
 415 OPENCV_HAL_IMPL_NEON_PACK(v_int32x4, int, int32x2_t, s32, v_int64x2, pack, vmovn_s64, vrshrn_n_s64)
 416
 417 OPENCV_HAL_IMPL_NEON_PACK(v_uint8x16, uchar, uint8x8_t, u8, v_int16x8, pack_u, vqmovun_s16, vqrshrun_n_s16)
 418 OPENCV_HAL_IMPL_NEON_PACK(v_uint16x8, ushort, uint16x4_t, u16, v_int32x4, pack_u, vqmovun_s32, vqrshrun_n_s32)
 419
 420 inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
 421                             const v_float32x4& m1, const v_float32x4& m2,
 422                             const v_float32x4& m3)
 423 {
 424     float32x2_t vl = vget_low_f32(v.val), vh = vget_high_f32(v.val);
 425     float32x4_t res = vmulq_lane_f32(m0.val, vl, 0);
 426     res = vmlaq_lane_f32(res, m1.val, vl, 1);
 427     res = vmlaq_lane_f32(res, m2.val, vh, 0);
 428     res = vmlaq_lane_f32(res, m3.val, vh, 1);
 429     return v_float32x4(res);
 430 }
 431
 432 inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
 433                                const v_float32x4& m1, const v_float32x4& m2,
 434                                const v_float32x4& a)
 435 {
 436     float32x2_t vl = vget_low_f32(v.val), vh = vget_high_f32(v.val);
 437     float32x4_t res = vmulq_lane_f32(m0.val, vl, 0);
 438     res = vmlaq_lane_f32(res, m1.val, vl, 1);
 439     res = vmlaq_lane_f32(res, m2.val, vh, 0);
 440     res = vaddq_f32(res, a.val);
 441     return v_float32x4(res);
 442 }
 443
 444 #define OPENCV_HAL_IMPL_NEON_BIN_OP(bin_op, _Tpvec, intrin) \
 445 inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
 446 { \
 447     return _Tpvec(intrin(a.val, b.val)); \
 448 } \
 449 inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
 450 { \
 451     a.val = intrin(a.val, b.val); \
 452     return a; \
 453 }
     // The macro above defines `a bin_op b` and `a bin_op= b` for _Tpvec, both
     // forwarding to intrin(a.val, b.val).
     // In the list below, + and - on 8- and 16-bit integer vectors are bound to
     // vqadd/vqsub (the saturating family in NEON naming), while 32/64-bit integers
     // and floats use plain vadd/vsub. operator* is instantiated only for 16- and
     // 32-bit integers and for float32 here.
 454
 455 OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint8x16, vqaddq_u8)
 456 OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint8x16, vqsubq_u8)
 457 OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int8x16, vqaddq_s8)
 458 OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int8x16, vqsubq_s8)
 459 OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint16x8, vqaddq_u16)
 460 OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint16x8, vqsubq_u16)
 461 OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_uint16x8, vmulq_u16)
 462 OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int16x8, vqaddq_s16)
 463 OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int16x8, vqsubq_s16)
 464 OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_int16x8, vmulq_s16)
 465 OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int32x4, vaddq_s32)
 466 OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int32x4, vsubq_s32)
 467 OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_int32x4, vmulq_s32)
 468 OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint32x4, vaddq_u32)
 469 OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint32x4, vsubq_u32)
 470 OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_uint32x4, vmulq_u32)
 471 OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_float32x4, vaddq_f32)
 472 OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_float32x4, vsubq_f32)
 473 OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_float32x4, vmulq_f32)
 474 OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int64x2, vaddq_s64)
 475 OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int64x2, vsubq_s64)
 476 OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint64x2, vaddq_u64)
 477 OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint64x2, vsubq_u64)
 478 #if CV_SIMD128_64F
 479 OPENCV_HAL_IMPL_NEON_BIN_OP(/, v_float32x4, vdivq_f32) // CV_SIMD128_64F branch: float32 division uses vdivq_f32
     // Arithmetic operators for v_float64x2, available only in this branch.
 480 OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_float64x2, vaddq_f64)
 481 OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_float64x2, vsubq_f64)
 482 OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_float64x2, vmulq_f64)
 483 OPENCV_HAL_IMPL_NEON_BIN_OP(/, v_float64x2, vdivq_f64)
 484 #else
 485 inline v_float32x4 operator / (const v_float32x4& a, const v_float32x4& b)
 486 {
         // Fallback used when CV_SIMD128_64F is 0: a / b is computed as a * (1/b),
         // where 1/b starts from the vrecpeq_f32 estimate and is refined by two
         // vrecpsq_f32 steps.
 487     float32x4_t inv_b = vrecpeq_f32(b.val);
 488     inv_b = vmulq_f32(vrecpsq_f32(b.val, inv_b), inv_b);
 489     inv_b = vmulq_f32(vrecpsq_f32(b.val, inv_b), inv_b);
 490     return v_float32x4(vmulq_f32(a.val, inv_b));
 491 }
 492 inline v_float32x4& operator /= (v_float32x4& a, const v_float32x4& b)
 493 {
         // In-place counterpart of operator / for the CV_SIMD128_64F == 0 branch:
         // multiplies a by a reciprocal of b obtained from vrecpeq_f32 plus two
         // vrecpsq_f32 refinement steps, and returns a.
 494     float32x4_t inv_b = vrecpeq_f32(b.val);
 495     inv_b = vmulq_f32(vrecpsq_f32(b.val, inv_b), inv_b);
 496     inv_b = vmulq_f32(vrecpsq_f32(b.val, inv_b), inv_b);
 497     a.val = vmulq_f32(a.val, inv_b);
 498     return a;
 499 }
 500 #endif
 501
 502 inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b,
 503                          v_int32x4& c, v_int32x4& d)
 504 {
         // Widening multiply: c receives the products of the low four lanes,
         // d the products of the high four lanes.
 505     c = v_int32x4(vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val)));
 506     d = v_int32x4(vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val)));
 507 }
 508
 509 inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b,
 510                          v_uint32x4& c, v_uint32x4& d)
 511 {
         // Widening multiply: c receives the products of the low four lanes,
         // d the products of the high four lanes.
 512     c = v_uint32x4(vmull_u16(vget_low_u16(a.val), vget_low_u16(b.val)));
 513     d = v_uint32x4(vmull_u16(vget_high_u16(a.val), vget_high_u16(b.val)));
 514 }
 515
 516 inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b,
 517                          v_uint64x2& c, v_uint64x2& d)
 518 {
         // Widening multiply: c receives the products of the low two lanes,
         // d the products of the high two lanes.
 519     c = v_uint64x2(vmull_u32(vget_low_u32(a.val), vget_low_u32(b.val)));
 520     d = v_uint64x2(vmull_u32(vget_high_u32(a.val), vget_high_u32(b.val)));
 521 }
 522
 523 inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
 524 {
         // Multiplies the 16-bit lanes into 32-bit products (low and high halves),
         // de-interleaves them with vuzpq_s32 into even- and odd-indexed products,
         // and adds the two, so result lane i = a[2i]*b[2i] + a[2i+1]*b[2i+1].
 525     int32x4_t prod_lo = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val));
 526     int32x4_t prod_hi = vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val));
 527     int32x4x2_t even_odd = vuzpq_s32(prod_lo, prod_hi);
 528     return v_int32x4(vaddq_s32(even_odd.val[0], even_odd.val[1]));
 529 }
 530
 531 inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
 532 {
         // Two-argument v_dotprod(a, b) with c added lane-wise via vaddq_s32.
 533     v_int32x4 pair_sums = v_dotprod(a, b);
 534     return v_int32x4(vaddq_s32(pair_sums.val , c.val));
 535 }
 536
 537 #define OPENCV_HAL_IMPL_NEON_LOGIC_OP(_Tpvec, suffix) \
 538     OPENCV_HAL_IMPL_NEON_BIN_OP(&, _Tpvec, vandq_##suffix) \
 539     OPENCV_HAL_IMPL_NEON_BIN_OP(|, _Tpvec, vorrq_##suffix) \
 540     OPENCV_HAL_IMPL_NEON_BIN_OP(^, _Tpvec, veorq_##suffix) \
 541     inline _Tpvec operator ~ (const _Tpvec& a) \
 542     { \
 543         return _Tpvec(vreinterpretq_##suffix##_u8(vmvnq_u8(vreinterpretq_u8_##suffix(a.val)))); \
 544     }
     // The macro above defines &, |, ^ (with their compound-assignment forms, via
     // OPENCV_HAL_IMPL_NEON_BIN_OP) and unary ~ for an integer vector type.
     // ~ is always carried out on a u8 view of the register (vmvnq_u8) and
     // reinterpreted back, so it does not need a per-lane-width intrinsic.
 545
 546 OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint8x16, u8)
 547 OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int8x16, s8)
 548 OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint16x8, u16)
 549 OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int16x8, s16)
 550 OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint32x4, u32)
 551 OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int32x4, s32)
 552 OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint64x2, u64)
 553 OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int64x2, s64)
 554
 555 #define OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(bin_op, intrin) \
 556 inline v_float32x4 operator bin_op (const v_float32x4& a, const v_float32x4& b) \
 557 { \
 558     return v_float32x4(vreinterpretq_f32_s32(intrin(vreinterpretq_s32_f32(a.val), vreinterpretq_s32_f32(b.val)))); \
 559 } \
 560 inline v_float32x4& operator bin_op##= (v_float32x4& a, const v_float32x4& b) \
 561 { \
 562     a.val = vreinterpretq_f32_s32(intrin(vreinterpretq_s32_f32(a.val), vreinterpretq_s32_f32(b.val))); \
 563     return a; \
 564 }
     // Bitwise operators for v_float32x4: both operands are reinterpreted as
     // int32x4_t, combined with the integer intrinsic `intrin`, and the result is
     // reinterpreted back to float32x4_t (bit patterns, not numeric values).
 565
 566 OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(&, vandq_s32)
 567 OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(|, vorrq_s32)
 568 OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(^, veorq_s32)
 569
 570 inline v_float32x4 operator ~ (const v_float32x4& a)
 571 {
         // Bitwise NOT of the register contents, performed on an int32x4_t view.
 572     return v_float32x4(vreinterpretq_f32_s32(vmvnq_s32(vreinterpretq_s32_f32(a.val))));
 573 }
 574
 575 #if CV_SIMD128_64F
 576 inline v_float32x4 v_sqrt(const v_float32x4& x)
 577 {
         // CV_SIMD128_64F branch: per-lane square root via vsqrtq_f32.
 578     return v_float32x4(vsqrtq_f32(x.val));
 579 }
 580
 581 inline v_float32x4 v_invsqrt(const v_float32x4& x)
 582 {
         // CV_SIMD128_64F branch: 1 / sqrt(x) per lane, using v_sqrt and operator /.
 583     v_float32x4 ones = v_setall_f32(1.0f);
 584     return ones / v_sqrt(x);
 585 }
 586 #else
 587 inline v_float32x4 v_sqrt(const v_float32x4& x)
 588 {
         // CV_SIMD128_64F == 0 branch: sqrt(x) is computed as x * rsqrt(x).
         // The input to the reciprocal-sqrt estimate is clamped from below to
         // FLT_MIN; the estimate is refined by two vrsqrtsq_f32 steps. The final
         // multiply uses the unclamped x.
 589     float32x4_t clamped = vmaxq_f32(x.val, vdupq_n_f32(FLT_MIN));
 590     float32x4_t rsq = vrsqrteq_f32(clamped);
 591     rsq = vmulq_f32(vrsqrtsq_f32(vmulq_f32(clamped, rsq), rsq), rsq);
 592     rsq = vmulq_f32(vrsqrtsq_f32(vmulq_f32(clamped, rsq), rsq), rsq);
 593     return v_float32x4(vmulq_f32(x.val, rsq));
 594 }
 595
 596 inline v_float32x4 v_invsqrt(const v_float32x4& x)
 597 {
         // CV_SIMD128_64F == 0 branch: vrsqrteq_f32 estimate refined by two
         // vrsqrtsq_f32 steps. Unlike v_sqrt in this branch, x is not clamped.
 598     float32x4_t rsq = vrsqrteq_f32(x.val);
 599     rsq = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x.val, rsq), rsq), rsq);
 600     rsq = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x.val, rsq), rsq), rsq);
 601     return v_float32x4(rsq);
 602 }
 603 #endif
 604
 605 #define OPENCV_HAL_IMPL_NEON_ABS(_Tpuvec, _Tpsvec, usuffix, ssuffix) \
 606 inline _Tpuvec v_abs(const _Tpsvec& a) { return v_reinterpret_as_##usuffix(_Tpsvec(vabsq_##ssuffix(a.val))); }
     // v_abs for signed integer vectors: applies vabsq_<ssuffix> and returns the
     // result reinterpreted as the unsigned vector type of the same lane width.
 607
 608 OPENCV_HAL_IMPL_NEON_ABS(v_uint8x16, v_int8x16, u8, s8)
 609 OPENCV_HAL_IMPL_NEON_ABS(v_uint16x8, v_int16x8, u16, s16)
 610 OPENCV_HAL_IMPL_NEON_ABS(v_uint32x4, v_int32x4, u32, s32)
 611
 612 inline v_float32x4 v_abs(v_float32x4 x) // note: x is taken by value, unlike most functions in this file
 613 { return v_float32x4(vabsq_f32(x.val)); } // per-lane absolute value via vabsq_f32
 614
 615 #if CV_SIMD128_64F
 616 #define OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(bin_op, intrin) \
 617 inline v_float64x2 operator bin_op (const v_float64x2& a, const v_float64x2& b) \
 618 { \
 619     return v_float64x2(vreinterpretq_f64_s64(intrin(vreinterpretq_s64_f64(a.val), vreinterpretq_s64_f64(b.val)))); \
 620 } \
 621 inline v_float64x2& operator bin_op##= (v_float64x2& a, const v_float64x2& b) \
 622 { \
 623     a.val = vreinterpretq_f64_s64(intrin(vreinterpretq_s64_f64(a.val), vreinterpretq_s64_f64(b.val))); \
 624     return a; \
 625 }
     // Bitwise operators for v_float64x2: both operands are reinterpreted as
     // int64x2_t, combined with the integer intrinsic `intrin`, and the result is
     // reinterpreted back to float64x2_t (bit patterns, not numeric values).
 626
 627 OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(&, vandq_s64)
 628 OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(|, vorrq_s64)
 629 OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(^, veorq_s64)
 630
 631 inline v_float64x2 operator ~ (const v_float64x2& a)
 632 {
         // Bitwise NOT of the register contents. The inversion is done on an
         // int32x4_t view (vmvnq_s32) rather than a 64-bit one; the f64<->s32
         // reinterpret names match the templates generated by
         // OPENCV_HAL_IMPL_NEON_REINTERPRET earlier in this file.
 633     return v_float64x2(vreinterpretq_f64_s32(vmvnq_s32(vreinterpretq_s32_f64(a.val))));
 634 }
 635
 636 inline v_float64x2 v_sqrt(const v_float64x2& x)
 637 {
         // Per-lane square root via vsqrtq_f64.
 638     return v_float64x2(vsqrtq_f64(x.val));
 639 }
 640
 641 inline v_float64x2 v_invsqrt(const v_float64x2& x)
 642 {
         // 1 / sqrt(x) per lane, using v_sqrt and operator / for v_float64x2.
 643     v_float64x2 ones = v_setall_f64(1.0);
 644     return ones / v_sqrt(x);
 645 }
 646
 647 inline v_float64x2 v_abs(v_float64x2 x) // note: x is taken by value, unlike most functions in this file
 648 { return v_float64x2(vabsq_f64(x.val)); } // per-lane absolute value via vabsq_f64
 649 #endif
 650
 651 // TODO: exp, log, sin, cos
 652
 653 #define OPENCV_HAL_IMPL_NEON_BIN_FUNC(_Tpvec, func, intrin) \
 654 inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
 655 { \
 656     return _Tpvec(intrin(a.val, b.val)); \
 657 }
     // The macro above defines a two-argument function `func` on _Tpvec that
     // forwards to intrin(a.val, b.val). Below it is used for per-lane v_min and
     // v_max; no 64-bit integer min/max is instantiated here.
 658
 659 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_min, vminq_u8)
 660 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_max, vmaxq_u8)
 661 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_min, vminq_s8)
 662 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_max, vmaxq_s8)
 663 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_min, vminq_u16)
 664 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_max, vmaxq_u16)
 665 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_min, vminq_s16)
 666 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_max, vmaxq_s16)
 667 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_min, vminq_u32)
 668 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_max, vmaxq_u32)
 669 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int32x4, v_min, vminq_s32)
 670 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int32x4, v_max, vmaxq_s32)
 671 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_min, vminq_f32)
 672 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_max, vmaxq_f32)
 673 #if CV_SIMD128_64F
 674 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float64x2, v_min, vminq_f64)
 675 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float64x2, v_max, vmaxq_f64)
 676 #endif
 677
 678 #if CV_SIMD128_64F
 679 inline int64x2_t vmvnq_s64(int64x2_t a)
 680 {
         // Bitwise NOT for int64x2_t, built as XOR with an all-ones register
         // (four 0xFFFFFFFF words reinterpreted as two 64-bit lanes).
 681     int64x2_t all_ones = vreinterpretq_s64_u32(vdupq_n_u32(0xFFFFFFFF));
 682     return veorq_s64(a, all_ones);
 683 }
 684 inline uint64x2_t vmvnq_u64(uint64x2_t a)
 685 {
         // Bitwise NOT for uint64x2_t, built as XOR with an all-ones register
         // (four 0xFFFFFFFF words reinterpreted as two 64-bit lanes).
 686     uint64x2_t all_ones = vreinterpretq_u64_u32(vdupq_n_u32(0xFFFFFFFF));
 687     return veorq_u64(a, all_ones);
 688 }
 689 #endif
 690 #define OPENCV_HAL_IMPL_NEON_INT_CMP_OP(_Tpvec, cast, suffix, not_suffix) \
 691 inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
 692 { return _Tpvec(cast(vceqq_##suffix(a.val, b.val))); } \
 693 inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
 694 { return _Tpvec(cast(vmvnq_##not_suffix(vceqq_##suffix(a.val, b.val)))); } \
 695 inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
 696 { return _Tpvec(cast(vcltq_##suffix(a.val, b.val))); } \
 697 inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
 698 { return _Tpvec(cast(vcgtq_##suffix(a.val, b.val))); } \
 699 inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
 700 { return _Tpvec(cast(vcleq_##suffix(a.val, b.val))); } \
 701 inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
 702 { return _Tpvec(cast(vcgeq_##suffix(a.val, b.val))); }
     // Comparison operators returning a mask of the same vector type as the operands.
     // Each operator calls the vc??q_<suffix> compare intrinsic and passes its
     // result through `cast` to get back to _Tpvec's register type.
     // != is implemented as == followed by vmvnq_<not_suffix> (bitwise NOT);
     // not_suffix names the unsigned type the compare intrinsic returns.
     // Despite the INT in its name, the macro is also instantiated for the float
     // vector types below.
     // NOTE(review): OPENCV_HAL_NOP is defined outside this view; presumably it
     // passes its argument through unchanged -- confirm.
 703
 704 OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint8x16, OPENCV_HAL_NOP, u8, u8)
 705 OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int8x16, vreinterpretq_s8_u8, s8, u8)
 706 OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint16x8, OPENCV_HAL_NOP, u16, u16)
 707 OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int16x8, vreinterpretq_s16_u16, s16, u16)
 708 OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint32x4, OPENCV_HAL_NOP, u32, u32)
 709 OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int32x4, vreinterpretq_s32_u32, s32, u32)
 710 OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_float32x4, vreinterpretq_f32_u32, f32, u32)
 711 #if CV_SIMD128_64F
 712 OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint64x2, OPENCV_HAL_NOP, u64, u64)
 713 OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int64x2, vreinterpretq_s64_u64, s64, u64)
 714 OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_float64x2, vreinterpretq_f64_u64, f64, u64)
 715 #endif
 716
 717 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_add_wrap, vaddq_u8) // v_add_wrap / v_sub_wrap: 8- and 16-bit add/sub bound to the plain vaddq/vsubq intrinsics
     // (operator + and - on these same types are bound to vqaddq/vqsubq instead).
 718 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_add_wrap, vaddq_s8)
 719 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_add_wrap, vaddq_u16)
 720 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_add_wrap, vaddq_s16)
 721 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_sub_wrap, vsubq_u8)
 722 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_sub_wrap, vsubq_s8)
 723 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_sub_wrap, vsubq_u16)
 724 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_sub_wrap, vsubq_s16)
 725
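 726 // TODO: absdiff for signed integers (NOTE: signed-integer v_absdiff overloads returning unsigned vectors are defined below via OPENCV_HAL_IMPL_NEON_BIN_FUNC2; confirm whether this TODO is still needed)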
 727 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_absdiff, vabdq_u8) // v_absdiff for unsigned and float vectors, bound to the vabdq intrinsics
 728 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_absdiff, vabdq_u16)
 729 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_absdiff, vabdq_u32)
 730 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_absdiff, vabdq_f32)
 731 #if CV_SIMD128_64F
 732 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float64x2, v_absdiff, vabdq_f64)
 733 #endif
 734
 735 #define OPENCV_HAL_IMPL_NEON_BIN_FUNC2(_Tpvec, _Tpvec2, cast, func, intrin) \
 736 inline _Tpvec2 func(const _Tpvec& a, const _Tpvec& b) \
 737 { \
 738     return _Tpvec2(cast(intrin(a.val, b.val))); \
 739 }
     // Like OPENCV_HAL_IMPL_NEON_BIN_FUNC, but the result type _Tpvec2 differs from
     // the operand type _Tpvec; `cast` converts the intrinsic's result register.
     // Used below for v_absdiff on signed integer vectors, which returns the
     // unsigned vector type of the same lane width.
 740
 741 OPENCV_HAL_IMPL_NEON_BIN_FUNC2(v_int8x16, v_uint8x16, vreinterpretq_u8_s8, v_absdiff, vabdq_s8)
 742 OPENCV_HAL_IMPL_NEON_BIN_FUNC2(v_int16x8, v_uint16x8, vreinterpretq_u16_s16, v_absdiff, vabdq_s16)
 743 OPENCV_HAL_IMPL_NEON_BIN_FUNC2(v_int32x4, v_uint32x4, vreinterpretq_u32_s32, v_absdiff, vabdq_s32)
 744
 745 inline v_float32x4 v_magnitude(const v_float32x4& a, const v_float32x4& b)
 746 {
         // sqrt(a*a + b*b) per lane; the sum of squares is formed with vmulq_f32
         // followed by a vmlaq_f32 multiply-accumulate.
 747     v_float32x4 sum_sq(vmlaq_f32(vmulq_f32(a.val, a.val), b.val, b.val));
 748     return v_sqrt(sum_sq);
 749 }
 750
 751 inline v_float32x4 v_sqr_magnitude(const v_float32x4& a, const v_float32x4& b)
 752 {
         // a*a + b*b per lane (no square root), via vmulq_f32 and vmlaq_f32.
 753     return v_float32x4(vmlaq_f32(vmulq_f32(a.val, a.val), b.val, b.val));
 754 }
 755
 756 inline v_float32x4 v_fma(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
 757 {
         // Returns a * b + c per lane. Note the argument order: the accumulator c is
         // passed first to the intrinsic in both branches.
 758 #if CV_SIMD128_64F
 759     // ARMv8, which adds support for 64-bit floating-point (so CV_SIMD128_64F is defined),
 760     // also adds FMA support both for single- and double-precision floating-point vectors
 761     return v_float32x4(vfmaq_f32(c.val, a.val, b.val));
 762 #else
         // NOTE(review): vmlaq_f32 is a multiply-accumulate, presumably not fused;
         // rounding may therefore differ from the vfmaq_f32 branch -- confirm.
 763     return v_float32x4(vmlaq_f32(c.val, a.val, b.val));
 764 #endif
 765 }
 766
 767 inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
 768 {
         // Returns a * b + c per lane via vmlaq_s32 (accumulator c passed first).
 769     return v_int32x4(vmlaq_s32(c.val, a.val, b.val));
 770 }
 771
 772 inline v_float32x4 v_muladd(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
 773 {
         // Same as v_fma(a, b, c) for v_float32x4; forwards directly.
 774     return v_fma(a, b, c);
 775 }
 776
 777 inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
 778 {
         // Same as v_fma(a, b, c) for v_int32x4; forwards directly.
 779     return v_fma(a, b, c);
 780 }
 781
 782 #if CV_SIMD128_64F
 783 inline v_float64x2 v_magnitude(const v_float64x2& a, const v_float64x2& b)
 784 {
         // sqrt(a*a + b*b) per lane; the two squares are computed separately with
         // vmulq_f64 and then added with vaddq_f64.
 785     v_float64x2 sum_sq(vaddq_f64(vmulq_f64(a.val, a.val), vmulq_f64(b.val, b.val)));
 786     return v_sqrt(sum_sq);
 787 }
 788
 789 inline v_float64x2 v_sqr_magnitude(const v_float64x2& a, const v_float64x2& b)
 790 {
         // a*a + b*b per lane (no square root): two vmulq_f64 products added with vaddq_f64.
 791     return v_float64x2(vaddq_f64(vmulq_f64(a.val, a.val), vmulq_f64(b.val, b.val)));
 792 }
 793
 794 inline v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
 795 {
         // Per-lane a * b + c; c is the accumulator operand of vfmaq_f64.
         const float64x2_t acc = vfmaq_f64(c.val, a.val, b.val);
 796     return v_float64x2(acc);
 797 }
 798
 799 inline v_float64x2 v_muladd(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
 800 {
         // Same operation as v_fma for float64 lanes: forwards all three operands unchanged.
 801     return v_fma(a, b, c);
 802 }
 803 #endif
 804
 805 // trade efficiency for convenience
     // OPENCV_HAL_IMPL_NEON_SHIFT_OP defines the shift operations of one integer vector type:
     //   - operator<< / operator>> take the shift count as an ordinary int argument; the
     //     count is cast to _Tps, broadcast with vdupq_n_##ssuffix and applied with
     //     vshlq_##suffix (operator>> passes the negated count);
     //   - v_shl<n> / v_shr<n> / v_rshr<n> take the count as a template argument and map
     //     to vshlq_n_ / vshrq_n_ / vrshrq_n_##suffix respectively.
     // Parameters: _Tpvec - wrapper vector type, suffix - intrinsic suffix of that type,
     // _Tps / ssuffix - scalar type and intrinsic suffix of the shift-count vector.
 806 #define OPENCV_HAL_IMPL_NEON_SHIFT_OP(_Tpvec, suffix, _Tps, ssuffix) \
 807 inline _Tpvec operator << (const _Tpvec& a, int n) \
 808 { return _Tpvec(vshlq_##suffix(a.val, vdupq_n_##ssuffix((_Tps)n))); } \
 809 inline _Tpvec operator >> (const _Tpvec& a, int n) \
 810 { return _Tpvec(vshlq_##suffix(a.val, vdupq_n_##ssuffix((_Tps)-n))); } \
 811 template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
 812 { return _Tpvec(vshlq_n_##suffix(a.val, n)); } \
 813 template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
 814 { return _Tpvec(vshrq_n_##suffix(a.val, n)); } \
 815 template<int n> inline _Tpvec v_rshr(const _Tpvec& a) \
 816 { return _Tpvec(vrshrq_n_##suffix(a.val, n)); }
 817
     // Instantiated for the 8/16/32/64-bit signed and unsigned vector types; the
     // shift-count vector always uses the signed suffix of the same lane width.
 818 OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint8x16, u8, schar, s8)
 819 OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int8x16, s8, schar, s8)
 820 OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint16x8, u16, short, s16)
 821 OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int16x8, s16, short, s16)
 822 OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint32x4, u32, int, s32)
 823 OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int32x4, s32, int, s32)
 824 OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint64x2, u64, int64, s64)
 825 OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int64x2, s64, int64, s64)
 826
 827 #define OPENCV_HAL_IMPL_NEON_ROTATE_OP(_Tpvec, suffix) \
     /* Lane rotations of one vector type, all built on vextq_##suffix. */ \
     /* One-operand forms pair a.val with an all-zero vector; */ \
     /* two-operand forms pair a.val with b.val instead. */ \
 828 template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a) \
 829 { return _Tpvec(vextq_##suffix(a.val, vdupq_n_##suffix(0), n)); } \
 830 template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a) \
 831 { return _Tpvec(vextq_##suffix(vdupq_n_##suffix(0), a.val, _Tpvec::nlanes - n)); } \
     /* n == 0 is specialized to return a unchanged: the generic form would hand */ \
     /* the index _Tpvec::nlanes - 0 to vextq. */ \
 832 template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a) \
 833 { return a; } \
 834 template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \
 835 { return _Tpvec(vextq_##suffix(a.val, b.val, n)); } \
 836 template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
 837 { return _Tpvec(vextq_##suffix(b.val, a.val, _Tpvec::nlanes - n)); } \
     /* Same n == 0 special case for the two-operand form; b is not used there. */ \
 838 template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a, const _Tpvec& b) \
 839 { CV_UNUSED(b); return a; }
 840
     // Instantiated for every integer and float vector type; the float64 variant
     // only when CV_SIMD128_64F is set.
 841 OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_uint8x16, u8)
 842 OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_int8x16, s8)
 843 OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_uint16x8, u16)
 844 OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_int16x8, s16)
 845 OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_uint32x4, u32)
 846 OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_int32x4, s32)
 847 OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_float32x4, f32)
 848 OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_uint64x2, u64)
 849 OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_int64x2, s64)
 850 #if CV_SIMD128_64F
 851 OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_float64x2, f64)
 852 #endif
 853
 854 #define OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(_Tpvec, _Tp, suffix) \
     /* Load/store family of one vector type (_Tp: element type, suffix: intrinsic suffix). */ \
     /* The *_aligned variants use the same vld1q / vst1q intrinsic as the plain ones. */ \
 855 inline _Tpvec v_load(const _Tp* ptr) \
 856 { return _Tpvec(vld1q_##suffix(ptr)); } \
 857 inline _Tpvec v_load_aligned(const _Tp* ptr) \
 858 { return _Tpvec(vld1q_##suffix(ptr)); } \
     /* v_load_low: loads one half-width vector from ptr; the upper half is set to zero. */ \
 859 inline _Tpvec v_load_low(const _Tp* ptr) \
 860 { return _Tpvec(vcombine_##suffix(vld1_##suffix(ptr), vdup_n_##suffix((_Tp)0))); } \
     /* v_load_halves: lower half from ptr0, upper half from ptr1. */ \
 861 inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
 862 { return _Tpvec(vcombine_##suffix(vld1_##suffix(ptr0), vld1_##suffix(ptr1))); } \
 863 inline void v_store(_Tp* ptr, const _Tpvec& a) \
 864 { vst1q_##suffix(ptr, a.val); } \
 865 inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
 866 { vst1q_##suffix(ptr, a.val); } \
     /* v_store_low / v_store_high: write only the lower / upper half of a to ptr. */ \
 867 inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
 868 { vst1_##suffix(ptr, vget_low_##suffix(a.val)); } \
 869 inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
 870 { vst1_##suffix(ptr, vget_high_##suffix(a.val)); }
 871
     // Instantiated for every integer and float vector type; the float64 variant
     // only when CV_SIMD128_64F is set.
 872 OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint8x16, uchar, u8)
 873 OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int8x16, schar, s8)
 874 OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint16x8, ushort, u16)
 875 OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int16x8, short, s16)
 876 OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint32x4, unsigned, u32)
 877 OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int32x4, int, s32)
 878 OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint64x2, uint64, u64)
 879 OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int64x2, int64, s64)
 880 OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float32x4, float, f32)
 881 #if CV_SIMD128_64F
 882 OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float64x2, double, f64)
 883 #endif
 884
 885 #if CV_FP16
 886 // Workaround for old compilers
     // Half-precision vectors are read from and written to memory typed as short*;
     // the actual transfer is done by the cv_vld1q_f16 / cv_vst1q_f16 helpers.
     // The *_aligned variants call the same helper as the plain ones.
 887 inline v_float16x8 v_load_f16(const short* ptr)
 888 { return v_float16x8(cv_vld1q_f16(ptr)); }
 889 inline v_float16x8 v_load_f16_aligned(const short* ptr)
 890 { return v_float16x8(cv_vld1q_f16(ptr)); }
 891
 892 inline void v_store(short* ptr, const v_float16x8& a)
 893 { cv_vst1q_f16(ptr, a.val); }
 894 inline void v_store_aligned(short* ptr, const v_float16x8& a)
 895 { cv_vst1q_f16(ptr, a.val); }
 896 #endif
 897
 898 #define OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
     /* v_reduce_<func> over the 8 lanes of a: three pairwise vp<vectorfunc> steps */ \
     /* (8 -> 4 -> 2 -> 1 distinct values), after which lane 0 holds the result. */ \
     /* The intermediate values stay in the 4-lane vector type _Tpnvec##_t. */ \
 899 inline scalartype v_reduce_##func(const _Tpvec& a) \
 900 { \
 901     _Tpnvec##_t a0 = vp##vectorfunc##_##suffix(vget_low_##suffix(a.val), vget_high_##suffix(a.val)); \
 902     a0 = vp##vectorfunc##_##suffix(a0, a0); \
 903     return (scalartype)vget_lane_##suffix(vp##vectorfunc##_##suffix(a0, a0),0); \
 904 }
 905
     // sum / max / min for the unsigned and signed 16-bit vector types
     // (vpadd / vpmax / vpmin).
 906 OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_uint16x8, uint16x4, unsigned short, sum, add, u16)
 907 OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_uint16x8, uint16x4, unsigned short, max, max, u16)
 908 OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_uint16x8, uint16x4, unsigned short, min, min, u16)
 909 OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_int16x8, int16x4, short, sum, add, s16)
 910 OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_int16x8, int16x4, short, max, max, s16)
 911 OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_int16x8, int16x4, short, min, min, s16)
 912
 913 #define OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
     /* v_reduce_<func> over the 4 lanes of a: two pairwise vp<vectorfunc> steps. */ \
     /* Only lane 0 of the second step is read. In a pairwise operation that lane */ \
     /* comes from the first operand (a0), so the second operand passed there */ \
     /* (the high half of a) does not influence the returned value. */ \
 914 inline scalartype v_reduce_##func(const _Tpvec& a) \
 915 { \
 916     _Tpnvec##_t a0 = vp##vectorfunc##_##suffix(vget_low_##suffix(a.val), vget_high_##suffix(a.val)); \
 917     return (scalartype)vget_lane_##suffix(vp##vectorfunc##_##suffix(a0, vget_high_##suffix(a.val)),0); \
 918 }
 919
     // sum / max / min for the 32-bit unsigned, signed and float vector types
     // (vpadd / vpmax / vpmin).
 920 OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, uint32x2, unsigned, sum, add, u32)
 921 OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, uint32x2, unsigned, max, max, u32)
 922 OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, uint32x2, unsigned, min, min, u32)
 923 OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_int32x4, int32x2, int, sum, add, s32)
 924 OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_int32x4, int32x2, int, max, max, s32)
 925 OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_int32x4, int32x2, int, min, min, s32)
 926 OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, sum, add, f32)
 927 OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, max, max, f32)
 928 OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, min, min, f32)
 929
 930 inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
 931                                  const v_float32x4& c, const v_float32x4& d)
 932 {
 933     float32x4x2_t ab = vtrnq_f32(a.val, b.val);
 934     float32x4x2_t cd = vtrnq_f32(c.val, d.val);
 935
 936     float32x4_t u0 = vaddq_f32(ab.val[0], ab.val[1]); // a0+a1 b0+b1 a2+a3 b2+b3
 937     float32x4_t u1 = vaddq_f32(cd.val[0], cd.val[1]); // c0+c1 d0+d1 c2+c3 d2+d3
 938
 939     float32x4_t v0 = vcombine_f32(vget_low_f32(u0), vget_low_f32(u1));
 940     float32x4_t v1 = vcombine_f32(vget_high_f32(u0), vget_high_f32(u1));
 941
 942     return v_float32x4(vaddq_f32(v0, v1));
 943 }
 944
 945 #define OPENCV_HAL_IMPL_NEON_POPCOUNT(_Tpvec, cast) \
     /* v_popcount: a.val is viewed as 16 bytes (through `cast`), vcntq_u8 counts the */ \
     /* set bits of every byte, and two widening pairwise adds fold the 16 counts */ \
     /* into 4 values. The result is always a v_uint32x4, whatever the input type. */ \
 946 inline v_uint32x4 v_popcount(const _Tpvec& a) \
 947 { \
 948     uint8x16_t t = vcntq_u8(cast(a.val)); \
 949     uint16x8_t t0 = vpaddlq_u8(t);  /* 16 -> 8 */ \
 950     uint32x4_t t1 = vpaddlq_u16(t0); /* 8 -> 4 */ \
 951     return v_uint32x4(t1); \
 952 }
 953
     // Instantiated for the 8/16/32-bit integer vector types; the second argument is
     // the reinterpret-to-bytes cast (OPENCV_HAL_NOP is passed for the byte vector itself).
 954 OPENCV_HAL_IMPL_NEON_POPCOUNT(v_uint8x16, OPENCV_HAL_NOP)
 955 OPENCV_HAL_IMPL_NEON_POPCOUNT(v_uint16x8, vreinterpretq_u8_u16)
 956 OPENCV_HAL_IMPL_NEON_POPCOUNT(v_uint32x4, vreinterpretq_u8_u32)
 957 OPENCV_HAL_IMPL_NEON_POPCOUNT(v_int8x16, vreinterpretq_u8_s8)
 958 OPENCV_HAL_IMPL_NEON_POPCOUNT(v_int16x8, vreinterpretq_u8_s16)
 959 OPENCV_HAL_IMPL_NEON_POPCOUNT(v_int32x4, vreinterpretq_u8_s32)
 960
 961 inline int v_signmask(const v_uint8x16& a)
 962 {
         // Collects the most significant bit of each of the 16 lanes into an int.
         // Per-lane left-shift amounts 0..7, repeated for both 8-lane halves.
 963     const int8x8_t lane_shift = vcreate_s8(CV_BIG_UINT(0x0706050403020100));
         // Reduce every lane to its top bit (0 or 1) and move it to its own bit position.
 964     const uint8x16_t weighted = vshlq_u8(vshrq_n_u8(a.val, 7), vcombine_s8(lane_shift, lane_shift));
         // Widening pairwise adds fold each half into one 64-bit lane.
 965     const uint64x2_t halves = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(weighted)));
         // Low half supplies bits 0..7, high half bits 8..15.
 966     return (int)vgetq_lane_u64(halves, 0) + ((int)vgetq_lane_u64(halves, 1) << 8);
 967 }
 968 inline int v_signmask(const v_int8x16& a)
     // Signed 8-bit overload: reinterprets a as unsigned lanes and reuses that overload.
 969 { return v_signmask(v_reinterpret_as_u8(a)); }
 970
 971 inline int v_signmask(const v_uint16x8& a)
 972 {
         // Collects the most significant bit of each of the 8 lanes into an int.
         // Per-lane left-shift amounts 0..3, repeated for both 4-lane halves.
 973     const int16x4_t lane_shift = vcreate_s16(CV_BIG_UINT(0x0003000200010000));
         // Reduce every lane to its top bit (0 or 1) and move it to its own bit position.
 974     const uint16x8_t weighted = vshlq_u16(vshrq_n_u16(a.val, 15), vcombine_s16(lane_shift, lane_shift));
         // Widening pairwise adds fold each half into one 64-bit lane.
 975     const uint64x2_t halves = vpaddlq_u32(vpaddlq_u16(weighted));
         // Low half supplies bits 0..3, high half bits 4..7.
 976     return (int)vgetq_lane_u64(halves, 0) + ((int)vgetq_lane_u64(halves, 1) << 4);
 977 }
 978 inline int v_signmask(const v_int16x8& a)
     // Signed 16-bit overload: reinterprets a as unsigned lanes and reuses that overload.
 979 { return v_signmask(v_reinterpret_as_u16(a)); }
 980
 981 inline int v_signmask(const v_uint32x4& a)
 982 {
         // Collects the most significant bit of each of the 4 lanes into an int.
         // Per-lane left-shift amounts 0 and 1, repeated for both 2-lane halves.
 983     const int32x2_t lane_shift = vcreate_s32(CV_BIG_UINT(0x0000000100000000));
         // Reduce every lane to its top bit (0 or 1) and move it to its own bit position.
 984     const uint32x4_t weighted = vshlq_u32(vshrq_n_u32(a.val, 31), vcombine_s32(lane_shift, lane_shift));
         // One widening pairwise add folds each half into one 64-bit lane.
 985     const uint64x2_t halves = vpaddlq_u32(weighted);
         // Low half supplies bits 0..1, high half bits 2..3.
 986     return (int)vgetq_lane_u64(halves, 0) + ((int)vgetq_lane_u64(halves, 1) << 2);
 987 }
 988 inline int v_signmask(const v_int32x4& a)
     // int32 and float32 overloads: both reinterpret a as unsigned 32-bit lanes and
     // reuse that overload, so for floats the tested bit is the IEEE sign bit.
 989 { return v_signmask(v_reinterpret_as_u32(a)); }
 990 inline int v_signmask(const v_float32x4& a)
 991 { return v_signmask(v_reinterpret_as_u32(a)); }
 992 #if CV_SIMD128_64F
 993 inline int v_signmask(const v_uint64x2& a)
 994 {
         // Collects the most significant bit of both 64-bit lanes into an int:
         // lane 0 -> bit 0, lane 1 -> bit 1.
 995     const uint64x2_t msb = vshrq_n_u64(a.val, 63);
 996     const int lane0 = (int)vgetq_lane_u64(msb, 0), lane1 = (int)vgetq_lane_u64(msb, 1);
 997     return lane0 + (lane1 << 1);
 998 }
 999 inline int v_signmask(const v_float64x2& a)
     // float64 overload: reinterprets a as unsigned 64-bit lanes and reuses that overload.
1000 { return v_signmask(v_reinterpret_as_u64(a)); }
1001 #endif
1002
1003 #define OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(_Tpvec, suffix, shift) \
     /* v_check_all / v_check_any test the most significant bit of every lane; */ \
     /* `shift` is the lane width minus one, which isolates that bit. */ \
     /* v_check_all: true when the top bit is set in every lane. The input is */ \
     /* inverted first, so any surviving bit marks a lane whose top bit was clear. */ \
1004 inline bool v_check_all(const v_##_Tpvec& a) \
1005 { \
1006     _Tpvec##_t v0 = vshrq_n_##suffix(vmvnq_##suffix(a.val), shift); \
1007     uint64x2_t v1 = vreinterpretq_u64_##suffix(v0); \
1008     return (vgetq_lane_u64(v1, 0) | vgetq_lane_u64(v1, 1)) == 0; \
1009 } \
     /* v_check_any: true when the top bit is set in at least one lane. */ \
1010 inline bool v_check_any(const v_##_Tpvec& a) \
1011 { \
1012     _Tpvec##_t v0 = vshrq_n_##suffix(a.val, shift); \
1013     uint64x2_t v1 = vreinterpretq_u64_##suffix(v0); \
1014     return (vgetq_lane_u64(v1, 0) | vgetq_lane_u64(v1, 1)) != 0; \
1015 }
1016
     // Instantiated for the unsigned vector types; the 64-bit variant only when
     // CV_SIMD128_64F is set.
1017 OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint8x16, u8, 7)
1018 OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint16x8, u16, 15)
1019 OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint32x4, u32, 31)
1020 #if CV_SIMD128_64F
1021 OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint64x2, u64, 63)
1022 #endif
1023
1024 inline bool v_check_all(const v_int8x16& a)
     // Signed-integer and float32 overloads of v_check_all: each reinterprets a as the
     // unsigned vector of the same lane width and forwards to that overload.
1025 { return v_check_all(v_reinterpret_as_u8(a)); }
1026 inline bool v_check_all(const v_int16x8& a)
1027 { return v_check_all(v_reinterpret_as_u16(a)); }
1028 inline bool v_check_all(const v_int32x4& a)
1029 { return v_check_all(v_reinterpret_as_u32(a)); }
1030 inline bool v_check_all(const v_float32x4& a)
1031 { return v_check_all(v_reinterpret_as_u32(a)); }
1032
1033 inline bool v_check_any(const v_int8x16& a)
     // Signed-integer and float32 overloads of v_check_any: each reinterprets a as the
     // unsigned vector of the same lane width and forwards to that overload.
1034 { return v_check_any(v_reinterpret_as_u8(a)); }
1035 inline bool v_check_any(const v_int16x8& a)
1036 { return v_check_any(v_reinterpret_as_u16(a)); }
1037 inline bool v_check_any(const v_int32x4& a)
1038 { return v_check_any(v_reinterpret_as_u32(a)); }
1039 inline bool v_check_any(const v_float32x4& a)
1040 { return v_check_any(v_reinterpret_as_u32(a)); }
1041
1042 #if CV_SIMD128_64F
1043 inline bool v_check_all(const v_int64x2& a)
     // 64-bit lane overloads (int64 and float64) of v_check_all / v_check_any: each
     // reinterprets a as v_uint64x2 and forwards to the unsigned overload.
1044 { return v_check_all(v_reinterpret_as_u64(a)); }
1045 inline bool v_check_all(const v_float64x2& a)
1046 { return v_check_all(v_reinterpret_as_u64(a)); }
1047 inline bool v_check_any(const v_int64x2& a)
1048 { return v_check_any(v_reinterpret_as_u64(a)); }
1049 inline bool v_check_any(const v_float64x2& a)
1050 { return v_check_any(v_reinterpret_as_u64(a)); }
1051 #endif
1052
1053 #define OPENCV_HAL_IMPL_NEON_SELECT(_Tpvec, suffix, usuffix) \
     /* v_select: bitwise select through vbslq - the mask is reinterpreted to the */ \
     /* unsigned type (usuffix) vbslq expects; bits set in the mask take the */ \
     /* corresponding bit of a, cleared bits take it from b. */ \
1054 inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
1055 { \
1056     return _Tpvec(vbslq_##suffix(vreinterpretq_##usuffix##_##suffix(mask.val), a.val, b.val)); \
1057 }
1058
     // Instantiated for the 8/16/32-bit integer types and float32; float64 only
     // when CV_SIMD128_64F is set.
1059 OPENCV_HAL_IMPL_NEON_SELECT(v_uint8x16, u8, u8)
1060 OPENCV_HAL_IMPL_NEON_SELECT(v_int8x16, s8, u8)
1061 OPENCV_HAL_IMPL_NEON_SELECT(v_uint16x8, u16, u16)
1062 OPENCV_HAL_IMPL_NEON_SELECT(v_int16x8, s16, u16)
1063 OPENCV_HAL_IMPL_NEON_SELECT(v_uint32x4, u32, u32)
1064 OPENCV_HAL_IMPL_NEON_SELECT(v_int32x4, s32, u32)
1065 OPENCV_HAL_IMPL_NEON_SELECT(v_float32x4, f32, u32)
1066 #if CV_SIMD128_64F
1067 OPENCV_HAL_IMPL_NEON_SELECT(v_float64x2, f64, u64)
1068 #endif
1069
1070 #define OPENCV_HAL_IMPL_NEON_EXPAND(_Tpvec, _Tpwvec, _Tp, suffix) \
     /* v_expand: widens every lane of a to the double-width type _Tpwvec with vmovl; */ \
     /* b0 receives the lower half of a, b1 the upper half. */ \
1071 inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
1072 { \
1073     b0.val = vmovl_##suffix(vget_low_##suffix(a.val)); \
1074     b1.val = vmovl_##suffix(vget_high_##suffix(a.val)); \
1075 } \
     /* v_load_expand: loads one half-width vector from ptr and widens it. */ \
1076 inline _Tpwvec v_load_expand(const _Tp* ptr) \
1077 { \
1078     return _Tpwvec(vmovl_##suffix(vld1_##suffix(ptr))); \
1079 }
1080
     // 8 -> 16, 16 -> 32 and 32 -> 64 bit widening, signed and unsigned.
1081 OPENCV_HAL_IMPL_NEON_EXPAND(v_uint8x16, v_uint16x8, uchar, u8)
1082 OPENCV_HAL_IMPL_NEON_EXPAND(v_int8x16, v_int16x8, schar, s8)
1083 OPENCV_HAL_IMPL_NEON_EXPAND(v_uint16x8, v_uint32x4, ushort, u16)
1084 OPENCV_HAL_IMPL_NEON_EXPAND(v_int16x8, v_int32x4, short, s16)
1085 OPENCV_HAL_IMPL_NEON_EXPAND(v_uint32x4, v_uint64x2, uint, u32)
1086 OPENCV_HAL_IMPL_NEON_EXPAND(v_int32x4, v_int64x2, int, s32)
1087
1088 inline v_uint32x4 v_load_expand_q(const uchar* ptr)
1089 {
         // Loads 4 bytes from ptr and zero-extends each one to a 32-bit lane.
         // The bytes are combined explicitly rather than read through an unsigned*
         // cast: ptr carries no 4-byte alignment guarantee, and such a cast would
         // also violate strict aliasing and drop the const qualifier. ptr[0] goes to
         // the low byte, i.e. the value a little-endian 32-bit load yields.
         const unsigned packed = (unsigned)ptr[0] | ((unsigned)ptr[1] << 8) |
                                 ((unsigned)ptr[2] << 16) | ((unsigned)ptr[3] << 24);
1090     uint8x8_t v0 = vcreate_u8(packed);
         // 8 -> 16 -> 32 bit widening; only the four low lanes carry data.
1091     uint16x4_t v1 = vget_low_u16(vmovl_u8(v0));
1092     return v_uint32x4(vmovl_u16(v1));
1093 }
1094
1095 inline v_int32x4 v_load_expand_q(const schar* ptr)
1096 {
         // Loads 4 signed bytes from ptr and sign-extends each one to a 32-bit lane.
         // The bytes are combined explicitly rather than read through an unsigned*
         // cast: ptr carries no 4-byte alignment guarantee, and such a cast would
         // also violate strict aliasing and drop the const qualifier. Each byte is
         // taken as its raw 8-bit pattern; ptr[0] goes to the low byte, i.e. the
         // value a little-endian 32-bit load yields.
         const unsigned packed = (unsigned)(uchar)ptr[0] | ((unsigned)(uchar)ptr[1] << 8) |
                                 ((unsigned)(uchar)ptr[2] << 16) | ((unsigned)(uchar)ptr[3] << 24);
1097     int8x8_t v0 = vcreate_s8(packed);
         // 8 -> 16 -> 32 bit sign-extending widening; only the four low lanes carry data.
1098     int16x4_t v1 = vget_low_s16(vmovl_s8(v0));
1099     return v_int32x4(vmovl_s16(v1));
1100 }
1101
1102 #if defined(__aarch64__)
     // OPENCV_HAL_IMPL_NEON_UNPACKS generates v_zip, v_combine_low, v_combine_high and
     // v_recombine for one vector type. Only v_zip differs between the two branches:
     // the AArch64 branch uses vzip1q / vzip2q, the other one calls vzipq and unpacks
     // its two result vectors.
     //   v_zip:          b0 / b1 = first / second vector of the interleaving of a0 and a1
     //   v_combine_low:  lower half of a followed by lower half of b
     //   v_combine_high: upper half of a followed by upper half of b
     //   v_recombine:    c = the v_combine_low result, d = the v_combine_high result
1103 #define OPENCV_HAL_IMPL_NEON_UNPACKS(_Tpvec, suffix) \
1104 inline void v_zip(const v_##_Tpvec& a0, const v_##_Tpvec& a1, v_##_Tpvec& b0, v_##_Tpvec& b1) \
1105 { \
1106     b0.val = vzip1q_##suffix(a0.val, a1.val); \
1107     b1.val = vzip2q_##suffix(a0.val, a1.val); \
1108 } \
1109 inline v_##_Tpvec v_combine_low(const v_##_Tpvec& a, const v_##_Tpvec& b) \
1110 { \
1111     return v_##_Tpvec(vcombine_##suffix(vget_low_##suffix(a.val), vget_low_##suffix(b.val))); \
1112 } \
1113 inline v_##_Tpvec v_combine_high(const v_##_Tpvec& a, const v_##_Tpvec& b) \
1114 { \
1115     return v_##_Tpvec(vcombine_##suffix(vget_high_##suffix(a.val), vget_high_##suffix(b.val))); \
1116 } \
1117 inline void v_recombine(const v_##_Tpvec& a, const v_##_Tpvec& b, v_##_Tpvec& c, v_##_Tpvec& d) \
1118 { \
1119     c.val = vcombine_##suffix(vget_low_##suffix(a.val), vget_low_##suffix(b.val)); \
1120     d.val = vcombine_##suffix(vget_high_##suffix(a.val), vget_high_##suffix(b.val)); \
1121 }
1122 #else
     // Non-AArch64 variant: same four functions, with v_zip expressed through vzipq.
1123 #define OPENCV_HAL_IMPL_NEON_UNPACKS(_Tpvec, suffix) \
1124 inline void v_zip(const v_##_Tpvec& a0, const v_##_Tpvec& a1, v_##_Tpvec& b0, v_##_Tpvec& b1) \
1125 { \
1126     _Tpvec##x2_t p = vzipq_##suffix(a0.val, a1.val); \
1127     b0.val = p.val[0]; \
1128     b1.val = p.val[1]; \
1129 } \
1130 inline v_##_Tpvec v_combine_low(const v_##_Tpvec& a, const v_##_Tpvec& b) \
1131 { \
1132     return v_##_Tpvec(vcombine_##suffix(vget_low_##suffix(a.val), vget_low_##suffix(b.val))); \
1133 } \
1134 inline v_##_Tpvec v_combine_high(const v_##_Tpvec& a, const v_##_Tpvec& b) \
1135 { \
1136     return v_##_Tpvec(vcombine_##suffix(vget_high_##suffix(a.val), vget_high_##suffix(b.val))); \
1137 } \
1138 inline void v_recombine(const v_##_Tpvec& a, const v_##_Tpvec& b, v_##_Tpvec& c, v_##_Tpvec& d) \
1139 { \
1140     c.val = vcombine_##suffix(vget_low_##suffix(a.val), vget_low_##suffix(b.val)); \
1141     d.val = vcombine_##suffix(vget_high_##suffix(a.val), vget_high_##suffix(b.val)); \
1142 }
1143 #endif
1144
     // Instantiated for the 8/16/32-bit integer types and float32; float64 only
     // when CV_SIMD128_64F is set.
1145 OPENCV_HAL_IMPL_NEON_UNPACKS(uint8x16, u8)
1146 OPENCV_HAL_IMPL_NEON_UNPACKS(int8x16, s8)
1147 OPENCV_HAL_IMPL_NEON_UNPACKS(uint16x8, u16)
1148 OPENCV_HAL_IMPL_NEON_UNPACKS(int16x8, s16)
1149 OPENCV_HAL_IMPL_NEON_UNPACKS(uint32x4, u32)
1150 OPENCV_HAL_IMPL_NEON_UNPACKS(int32x4, s32)
1151 OPENCV_HAL_IMPL_NEON_UNPACKS(float32x4, f32)
1152 #if CV_SIMD128_64F
1153 OPENCV_HAL_IMPL_NEON_UNPACKS(float64x2, f64)
1154 #endif
1155
1156 #define OPENCV_HAL_IMPL_NEON_EXTRACT(_Tpvec, suffix) \
     /* v_extract<s>: direct wrapper over vextq - a full vector taken from the */ \
     /* concatenation of a and b, starting at lane s of a. */ \
1157 template <int s> \
1158 inline v_##_Tpvec v_extract(const v_##_Tpvec& a, const v_##_Tpvec& b) \
1159 { \
1160     return v_##_Tpvec(vextq_##suffix(a.val, b.val, s)); \
1161 }
1162
     // Instantiated for every integer and float vector type; float64 only when
     // CV_SIMD128_64F is set.
1163 OPENCV_HAL_IMPL_NEON_EXTRACT(uint8x16, u8)
1164 OPENCV_HAL_IMPL_NEON_EXTRACT(int8x16, s8)
1165 OPENCV_HAL_IMPL_NEON_EXTRACT(uint16x8, u16)
1166 OPENCV_HAL_IMPL_NEON_EXTRACT(int16x8, s16)
1167 OPENCV_HAL_IMPL_NEON_EXTRACT(uint32x4, u32)
1168 OPENCV_HAL_IMPL_NEON_EXTRACT(int32x4, s32)
1169 OPENCV_HAL_IMPL_NEON_EXTRACT(uint64x2, u64)
1170 OPENCV_HAL_IMPL_NEON_EXTRACT(int64x2, s64)
1171 OPENCV_HAL_IMPL_NEON_EXTRACT(float32x4, f32)
1172 #if CV_SIMD128_64F
1173 OPENCV_HAL_IMPL_NEON_EXTRACT(float64x2, f64)
1174 #endif
1175
1176 #if CV_SIMD128_64F
1177 inline v_int32x4 v_round(const v_float32x4& a)
1178 {
         // Variant used when CV_SIMD128_64F is set: converts the four floats to int32
         // with the FCVTNS instruction (round to nearest, ties to even), issued
         // through inline assembly on the whole 4-lane register.
1179     float32x4_t a_ = a.val;
1180     int32x4_t result;
1181     __asm__ ("fcvtns %0.4s, %1.4s"
1182              : "=w"(result)   // output operand: SIMD register receiving the integers
1183              : "w"(a_)        // input operand: SIMD register holding the floats
1184              : /* No clobbers */);
1185     return v_int32x4(result);
1186 }
1187 #else
1188 inline v_int32x4 v_round(const v_float32x4& a)
1189 {
         // Fallback without a rounding conversion instruction: add 0.5 carrying the
         // sign of each lane, then convert with truncation toward zero (so halfway
         // cases round away from zero).
         // The masks are plain local unsigned constants: no signed shift into the
         // sign bit and no guarded function-local static initialisation is needed.
1190     const uint32x4_t sign_mask = vdupq_n_u32(0x80000000u);
1191     const uint32x4_t half_bits = vreinterpretq_u32_f32(vdupq_n_f32(0.5f));
1192
         // Bit pattern of +0.5 or -0.5, taking the sign bit from the input lane.
1193     const uint32x4_t signed_half = vorrq_u32(half_bits, vandq_u32(sign_mask, vreinterpretq_u32_f32(a.val)));
1194     return v_int32x4(vcvtq_s32_f32(vaddq_f32(a.val, vreinterpretq_f32_u32(signed_half))));
1195 }
1196 #endif
1197 inline v_int32x4 v_floor(const v_float32x4& a)
1198 {
         // Truncate toward zero, then subtract one in every lane where the truncated
         // value ended up above the input.
1199     const int32x4_t truncated = vcvtq_s32_f32(a.val);
1200     const uint32x4_t overshoot = vcgtq_f32(vcvtq_f32_s32(truncated), a.val);
         // A true comparison lane is all ones, i.e. -1 as int32, so adding it subtracts one.
1201     return v_int32x4(vaddq_s32(truncated, vreinterpretq_s32_u32(overshoot)));
1202 }
1203
1204 inline v_int32x4 v_ceil(const v_float32x4& a)
1205 {
         // Truncate toward zero, then add one in every lane where the input is
         // still above the truncated value.
1206     const int32x4_t truncated = vcvtq_s32_f32(a.val);
1207     const uint32x4_t undershoot = vcgtq_f32(a.val, vcvtq_f32_s32(truncated));
         // A true comparison lane is all ones, i.e. -1 as int32, so subtracting it adds one.
1208     return v_int32x4(vsubq_s32(truncated, vreinterpretq_s32_u32(undershoot)));
1209 }
1210
1211 inline v_int32x4 v_trunc(const v_float32x4& a)
     // Direct wrapper over vcvtq_s32_f32, the float -> int32 conversion that rounds toward zero.
1212 { return v_int32x4(vcvtq_s32_f32(a.val)); }
1213
1214 #if CV_SIMD128_64F
1215 inline v_int32x4 v_round(const v_float64x2& a)
1216 {
         // Rounds both doubles to the nearest integer with ties to even (vcvtnq), the
         // same tie rule as the FCVTNS-based v_round(v_float32x4) used in this
         // configuration. The 64-bit results are narrowed to 32 bits and placed in
         // lanes 0-1; lanes 2-3 of the returned vector are zero.
1217     static const int32x2_t zero = vdup_n_s32(0);
1218     return v_int32x4(vcombine_s32(vmovn_s64(vcvtnq_s64_f64(a.val)), zero));
1219 }
1220
1221 inline v_int32x4 v_floor(const v_float64x2& a)
1222 {
         // Floors both doubles; results go to lanes 0-1, lanes 2-3 are zero.
1223     static const int32x2_t upper_zero = vdup_n_s32(0);
         // Truncate toward zero, then subtract one where the truncated value is above the input.
1224     const int64x2_t truncated = vcvtq_s64_f64(a.val);
1225     const uint64x2_t overshoot = vcgtq_f64(vcvtq_f64_s64(truncated), a.val);
1226     const int64x2_t floored = vaddq_s64(truncated, vreinterpretq_s64_u64(overshoot));
1227     return v_int32x4(vcombine_s32(vmovn_s64(floored), upper_zero));
1228 }
1229
1230 inline v_int32x4 v_ceil(const v_float64x2& a)
1231 {
         // Ceils both doubles; results go to lanes 0-1, lanes 2-3 are zero.
1232     static const int32x2_t upper_zero = vdup_n_s32(0);
         // Truncate toward zero, then add one where the input is above the truncated value.
1233     const int64x2_t truncated = vcvtq_s64_f64(a.val);
1234     const uint64x2_t undershoot = vcgtq_f64(a.val, vcvtq_f64_s64(truncated));
1235     const int64x2_t ceiled = vsubq_s64(truncated, vreinterpretq_s64_u64(undershoot));
1236     return v_int32x4(vcombine_s32(vmovn_s64(ceiled), upper_zero));
1237 }
1238
1239 inline v_int32x4 v_trunc(const v_float64x2& a)
1240 {
         // Truncates both doubles toward zero. vcvtq_s64_f64 is the round-toward-zero
         // conversion (the one v_floor / v_ceil for doubles also start from);
         // vcvtaq_s64_f64 would round to nearest instead. Results are narrowed to
         // 32 bits and placed in lanes 0-1; lanes 2-3 are zero.
1241     static const int32x2_t zero = vdup_n_s32(0);
1242     return v_int32x4(vcombine_s32(vmovn_s64(vcvtq_s64_f64(a.val)), zero));
1243 }
1244 #endif
1245
1246 #define OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(_Tpvec, suffix) \
     /* v_transpose4x4: a0..a3 are the rows of a 4x4 matrix, b0..b3 receive its columns. */ \
     /* Step 1: vtrnq transposes the 2x2 blocks of each row pair. */ \
     /* Step 2: the low / high halves of those results are recombined into full columns. */ \
1247 inline void v_transpose4x4(const v_##_Tpvec& a0, const v_##_Tpvec& a1, \
1248                          const v_##_Tpvec& a2, const v_##_Tpvec& a3, \
1249                          v_##_Tpvec& b0, v_##_Tpvec& b1, \
1250                          v_##_Tpvec& b2, v_##_Tpvec& b3) \
1251 { \
1252     /* m00 m01 m02 m03 */ \
1253     /* m10 m11 m12 m13 */ \
1254     /* m20 m21 m22 m23 */ \
1255     /* m30 m31 m32 m33 */ \
1256     _Tpvec##x2_t t0 = vtrnq_##suffix(a0.val, a1.val); \
1257     _Tpvec##x2_t t1 = vtrnq_##suffix(a2.val, a3.val); \
1258     /* m00 m10 m02 m12 */ \
1259     /* m01 m11 m03 m13 */ \
1260     /* m20 m30 m22 m32 */ \
1261     /* m21 m31 m23 m33 */ \
1262     b0.val = vcombine_##suffix(vget_low_##suffix(t0.val[0]), vget_low_##suffix(t1.val[0])); \
1263     b1.val = vcombine_##suffix(vget_low_##suffix(t0.val[1]), vget_low_##suffix(t1.val[1])); \
1264     b2.val = vcombine_##suffix(vget_high_##suffix(t0.val[0]), vget_high_##suffix(t1.val[0])); \
1265     b3.val = vcombine_##suffix(vget_high_##suffix(t0.val[1]), vget_high_##suffix(t1.val[1])); \
1266 }
1267
     // Instantiated for the three 4-lane (32-bit element) vector types.
1268 OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(uint32x4, u32)
1269 OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(int32x4, s32)
1270 OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(float32x4, f32)
1271
1272 #define OPENCV_HAL_IMPL_NEON_INTERLEAVED(_Tpvec, _Tp, suffix) \
     /* Interleaved (2, 3 or 4 channel) loads and stores for one vector type, mapped */ \
     /* directly onto vld2q/vld3q/vld4q and vst2q/vst3q/vst4q. */ \
     /* v_load_deinterleave: reads interleaved data from ptr, one vector per channel. */ \
1273 inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b) \
1274 { \
1275     _Tpvec##x2_t v = vld2q_##suffix(ptr); \
1276     a.val = v.val[0]; \
1277     b.val = v.val[1]; \
1278 } \
1279 inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, v_##_Tpvec& c) \
1280 { \
1281     _Tpvec##x3_t v = vld3q_##suffix(ptr); \
1282     a.val = v.val[0]; \
1283     b.val = v.val[1]; \
1284     c.val = v.val[2]; \
1285 } \
1286 inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, \
1287                                 v_##_Tpvec& c, v_##_Tpvec& d) \
1288 { \
1289     _Tpvec##x4_t v = vld4q_##suffix(ptr); \
1290     a.val = v.val[0]; \
1291     b.val = v.val[1]; \
1292     c.val = v.val[2]; \
1293     d.val = v.val[3]; \
1294 } \
     /* v_store_interleave: packs the channel vectors into the multi-vector struct */ \
     /* the store intrinsic expects and writes them interleaved to ptr. */ \
1295 inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b) \
1296 { \
1297     _Tpvec##x2_t v; \
1298     v.val[0] = a.val; \
1299     v.val[1] = b.val; \
1300     vst2q_##suffix(ptr, v); \
1301 } \
1302 inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, const v_##_Tpvec& c) \
1303 { \
1304     _Tpvec##x3_t v; \
1305     v.val[0] = a.val; \
1306     v.val[1] = b.val; \
1307     v.val[2] = c.val; \
1308     vst3q_##suffix(ptr, v); \
1309 } \
1310 inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
1311                                const v_##_Tpvec& c, const v_##_Tpvec& d) \
1312 { \
1313     _Tpvec##x4_t v; \
1314     v.val[0] = a.val; \
1315     v.val[1] = b.val; \
1316     v.val[2] = c.val; \
1317     v.val[3] = d.val; \
1318     vst4q_##suffix(ptr, v); \
1319 }
1320
     // Instantiated for the 8/16/32-bit integer types and float32; float64 only
     // when CV_SIMD128_64F is set.
1321 OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint8x16, uchar, u8)
1322 OPENCV_HAL_IMPL_NEON_INTERLEAVED(int8x16, schar, s8)
1323 OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint16x8, ushort, u16)
1324 OPENCV_HAL_IMPL_NEON_INTERLEAVED(int16x8, short, s16)
1325 OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint32x4, unsigned, u32)
1326 OPENCV_HAL_IMPL_NEON_INTERLEAVED(int32x4, int, s32)
1327 OPENCV_HAL_IMPL_NEON_INTERLEAVED(float32x4, float, f32)
1328 #if CV_SIMD128_64F
1329 OPENCV_HAL_IMPL_NEON_INTERLEAVED(float64x2, double, f64)
1330 #endif
1331
1332 inline v_float32x4 v_cvt_f32(const v_int32x4& a)
1333 {
         // Converts four int32 lanes to float32.
         const float32x4_t as_float = vcvtq_f32_s32(a.val);
1334     return v_float32x4(as_float);
1335 }
1336
1337 #if CV_SIMD128_64F
1338 inline v_float32x4 v_cvt_f32(const v_float64x2& a)
1339 {
         // Narrows two doubles to float32 in lanes 0-1; lanes 2-3 are set to 0.0f.
1340     const float32x2_t narrowed = vcvt_f32_f64(a.val);
1341     return v_float32x4(vcombine_f32(narrowed, vdup_n_f32(0.0f)));
1342 }
1343
1344 inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b)
1345 {
         // Narrows four doubles to float32: a fills lanes 0-1, b fills lanes 2-3.
         const float32x2_t from_a = vcvt_f32_f64(a.val);
         const float32x2_t from_b = vcvt_f32_f64(b.val);
1346     return v_float32x4(vcombine_f32(from_a, from_b));
1347 }
1348
1349 inline v_float64x2 v_cvt_f64(const v_int32x4& a)
1350 {
         // Converts int32 lanes 0-1 of a to doubles.
         // The lanes are widened to int64 and converted directly: every int32 is
         // exactly representable as a double, whereas an intermediate float32 would
         // round values that need more than 24 significant bits.
1351     return v_float64x2(vcvtq_f64_s64(vmovl_s32(vget_low_s32(a.val))));
1352 }
1353
1354 inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
1355 {
         // Converts int32 lanes 2-3 of a to doubles.
         // The lanes are widened to int64 and converted directly: every int32 is
         // exactly representable as a double, whereas an intermediate float32 would
         // round values that need more than 24 significant bits.
1356     return v_float64x2(vcvtq_f64_s64(vmovl_s32(vget_high_s32(a.val))));
1357 }
1358
1359 inline v_float64x2 v_cvt_f64(const v_float32x4& a)
1360 {
         // Widens float32 lanes 0-1 of a to doubles.
         const float32x2_t lower = vget_low_f32(a.val);
1361     return v_float64x2(vcvt_f64_f32(lower));
1362 }
1363
1364 inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
1365 {
         // Widens float32 lanes 2-3 of a to doubles.
         const float32x2_t upper = vget_high_f32(a.val);
1366     return v_float64x2(vcvt_f64_f32(upper));
1367 }
1368 #endif
1369
1370 #if CV_FP16
1371 inline v_float32x4 v_cvt_f32(const v_float16x8& a)
1372 {
         // Widens the four half-precision lanes in the lower half of a to float32.
         const float16x4_t lower = vget_low_f16(a.val);
1373     return v_float32x4(vcvt_f32_f16(lower));
1374 }
1375 inline v_float32x4 v_cvt_f32_high(const v_float16x8& a)
1376 {
         // Widens the four half-precision lanes in the upper half of a to float32.
         const float16x4_t upper = vget_high_f16(a.val);
1377     return v_float32x4(vcvt_f32_f16(upper));
1378 }
1379
1380 inline v_float16x8 v_cvt_f16(const v_float32x4& a, const v_float32x4& b)
1381 {
         // Narrows eight floats to half precision: a fills lanes 0-3, b fills lanes 4-7.
         const float16x4_t from_a = vcvt_f16_f32(a.val);
         const float16x4_t from_b = vcvt_f16_f32(b.val);
1382     return v_float16x8(vcombine_f16(from_a, from_b));
1383 }
1384 #endif
1385
1386 ////////////// Lookup table access ////////////////////
1387
1388 inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
1389 {
1390     // Gathers tab[idxvec[k]] for k = 0..3 with scalar lookups, then loads the
1391     // four collected values back into a vector.
1392     const int i0 = vgetq_lane_s32(idxvec.val, 0);
1393     const int i1 = vgetq_lane_s32(idxvec.val, 1);
1394     const int i2 = vgetq_lane_s32(idxvec.val, 2);
1395     const int i3 = vgetq_lane_s32(idxvec.val, 3);
1396     int CV_DECL_ALIGNED(32) gathered[4] = { tab[i0], tab[i1], tab[i2], tab[i3] };
1397     return v_int32x4(vld1q_s32(gathered));
1398 }
1399
1400 inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
1401 {
1402     // Gathers tab[idxvec[k]] for k = 0..3 with scalar lookups, then loads the
1403     // four collected values back into a vector.
1404     const int i0 = vgetq_lane_s32(idxvec.val, 0);
1405     const int i1 = vgetq_lane_s32(idxvec.val, 1);
1406     const int i2 = vgetq_lane_s32(idxvec.val, 2);
1407     const int i3 = vgetq_lane_s32(idxvec.val, 3);
1408     float CV_DECL_ALIGNED(32) gathered[4] = { tab[i0], tab[i1], tab[i2], tab[i3] };
1409     return v_float32x4(vld1q_f32(gathered));
1410 }
1411
1412 inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y)
1413 {
1414     // For each of the four indices i, x receives tab[i] and y receives tab[i + 1].
1415     // (A variant that loads each pair with vld1_f32 and splits the pairs with
1416     // vuzpq_f32 is not used here.)
1423     int CV_DECL_ALIGNED(32) idx[4];
1424     v_store_aligned(idx, idxvec);
1425
         // One pointer per looked-up pair: p[0] is the x value, p[1] the y value.
         const float* const p0 = tab + idx[0];
         const float* const p1 = tab + idx[1];
         const float* const p2 = tab + idx[2];
         const float* const p3 = tab + idx[3];
1426     x = v_float32x4(p0[0], p1[0], p2[0], p3[0]);
1427     y = v_float32x4(p0[1], p1[1], p2[1], p3[1]);
1428 }
1429
1430 #if CV_SIMD128_64F
1431 inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
1432 {
1433     // Gathers tab[idxvec[0]] and tab[idxvec[1]]; lanes 2 and 3 of idxvec are not read.
1434     const int i0 = vgetq_lane_s32(idxvec.val, 0);
1435     const int i1 = vgetq_lane_s32(idxvec.val, 1);
1436     double CV_DECL_ALIGNED(32) gathered[2] = { tab[i0], tab[i1] };
1437
1438     return v_float64x2(vld1q_f64(gathered));
1439 }
1440
1441 inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y)
1442 {
         // For the first two indices i of idxvec, x receives tab[i] and y receives
         // tab[i + 1]; the remaining two indices are stored but not used.
1443     int CV_DECL_ALIGNED(32) idx[4];
1444     v_store_aligned(idx, idxvec);
1445
         const double* const p0 = tab + idx[0];
         const double* const p1 = tab + idx[1];
1446     x = v_float64x2(p0[0], p1[0]);
1447     y = v_float64x2(p0[1], p1[1]);
1448 }
1449 #endif
1450
1451 inline void v_cleanup() {} // empty body: this backend has nothing to do here
1452
1453 //! @name Check SIMD support
1454 //! @{
1455 //! @brief Check CPU capability of SIMD operation
1456 static inline bool hasSIMD128()
1457 {
         // The answer comes entirely from CV_CPU_HAS_SUPPORT_NEON, normalised to a bool.
         if (CV_CPU_HAS_SUPPORT_NEON)
             return true;
1458     return false;
1459 }
1460
1461 //! @}
1462
1463 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
1464
1465 //! @endcond
1466
1467 }
1468
1469 #endif