modules/hal/include/opencv2/hal/intrin_cpp.hpp

   1 /*M///////////////////////////////////////////////////////////////////////////////////////
   2 //
   3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
   4 //
   5 //  By downloading, copying, installing or using the software you agree to this license.
   6 //  If you do not agree to this license, do not download, install,
   7 //  copy or use the software.
   8 //
   9 //
  10 //                          License Agreement
  11 //                For Open Source Computer Vision Library
  12 //
  13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
  14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
  15 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
  16 // Copyright (C) 2015, Itseez Inc., all rights reserved.
  17 // Third party copyrights are property of their respective owners.
  18 //
  19 // Redistribution and use in source and binary forms, with or without modification,
  20 // are permitted provided that the following conditions are met:
  21 //
  22 //   * Redistribution's of source code must retain the above copyright notice,
  23 //     this list of conditions and the following disclaimer.
  24 //
  25 //   * Redistribution's in binary form must reproduce the above copyright notice,
  26 //     this list of conditions and the following disclaimer in the documentation
  27 //     and/or other materials provided with the distribution.
  28 //
  29 //   * The name of the copyright holders may not be used to endorse or promote products
  30 //     derived from this software without specific prior written permission.
  31 //
  32 // This software is provided by the copyright holders and contributors "as is" and
  33 // any express or implied warranties, including, but not limited to, the implied
  34 // warranties of merchantability and fitness for a particular purpose are disclaimed.
  35 // In no event shall the Intel Corporation or contributors be liable for any direct,
  36 // indirect, incidental, special, exemplary, or consequential damages
  37 // (including, but not limited to, procurement of substitute goods or services;
  38 // loss of use, data, or profits; or business interruption) however caused
  39 // and on any theory of liability, whether in contract, strict liability,
  40 // or tort (including negligence or otherwise) arising in any way out of
  41 // the use of this software, even if advised of the possibility of such damage.
  42 //
  43 //M*/
  44
  45 #ifndef __OPENCV_HAL_INTRIN_CPP_HPP__
  46 #define __OPENCV_HAL_INTRIN_CPP_HPP__
  47
  48 #include <limits>
  49 #include <cstring>
  50
  51 namespace cv
  52 {
  53
  54 /** @addtogroup hal_intrin
  55
   56 "Universal intrinsics" are a set of types and functions intended to simplify vectorization of code on
  57 different platforms. Currently there are two supported SIMD extensions: __SSE/SSE2__ on x86
  58 architectures and __NEON__ on ARM architectures, both allow working with 128 bit registers
  59 containing packed values of different types. In case when there is no SIMD extension available
  60 during compilation, fallback C++ implementation of intrinsics will be chosen and code will work as
  61 expected although it could be slower.
  62
  63 ### Types
  64
  65 There are several types representing 128-bit register as a vector of packed values, each type is
   66 implemented as a structure based on one SIMD register.
  67
  68 - cv::v_uint8x16 and cv::v_int8x16: sixteen 8-bit integer values (unsigned/signed) - char
  69 - cv::v_uint16x8 and cv::v_int16x8: eight 16-bit integer values (unsigned/signed) - short
   70 - cv::v_uint32x4 and cv::v_int32x4: four 32-bit integer values (unsigned/signed) - int
  71 - cv::v_uint64x2 and cv::v_int64x2: two 64-bit integer values (unsigned/signed) - int64
  72 - cv::v_float32x4: four 32-bit floating point values (signed) - float
   73 - cv::v_float64x2: two 64-bit floating point values (signed) - double
  74
  75 @note
  76 cv::v_float64x2 is not implemented in NEON variant, if you want to use this type, don't forget to
  77 check the CV_SIMD128_64F preprocessor definition:
  78 @code
  79 #if CV_SIMD128_64F
  80 //...
  81 #endif
  82 @endcode
  83
  84 ### Load and store operations
  85
  86 These operations allow to set contents of the register explicitly or by loading it from some memory
  87 block and to save contents of the register to memory block.
  88
  89 - Constructors:
  90 @ref v_reg::v_reg(const _Tp *ptr) "from memory",
  91 @ref v_reg::v_reg(_Tp s0, _Tp s1) "from two values", ...
  92 - Other create methods:
  93 @ref v_setall_s8, @ref v_setall_u8, ...,
  94 @ref v_setzero_u8, @ref v_setzero_s8, ...
  95 - Memory operations:
  96 @ref v_load, @ref v_load_aligned, @ref v_load_halves,
  97 @ref v_store, @ref v_store_aligned,
  98 @ref v_store_high, @ref v_store_low
  99
 100 ### Value reordering
 101
 102 These operations allow to reorder or recombine elements in one or multiple vectors.
 103
 104 - Interleave, deinterleave (3 and 4 channels): @ref v_load_deinterleave, @ref v_store_interleave
 105 - Expand: @ref v_load_expand, @ref v_load_expand_q, @ref v_expand
 106 - Pack: @ref v_pack, @ref v_pack_u, @ref v_rshr_pack, @ref v_rshr_pack_u,
 107 @ref v_pack_store, @ref v_pack_u_store, @ref v_rshr_pack_store, @ref v_rshr_pack_u_store
 108 - Recombine: @ref v_zip, @ref v_recombine, @ref v_combine_low, @ref v_combine_high
 109 - Extract: @ref v_extract
 110
 111
 112 ### Arithmetic, bitwise and comparison operations
 113
 114 Element-wise binary and unary operations.
 115
 116 - Arithmetics:
 117 @ref operator+(const v_reg &a, const v_reg &b) "+",
 118 @ref operator-(const v_reg &a, const v_reg &b) "-",
 119 @ref operator*(const v_reg &a, const v_reg &b) "*",
 120 @ref operator/(const v_reg &a, const v_reg &b) "/",
 121 @ref v_mul_expand
 122
 123 - Non-saturating arithmetics: @ref v_add_wrap, @ref v_sub_wrap
 124
 125 - Bitwise shifts:
 126 @ref operator<<(const v_reg &a, int s) "<<",
 127 @ref operator>>(const v_reg &a, int s) ">>",
 128 @ref v_shl, @ref v_shr
 129
 130 - Bitwise logic:
 131 @ref operator&(const v_reg &a, const v_reg &b) "&",
 132 @ref operator|(const v_reg &a, const v_reg &b) "|",
 133 @ref operator^(const v_reg &a, const v_reg &b) "^",
 134 @ref operator~(const v_reg &a) "~"
 135
 136 - Comparison:
 137 @ref operator>(const v_reg &a, const v_reg &b) ">",
 138 @ref operator>=(const v_reg &a, const v_reg &b) ">=",
 139 @ref operator<(const v_reg &a, const v_reg &b) "<",
 140 @ref operator<=(const v_reg &a, const v_reg &b) "<=",
 141 @ref operator==(const v_reg &a, const v_reg &b) "==",
 142 @ref operator!=(const v_reg &a, const v_reg &b) "!="
 143
 144 - min/max: @ref v_min, @ref v_max
 145
 146 ### Reduce and mask
 147
 148 Most of these operations return only one value.
 149
 150 - Reduce: @ref v_reduce_min, @ref v_reduce_max, @ref v_reduce_sum
 151 - Mask: @ref v_signmask, @ref v_check_all, @ref v_check_any, @ref v_select
 152
 153 ### Other math
 154
 155 - Some frequent operations: @ref v_sqrt, @ref v_invsqrt, @ref v_magnitude, @ref v_sqr_magnitude
 156 - Absolute values: @ref v_abs, @ref v_absdiff
 157
 158 ### Conversions
 159
 160 Different type conversions and casts:
 161
 162 - Rounding: @ref v_round, @ref v_floor, @ref v_ceil, @ref v_trunc,
 163 - To float: @ref v_cvt_f32, @ref v_cvt_f64
 164 - Reinterpret: @ref v_reinterpret_as_u8, @ref v_reinterpret_as_s8, ...
 165
 166 ### Matrix operations
 167
 168 In these operations vectors represent matrix rows/columns: @ref v_dotprod, @ref v_matmul, @ref v_transpose4x4
 169
 170 ### Usability
 171
  172 Most operations are implemented only for some subset of the available types; the following tables
  173 show the applicability of different operations to the types.
 174
 175 Regular integers:
 176
 177 | Operations\\Types | uint 8x16 | int 8x16 | uint 16x8 | int 16x8 | uint 32x4 | int 32x4 |
 178 |-------------------|:-:|:-:|:-:|:-:|:-:|:-:|
 179 |load, store        | x | x | x | x | x | x |
 180 |interleave         | x | x | x | x | x | x |
 181 |expand             | x | x | x | x | x | x |
 182 |expand_q           | x | x |   |   |   |   |
 183 |add, sub           | x | x | x | x | x | x |
 184 |add_wrap, sub_wrap | x | x | x | x |   |   |
 185 |mul                |   |   | x | x | x | x |
 186 |mul_expand         |   |   | x | x | x |   |
 187 |compare            | x | x | x | x | x | x |
 188 |shift              |   |   | x | x | x | x |
 189 |dotprod            |   |   |   | x |   |   |
 190 |logical            | x | x | x | x | x | x |
 191 |min, max           | x | x | x | x | x | x |
 192 |absdiff            | x | x | x | x | x | x |
 193 |reduce             |   |   |   |   | x | x |
 194 |mask               | x | x | x | x | x | x |
 195 |pack               | x | x | x | x | x | x |
 196 |pack_u             | x |   | x |   |   |   |
 197 |unpack             | x | x | x | x | x | x |
 198 |extract            | x | x | x | x | x | x |
 199 |cvt_flt32          |   |   |   |   |   | x |
 200 |cvt_flt64          |   |   |   |   |   | x |
 201 |transpose4x4       |   |   |   |   | x | x |
 202
 203 Big integers:
 204
 205 | Operations\\Types | uint 64x2 | int 64x2 |
 206 |-------------------|:-:|:-:|
 207 |load, store        | x | x |
 208 |add, sub           | x | x |
 209 |shift              | x | x |
 210 |logical            | x | x |
 211 |extract            | x | x |
 212
 213 Floating point:
 214
 215 | Operations\\Types | float 32x4 | float 64x2 |
 216 |-------------------|:-:|:-:|
 217 |load, store        | x | x |
 218 |interleave         | x |   |
 219 |add, sub           | x | x |
 220 |mul                | x | x |
 221 |div                | x | x |
 222 |compare            | x | x |
 223 |min, max           | x | x |
 224 |absdiff            | x | x |
 225 |reduce             | x |   |
 226 |mask               | x | x |
 227 |unpack             | x | x |
 228 |cvt_flt32          |   | x |
 229 |cvt_flt64          | x |   |
 230 |sqrt, abs          | x | x |
 231 |float math         | x | x |
 232 |transpose4x4       | x |   |
 233
 234
 235  @{ */
 236
 237 template<typename _Tp, int n> struct v_reg
 238 {
 239 //! @cond IGNORED
 240     typedef _Tp lane_type;
 241     typedef v_reg<typename V_TypeTraits<_Tp>::int_type, n> int_vec;
 242     typedef v_reg<typename V_TypeTraits<_Tp>::abs_type, n> abs_vec;
 243     enum { nlanes = n };
 244 // !@endcond
 245
 246     /** @brief Constructor
 247
 248     Initializes register with data from memory
 249     @param ptr pointer to memory block with data for register */
 250     explicit v_reg(const _Tp* ptr) { for( int i = 0; i < n; i++ ) s[i] = ptr[i]; }
 251
 252     /** @brief Constructor
 253
 254     Initializes register with two 64-bit values */
 255     v_reg(_Tp s0, _Tp s1) { s[0] = s0; s[1] = s1; }
 256
 257     /** @brief Constructor
 258
 259     Initializes register with four 32-bit values */
 260     v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3) { s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3; }
 261
 262     /** @brief Constructor
 263
 264     Initializes register with eight 16-bit values */
 265     v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3,
 266            _Tp s4, _Tp s5, _Tp s6, _Tp s7)
 267     {
 268         s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3;
 269         s[4] = s4; s[5] = s5; s[6] = s6; s[7] = s7;
 270     }
 271
 272     /** @brief Constructor
 273
 274     Initializes register with sixteen 8-bit values */
 275     v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3,
 276            _Tp s4, _Tp s5, _Tp s6, _Tp s7,
 277            _Tp s8, _Tp s9, _Tp s10, _Tp s11,
 278            _Tp s12, _Tp s13, _Tp s14, _Tp s15)
 279     {
 280         s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3;
 281         s[4] = s4; s[5] = s5; s[6] = s6; s[7] = s7;
 282         s[8] = s8; s[9] = s9; s[10] = s10; s[11] = s11;
 283         s[12] = s12; s[13] = s13; s[14] = s14; s[15] = s15;
 284     }
 285
 286     /** @brief Default constructor
 287
 288     Does not initialize anything*/
 289     v_reg() {}
 290
 291     /** @brief Copy constructor */
 292     v_reg(const v_reg<_Tp, n> & r)
 293     {
 294         for( int i = 0; i < n; i++ )
 295             s[i] = r.s[i];
 296     }
 297     /** @brief Access first value
 298
 299     Returns value of the first lane according to register type, for example:
 300     @code{.cpp}
 301     v_int32x4 r(1, 2, 3, 4);
 302     int v = r.get0(); // returns 1
 303     v_uint64x2 r(1, 2);
 304     uint64_t v = r.get0(); // returns 1
 305     @endcode
 306     */
 307     _Tp get0() const { return s[0]; }
 308
 309 //! @cond IGNORED
 310     _Tp get(const int i) const { return s[i]; }
 311     v_reg<_Tp, n> high() const
 312     {
 313         v_reg<_Tp, n> c;
 314         int i;
 315         for( i = 0; i < n/2; i++ )
 316         {
 317             c.s[i] = s[i+(n/2)];
 318             c.s[i+(n/2)] = 0;
 319         }
 320         return c;
 321     }
 322
 323     static v_reg<_Tp, n> zero()
 324     {
 325         v_reg<_Tp, n> c;
 326         for( int i = 0; i < n; i++ )
 327             c.s[i] = (_Tp)0;
 328         return c;
 329     }
 330
 331     static v_reg<_Tp, n> all(_Tp s)
 332     {
 333         v_reg<_Tp, n> c;
 334         for( int i = 0; i < n; i++ )
 335             c.s[i] = s;
 336         return c;
 337     }
 338
 339     template<typename _Tp2, int n2> v_reg<_Tp2, n2> reinterpret_as() const
 340     {
 341         size_t bytes = std::min(sizeof(_Tp2)*n2, sizeof(_Tp)*n);
 342         v_reg<_Tp2, n2> c;
 343         std::memcpy(&c.s[0], &s[0], bytes);
 344         return c;
 345     }
 346
 347     _Tp s[n];
 348 //! @endcond
 349 };
 350
 351 /** @brief Sixteen 8-bit unsigned integer values */
 352 typedef v_reg<uchar, 16> v_uint8x16;
 353 /** @brief Sixteen 8-bit signed integer values */
 354 typedef v_reg<schar, 16> v_int8x16;
 355 /** @brief Eight 16-bit unsigned integer values */
 356 typedef v_reg<ushort, 8> v_uint16x8;
 357 /** @brief Eight 16-bit signed integer values */
 358 typedef v_reg<short, 8> v_int16x8;
 359 /** @brief Four 32-bit unsigned integer values */
 360 typedef v_reg<unsigned, 4> v_uint32x4;
 361 /** @brief Four 32-bit signed integer values */
 362 typedef v_reg<int, 4> v_int32x4;
 363 /** @brief Four 32-bit floating point values (single precision) */
 364 typedef v_reg<float, 4> v_float32x4;
 365 /** @brief Two 64-bit floating point values (double precision) */
 366 typedef v_reg<double, 2> v_float64x2;
 367 /** @brief Two 64-bit unsigned integer values */
 368 typedef v_reg<uint64, 2> v_uint64x2;
 369 /** @brief Two 64-bit signed integer values */
 370 typedef v_reg<int64, 2> v_int64x2;
 371
 372 //! @brief Helper macro
 373 //! @ingroup hal_intrin_impl
 374 #define OPENCV_HAL_IMPL_BIN_OP(bin_op) \
 375 template<typename _Tp, int n> inline v_reg<_Tp, n> \
 376     operator bin_op (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
 377 { \
 378     v_reg<_Tp, n> c; \
 379     for( int i = 0; i < n; i++ ) \
 380         c.s[i] = saturate_cast<_Tp>(a.s[i] bin_op b.s[i]); \
 381     return c; \
 382 } \
 383 template<typename _Tp, int n> inline v_reg<_Tp, n>& \
 384     operator bin_op##= (v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
 385 { \
 386     for( int i = 0; i < n; i++ ) \
 387         a.s[i] = saturate_cast<_Tp>(a.s[i] bin_op b.s[i]); \
 388     return a; \
 389 }
 390
 391 /** @brief Add values
 392
 393 For all types. */
 394 OPENCV_HAL_IMPL_BIN_OP(+)
 395
 396 /** @brief Subtract values
 397
 398 For all types. */
 399 OPENCV_HAL_IMPL_BIN_OP(-)
 400
 401 /** @brief Multiply values
 402
 403 For 16- and 32-bit integer types and floating types. */
 404 OPENCV_HAL_IMPL_BIN_OP(*)
 405
 406 /** @brief Divide values
 407
 408 For floating types only. */
 409 OPENCV_HAL_IMPL_BIN_OP(/)
 410
 411 //! @brief Helper macro
 412 //! @ingroup hal_intrin_impl
 413 #define OPENCV_HAL_IMPL_BIT_OP(bit_op) \
 414 template<typename _Tp, int n> inline v_reg<_Tp, n> operator bit_op \
 415     (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
 416 { \
 417     v_reg<_Tp, n> c; \
 418     typedef typename V_TypeTraits<_Tp>::int_type itype; \
 419     for( int i = 0; i < n; i++ ) \
 420         c.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int((itype)(V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) bit_op \
 421                                                         V_TypeTraits<_Tp>::reinterpret_int(b.s[i]))); \
 422     return c; \
 423 } \
 424 template<typename _Tp, int n> inline v_reg<_Tp, n>& operator \
 425     bit_op##= (v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
 426 { \
 427     typedef typename V_TypeTraits<_Tp>::int_type itype; \
 428     for( int i = 0; i < n; i++ ) \
 429         a.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int((itype)(V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) bit_op \
 430                                                         V_TypeTraits<_Tp>::reinterpret_int(b.s[i]))); \
 431     return a; \
 432 }
 433
 434 /** @brief Bitwise AND
 435
 436 Only for integer types. */
 437 OPENCV_HAL_IMPL_BIT_OP(&)
 438
 439 /** @brief Bitwise OR
 440
 441 Only for integer types. */
 442 OPENCV_HAL_IMPL_BIT_OP(|)
 443
 444 /** @brief Bitwise XOR
 445
 446 Only for integer types.*/
 447 OPENCV_HAL_IMPL_BIT_OP(^)
 448
 449 /** @brief Bitwise NOT
 450
 451 Only for integer types.*/
 452 template<typename _Tp, int n> inline v_reg<_Tp, n> operator ~ (const v_reg<_Tp, n>& a)
 453 {
 454     v_reg<_Tp, n> c;
 455     for( int i = 0; i < n; i++ )
 456         c.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int(~V_TypeTraits<_Tp>::reinterpret_int(a.s[i]));
 457         return c;
 458 }
 459
 460 //! @brief Helper macro
 461 //! @ingroup hal_intrin_impl
 462 #define OPENCV_HAL_IMPL_MATH_FUNC(func, cfunc, _Tp2) \
 463 template<typename _Tp, int n> inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a) \
 464 { \
 465     v_reg<_Tp2, n> c; \
 466     for( int i = 0; i < n; i++ ) \
 467         c.s[i] = cfunc(a.s[i]); \
 468     return c; \
 469 }
 470
 471 /** @brief Square root of elements
 472
 473 Only for floating point types.*/
 474 OPENCV_HAL_IMPL_MATH_FUNC(v_sqrt, std::sqrt, _Tp)
 475
 476 //! @cond IGNORED
 477 OPENCV_HAL_IMPL_MATH_FUNC(v_sin, std::sin, _Tp)
 478 OPENCV_HAL_IMPL_MATH_FUNC(v_cos, std::cos, _Tp)
 479 OPENCV_HAL_IMPL_MATH_FUNC(v_exp, std::exp, _Tp)
 480 OPENCV_HAL_IMPL_MATH_FUNC(v_log, std::log, _Tp)
 481 //! @endcond
 482
 483 /** @brief Absolute value of elements
 484
 485 Only for floating point types.*/
 486 OPENCV_HAL_IMPL_MATH_FUNC(v_abs, (typename V_TypeTraits<_Tp>::abs_type)std::abs,
 487                           typename V_TypeTraits<_Tp>::abs_type)
 488
 489 /** @brief Round elements
 490
 491 Only for floating point types.*/
 492 OPENCV_HAL_IMPL_MATH_FUNC(v_round, cvRound, int)
 493
 494 /** @brief Floor elements
 495
 496 Only for floating point types.*/
 497 OPENCV_HAL_IMPL_MATH_FUNC(v_floor, cvFloor, int)
 498
 499 /** @brief Ceil elements
 500
 501 Only for floating point types.*/
 502 OPENCV_HAL_IMPL_MATH_FUNC(v_ceil, cvCeil, int)
 503
 504 /** @brief Truncate elements
 505
 506 Only for floating point types.*/
 507 OPENCV_HAL_IMPL_MATH_FUNC(v_trunc, int, int)
 508
 509 //! @brief Helper macro
 510 //! @ingroup hal_intrin_impl
 511 #define OPENCV_HAL_IMPL_MINMAX_FUNC(func, cfunc) \
 512 template<typename _Tp, int n> inline v_reg<_Tp, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
 513 { \
 514     v_reg<_Tp, n> c; \
 515     for( int i = 0; i < n; i++ ) \
 516         c.s[i] = cfunc(a.s[i], b.s[i]); \
 517     return c; \
 518 }
 519
 520 //! @brief Helper macro
 521 //! @ingroup hal_intrin_impl
 522 #define OPENCV_HAL_IMPL_REDUCE_MINMAX_FUNC(func, cfunc) \
 523 template<typename _Tp, int n> inline _Tp func(const v_reg<_Tp, n>& a) \
 524 { \
 525     _Tp c = a.s[0]; \
 526     for( int i = 1; i < n; i++ ) \
 527         c = cfunc(c, a.s[i]); \
 528     return c; \
 529 }
 530
 531 /** @brief Choose min values for each pair
 532
 533 Scheme:
 534 @code
 535 {A1 A2 ...}
 536 {B1 B2 ...}
 537 --------------
 538 {min(A1,B1) min(A2,B2) ...}
 539 @endcode
 540 For all types except 64-bit integer. */
 541 OPENCV_HAL_IMPL_MINMAX_FUNC(v_min, std::min)
 542
 543 /** @brief Choose max values for each pair
 544
 545 Scheme:
 546 @code
 547 {A1 A2 ...}
 548 {B1 B2 ...}
 549 --------------
 550 {max(A1,B1) max(A2,B2) ...}
 551 @endcode
 552 For all types except 64-bit integer. */
 553 OPENCV_HAL_IMPL_MINMAX_FUNC(v_max, std::max)
 554
 555 /** @brief Find one min value
 556
 557 Scheme:
 558 @code
 559 {A1 A2 A3 ...} => min(A1,A2,A3,...)
 560 @endcode
 561 For 32-bit integer and 32-bit floating point types. */
 562 OPENCV_HAL_IMPL_REDUCE_MINMAX_FUNC(v_reduce_min, std::min)
 563
 564 /** @brief Find one max value
 565
 566 Scheme:
 567 @code
 568 {A1 A2 A3 ...} => max(A1,A2,A3,...)
 569 @endcode
 570 For 32-bit integer and 32-bit floating point types. */
 571 OPENCV_HAL_IMPL_REDUCE_MINMAX_FUNC(v_reduce_max, std::max)
 572
 573 //! @cond IGNORED
 574 template<typename _Tp, int n>
 575 inline void v_minmax( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
 576                       v_reg<_Tp, n>& minval, v_reg<_Tp, n>& maxval )
 577 {
 578     for( int i = 0; i < n; i++ )
 579     {
 580         minval.s[i] = std::min(a.s[i], b.s[i]);
 581         maxval.s[i] = std::max(a.s[i], b.s[i]);
 582     }
 583 }
 584 //! @endcond
 585
 586 //! @brief Helper macro
 587 //! @ingroup hal_intrin_impl
 588 #define OPENCV_HAL_IMPL_CMP_OP(cmp_op) \
 589 template<typename _Tp, int n> \
 590 inline v_reg<_Tp, n> operator cmp_op(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
 591 { \
 592     typedef typename V_TypeTraits<_Tp>::int_type itype; \
 593     v_reg<_Tp, n> c; \
 594     for( int i = 0; i < n; i++ ) \
 595         c.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int((itype)-(int)(a.s[i] cmp_op b.s[i])); \
 596     return c; \
 597 }
 598
 599 /** @brief Less-than comparison
 600
 601 For all types except 64-bit integer values. */
 602 OPENCV_HAL_IMPL_CMP_OP(<)
 603
 604 /** @brief Greater-than comparison
 605
 606 For all types except 64-bit integer values. */
 607 OPENCV_HAL_IMPL_CMP_OP(>)
 608
 609 /** @brief Less-than or equal comparison
 610
 611 For all types except 64-bit integer values. */
 612 OPENCV_HAL_IMPL_CMP_OP(<=)
 613
 614 /** @brief Greater-than or equal comparison
 615
 616 For all types except 64-bit integer values. */
 617 OPENCV_HAL_IMPL_CMP_OP(>=)
 618
 619 /** @brief Equal comparison
 620
 621 For all types except 64-bit integer values. */
 622 OPENCV_HAL_IMPL_CMP_OP(==)
 623
 624 /** @brief Not equal comparison
 625
 626 For all types except 64-bit integer values. */
 627 OPENCV_HAL_IMPL_CMP_OP(!=)
 628
 629 //! @brief Helper macro
 630 //! @ingroup hal_intrin_impl
 631 #define OPENCV_HAL_IMPL_ADD_SUB_OP(func, bin_op, cast_op, _Tp2) \
 632 template<typename _Tp, int n> \
 633 inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
 634 { \
 635     typedef _Tp2 rtype; \
 636     v_reg<rtype, n> c; \
 637     for( int i = 0; i < n; i++ ) \
 638         c.s[i] = cast_op(a.s[i] bin_op b.s[i]); \
 639     return c; \
 640 }
 641
 642 /** @brief Add values without saturation
 643
 644 For 8- and 16-bit integer values. */
 645 OPENCV_HAL_IMPL_ADD_SUB_OP(v_add_wrap, +, (_Tp), _Tp)
 646
 647 /** @brief Subtract values without saturation
 648
 649 For 8- and 16-bit integer values. */
 650 OPENCV_HAL_IMPL_ADD_SUB_OP(v_sub_wrap, -, (_Tp), _Tp)
 651
 652 //! @cond IGNORED
 653 template<typename T> inline T _absdiff(T a, T b)
 654 {
 655     return a > b ? a - b : b - a;
 656 }
 657 //! @endcond
 658
 659 /** @brief Absolute difference
 660
 661 Returns \f$ |a - b| \f$ converted to corresponding unsigned type.
 662 Example:
 663 @code{.cpp}
 664 v_int32x4 a, b; // {1, 2, 3, 4} and {4, 3, 2, 1}
 665 v_uint32x4 c = v_absdiff(a, b); // result is {3, 1, 1, 3}
 666 @endcode
 667 For 8-, 16-, 32-bit integer source types. */
 668 template<typename _Tp, int n>
 669 inline v_reg<typename V_TypeTraits<_Tp>::abs_type, n> v_absdiff(const v_reg<_Tp, n>& a, const v_reg<_Tp, n> & b)
 670 {
 671     typedef typename V_TypeTraits<_Tp>::abs_type rtype;
 672     v_reg<rtype, n> c;
 673     const rtype mask = std::numeric_limits<_Tp>::is_signed ? (1 << (sizeof(rtype)*8 - 1)) : 0;
 674     for( int i = 0; i < n; i++ )
 675     {
 676         rtype ua = a.s[i] ^ mask;
 677         rtype ub = b.s[i] ^ mask;
 678         c.s[i] = _absdiff(ua, ub);
 679     }
 680     return c;
 681 }
 682
 683 /** @overload
 684
 685 For 32-bit floating point values */
 686 inline v_float32x4 v_absdiff(const v_float32x4& a, const v_float32x4& b)
 687 {
 688     v_float32x4 c;
 689     for( int i = 0; i < c.nlanes; i++ )
 690         c.s[i] = _absdiff(a.s[i], b.s[i]);
 691     return c;
 692 }
 693
 694 /** @overload
 695
 696 For 64-bit floating point values */
 697 inline v_float64x2 v_absdiff(const v_float64x2& a, const v_float64x2& b)
 698 {
 699     v_float64x2 c;
 700     for( int i = 0; i < c.nlanes; i++ )
 701         c.s[i] = _absdiff(a.s[i], b.s[i]);
 702     return c;
 703 }
 704
 705 /** @brief Inversed square root
 706
 707 Returns \f$ 1/sqrt(a) \f$
 708 For floating point types only. */
 709 template<typename _Tp, int n>
 710 inline v_reg<_Tp, n> v_invsqrt(const v_reg<_Tp, n>& a)
 711 {
 712     v_reg<_Tp, n> c;
 713     for( int i = 0; i < n; i++ )
 714         c.s[i] = 1.f/std::sqrt(a.s[i]);
 715     return c;
 716 }
 717
 718 /** @brief Magnitude
 719
 720 Returns \f$ sqrt(a^2 + b^2) \f$
 721 For floating point types only. */
 722 template<typename _Tp, int n>
 723 inline v_reg<_Tp, n> v_magnitude(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
 724 {
 725     v_reg<_Tp, n> c;
 726     for( int i = 0; i < n; i++ )
 727         c.s[i] = std::sqrt(a.s[i]*a.s[i] + b.s[i]*b.s[i]);
 728     return c;
 729 }
 730
 731 /** @brief Square of the magnitude
 732
 733 Returns \f$ a^2 + b^2 \f$
 734 For floating point types only. */
 735 template<typename _Tp, int n>
 736 inline v_reg<_Tp, n> v_sqr_magnitude(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
 737 {
 738     v_reg<_Tp, n> c;
 739     for( int i = 0; i < n; i++ )
 740         c.s[i] = a.s[i]*a.s[i] + b.s[i]*b.s[i];
 741     return c;
 742 }
 743
 744 /** @brief Multiply and add
 745
 746 Returns \f$ a*b + c \f$
 747 For floating point types only. */
 748 template<typename _Tp, int n>
 749 inline v_reg<_Tp, n> v_muladd(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
 750                               const v_reg<_Tp, n>& c)
 751 {
 752     v_reg<_Tp, n> d;
 753     for( int i = 0; i < n; i++ )
 754         d.s[i] = a.s[i]*b.s[i] + c.s[i];
 755     return d;
 756 }
 757
 758 /** @brief Dot product of elements
 759
 760 Multiply values in two registers and sum adjacent result pairs.
 761 Scheme:
 762 @code
 763   {A1 A2 ...} // 16-bit
 764 x {B1 B2 ...} // 16-bit
 765 -------------
 766 {A1B1+A2B2 ...} // 32-bit
 767 @endcode
 768 Implemented only for 16-bit signed source type (v_int16x8).
 769 */
 770 template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>
 771     v_dotprod(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
 772 {
 773     typedef typename V_TypeTraits<_Tp>::w_type w_type;
 774     v_reg<w_type, n/2> c;
 775     for( int i = 0; i < (n/2); i++ )
 776         c.s[i] = (w_type)a.s[i*2]*b.s[i*2] + (w_type)a.s[i*2+1]*b.s[i*2+1];
 777     return c;
 778 }
 779
 780 /** @brief Multiply and expand
 781
 782 Multiply values two registers and store results in two registers with wider pack type.
 783 Scheme:
 784 @code
 785   {A B C D} // 32-bit
 786 x {E F G H} // 32-bit
 787 ---------------
 788 {AE BF}         // 64-bit
 789         {CG DH} // 64-bit
 790 @endcode
 791 Example:
 792 @code{.cpp}
 793 v_uint32x4 a, b; // {1,2,3,4} and {2,2,2,2}
 794 v_uint64x2 c, d; // results
 795 v_mul_expand(a, b, c, d); // c, d = {2,4}, {6, 8}
 796 @endcode
 797 Implemented only for 16- and unsigned 32-bit source types (v_int16x8, v_uint16x8, v_uint32x4).
 798 */
 799 template<typename _Tp, int n> inline void v_mul_expand(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
 800                                                        v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& c,
 801                                                        v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& d)
 802 {
 803     typedef typename V_TypeTraits<_Tp>::w_type w_type;
 804     for( int i = 0; i < (n/2); i++ )
 805     {
 806         c.s[i] = (w_type)a.s[i]*b.s[i];
 807         d.s[i] = (w_type)a.s[i+(n/2)]*b.s[i+(n/2)];
 808     }
 809 }
 810
 811 //! @cond IGNORED
 812 template<typename _Tp, int n> inline void v_hsum(const v_reg<_Tp, n>& a,
 813                                                  v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& c)
 814 {
 815     typedef typename V_TypeTraits<_Tp>::w_type w_type;
 816     for( int i = 0; i < (n/2); i++ )
 817     {
 818         c.s[i] = (w_type)a.s[i*2] + a.s[i*2+1];
 819     }
 820 }
 821 //! @endcond
 822
 823 //! @brief Helper macro
 824 //! @ingroup hal_intrin_impl
 825 #define OPENCV_HAL_IMPL_SHIFT_OP(shift_op) \
 826 template<typename _Tp, int n> inline v_reg<_Tp, n> operator shift_op(const v_reg<_Tp, n>& a, int imm) \
 827 { \
 828     v_reg<_Tp, n> c; \
 829     for( int i = 0; i < n; i++ ) \
 830         c.s[i] = (_Tp)(a.s[i] shift_op imm); \
 831     return c; \
 832 }
 833
 834 /** @brief Bitwise shift left
 835
 836 For 16-, 32- and 64-bit integer values. */
 837 OPENCV_HAL_IMPL_SHIFT_OP(<<)
 838
 839 /** @brief Bitwise shift right
 840
 841 For 16-, 32- and 64-bit integer values. */
 842 OPENCV_HAL_IMPL_SHIFT_OP(>>)
 843
 844 /** @brief Sum packed values
 845
 846 Scheme:
 847 @code
 848 {A1 A2 A3 ...} => sum{A1,A2,A3,...}
 849 @endcode
 850 For 32-bit integer and 32-bit floating point types.*/
 851 template<typename _Tp, int n> inline typename V_TypeTraits<_Tp>::sum_type v_reduce_sum(const v_reg<_Tp, n>& a)
 852 {
 853     typename V_TypeTraits<_Tp>::sum_type c = a.s[0];
 854     for( int i = 1; i < n; i++ )
 855         c += a.s[i];
 856     return c;
 857 }
 858
 859 /** @brief Get negative values mask
 860
 861 Returned value is a bit mask with bits set to 1 on places corresponding to negative packed values indexes.
 862 Example:
 863 @code{.cpp}
 864 v_int32x4 r; // set to {-1, -1, 1, 1}
 865 int mask = v_signmask(r); // mask = 3 <== 00000000 00000000 00000000 00000011
 866 @endcode
 867 For all types except 64-bit. */
 868 template<typename _Tp, int n> inline int v_signmask(const v_reg<_Tp, n>& a)
 869 {
 870     int mask = 0;
 871     for( int i = 0; i < n; i++ )
 872         mask |= (V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) < 0) << i;
 873     return mask;
 874 }
 875
 876 /** @brief Check if all packed values are less than zero
 877
 878 Unsigned values will be casted to signed: `uchar 254 => char -2`.
 879 For all types except 64-bit. */
 880 template<typename _Tp, int n> inline bool v_check_all(const v_reg<_Tp, n>& a)
 881 {
 882     for( int i = 0; i < n; i++ )
 883         if( V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) >= 0 )
 884             return false;
 885     return true;
 886 }
 887
 888 /** @brief Check if any of packed values is less than zero
 889
 890 Unsigned values will be casted to signed: `uchar 254 => char -2`.
 891 For all types except 64-bit. */
 892 template<typename _Tp, int n> inline bool v_check_any(const v_reg<_Tp, n>& a)
 893 {
 894     for( int i = 0; i < n; i++ )
 895         if( V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) < 0 )
 896             return true;
 897     return false;
 898 }
 899
 900 /** @brief Bitwise select
 901
 902 Return value will be built by combining values a and b using the following scheme:
 903 If the i-th bit in _mask_ is 1
 904     select i-th bit from _a_
 905 else
 906     select i-th bit from _b_ */
 907 template<typename _Tp, int n> inline v_reg<_Tp, n> v_select(const v_reg<_Tp, n>& mask,
 908                                                            const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
 909 {
 910     typedef V_TypeTraits<_Tp> Traits;
 911     typedef typename Traits::int_type int_type;
 912     v_reg<_Tp, n> c;
 913     for( int i = 0; i < n; i++ )
 914     {
 915         int_type m = Traits::reinterpret_int(mask.s[i]);
 916         c.s[i] =  Traits::reinterpret_from_int((Traits::reinterpret_int(a.s[i]) & m)
 917                                              | (Traits::reinterpret_int(b.s[i]) & ~m));
 918     }
 919     return c;
 920 }
 921
 922 /** @brief Expand values to the wider pack type
 923
 924 Copy contents of register to two registers with 2x wider pack type.
 925 Scheme:
 926 @code
 927  int32x4     int64x2 int64x2
 928 {A B C D} ==> {A B} , {C D}
 929 @endcode */
 930 template<typename _Tp, int n> inline void v_expand(const v_reg<_Tp, n>& a,
 931                             v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& b0,
 932                             v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& b1)
 933 {
 934     for( int i = 0; i < (n/2); i++ )
 935     {
 936         b0.s[i] = a.s[i];
 937         b1.s[i] = a.s[i+(n/2)];
 938     }
 939 }
 940
 941 //! @cond IGNORED
 942 template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::int_type, n>
 943     v_reinterpret_as_int(const v_reg<_Tp, n>& a)
 944 {
 945     v_reg<typename V_TypeTraits<_Tp>::int_type, n> c;
 946     for( int i = 0; i < n; i++ )
 947         c.s[i] = V_TypeTraits<_Tp>::reinterpret_int(a.s[i]);
 948     return c;
 949 }
 950
 951 template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::uint_type, n>
 952     v_reinterpret_as_uint(const v_reg<_Tp, n>& a)
 953 {
 954     v_reg<typename V_TypeTraits<_Tp>::uint_type, n> c;
 955     for( int i = 0; i < n; i++ )
 956         c.s[i] = V_TypeTraits<_Tp>::reinterpret_uint(a.s[i]);
 957     return c;
 958 }
 959 //! @endcond
 960
 961 /** @brief Interleave two vectors
 962
 963 Scheme:
 964 @code
 965   {A1 A2 A3 A4}
 966   {B1 B2 B3 B4}
 967 ---------------
 968   {A1 B1 A2 B2} and {A3 B3 A4 B4}
 969 @endcode
 970 For all types except 64-bit.
 971 */
 972 template<typename _Tp, int n> inline void v_zip( const v_reg<_Tp, n>& a0, const v_reg<_Tp, n>& a1,
 973                                                v_reg<_Tp, n>& b0, v_reg<_Tp, n>& b1 )
 974 {
 975     int i;
 976     for( i = 0; i < n/2; i++ )
 977     {
 978         b0.s[i*2] = a0.s[i];
 979         b0.s[i*2+1] = a1.s[i];
 980     }
 981     for( ; i < n; i++ )
 982     {
 983         b1.s[i*2-n] = a0.s[i];
 984         b1.s[i*2-n+1] = a1.s[i];
 985     }
 986 }
 987
 988 /** @brief Load register contents from memory
 989
 990 @param ptr pointer to memory block with data
 991 @return register object
 992
 993 @note Returned type will be detected from passed pointer type, for example uchar ==> cv::v_uint8x16, int ==> cv::v_int32x4, etc.
 994  */
 995 template<typename _Tp>
 996 inline v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes> v_load(const _Tp* ptr)
 997 {
 998     return v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes>(ptr);
 999 }
1000
1001 /** @brief Load register contents from memory (aligned)
1002
1003 similar to cv::v_load, but source memory block should be aligned (to 16-byte boundary)
1004  */
1005 template<typename _Tp>
1006 inline v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes> v_load_aligned(const _Tp* ptr)
1007 {
1008     return v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes>(ptr);
1009 }
1010
1011 /** @brief Load register contents from two memory blocks
1012
1013 @param loptr memory block containing data for first half (0..n/2)
1014 @param hiptr memory block containing data for second half (n/2..n)
1015
1016 @code{.cpp}
1017 int lo[2] = { 1, 2 }, hi[2] = { 3, 4 };
1018 v_int32x4 r = v_load_halves(lo, hi);
1019 @endcode
1020  */
1021 template<typename _Tp>
1022 inline v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes> v_load_halves(const _Tp* loptr, const _Tp* hiptr)
1023 {
1024     v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes> c;
1025     for( int i = 0; i < c.nlanes/2; i++ )
1026     {
1027         c.s[i] = loptr[i];
1028         c.s[i+c.nlanes/2] = hiptr[i];
1029     }
1030     return c;
1031 }
1032
1033 /** @brief Load register contents from memory with double expand
1034
1035 Same as cv::v_load, but result pack type will be 2x wider than memory type.
1036
1037 @code{.cpp}
1038 short buf[4] = {1, 2, 3, 4}; // type is int16
1039 v_int32x4 r = v_load_expand(buf); // r = {1, 2, 3, 4} - type is int32
1040 @endcode
1041 For 8-, 16-, 32-bit integer source types. */
1042 template<typename _Tp>
1043 inline v_reg<typename V_TypeTraits<_Tp>::w_type, V_SIMD128Traits<_Tp>::nlanes / 2>
1044 v_load_expand(const _Tp* ptr)
1045 {
1046     typedef typename V_TypeTraits<_Tp>::w_type w_type;
1047     v_reg<w_type, V_SIMD128Traits<w_type>::nlanes> c;
1048     for( int i = 0; i < c.nlanes; i++ )
1049     {
1050         c.s[i] = ptr[i];
1051     }
1052     return c;
1053 }
1054
1055 /** @brief Load register contents from memory with quad expand
1056
1057 Same as cv::v_load_expand, but result type is 4 times wider than source.
1058 @code{.cpp}
1059 char buf[4] = {1, 2, 3, 4}; // type is int8
1060 v_int32x4 r = v_load_q(buf); // r = {1, 2, 3, 4} - type is int32
1061 @endcode
1062 For 8-bit integer source types. */
1063 template<typename _Tp>
1064 inline v_reg<typename V_TypeTraits<_Tp>::q_type, V_SIMD128Traits<_Tp>::nlanes / 4>
1065 v_load_expand_q(const _Tp* ptr)
1066 {
1067     typedef typename V_TypeTraits<_Tp>::q_type q_type;
1068     v_reg<q_type, V_SIMD128Traits<q_type>::nlanes> c;
1069     for( int i = 0; i < c.nlanes; i++ )
1070     {
1071         c.s[i] = ptr[i];
1072     }
1073     return c;
1074 }
1075
1076 /** @brief Load and deinterleave (4 channels)
1077
1078 Load data from memory deinterleave and store to 4 registers.
1079 Scheme:
1080 @code
1081 {A1 B1 C1 D1 A2 B2 C2 D2 ...} ==> {A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...}, {D1 D2 ...}
1082 @endcode
1083 For all types except 64-bit. */
1084 template<typename _Tp, int n> inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a,
1085                                                             v_reg<_Tp, n>& b, v_reg<_Tp, n>& c)
1086 {
1087     int i, i3;
1088     for( i = i3 = 0; i < n; i++, i3 += 3 )
1089     {
1090         a.s[i] = ptr[i3];
1091         b.s[i] = ptr[i3+1];
1092         c.s[i] = ptr[i3+2];
1093     }
1094 }
1095
1096 /** @brief Load and deinterleave (3 channels)
1097
1098 Load data from memory deinterleave and store to 3 registers.
1099 Scheme:
1100 @code
1101 {A1 B1 C1 A2 B2 C2 ...} ==> {A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...}
1102 @endcode
1103 For all types except 64-bit. */
1104 template<typename _Tp, int n>
1105 inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a,
1106                                 v_reg<_Tp, n>& b, v_reg<_Tp, n>& c,
1107                                 v_reg<_Tp, n>& d)
1108 {
1109     int i, i4;
1110     for( i = i4 = 0; i < n; i++, i4 += 4 )
1111     {
1112         a.s[i] = ptr[i4];
1113         b.s[i] = ptr[i4+1];
1114         c.s[i] = ptr[i4+2];
1115         d.s[i] = ptr[i4+3];
1116     }
1117 }
1118
1119 /** @brief Interleave and store (3 channels)
1120
1121 Interleave and store data from 3 registers to memory.
1122 Scheme:
1123 @code
1124 {A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...}, {D1 D2 ...} ==> {A1 B1 C1 D1 A2 B2 C2 D2 ...}
1125 @endcode
1126 For all types except 64-bit. */
1127 template<typename _Tp, int n>
1128 inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a,
1129                                 const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c)
1130 {
1131     int i, i3;
1132     for( i = i3 = 0; i < n; i++, i3 += 3 )
1133     {
1134         ptr[i3] = a.s[i];
1135         ptr[i3+1] = b.s[i];
1136         ptr[i3+2] = c.s[i];
1137     }
1138 }
1139
1140 /** @brief Interleave and store (4 channels)
1141
1142 Interleave and store data from 4 registers to memory.
1143 Scheme:
1144 @code
1145 {A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...}, {D1 D2 ...} ==> {A1 B1 C1 D1 A2 B2 C2 D2 ...}
1146 @endcode
1147 For all types except 64-bit. */
1148 template<typename _Tp, int n> inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a,
1149                                                             const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c,
1150                                                             const v_reg<_Tp, n>& d)
1151 {
1152     int i, i4;
1153     for( i = i4 = 0; i < n; i++, i4 += 4 )
1154     {
1155         ptr[i4] = a.s[i];
1156         ptr[i4+1] = b.s[i];
1157         ptr[i4+2] = c.s[i];
1158         ptr[i4+3] = d.s[i];
1159     }
1160 }
1161
1162 /** @brief Store data to memory
1163
1164 Store register contents to memory.
1165 Scheme:
1166 @code
1167   REG {A B C D} ==> MEM {A B C D}
1168 @endcode
1169 Pointer can be unaligned. */
1170 template<typename _Tp, int n>
1171 inline void v_store(_Tp* ptr, const v_reg<_Tp, n>& a)
1172 {
1173     for( int i = 0; i < n; i++ )
1174         ptr[i] = a.s[i];
1175 }
1176
1177 /** @brief Store data to memory (lower half)
1178
1179 Store lower half of register contents to memory.
1180 Scheme:
1181 @code
1182   REG {A B C D} ==> MEM {A B}
1183 @endcode */
1184 template<typename _Tp, int n>
1185 inline void v_store_low(_Tp* ptr, const v_reg<_Tp, n>& a)
1186 {
1187     for( int i = 0; i < (n/2); i++ )
1188         ptr[i] = a.s[i];
1189 }
1190
1191 /** @brief Store data to memory (higher half)
1192
1193 Store higher half of register contents to memory.
1194 Scheme:
1195 @code
1196   REG {A B C D} ==> MEM {C D}
1197 @endcode */
1198 template<typename _Tp, int n>
1199 inline void v_store_high(_Tp* ptr, const v_reg<_Tp, n>& a)
1200 {
1201     for( int i = 0; i < (n/2); i++ )
1202         ptr[i] = a.s[i+(n/2)];
1203 }
1204
1205 /** @brief Store data to memory (aligned)
1206
1207 Store register contents to memory.
1208 Scheme:
1209 @code
1210   REG {A B C D} ==> MEM {A B C D}
1211 @endcode
1212 Pointer __should__ be aligned by 16-byte boundary. */
1213 template<typename _Tp, int n>
1214 inline void v_store_aligned(_Tp* ptr, const v_reg<_Tp, n>& a)
1215 {
1216     for( int i = 0; i < n; i++ )
1217         ptr[i] = a.s[i];
1218 }
1219
1220 /** @brief Combine vector from first elements of two vectors
1221
1222 Scheme:
1223 @code
1224   {A1 A2 A3 A4}
1225   {B1 B2 B3 B4}
1226 ---------------
1227   {A1 A2 B1 B2}
1228 @endcode
1229 For all types except 64-bit. */
1230 template<typename _Tp, int n>
1231 inline v_reg<_Tp, n> v_combine_low(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
1232 {
1233     v_reg<_Tp, n> c;
1234     for( int i = 0; i < (n/2); i++ )
1235     {
1236         c.s[i] = a.s[i];
1237         c.s[i+(n/2)] = b.s[i];
1238     }
1239     return c;
1240 }
1241
1242 /** @brief Combine vector from last elements of two vectors
1243
1244 Scheme:
1245 @code
1246   {A1 A2 A3 A4}
1247   {B1 B2 B3 B4}
1248 ---------------
1249   {A3 A4 B3 B4}
1250 @endcode
1251 For all types except 64-bit. */
1252 template<typename _Tp, int n>
1253 inline v_reg<_Tp, n> v_combine_high(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
1254 {
1255     v_reg<_Tp, n> c;
1256     for( int i = 0; i < (n/2); i++ )
1257     {
1258         c.s[i] = a.s[i+(n/2)];
1259         c.s[i+(n/2)] = b.s[i+(n/2)];
1260     }
1261     return c;
1262 }
1263
1264 /** @brief Combine two vectors from lower and higher parts of two other vectors
1265
1266 @code{.cpp}
1267 low = cv::v_combine_low(a, b);
1268 high = cv::v_combine_high(a, b);
1269 @endcode */
1270 template<typename _Tp, int n>
1271 inline void v_recombine(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
1272                         v_reg<_Tp, n>& low, v_reg<_Tp, n>& high)
1273 {
1274     for( int i = 0; i < (n/2); i++ )
1275     {
1276         low.s[i] = a.s[i];
1277         low.s[i+(n/2)] = b.s[i];
1278         high.s[i] = a.s[i+(n/2)];
1279         high.s[i+(n/2)] = b.s[i+(n/2)];
1280     }
1281 }
1282
1283 /** @brief Vector extract
1284
1285 Scheme:
1286 @code
1287   {A1 A2 A3 A4}
1288   {B1 B2 B3 B4}
1289 ========================
1290 shift = 1  {A2 A3 A4 B1}
1291 shift = 2  {A3 A4 B1 B2}
1292 shift = 3  {A4 B1 B2 B3}
1293 @endcode
1294 Restriction: 0 <= shift < nlanes
1295
1296 Usage:
1297 @code
1298 v_int32x4 a, b, c;
1299 c = v_extract<2>(a, b);
1300 @endcode
1301 For integer types only. */
1302 template<int s, typename _Tp, int n>
1303 inline v_reg<_Tp, n> v_extract(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
1304 {
1305     v_reg<_Tp, n> r;
1306     const int shift = n - s;
1307     int i = 0;
1308     for (; i < shift; ++i)
1309         r.s[i] = a.s[i+s];
1310     for (; i < n; ++i)
1311         r.s[i] = b.s[i-shift];
1312     return r;
1313 }
1314
1315 /** @brief Round
1316
1317 Rounds each value. Input type is float vector ==> output type is int vector.*/
1318 template<int n> inline v_reg<int, n> v_round(const v_reg<float, n>& a)
1319 {
1320     v_reg<int, n> c;
1321     for( int i = 0; i < n; i++ )
1322         c.s[i] = cvRound(a.s[i]);
1323     return c;
1324 }
1325
1326 /** @brief Floor
1327
1328 Floor each value. Input type is float vector ==> output type is int vector.*/
1329 template<int n> inline v_reg<int, n> v_floor(const v_reg<float, n>& a)
1330 {
1331     v_reg<int, n> c;
1332     for( int i = 0; i < n; i++ )
1333         c.s[i] = cvFloor(a.s[i]);
1334     return c;
1335 }
1336
1337 /** @brief Ceil
1338
1339 Ceil each value. Input type is float vector ==> output type is int vector.*/
1340 template<int n> inline v_reg<int, n> v_ceil(const v_reg<float, n>& a)
1341 {
1342     v_reg<int, n> c;
1343     for( int i = 0; i < n; i++ )
1344         c.s[i] = cvCeil(a.s[i]);
1345     return c;
1346 }
1347
1348 /** @brief Trunc
1349
1350 Truncate each value. Input type is float vector ==> output type is int vector.*/
1351 template<int n> inline v_reg<int, n> v_trunc(const v_reg<float, n>& a)
1352 {
1353     v_reg<int, n> c;
1354     for( int i = 0; i < n; i++ )
1355         c.s[i] = (int)(a.s[i]);
1356     return c;
1357 }
1358
1359 /** @overload */
1360 template<int n> inline v_reg<int, n*2> v_round(const v_reg<double, n>& a)
1361 {
1362     v_reg<int, n*2> c;
1363     for( int i = 0; i < n; i++ )
1364     {
1365         c.s[i] = cvRound(a.s[i]);
1366         c.s[i+n] = 0;
1367     }
1368     return c;
1369 }
1370
1371 /** @overload */
1372 template<int n> inline v_reg<int, n*2> v_floor(const v_reg<double, n>& a)
1373 {
1374     v_reg<int, n> c;
1375     for( int i = 0; i < n; i++ )
1376     {
1377         c.s[i] = cvFloor(a.s[i]);
1378         c.s[i+n] = 0;
1379     }
1380     return c;
1381 }
1382
1383 /** @overload */
1384 template<int n> inline v_reg<int, n*2> v_ceil(const v_reg<double, n>& a)
1385 {
1386     v_reg<int, n> c;
1387     for( int i = 0; i < n; i++ )
1388     {
1389         c.s[i] = cvCeil(a.s[i]);
1390         c.s[i+n] = 0;
1391     }
1392     return c;
1393 }
1394
1395 /** @overload */
1396 template<int n> inline v_reg<int, n*2> v_trunc(const v_reg<double, n>& a)
1397 {
1398     v_reg<int, n> c;
1399     for( int i = 0; i < n; i++ )
1400     {
1401         c.s[i] = cvCeil(a.s[i]);
1402         c.s[i+n] = 0;
1403     }
1404     return c;
1405 }
1406
1407 /** @brief Convert to float
1408
1409 Supported input type is cv::v_int32x4. */
1410 template<int n> inline v_reg<float, n> v_cvt_f32(const v_reg<int, n>& a)
1411 {
1412     v_reg<float, n> c;
1413     for( int i = 0; i < n; i++ )
1414         c.s[i] = (float)a.s[i];
1415     return c;
1416 }
1417
1418 /** @brief Convert to double
1419
1420 Supported input type is cv::v_int32x4. */
1421 template<int n> inline v_reg<double, n> v_cvt_f64(const v_reg<int, n*2>& a)
1422 {
1423     v_reg<double, n> c;
1424     for( int i = 0; i < n; i++ )
1425         c.s[i] = (double)a.s[i];
1426     return c;
1427 }
1428
1429 /** @brief Convert to double
1430
1431 Supported input type is cv::v_float32x4. */
1432 template<int n> inline v_reg<double, n> v_cvt_f64(const v_reg<float, n*2>& a)
1433 {
1434     v_reg<double, n> c;
1435     for( int i = 0; i < n; i++ )
1436         c.s[i] = (double)a.s[i];
1437     return c;
1438 }
1439
1440 /** @brief Transpose 4x4 matrix
1441
1442 Scheme:
1443 @code
1444 a0  {A1 A2 A3 A4}
1445 a1  {B1 B2 B3 B4}
1446 a2  {C1 C2 C3 C4}
1447 a3  {D1 D2 D3 D4}
1448 ===============
1449 b0  {A1 B1 C1 D1}
1450 b1  {A2 B2 C2 D2}
1451 b2  {A3 B3 C3 D3}
1452 b3  {A4 B4 C4 D4}
1453 @endcode
1454 */
1455 template<typename _Tp>
1456 inline void v_transpose4x4( v_reg<_Tp, 4>& a0, const v_reg<_Tp, 4>& a1,
1457                             const v_reg<_Tp, 4>& a2, const v_reg<_Tp, 4>& a3,
1458                             v_reg<_Tp, 4>& b0, v_reg<_Tp, 4>& b1,
1459                             v_reg<_Tp, 4>& b2, v_reg<_Tp, 4>& b3 )
1460 {
1461     b0 = v_reg<_Tp, 4>(a0.s[0], a1.s[0], a2.s[0], a3.s[0]);
1462     b1 = v_reg<_Tp, 4>(a0.s[1], a1.s[1], a2.s[1], a3.s[1]);
1463     b2 = v_reg<_Tp, 4>(a0.s[2], a1.s[2], a2.s[2], a3.s[2]);
1464     b3 = v_reg<_Tp, 4>(a0.s[3], a1.s[3], a2.s[3], a3.s[3]);
1465 }
1466
1467 //! @brief Helper macro
1468 //! @ingroup hal_intrin_impl
1469 #define OPENCV_HAL_IMPL_C_INIT_ZERO(_Tpvec, _Tp, suffix) \
1470 inline _Tpvec v_setzero_##suffix() { return _Tpvec::zero(); }
1471
1472 //! @name Init with zero
1473 //! @{
1474 //! @brief Create new vector with zero elements
1475 OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint8x16, uchar, u8)
1476 OPENCV_HAL_IMPL_C_INIT_ZERO(v_int8x16, schar, s8)
1477 OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint16x8, ushort, u16)
1478 OPENCV_HAL_IMPL_C_INIT_ZERO(v_int16x8, short, s16)
1479 OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint32x4, unsigned, u32)
1480 OPENCV_HAL_IMPL_C_INIT_ZERO(v_int32x4, int, s32)
1481 OPENCV_HAL_IMPL_C_INIT_ZERO(v_float32x4, float, f32)
1482 OPENCV_HAL_IMPL_C_INIT_ZERO(v_float64x2, double, f64)
1483 OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint64x2, uint64, u64)
1484 OPENCV_HAL_IMPL_C_INIT_ZERO(v_int64x2, int64, s64)
1485 //! @}
1486
1487 //! @brief Helper macro
1488 //! @ingroup hal_intrin_impl
1489 #define OPENCV_HAL_IMPL_C_INIT_VAL(_Tpvec, _Tp, suffix) \
1490 inline _Tpvec v_setall_##suffix(_Tp val) { return _Tpvec::all(val); }
1491
1492 //! @name Init with value
1493 //! @{
1494 //! @brief Create new vector with elements set to a specific value
1495 OPENCV_HAL_IMPL_C_INIT_VAL(v_uint8x16, uchar, u8)
1496 OPENCV_HAL_IMPL_C_INIT_VAL(v_int8x16, schar, s8)
1497 OPENCV_HAL_IMPL_C_INIT_VAL(v_uint16x8, ushort, u16)
1498 OPENCV_HAL_IMPL_C_INIT_VAL(v_int16x8, short, s16)
1499 OPENCV_HAL_IMPL_C_INIT_VAL(v_uint32x4, unsigned, u32)
1500 OPENCV_HAL_IMPL_C_INIT_VAL(v_int32x4, int, s32)
1501 OPENCV_HAL_IMPL_C_INIT_VAL(v_float32x4, float, f32)
1502 OPENCV_HAL_IMPL_C_INIT_VAL(v_float64x2, double, f64)
1503 OPENCV_HAL_IMPL_C_INIT_VAL(v_uint64x2, uint64, u64)
1504 OPENCV_HAL_IMPL_C_INIT_VAL(v_int64x2, int64, s64)
1505 //! @}
1506
1507 //! @brief Helper macro
1508 //! @ingroup hal_intrin_impl
1509 #define OPENCV_HAL_IMPL_C_REINTERPRET(_Tpvec, _Tp, suffix) \
1510 template<typename _Tp0, int n0> inline _Tpvec \
1511     v_reinterpret_as_##suffix(const v_reg<_Tp0, n0>& a) \
1512 { return a.template reinterpret_as<_Tp, _Tpvec::nlanes>(); }
1513
1514 //! @name Reinterpret
1515 //! @{
1516 //! @brief Convert vector to different type without modifying underlying data.
1517 OPENCV_HAL_IMPL_C_REINTERPRET(v_uint8x16, uchar, u8)
1518 OPENCV_HAL_IMPL_C_REINTERPRET(v_int8x16, schar, s8)
1519 OPENCV_HAL_IMPL_C_REINTERPRET(v_uint16x8, ushort, u16)
1520 OPENCV_HAL_IMPL_C_REINTERPRET(v_int16x8, short, s16)
1521 OPENCV_HAL_IMPL_C_REINTERPRET(v_uint32x4, unsigned, u32)
1522 OPENCV_HAL_IMPL_C_REINTERPRET(v_int32x4, int, s32)
1523 OPENCV_HAL_IMPL_C_REINTERPRET(v_float32x4, float, f32)
1524 OPENCV_HAL_IMPL_C_REINTERPRET(v_float64x2, double, f64)
1525 OPENCV_HAL_IMPL_C_REINTERPRET(v_uint64x2, uint64, u64)
1526 OPENCV_HAL_IMPL_C_REINTERPRET(v_int64x2, int64, s64)
1527 //! @}
1528
1529 //! @brief Helper macro
1530 //! @ingroup hal_intrin_impl
1531 #define OPENCV_HAL_IMPL_C_SHIFTL(_Tpvec, _Tp) \
1532 template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
1533 { return a << n; }
1534
1535 //! @name Left shift
1536 //! @{
1537 //! @brief Shift left
1538 OPENCV_HAL_IMPL_C_SHIFTL(v_uint16x8, ushort)
1539 OPENCV_HAL_IMPL_C_SHIFTL(v_int16x8, short)
1540 OPENCV_HAL_IMPL_C_SHIFTL(v_uint32x4, unsigned)
1541 OPENCV_HAL_IMPL_C_SHIFTL(v_int32x4, int)
1542 OPENCV_HAL_IMPL_C_SHIFTL(v_uint64x2, uint64)
1543 OPENCV_HAL_IMPL_C_SHIFTL(v_int64x2, int64)
1544 //! @}
1545
1546 //! @brief Helper macro
1547 //! @ingroup hal_intrin_impl
1548 #define OPENCV_HAL_IMPL_C_SHIFTR(_Tpvec, _Tp) \
1549 template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
1550 { return a >> n; }
1551
1552 //! @name Right shift
1553 //! @{
1554 //! @brief Shift right
1555 OPENCV_HAL_IMPL_C_SHIFTR(v_uint16x8, ushort)
1556 OPENCV_HAL_IMPL_C_SHIFTR(v_int16x8, short)
1557 OPENCV_HAL_IMPL_C_SHIFTR(v_uint32x4, unsigned)
1558 OPENCV_HAL_IMPL_C_SHIFTR(v_int32x4, int)
1559 OPENCV_HAL_IMPL_C_SHIFTR(v_uint64x2, uint64)
1560 OPENCV_HAL_IMPL_C_SHIFTR(v_int64x2, int64)
1561 //! @}
1562
1563 //! @brief Helper macro
1564 //! @ingroup hal_intrin_impl
1565 #define OPENCV_HAL_IMPL_C_RSHIFTR(_Tpvec, _Tp) \
1566 template<int n> inline _Tpvec v_rshr(const _Tpvec& a) \
1567 { \
1568     _Tpvec c; \
1569     for( int i = 0; i < _Tpvec::nlanes; i++ ) \
1570         c.s[i] = (_Tp)((a.s[i] + ((_Tp)1 << (n - 1))) >> n); \
1571     return c; \
1572 }
1573
1574 //! @name Rounding shift
1575 //! @{
1576 //! @brief Rounding shift right
1577 OPENCV_HAL_IMPL_C_RSHIFTR(v_uint16x8, ushort)
1578 OPENCV_HAL_IMPL_C_RSHIFTR(v_int16x8, short)
1579 OPENCV_HAL_IMPL_C_RSHIFTR(v_uint32x4, unsigned)
1580 OPENCV_HAL_IMPL_C_RSHIFTR(v_int32x4, int)
1581 OPENCV_HAL_IMPL_C_RSHIFTR(v_uint64x2, uint64)
1582 OPENCV_HAL_IMPL_C_RSHIFTR(v_int64x2, int64)
1583 //! @}
1584
1585 //! @brief Helper macro
1586 //! @ingroup hal_intrin_impl
1587 #define OPENCV_HAL_IMPL_C_PACK(_Tpvec, _Tpnvec, _Tpn, pack_suffix) \
1588 inline _Tpnvec v_##pack_suffix(const _Tpvec& a, const _Tpvec& b) \
1589 { \
1590     _Tpnvec c; \
1591     for( int i = 0; i < _Tpvec::nlanes; i++ ) \
1592     { \
1593         c.s[i] = saturate_cast<_Tpn>(a.s[i]); \
1594         c.s[i+_Tpvec::nlanes] = saturate_cast<_Tpn>(b.s[i]); \
1595     } \
1596     return c; \
1597 }
1598
1599 //! @name Pack
1600 //! @{
1601 //! @brief Pack values from two vectors to one
1602 //!
1603 //! Return vector type have twice more elements than input vector types. Variant with _u_ suffix also
1604 //! converts to corresponding unsigned type.
1605 //!
1606 //! - pack: for 16-, 32- and 64-bit integer input types
1607 //! - pack_u: for 16- and 32-bit signed integer input types
1608 OPENCV_HAL_IMPL_C_PACK(v_uint16x8, v_uint8x16, uchar, pack)
1609 OPENCV_HAL_IMPL_C_PACK(v_int16x8, v_int8x16, schar, pack)
1610 OPENCV_HAL_IMPL_C_PACK(v_uint32x4, v_uint16x8, ushort, pack)
1611 OPENCV_HAL_IMPL_C_PACK(v_int32x4, v_int16x8, short, pack)
1612 OPENCV_HAL_IMPL_C_PACK(v_uint64x2, v_uint32x4, unsigned, pack)
1613 OPENCV_HAL_IMPL_C_PACK(v_int64x2, v_int32x4, int, pack)
1614 OPENCV_HAL_IMPL_C_PACK(v_int16x8, v_uint8x16, uchar, pack_u)
1615 OPENCV_HAL_IMPL_C_PACK(v_int32x4, v_uint16x8, ushort, pack_u)
1616 //! @}
1617
1618 //! @brief Helper macro
1619 //! @ingroup hal_intrin_impl
1620 #define OPENCV_HAL_IMPL_C_RSHR_PACK(_Tpvec, _Tp, _Tpnvec, _Tpn, pack_suffix) \
1621 template<int n> inline _Tpnvec v_rshr_##pack_suffix(const _Tpvec& a, const _Tpvec& b) \
1622 { \
1623     _Tpnvec c; \
1624     for( int i = 0; i < _Tpvec::nlanes; i++ ) \
1625     { \
1626         c.s[i] = saturate_cast<_Tpn>((a.s[i] + ((_Tp)1 << (n - 1))) >> n); \
1627         c.s[i+_Tpvec::nlanes] = saturate_cast<_Tpn>((b.s[i] + ((_Tp)1 << (n - 1))) >> n); \
1628     } \
1629     return c; \
1630 }
1631
1632 //! @name Pack with rounding shift
1633 //! @{
1634 //! @brief Pack values from two vectors to one with rounding shift
1635 //!
1636 //! Values from the input vectors will be shifted right by _n_ bits with rounding, converted to narrower
1637 //! type and returned in the result vector. Variant with _u_ suffix converts to unsigned type.
1638 //!
1639 //! - pack: for 16-, 32- and 64-bit integer input types
1640 //! - pack_u: for 16- and 32-bit signed integer input types
1641 OPENCV_HAL_IMPL_C_RSHR_PACK(v_uint16x8, ushort, v_uint8x16, uchar, pack)
1642 OPENCV_HAL_IMPL_C_RSHR_PACK(v_int16x8, short, v_int8x16, schar, pack)
1643 OPENCV_HAL_IMPL_C_RSHR_PACK(v_uint32x4, unsigned, v_uint16x8, ushort, pack)
1644 OPENCV_HAL_IMPL_C_RSHR_PACK(v_int32x4, int, v_int16x8, short, pack)
1645 OPENCV_HAL_IMPL_C_RSHR_PACK(v_uint64x2, uint64, v_uint32x4, unsigned, pack)
1646 OPENCV_HAL_IMPL_C_RSHR_PACK(v_int64x2, int64, v_int32x4, int, pack)
1647 OPENCV_HAL_IMPL_C_RSHR_PACK(v_int16x8, short, v_uint8x16, uchar, pack_u)
1648 OPENCV_HAL_IMPL_C_RSHR_PACK(v_int32x4, int, v_uint16x8, ushort, pack_u)
1649 //! @}
1650
1651 //! @brief Helper macro
1652 //! @ingroup hal_intrin_impl
1653 #define OPENCV_HAL_IMPL_C_PACK_STORE(_Tpvec, _Tp, _Tpnvec, _Tpn, pack_suffix) \
1654 inline void v_##pack_suffix##_store(_Tpn* ptr, const _Tpvec& a) \
1655 { \
1656     for( int i = 0; i < _Tpvec::nlanes; i++ ) \
1657         ptr[i] = saturate_cast<_Tpn>(a.s[i]); \
1658 }
1659
1660 //! @name Pack and store
1661 //! @{
1662 //! @brief Store values from the input vector into memory with pack
1663 //!
1664 //! Values will be stored into memory with saturating conversion to narrower type.
1665 //! Variant with _u_ suffix converts to corresponding unsigned type.
1666 //!
1667 //! - pack: for 16-, 32- and 64-bit integer input types
1668 //! - pack_u: for 16- and 32-bit signed integer input types
1669 OPENCV_HAL_IMPL_C_PACK_STORE(v_uint16x8, ushort, v_uint8x16, uchar, pack)
1670 OPENCV_HAL_IMPL_C_PACK_STORE(v_int16x8, short, v_int8x16, schar, pack)
1671 OPENCV_HAL_IMPL_C_PACK_STORE(v_uint32x4, unsigned, v_uint16x8, ushort, pack)
1672 OPENCV_HAL_IMPL_C_PACK_STORE(v_int32x4, int, v_int16x8, short, pack)
1673 OPENCV_HAL_IMPL_C_PACK_STORE(v_uint64x2, uint64, v_uint32x4, unsigned, pack)
1674 OPENCV_HAL_IMPL_C_PACK_STORE(v_int64x2, int64, v_int32x4, int, pack)
1675 OPENCV_HAL_IMPL_C_PACK_STORE(v_int16x8, short, v_uint8x16, uchar, pack_u)
1676 OPENCV_HAL_IMPL_C_PACK_STORE(v_int32x4, int, v_uint16x8, ushort, pack_u)
1677 //! @}
1678
1679 //! @brief Helper macro
1680 //! @ingroup hal_intrin_impl
1681 #define OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(_Tpvec, _Tp, _Tpnvec, _Tpn, pack_suffix) \
1682 template<int n> inline void v_rshr_##pack_suffix##_store(_Tpn* ptr, const _Tpvec& a) \
1683 { \
1684     for( int i = 0; i < _Tpvec::nlanes; i++ ) \
1685         ptr[i] = saturate_cast<_Tpn>((a.s[i] + ((_Tp)1 << (n - 1))) >> n); \
1686 }
1687
1688 //! @name Pack and store with rounding shift
1689 //! @{
1690 //! @brief Store values from the input vector into memory with pack
1691 //!
1692 //! Values will be shifted _n_ bits right with rounding, converted to narrower type and stored into
1693 //! memory. Variant with _u_ suffix converts to unsigned type.
1694 //!
1695 //! - pack: for 16-, 32- and 64-bit integer input types
1696 //! - pack_u: for 16- and 32-bit signed integer input types
1697 OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_uint16x8, ushort, v_uint8x16, uchar, pack)
1698 OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int16x8, short, v_int8x16, schar, pack)
1699 OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_uint32x4, unsigned, v_uint16x8, ushort, pack)
1700 OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int32x4, int, v_int16x8, short, pack)
1701 OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_uint64x2, uint64, v_uint32x4, unsigned, pack)
1702 OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int64x2, int64, v_int32x4, int, pack)
1703 OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int16x8, short, v_uint8x16, uchar, pack_u)
1704 OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int32x4, int, v_uint16x8, ushort, pack_u)
1705 //! @}
1706
1707 /** @brief Matrix multiplication
1708
1709 Scheme:
1710 @code
1711 {A0 A1 A2 A3}   |V0|
1712 {B0 B1 B2 B3}   |V1|
1713 {C0 C1 C2 C3}   |V2|
1714 {D0 D1 D2 D3} x |V3|
1715 ====================
1716 {R0 R1 R2 R3}, where:
1717 R0 = A0V0 + A1V1 + A2V2 + A3V3,
1718 R1 = B0V0 + B1V1 + B2V2 + B3V3
1719 ...
1720 @endcode
1721 */
1722 inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
1723                             const v_float32x4& m1, const v_float32x4& m2,
1724                             const v_float32x4& m3)
1725 {
1726     return v_float32x4(v.s[0]*m0.s[0] + v.s[1]*m1.s[0] + v.s[2]*m2.s[0] + v.s[3]*m3.s[0],
1727                        v.s[0]*m0.s[1] + v.s[1]*m1.s[1] + v.s[2]*m2.s[1] + v.s[3]*m3.s[1],
1728                        v.s[0]*m0.s[2] + v.s[1]*m1.s[2] + v.s[2]*m2.s[2] + v.s[3]*m3.s[2],
1729                        v.s[0]*m0.s[3] + v.s[1]*m1.s[3] + v.s[2]*m2.s[3] + v.s[3]*m3.s[3]);
1730 }
1731
1732 //! @}
1733
1734 }
1735
1736 #endif