1 /*M///////////////////////////////////////////////////////////////////////////////////////
3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
5 // By downloading, copying, installing or using the software you agree to this license.
6 // If you do not agree to this license, do not download, install,
7 // copy or use the software.
11 // For Open Source Computer Vision Library
13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
15 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
16 // Copyright (C) 2015, Itseez Inc., all rights reserved.
17 // Third party copyrights are property of their respective owners.
19 // Redistribution and use in source and binary forms, with or without modification,
20 // are permitted provided that the following conditions are met:
22 // * Redistribution's of source code must retain the above copyright notice,
23 // this list of conditions and the following disclaimer.
25 // * Redistribution's in binary form must reproduce the above copyright notice,
26 // this list of conditions and the following disclaimer in the documentation
27 // and/or other materials provided with the distribution.
29 // * The name of the copyright holders may not be used to endorse or promote products
30 // derived from this software without specific prior written permission.
32 // This software is provided by the copyright holders and contributors "as is" and
33 // any express or implied warranties, including, but not limited to, the implied
34 // warranties of merchantability and fitness for a particular purpose are disclaimed.
35 // In no event shall the Intel Corporation or contributors be liable for any direct,
36 // indirect, incidental, special, exemplary, or consequential damages
37 // (including, but not limited to, procurement of substitute goods or services;
38 // loss of use, data, or profits; or business interruption) however caused
39 // and on any theory of liability, whether in contract, strict liability,
40 // or tort (including negligence or otherwise) arising in any way out of
41 // the use of this software, even if advised of the possibility of such damage.
45 #ifndef __OPENCV_HAL_INTRIN_CPP_HPP__
46 #define __OPENCV_HAL_INTRIN_CPP_HPP__
54 /** @addtogroup hal_intrin
56 "Universal intrinsics" is a set of types and functions intended to simplify vectorization of code on
57 different platforms. Currently there are two supported SIMD extensions: __SSE/SSE2__ on x86
58 architectures and __NEON__ on ARM architectures, both allow working with 128 bit registers
59 containing packed values of different types. In case when there is no SIMD extension available
60 during compilation, fallback C++ implementation of intrinsics will be chosen and code will work as
61 expected although it could be slower.
65 There are several types representing 128-bit register as a vector of packed values, each type is
66 implemented as a structure based on a one SIMD register.
68 - cv::v_uint8x16 and cv::v_int8x16: sixteen 8-bit integer values (unsigned/signed) - char
69 - cv::v_uint16x8 and cv::v_int16x8: eight 16-bit integer values (unsigned/signed) - short
70 - cv::v_uint32x4 and cv::v_int32x4: four 32-bit integer values (unsigned/signed) - int
71 - cv::v_uint64x2 and cv::v_int64x2: two 64-bit integer values (unsigned/signed) - int64
72 - cv::v_float32x4: four 32-bit floating point values (signed) - float
73 - cv::v_float64x2: two 64-bit floating point values (signed) - double
76 cv::v_float64x2 is not implemented in NEON variant, if you want to use this type, don't forget to
77 check the CV_SIMD128_64F preprocessor definition:
84 ### Load and store operations
86 These operations set the contents of a register explicitly or load them from a memory
87 block, and save the contents of a register to a memory block.
90 @ref v_reg::v_reg(const _Tp *ptr) "from memory",
91 @ref v_reg::v_reg(_Tp s0, _Tp s1) "from two values", ...
92 - Other create methods:
93 @ref v_setall_s8, @ref v_setall_u8, ...,
94 @ref v_setzero_u8, @ref v_setzero_s8, ...
96 @ref v_load, @ref v_load_aligned, @ref v_load_halves,
97 @ref v_store, @ref v_store_aligned,
98 @ref v_store_high, @ref v_store_low
102 These operations reorder or recombine elements in one or multiple vectors.
104 - Interleave, deinterleave (3 and 4 channels): @ref v_load_deinterleave, @ref v_store_interleave
105 - Expand: @ref v_load_expand, @ref v_load_expand_q, @ref v_expand
106 - Pack: @ref v_pack, @ref v_pack_u, @ref v_rshr_pack, @ref v_rshr_pack_u,
107 @ref v_pack_store, @ref v_pack_u_store, @ref v_rshr_pack_store, @ref v_rshr_pack_u_store
108 - Recombine: @ref v_zip, @ref v_recombine, @ref v_combine_low, @ref v_combine_high
109 - Extract: @ref v_extract
112 ### Arithmetic, bitwise and comparison operations
114 Element-wise binary and unary operations.
117 @ref operator+(const v_reg &a, const v_reg &b) "+",
118 @ref operator-(const v_reg &a, const v_reg &b) "-",
119 @ref operator*(const v_reg &a, const v_reg &b) "*",
120 @ref operator/(const v_reg &a, const v_reg &b) "/",
123 - Non-saturating arithmetics: @ref v_add_wrap, @ref v_sub_wrap
126 @ref operator<<(const v_reg &a, int s) "<<",
127 @ref operator>>(const v_reg &a, int s) ">>",
128 @ref v_shl, @ref v_shr
131 @ref operator&(const v_reg &a, const v_reg &b) "&",
132 @ref operator|(const v_reg &a, const v_reg &b) "|",
133 @ref operator^(const v_reg &a, const v_reg &b) "^",
134 @ref operator~(const v_reg &a) "~"
137 @ref operator>(const v_reg &a, const v_reg &b) ">",
138 @ref operator>=(const v_reg &a, const v_reg &b) ">=",
139 @ref operator<(const v_reg &a, const v_reg &b) "<",
140 @ref operator<=(const v_reg &a, const v_reg &b) "<=",
141 @ref operator==(const v_reg &a, const v_reg &b) "==",
142 @ref operator!=(const v_reg &a, const v_reg &b) "!="
144 - min/max: @ref v_min, @ref v_max
148 Most of these operations return only one value.
150 - Reduce: @ref v_reduce_min, @ref v_reduce_max, @ref v_reduce_sum
151 - Mask: @ref v_signmask, @ref v_check_all, @ref v_check_any, @ref v_select
155 - Some frequent operations: @ref v_sqrt, @ref v_invsqrt, @ref v_magnitude, @ref v_sqr_magnitude
156 - Absolute values: @ref v_abs, @ref v_absdiff
160 Different type conversions and casts:
162 - Rounding: @ref v_round, @ref v_floor, @ref v_ceil, @ref v_trunc,
163 - To float: @ref v_cvt_f32, @ref v_cvt_f64
164 - Reinterpret: @ref v_reinterpret_as_u8, @ref v_reinterpret_as_s8, ...
166 ### Matrix operations
168 In these operations vectors represent matrix rows/columns: @ref v_dotprod, @ref v_matmul, @ref v_transpose4x4
172 Most operations are implemented only for some subset of the available types; the following matrices
173 show the applicability of different operations to the types.
177 | Operations\\Types | uint 8x16 | int 8x16 | uint 16x8 | int 16x8 | uint 32x4 | int 32x4 |
178 |-------------------|:-:|:-:|:-:|:-:|:-:|:-:|
179 |load, store | x | x | x | x | x | x |
180 |interleave | x | x | x | x | x | x |
181 |expand | x | x | x | x | x | x |
182 |expand_q | x | x | | | | |
183 |add, sub | x | x | x | x | x | x |
184 |add_wrap, sub_wrap | x | x | x | x | | |
185 |mul | | | x | x | x | x |
186 |mul_expand | | | x | x | x | |
187 |compare | x | x | x | x | x | x |
188 |shift | | | x | x | x | x |
189 |dotprod | | | | x | | |
190 |logical | x | x | x | x | x | x |
191 |min, max | x | x | x | x | x | x |
192 |absdiff | x | x | x | x | x | x |
193 |reduce | | | | | x | x |
194 |mask | x | x | x | x | x | x |
195 |pack | x | x | x | x | x | x |
196 |pack_u | x | | x | | | |
197 |unpack | x | x | x | x | x | x |
198 |extract | x | x | x | x | x | x |
199 |cvt_flt32 | | | | | | x |
200 |cvt_flt64 | | | | | | x |
201 |transpose4x4 | | | | | x | x |
205 | Operations\\Types | uint 64x2 | int 64x2 |
206 |-------------------|:-:|:-:|
207 |load, store | x | x |
215 | Operations\\Types | float 32x4 | float 64x2 |
216 |-------------------|:-:|:-:|
217 |load, store | x | x |
231 |float math | x | x |
232 |transpose4x4 | x | |
237 template<typename _Tp, int n> struct v_reg
240 typedef _Tp lane_type;
241 typedef v_reg<typename V_TypeTraits<_Tp>::int_type, n> int_vec;
242 typedef v_reg<typename V_TypeTraits<_Tp>::abs_type, n> abs_vec;
246 /** @brief Constructor
248 Initializes register with data from memory
249 @param ptr pointer to memory block with data for register */
250 explicit v_reg(const _Tp* ptr) { for( int i = 0; i < n; i++ ) s[i] = ptr[i]; }
252 /** @brief Constructor
254 Initializes register with two 64-bit values */
255 v_reg(_Tp s0, _Tp s1) { s[0] = s0; s[1] = s1; }
257 /** @brief Constructor
259 Initializes register with four 32-bit values */
260 v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3) { s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3; }
262 /** @brief Constructor
264 Initializes register with eight 16-bit values */
265 v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3,
266 _Tp s4, _Tp s5, _Tp s6, _Tp s7)
268 s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3;
269 s[4] = s4; s[5] = s5; s[6] = s6; s[7] = s7;
272 /** @brief Constructor
274 Initializes register with sixteen 8-bit values */
275 v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3,
276 _Tp s4, _Tp s5, _Tp s6, _Tp s7,
277 _Tp s8, _Tp s9, _Tp s10, _Tp s11,
278 _Tp s12, _Tp s13, _Tp s14, _Tp s15)
280 s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3;
281 s[4] = s4; s[5] = s5; s[6] = s6; s[7] = s7;
282 s[8] = s8; s[9] = s9; s[10] = s10; s[11] = s11;
283 s[12] = s12; s[13] = s13; s[14] = s14; s[15] = s15;
286 /** @brief Default constructor
288 Does not initialize anything*/
291 /** @brief Copy constructor */
292 v_reg(const v_reg<_Tp, n> & r)
294 for( int i = 0; i < n; i++ )
297 /** @brief Access first value
299 Returns value of the first lane according to register type, for example:
301 v_int32x4 r(1, 2, 3, 4);
302 int v = r.get0(); // returns 1
304 uint64_t v = r.get0(); // returns 1
307 _Tp get0() const { return s[0]; }
//! Access a lane by index: returns the value stored at position i of the lane array.
//! The index is used as given; no range check is performed here.
310 _Tp get(const int i) const { return s[i]; }
311 v_reg<_Tp, n> high() const
315 for( i = 0; i < n/2; i++ )
323 static v_reg<_Tp, n> zero()
326 for( int i = 0; i < n; i++ )
331 static v_reg<_Tp, n> all(_Tp s)
334 for( int i = 0; i < n; i++ )
//! Produces a register with lane type _Tp2 and lane count n2 from the raw bytes of this one.
//! The bytes are copied with memcpy; no per-lane value conversion appears in the visible lines.
339 template<typename _Tp2, int n2> v_reg<_Tp2, n2> reinterpret_as() const
// Copy only as many bytes as both registers hold, so neither lane array is overrun.
341 size_t bytes = std::min(sizeof(_Tp2)*n2, sizeof(_Tp)*n);
// NOTE(review): `c` is declared on a line not visible here - presumably the v_reg<_Tp2, n2> result.
343 std::memcpy(&c.s[0], &s[0], bytes);
351 /** @brief Sixteen 8-bit unsigned integer values */
352 typedef v_reg<uchar, 16> v_uint8x16;
353 /** @brief Sixteen 8-bit signed integer values */
354 typedef v_reg<schar, 16> v_int8x16;
355 /** @brief Eight 16-bit unsigned integer values */
356 typedef v_reg<ushort, 8> v_uint16x8;
357 /** @brief Eight 16-bit signed integer values */
358 typedef v_reg<short, 8> v_int16x8;
359 /** @brief Four 32-bit unsigned integer values */
360 typedef v_reg<unsigned, 4> v_uint32x4;
361 /** @brief Four 32-bit signed integer values */
362 typedef v_reg<int, 4> v_int32x4;
363 /** @brief Four 32-bit floating point values (single precision) */
364 typedef v_reg<float, 4> v_float32x4;
365 /** @brief Two 64-bit floating point values (double precision) */
366 typedef v_reg<double, 2> v_float64x2;
367 /** @brief Two 64-bit unsigned integer values */
368 typedef v_reg<uint64, 2> v_uint64x2;
369 /** @brief Two 64-bit signed integer values */
370 typedef v_reg<int64, 2> v_int64x2;
372 //! @brief Helper macro
373 //! @ingroup hal_intrin_impl
//! Defines the element-wise `operator bin_op` and the compound `operator bin_op=` for v_reg.
//! Every lane result `a.s[i] bin_op b.s[i]` is passed through saturate_cast<_Tp> before it is stored.
374 #define OPENCV_HAL_IMPL_BIN_OP(bin_op) \
375 template<typename _Tp, int n> inline v_reg<_Tp, n> \
376 operator bin_op (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
379 for( int i = 0; i < n; i++ ) \
380 c.s[i] = saturate_cast<_Tp>(a.s[i] bin_op b.s[i]); \
383 template<typename _Tp, int n> inline v_reg<_Tp, n>& \
384 operator bin_op##= (v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
386 for( int i = 0; i < n; i++ ) \
387 a.s[i] = saturate_cast<_Tp>(a.s[i] bin_op b.s[i]); \
391 /** @brief Add values
394 OPENCV_HAL_IMPL_BIN_OP(+)
396 /** @brief Subtract values
399 OPENCV_HAL_IMPL_BIN_OP(-)
401 /** @brief Multiply values
403 For 16- and 32-bit integer types and floating types. */
404 OPENCV_HAL_IMPL_BIN_OP(*)
406 /** @brief Divide values
408 For floating types only. */
409 OPENCV_HAL_IMPL_BIN_OP(/)
411 //! @brief Helper macro
412 //! @ingroup hal_intrin_impl
//! Defines the element-wise `operator bit_op` and the compound `operator bit_op=` for v_reg.
//! Each pair of lanes is converted with V_TypeTraits<_Tp>::reinterpret_int, combined with bit_op,
//! cast to the trait's int_type and converted back with V_TypeTraits<_Tp>::reinterpret_from_int.
413 #define OPENCV_HAL_IMPL_BIT_OP(bit_op) \
414 template<typename _Tp, int n> inline v_reg<_Tp, n> operator bit_op \
415 (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
418 typedef typename V_TypeTraits<_Tp>::int_type itype; \
419 for( int i = 0; i < n; i++ ) \
420 c.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int((itype)(V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) bit_op \
421 V_TypeTraits<_Tp>::reinterpret_int(b.s[i]))); \
424 template<typename _Tp, int n> inline v_reg<_Tp, n>& operator \
425 bit_op##= (v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
427 typedef typename V_TypeTraits<_Tp>::int_type itype; \
428 for( int i = 0; i < n; i++ ) \
429 a.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int((itype)(V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) bit_op \
430 V_TypeTraits<_Tp>::reinterpret_int(b.s[i]))); \
434 /** @brief Bitwise AND
436 Only for integer types. */
437 OPENCV_HAL_IMPL_BIT_OP(&)
439 /** @brief Bitwise OR
441 Only for integer types. */
442 OPENCV_HAL_IMPL_BIT_OP(|)
444 /** @brief Bitwise XOR
446 Only for integer types.*/
447 OPENCV_HAL_IMPL_BIT_OP(^)
449 /** @brief Bitwise NOT
451 Only for integer types.*/
452 template<typename _Tp, int n> inline v_reg<_Tp, n> operator ~ (const v_reg<_Tp, n>& a)
455 for( int i = 0; i < n; i++ )
456 c.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int(~V_TypeTraits<_Tp>::reinterpret_int(a.s[i]));
460 //! @brief Helper macro
461 //! @ingroup hal_intrin_impl
//! Defines `func`, a unary element-wise function template over v_reg: every output lane is
//! `cfunc` applied to the matching input lane. `_Tp2` is the lane type of the returned register.
462 #define OPENCV_HAL_IMPL_MATH_FUNC(func, cfunc, _Tp2) \
463 template<typename _Tp, int n> inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a) \
466 for( int i = 0; i < n; i++ ) \
467 c.s[i] = cfunc(a.s[i]); \
471 /** @brief Square root of elements
473 Only for floating point types.*/
474 OPENCV_HAL_IMPL_MATH_FUNC(v_sqrt, std::sqrt, _Tp)
477 OPENCV_HAL_IMPL_MATH_FUNC(v_sin, std::sin, _Tp)
478 OPENCV_HAL_IMPL_MATH_FUNC(v_cos, std::cos, _Tp)
479 OPENCV_HAL_IMPL_MATH_FUNC(v_exp, std::exp, _Tp)
480 OPENCV_HAL_IMPL_MATH_FUNC(v_log, std::log, _Tp)
483 /** @brief Absolute value of elements
485 Only for floating point types.*/
486 OPENCV_HAL_IMPL_MATH_FUNC(v_abs, (typename V_TypeTraits<_Tp>::abs_type)std::abs,
487 typename V_TypeTraits<_Tp>::abs_type)
489 /** @brief Round elements
491 Only for floating point types.*/
492 OPENCV_HAL_IMPL_MATH_FUNC(v_round, cvRound, int)
494 /** @brief Floor elements
496 Only for floating point types.*/
497 OPENCV_HAL_IMPL_MATH_FUNC(v_floor, cvFloor, int)
499 /** @brief Ceil elements
501 Only for floating point types.*/
502 OPENCV_HAL_IMPL_MATH_FUNC(v_ceil, cvCeil, int)
504 /** @brief Truncate elements
506 Only for floating point types.*/
507 OPENCV_HAL_IMPL_MATH_FUNC(v_trunc, int, int)
509 //! @brief Helper macro
510 //! @ingroup hal_intrin_impl
511 #define OPENCV_HAL_IMPL_MINMAX_FUNC(func, cfunc) \
512 template<typename _Tp, int n> inline v_reg<_Tp, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
515 for( int i = 0; i < n; i++ ) \
516 c.s[i] = cfunc(a.s[i], b.s[i]); \
520 //! @brief Helper macro
521 //! @ingroup hal_intrin_impl
522 #define OPENCV_HAL_IMPL_REDUCE_MINMAX_FUNC(func, cfunc) \
523 template<typename _Tp, int n> inline _Tp func(const v_reg<_Tp, n>& a) \
526 for( int i = 1; i < n; i++ ) \
527 c = cfunc(c, a.s[i]); \
531 /** @brief Choose min values for each pair
538 {min(A1,B1) min(A2,B2) ...}
540 For all types except 64-bit integer. */
541 OPENCV_HAL_IMPL_MINMAX_FUNC(v_min, std::min)
543 /** @brief Choose max values for each pair
550 {max(A1,B1) max(A2,B2) ...}
552 For all types except 64-bit integer. */
553 OPENCV_HAL_IMPL_MINMAX_FUNC(v_max, std::max)
555 /** @brief Find one min value
559 {A1 A2 A3 ...} => min(A1,A2,A3,...)
561 For 32-bit integer and 32-bit floating point types. */
562 OPENCV_HAL_IMPL_REDUCE_MINMAX_FUNC(v_reduce_min, std::min)
564 /** @brief Find one max value
568 {A1 A2 A3 ...} => max(A1,A2,A3,...)
570 For 32-bit integer and 32-bit floating point types. */
571 OPENCV_HAL_IMPL_REDUCE_MINMAX_FUNC(v_reduce_max, std::max)
//! @brief Per-lane minimum and maximum of two registers, computed in a single pass
//!
//! @param a first input register
//! @param b second input register
//! @param minval receives std::min(a.s[i], b.s[i]) for every lane i
//! @param maxval receives std::max(a.s[i], b.s[i]) for every lane i
574 template<typename _Tp, int n>
575 inline void v_minmax( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
576 v_reg<_Tp, n>& minval, v_reg<_Tp, n>& maxval )
578 for( int i = 0; i < n; i++ )
580 minval.s[i] = std::min(a.s[i], b.s[i]);
581 maxval.s[i] = std::max(a.s[i], b.s[i]);
586 //! @brief Helper macro
587 //! @ingroup hal_intrin_impl
//! Defines the element-wise `operator cmp_op` for v_reg. Each lane comparison yields a bool that
//! is converted to int and negated (true -> -1, false -> 0), cast to the trait's int_type and
//! stored through V_TypeTraits<_Tp>::reinterpret_from_int, so the result has the same lane type
//! as the inputs and a "true" lane holds the integer pattern of -1.
588 #define OPENCV_HAL_IMPL_CMP_OP(cmp_op) \
589 template<typename _Tp, int n> \
590 inline v_reg<_Tp, n> operator cmp_op(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
592 typedef typename V_TypeTraits<_Tp>::int_type itype; \
594 for( int i = 0; i < n; i++ ) \
595 c.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int((itype)-(int)(a.s[i] cmp_op b.s[i])); \
599 /** @brief Less-than comparison
601 For all types except 64-bit integer values. */
602 OPENCV_HAL_IMPL_CMP_OP(<)
604 /** @brief Greater-than comparison
606 For all types except 64-bit integer values. */
607 OPENCV_HAL_IMPL_CMP_OP(>)
609 /** @brief Less-than or equal comparison
611 For all types except 64-bit integer values. */
612 OPENCV_HAL_IMPL_CMP_OP(<=)
614 /** @brief Greater-than or equal comparison
616 For all types except 64-bit integer values. */
617 OPENCV_HAL_IMPL_CMP_OP(>=)
619 /** @brief Equal comparison
621 For all types except 64-bit integer values. */
622 OPENCV_HAL_IMPL_CMP_OP(==)
624 /** @brief Not equal comparison
626 For all types except 64-bit integer values. */
627 OPENCV_HAL_IMPL_CMP_OP(!=)
629 //! @brief Helper macro
630 //! @ingroup hal_intrin_impl
631 #define OPENCV_HAL_IMPL_ADD_SUB_OP(func, bin_op, cast_op, _Tp2) \
632 template<typename _Tp, int n> \
633 inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
635 typedef _Tp2 rtype; \
637 for( int i = 0; i < n; i++ ) \
638 c.s[i] = cast_op(a.s[i] bin_op b.s[i]); \
642 /** @brief Add values without saturation
644 For 8- and 16-bit integer values. */
645 OPENCV_HAL_IMPL_ADD_SUB_OP(v_add_wrap, +, (_Tp), _Tp)
647 /** @brief Subtract values without saturation
649 For 8- and 16-bit integer values. */
650 OPENCV_HAL_IMPL_ADD_SUB_OP(v_sub_wrap, -, (_Tp), _Tp)
//! Scalar helper: difference of two values, always subtracting the smaller from the larger
//! (a - b when a > b, otherwise b - a). The result has the same type T as the arguments.
653 template<typename T> inline T _absdiff(T a, T b)
655 return a > b ? a - b : b - a;
659 /** @brief Absolute difference
661 Returns \f$ |a - b| \f$ converted to corresponding unsigned type.
664 v_int32x4 a, b; // {1, 2, 3, 4} and {4, 3, 2, 1}
665 v_uint32x4 c = v_absdiff(a, b); // result is {3, 1, 1, 3}
667 For 8-, 16-, 32-bit integer source types. */
668 template<typename _Tp, int n>
669 inline v_reg<typename V_TypeTraits<_Tp>::abs_type, n> v_absdiff(const v_reg<_Tp, n>& a, const v_reg<_Tp, n> & b)
671 typedef typename V_TypeTraits<_Tp>::abs_type rtype;
// For signed lane types the top bit of each value is flipped (XOR with mask) before subtracting,
// so the signed ordering carries over to the rtype values; for unsigned lane types mask is 0 and
// the values pass through unchanged.
// NOTE(review): the literal `1` is an int, so when rtype is 32 bits wide the shift is `1 << 31`,
// which overflows a signed int - consider `(rtype)1 << ...`; confirm against supported compilers.
673 const rtype mask = std::numeric_limits<_Tp>::is_signed ? (1 << (sizeof(rtype)*8 - 1)) : 0;
674 for( int i = 0; i < n; i++ )
676 rtype ua = a.s[i] ^ mask;
677 rtype ub = b.s[i] ^ mask;
// Both operands now have type rtype, so the helper subtracts the smaller from the larger.
678 c.s[i] = _absdiff(ua, ub);
685 For 32-bit floating point values */
686 inline v_float32x4 v_absdiff(const v_float32x4& a, const v_float32x4& b)
689 for( int i = 0; i < c.nlanes; i++ )
690 c.s[i] = _absdiff(a.s[i], b.s[i]);
696 For 64-bit floating point values */
697 inline v_float64x2 v_absdiff(const v_float64x2& a, const v_float64x2& b)
700 for( int i = 0; i < c.nlanes; i++ )
701 c.s[i] = _absdiff(a.s[i], b.s[i]);
705 /** @brief Inverse square root
707 Returns \f$ 1/sqrt(a) \f$
708 For floating point types only. */
709 template<typename _Tp, int n>
710 inline v_reg<_Tp, n> v_invsqrt(const v_reg<_Tp, n>& a)
713 for( int i = 0; i < n; i++ )
714 c.s[i] = 1.f/std::sqrt(a.s[i]);
720 Returns \f$ sqrt(a^2 + b^2) \f$
721 For floating point types only. */
722 template<typename _Tp, int n>
723 inline v_reg<_Tp, n> v_magnitude(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
726 for( int i = 0; i < n; i++ )
727 c.s[i] = std::sqrt(a.s[i]*a.s[i] + b.s[i]*b.s[i]);
731 /** @brief Square of the magnitude
733 Returns \f$ a^2 + b^2 \f$
734 For floating point types only. */
735 template<typename _Tp, int n>
736 inline v_reg<_Tp, n> v_sqr_magnitude(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
739 for( int i = 0; i < n; i++ )
740 c.s[i] = a.s[i]*a.s[i] + b.s[i]*b.s[i];
744 /** @brief Multiply and add
746 Returns \f$ a*b + c \f$
747 For floating point types only. */
748 template<typename _Tp, int n>
749 inline v_reg<_Tp, n> v_muladd(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
750 const v_reg<_Tp, n>& c)
753 for( int i = 0; i < n; i++ )
754 d.s[i] = a.s[i]*b.s[i] + c.s[i];
758 /** @brief Dot product of elements
760 Multiply values in two registers and sum adjacent result pairs.
763 {A1 A2 ...} // 16-bit
764 x {B1 B2 ...} // 16-bit
766 {A1B1+A2B2 ...} // 32-bit
768 Implemented only for 16-bit signed source type (v_int16x8).
770 template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>
771 v_dotprod(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
773 typedef typename V_TypeTraits<_Tp>::w_type w_type;
774 v_reg<w_type, n/2> c;
775 for( int i = 0; i < (n/2); i++ )
776 c.s[i] = (w_type)a.s[i*2]*b.s[i*2] + (w_type)a.s[i*2+1]*b.s[i*2+1];
780 /** @brief Multiply and expand
782 Multiply values in two registers and store results in two registers with wider pack type.
786 x {E F G H} // 32-bit
793 v_uint32x4 a, b; // {1,2,3,4} and {2,2,2,2}
794 v_uint64x2 c, d; // results
795 v_mul_expand(a, b, c, d); // c, d = {2,4}, {6, 8}
797 Implemented only for 16- and unsigned 32-bit source types (v_int16x8, v_uint16x8, v_uint32x4).
799 template<typename _Tp, int n> inline void v_mul_expand(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
800 v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& c,
801 v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& d)
803 typedef typename V_TypeTraits<_Tp>::w_type w_type;
804 for( int i = 0; i < (n/2); i++ )
806 c.s[i] = (w_type)a.s[i]*b.s[i];
807 d.s[i] = (w_type)a.s[i+(n/2)]*b.s[i+(n/2)];
//! @brief Sum adjacent lane pairs
//!
//! Writes c.s[i] = a.s[2*i] + a.s[2*i+1] for the n/2 output lanes.
//! @param a input register with n lanes
//! @param c output register with n/2 lanes of type V_TypeTraits<_Tp>::w_type
812 template<typename _Tp, int n> inline void v_hsum(const v_reg<_Tp, n>& a,
813 v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& c)
815 typedef typename V_TypeTraits<_Tp>::w_type w_type;
816 for( int i = 0; i < (n/2); i++ )
// The first operand is cast to w_type so the addition is carried out in that type.
818 c.s[i] = (w_type)a.s[i*2] + a.s[i*2+1];
823 //! @brief Helper macro
824 //! @ingroup hal_intrin_impl
//! Defines `operator shift_op` taking a register and an int shift count: every lane is shifted
//! by the same amount `imm` and the result is cast back to the lane type _Tp.
825 #define OPENCV_HAL_IMPL_SHIFT_OP(shift_op) \
826 template<typename _Tp, int n> inline v_reg<_Tp, n> operator shift_op(const v_reg<_Tp, n>& a, int imm) \
829 for( int i = 0; i < n; i++ ) \
830 c.s[i] = (_Tp)(a.s[i] shift_op imm); \
834 /** @brief Bitwise shift left
836 For 16-, 32- and 64-bit integer values. */
837 OPENCV_HAL_IMPL_SHIFT_OP(<<)
839 /** @brief Bitwise shift right
841 For 16-, 32- and 64-bit integer values. */
842 OPENCV_HAL_IMPL_SHIFT_OP(>>)
844 /** @brief Sum packed values
848 {A1 A2 A3 ...} => sum{A1,A2,A3,...}
850 For 32-bit integer and 32-bit floating point types.*/
851 template<typename _Tp, int n> inline typename V_TypeTraits<_Tp>::sum_type v_reduce_sum(const v_reg<_Tp, n>& a)
853 typename V_TypeTraits<_Tp>::sum_type c = a.s[0];
854 for( int i = 1; i < n; i++ )
859 /** @brief Get negative values mask
861 Returned value is a bit mask with bits set to 1 on places corresponding to negative packed values indexes.
864 v_int32x4 r; // set to {-1, -1, 1, 1}
865 int mask = v_signmask(r); // mask = 3 <== 00000000 00000000 00000000 00000011
867 For all types except 64-bit. */
868 template<typename _Tp, int n> inline int v_signmask(const v_reg<_Tp, n>& a)
871 for( int i = 0; i < n; i++ )
872 mask |= (V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) < 0) << i;
876 /** @brief Check if all packed values are less than zero
878 Unsigned values will be cast to signed: `uchar 254 => char -2`.
879 For all types except 64-bit. */
880 template<typename _Tp, int n> inline bool v_check_all(const v_reg<_Tp, n>& a)
882 for( int i = 0; i < n; i++ )
883 if( V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) >= 0 )
888 /** @brief Check if any of packed values is less than zero
890 Unsigned values will be cast to signed: `uchar 254 => char -2`.
891 For all types except 64-bit. */
892 template<typename _Tp, int n> inline bool v_check_any(const v_reg<_Tp, n>& a)
894 for( int i = 0; i < n; i++ )
895 if( V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) < 0 )
900 /** @brief Bitwise select
902 Return value will be built by combining values a and b using the following scheme:
903 If the i-th bit in _mask_ is 1
904 select i-th bit from _a_
906 select i-th bit from _b_ */
907 template<typename _Tp, int n> inline v_reg<_Tp, n> v_select(const v_reg<_Tp, n>& mask,
908 const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
910 typedef V_TypeTraits<_Tp> Traits;
911 typedef typename Traits::int_type int_type;
913 for( int i = 0; i < n; i++ )
915 int_type m = Traits::reinterpret_int(mask.s[i]);
916 c.s[i] = Traits::reinterpret_from_int((Traits::reinterpret_int(a.s[i]) & m)
917 | (Traits::reinterpret_int(b.s[i]) & ~m));
922 /** @brief Expand values to the wider pack type
924 Copy contents of register to two registers with 2x wider pack type.
927 int32x4 int64x2 int64x2
928 {A B C D} ==> {A B} , {C D}
930 template<typename _Tp, int n> inline void v_expand(const v_reg<_Tp, n>& a,
931 v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& b0,
932 v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& b1)
934 for( int i = 0; i < (n/2); i++ )
937 b1.s[i] = a.s[i+(n/2)];
//! Builds a register with the same lane count whose lane type is V_TypeTraits<_Tp>::int_type;
//! each lane is V_TypeTraits<_Tp>::reinterpret_int applied to the matching input lane.
942 template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::int_type, n>
943 v_reinterpret_as_int(const v_reg<_Tp, n>& a)
945 v_reg<typename V_TypeTraits<_Tp>::int_type, n> c;
946 for( int i = 0; i < n; i++ )
947 c.s[i] = V_TypeTraits<_Tp>::reinterpret_int(a.s[i]);
//! Builds a register with the same lane count whose lane type is V_TypeTraits<_Tp>::uint_type;
//! each lane is V_TypeTraits<_Tp>::reinterpret_uint applied to the matching input lane.
951 template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::uint_type, n>
952 v_reinterpret_as_uint(const v_reg<_Tp, n>& a)
954 v_reg<typename V_TypeTraits<_Tp>::uint_type, n> c;
955 for( int i = 0; i < n; i++ )
956 c.s[i] = V_TypeTraits<_Tp>::reinterpret_uint(a.s[i]);
961 /** @brief Interleave two vectors
968 {A1 B1 A2 B2} and {A3 B3 A4 B4}
970 For all types except 64-bit.
972 template<typename _Tp, int n> inline void v_zip( const v_reg<_Tp, n>& a0, const v_reg<_Tp, n>& a1,
973 v_reg<_Tp, n>& b0, v_reg<_Tp, n>& b1 )
976 for( i = 0; i < n/2; i++ )
979 b0.s[i*2+1] = a1.s[i];
983 b1.s[i*2-n] = a0.s[i];
984 b1.s[i*2-n+1] = a1.s[i];
988 /** @brief Load register contents from memory
990 @param ptr pointer to memory block with data
991 @return register object
993 @note Returned type will be detected from passed pointer type, for example uchar ==> cv::v_uint8x16, int ==> cv::v_int32x4, etc.
995 template<typename _Tp>
996 inline v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes> v_load(const _Tp* ptr)
998 return v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes>(ptr);
1001 /** @brief Load register contents from memory (aligned)
1003 similar to cv::v_load, but source memory block should be aligned (to 16-byte boundary)
1005 template<typename _Tp>
1006 inline v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes> v_load_aligned(const _Tp* ptr)
// Built through the same pointer constructor that v_load uses; this implementation makes no
// separate use of the alignment of ptr.
1008 return v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes>(ptr);
1011 /** @brief Load register contents from two memory blocks
1013 @param loptr memory block containing data for first half (0..n/2)
1014 @param hiptr memory block containing data for second half (n/2..n)
1017 int lo[2] = { 1, 2 }, hi[2] = { 3, 4 };
1018 v_int32x4 r = v_load_halves(lo, hi);
1021 template<typename _Tp>
1022 inline v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes> v_load_halves(const _Tp* loptr, const _Tp* hiptr)
1024 v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes> c;
1025 for( int i = 0; i < c.nlanes/2; i++ )
1028 c.s[i+c.nlanes/2] = hiptr[i];
1033 /** @brief Load register contents from memory with double expand
1035 Same as cv::v_load, but result pack type will be 2x wider than memory type.
1038 short buf[4] = {1, 2, 3, 4}; // type is int16
1039 v_int32x4 r = v_load_expand(buf); // r = {1, 2, 3, 4} - type is int32
1041 For 8-, 16-, 32-bit integer source types. */
1042 template<typename _Tp>
1043 inline v_reg<typename V_TypeTraits<_Tp>::w_type, V_SIMD128Traits<_Tp>::nlanes / 2>
1044 v_load_expand(const _Tp* ptr)
1046 typedef typename V_TypeTraits<_Tp>::w_type w_type;
1047 v_reg<w_type, V_SIMD128Traits<w_type>::nlanes> c;
1048 for( int i = 0; i < c.nlanes; i++ )
1055 /** @brief Load register contents from memory with quad expand
1057 Same as cv::v_load_expand, but result type is 4 times wider than source.
1059 char buf[4] = {1, 2, 3, 4}; // type is int8
1060 v_int32x4 r = v_load_expand_q(buf); // r = {1, 2, 3, 4} - type is int32
1062 For 8-bit integer source types. */
1063 template<typename _Tp>
1064 inline v_reg<typename V_TypeTraits<_Tp>::q_type, V_SIMD128Traits<_Tp>::nlanes / 4>
1065 v_load_expand_q(const _Tp* ptr)
1067 typedef typename V_TypeTraits<_Tp>::q_type q_type;
1068 v_reg<q_type, V_SIMD128Traits<q_type>::nlanes> c;
1069 for( int i = 0; i < c.nlanes; i++ )
1076 /** @brief Load and deinterleave (3 channels)
1078 Load data from memory, deinterleave it and store to 3 registers.
1081 {A1 B1 C1 A2 B2 C2 ...} ==> {A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...}
1083 For all types except 64-bit. */
1084 template<typename _Tp, int n> inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a,
1085 v_reg<_Tp, n>& b, v_reg<_Tp, n>& c)
1088 for( i = i3 = 0; i < n; i++, i3 += 3 )
1096 /** @brief Load and deinterleave (4 channels)
1098 Load data from memory, deinterleave it and store to 4 registers.
1101 {A1 B1 C1 D1 A2 B2 C2 D2 ...} ==> {A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...}, {D1 D2 ...}
1103 For all types except 64-bit. */
1104 template<typename _Tp, int n>
1105 inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a,
1106 v_reg<_Tp, n>& b, v_reg<_Tp, n>& c,
1110 for( i = i4 = 0; i < n; i++, i4 += 4 )
1119 /** @brief Interleave and store (3 channels)
1121 Interleave and store data from 3 registers to memory.
1124 {A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...} ==> {A1 B1 C1 A2 B2 C2 ...}
1126 For all types except 64-bit. */
1127 template<typename _Tp, int n>
1128 inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a,
1129 const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c)
1132 for( i = i3 = 0; i < n; i++, i3 += 3 )
1140 /** @brief Interleave and store (4 channels)
1142 Interleave and store data from 4 registers to memory.
1145 {A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...}, {D1 D2 ...} ==> {A1 B1 C1 D1 A2 B2 C2 D2 ...}
1147 For all types except 64-bit. */
1148 template<typename _Tp, int n> inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a,
1149 const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c,
1150 const v_reg<_Tp, n>& d)
1153 for( i = i4 = 0; i < n; i++, i4 += 4 )
1162 /** @brief Store data to memory
1164 Store register contents to memory.
1167 REG {A B C D} ==> MEM {A B C D}
1169 Pointer can be unaligned. */
1170 template<typename _Tp, int n>
1171 inline void v_store(_Tp* ptr, const v_reg<_Tp, n>& a)
1173 for( int i = 0; i < n; i++ )
1177 /** @brief Store data to memory (lower half)
1179 Store lower half of register contents to memory.
1182 REG {A B C D} ==> MEM {A B}
1184 template<typename _Tp, int n>
1185 inline void v_store_low(_Tp* ptr, const v_reg<_Tp, n>& a)
1187 for( int i = 0; i < (n/2); i++ )
1191 /** @brief Store data to memory (higher half)
1193 Store higher half of register contents to memory.
1196 REG {A B C D} ==> MEM {C D}
1198 template<typename _Tp, int n>
1199 inline void v_store_high(_Tp* ptr, const v_reg<_Tp, n>& a)
1201 for( int i = 0; i < (n/2); i++ )
1202 ptr[i] = a.s[i+(n/2)];
1205 /** @brief Store data to memory (aligned)
1207 Store register contents to memory.
1210 REG {A B C D} ==> MEM {A B C D}
1212 Pointer __should__ be aligned by 16-byte boundary. */
1213 template<typename _Tp, int n>
1214 inline void v_store_aligned(_Tp* ptr, const v_reg<_Tp, n>& a)
1216 for( int i = 0; i < n; i++ )
1220 /** @brief Combine vector from first elements of two vectors
1229 For all types except 64-bit. */
1230 template<typename _Tp, int n>
1231 inline v_reg<_Tp, n> v_combine_low(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
1234 for( int i = 0; i < (n/2); i++ )
1237 c.s[i+(n/2)] = b.s[i];
1242 /** @brief Combine vector from last elements of two vectors
1251 For all types except 64-bit. */
1252 template<typename _Tp, int n>
1253 inline v_reg<_Tp, n> v_combine_high(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
1256 for( int i = 0; i < (n/2); i++ )
1258 c.s[i] = a.s[i+(n/2)];
1259 c.s[i+(n/2)] = b.s[i+(n/2)];
1264 /** @brief Combine two vectors from lower and higher parts of two other vectors
1267 low = cv::v_combine_low(a, b);
1268 high = cv::v_combine_high(a, b);
1270 template<typename _Tp, int n>
1271 inline void v_recombine(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
1272 v_reg<_Tp, n>& low, v_reg<_Tp, n>& high)
1274 for( int i = 0; i < (n/2); i++ )
1277 low.s[i+(n/2)] = b.s[i];
1278 high.s[i] = a.s[i+(n/2)];
1279 high.s[i+(n/2)] = b.s[i+(n/2)];
1283 /** @brief Vector extract
1289 ========================
1290 shift = 1 {A2 A3 A4 B1}
1291 shift = 2 {A3 A4 B1 B2}
1292 shift = 3 {A4 B1 B2 B3}
1294 Restriction: 0 <= shift < nlanes
1299 c = v_extract<2>(a, b);
1301 For integer types only. */
1302 template<int s, typename _Tp, int n>
1303 inline v_reg<_Tp, n> v_extract(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
1306 const int shift = n - s;
1308 for (; i < shift; ++i)
1311 r.s[i] = b.s[i-shift];
1317 Rounds each value. Input type is float vector ==> output type is int vector.*/
1318 template<int n> inline v_reg<int, n> v_round(const v_reg<float, n>& a)
1321 for( int i = 0; i < n; i++ )
1322 c.s[i] = cvRound(a.s[i]);
1328 Floor each value. Input type is float vector ==> output type is int vector.*/
1329 template<int n> inline v_reg<int, n> v_floor(const v_reg<float, n>& a)
1332 for( int i = 0; i < n; i++ )
1333 c.s[i] = cvFloor(a.s[i]);
1339 Ceil each value. Input type is float vector ==> output type is int vector.*/
1340 template<int n> inline v_reg<int, n> v_ceil(const v_reg<float, n>& a)
1343 for( int i = 0; i < n; i++ )
1344 c.s[i] = cvCeil(a.s[i]);
1350 Truncate each value. Input type is float vector ==> output type is int vector.*/
1351 template<int n> inline v_reg<int, n> v_trunc(const v_reg<float, n>& a)
1354 for( int i = 0; i < n; i++ )
1355 c.s[i] = (int)(a.s[i]);
// v_round, double-precision overload: takes n double lanes and returns an int
// register with 2*n lanes. The visible loop rounds lane i of `a` with cvRound()
// into result lane i.
// NOTE(review): the declaration of `c`, the handling of result lanes [n, 2n) and
// the return statement are on lines not shown in this listing - confirm against
// the full header.
1360 template<int n> inline v_reg<int, n*2> v_round(const v_reg<double, n>& a)
1363 for( int i = 0; i < n; i++ )
1365 c.s[i] = cvRound(a.s[i]);
// v_floor, double-precision overload: takes n double lanes and returns an int
// register with 2*n lanes. The visible loop applies cvFloor() to lane i of `a`
// and stores it in result lane i.
// NOTE(review): the declaration of `c`, the handling of result lanes [n, 2n) and
// the return statement are on lines not shown in this listing - confirm against
// the full header.
1372 template<int n> inline v_reg<int, n*2> v_floor(const v_reg<double, n>& a)
1375 for( int i = 0; i < n; i++ )
1377 c.s[i] = cvFloor(a.s[i]);
// v_ceil, double-precision overload: takes n double lanes and returns an int
// register with 2*n lanes. The visible loop applies cvCeil() to lane i of `a`
// and stores it in result lane i.
// NOTE(review): the declaration of `c`, the handling of result lanes [n, 2n) and
// the return statement are on lines not shown in this listing - confirm against
// the full header.
1384 template<int n> inline v_reg<int, n*2> v_ceil(const v_reg<double, n>& a)
1387 for( int i = 0; i < n; i++ )
1389 c.s[i] = cvCeil(a.s[i]);
1396 template<int n> inline v_reg<int, n*2> v_trunc(const v_reg<double, n>& a)
1399 for( int i = 0; i < n; i++ )
1401 c.s[i] = cvCeil(a.s[i]);
1407 /** @brief Convert to float
1409 Supported input type is cv::v_int32x4. */
1410 template<int n> inline v_reg<float, n> v_cvt_f32(const v_reg<int, n>& a)
1413 for( int i = 0; i < n; i++ )
1414 c.s[i] = (float)a.s[i];
1418 /** @brief Convert to double
1420 Supported input type is cv::v_int32x4. */
1421 template<int n> inline v_reg<double, n> v_cvt_f64(const v_reg<int, n*2>& a)
1424 for( int i = 0; i < n; i++ )
1425 c.s[i] = (double)a.s[i];
1429 /** @brief Convert to double
1431 Supported input type is cv::v_float32x4. */
1432 template<int n> inline v_reg<double, n> v_cvt_f64(const v_reg<float, n*2>& a)
1435 for( int i = 0; i < n; i++ )
1436 c.s[i] = (double)a.s[i];
1440 /** @brief Transpose 4x4 matrix
1455 template<typename _Tp>
1456 inline void v_transpose4x4( v_reg<_Tp, 4>& a0, const v_reg<_Tp, 4>& a1,
1457 const v_reg<_Tp, 4>& a2, const v_reg<_Tp, 4>& a3,
1458 v_reg<_Tp, 4>& b0, v_reg<_Tp, 4>& b1,
1459 v_reg<_Tp, 4>& b2, v_reg<_Tp, 4>& b3 )
1461 b0 = v_reg<_Tp, 4>(a0.s[0], a1.s[0], a2.s[0], a3.s[0]);
1462 b1 = v_reg<_Tp, 4>(a0.s[1], a1.s[1], a2.s[1], a3.s[1]);
1463 b2 = v_reg<_Tp, 4>(a0.s[2], a1.s[2], a2.s[2], a3.s[2]);
1464 b3 = v_reg<_Tp, 4>(a0.s[3], a1.s[3], a2.s[3], a3.s[3]);
1467 //! @brief Helper macro
1468 //! @ingroup hal_intrin_impl
1469 #define OPENCV_HAL_IMPL_C_INIT_ZERO(_Tpvec, _Tp, suffix) \
1470 inline _Tpvec v_setzero_##suffix() { return _Tpvec::zero(); }
1472 //! @name Init with zero
1474 //! @brief Create new vector with zero elements
1475 OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint8x16, uchar, u8)
1476 OPENCV_HAL_IMPL_C_INIT_ZERO(v_int8x16, schar, s8)
1477 OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint16x8, ushort, u16)
1478 OPENCV_HAL_IMPL_C_INIT_ZERO(v_int16x8, short, s16)
1479 OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint32x4, unsigned, u32)
1480 OPENCV_HAL_IMPL_C_INIT_ZERO(v_int32x4, int, s32)
1481 OPENCV_HAL_IMPL_C_INIT_ZERO(v_float32x4, float, f32)
1482 OPENCV_HAL_IMPL_C_INIT_ZERO(v_float64x2, double, f64)
1483 OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint64x2, uint64, u64)
1484 OPENCV_HAL_IMPL_C_INIT_ZERO(v_int64x2, int64, s64)
1487 //! @brief Helper macro
1488 //! @ingroup hal_intrin_impl
1489 #define OPENCV_HAL_IMPL_C_INIT_VAL(_Tpvec, _Tp, suffix) \
1490 inline _Tpvec v_setall_##suffix(_Tp val) { return _Tpvec::all(val); }
1492 //! @name Init with value
1494 //! @brief Create new vector with elements set to a specific value
1495 OPENCV_HAL_IMPL_C_INIT_VAL(v_uint8x16, uchar, u8)
1496 OPENCV_HAL_IMPL_C_INIT_VAL(v_int8x16, schar, s8)
1497 OPENCV_HAL_IMPL_C_INIT_VAL(v_uint16x8, ushort, u16)
1498 OPENCV_HAL_IMPL_C_INIT_VAL(v_int16x8, short, s16)
1499 OPENCV_HAL_IMPL_C_INIT_VAL(v_uint32x4, unsigned, u32)
1500 OPENCV_HAL_IMPL_C_INIT_VAL(v_int32x4, int, s32)
1501 OPENCV_HAL_IMPL_C_INIT_VAL(v_float32x4, float, f32)
1502 OPENCV_HAL_IMPL_C_INIT_VAL(v_float64x2, double, f64)
1503 OPENCV_HAL_IMPL_C_INIT_VAL(v_uint64x2, uint64, u64)
1504 OPENCV_HAL_IMPL_C_INIT_VAL(v_int64x2, int64, s64)
1507 //! @brief Helper macro
1508 //! @ingroup hal_intrin_impl
1509 #define OPENCV_HAL_IMPL_C_REINTERPRET(_Tpvec, _Tp, suffix) \
1510 template<typename _Tp0, int n0> inline _Tpvec \
1511 v_reinterpret_as_##suffix(const v_reg<_Tp0, n0>& a) \
1512 { return a.template reinterpret_as<_Tp, _Tpvec::nlanes>(); }
1514 //! @name Reinterpret
1516 //! @brief Convert vector to different type without modifying underlying data.
1517 OPENCV_HAL_IMPL_C_REINTERPRET(v_uint8x16, uchar, u8)
1518 OPENCV_HAL_IMPL_C_REINTERPRET(v_int8x16, schar, s8)
1519 OPENCV_HAL_IMPL_C_REINTERPRET(v_uint16x8, ushort, u16)
1520 OPENCV_HAL_IMPL_C_REINTERPRET(v_int16x8, short, s16)
1521 OPENCV_HAL_IMPL_C_REINTERPRET(v_uint32x4, unsigned, u32)
1522 OPENCV_HAL_IMPL_C_REINTERPRET(v_int32x4, int, s32)
1523 OPENCV_HAL_IMPL_C_REINTERPRET(v_float32x4, float, f32)
1524 OPENCV_HAL_IMPL_C_REINTERPRET(v_float64x2, double, f64)
1525 OPENCV_HAL_IMPL_C_REINTERPRET(v_uint64x2, uint64, u64)
1526 OPENCV_HAL_IMPL_C_REINTERPRET(v_int64x2, int64, s64)
// Each instantiation below declares template<int n> v_shl(a) for one integer
// vector type; the shift count n is a compile-time template argument.
// NOTE(review): the macro body is on a line not shown in this listing - confirm
// the per-lane semantics against the full header.
1529 //! @brief Helper macro: declares v_shl<n>() for the vector type _Tpvec
1530 //! @ingroup hal_intrin_impl
1531 #define OPENCV_HAL_IMPL_C_SHIFTL(_Tpvec, _Tp) \
1532 template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
1535 //! @name Left shift
1537 //! @brief Shift left (instantiated for 16-, 32- and 64-bit integer vectors)
1538 OPENCV_HAL_IMPL_C_SHIFTL(v_uint16x8, ushort)
1539 OPENCV_HAL_IMPL_C_SHIFTL(v_int16x8, short)
1540 OPENCV_HAL_IMPL_C_SHIFTL(v_uint32x4, unsigned)
1541 OPENCV_HAL_IMPL_C_SHIFTL(v_int32x4, int)
1542 OPENCV_HAL_IMPL_C_SHIFTL(v_uint64x2, uint64)
1543 OPENCV_HAL_IMPL_C_SHIFTL(v_int64x2, int64)
// Each instantiation below declares template<int n> v_shr(a) for one integer
// vector type; the shift count n is a compile-time template argument.
// NOTE(review): the macro body is on a line not shown in this listing - confirm
// the per-lane semantics against the full header.
1546 //! @brief Helper macro: declares v_shr<n>() for the vector type _Tpvec
1547 //! @ingroup hal_intrin_impl
1548 #define OPENCV_HAL_IMPL_C_SHIFTR(_Tpvec, _Tp) \
1549 template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
1552 //! @name Right shift
1554 //! @brief Shift right (instantiated for 16-, 32- and 64-bit integer vectors)
1555 OPENCV_HAL_IMPL_C_SHIFTR(v_uint16x8, ushort)
1556 OPENCV_HAL_IMPL_C_SHIFTR(v_int16x8, short)
1557 OPENCV_HAL_IMPL_C_SHIFTR(v_uint32x4, unsigned)
1558 OPENCV_HAL_IMPL_C_SHIFTR(v_int32x4, int)
1559 OPENCV_HAL_IMPL_C_SHIFTR(v_uint64x2, uint64)
1560 OPENCV_HAL_IMPL_C_SHIFTR(v_int64x2, int64)
1563 //! @brief Helper macro
1564 //! @ingroup hal_intrin_impl
1565 #define OPENCV_HAL_IMPL_C_RSHIFTR(_Tpvec, _Tp) \
1566 template<int n> inline _Tpvec v_rshr(const _Tpvec& a) \
1569 for( int i = 0; i < _Tpvec::nlanes; i++ ) \
1570 c.s[i] = (_Tp)((a.s[i] + ((_Tp)1 << (n - 1))) >> n); \
1574 //! @name Rounding shift
1576 //! @brief Rounding shift right
1577 OPENCV_HAL_IMPL_C_RSHIFTR(v_uint16x8, ushort)
1578 OPENCV_HAL_IMPL_C_RSHIFTR(v_int16x8, short)
1579 OPENCV_HAL_IMPL_C_RSHIFTR(v_uint32x4, unsigned)
1580 OPENCV_HAL_IMPL_C_RSHIFTR(v_int32x4, int)
1581 OPENCV_HAL_IMPL_C_RSHIFTR(v_uint64x2, uint64)
1582 OPENCV_HAL_IMPL_C_RSHIFTR(v_int64x2, int64)
1585 //! @brief Helper macro
1586 //! @ingroup hal_intrin_impl
1587 #define OPENCV_HAL_IMPL_C_PACK(_Tpvec, _Tpnvec, _Tpn, pack_suffix) \
1588 inline _Tpnvec v_##pack_suffix(const _Tpvec& a, const _Tpvec& b) \
1591 for( int i = 0; i < _Tpvec::nlanes; i++ ) \
1593 c.s[i] = saturate_cast<_Tpn>(a.s[i]); \
1594 c.s[i+_Tpvec::nlanes] = saturate_cast<_Tpn>(b.s[i]); \
1601 //! @brief Pack values from two vectors to one
1603 //! Return vector type have twice more elements than input vector types. Variant with _u_ suffix also
1604 //! converts to corresponding unsigned type.
1606 //! - pack: for 16-, 32- and 64-bit integer input types
1607 //! - pack_u: for 16- and 32-bit signed integer input types
1608 OPENCV_HAL_IMPL_C_PACK(v_uint16x8, v_uint8x16, uchar, pack)
1609 OPENCV_HAL_IMPL_C_PACK(v_int16x8, v_int8x16, schar, pack)
1610 OPENCV_HAL_IMPL_C_PACK(v_uint32x4, v_uint16x8, ushort, pack)
1611 OPENCV_HAL_IMPL_C_PACK(v_int32x4, v_int16x8, short, pack)
1612 OPENCV_HAL_IMPL_C_PACK(v_uint64x2, v_uint32x4, unsigned, pack)
1613 OPENCV_HAL_IMPL_C_PACK(v_int64x2, v_int32x4, int, pack)
1614 OPENCV_HAL_IMPL_C_PACK(v_int16x8, v_uint8x16, uchar, pack_u)
1615 OPENCV_HAL_IMPL_C_PACK(v_int32x4, v_uint16x8, ushort, pack_u)
1618 //! @brief Helper macro
1619 //! @ingroup hal_intrin_impl
1620 #define OPENCV_HAL_IMPL_C_RSHR_PACK(_Tpvec, _Tp, _Tpnvec, _Tpn, pack_suffix) \
1621 template<int n> inline _Tpnvec v_rshr_##pack_suffix(const _Tpvec& a, const _Tpvec& b) \
1624 for( int i = 0; i < _Tpvec::nlanes; i++ ) \
1626 c.s[i] = saturate_cast<_Tpn>((a.s[i] + ((_Tp)1 << (n - 1))) >> n); \
1627 c.s[i+_Tpvec::nlanes] = saturate_cast<_Tpn>((b.s[i] + ((_Tp)1 << (n - 1))) >> n); \
1632 //! @name Pack with rounding shift
1634 //! @brief Pack values from two vectors to one with rounding shift
1636 //! Values from the input vectors will be shifted right by _n_ bits with rounding, converted to narrower
1637 //! type and returned in the result vector. Variant with _u_ suffix converts to unsigned type.
1639 //! - pack: for 16-, 32- and 64-bit integer input types
1640 //! - pack_u: for 16- and 32-bit signed integer input types
1641 OPENCV_HAL_IMPL_C_RSHR_PACK(v_uint16x8, ushort, v_uint8x16, uchar, pack)
1642 OPENCV_HAL_IMPL_C_RSHR_PACK(v_int16x8, short, v_int8x16, schar, pack)
1643 OPENCV_HAL_IMPL_C_RSHR_PACK(v_uint32x4, unsigned, v_uint16x8, ushort, pack)
1644 OPENCV_HAL_IMPL_C_RSHR_PACK(v_int32x4, int, v_int16x8, short, pack)
1645 OPENCV_HAL_IMPL_C_RSHR_PACK(v_uint64x2, uint64, v_uint32x4, unsigned, pack)
1646 OPENCV_HAL_IMPL_C_RSHR_PACK(v_int64x2, int64, v_int32x4, int, pack)
1647 OPENCV_HAL_IMPL_C_RSHR_PACK(v_int16x8, short, v_uint8x16, uchar, pack_u)
1648 OPENCV_HAL_IMPL_C_RSHR_PACK(v_int32x4, int, v_uint16x8, ushort, pack_u)
1651 //! @brief Helper macro
1652 //! @ingroup hal_intrin_impl
1653 #define OPENCV_HAL_IMPL_C_PACK_STORE(_Tpvec, _Tp, _Tpnvec, _Tpn, pack_suffix) \
1654 inline void v_##pack_suffix##_store(_Tpn* ptr, const _Tpvec& a) \
1656 for( int i = 0; i < _Tpvec::nlanes; i++ ) \
1657 ptr[i] = saturate_cast<_Tpn>(a.s[i]); \
1660 //! @name Pack and store
1662 //! @brief Store values from the input vector into memory with pack
1664 //! Values will be stored into memory with saturating conversion to narrower type.
1665 //! Variant with _u_ suffix converts to corresponding unsigned type.
1667 //! - pack: for 16-, 32- and 64-bit integer input types
1668 //! - pack_u: for 16- and 32-bit signed integer input types
1669 OPENCV_HAL_IMPL_C_PACK_STORE(v_uint16x8, ushort, v_uint8x16, uchar, pack)
1670 OPENCV_HAL_IMPL_C_PACK_STORE(v_int16x8, short, v_int8x16, schar, pack)
1671 OPENCV_HAL_IMPL_C_PACK_STORE(v_uint32x4, unsigned, v_uint16x8, ushort, pack)
1672 OPENCV_HAL_IMPL_C_PACK_STORE(v_int32x4, int, v_int16x8, short, pack)
1673 OPENCV_HAL_IMPL_C_PACK_STORE(v_uint64x2, uint64, v_uint32x4, unsigned, pack)
1674 OPENCV_HAL_IMPL_C_PACK_STORE(v_int64x2, int64, v_int32x4, int, pack)
1675 OPENCV_HAL_IMPL_C_PACK_STORE(v_int16x8, short, v_uint8x16, uchar, pack_u)
1676 OPENCV_HAL_IMPL_C_PACK_STORE(v_int32x4, int, v_uint16x8, ushort, pack_u)
1679 //! @brief Helper macro
1680 //! @ingroup hal_intrin_impl
1681 #define OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(_Tpvec, _Tp, _Tpnvec, _Tpn, pack_suffix) \
1682 template<int n> inline void v_rshr_##pack_suffix##_store(_Tpn* ptr, const _Tpvec& a) \
1684 for( int i = 0; i < _Tpvec::nlanes; i++ ) \
1685 ptr[i] = saturate_cast<_Tpn>((a.s[i] + ((_Tp)1 << (n - 1))) >> n); \
1688 //! @name Pack and store with rounding shift
1690 //! @brief Store values from the input vector into memory with pack
1692 //! Values will be shifted _n_ bits right with rounding, converted to narrower type and stored into
1693 //! memory. Variant with _u_ suffix converts to unsigned type.
1695 //! - pack: for 16-, 32- and 64-bit integer input types
1696 //! - pack_u: for 16- and 32-bit signed integer input types
1697 OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_uint16x8, ushort, v_uint8x16, uchar, pack)
1698 OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int16x8, short, v_int8x16, schar, pack)
1699 OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_uint32x4, unsigned, v_uint16x8, ushort, pack)
1700 OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int32x4, int, v_int16x8, short, pack)
1701 OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_uint64x2, uint64, v_uint32x4, unsigned, pack)
1702 OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int64x2, int64, v_int32x4, int, pack)
1703 OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int16x8, short, v_uint8x16, uchar, pack_u)
1704 OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int32x4, int, v_uint16x8, ushort, pack_u)
1707 /** @brief Matrix multiplication
1714 {D0 D1 D2 D3} x |V3|
1715 ====================
1716 {R0 R1 R2 R3}, where:
1717 R0 = A0V0 + A1V1 + A2V2 + A3V3,
1718 R1 = B0V0 + B1V1 + B2V2 + B3V3
1722 inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
1723 const v_float32x4& m1, const v_float32x4& m2,
1724 const v_float32x4& m3)
1726 return v_float32x4(v.s[0]*m0.s[0] + v.s[1]*m1.s[0] + v.s[2]*m2.s[0] + v.s[3]*m3.s[0],
1727 v.s[0]*m0.s[1] + v.s[1]*m1.s[1] + v.s[2]*m2.s[1] + v.s[3]*m3.s[1],
1728 v.s[0]*m0.s[2] + v.s[1]*m1.s[2] + v.s[2]*m2.s[2] + v.s[3]*m3.s[2],
1729 v.s[0]*m0.s[3] + v.s[1]*m1.s[3] + v.s[2]*m2.s[3] + v.s[3]*m3.s[3]);