1 /*M///////////////////////////////////////////////////////////////////////////////////////
3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
5 // By downloading, copying, installing or using the software you agree to this license.
6 // If you do not agree to this license, do not download, install,
7 // copy or use the software.
11 // For Open Source Computer Vision Library
13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
15 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
16 // Copyright (C) 2015, Itseez Inc., all rights reserved.
17 // Third party copyrights are property of their respective owners.
19 // Redistribution and use in source and binary forms, with or without modification,
20 // are permitted provided that the following conditions are met:
22 // * Redistribution's of source code must retain the above copyright notice,
23 // this list of conditions and the following disclaimer.
25 // * Redistribution's in binary form must reproduce the above copyright notice,
26 // this list of conditions and the following disclaimer in the documentation
27 // and/or other materials provided with the distribution.
29 // * The name of the copyright holders may not be used to endorse or promote products
30 // derived from this software without specific prior written permission.
32 // This software is provided by the copyright holders and contributors "as is" and
33 // any express or implied warranties, including, but not limited to, the implied
34 // warranties of merchantability and fitness for a particular purpose are disclaimed.
35 // In no event shall the Intel Corporation or contributors be liable for any direct,
36 // indirect, incidental, special, exemplary, or consequential damages
37 // (including, but not limited to, procurement of substitute goods or services;
38 // loss of use, data, or profits; or business interruption) however caused
39 // and on any theory of liability, whether in contract, strict liability,
40 // or tort (including negligence or otherwise) arising in any way out of
41 // the use of this software, even if advised of the possibility of such damage.
45 #ifndef OPENCV_HAL_INTRIN_CPP_HPP
46 #define OPENCV_HAL_INTRIN_CPP_HPP
51 #include "opencv2/core/saturate.hpp"
57 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
60 /** @addtogroup core_hal_intrin
62 "Universal intrinsics" is a set of types and functions intended to simplify vectorization of code on
63 different platforms. Currently there are two supported SIMD extensions: __SSE/SSE2__ on x86
64 architectures and __NEON__ on ARM architectures, both allow working with 128 bit registers
65 containing packed values of different types. In case when there is no SIMD extension available
66 during compilation, fallback C++ implementation of intrinsics will be chosen and code will work as
67 expected although it could be slower.
71 There are several types representing a 128-bit register as a vector of packed values; each type is
72 implemented as a structure based on one SIMD register.
74 - cv::v_uint8x16 and cv::v_int8x16: sixteen 8-bit integer values (unsigned/signed) - char
75 - cv::v_uint16x8 and cv::v_int16x8: eight 16-bit integer values (unsigned/signed) - short
76 - cv::v_uint32x4 and cv::v_int32x4: four 32-bit integer values (unsigned/signed) - int
77 - cv::v_uint64x2 and cv::v_int64x2: two 64-bit integer values (unsigned/signed) - int64
78 - cv::v_float32x4: four 32-bit floating point values (signed) - float
79 - cv::v_float64x2: two 64-bit floating point values (signed) - double
82 cv::v_float64x2 is not implemented in NEON variant, if you want to use this type, don't forget to
83 check the CV_SIMD128_64F preprocessor definition:
90 ### Load and store operations
92 These operations allow to set contents of the register explicitly or by loading it from some memory
93 block and to save contents of the register to memory block.
96 @ref v_reg::v_reg(const _Tp *ptr) "from memory",
97 @ref v_reg::v_reg(_Tp s0, _Tp s1) "from two values", ...
98 - Other create methods:
99 @ref v_setall_s8, @ref v_setall_u8, ...,
100 @ref v_setzero_u8, @ref v_setzero_s8, ...
102 @ref v_load, @ref v_load_aligned, @ref v_load_low, @ref v_load_halves,
103 @ref v_store, @ref v_store_aligned,
104 @ref v_store_high, @ref v_store_low
108 These operations allow to reorder or recombine elements in one or multiple vectors.
110 - Interleave, deinterleave (2, 3 and 4 channels): @ref v_load_deinterleave, @ref v_store_interleave
111 - Expand: @ref v_load_expand, @ref v_load_expand_q, @ref v_expand
112 - Pack: @ref v_pack, @ref v_pack_u, @ref v_rshr_pack, @ref v_rshr_pack_u,
113 @ref v_pack_store, @ref v_pack_u_store, @ref v_rshr_pack_store, @ref v_rshr_pack_u_store
114 - Recombine: @ref v_zip, @ref v_recombine, @ref v_combine_low, @ref v_combine_high
115 - Extract: @ref v_extract
118 ### Arithmetic, bitwise and comparison operations
120 Element-wise binary and unary operations.
123 @ref operator +(const v_reg &a, const v_reg &b) "+",
124 @ref operator -(const v_reg &a, const v_reg &b) "-",
125 @ref operator *(const v_reg &a, const v_reg &b) "*",
126 @ref operator /(const v_reg &a, const v_reg &b) "/",
129 - Non-saturating arithmetics: @ref v_add_wrap, @ref v_sub_wrap
132 @ref operator <<(const v_reg &a, int s) "<<",
133 @ref operator >>(const v_reg &a, int s) ">>",
134 @ref v_shl, @ref v_shr
137 @ref operator&(const v_reg &a, const v_reg &b) "&",
138 @ref operator |(const v_reg &a, const v_reg &b) "|",
139 @ref operator ^(const v_reg &a, const v_reg &b) "^",
140 @ref operator ~(const v_reg &a) "~"
143 @ref operator >(const v_reg &a, const v_reg &b) ">",
144 @ref operator >=(const v_reg &a, const v_reg &b) ">=",
145 @ref operator <(const v_reg &a, const v_reg &b) "<",
146 @ref operator <=(const v_reg &a, const v_reg &b) "<=",
147 @ref operator==(const v_reg &a, const v_reg &b) "==",
148 @ref operator !=(const v_reg &a, const v_reg &b) "!="
150 - min/max: @ref v_min, @ref v_max
154 Most of these operations return only one value.
156 - Reduce: @ref v_reduce_min, @ref v_reduce_max, @ref v_reduce_sum, @ref v_popcount
157 - Mask: @ref v_signmask, @ref v_check_all, @ref v_check_any, @ref v_select
161 - Some frequent operations: @ref v_sqrt, @ref v_invsqrt, @ref v_magnitude, @ref v_sqr_magnitude
162 - Absolute values: @ref v_abs, @ref v_absdiff
166 Different type conversions and casts:
168 - Rounding: @ref v_round, @ref v_floor, @ref v_ceil, @ref v_trunc,
169 - To float: @ref v_cvt_f32, @ref v_cvt_f64
170 - Reinterpret: @ref v_reinterpret_as_u8, @ref v_reinterpret_as_s8, ...
172 ### Matrix operations
174 In these operations vectors represent matrix rows/columns: @ref v_dotprod, @ref v_matmul, @ref v_transpose4x4
178 Most operations are implemented only for some subset of the available types; the following matrices
179 show the applicability of different operations to the types.
183 | Operations\\Types | uint 8x16 | int 8x16 | uint 16x8 | int 16x8 | uint 32x4 | int 32x4 |
184 |-------------------|:-:|:-:|:-:|:-:|:-:|:-:|
185 |load, store | x | x | x | x | x | x |
186 |interleave | x | x | x | x | x | x |
187 |expand | x | x | x | x | x | x |
188 |expand_q | x | x | | | | |
189 |add, sub | x | x | x | x | x | x |
190 |add_wrap, sub_wrap | x | x | x | x | | |
191 |mul | | | x | x | x | x |
192 |mul_expand | | | x | x | x | |
193 |compare | x | x | x | x | x | x |
194 |shift | | | x | x | x | x |
195 |dotprod | | | | x | | |
196 |logical | x | x | x | x | x | x |
197 |min, max | x | x | x | x | x | x |
198 |absdiff | x | x | x | x | x | x |
199 |reduce | | | | | x | x |
200 |mask | x | x | x | x | x | x |
201 |pack | x | x | x | x | x | x |
202 |pack_u | x | | x | | | |
203 |unpack | x | x | x | x | x | x |
204 |extract | x | x | x | x | x | x |
205 |rotate (lanes) | x | x | x | x | x | x |
206 |cvt_flt32 | | | | | | x |
207 |cvt_flt64 | | | | | | x |
208 |transpose4x4 | | | | | x | x |
212 | Operations\\Types | uint 64x2 | int 64x2 |
213 |-------------------|:-:|:-:|
214 |load, store | x | x |
219 |rotate (lanes) | x | x |
223 | Operations\\Types | float 32x4 | float 64x2 |
224 |-------------------|:-:|:-:|
225 |load, store | x | x |
239 |float math | x | x |
240 |transpose4x4 | x | |
242 |rotate (lanes) | x | x |
246 template<typename _Tp, int n> struct v_reg
249 typedef _Tp lane_type;
253 /** @brief Constructor
255 Initializes register with data from memory
256 @param ptr pointer to memory block with data for register */
257 explicit v_reg(const _Tp* ptr) { for( int i = 0; i < n; i++ ) s[i] = ptr[i]; }
259 /** @brief Constructor
261 Initializes register with two 64-bit values */
262 v_reg(_Tp s0, _Tp s1) { s[0] = s0; s[1] = s1; }
264 /** @brief Constructor
266 Initializes register with four 32-bit values */
267 v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3) { s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3; }
269 /** @brief Constructor
271 Initializes register with eight 16-bit values */
272 v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3,
273 _Tp s4, _Tp s5, _Tp s6, _Tp s7)
275 s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3;
276 s[4] = s4; s[5] = s5; s[6] = s6; s[7] = s7;
279 /** @brief Constructor
281 Initializes register with sixteen 8-bit values */
282 v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3,
283 _Tp s4, _Tp s5, _Tp s6, _Tp s7,
284 _Tp s8, _Tp s9, _Tp s10, _Tp s11,
285 _Tp s12, _Tp s13, _Tp s14, _Tp s15)
287 s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3;
288 s[4] = s4; s[5] = s5; s[6] = s6; s[7] = s7;
289 s[8] = s8; s[9] = s9; s[10] = s10; s[11] = s11;
290 s[12] = s12; s[13] = s13; s[14] = s14; s[15] = s15;
293 /** @brief Default constructor
295 Does not initialize anything*/
298 /** @brief Copy constructor */
299 v_reg(const v_reg<_Tp, n> & r)
301 for( int i = 0; i < n; i++ )
304 /** @brief Access first value
306 Returns value of the first lane according to register type, for example:
308 v_int32x4 r(1, 2, 3, 4);
309 int v = r.get0(); // returns 1
311 uint64_t v = r.get0(); // returns 1
314 _Tp get0() const { return s[0]; }
317 _Tp get(const int i) const { return s[i]; }
318 v_reg<_Tp, n> high() const
322 for( i = 0; i < n/2; i++ )
330 static v_reg<_Tp, n> zero()
333 for( int i = 0; i < n; i++ )
338 static v_reg<_Tp, n> all(_Tp s)
341 for( int i = 0; i < n; i++ )
346 template<typename _Tp2, int n2> v_reg<_Tp2, n2> reinterpret_as() const
348 size_t bytes = std::min(sizeof(_Tp2)*n2, sizeof(_Tp)*n);
350 std::memcpy(&c.s[0], &s[0], bytes);
358 /** @brief Sixteen 8-bit unsigned integer values */
359 typedef v_reg<uchar, 16> v_uint8x16;
360 /** @brief Sixteen 8-bit signed integer values */
361 typedef v_reg<schar, 16> v_int8x16;
362 /** @brief Eight 16-bit unsigned integer values */
363 typedef v_reg<ushort, 8> v_uint16x8;
364 /** @brief Eight 16-bit signed integer values */
365 typedef v_reg<short, 8> v_int16x8;
366 /** @brief Four 32-bit unsigned integer values */
367 typedef v_reg<unsigned, 4> v_uint32x4;
368 /** @brief Four 32-bit signed integer values */
369 typedef v_reg<int, 4> v_int32x4;
370 /** @brief Four 32-bit floating point values (single precision) */
371 typedef v_reg<float, 4> v_float32x4;
372 /** @brief Two 64-bit floating point values (double precision) */
373 typedef v_reg<double, 2> v_float64x2;
374 /** @brief Two 64-bit unsigned integer values */
375 typedef v_reg<uint64, 2> v_uint64x2;
376 /** @brief Two 64-bit signed integer values */
377 typedef v_reg<int64, 2> v_int64x2;
379 //! @brief Helper macro
380 //! @ingroup core_hal_intrin_impl
381 #define OPENCV_HAL_IMPL_BIN_OP(bin_op) \
382 template<typename _Tp, int n> inline v_reg<_Tp, n> \
383 operator bin_op (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
386 for( int i = 0; i < n; i++ ) \
387 c.s[i] = saturate_cast<_Tp>(a.s[i] bin_op b.s[i]); \
390 template<typename _Tp, int n> inline v_reg<_Tp, n>& \
391 operator bin_op##= (v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
393 for( int i = 0; i < n; i++ ) \
394 a.s[i] = saturate_cast<_Tp>(a.s[i] bin_op b.s[i]); \
398 /** @brief Add values
401 OPENCV_HAL_IMPL_BIN_OP(+)
403 /** @brief Subtract values
406 OPENCV_HAL_IMPL_BIN_OP(-)
408 /** @brief Multiply values
410 For 16- and 32-bit integer types and floating types. */
411 OPENCV_HAL_IMPL_BIN_OP(*)
413 /** @brief Divide values
415 For floating types only. */
416 OPENCV_HAL_IMPL_BIN_OP(/)
418 //! @brief Helper macro
419 //! @ingroup core_hal_intrin_impl
420 #define OPENCV_HAL_IMPL_BIT_OP(bit_op) \
421 template<typename _Tp, int n> inline v_reg<_Tp, n> operator bit_op \
422 (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
425 typedef typename V_TypeTraits<_Tp>::int_type itype; \
426 for( int i = 0; i < n; i++ ) \
427 c.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int((itype)(V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) bit_op \
428 V_TypeTraits<_Tp>::reinterpret_int(b.s[i]))); \
431 template<typename _Tp, int n> inline v_reg<_Tp, n>& operator \
432 bit_op##= (v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
434 typedef typename V_TypeTraits<_Tp>::int_type itype; \
435 for( int i = 0; i < n; i++ ) \
436 a.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int((itype)(V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) bit_op \
437 V_TypeTraits<_Tp>::reinterpret_int(b.s[i]))); \
441 /** @brief Bitwise AND
443 Only for integer types. */
444 OPENCV_HAL_IMPL_BIT_OP(&)
446 /** @brief Bitwise OR
448 Only for integer types. */
449 OPENCV_HAL_IMPL_BIT_OP(|)
451 /** @brief Bitwise XOR
453 Only for integer types.*/
454 OPENCV_HAL_IMPL_BIT_OP(^)
456 /** @brief Bitwise NOT
458 Only for integer types.*/
459 template<typename _Tp, int n> inline v_reg<_Tp, n> operator ~ (const v_reg<_Tp, n>& a)
462 for( int i = 0; i < n; i++ )
464 c.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int(~V_TypeTraits<_Tp>::reinterpret_int(a.s[i]));
469 //! @brief Helper macro
470 //! @ingroup core_hal_intrin_impl
471 #define OPENCV_HAL_IMPL_MATH_FUNC(func, cfunc, _Tp2) \
472 template<typename _Tp, int n> inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a) \
475 for( int i = 0; i < n; i++ ) \
476 c.s[i] = cfunc(a.s[i]); \
480 /** @brief Square root of elements
482 Only for floating point types.*/
483 OPENCV_HAL_IMPL_MATH_FUNC(v_sqrt, std::sqrt, _Tp)
486 OPENCV_HAL_IMPL_MATH_FUNC(v_sin, std::sin, _Tp)
487 OPENCV_HAL_IMPL_MATH_FUNC(v_cos, std::cos, _Tp)
488 OPENCV_HAL_IMPL_MATH_FUNC(v_exp, std::exp, _Tp)
489 OPENCV_HAL_IMPL_MATH_FUNC(v_log, std::log, _Tp)
492 /** @brief Absolute value of elements
494 Only for floating point types.*/
495 OPENCV_HAL_IMPL_MATH_FUNC(v_abs, (typename V_TypeTraits<_Tp>::abs_type)std::abs,
496 typename V_TypeTraits<_Tp>::abs_type)
498 /** @brief Round elements
500 Only for floating point types.*/
501 OPENCV_HAL_IMPL_MATH_FUNC(v_round, cvRound, int)
503 /** @brief Floor elements
505 Only for floating point types.*/
506 OPENCV_HAL_IMPL_MATH_FUNC(v_floor, cvFloor, int)
508 /** @brief Ceil elements
510 Only for floating point types.*/
511 OPENCV_HAL_IMPL_MATH_FUNC(v_ceil, cvCeil, int)
513 /** @brief Truncate elements
515 Only for floating point types.*/
516 OPENCV_HAL_IMPL_MATH_FUNC(v_trunc, int, int)
518 //! @brief Helper macro
519 //! @ingroup core_hal_intrin_impl
520 #define OPENCV_HAL_IMPL_MINMAX_FUNC(func, cfunc) \
521 template<typename _Tp, int n> inline v_reg<_Tp, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
524 for( int i = 0; i < n; i++ ) \
525 c.s[i] = cfunc(a.s[i], b.s[i]); \
529 //! @brief Helper macro
530 //! @ingroup core_hal_intrin_impl
531 #define OPENCV_HAL_IMPL_REDUCE_MINMAX_FUNC(func, cfunc) \
532 template<typename _Tp, int n> inline _Tp func(const v_reg<_Tp, n>& a) \
535 for( int i = 1; i < n; i++ ) \
536 c = cfunc(c, a.s[i]); \
540 /** @brief Choose min values for each pair
547 {min(A1,B1) min(A2,B2) ...}
549 For all types except 64-bit integer. */
550 OPENCV_HAL_IMPL_MINMAX_FUNC(v_min, std::min)
552 /** @brief Choose max values for each pair
559 {max(A1,B1) max(A2,B2) ...}
561 For all types except 64-bit integer. */
562 OPENCV_HAL_IMPL_MINMAX_FUNC(v_max, std::max)
564 /** @brief Find one min value
568 {A1 A2 A3 ...} => min(A1,A2,A3,...)
570 For 32-bit integer and 32-bit floating point types. */
571 OPENCV_HAL_IMPL_REDUCE_MINMAX_FUNC(v_reduce_min, std::min)
573 /** @brief Find one max value
577 {A1 A2 A3 ...} => max(A1,A2,A3,...)
579 For 32-bit integer and 32-bit floating point types. */
580 OPENCV_HAL_IMPL_REDUCE_MINMAX_FUNC(v_reduce_max, std::max)
582 static const unsigned char popCountTable[] =
584 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
585 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
586 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
587 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
588 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
589 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
590 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
591 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
592 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
593 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
594 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
595 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
596 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
597 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
598 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
599 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8,
601 /** @brief Count the 1 bits in the vector and return 4 values
605 {A1 A2 A3 ...} => popcount(A1)
607 For any type, but the result will be in v_uint32x4*/
608 template<typename _Tp, int n> inline v_uint32x4 v_popcount(const v_reg<_Tp, n>& a)
611 b = v_reinterpret_as_u8(a);
612 for( int i = 0; i < v_uint8x16::nlanes; i++ )
614 b.s[i] = popCountTable[b.s[i]];
617 for( int i = 0; i < v_uint32x4::nlanes; i++ )
619 c.s[i] = b.s[i*4] + b.s[i*4+1] + b.s[i*4+2] + b.s[i*4+3];
626 template<typename _Tp, int n>
627 inline void v_minmax( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
628 v_reg<_Tp, n>& minval, v_reg<_Tp, n>& maxval )
630 for( int i = 0; i < n; i++ )
632 minval.s[i] = std::min(a.s[i], b.s[i]);
633 maxval.s[i] = std::max(a.s[i], b.s[i]);
638 //! @brief Helper macro
639 //! @ingroup core_hal_intrin_impl
640 #define OPENCV_HAL_IMPL_CMP_OP(cmp_op) \
641 template<typename _Tp, int n> \
642 inline v_reg<_Tp, n> operator cmp_op(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
644 typedef typename V_TypeTraits<_Tp>::int_type itype; \
646 for( int i = 0; i < n; i++ ) \
647 c.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int((itype)-(int)(a.s[i] cmp_op b.s[i])); \
651 /** @brief Less-than comparison
653 For all types except 64-bit integer values. */
654 OPENCV_HAL_IMPL_CMP_OP(<)
656 /** @brief Greater-than comparison
658 For all types except 64-bit integer values. */
659 OPENCV_HAL_IMPL_CMP_OP(>)
661 /** @brief Less-than or equal comparison
663 For all types except 64-bit integer values. */
664 OPENCV_HAL_IMPL_CMP_OP(<=)
666 /** @brief Greater-than or equal comparison
668 For all types except 64-bit integer values. */
669 OPENCV_HAL_IMPL_CMP_OP(>=)
671 /** @brief Equal comparison
673 For all types except 64-bit integer values. */
674 OPENCV_HAL_IMPL_CMP_OP(==)
676 /** @brief Not equal comparison
678 For all types except 64-bit integer values. */
679 OPENCV_HAL_IMPL_CMP_OP(!=)
681 //! @brief Helper macro
682 //! @ingroup core_hal_intrin_impl
683 #define OPENCV_HAL_IMPL_ADD_SUB_OP(func, bin_op, cast_op, _Tp2) \
684 template<typename _Tp, int n> \
685 inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
687 typedef _Tp2 rtype; \
689 for( int i = 0; i < n; i++ ) \
690 c.s[i] = cast_op(a.s[i] bin_op b.s[i]); \
694 /** @brief Add values without saturation
696 For 8- and 16-bit integer values. */
697 OPENCV_HAL_IMPL_ADD_SUB_OP(v_add_wrap, +, (_Tp), _Tp)
699 /** @brief Subtract values without saturation
701 For 8- and 16-bit integer values. */
702 OPENCV_HAL_IMPL_ADD_SUB_OP(v_sub_wrap, -, (_Tp), _Tp)
// Scalar helper: absolute difference of two values of the same type.
// The smaller operand is always subtracted from the larger one, so for
// unsigned T the subtraction never wraps around.
template<typename T> inline T _absdiff(T a, T b)
{
    return a > b ? a - b : b - a;
}
711 /** @brief Absolute difference
713 Returns \f$ |a - b| \f$ converted to corresponding unsigned type.
716 v_int32x4 a, b; // {1, 2, 3, 4} and {4, 3, 2, 1}
717 v_uint32x4 c = v_absdiff(a, b); // result is {3, 1, 1, 3}
719 For 8-, 16-, 32-bit integer source types. */
720 template<typename _Tp, int n>
721 inline v_reg<typename V_TypeTraits<_Tp>::abs_type, n> v_absdiff(const v_reg<_Tp, n>& a, const v_reg<_Tp, n> & b)
723 typedef typename V_TypeTraits<_Tp>::abs_type rtype;
725 const rtype mask = (rtype)(std::numeric_limits<_Tp>::is_signed ? (1 << (sizeof(rtype)*8 - 1)) : 0);
726 for( int i = 0; i < n; i++ )
728 rtype ua = a.s[i] ^ mask;
729 rtype ub = b.s[i] ^ mask;
730 c.s[i] = _absdiff(ua, ub);
737 For 32-bit floating point values */
738 inline v_float32x4 v_absdiff(const v_float32x4& a, const v_float32x4& b)
741 for( int i = 0; i < c.nlanes; i++ )
742 c.s[i] = _absdiff(a.s[i], b.s[i]);
748 For 64-bit floating point values */
749 inline v_float64x2 v_absdiff(const v_float64x2& a, const v_float64x2& b)
752 for( int i = 0; i < c.nlanes; i++ )
753 c.s[i] = _absdiff(a.s[i], b.s[i]);
757 /** @brief Inverse square root
759 Returns \f$ 1/sqrt(a) \f$
760 For floating point types only. */
761 template<typename _Tp, int n>
762 inline v_reg<_Tp, n> v_invsqrt(const v_reg<_Tp, n>& a)
765 for( int i = 0; i < n; i++ )
766 c.s[i] = 1.f/std::sqrt(a.s[i]);
772 Returns \f$ sqrt(a^2 + b^2) \f$
773 For floating point types only. */
774 template<typename _Tp, int n>
775 inline v_reg<_Tp, n> v_magnitude(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
778 for( int i = 0; i < n; i++ )
779 c.s[i] = std::sqrt(a.s[i]*a.s[i] + b.s[i]*b.s[i]);
783 /** @brief Square of the magnitude
785 Returns \f$ a^2 + b^2 \f$
786 For floating point types only. */
787 template<typename _Tp, int n>
788 inline v_reg<_Tp, n> v_sqr_magnitude(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
791 for( int i = 0; i < n; i++ )
792 c.s[i] = a.s[i]*a.s[i] + b.s[i]*b.s[i];
796 /** @brief Multiply and add
798 Returns \f$ a*b + c \f$
799 For floating point types and signed 32bit int only. */
800 template<typename _Tp, int n>
801 inline v_reg<_Tp, n> v_fma(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
802 const v_reg<_Tp, n>& c)
805 for( int i = 0; i < n; i++ )
806 d.s[i] = a.s[i]*b.s[i] + c.s[i];
810 /** @brief A synonym for v_fma */
811 template<typename _Tp, int n>
812 inline v_reg<_Tp, n> v_muladd(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
813 const v_reg<_Tp, n>& c)
815 return v_fma(a, b, c);
818 /** @brief Dot product of elements
820 Multiply values in two registers and sum adjacent result pairs.
823 {A1 A2 ...} // 16-bit
824 x {B1 B2 ...} // 16-bit
826 {A1B1+A2B2 ...} // 32-bit
828 Implemented only for 16-bit signed source type (v_int16x8).
830 template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>
831 v_dotprod(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
833 typedef typename V_TypeTraits<_Tp>::w_type w_type;
834 v_reg<w_type, n/2> c;
835 for( int i = 0; i < (n/2); i++ )
836 c.s[i] = (w_type)a.s[i*2]*b.s[i*2] + (w_type)a.s[i*2+1]*b.s[i*2+1];
840 /** @brief Dot product of elements
842 Same as cv::v_dotprod, but add a third element to the sum of adjacent pairs.
845 {A1 A2 ...} // 16-bit
846 x {B1 B2 ...} // 16-bit
848 {A1B1+A2B2+C1 ...} // 32-bit
851 Implemented only for 16-bit signed source type (v_int16x8).
853 template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>
854 v_dotprod(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, const v_reg<typename V_TypeTraits<_Tp>::w_type, n / 2>& c)
856 typedef typename V_TypeTraits<_Tp>::w_type w_type;
857 v_reg<w_type, n/2> s;
858 for( int i = 0; i < (n/2); i++ )
859 s.s[i] = (w_type)a.s[i*2]*b.s[i*2] + (w_type)a.s[i*2+1]*b.s[i*2+1] + c.s[i];
863 /** @brief Multiply and expand
865 Multiply values in two registers and store results in two registers with wider pack type.
869 x {E F G H} // 32-bit
876 v_uint32x4 a, b; // {1,2,3,4} and {2,2,2,2}
877 v_uint64x2 c, d; // results
878 v_mul_expand(a, b, c, d); // c, d = {2,4}, {6, 8}
880 Implemented only for 16- and unsigned 32-bit source types (v_int16x8, v_uint16x8, v_uint32x4).
882 template<typename _Tp, int n> inline void v_mul_expand(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
883 v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& c,
884 v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& d)
886 typedef typename V_TypeTraits<_Tp>::w_type w_type;
887 for( int i = 0; i < (n/2); i++ )
889 c.s[i] = (w_type)a.s[i]*b.s[i];
890 d.s[i] = (w_type)a.s[i+(n/2)]*b.s[i+(n/2)];
895 template<typename _Tp, int n> inline void v_hsum(const v_reg<_Tp, n>& a,
896 v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& c)
898 typedef typename V_TypeTraits<_Tp>::w_type w_type;
899 for( int i = 0; i < (n/2); i++ )
901 c.s[i] = (w_type)a.s[i*2] + a.s[i*2+1];
906 //! @brief Helper macro
907 //! @ingroup core_hal_intrin_impl
908 #define OPENCV_HAL_IMPL_SHIFT_OP(shift_op) \
909 template<typename _Tp, int n> inline v_reg<_Tp, n> operator shift_op(const v_reg<_Tp, n>& a, int imm) \
912 for( int i = 0; i < n; i++ ) \
913 c.s[i] = (_Tp)(a.s[i] shift_op imm); \
917 /** @brief Bitwise shift left
919 For 16-, 32- and 64-bit integer values. */
920 OPENCV_HAL_IMPL_SHIFT_OP(<< )
922 /** @brief Bitwise shift right
924 For 16-, 32- and 64-bit integer values. */
925 OPENCV_HAL_IMPL_SHIFT_OP(>> )
927 /** @brief Element shift left among vector
930 #define OPENCV_HAL_IMPL_ROTATE_SHIFT_OP(suffix,opA,opB) \
931 template<int imm, typename _Tp, int n> inline v_reg<_Tp, n> v_rotate_##suffix(const v_reg<_Tp, n>& a) \
934 for (int i = 0; i < n; i++) \
936 int sIndex = i opA imm; \
937 if (0 <= sIndex && sIndex < n) \
939 b.s[i] = a.s[sIndex]; \
948 template<int imm, typename _Tp, int n> inline v_reg<_Tp, n> v_rotate_##suffix(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
951 for (int i = 0; i < n; i++) \
953 int aIndex = i opA imm; \
954 int bIndex = i opA imm opB n; \
955 if (0 <= bIndex && bIndex < n) \
957 c.s[i] = b.s[bIndex]; \
959 else if (0 <= aIndex && aIndex < n) \
961 c.s[i] = a.s[aIndex]; \
971 OPENCV_HAL_IMPL_ROTATE_SHIFT_OP(left, -, +)
972 OPENCV_HAL_IMPL_ROTATE_SHIFT_OP(right, +, -)
974 /** @brief Sum packed values
978 {A1 A2 A3 ...} => sum{A1,A2,A3,...}
980 For 32-bit integer and 32-bit floating point types.*/
981 template<typename _Tp, int n> inline typename V_TypeTraits<_Tp>::sum_type v_reduce_sum(const v_reg<_Tp, n>& a)
983 typename V_TypeTraits<_Tp>::sum_type c = a.s[0];
984 for( int i = 1; i < n; i++ )
989 /** @brief Sums all elements of each input vector, returns the vector of sums
993 result[0] = a[0] + a[1] + a[2] + a[3]
994 result[1] = b[0] + b[1] + b[2] + b[3]
995 result[2] = c[0] + c[1] + c[2] + c[3]
996 result[3] = d[0] + d[1] + d[2] + d[3]
999 inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
1000 const v_float32x4& c, const v_float32x4& d)
1003 r.s[0] = a.s[0] + a.s[1] + a.s[2] + a.s[3];
1004 r.s[1] = b.s[0] + b.s[1] + b.s[2] + b.s[3];
1005 r.s[2] = c.s[0] + c.s[1] + c.s[2] + c.s[3];
1006 r.s[3] = d.s[0] + d.s[1] + d.s[2] + d.s[3];
1010 /** @brief Get negative values mask
1012 Returned value is a bit mask with bits set to 1 on places corresponding to negative packed values indexes.
1015 v_int32x4 r; // set to {-1, -1, 1, 1}
1016 int mask = v_signmask(r); // mask = 3 <== 00000000 00000000 00000000 00000011
1018 For all types except 64-bit. */
1019 template<typename _Tp, int n> inline int v_signmask(const v_reg<_Tp, n>& a)
1022 for( int i = 0; i < n; i++ )
1023 mask |= (V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) < 0) << i;
1027 /** @brief Check if all packed values are less than zero
1029 Unsigned values will be cast to signed: `uchar 254 => char -2`.
1030 For all types except 64-bit. */
1031 template<typename _Tp, int n> inline bool v_check_all(const v_reg<_Tp, n>& a)
1033 for( int i = 0; i < n; i++ )
1034 if( V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) >= 0 )
1039 /** @brief Check if any of packed values is less than zero
1041 Unsigned values will be cast to signed: `uchar 254 => char -2`.
1042 For all types except 64-bit. */
1043 template<typename _Tp, int n> inline bool v_check_any(const v_reg<_Tp, n>& a)
1045 for( int i = 0; i < n; i++ )
1046 if( V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) < 0 )
1051 /** @brief Per-element select (blend operation)
1053 Return value will be built by combining values _a_ and _b_ using the following scheme:
1054 result[i] = mask[i] ? a[i] : b[i];
1056 @note: _mask_ element values are restricted to these values:
1057 - 0: select element from _b_
1058 - 0xff/0xffff/etc: select element from _a_
1059 (fully compatible with bitwise-based operator)
1061 template<typename _Tp, int n> inline v_reg<_Tp, n> v_select(const v_reg<_Tp, n>& mask,
1062 const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
1064 typedef V_TypeTraits<_Tp> Traits;
1065 typedef typename Traits::int_type int_type;
1067 for( int i = 0; i < n; i++ )
1069 int_type m = Traits::reinterpret_int(mask.s[i]);
1070 CV_DbgAssert(m == 0 || m == (~(int_type)0)); // restrict mask values: 0 or 0xff/0xffff/etc
1071 c.s[i] = m ? a.s[i] : b.s[i];
1076 /** @brief Expand values to the wider pack type
1078 Copy contents of register to two registers with 2x wider pack type.
1081 int32x4 int64x2 int64x2
1082 {A B C D} ==> {A B} , {C D}
1084 template<typename _Tp, int n> inline void v_expand(const v_reg<_Tp, n>& a,
1085 v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& b0,
1086 v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& b1)
1088 for( int i = 0; i < (n/2); i++ )
1091 b1.s[i] = a.s[i+(n/2)];
1096 template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::int_type, n>
1097 v_reinterpret_as_int(const v_reg<_Tp, n>& a)
1099 v_reg<typename V_TypeTraits<_Tp>::int_type, n> c;
1100 for( int i = 0; i < n; i++ )
1101 c.s[i] = V_TypeTraits<_Tp>::reinterpret_int(a.s[i]);
1105 template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::uint_type, n>
1106 v_reinterpret_as_uint(const v_reg<_Tp, n>& a)
1108 v_reg<typename V_TypeTraits<_Tp>::uint_type, n> c;
1109 for( int i = 0; i < n; i++ )
1110 c.s[i] = V_TypeTraits<_Tp>::reinterpret_uint(a.s[i]);
1115 /** @brief Interleave two vectors
1122 {A1 B1 A2 B2} and {A3 B3 A4 B4}
1124 For all types except 64-bit.
1126 template<typename _Tp, int n> inline void v_zip( const v_reg<_Tp, n>& a0, const v_reg<_Tp, n>& a1,
1127 v_reg<_Tp, n>& b0, v_reg<_Tp, n>& b1 )
1130 for( i = 0; i < n/2; i++ )
1132 b0.s[i*2] = a0.s[i];
1133 b0.s[i*2+1] = a1.s[i];
1137 b1.s[i*2-n] = a0.s[i];
1138 b1.s[i*2-n+1] = a1.s[i];
1142 /** @brief Load register contents from memory
1144 @param ptr pointer to memory block with data
1145 @return register object
1147 @note Returned type will be detected from passed pointer type, for example uchar ==> cv::v_uint8x16, int ==> cv::v_int32x4, etc.
1149 template<typename _Tp>
1150 inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_load(const _Tp* ptr)
1152 return v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128>(ptr);
1155 /** @brief Load register contents from memory (aligned)
1157 similar to cv::v_load, but source memory block should be aligned (to 16-byte boundary)
1159 template<typename _Tp>
1160 inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_load_aligned(const _Tp* ptr)
1162 return v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128>(ptr);
1165 /** @brief Load 64 bits of data to lower part (high part is undefined).
1167 @param ptr memory block containing data for first half (0..n/2)
1170 int lo[2] = { 1, 2 };
1171 v_int32x4 r = v_load_low(lo);
1174 template<typename _Tp>
1175 inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_load_low(const _Tp* ptr)
1177 v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> c;
1178 for( int i = 0; i < c.nlanes/2; i++ )
1185 /** @brief Load register contents from two memory blocks
1187 @param loptr memory block containing data for first half (0..n/2)
1188 @param hiptr memory block containing data for second half (n/2..n)
1191 int lo[2] = { 1, 2 }, hi[2] = { 3, 4 };
1192 v_int32x4 r = v_load_halves(lo, hi);
// Loops over half of the lanes; lane i + nlanes/2 of the result receives hiptr[i].
// NOTE(review): the matching lower-half store from loptr and the return statement
// are presumably on lines not visible in this excerpt.
1195 template<typename _Tp>
1196 inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_load_halves(const _Tp* loptr, const _Tp* hiptr)
1198 v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> c;
1199 for( int i = 0; i < c.nlanes/2; i++ )
1202 c.s[i+c.nlanes/2] = hiptr[i];
1207 /** @brief Load register contents from memory with double expand
1209 Same as cv::v_load, but result pack type will be 2x wider than memory type.
1212 short buf[4] = {1, 2, 3, 4}; // type is int16
1213 v_int32x4 r = v_load_expand(buf); // r = {1, 2, 3, 4} - type is int32
1215 For 8-, 16-, 32-bit integer source types. */
// The declared result holds half as many lanes as a _Tp register (nlanes128 / 2),
// each of the widened type V_TypeTraits<_Tp>::w_type; the local register c is
// declared with the lane count of w_type itself.
// NOTE(review): the loop visits every lane of c, but its body and the return are
// not visible in this excerpt (presumably c.s[i] = ptr[i]).
1216 template<typename _Tp>
1217 inline v_reg<typename V_TypeTraits<_Tp>::w_type, V_TypeTraits<_Tp>::nlanes128 / 2>
1218 v_load_expand(const _Tp* ptr)
1220 typedef typename V_TypeTraits<_Tp>::w_type w_type;
1221 v_reg<w_type, V_TypeTraits<w_type>::nlanes128> c;
1222 for( int i = 0; i < c.nlanes; i++ )
1229 /** @brief Load register contents from memory with quad expand
1231 Same as cv::v_load_expand, but result type is 4 times wider than source.
1233 char buf[4] = {1, 2, 3, 4}; // type is int8
1234 v_int32x4 r = v_load_expand_q(buf); // r = {1, 2, 3, 4} - type is int32
1236 For 8-bit integer source types. */
// The declared result holds a quarter of the lanes of a _Tp register (nlanes128 / 4),
// each of the type V_TypeTraits<_Tp>::q_type; the local register c is declared with
// the lane count of q_type itself.
// NOTE(review): the loop visits every lane of c, but its body and the return are
// not visible in this excerpt (presumably c.s[i] = ptr[i]).
1237 template<typename _Tp>
1238 inline v_reg<typename V_TypeTraits<_Tp>::q_type, V_TypeTraits<_Tp>::nlanes128 / 4>
1239 v_load_expand_q(const _Tp* ptr)
1241 typedef typename V_TypeTraits<_Tp>::q_type q_type;
1242 v_reg<q_type, V_TypeTraits<q_type>::nlanes128> c;
1243 for( int i = 0; i < c.nlanes; i++ )
1250 /** @brief Load and deinterleave (2 channels)
1252 Load data from memory, deinterleave it and store the result to 2 registers.
1255 {A1 B1 A2 B2 ...} ==> {A1 A2 ...}, {B1 B2 ...}
1257 For all types except 64-bit. */
// i counts output lanes while i2 advances through ptr in steps of 2, so the loop
// walks n interleaved pairs.
// NOTE(review): the rest of the parameter list and the loop body are not visible
// in this excerpt.
1258 template<typename _Tp, int n> inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a,
1262 for( i = i2 = 0; i < n; i++, i2 += 2 )
1269 /** @brief Load and deinterleave (3 channels)
1271 Load data from memory, deinterleave it and store the result to 3 registers.
1274 {A1 B1 C1 A2 B2 C2 ...} ==> {A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...}
1276 For all types except 64-bit. */
// i counts output lanes while i3 advances through ptr in steps of 3, so the loop
// walks n interleaved triplets.
// NOTE(review): the loop body is not visible in this excerpt.
1277 template<typename _Tp, int n> inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a,
1278 v_reg<_Tp, n>& b, v_reg<_Tp, n>& c)
1281 for( i = i3 = 0; i < n; i++, i3 += 3 )
1289 /** @brief Load and deinterleave (4 channels)
1291 Load data from memory, deinterleave it and store the result to 4 registers.
1294 {A1 B1 C1 D1 A2 B2 C2 D2 ...} ==> {A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...}, {D1 D2 ...}
1296 For all types except 64-bit. */
// i counts output lanes while i4 advances through ptr in steps of 4, so the loop
// walks n interleaved quadruplets.
// NOTE(review): the last parameter and the loop body are not visible in this excerpt.
1297 template<typename _Tp, int n>
1298 inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a,
1299 v_reg<_Tp, n>& b, v_reg<_Tp, n>& c,
1303 for( i = i4 = 0; i < n; i++, i4 += 4 )
1312 /** @brief Interleave and store (2 channels)
1314 Interleave and store data from 2 registers to memory.
1317 {A1 A2 ...}, {B1 B2 ...} ==> {A1 B1 A2 B2 ...}
1319 For all types except 64-bit. */
// i counts source lanes while i2 advances in steps of 2, so the loop covers n
// interleaved pairs.
// NOTE(review): the loop body is not visible in this excerpt (presumably it writes
// a.s[i] and b.s[i] to ptr[i2] and ptr[i2+1]).
1320 template<typename _Tp, int n>
1321 inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a,
1322 const v_reg<_Tp, n>& b)
1325 for( i = i2 = 0; i < n; i++, i2 += 2 )
1332 /** @brief Interleave and store (3 channels)
1334 Interleave and store data from 3 registers to memory.
1337 {A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...} ==> {A1 B1 C1 A2 B2 C2 ...}
1339 For all types except 64-bit. */
// i counts source lanes while i3 advances in steps of 3, so the loop covers n
// interleaved triplets.
// NOTE(review): the loop body is not visible in this excerpt.
1340 template<typename _Tp, int n>
1341 inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a,
1342 const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c)
1345 for( i = i3 = 0; i < n; i++, i3 += 3 )
1353 /** @brief Interleave and store (4 channels)
1355 Interleave and store data from 4 registers to memory.
1358 {A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...}, {D1 D2 ...} ==> {A1 B1 C1 D1 A2 B2 C2 D2 ...}
1360 For all types except 64-bit. */
// i counts source lanes while i4 advances in steps of 4, so the loop covers n
// interleaved quadruplets.
// NOTE(review): the loop body is not visible in this excerpt.
1361 template<typename _Tp, int n> inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a,
1362 const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c,
1363 const v_reg<_Tp, n>& d)
1366 for( i = i4 = 0; i < n; i++, i4 += 4 )
1375 /** @brief Store data to memory
1377 Store register contents to memory.
1380 REG {A B C D} ==> MEM {A B C D}
1382 Pointer can be unaligned. */
// Loops over all n lanes of a.
// NOTE(review): the loop body is not visible in this excerpt (presumably
// ptr[i] = a.s[i]).
1383 template<typename _Tp, int n>
1384 inline void v_store(_Tp* ptr, const v_reg<_Tp, n>& a)
1386 for( int i = 0; i < n; i++ )
1390 /** @brief Store data to memory (lower half)
1392 Store lower half of register contents to memory.
1395 REG {A B C D} ==> MEM {A B}
// Loops over the first n/2 lanes of a.
// NOTE(review): the loop body is not visible in this excerpt (presumably
// ptr[i] = a.s[i]).
1397 template<typename _Tp, int n>
1398 inline void v_store_low(_Tp* ptr, const v_reg<_Tp, n>& a)
1400 for( int i = 0; i < (n/2); i++ )
1404 /** @brief Store data to memory (higher half)
1406 Store higher half of register contents to memory.
1409 REG {A B C D} ==> MEM {C D}
// Writes n/2 elements: ptr[i] receives lane i + n/2 of a, for i in [0, n/2).
1411 template<typename _Tp, int n>
1412 inline void v_store_high(_Tp* ptr, const v_reg<_Tp, n>& a)
1414 for( int i = 0; i < (n/2); i++ )
1415 ptr[i] = a.s[i+(n/2)];
1418 /** @brief Store data to memory (aligned)
1420 Store register contents to memory.
1423 REG {A B C D} ==> MEM {A B C D}
1425 Pointer __should__ be aligned to a 16-byte boundary. */
// Uses the same per-lane loop header as v_store; no alignment check appears in the
// lines shown here.
// NOTE(review): the loop body is not visible in this excerpt (presumably
// ptr[i] = a.s[i]).
1426 template<typename _Tp, int n>
1427 inline void v_store_aligned(_Tp* ptr, const v_reg<_Tp, n>& a)
1429 for( int i = 0; i < n; i++ )
1433 /** @brief Combine vector from first elements of two vectors
1442 For all types except 64-bit. */
// Upper half of the result: lane i + n/2 receives the low lane b.s[i].
// NOTE(review): the declaration of c, the matching lower-half store (presumably
// c.s[i] = a.s[i]) and the return are on lines not visible in this excerpt.
1443 template<typename _Tp, int n>
1444 inline v_reg<_Tp, n> v_combine_low(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
1447 for( int i = 0; i < (n/2); i++ )
1450 c.s[i+(n/2)] = b.s[i];
1455 /** @brief Combine vector from last elements of two vectors
1464 For all types except 64-bit. */
// Result layout: lanes [0, n/2) come from the upper half of a, lanes [n/2, n) from
// the upper half of b.
// NOTE(review): the declaration of c and the return are on lines not visible here.
1465 template<typename _Tp, int n>
1466 inline v_reg<_Tp, n> v_combine_high(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
1469 for( int i = 0; i < (n/2); i++ )
1471 c.s[i] = a.s[i+(n/2)];
1472 c.s[i+(n/2)] = b.s[i+(n/2)];
1477 /** @brief Combine two vectors from lower and higher parts of two other vectors
1480 low = cv::v_combine_low(a, b);
1481 high = cv::v_combine_high(a, b);
// Single pass over half of the lanes that fills both outputs:
//   low  upper half <- lower half of b
//   high lower half <- upper half of a
//   high upper half <- upper half of b
// NOTE(review): the store into the lower half of low (presumably low.s[i] = a.s[i])
// is on a line not visible in this excerpt.
1483 template<typename _Tp, int n>
1484 inline void v_recombine(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
1485 v_reg<_Tp, n>& low, v_reg<_Tp, n>& high)
1487 for( int i = 0; i < (n/2); i++ )
1490 low.s[i+(n/2)] = b.s[i];
1491 high.s[i] = a.s[i+(n/2)];
1492 high.s[i+(n/2)] = b.s[i+(n/2)];
1496 /** @brief Vector extract
1502 ========================
1503 shift = 1 {A2 A3 A4 B1}
1504 shift = 2 {A3 A4 B1 B2}
1505 shift = 3 {A4 B1 B2 B3}
1507 Restriction: 0 <= shift < nlanes
1512 c = v_extract<2>(a, b);
// Note that the local constant `shift` is n - s, not the template argument s that
// the table above calls "shift": it is the index at which the result switches from
// one source to the other. Lanes with index >= shift are taken from b starting at
// b.s[0] (r.s[i] = b.s[i-shift]).
// NOTE(review): the declarations of r and i, the body of the first loop (lanes
// below shift, presumably read from a), the header of the second loop and the
// return are on lines not visible in this excerpt.
1515 template<int s, typename _Tp, int n>
1516 inline v_reg<_Tp, n> v_extract(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
1519 const int shift = n - s;
1521 for (; i < shift; ++i)
1524 r.s[i] = b.s[i-shift];
1530 Rounds each value. Input type is float vector ==> output type is int vector.*/
// Applies cvRound to every lane of a; the rounding rule is whatever cvRound
// implements (defined outside this excerpt).
// NOTE(review): the declaration of c and the return are on lines not visible here.
1531 template<int n> inline v_reg<int, n> v_round(const v_reg<float, n>& a)
1534 for( int i = 0; i < n; i++ )
1535 c.s[i] = cvRound(a.s[i]);
1541 Floor each value. Input type is float vector ==> output type is int vector.*/
// Applies cvFloor to every lane of a.
// NOTE(review): the declaration of c and the return are on lines not visible here.
1542 template<int n> inline v_reg<int, n> v_floor(const v_reg<float, n>& a)
1545 for( int i = 0; i < n; i++ )
1546 c.s[i] = cvFloor(a.s[i]);
1552 Ceil each value. Input type is float vector ==> output type is int vector.*/
// Applies cvCeil to every lane of a.
// NOTE(review): the declaration of c and the return are on lines not visible here.
1553 template<int n> inline v_reg<int, n> v_ceil(const v_reg<float, n>& a)
1556 for( int i = 0; i < n; i++ )
1557 c.s[i] = cvCeil(a.s[i]);
1563 Truncate each value. Input type is float vector ==> output type is int vector.*/
// Converts every lane with a plain (int) conversion, which discards the fractional
// part (rounds toward zero).
// NOTE(review): the declaration of c and the return are on lines not visible here.
1564 template<int n> inline v_reg<int, n> v_trunc(const v_reg<float, n>& a)
1567 for( int i = 0; i < n; i++ )
1568 c.s[i] = (int)(a.s[i]);
// Double-precision overload of v_round: n double lanes produce a v_reg<int, n*2>.
// The visible statement applies cvRound and writes only lanes 0..n-1.
// NOTE(review): the declaration of c, the handling of lanes n..2n-1 and the return
// are on lines not visible in this excerpt.
1573 template<int n> inline v_reg<int, n*2> v_round(const v_reg<double, n>& a)
1576 for( int i = 0; i < n; i++ )
1578 c.s[i] = cvRound(a.s[i]);
// Double-precision overload of v_floor: n double lanes produce a v_reg<int, n*2>.
// The visible statement applies cvFloor and writes only lanes 0..n-1.
// NOTE(review): the declaration of c, the handling of lanes n..2n-1 and the return
// are on lines not visible in this excerpt.
1585 template<int n> inline v_reg<int, n*2> v_floor(const v_reg<double, n>& a)
1588 for( int i = 0; i < n; i++ )
1590 c.s[i] = cvFloor(a.s[i]);
// Double-precision overload of v_ceil: n double lanes produce a v_reg<int, n*2>.
// The visible statement applies cvCeil and writes only lanes 0..n-1.
// NOTE(review): the declaration of c, the handling of lanes n..2n-1 and the return
// are on lines not visible in this excerpt.
1597 template<int n> inline v_reg<int, n*2> v_ceil(const v_reg<double, n>& a)
1600 for( int i = 0; i < n; i++ )
1602 c.s[i] = cvCeil(a.s[i]);
// Double-precision overload of v_trunc: n double lanes produce a v_reg<int, n*2>.
// Each lane is converted with a plain (int) conversion, exactly like the float
// overload, so the fractional part is discarded (rounding toward zero). cvCeil must
// not be used here: it rounds upward, so e.g. 1.5 would become 2 instead of 1.
// NOTE(review): the declaration of c, the handling of lanes n..2n-1 and the return
// are on lines not visible in this excerpt.
1609 template<int n> inline v_reg<int, n*2> v_trunc(const v_reg<double, n>& a)
1612 for( int i = 0; i < n; i++ )
1614 c.s[i] = (int)(a.s[i]);
1620 /** @brief Convert to float
1622 Supported input type is cv::v_int32x4. */
// Converts every int lane of a to float with a plain (float) conversion.
// NOTE(review): the declaration of c and the return are on lines not visible here.
1623 template<int n> inline v_reg<float, n> v_cvt_f32(const v_reg<int, n>& a)
1626 for( int i = 0; i < n; i++ )
1627 c.s[i] = (float)a.s[i];
// Narrows two double registers into one float register with twice as many lanes:
// lanes 0..n-1 come from a, lanes n..2n-1 come from b.
// NOTE(review): the return statement is on a line not visible in this excerpt.
1631 template<int n> inline v_reg<float, n*2> v_cvt_f32(const v_reg<double, n>& a, const v_reg<double, n>& b)
1633 v_reg<float, n*2> c;
1634 for( int i = 0; i < n; i++ )
1636 c.s[i] = (float)a.s[i];
1637 c.s[i+n] = (float)b.s[i];
1642 /** @brief Convert to double
1644 Supported input type is cv::v_int32x4. */
// The input has n*2 int lanes but the result only n double lanes: only the first n
// input lanes (0..n-1) are converted, the upper half of a is not read.
// NOTE(review): the declaration of c and the return are on lines not visible here.
1645 template<int n> inline v_reg<double, n> v_cvt_f64(const v_reg<int, n*2>& a)
1648 for( int i = 0; i < n; i++ )
1649 c.s[i] = (double)a.s[i];
1653 /** @brief Convert to double
1655 Supported input type is cv::v_float32x4. */
// The input has n*2 float lanes but the result only n double lanes: only the first
// n input lanes (0..n-1) are converted, the upper half of a is not read.
// NOTE(review): the declaration of c and the return are on lines not visible here.
1656 template<int n> inline v_reg<double, n> v_cvt_f64(const v_reg<float, n*2>& a)
1659 for( int i = 0; i < n; i++ )
1660 c.s[i] = (double)a.s[i];
// Table lookup (gather): lane i of the result is tab[idx.s[i]].
// No bounds checking is performed on the indices in the visible code.
// NOTE(review): the declaration of c and the return are on lines not visible here.
1664 template<int n> inline v_reg<int, n> v_lut(const int* tab, const v_reg<int, n>& idx)
1667 for( int i = 0; i < n; i++ )
1668 c.s[i] = tab[idx.s[i]];
// Table lookup (gather) for float tables: lane i of the result is tab[idx.s[i]].
// No bounds checking is performed on the indices in the visible code.
// NOTE(review): the declaration of c and the return are on lines not visible here.
1672 template<int n> inline v_reg<float, n> v_lut(const float* tab, const v_reg<int, n>& idx)
1675 for( int i = 0; i < n; i++ )
1676 c.s[i] = tab[idx.s[i]];
// Table lookup (gather) for double tables. The index register has n*2 int lanes but
// only the first n indices are used, one per double lane of the result.
// No bounds checking is performed on the indices in the visible code.
// NOTE(review): the declaration of c and the return are on lines not visible here.
1680 template<int n> inline v_reg<double, n> v_lut(const double* tab, const v_reg<int, n*2>& idx)
1683 for( int i = 0; i < n; i++ )
1684 c.s[i] = tab[idx.s[i]];
// Table lookup producing two float registers, x and y, from one index register.
// NOTE(review): the loop body is not visible in this excerpt; presumably each index
// selects a pair of adjacent table entries that is split between x and y - confirm.
1688 template<int n> inline void v_lut_deinterleave(const float* tab, const v_reg<int, n>& idx,
1689 v_reg<float, n>& x, v_reg<float, n>& y)
1691 for( int i = 0; i < n; i++ )
// Table lookup producing two double registers, x and y. The index register has n*2
// int lanes while the loop runs over n, so at most the first n indices can be used.
// NOTE(review): the loop body is not visible in this excerpt; presumably each index
// selects a pair of adjacent table entries that is split between x and y - confirm.
1699 template<int n> inline void v_lut_deinterleave(const double* tab, const v_reg<int, n*2>& idx,
1700 v_reg<double, n>& x, v_reg<double, n>& y)
1702 for( int i = 0; i < n; i++ )
1710 /** @brief Transpose 4x4 matrix
// Treats a0..a3 as the rows of a 4x4 matrix and writes its columns to b0..b3:
// bK = {a0[K], a1[K], a2[K], a3[K]}.
// All four inputs are only read, so a0 is taken by const reference like a1..a3;
// this also allows const objects and temporaries to be passed as the first row.
// The outputs are assigned one after another, so an output that aliases an input
// row is overwritten before the later outputs read from it.
1725 template<typename _Tp>
1726 inline void v_transpose4x4( const v_reg<_Tp, 4>& a0, const v_reg<_Tp, 4>& a1,
1727 const v_reg<_Tp, 4>& a2, const v_reg<_Tp, 4>& a3,
1728 v_reg<_Tp, 4>& b0, v_reg<_Tp, 4>& b1,
1729 v_reg<_Tp, 4>& b2, v_reg<_Tp, 4>& b3 )
1731 b0 = v_reg<_Tp, 4>(a0.s[0], a1.s[0], a2.s[0], a3.s[0]);
1732 b1 = v_reg<_Tp, 4>(a0.s[1], a1.s[1], a2.s[1], a3.s[1]);
1733 b2 = v_reg<_Tp, 4>(a0.s[2], a1.s[2], a2.s[2], a3.s[2]);
1734 b3 = v_reg<_Tp, 4>(a0.s[3], a1.s[3], a2.s[3], a3.s[3]);
1737 //! @brief Helper macro
1738 //! @ingroup core_hal_intrin_impl
//! Expands to an inline function v_setzero_<suffix>() that returns _Tpvec::zero().
//! The _Tp argument is accepted but not used by the expansion.
1739 #define OPENCV_HAL_IMPL_C_INIT_ZERO(_Tpvec, _Tp, suffix) \
1740 inline _Tpvec v_setzero_##suffix() { return _Tpvec::zero(); }
1742 //! @name Init with zero
1744 //! @brief Create new vector with zero elements
1745 OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint8x16, uchar, u8)
1746 OPENCV_HAL_IMPL_C_INIT_ZERO(v_int8x16, schar, s8)
1747 OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint16x8, ushort, u16)
1748 OPENCV_HAL_IMPL_C_INIT_ZERO(v_int16x8, short, s16)
1749 OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint32x4, unsigned, u32)
1750 OPENCV_HAL_IMPL_C_INIT_ZERO(v_int32x4, int, s32)
1751 OPENCV_HAL_IMPL_C_INIT_ZERO(v_float32x4, float, f32)
1752 OPENCV_HAL_IMPL_C_INIT_ZERO(v_float64x2, double, f64)
1753 OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint64x2, uint64, u64)
1754 OPENCV_HAL_IMPL_C_INIT_ZERO(v_int64x2, int64, s64)
1757 //! @brief Helper macro
1758 //! @ingroup core_hal_intrin_impl
//! Expands to an inline function v_setall_<suffix>(_Tp val) that returns
//! _Tpvec::all(val).
1759 #define OPENCV_HAL_IMPL_C_INIT_VAL(_Tpvec, _Tp, suffix) \
1760 inline _Tpvec v_setall_##suffix(_Tp val) { return _Tpvec::all(val); }
1762 //! @name Init with value
1764 //! @brief Create new vector with elements set to a specific value
1765 OPENCV_HAL_IMPL_C_INIT_VAL(v_uint8x16, uchar, u8)
1766 OPENCV_HAL_IMPL_C_INIT_VAL(v_int8x16, schar, s8)
1767 OPENCV_HAL_IMPL_C_INIT_VAL(v_uint16x8, ushort, u16)
1768 OPENCV_HAL_IMPL_C_INIT_VAL(v_int16x8, short, s16)
1769 OPENCV_HAL_IMPL_C_INIT_VAL(v_uint32x4, unsigned, u32)
1770 OPENCV_HAL_IMPL_C_INIT_VAL(v_int32x4, int, s32)
1771 OPENCV_HAL_IMPL_C_INIT_VAL(v_float32x4, float, f32)
1772 OPENCV_HAL_IMPL_C_INIT_VAL(v_float64x2, double, f64)
1773 OPENCV_HAL_IMPL_C_INIT_VAL(v_uint64x2, uint64, u64)
1774 OPENCV_HAL_IMPL_C_INIT_VAL(v_int64x2, int64, s64)
1777 //! @brief Helper macro
1778 //! @ingroup core_hal_intrin_impl
//! Expands to a function template v_reinterpret_as_<suffix> that accepts any
//! v_reg<_Tp0, n0> and forwards to its reinterpret_as<_Tp, _Tpvec::nlanes>() member,
//! returning the result as _Tpvec.
1779 #define OPENCV_HAL_IMPL_C_REINTERPRET(_Tpvec, _Tp, suffix) \
1780 template<typename _Tp0, int n0> inline _Tpvec \
1781 v_reinterpret_as_##suffix(const v_reg<_Tp0, n0>& a) \
1782 { return a.template reinterpret_as<_Tp, _Tpvec::nlanes>(); }
1784 //! @name Reinterpret
1786 //! @brief Convert vector to different type without modifying underlying data.
1787 OPENCV_HAL_IMPL_C_REINTERPRET(v_uint8x16, uchar, u8)
1788 OPENCV_HAL_IMPL_C_REINTERPRET(v_int8x16, schar, s8)
1789 OPENCV_HAL_IMPL_C_REINTERPRET(v_uint16x8, ushort, u16)
1790 OPENCV_HAL_IMPL_C_REINTERPRET(v_int16x8, short, s16)
1791 OPENCV_HAL_IMPL_C_REINTERPRET(v_uint32x4, unsigned, u32)
1792 OPENCV_HAL_IMPL_C_REINTERPRET(v_int32x4, int, s32)
1793 OPENCV_HAL_IMPL_C_REINTERPRET(v_float32x4, float, f32)
1794 OPENCV_HAL_IMPL_C_REINTERPRET(v_float64x2, double, f64)
1795 OPENCV_HAL_IMPL_C_REINTERPRET(v_uint64x2, uint64, u64)
1796 OPENCV_HAL_IMPL_C_REINTERPRET(v_int64x2, int64, s64)
1799 //! @brief Helper macro
1800 //! @ingroup core_hal_intrin_impl
//! Declares v_shl<n>(a) for the given vector type; the shift amount n is a template
//! argument and therefore a compile-time constant.
//! NOTE(review): the macro body is not visible in this excerpt.
1801 #define OPENCV_HAL_IMPL_C_SHIFTL(_Tpvec, _Tp) \
1802 template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
1805 //! @name Left shift
1807 //! @brief Shift left
1808 OPENCV_HAL_IMPL_C_SHIFTL(v_uint16x8, ushort)
1809 OPENCV_HAL_IMPL_C_SHIFTL(v_int16x8, short)
1810 OPENCV_HAL_IMPL_C_SHIFTL(v_uint32x4, unsigned)
1811 OPENCV_HAL_IMPL_C_SHIFTL(v_int32x4, int)
1812 OPENCV_HAL_IMPL_C_SHIFTL(v_uint64x2, uint64)
1813 OPENCV_HAL_IMPL_C_SHIFTL(v_int64x2, int64)
1816 //! @brief Helper macro
1817 //! @ingroup core_hal_intrin_impl
//! Declares v_shr<n>(a) for the given vector type; the shift amount n is a template
//! argument and therefore a compile-time constant.
//! NOTE(review): the macro body is not visible in this excerpt.
1818 #define OPENCV_HAL_IMPL_C_SHIFTR(_Tpvec, _Tp) \
1819 template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
1822 //! @name Right shift
1824 //! @brief Shift right
1825 OPENCV_HAL_IMPL_C_SHIFTR(v_uint16x8, ushort)
1826 OPENCV_HAL_IMPL_C_SHIFTR(v_int16x8, short)
1827 OPENCV_HAL_IMPL_C_SHIFTR(v_uint32x4, unsigned)
1828 OPENCV_HAL_IMPL_C_SHIFTR(v_int32x4, int)
1829 OPENCV_HAL_IMPL_C_SHIFTR(v_uint64x2, uint64)
1830 OPENCV_HAL_IMPL_C_SHIFTR(v_int64x2, int64)
1833 //! @brief Helper macro
1834 //! @ingroup core_hal_intrin_impl
//! Declares v_rshr<n>(a): every lane has the bias 1 << (n - 1) (half of the divisor
//! 2^n) added before being shifted right by n, and the result is converted back
//! to _Tp.
//! NOTE(review): n is expected to be at least 1; with n == 0 the bias would be
//! computed as a shift by -1, which is undefined behaviour.
1835 #define OPENCV_HAL_IMPL_C_RSHIFTR(_Tpvec, _Tp) \
1836 template<int n> inline _Tpvec v_rshr(const _Tpvec& a) \
1839 for( int i = 0; i < _Tpvec::nlanes; i++ ) \
1840 c.s[i] = (_Tp)((a.s[i] + ((_Tp)1 << (n - 1))) >> n); \
1844 //! @name Rounding shift
1846 //! @brief Rounding shift right
1847 OPENCV_HAL_IMPL_C_RSHIFTR(v_uint16x8, ushort)
1848 OPENCV_HAL_IMPL_C_RSHIFTR(v_int16x8, short)
1849 OPENCV_HAL_IMPL_C_RSHIFTR(v_uint32x4, unsigned)
1850 OPENCV_HAL_IMPL_C_RSHIFTR(v_int32x4, int)
1851 OPENCV_HAL_IMPL_C_RSHIFTR(v_uint64x2, uint64)
1852 OPENCV_HAL_IMPL_C_RSHIFTR(v_int64x2, int64)
1855 //! @brief Helper macro
1856 //! @ingroup core_hal_intrin_impl
//! Declares v_<pack_suffix>(a, b): the lanes of a are converted with cast<_Tpn> into
//! the lower half of the result and the lanes of b into the upper half. The cast
//! argument selects the conversion (saturate_cast or static_cast in the
//! instantiations listed after the macro).
1857 #define OPENCV_HAL_IMPL_C_PACK(_Tpvec, _Tpnvec, _Tpn, pack_suffix, cast) \
1858 inline _Tpnvec v_##pack_suffix(const _Tpvec& a, const _Tpvec& b) \
1861 for( int i = 0; i < _Tpvec::nlanes; i++ ) \
1863 c.s[i] = cast<_Tpn>(a.s[i]); \
1864 c.s[i+_Tpvec::nlanes] = cast<_Tpn>(b.s[i]); \
1871 //! @brief Pack values from two vectors to one
1873 //! The returned vector type has twice as many elements as the input vector types. Variant with _u_ suffix also
1874 //! converts to corresponding unsigned type.
1876 //! - pack: for 16-, 32- and 64-bit integer input types
1877 //! - pack_u: for 16- and 32-bit signed integer input types
1879 //! @note All variants except 64-bit use saturation.
1880 OPENCV_HAL_IMPL_C_PACK(v_uint16x8, v_uint8x16, uchar, pack, saturate_cast)
1881 OPENCV_HAL_IMPL_C_PACK(v_int16x8, v_int8x16, schar, pack, saturate_cast)
1882 OPENCV_HAL_IMPL_C_PACK(v_uint32x4, v_uint16x8, ushort, pack, saturate_cast)
1883 OPENCV_HAL_IMPL_C_PACK(v_int32x4, v_int16x8, short, pack, saturate_cast)
1884 OPENCV_HAL_IMPL_C_PACK(v_uint64x2, v_uint32x4, unsigned, pack, static_cast)
1885 OPENCV_HAL_IMPL_C_PACK(v_int64x2, v_int32x4, int, pack, static_cast)
1886 OPENCV_HAL_IMPL_C_PACK(v_int16x8, v_uint8x16, uchar, pack_u, saturate_cast)
1887 OPENCV_HAL_IMPL_C_PACK(v_int32x4, v_uint16x8, ushort, pack_u, saturate_cast)
1890 //! @brief Helper macro
1891 //! @ingroup core_hal_intrin_impl
//! Declares v_rshr_<pack_suffix><n>(a, b): every lane has the bias 1 << (n - 1)
//! added, is shifted right by n and is then converted with cast<_Tpn>; lanes of a
//! fill the lower half of the result, lanes of b the upper half.
//! NOTE(review): n is expected to be at least 1; with n == 0 the bias would be
//! computed as a shift by -1, which is undefined behaviour.
1892 #define OPENCV_HAL_IMPL_C_RSHR_PACK(_Tpvec, _Tp, _Tpnvec, _Tpn, pack_suffix, cast) \
1893 template<int n> inline _Tpnvec v_rshr_##pack_suffix(const _Tpvec& a, const _Tpvec& b) \
1896 for( int i = 0; i < _Tpvec::nlanes; i++ ) \
1898 c.s[i] = cast<_Tpn>((a.s[i] + ((_Tp)1 << (n - 1))) >> n); \
1899 c.s[i+_Tpvec::nlanes] = cast<_Tpn>((b.s[i] + ((_Tp)1 << (n - 1))) >> n); \
1904 //! @name Pack with rounding shift
1906 //! @brief Pack values from two vectors to one with rounding shift
1908 //! Values from the input vectors will be shifted right by _n_ bits with rounding, converted to narrower
1909 //! type and returned in the result vector. Variant with _u_ suffix converts to unsigned type.
1911 //! - pack: for 16-, 32- and 64-bit integer input types
1912 //! - pack_u: for 16- and 32-bit signed integer input types
1914 //! @note All variants except 64-bit use saturation.
1915 OPENCV_HAL_IMPL_C_RSHR_PACK(v_uint16x8, ushort, v_uint8x16, uchar, pack, saturate_cast)
1916 OPENCV_HAL_IMPL_C_RSHR_PACK(v_int16x8, short, v_int8x16, schar, pack, saturate_cast)
1917 OPENCV_HAL_IMPL_C_RSHR_PACK(v_uint32x4, unsigned, v_uint16x8, ushort, pack, saturate_cast)
1918 OPENCV_HAL_IMPL_C_RSHR_PACK(v_int32x4, int, v_int16x8, short, pack, saturate_cast)
1919 OPENCV_HAL_IMPL_C_RSHR_PACK(v_uint64x2, uint64, v_uint32x4, unsigned, pack, static_cast)
1920 OPENCV_HAL_IMPL_C_RSHR_PACK(v_int64x2, int64, v_int32x4, int, pack, static_cast)
1921 OPENCV_HAL_IMPL_C_RSHR_PACK(v_int16x8, short, v_uint8x16, uchar, pack_u, saturate_cast)
1922 OPENCV_HAL_IMPL_C_RSHR_PACK(v_int32x4, int, v_uint16x8, ushort, pack_u, saturate_cast)
1925 //! @brief Helper macro
1926 //! @ingroup core_hal_intrin_impl
//! Declares v_<pack_suffix>_store(ptr, a): every lane of a is converted with
//! cast<_Tpn> and written to ptr[i], so _Tpvec::nlanes narrow elements are stored.
//! The _Tp and _Tpnvec arguments are not used by the visible part of the expansion.
1927 #define OPENCV_HAL_IMPL_C_PACK_STORE(_Tpvec, _Tp, _Tpnvec, _Tpn, pack_suffix, cast) \
1928 inline void v_##pack_suffix##_store(_Tpn* ptr, const _Tpvec& a) \
1930 for( int i = 0; i < _Tpvec::nlanes; i++ ) \
1931 ptr[i] = cast<_Tpn>(a.s[i]); \
1934 //! @name Pack and store
1936 //! @brief Store values from the input vector into memory with pack
1938 //! Values will be stored into memory with conversion to narrower type.
1939 //! Variant with _u_ suffix converts to corresponding unsigned type.
1941 //! - pack: for 16-, 32- and 64-bit integer input types
1942 //! - pack_u: for 16- and 32-bit signed integer input types
1944 //! @note All variants except 64-bit use saturation.
1945 OPENCV_HAL_IMPL_C_PACK_STORE(v_uint16x8, ushort, v_uint8x16, uchar, pack, saturate_cast)
1946 OPENCV_HAL_IMPL_C_PACK_STORE(v_int16x8, short, v_int8x16, schar, pack, saturate_cast)
1947 OPENCV_HAL_IMPL_C_PACK_STORE(v_uint32x4, unsigned, v_uint16x8, ushort, pack, saturate_cast)
1948 OPENCV_HAL_IMPL_C_PACK_STORE(v_int32x4, int, v_int16x8, short, pack, saturate_cast)
1949 OPENCV_HAL_IMPL_C_PACK_STORE(v_uint64x2, uint64, v_uint32x4, unsigned, pack, static_cast)
1950 OPENCV_HAL_IMPL_C_PACK_STORE(v_int64x2, int64, v_int32x4, int, pack, static_cast)
1951 OPENCV_HAL_IMPL_C_PACK_STORE(v_int16x8, short, v_uint8x16, uchar, pack_u, saturate_cast)
1952 OPENCV_HAL_IMPL_C_PACK_STORE(v_int32x4, int, v_uint16x8, ushort, pack_u, saturate_cast)
1955 //! @brief Helper macro
1956 //! @ingroup core_hal_intrin_impl
//! Declares v_rshr_<pack_suffix>_store<n>(ptr, a): every lane of a has the bias
//! 1 << (n - 1) added, is shifted right by n, converted with cast<_Tpn> and written
//! to ptr[i], so _Tpvec::nlanes narrow elements are stored.
//! NOTE(review): n is expected to be at least 1; with n == 0 the bias would be
//! computed as a shift by -1, which is undefined behaviour.
1957 #define OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(_Tpvec, _Tp, _Tpnvec, _Tpn, pack_suffix, cast) \
1958 template<int n> inline void v_rshr_##pack_suffix##_store(_Tpn* ptr, const _Tpvec& a) \
1960 for( int i = 0; i < _Tpvec::nlanes; i++ ) \
1961 ptr[i] = cast<_Tpn>((a.s[i] + ((_Tp)1 << (n - 1))) >> n); \
1964 //! @name Pack and store with rounding shift
1966 //! @brief Store values from the input vector into memory with pack
1968 //! Values will be shifted _n_ bits right with rounding, converted to narrower type and stored into
1969 //! memory. Variant with _u_ suffix converts to unsigned type.
1971 //! - pack: for 16-, 32- and 64-bit integer input types
1972 //! - pack_u: for 16- and 32-bit signed integer input types
1974 //! @note All variants except 64-bit use saturation.
1975 OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_uint16x8, ushort, v_uint8x16, uchar, pack, saturate_cast)
1976 OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int16x8, short, v_int8x16, schar, pack, saturate_cast)
1977 OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_uint32x4, unsigned, v_uint16x8, ushort, pack, saturate_cast)
1978 OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int32x4, int, v_int16x8, short, pack, saturate_cast)
1979 OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_uint64x2, uint64, v_uint32x4, unsigned, pack, static_cast)
1980 OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int64x2, int64, v_int32x4, int, pack, static_cast)
1981 OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int16x8, short, v_uint8x16, uchar, pack_u, saturate_cast)
1982 OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int32x4, int, v_uint16x8, ushort, pack_u, saturate_cast)
1985 /** @brief Matrix multiplication
1992 {D0 D1 D2 D3} x |V3|
1993 ====================
1994 {R0 R1 R2 R3}, where:
1995 R0 = A0V0 + A1V1 + A2V2 + A3V3,
1996 R1 = B0V0 + B1V1 + B2V2 + B3V3
// Lane j of the result is v[0]*m0[j] + v[1]*m1[j] + v[2]*m2[j] + v[3]*m3[j], i.e.
// the linear combination of the registers m0..m3 weighted by the four lanes of v.
2000 inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
2001 const v_float32x4& m1, const v_float32x4& m2,
2002 const v_float32x4& m3)
2004 return v_float32x4(v.s[0]*m0.s[0] + v.s[1]*m1.s[0] + v.s[2]*m2.s[0] + v.s[3]*m3.s[0],
2005 v.s[0]*m0.s[1] + v.s[1]*m1.s[1] + v.s[2]*m2.s[1] + v.s[3]*m3.s[1],
2006 v.s[0]*m0.s[2] + v.s[1]*m1.s[2] + v.s[2]*m2.s[2] + v.s[3]*m3.s[2],
2007 v.s[0]*m0.s[3] + v.s[1]*m1.s[3] + v.s[2]*m2.s[3] + v.s[3]*m3.s[3]);
2010 /** @brief Matrix multiplication and add
2014 {A0 A1 A2 } |V0| |D0|
2015 {B0 B1 B2 } |V1| |D1|
2016 {C0 C1 C2 } x |V2| + |D2|
2017 ====================
2018 {R0 R1 R2 R3}, where:
2019 R0 = A0V0 + A1V1 + A2V2 + D0,
2020 R1 = B0V0 + B1V1 + B2V2 + D1
// Lane j of the result is v[0]*m0[j] + v[1]*m1[j] + v[2]*m2[j] + m3[j]: m3 is added
// as-is (it is the additive term) and the fourth lane of v is not read.
2024 inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
2025 const v_float32x4& m1, const v_float32x4& m2,
2026 const v_float32x4& m3)
2028 return v_float32x4(v.s[0]*m0.s[0] + v.s[1]*m1.s[0] + v.s[2]*m2.s[0] + m3.s[0],
2029 v.s[0]*m0.s[1] + v.s[1]*m1.s[1] + v.s[2]*m2.s[1] + m3.s[1],
2030 v.s[0]*m0.s[2] + v.s[1]*m1.s[2] + v.s[2]*m2.s[2] + m3.s[2],
2031 v.s[0]*m0.s[3] + v.s[1]*m1.s[3] + v.s[2]*m2.s[3] + m3.s[3]);
// No-op in this implementation: the function body is empty.
2034 inline void v_cleanup() {}
2038 //! @name Check SIMD support
2040 //! @brief Check CPU capability of SIMD operation
//! NOTE(review): the function body is not visible in this excerpt, so the value it
//! returns cannot be stated here - confirm against the full file.
2041 static inline bool hasSIMD128()
2049 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END