1 /*M///////////////////////////////////////////////////////////////////////////////////////
3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
5 // By downloading, copying, installing or using the software you agree to this license.
6 // If you do not agree to this license, do not download, install,
7 // copy or use the software.
11 // For Open Source Computer Vision Library
13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
15 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
16 // Copyright (C) 2015, Itseez Inc., all rights reserved.
17 // Third party copyrights are property of their respective owners.
19 // Redistribution and use in source and binary forms, with or without modification,
20 // are permitted provided that the following conditions are met:
22 // * Redistribution's of source code must retain the above copyright notice,
23 // this list of conditions and the following disclaimer.
25 // * Redistribution's in binary form must reproduce the above copyright notice,
26 // this list of conditions and the following disclaimer in the documentation
27 // and/or other materials provided with the distribution.
29 // * The name of the copyright holders may not be used to endorse or promote products
30 // derived from this software without specific prior written permission.
32 // This software is provided by the copyright holders and contributors "as is" and
33 // any express or implied warranties, including, but not limited to, the implied
34 // warranties of merchantability and fitness for a particular purpose are disclaimed.
35 // In no event shall the Intel Corporation or contributors be liable for any direct,
36 // indirect, incidental, special, exemplary, or consequential damages
37 // (including, but not limited to, procurement of substitute goods or services;
38 // loss of use, data, or profits; or business interruption) however caused
39 // and on any theory of liability, whether in contract, strict liability,
40 // or tort (including negligence or otherwise) arising in any way out of
41 // the use of this software, even if advised of the possibility of such damage.
45 #ifndef OPENCV_HAL_INTRIN_CPP_HPP
46 #define OPENCV_HAL_INTRIN_CPP_HPP
51 #include "opencv2/core/saturate.hpp"
57 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
60 /** @addtogroup core_hal_intrin
62 "Universal intrinsics" is a set of types and functions intended to simplify vectorization of code on
63 different platforms. Currently there are two supported SIMD extensions: __SSE/SSE2__ on x86
64 architectures and __NEON__ on ARM architectures, both allow working with 128 bit registers
65 containing packed values of different types. When no SIMD extension is available
66 during compilation, the fallback C++ implementation of the intrinsics is chosen and code will work as
67 expected, although it could be slower.
71 There are several types representing 128-bit register as a vector of packed values, each type is
72 implemented as a structure based on one SIMD register.
74 - cv::v_uint8x16 and cv::v_int8x16: sixteen 8-bit integer values (unsigned/signed) - char
75 - cv::v_uint16x8 and cv::v_int16x8: eight 16-bit integer values (unsigned/signed) - short
76 - cv::v_uint32x4 and cv::v_int32x4: four 32-bit integer values (unsigned/signed) - int
77 - cv::v_uint64x2 and cv::v_int64x2: two 64-bit integer values (unsigned/signed) - int64
78 - cv::v_float32x4: four 32-bit floating point values (signed) - float
79 - cv::v_float64x2: two 64-bit floating point values (signed) - double
82 cv::v_float64x2 is not implemented in NEON variant, if you want to use this type, don't forget to
83 check the CV_SIMD128_64F preprocessor definition:
90 ### Load and store operations
92 These operations allow setting the contents of the register explicitly or loading it from a memory
93 block, and saving the contents of the register to a memory block.
96 @ref v_reg::v_reg(const _Tp *ptr) "from memory",
97 @ref v_reg::v_reg(_Tp s0, _Tp s1) "from two values", ...
98 - Other create methods:
99 @ref v_setall_s8, @ref v_setall_u8, ...,
100 @ref v_setzero_u8, @ref v_setzero_s8, ...
102 @ref v_load, @ref v_load_aligned, @ref v_load_low, @ref v_load_halves,
103 @ref v_store, @ref v_store_aligned,
104 @ref v_store_high, @ref v_store_low
108 These operations allow reordering or recombining elements in one or multiple vectors.
110 - Interleave, deinterleave (2, 3 and 4 channels): @ref v_load_deinterleave, @ref v_store_interleave
111 - Expand: @ref v_load_expand, @ref v_load_expand_q, @ref v_expand
112 - Pack: @ref v_pack, @ref v_pack_u, @ref v_rshr_pack, @ref v_rshr_pack_u,
113 @ref v_pack_store, @ref v_pack_u_store, @ref v_rshr_pack_store, @ref v_rshr_pack_u_store
114 - Recombine: @ref v_zip, @ref v_recombine, @ref v_combine_low, @ref v_combine_high
115 - Extract: @ref v_extract
118 ### Arithmetic, bitwise and comparison operations
120 Element-wise binary and unary operations.
123 @ref operator +(const v_reg &a, const v_reg &b) "+",
124 @ref operator -(const v_reg &a, const v_reg &b) "-",
125 @ref operator *(const v_reg &a, const v_reg &b) "*",
126 @ref operator /(const v_reg &a, const v_reg &b) "/",
129 - Non-saturating arithmetics: @ref v_add_wrap, @ref v_sub_wrap
132 @ref operator <<(const v_reg &a, int s) "<<",
133 @ref operator >>(const v_reg &a, int s) ">>",
134 @ref v_shl, @ref v_shr
137 @ref operator&(const v_reg &a, const v_reg &b) "&",
138 @ref operator |(const v_reg &a, const v_reg &b) "|",
139 @ref operator ^(const v_reg &a, const v_reg &b) "^",
140 @ref operator ~(const v_reg &a) "~"
143 @ref operator >(const v_reg &a, const v_reg &b) ">",
144 @ref operator >=(const v_reg &a, const v_reg &b) ">=",
145 @ref operator <(const v_reg &a, const v_reg &b) "<",
146 @ref operator <=(const v_reg &a, const v_reg &b) "<=",
147 @ref operator==(const v_reg &a, const v_reg &b) "==",
148 @ref operator !=(const v_reg &a, const v_reg &b) "!="
150 - min/max: @ref v_min, @ref v_max
154 Most of these operations return only one value.
156 - Reduce: @ref v_reduce_min, @ref v_reduce_max, @ref v_reduce_sum, @ref v_popcount
157 - Mask: @ref v_signmask, @ref v_check_all, @ref v_check_any, @ref v_select
161 - Some frequent operations: @ref v_sqrt, @ref v_invsqrt, @ref v_magnitude, @ref v_sqr_magnitude
162 - Absolute values: @ref v_abs, @ref v_absdiff
166 Different type conversions and casts:
168 - Rounding: @ref v_round, @ref v_floor, @ref v_ceil, @ref v_trunc,
169 - To float: @ref v_cvt_f32, @ref v_cvt_f64
170 - Reinterpret: @ref v_reinterpret_as_u8, @ref v_reinterpret_as_s8, ...
172 ### Matrix operations
174 In these operations vectors represent matrix rows/columns: @ref v_dotprod, @ref v_matmul, @ref v_transpose4x4
178 Most operations are implemented only for some subset of the available types; the following matrices
179 show the applicability of different operations to the types.
183 | Operations\\Types | uint 8x16 | int 8x16 | uint 16x8 | int 16x8 | uint 32x4 | int 32x4 |
184 |-------------------|:-:|:-:|:-:|:-:|:-:|:-:|
185 |load, store | x | x | x | x | x | x |
186 |interleave | x | x | x | x | x | x |
187 |expand | x | x | x | x | x | x |
188 |expand_q | x | x | | | | |
189 |add, sub | x | x | x | x | x | x |
190 |add_wrap, sub_wrap | x | x | x | x | | |
191 |mul | | | x | x | x | x |
192 |mul_expand | | | x | x | x | |
193 |compare | x | x | x | x | x | x |
194 |shift | | | x | x | x | x |
195 |dotprod | | | | x | | |
196 |logical | x | x | x | x | x | x |
197 |min, max | x | x | x | x | x | x |
198 |absdiff | x | x | x | x | x | x |
199 |reduce | | | | | x | x |
200 |mask | x | x | x | x | x | x |
201 |pack | x | x | x | x | x | x |
202 |pack_u | x | | x | | | |
203 |unpack | x | x | x | x | x | x |
204 |extract | x | x | x | x | x | x |
205 |rotate (lanes) | x | x | x | x | x | x |
206 |cvt_flt32 | | | | | | x |
207 |cvt_flt64 | | | | | | x |
208 |transpose4x4 | | | | | x | x |
212 | Operations\\Types | uint 64x2 | int 64x2 |
213 |-------------------|:-:|:-:|
214 |load, store | x | x |
219 |rotate (lanes) | x | x |
223 | Operations\\Types | float 32x4 | float 64x2 |
224 |-------------------|:-:|:-:|
225 |load, store | x | x |
239 |float math | x | x |
240 |transpose4x4 | x | |
242 |rotate (lanes) | x | x |
// Portable fallback register: a fixed-size vector of n lanes of type _Tp.
// The lane values are accessed below through the member array s (s[i]).
// NOTE(review): comments added in this struct describe only the statements
// visible here; anything marked "presumably" must be confirmed.
246 template<typename _Tp, int n> struct v_reg
249 typedef _Tp lane_type;
253 /** @brief Constructor
255 Initializes register with data from memory
256 @param ptr pointer to memory block with data for register */
// Copies n consecutive elements starting at ptr into lanes 0..n-1.
257 explicit v_reg(const _Tp* ptr) { for( int i = 0; i < n; i++ ) s[i] = ptr[i]; }
259 /** @brief Constructor
261 Initializes register with two 64-bit values */
// Writes lanes 0 and 1 only.
262 v_reg(_Tp s0, _Tp s1) { s[0] = s0; s[1] = s1; }
264 /** @brief Constructor
266 Initializes register with four 32-bit values */
// Writes lanes 0..3 only.
267 v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3) { s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3; }
269 /** @brief Constructor
271 Initializes register with eight 16-bit values */
// Writes lanes 0..7 only.
272 v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3,
273 _Tp s4, _Tp s5, _Tp s6, _Tp s7)
275 s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3;
276 s[4] = s4; s[5] = s5; s[6] = s6; s[7] = s7;
279 /** @brief Constructor
281 Initializes register with sixteen 8-bit values */
// Writes lanes 0..15.
282 v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3,
283 _Tp s4, _Tp s5, _Tp s6, _Tp s7,
284 _Tp s8, _Tp s9, _Tp s10, _Tp s11,
285 _Tp s12, _Tp s13, _Tp s14, _Tp s15)
287 s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3;
288 s[4] = s4; s[5] = s5; s[6] = s6; s[7] = s7;
289 s[8] = s8; s[9] = s9; s[10] = s10; s[11] = s11;
290 s[12] = s12; s[13] = s13; s[14] = s14; s[15] = s15;
293 /** @brief Default constructor
295 Does not initialize anything*/
298 /** @brief Copy constructor */
// NOTE(review): loops over all n lanes; presumably copies r.s[i] into s[i]. Confirm.
299 v_reg(const v_reg<_Tp, n> & r)
301 for( int i = 0; i < n; i++ )
304 /** @brief Access first value
306 Returns value of the first lane according to register type, for example:
308 v_int32x4 r(1, 2, 3, 4);
309 int v = r.get0(); // returns 1
311 uint64_t v = r.get0(); // returns 1
314 _Tp get0() const { return s[0]; }
// Returns lane i; the index is used as-is, with no range check.
317 _Tp get(const int i) const { return s[i]; }
// NOTE(review): only the loop header of high() is visible; it walks the first
// n/2 lanes, presumably filling them from the upper half. Confirm.
318 v_reg<_Tp, n> high() const
322 for( i = 0; i < n/2; i++ )
// NOTE(review): loops over all n lanes; presumably returns a register with
// every lane set to zero. Confirm.
330 static v_reg<_Tp, n> zero()
333 for( int i = 0; i < n; i++ )
// NOTE(review): loops over all n lanes; presumably returns a register with
// every lane set to s. Confirm.
338 static v_reg<_Tp, n> all(_Tp s)
341 for( int i = 0; i < n; i++ )
// Reinterprets the raw lane bytes as a v_reg<_Tp2, n2>. The memcpy length is
// min(sizeof(_Tp2)*n2, sizeof(_Tp)*n), so nothing past the smaller of the two
// registers is read or written.
346 template<typename _Tp2, int n2> v_reg<_Tp2, n2> reinterpret_as() const
348 size_t bytes = std::min(sizeof(_Tp2)*n2, sizeof(_Tp)*n);
350 std::memcpy(&c.s[0], &s[0], bytes);
// Concrete register aliases of v_reg: one per lane type / lane count pair
// listed in the per-alias descriptions below.
358 /** @brief Sixteen 8-bit unsigned integer values */
359 typedef v_reg<uchar, 16> v_uint8x16;
360 /** @brief Sixteen 8-bit signed integer values */
361 typedef v_reg<schar, 16> v_int8x16;
362 /** @brief Eight 16-bit unsigned integer values */
363 typedef v_reg<ushort, 8> v_uint16x8;
364 /** @brief Eight 16-bit signed integer values */
365 typedef v_reg<short, 8> v_int16x8;
366 /** @brief Four 32-bit unsigned integer values */
367 typedef v_reg<unsigned, 4> v_uint32x4;
368 /** @brief Four 32-bit signed integer values */
369 typedef v_reg<int, 4> v_int32x4;
370 /** @brief Four 32-bit floating point values (single precision) */
371 typedef v_reg<float, 4> v_float32x4;
372 /** @brief Two 64-bit floating point values (double precision) */
373 typedef v_reg<double, 2> v_float64x2;
374 /** @brief Two 64-bit unsigned integer values */
375 typedef v_reg<uint64, 2> v_uint64x2;
376 /** @brief Two 64-bit signed integer values */
377 typedef v_reg<int64, 2> v_int64x2;
379 //! @brief Helper macro
380 //! @ingroup core_hal_intrin_impl
//! Generates the element-wise binary operator `bin_op` and its compound
//! assignment form `bin_op=` for v_reg<_Tp, n>. Every lane result is passed
//! through saturate_cast<_Tp> before being stored.
381 #define OPENCV_HAL_IMPL_BIN_OP(bin_op) \
382 template<typename _Tp, int n> inline v_reg<_Tp, n> \
383 operator bin_op (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
386 for( int i = 0; i < n; i++ ) \
387 c.s[i] = saturate_cast<_Tp>(a.s[i] bin_op b.s[i]); \
390 template<typename _Tp, int n> inline v_reg<_Tp, n>& \
391 operator bin_op##= (v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
393 for( int i = 0; i < n; i++ ) \
394 a.s[i] = saturate_cast<_Tp>(a.s[i] bin_op b.s[i]); \
// Instantiations of OPENCV_HAL_IMPL_BIN_OP: each line below defines both the
// binary operator and its compound-assignment form for v_reg.
398 /** @brief Add values
401 OPENCV_HAL_IMPL_BIN_OP(+)
403 /** @brief Subtract values
406 OPENCV_HAL_IMPL_BIN_OP(-)
408 /** @brief Multiply values
410 For 16- and 32-bit integer types and floating types. */
411 OPENCV_HAL_IMPL_BIN_OP(*)
413 /** @brief Divide values
415 For floating types only. */
416 OPENCV_HAL_IMPL_BIN_OP(/)
418 //! @brief Helper macro
419 //! @ingroup core_hal_intrin_impl
//! Generates the element-wise bitwise operator `bit_op` and `bit_op=` for
//! v_reg<_Tp, n>. Each lane is converted with V_TypeTraits<_Tp>::reinterpret_int,
//! combined with `bit_op`, cast to the traits' int_type and converted back with
//! V_TypeTraits<_Tp>::reinterpret_from_int.
420 #define OPENCV_HAL_IMPL_BIT_OP(bit_op) \
421 template<typename _Tp, int n> inline v_reg<_Tp, n> operator bit_op \
422 (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
425 typedef typename V_TypeTraits<_Tp>::int_type itype; \
426 for( int i = 0; i < n; i++ ) \
427 c.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int((itype)(V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) bit_op \
428 V_TypeTraits<_Tp>::reinterpret_int(b.s[i]))); \
431 template<typename _Tp, int n> inline v_reg<_Tp, n>& operator \
432 bit_op##= (v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
434 typedef typename V_TypeTraits<_Tp>::int_type itype; \
435 for( int i = 0; i < n; i++ ) \
436 a.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int((itype)(V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) bit_op \
437 V_TypeTraits<_Tp>::reinterpret_int(b.s[i]))); \
// Instantiations of OPENCV_HAL_IMPL_BIT_OP: each line below defines both the
// binary operator and its compound-assignment form for v_reg.
441 /** @brief Bitwise AND
443 Only for integer types. */
444 OPENCV_HAL_IMPL_BIT_OP(&)
446 /** @brief Bitwise OR
448 Only for integer types. */
449 OPENCV_HAL_IMPL_BIT_OP(|)
451 /** @brief Bitwise XOR
453 Only for integer types.*/
454 OPENCV_HAL_IMPL_BIT_OP(^)
456 /** @brief Bitwise NOT
458 Only for integer types.*/
459 template<typename _Tp, int n> inline v_reg<_Tp, n> operator ~ (const v_reg<_Tp, n>& a)
462 for( int i = 0; i < n; i++ )
// Complement the lane's integer representation, then convert it back to _Tp.
464 c.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int(~V_TypeTraits<_Tp>::reinterpret_int(a.s[i]));
469 //! @brief Helper macro
470 //! @ingroup core_hal_intrin_impl
//! Generates `func(a)`, which applies `cfunc` to every lane of a
//! v_reg<_Tp, n> and stores the results in lanes of a v_reg<_Tp2, n>.
//! `cfunc` is pasted directly in front of `(a.s[i])`, so it may be a function
//! name, a cast expression or a type name.
471 #define OPENCV_HAL_IMPL_MATH_FUNC(func, cfunc, _Tp2) \
472 template<typename _Tp, int n> inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a) \
475 for( int i = 0; i < n; i++ ) \
476 c.s[i] = cfunc(a.s[i]); \
480 /** @brief Square root of elements
482 Only for floating point types.*/
483 OPENCV_HAL_IMPL_MATH_FUNC(v_sqrt, std::sqrt, _Tp)
// Per-lane sine, cosine, exponential and logarithm via the std:: functions;
// the result register keeps the source lane type _Tp.
486 OPENCV_HAL_IMPL_MATH_FUNC(v_sin, std::sin, _Tp)
487 OPENCV_HAL_IMPL_MATH_FUNC(v_cos, std::cos, _Tp)
488 OPENCV_HAL_IMPL_MATH_FUNC(v_exp, std::exp, _Tp)
489 OPENCV_HAL_IMPL_MATH_FUNC(v_log, std::log, _Tp)
492 /** @brief Absolute value of elements
494 Only for floating point types.*/
// The std::abs result is cast to V_TypeTraits<_Tp>::abs_type, which is also
// the lane type of the returned register.
495 OPENCV_HAL_IMPL_MATH_FUNC(v_abs, (typename V_TypeTraits<_Tp>::abs_type)std::abs,
496 typename V_TypeTraits<_Tp>::abs_type)
498 /** @brief Round elements
500 Only for floating point types.*/
501 OPENCV_HAL_IMPL_MATH_FUNC(v_round, cvRound, int)
503 /** @brief Floor elements
505 Only for floating point types.*/
506 OPENCV_HAL_IMPL_MATH_FUNC(v_floor, cvFloor, int)
508 /** @brief Ceil elements
510 Only for floating point types.*/
511 OPENCV_HAL_IMPL_MATH_FUNC(v_ceil, cvCeil, int)
513 /** @brief Truncate elements
515 Only for floating point types.*/
// Here cfunc is the type name `int`, so each lane is converted with a
// functional-style cast int(a.s[i]).
516 OPENCV_HAL_IMPL_MATH_FUNC(v_trunc, int, int)
518 //! @brief Helper macro
519 //! @ingroup core_hal_intrin_impl
//! Generates `func(a, b)`, which stores cfunc(a.s[i], b.s[i]) in lane i of the
//! result for every lane of two same-typed registers.
520 #define OPENCV_HAL_IMPL_MINMAX_FUNC(func, cfunc) \
521 template<typename _Tp, int n> inline v_reg<_Tp, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
524 for( int i = 0; i < n; i++ ) \
525 c.s[i] = cfunc(a.s[i], b.s[i]); \
529 //! @brief Helper macro
530 //! @ingroup core_hal_intrin_impl
//! Generates `func(a)`, a horizontal reduction that folds lanes 1..n-1 into a
//! scalar accumulator `c` with `cfunc`.
//! NOTE(review): the accumulator's initial value is not visible here;
//! presumably it starts from lane 0. Confirm.
531 #define OPENCV_HAL_IMPL_REDUCE_MINMAX_FUNC(func, cfunc) \
532 template<typename _Tp, int n> inline _Tp func(const v_reg<_Tp, n>& a) \
535 for( int i = 1; i < n; i++ ) \
536 c = cfunc(c, a.s[i]); \
// Element-wise (v_min, v_max) and horizontal (v_reduce_min, v_reduce_max)
// minimum/maximum, all built on std::min / std::max.
540 /** @brief Choose min values for each pair
547 {min(A1,B1) min(A2,B2) ...}
549 For all types except 64-bit integer. */
550 OPENCV_HAL_IMPL_MINMAX_FUNC(v_min, std::min)
552 /** @brief Choose max values for each pair
559 {max(A1,B1) max(A2,B2) ...}
561 For all types except 64-bit integer. */
562 OPENCV_HAL_IMPL_MINMAX_FUNC(v_max, std::max)
564 /** @brief Find one min value
568 {A1 A2 A3 ...} => min(A1,A2,A3,...)
570 For 32-bit integer and 32-bit floating point types. */
571 OPENCV_HAL_IMPL_REDUCE_MINMAX_FUNC(v_reduce_min, std::min)
573 /** @brief Find one max value
577 {A1 A2 A3 ...} => max(A1,A2,A3,...)
579 For 32-bit integer and 32-bit floating point types. */
580 OPENCV_HAL_IMPL_REDUCE_MINMAX_FUNC(v_reduce_max, std::max)
// Lookup table of 256 entries: entry b holds the number of 1 bits in the byte
// value b (0 for 0x00 up to 8 for 0xff). Indexed by v_popcount with each byte
// of its input.
582 static const unsigned char popCountTable[] =
584 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
585 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
586 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
587 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
588 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
589 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
590 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
591 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
592 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
593 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
594 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
595 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
596 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
597 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
598 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
599 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8,
601 /** @brief Count the 1 bits in the vector and return 4 values
605 {A1 A2 A3 ...} => popcount(A1)
607 Any types but result will be in v_uint32x4*/
608 template<typename _Tp, int n> inline v_uint32x4 v_popcount(const v_reg<_Tp, n>& a)
// View the input as bytes, then replace every byte with its bit count.
611 b = v_reinterpret_as_u8(a);
612 for( int i = 0; i < v_uint8x16::nlanes; i++ )
614 b.s[i] = popCountTable[b.s[i]];
// Each 32-bit result lane is the sum of the counts of four consecutive bytes.
617 for( int i = 0; i < v_uint32x4::nlanes; i++ )
619 c.s[i] = b.s[i*4] + b.s[i*4+1] + b.s[i*4+2] + b.s[i*4+3];
// Computes the element-wise minimum and maximum of a and b in a single pass.
// Lane i of minval receives std::min(a[i], b[i]) and lane i of maxval
// receives std::max(a[i], b[i]).
626 template<typename _Tp, int n>
627 inline void v_minmax( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
628 v_reg<_Tp, n>& minval, v_reg<_Tp, n>& maxval )
630 for( int i = 0; i < n; i++ )
632 minval.s[i] = std::min(a.s[i], b.s[i]);
633 maxval.s[i] = std::max(a.s[i], b.s[i]);
638 //! @brief Helper macro
639 //! @ingroup core_hal_intrin_impl
//! Generates the element-wise comparison operator `cmp_op` for v_reg<_Tp, n>.
//! The boolean result of each lane comparison is converted to int and
//! negated, giving 0 for false and -1 (all bits set) for true; that value is
//! cast to the traits' int_type and stored via reinterpret_from_int, so the
//! result register has the same lane type as the operands.
640 #define OPENCV_HAL_IMPL_CMP_OP(cmp_op) \
641 template<typename _Tp, int n> \
642 inline v_reg<_Tp, n> operator cmp_op(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
644 typedef typename V_TypeTraits<_Tp>::int_type itype; \
646 for( int i = 0; i < n; i++ ) \
647 c.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int((itype)-(int)(a.s[i] cmp_op b.s[i])); \
// Instantiations of OPENCV_HAL_IMPL_CMP_OP: every operator returns a mask
// register whose lanes are all-ones where the comparison holds and zero
// elsewhere.
651 /** @brief Less-than comparison
653 For all types except 64-bit integer values. */
654 OPENCV_HAL_IMPL_CMP_OP(<)
656 /** @brief Greater-than comparison
658 For all types except 64-bit integer values. */
659 OPENCV_HAL_IMPL_CMP_OP(>)
661 /** @brief Less-than or equal comparison
663 For all types except 64-bit integer values. */
664 OPENCV_HAL_IMPL_CMP_OP(<=)
666 /** @brief Greater-than or equal comparison
668 For all types except 64-bit integer values. */
669 OPENCV_HAL_IMPL_CMP_OP(>=)
671 /** @brief Equal comparison
673 For all types except 64-bit integer values. */
674 OPENCV_HAL_IMPL_CMP_OP(==)
676 /** @brief Not equal comparison
678 For all types except 64-bit integer values. */
679 OPENCV_HAL_IMPL_CMP_OP(!=)
681 //! @brief Helper macro
682 //! @ingroup core_hal_intrin_impl
//! Generates `func(a, b)`, which stores cast_op(a.s[i] bin_op b.s[i]) in lane
//! i of a v_reg<_Tp2, n>. Unlike OPENCV_HAL_IMPL_BIN_OP the conversion is
//! whatever `cast_op` the caller supplies, not saturate_cast.
683 #define OPENCV_HAL_IMPL_ADD_SUB_OP(func, bin_op, cast_op, _Tp2) \
684 template<typename _Tp, int n> \
685 inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
687 typedef _Tp2 rtype; \
689 for( int i = 0; i < n; i++ ) \
690 c.s[i] = cast_op(a.s[i] bin_op b.s[i]); \
// Both functions convert the per-lane result with a plain (_Tp) cast instead
// of saturate_cast, which is what makes them non-saturating.
694 /** @brief Add values without saturation
696 For 8- and 16-bit integer values. */
697 OPENCV_HAL_IMPL_ADD_SUB_OP(v_add_wrap, +, (_Tp), _Tp)
699 /** @brief Subtract values without saturation
701 For 8- and 16-bit integer values. */
702 OPENCV_HAL_IMPL_ADD_SUB_OP(v_sub_wrap, -, (_Tp), _Tp)
// Scalar helper: returns a - b when a > b, otherwise b - a, so the larger
// operand is always on the left of the subtraction.
705 template<typename T> inline T _absdiff(T a, T b)
707 return a > b ? a - b : b - a;
711 /** @brief Absolute difference
713 Returns \f$ |a - b| \f$ converted to corresponding unsigned type.
716 v_int32x4 a, b; // {1, 2, 3, 4} and {4, 3, 2, 1}
717 v_uint32x4 c = v_absdiff(a, b); // result is {3, 1, 1, 3}
719 For 8-, 16-, 32-bit integer source types. */
720 template<typename _Tp, int n>
721 inline v_reg<typename V_TypeTraits<_Tp>::abs_type, n> v_absdiff(const v_reg<_Tp, n>& a, const v_reg<_Tp, n> & b)
723 typedef typename V_TypeTraits<_Tp>::abs_type rtype;
// mask is the top bit of rtype when _Tp is signed and 0 when it is unsigned.
// XOR-ing a lane with it flips the sign bit, so both operands can be compared
// and subtracted as rtype values by _absdiff.
// NOTE(review): `1 << (sizeof(rtype)*8 - 1)` is evaluated in int; for a 32-bit
// rtype this shifts into the sign bit before the cast. Confirm this is
// acceptable for all supported compilers.
725 const rtype mask = (rtype)(std::numeric_limits<_Tp>::is_signed ? (1 << (sizeof(rtype)*8 - 1)) : 0);
726 for( int i = 0; i < n; i++ )
728 rtype ua = a.s[i] ^ mask;
729 rtype ub = b.s[i] ^ mask;
730 c.s[i] = _absdiff(ua, ub);
// Overload for float registers: applies the scalar _absdiff helper to each
// pair of lanes directly, with no sign-bit masking.
737 For 32-bit floating point values */
738 inline v_float32x4 v_absdiff(const v_float32x4& a, const v_float32x4& b)
741 for( int i = 0; i < c.nlanes; i++ )
742 c.s[i] = _absdiff(a.s[i], b.s[i]);
// Overload for double registers: applies the scalar _absdiff helper to each
// pair of lanes directly, with no sign-bit masking.
748 For 64-bit floating point values */
749 inline v_float64x2 v_absdiff(const v_float64x2& a, const v_float64x2& b)
752 for( int i = 0; i < c.nlanes; i++ )
753 c.s[i] = _absdiff(a.s[i], b.s[i]);
757 /** @brief Inverse square root
759 Returns \f$ 1/sqrt(a) \f$
760 For floating point types only. */
761 template<typename _Tp, int n>
762 inline v_reg<_Tp, n> v_invsqrt(const v_reg<_Tp, n>& a)
765 for( int i = 0; i < n; i++ )
// The numerator is the float literal 1.f; with double lanes it is promoted to
// double before the division.
766 c.s[i] = 1.f/std::sqrt(a.s[i]);
// Element-wise magnitude: lane i of the result is sqrt(a[i]*a[i] + b[i]*b[i]).
// The squares are formed directly; no rescaling is applied before summing.
772 Returns \f$ sqrt(a^2 + b^2) \f$
773 For floating point types only. */
774 template<typename _Tp, int n>
775 inline v_reg<_Tp, n> v_magnitude(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
778 for( int i = 0; i < n; i++ )
779 c.s[i] = std::sqrt(a.s[i]*a.s[i] + b.s[i]*b.s[i]);
783 /** @brief Square of the magnitude
785 Returns \f$ a^2 + b^2 \f$
786 For floating point types only. */
// Same sum of squares as v_magnitude, without the final square root.
787 template<typename _Tp, int n>
788 inline v_reg<_Tp, n> v_sqr_magnitude(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
791 for( int i = 0; i < n; i++ )
792 c.s[i] = a.s[i]*a.s[i] + b.s[i]*b.s[i];
796 /** @brief Multiply and add
798 Returns \f$ a*b + c \f$
799 For floating point types and signed 32bit int only. */
800 template<typename _Tp, int n>
801 inline v_reg<_Tp, n> v_fma(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
802 const v_reg<_Tp, n>& c)
805 for( int i = 0; i < n; i++ )
// Written as an ordinary multiply followed by an add; std::fma is not called.
806 d.s[i] = a.s[i]*b.s[i] + c.s[i];
810 /** @brief A synonym for v_fma */
// Forwards all three arguments unchanged to v_fma and returns its result.
811 template<typename _Tp, int n>
812 inline v_reg<_Tp, n> v_muladd(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
813 const v_reg<_Tp, n>& c)
815 return v_fma(a, b, c);
818 /** @brief Dot product of elements
820 Multiply values in two registers and sum adjacent result pairs.
823 {A1 A2 ...} // 16-bit
824 x {B1 B2 ...} // 16-bit
826 {A1B1+A2B2 ...} // 32-bit
828 Implemented only for 16-bit signed source type (v_int16x8).
830 template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>
831 v_dotprod(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
833 typedef typename V_TypeTraits<_Tp>::w_type w_type;
834 v_reg<w_type, n/2> c;
835 for( int i = 0; i < (n/2); i++ )
// The left factor of each product is cast to w_type first, so the
// multiplication and the sum are carried out in the wider type.
836 c.s[i] = (w_type)a.s[i*2]*b.s[i*2] + (w_type)a.s[i*2+1]*b.s[i*2+1];
840 /** @brief Dot product of elements
842 Same as cv::v_dotprod, but add a third element to the sum of adjacent pairs.
845 {A1 A2 ...} // 16-bit
846 x {B1 B2 ...} // 16-bit
848 {A1B1+A2B2+C1 ...} // 32-bit
851 Implemented only for 16-bit signed source type (v_int16x8).
853 template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>
854 v_dotprod(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, const v_reg<typename V_TypeTraits<_Tp>::w_type, n / 2>& c)
856 typedef typename V_TypeTraits<_Tp>::w_type w_type;
857 v_reg<w_type, n/2> s;
858 for( int i = 0; i < (n/2); i++ )
// Pair products are formed in w_type (left factor cast first); lane i of the
// already-wide accumulator c is then added to pair i.
859 s.s[i] = (w_type)a.s[i*2]*b.s[i*2] + (w_type)a.s[i*2+1]*b.s[i*2+1] + c.s[i];
863 /** @brief Multiply and expand
865 Multiply values in two registers and store results in two registers with wider pack type.
869 x {E F G H} // 32-bit
876 v_uint32x4 a, b; // {1,2,3,4} and {2,2,2,2}
877 v_uint64x2 c, d; // results
878 v_mul_expand(a, b, c, d); // c, d = {2,4}, {6, 8}
880 Implemented only for 16- and unsigned 32-bit source types (v_int16x8, v_uint16x8, v_uint32x4).
882 template<typename _Tp, int n> inline void v_mul_expand(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
883 v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& c,
884 v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& d)
886 typedef typename V_TypeTraits<_Tp>::w_type w_type;
887 for( int i = 0; i < (n/2); i++ )
// c receives the widened products of the lower n/2 lanes, d those of the
// upper n/2 lanes; the left factor is cast to w_type before multiplying.
889 c.s[i] = (w_type)a.s[i]*b.s[i];
890 d.s[i] = (w_type)a.s[i+(n/2)]*b.s[i+(n/2)];
// Pairwise horizontal add with widening: lane i of c is a[2*i] + a[2*i+1],
// with the first operand cast to w_type before the addition.
895 template<typename _Tp, int n> inline void v_hsum(const v_reg<_Tp, n>& a,
896 v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& c)
898 typedef typename V_TypeTraits<_Tp>::w_type w_type;
899 for( int i = 0; i < (n/2); i++ )
901 c.s[i] = (w_type)a.s[i*2] + a.s[i*2+1];
906 //! @brief Helper macro
907 //! @ingroup core_hal_intrin_impl
//! Generates operator `shift_op`(a, imm): every lane of a is shifted by the
//! same run-time count imm and the result is cast back to _Tp. The count is
//! used as given; no range check on imm is visible here.
908 #define OPENCV_HAL_IMPL_SHIFT_OP(shift_op) \
909 template<typename _Tp, int n> inline v_reg<_Tp, n> operator shift_op(const v_reg<_Tp, n>& a, int imm) \
912 for( int i = 0; i < n; i++ ) \
913 c.s[i] = (_Tp)(a.s[i] shift_op imm); \
// Instantiations of OPENCV_HAL_IMPL_SHIFT_OP for the two shift directions.
917 /** @brief Bitwise shift left
919 For 16-, 32- and 64-bit integer values. */
920 OPENCV_HAL_IMPL_SHIFT_OP(<< )
922 /** @brief Bitwise shift right
924 For 16-, 32- and 64-bit integer values. */
925 OPENCV_HAL_IMPL_SHIFT_OP(>> )
// Generates two v_rotate_<suffix><imm>() templates that move whole lanes by a
// compile-time count imm.
//  - One-register form: lane i is taken from a.s[i opA imm] when that index
//    lies in [0, n). NOTE(review): the out-of-range branch is not visible
//    here; presumably those lanes are zero-filled. Confirm.
//  - Two-register form: lane i is taken from b when (i opA imm opB n) lies in
//    [0, n), otherwise from a when (i opA imm) lies in [0, n).
//    NOTE(review): the final fallback branch is not visible here. Confirm.
927 /** @brief Element shift left among vector
930 #define OPENCV_HAL_IMPL_ROTATE_SHIFT_OP(suffix,opA,opB) \
931 template<int imm, typename _Tp, int n> inline v_reg<_Tp, n> v_rotate_##suffix(const v_reg<_Tp, n>& a) \
934 for (int i = 0; i < n; i++) \
936 int sIndex = i opA imm; \
937 if (0 <= sIndex && sIndex < n) \
939 b.s[i] = a.s[sIndex]; \
948 template<int imm, typename _Tp, int n> inline v_reg<_Tp, n> v_rotate_##suffix(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
951 for (int i = 0; i < n; i++) \
953 int aIndex = i opA imm; \
954 int bIndex = i opA imm opB n; \
955 if (0 <= bIndex && bIndex < n) \
957 c.s[i] = b.s[bIndex]; \
959 else if (0 <= aIndex && aIndex < n) \
961 c.s[i] = a.s[aIndex]; \
// v_rotate_left reads source index i - imm (second-register index i - imm + n);
// v_rotate_right reads source index i + imm (second-register index i + imm - n).
971 OPENCV_HAL_IMPL_ROTATE_SHIFT_OP(left, -, +)
972 OPENCV_HAL_IMPL_ROTATE_SHIFT_OP(right, +, -)
974 /** @brief Sum packed values
978 {A1 A2 A3 ...} => sum{A1,A2,A3,...}
980 For 32-bit integer and 32-bit floating point types.*/
981 template<typename _Tp, int n> inline typename V_TypeTraits<_Tp>::sum_type v_reduce_sum(const v_reg<_Tp, n>& a)
// The accumulator has the traits' sum_type and is seeded with lane 0; the loop
// then visits lanes 1..n-1.
// NOTE(review): the accumulation statement is not visible here; presumably
// c += a.s[i]. Confirm.
983 typename V_TypeTraits<_Tp>::sum_type c = a.s[0];
984 for( int i = 1; i < n; i++ )
989 /** @brief Sums all elements of each input vector, returns the vector of sums
993 result[0] = a[0] + a[1] + a[2] + a[3]
994 result[1] = b[0] + b[1] + b[2] + b[3]
995 result[2] = c[0] + c[1] + c[2] + c[3]
996 result[3] = d[0] + d[1] + d[2] + d[3]
999 inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
1000 const v_float32x4& c, const v_float32x4& d)
// Each sum is evaluated left to right (lane 0 + lane 1, then + lane 2, then
// + lane 3), which fixes the floating-point rounding order.
1003 r.s[0] = a.s[0] + a.s[1] + a.s[2] + a.s[3];
1004 r.s[1] = b.s[0] + b.s[1] + b.s[2] + b.s[3];
1005 r.s[2] = c.s[0] + c.s[1] + c.s[2] + c.s[3];
1006 r.s[3] = d.s[0] + d.s[1] + d.s[2] + d.s[3];
1010 /** @brief Get negative values mask
1012 Returned value is a bit mask with bits set to 1 on places corresponding to negative packed values indexes.
1015 v_int32x4 r; // set to {-1, -1, 1, 1}
1016 int mask = v_signmask(r); // mask = 3 <== 00000000 00000000 00000000 00000011
1018 For all types except 64-bit. */
1019 template<typename _Tp, int n> inline int v_signmask(const v_reg<_Tp, n>& a)
1022 for( int i = 0; i < n; i++ )
// Bit i of the mask is set when lane i, viewed through reinterpret_int, is
// negative; lane 0 maps to the least significant bit.
1023 mask |= (V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) < 0) << i;
1027 /** @brief Check if all packed values are less than zero
1029 Unsigned values will be cast to signed: `uchar 254 => char -2`.
1030 For all types except 64-bit. */
1031 template<typename _Tp, int n> inline bool v_check_all(const v_reg<_Tp, n>& a)
1033 for( int i = 0; i < n; i++ )
// Tests each lane's reinterpret_int value for being non-negative.
// NOTE(review): the statements taken on this condition and after the loop are
// not visible here; presumably false on the first hit and true otherwise.
1034 if( V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) >= 0 )
1039 /** @brief Check if any of packed values is less than zero
1041 Unsigned values will be cast to signed: `uchar 254 => char -2`.
1042 For all types except 64-bit. */
1043 template<typename _Tp, int n> inline bool v_check_any(const v_reg<_Tp, n>& a)
1045 for( int i = 0; i < n; i++ )
// Tests each lane's reinterpret_int value for being negative.
// NOTE(review): the statements taken on this condition and after the loop are
// not visible here; presumably true on the first hit and false otherwise.
1046 if( V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) < 0 )
1051 /** @brief Per-element select (blend operation)
1053 Return value will be built by combining values _a_ and _b_ using the following scheme:
1054 result[i] = mask[i] ? a[i] : b[i];
1056 @note: _mask_ element values are restricted to these values:
1057 - 0: select element from _b_
1058 - 0xff/0xffff/etc: select element from _a_
1059 (fully compatible with bitwise-based operator)
1061 template<typename _Tp, int n> inline v_reg<_Tp, n> v_select(const v_reg<_Tp, n>& mask,
1062 const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
1064 typedef V_TypeTraits<_Tp> Traits;
1065 typedef typename Traits::int_type int_type;
1067 for( int i = 0; i < n; i++ )
// The mask lane is examined through its integer representation. The mask
// restriction is checked only by CV_DbgAssert; the selection itself treats
// any non-zero value as "take a".
1069 int_type m = Traits::reinterpret_int(mask.s[i]);
1070 CV_DbgAssert(m == 0 || m == (~(int_type)0)); // restrict mask values: 0 or 0xff/0xffff/etc
1071 c.s[i] = m ? a.s[i] : b.s[i];
1076 /** @brief Expand values to the wider pack type
1078 Copy contents of register to two registers with 2x wider pack type.
1081 int32x4 int64x2 int64x2
1082 {A B C D} ==> {A B} , {C D}
1084 template<typename _Tp, int n> inline void v_expand(const v_reg<_Tp, n>& a,
1085 v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& b0,
1086 v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& b1)
1088 for( int i = 0; i < (n/2); i++ )
// b1 receives the upper n/2 lanes of a, converted to w_type by assignment.
// NOTE(review): the matching statement for b0 is not visible here; presumably
// b0.s[i] = a.s[i]. Confirm.
1091 b1.s[i] = a.s[i+(n/2)];
// Converts every lane with V_TypeTraits<_Tp>::reinterpret_int and returns a
// register of the traits' int_type with the same lane count n.
1096 template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::int_type, n>
1097 v_reinterpret_as_int(const v_reg<_Tp, n>& a)
1099 v_reg<typename V_TypeTraits<_Tp>::int_type, n> c;
1100 for( int i = 0; i < n; i++ )
1101 c.s[i] = V_TypeTraits<_Tp>::reinterpret_int(a.s[i]);
// Converts every lane with V_TypeTraits<_Tp>::reinterpret_uint and returns a
// register of the traits' uint_type with the same lane count n.
1105 template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::uint_type, n>
1106 v_reinterpret_as_uint(const v_reg<_Tp, n>& a)
1108 v_reg<typename V_TypeTraits<_Tp>::uint_type, n> c;
1109 for( int i = 0; i < n; i++ )
1110 c.s[i] = V_TypeTraits<_Tp>::reinterpret_uint(a.s[i]);
1115 /** @brief Interleave two vectors
1122 {A1 B1 A2 B2} and {A3 B3 A4 B4}
1124 For all types except 64-bit.
1126 template<typename _Tp, int n> inline void v_zip( const v_reg<_Tp, n>& a0, const v_reg<_Tp, n>& a1,
1127 v_reg<_Tp, n>& b0, v_reg<_Tp, n>& b1 )
// First pass: lanes 0..n/2-1 of a0 and a1 are interleaved into b0.
1130 for( i = 0; i < n/2; i++ )
1132 b0.s[i*2] = a0.s[i];
1133 b0.s[i*2+1] = a1.s[i];
// Second pass: the index i*2-n maps source lane i to b1 positions starting at
// 0 when i == n/2.
// NOTE(review): the loop header for this pass is not visible here; presumably
// i continues from n/2 up to n. Confirm.
1137 b1.s[i*2-n] = a0.s[i];
1138 b1.s[i*2-n+1] = a1.s[i];
1142 /** @brief Load register contents from memory
1144 @param ptr pointer to memory block with data
1145 @return register object
1147 @note Returned type will be detected from passed pointer type, for example uchar ==> cv::v_uint8x16, int ==> cv::v_int32x4, etc.
1149 template<typename _Tp>
1150 inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_load(const _Tp* ptr)
// Delegates to the v_reg pointer constructor with the lane count taken from
// V_TypeTraits<_Tp>::nlanes128.
1152 return v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128>(ptr);
1155 /** @brief Load register contents from memory (aligned)
1157 similar to cv::v_load, but source memory block should be aligned (to 16-byte boundary)
1159 template<typename _Tp>
1160 inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_load_aligned(const _Tp* ptr)
// Same expression as v_load: this implementation neither checks nor relies on
// the alignment of ptr.
1162 return v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128>(ptr);
1165 /** @brief Load 64-bits of data to lower part (high part is undefined).
1167 @param ptr memory block containing data for first half (0..n/2)
1170 int lo[2] = { 1, 2 };
1171 v_int32x4 r = v_load_low(lo);
1174 template<typename _Tp>
1175 inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_load_low(const _Tp* ptr)
1177 v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> c;
// Only the first nlanes/2 lanes are visited by this loop.
// NOTE(review): the loop body is not visible here; presumably c.s[i] = ptr[i].
1178 for( int i = 0; i < c.nlanes/2; i++ )
1185 /** @brief Load register contents from two memory blocks
1187 @param loptr memory block containing data for first half (0..n/2)
1188 @param hiptr memory block containing data for second half (n/2..n)
1191 int lo[2] = { 1, 2 }, hi[2] = { 3, 4 };
1192 v_int32x4 r = v_load_halves(lo, hi);
1195 template<typename _Tp>
1196 inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_load_halves(const _Tp* loptr, const _Tp* hiptr)
1198 v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> c;
1199 for( int i = 0; i < c.nlanes/2; i++ )
// Upper half: element i of hiptr goes to lane i + nlanes/2.
// NOTE(review): the lower-half statement is not visible here; presumably
// c.s[i] = loptr[i]. Confirm.
1202 c.s[i+c.nlanes/2] = hiptr[i];
1207 /** @brief Load register contents from memory with double expand
1209 Same as cv::v_load, but result pack type will be 2x wider than memory type.
1212 short buf[4] = {1, 2, 3, 4}; // type is int16
1213 v_int32x4 r = v_load_expand(buf); // r = {1, 2, 3, 4} - type is int32
1215 For 8-, 16-, 32-bit integer source types. */
// The result holds lanes of the widened type V_TypeTraits<_Tp>::w_type; the loop visits every lane of c.
// NOTE(review): the declared return type uses nlanes128 of _Tp divided by 2 while c uses nlanes128 of
// w_type - these must be the same lane count for the return to type-check; confirm in V_TypeTraits.
1216 template<typename _Tp>
1217 inline v_reg<typename V_TypeTraits<_Tp>::w_type, V_TypeTraits<_Tp>::nlanes128 / 2>
1218 v_load_expand(const _Tp* ptr)
1220 typedef typename V_TypeTraits<_Tp>::w_type w_type;
1221 v_reg<w_type, V_TypeTraits<w_type>::nlanes128> c;
1222 for( int i = 0; i < c.nlanes; i++ )
1229 /** @brief Load register contents from memory with quad expand
1231 Same as cv::v_load_expand, but result type is 4 times wider than source.
1233 char buf[4] = {1, 2, 3, 4}; // type is int8
1234 v_int32x4 r = v_load_expand_q(buf); // r = {1, 2, 3, 4} - type is int32
1236 For 8-bit integer source types. */
// The result holds lanes of V_TypeTraits<_Tp>::q_type; the loop visits every lane of c.
// NOTE(review): the declared return type uses nlanes128 of _Tp divided by 4 while c uses nlanes128 of
// q_type - these must be the same lane count for the return to type-check; confirm in V_TypeTraits.
1237 template<typename _Tp>
1238 inline v_reg<typename V_TypeTraits<_Tp>::q_type, V_TypeTraits<_Tp>::nlanes128 / 4>
1239 v_load_expand_q(const _Tp* ptr)
1241 typedef typename V_TypeTraits<_Tp>::q_type q_type;
1242 v_reg<q_type, V_TypeTraits<q_type>::nlanes128> c;
1243 for( int i = 0; i < c.nlanes; i++ )
1250 /** @brief Load and deinterleave (2 channels)
1252 Load data from memory, deinterleave it and store it to 2 registers.
1255 {A1 B1 A2 B2 ...} ==> {A1 A2 ...}, {B1 B2 ...}
1257 For all types except 64-bit. */
// 2-channel variant: i counts output lanes (n iterations) while i2 advances by 2 per iteration.
1258 template<typename _Tp, int n> inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a,
1262 for( i = i2 = 0; i < n; i++, i2 += 2 )
1269 /** @brief Load and deinterleave (3 channels)
1271 Load data from memory, deinterleave it and store it to 3 registers.
1274 {A1 B1 C1 A2 B2 C2 ...} ==> {A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...}
1276 For all types except 64-bit. */
// 3-channel variant: i counts output lanes (n iterations) while i3 advances by 3 per iteration.
1277 template<typename _Tp, int n> inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a,
1278 v_reg<_Tp, n>& b, v_reg<_Tp, n>& c)
1281 for( i = i3 = 0; i < n; i++, i3 += 3 )
1289 /** @brief Load and deinterleave (4 channels)
1291 Load data from memory, deinterleave it and store it to 4 registers.
1294 {A1 B1 C1 D1 A2 B2 C2 D2 ...} ==> {A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...}, {D1 D2 ...}
1296 For all types except 64-bit. */
// 4-channel variant: i counts output lanes (n iterations) while i4 advances by 4 per iteration.
1297 template<typename _Tp, int n>
1298 inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a,
1299 v_reg<_Tp, n>& b, v_reg<_Tp, n>& c,
1303 for( i = i4 = 0; i < n; i++, i4 += 4 )
1312 /** @brief Interleave and store (2 channels)
1314 Interleave and store data from 2 registers to memory.
1317 {A1 A2 ...}, {B1 B2 ...} ==> {A1 B1 A2 B2 ...}
1319 For all types except 64-bit. */
// 2-channel variant: i counts register lanes (n iterations) while i2 advances by 2 per iteration.
// The trailing hal::StoreMode parameter is unnamed (default hal::STORE_UNALIGNED) and therefore unused here.
1320 template<typename _Tp, int n>
1321 inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a,
1322 const v_reg<_Tp, n>& b,
1323 hal::StoreMode /*mode*/=hal::STORE_UNALIGNED)
1326 for( i = i2 = 0; i < n; i++, i2 += 2 )
1333 /** @brief Interleave and store (3 channels)
1335 Interleave and store data from 3 registers to memory.
1338 {A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...} ==> {A1 B1 C1 A2 B2 C2 ...}
1340 For all types except 64-bit. */
// 3-channel variant: i counts register lanes (n iterations) while i3 advances by 3 per iteration.
// The trailing hal::StoreMode parameter is unnamed (default hal::STORE_UNALIGNED) and therefore unused here.
1341 template<typename _Tp, int n>
1342 inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a,
1343 const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c,
1344 hal::StoreMode /*mode*/=hal::STORE_UNALIGNED)
1347 for( i = i3 = 0; i < n; i++, i3 += 3 )
1355 /** @brief Interleave and store (4 channels)
1357 Interleave and store data from 4 registers to memory.
1360 {A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...}, {D1 D2 ...} ==> {A1 B1 C1 D1 A2 B2 C2 D2 ...}
1362 For all types except 64-bit. */
// 4-channel variant: i counts register lanes (n iterations) while i4 advances by 4 per iteration.
// The trailing hal::StoreMode parameter is unnamed (default hal::STORE_UNALIGNED) and therefore unused here.
1363 template<typename _Tp, int n> inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a,
1364 const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c,
1365 const v_reg<_Tp, n>& d,
1366 hal::StoreMode /*mode*/=hal::STORE_UNALIGNED)
1369 for( i = i4 = 0; i < n; i++, i4 += 4 )
1378 /** @brief Store data to memory
1380 Store register contents to memory.
1383 REG {A B C D} ==> MEM {A B C D}
1385 Pointer can be unaligned. */
// Iterates over all n lanes of a, one loop step per lane.
1386 template<typename _Tp, int n>
1387 inline void v_store(_Tp* ptr, const v_reg<_Tp, n>& a)
1389 for( int i = 0; i < n; i++ )
1393 /** @brief Store data to memory (lower half)
1395 Store lower half of register contents to memory.
1398 REG {A B C D} ==> MEM {A B}
// Iterates over n/2 positions only, i.e. half a register's worth of elements.
1400 template<typename _Tp, int n>
1401 inline void v_store_low(_Tp* ptr, const v_reg<_Tp, n>& a)
1403 for( int i = 0; i < (n/2); i++ )
1407 /** @brief Store data to memory (higher half)
1409 Store higher half of register contents to memory.
1412 REG {A B C D} ==> MEM {C D}
// Copies the upper half of a (lanes n/2 .. n-1) to ptr[0 .. n/2-1].
1414 template<typename _Tp, int n>
1415 inline void v_store_high(_Tp* ptr, const v_reg<_Tp, n>& a)
1417 for( int i = 0; i < (n/2); i++ )
1418 ptr[i] = a.s[i+(n/2)];
1421 /** @brief Store data to memory (aligned)
1423 Store register contents to memory.
1426 REG {A B C D} ==> MEM {A B C D}
1428 Pointer __should__ be aligned to a 16-byte boundary. */
// Iterates over all n lanes of a, one loop step per lane.
1429 template<typename _Tp, int n>
1430 inline void v_store_aligned(_Tp* ptr, const v_reg<_Tp, n>& a)
1432 for( int i = 0; i < n; i++ )
// Same signature shape and lane loop as v_store_aligned: iterates over all n lanes of a.
1436 template<typename _Tp, int n>
1437 inline void v_store_aligned_nocache(_Tp* ptr, const v_reg<_Tp, n>& a)
1439 for( int i = 0; i < n; i++ )
// Overload taking a hal::StoreMode; the parameter is unnamed, so it is not used by this implementation.
// Iterates over all n lanes of a.
1443 template<typename _Tp, int n>
1444 inline void v_store_aligned(_Tp* ptr, const v_reg<_Tp, n>& a, hal::StoreMode /*mode*/)
1446 for( int i = 0; i < n; i++ )
1450 /** @brief Combine vector from first elements of two vectors
1459 For all types except 64-bit. */
// Builds a register from halves of a and b; the loop runs over n/2 positions.
1460 template<typename _Tp, int n>
1461 inline v_reg<_Tp, n> v_combine_low(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
1464 for( int i = 0; i < (n/2); i++ )
// Upper half of the result takes the first n/2 lanes of b.
1467 c.s[i+(n/2)] = b.s[i];
1472 /** @brief Combine vector from last elements of two vectors
1481 For all types except 64-bit. */
// Result = { upper half of a, upper half of b }.
1482 template<typename _Tp, int n>
1483 inline v_reg<_Tp, n> v_combine_high(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
1486 for( int i = 0; i < (n/2); i++ )
1488 c.s[i] = a.s[i+(n/2)];
1489 c.s[i+(n/2)] = b.s[i+(n/2)];
1494 /** @brief Combine two vectors from lower and higher parts of two other vectors
1497 low = cv::v_combine_low(a, b);
1498 high = cv::v_combine_high(a, b);
// Produces both combinations in one pass over n/2 positions.
// The outputs are written lane by lane while a and b are still being read, so low/high
// should not refer to the same registers as a/b.
1500 template<typename _Tp, int n>
1501 inline void v_recombine(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
1502 v_reg<_Tp, n>& low, v_reg<_Tp, n>& high)
1504 for( int i = 0; i < (n/2); i++ )
// low's upper half <- lower half of b; high <- { upper half of a, upper half of b }.
1507 low.s[i+(n/2)] = b.s[i];
1508 high.s[i] = a.s[i+(n/2)];
1509 high.s[i+(n/2)] = b.s[i+(n/2)];
1513 /** @brief Vector extract
1519 ========================
1520 shift = 1 {A2 A3 A4 B1}
1521 shift = 2 {A3 A4 B1 B2}
1522 shift = 3 {A4 B1 B2 B3}
1524 Restriction: 0 <= shift < nlanes
1529 c = v_extract<2>(a, b);
// s is a compile-time lane offset; shift = n - s is the boundary index in the result.
1532 template<int s, typename _Tp, int n>
1533 inline v_reg<_Tp, n> v_extract(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
1536 const int shift = n - s;
// Result lanes [0, shift) are handled by this loop.
1538 for (; i < shift; ++i)
// Result lane i >= shift is b.s[i - shift], i.e. b's lanes counted from lane 0.
1541 r.s[i] = b.s[i-shift];
1547 Rounds each value. Input type is float vector ==> output type is int vector.*/
// Applies cvRound to each of the n float lanes; result lane i corresponds to input lane i.
1548 template<int n> inline v_reg<int, n> v_round(const v_reg<float, n>& a)
1551 for( int i = 0; i < n; i++ )
1552 c.s[i] = cvRound(a.s[i]);
1558 Floor each value. Input type is float vector ==> output type is int vector.*/
// Applies cvFloor to each of the n float lanes; result lane i corresponds to input lane i.
1559 template<int n> inline v_reg<int, n> v_floor(const v_reg<float, n>& a)
1562 for( int i = 0; i < n; i++ )
1563 c.s[i] = cvFloor(a.s[i]);
1569 Ceil each value. Input type is float vector ==> output type is int vector.*/
// Applies cvCeil to each of the n float lanes; result lane i corresponds to input lane i.
1570 template<int n> inline v_reg<int, n> v_ceil(const v_reg<float, n>& a)
1573 for( int i = 0; i < n; i++ )
1574 c.s[i] = cvCeil(a.s[i]);
1580 Truncate each value. Input type is float vector ==> output type is int vector.*/
// Converts each float lane with a plain (int) cast, which discards the fractional part (truncation toward zero).
1581 template<int n> inline v_reg<int, n> v_trunc(const v_reg<float, n>& a)
1584 for( int i = 0; i < n; i++ )
1585 c.s[i] = (int)(a.s[i]);
// Overload for double lanes. The result has n*2 int lanes; this loop converts the n input
// lanes with cvRound into result lanes 0..n-1.
1590 template<int n> inline v_reg<int, n*2> v_round(const v_reg<double, n>& a)
1593 for( int i = 0; i < n; i++ )
1595 c.s[i] = cvRound(a.s[i]);
// Overload for double lanes. The result has n*2 int lanes; this loop converts the n input
// lanes with cvFloor into result lanes 0..n-1.
1602 template<int n> inline v_reg<int, n*2> v_floor(const v_reg<double, n>& a)
1605 for( int i = 0; i < n; i++ )
1607 c.s[i] = cvFloor(a.s[i]);
// Overload for double lanes. The result has n*2 int lanes; this loop converts the n input
// lanes with cvCeil into result lanes 0..n-1.
1614 template<int n> inline v_reg<int, n*2> v_ceil(const v_reg<double, n>& a)
1617 for( int i = 0; i < n; i++ )
1619 c.s[i] = cvCeil(a.s[i]);
// Overload for double lanes. The result has n*2 int lanes; this loop truncates the n input
// lanes toward zero into result lanes 0..n-1, using the same (int) cast as the float overload.
// Truncation must not go through cvCeil: that rounds up, turning e.g. 1.5 into 2 instead of 1.
1626 template<int n> inline v_reg<int, n*2> v_trunc(const v_reg<double, n>& a)
1629 for( int i = 0; i < n; i++ )
1631 c.s[i] = (int)(a.s[i]);
1637 /** @brief Convert to float
1639 Supported input type is cv::v_int32x4. */
// Converts each of the n int lanes to float with a plain cast.
1640 template<int n> inline v_reg<float, n> v_cvt_f32(const v_reg<int, n>& a)
1643 for( int i = 0; i < n; i++ )
1644 c.s[i] = (float)a.s[i];
// Narrows two double registers into one float register of n*2 lanes:
// a fills lanes 0..n-1 and b fills lanes n..2n-1.
1648 template<int n> inline v_reg<float, n*2> v_cvt_f32(const v_reg<double, n>& a, const v_reg<double, n>& b)
1650 v_reg<float, n*2> c;
1651 for( int i = 0; i < n; i++ )
1653 c.s[i] = (float)a.s[i];
1654 c.s[i+n] = (float)b.s[i];
1659 /** @brief Convert to double
1661 Supported input type is cv::v_int32x4. */
// Converts the first n of the input's n*2 int lanes to double; the upper n input lanes are not read.
// NOTE(review): n appears only inside the expression n*2 in the parameter type, which is a
// non-deduced context - callers would have to spell n explicitly (e.g. v_cvt_f64<2>(a)). Confirm intended.
1662 template<int n> inline v_reg<double, n> v_cvt_f64(const v_reg<int, n*2>& a)
1665 for( int i = 0; i < n; i++ )
1666 c.s[i] = (double)a.s[i];
1670 /** @brief Convert to double
1672 Supported input type is cv::v_float32x4. */
// Converts the first n of the input's n*2 float lanes to double; the upper n input lanes are not read.
// NOTE(review): n appears only inside the expression n*2 in the parameter type, which is a
// non-deduced context - callers would have to spell n explicitly (e.g. v_cvt_f64<2>(a)). Confirm intended.
1673 template<int n> inline v_reg<double, n> v_cvt_f64(const v_reg<float, n*2>& a)
1676 for( int i = 0; i < n; i++ )
1677 c.s[i] = (double)a.s[i];
// Table gather: result lane i is tab[idx.s[i]]. Indices are used as-is; no bounds check is done here.
1681 template<int n> inline v_reg<int, n> v_lut(const int* tab, const v_reg<int, n>& idx)
1684 for( int i = 0; i < n; i++ )
1685 c.s[i] = tab[idx.s[i]];
// Table gather for float tables: result lane i is tab[idx.s[i]]. No bounds check is done here.
1689 template<int n> inline v_reg<float, n> v_lut(const float* tab, const v_reg<int, n>& idx)
1692 for( int i = 0; i < n; i++ )
1693 c.s[i] = tab[idx.s[i]];
// Table gather for double tables: only the first n of idx's n*2 lanes are used as indices.
// NOTE(review): n appears only inside the expression n*2 in the parameter type, which is a
// non-deduced context - callers would have to spell n explicitly. Confirm intended.
1697 template<int n> inline v_reg<double, n> v_lut(const double* tab, const v_reg<int, n*2>& idx)
1700 for( int i = 0; i < n; i++ )
1701 c.s[i] = tab[idx.s[i]];
// Iterates over all n lanes of idx; x and y are the outputs (taken by non-const reference).
1705 template<int n> inline void v_lut_deinterleave(const float* tab, const v_reg<int, n>& idx,
1706 v_reg<float, n>& x, v_reg<float, n>& y)
1708 for( int i = 0; i < n; i++ )
// Double variant: idx carries n*2 lanes but the loop runs n times, once per lane of x and y.
// x and y are the outputs (taken by non-const reference).
1716 template<int n> inline void v_lut_deinterleave(const double* tab, const v_reg<int, n*2>& idx,
1717 v_reg<double, n>& x, v_reg<double, n>& y)
1719 for( int i = 0; i < n; i++ )
1727 /** @brief Transpose 4x4 matrix
// Transposes the 4x4 matrix whose rows are a0..a3: output bk collects lane k of every input row.
// All four inputs are only read, so a0 is taken by const reference like a1..a3; this lets const
// and temporary registers be passed as the first row as well.
// Outputs are assigned one after another while the inputs are still being read, so b0..b3
// should not refer to the same registers as a0..a3.
1742 template<typename _Tp>
1743 inline void v_transpose4x4( const v_reg<_Tp, 4>& a0, const v_reg<_Tp, 4>& a1,
1744 const v_reg<_Tp, 4>& a2, const v_reg<_Tp, 4>& a3,
1745 v_reg<_Tp, 4>& b0, v_reg<_Tp, 4>& b1,
1746 v_reg<_Tp, 4>& b2, v_reg<_Tp, 4>& b3 )
1748 b0 = v_reg<_Tp, 4>(a0.s[0], a1.s[0], a2.s[0], a3.s[0]);
1749 b1 = v_reg<_Tp, 4>(a0.s[1], a1.s[1], a2.s[1], a3.s[1]);
1750 b2 = v_reg<_Tp, 4>(a0.s[2], a1.s[2], a2.s[2], a3.s[2]);
1751 b3 = v_reg<_Tp, 4>(a0.s[3], a1.s[3], a2.s[3], a3.s[3]);
1754 //! @brief Helper macro: defines v_setzero_##suffix(), which returns _Tpvec::zero()
1755 //! @ingroup core_hal_intrin_impl
// The _Tp argument is accepted but not referenced by the expansion.
1756 #define OPENCV_HAL_IMPL_C_INIT_ZERO(_Tpvec, _Tp, suffix) \
1757 inline _Tpvec v_setzero_##suffix() { return _Tpvec::zero(); }
1759 //! @name Init with zero
1761 //! @brief Create new vector with zero elements
1762 OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint8x16, uchar, u8)
1763 OPENCV_HAL_IMPL_C_INIT_ZERO(v_int8x16, schar, s8)
1764 OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint16x8, ushort, u16)
1765 OPENCV_HAL_IMPL_C_INIT_ZERO(v_int16x8, short, s16)
1766 OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint32x4, unsigned, u32)
1767 OPENCV_HAL_IMPL_C_INIT_ZERO(v_int32x4, int, s32)
1768 OPENCV_HAL_IMPL_C_INIT_ZERO(v_float32x4, float, f32)
1769 OPENCV_HAL_IMPL_C_INIT_ZERO(v_float64x2, double, f64)
1770 OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint64x2, uint64, u64)
1771 OPENCV_HAL_IMPL_C_INIT_ZERO(v_int64x2, int64, s64)
1774 //! @brief Helper macro: defines v_setall_##suffix(_Tp val), which returns _Tpvec::all(val)
1775 //! @ingroup core_hal_intrin_impl
1776 #define OPENCV_HAL_IMPL_C_INIT_VAL(_Tpvec, _Tp, suffix) \
1777 inline _Tpvec v_setall_##suffix(_Tp val) { return _Tpvec::all(val); }
1779 //! @name Init with value
1781 //! @brief Create new vector with elements set to a specific value
1782 OPENCV_HAL_IMPL_C_INIT_VAL(v_uint8x16, uchar, u8)
1783 OPENCV_HAL_IMPL_C_INIT_VAL(v_int8x16, schar, s8)
1784 OPENCV_HAL_IMPL_C_INIT_VAL(v_uint16x8, ushort, u16)
1785 OPENCV_HAL_IMPL_C_INIT_VAL(v_int16x8, short, s16)
1786 OPENCV_HAL_IMPL_C_INIT_VAL(v_uint32x4, unsigned, u32)
1787 OPENCV_HAL_IMPL_C_INIT_VAL(v_int32x4, int, s32)
1788 OPENCV_HAL_IMPL_C_INIT_VAL(v_float32x4, float, f32)
1789 OPENCV_HAL_IMPL_C_INIT_VAL(v_float64x2, double, f64)
1790 OPENCV_HAL_IMPL_C_INIT_VAL(v_uint64x2, uint64, u64)
1791 OPENCV_HAL_IMPL_C_INIT_VAL(v_int64x2, int64, s64)
1794 //! @brief Helper macro: defines v_reinterpret_as_##suffix(a) for any source v_reg<_Tp0, n0>
1795 //! @ingroup core_hal_intrin_impl
// The generated function forwards to a.reinterpret_as<_Tp, _Tpvec::nlanes>(); the "template"
// keyword is required because reinterpret_as is a member template of a dependent type.
1796 #define OPENCV_HAL_IMPL_C_REINTERPRET(_Tpvec, _Tp, suffix) \
1797 template<typename _Tp0, int n0> inline _Tpvec \
1798 v_reinterpret_as_##suffix(const v_reg<_Tp0, n0>& a) \
1799 { return a.template reinterpret_as<_Tp, _Tpvec::nlanes>(); }
1801 //! @name Reinterpret
1803 //! @brief Convert vector to different type without modifying underlying data.
1804 OPENCV_HAL_IMPL_C_REINTERPRET(v_uint8x16, uchar, u8)
1805 OPENCV_HAL_IMPL_C_REINTERPRET(v_int8x16, schar, s8)
1806 OPENCV_HAL_IMPL_C_REINTERPRET(v_uint16x8, ushort, u16)
1807 OPENCV_HAL_IMPL_C_REINTERPRET(v_int16x8, short, s16)
1808 OPENCV_HAL_IMPL_C_REINTERPRET(v_uint32x4, unsigned, u32)
1809 OPENCV_HAL_IMPL_C_REINTERPRET(v_int32x4, int, s32)
1810 OPENCV_HAL_IMPL_C_REINTERPRET(v_float32x4, float, f32)
1811 OPENCV_HAL_IMPL_C_REINTERPRET(v_float64x2, double, f64)
1812 OPENCV_HAL_IMPL_C_REINTERPRET(v_uint64x2, uint64, u64)
1813 OPENCV_HAL_IMPL_C_REINTERPRET(v_int64x2, int64, s64)
1816 //! @brief Helper macro: defines template<int n> v_shl(const _Tpvec&) returning _Tpvec
1817 //! @ingroup core_hal_intrin_impl
// n is a compile-time template argument. Instantiated below for the 16-, 32- and 64-bit integer vectors.
1818 #define OPENCV_HAL_IMPL_C_SHIFTL(_Tpvec, _Tp) \
1819 template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
1822 //! @name Left shift
1824 //! @brief Shift left
1825 OPENCV_HAL_IMPL_C_SHIFTL(v_uint16x8, ushort)
1826 OPENCV_HAL_IMPL_C_SHIFTL(v_int16x8, short)
1827 OPENCV_HAL_IMPL_C_SHIFTL(v_uint32x4, unsigned)
1828 OPENCV_HAL_IMPL_C_SHIFTL(v_int32x4, int)
1829 OPENCV_HAL_IMPL_C_SHIFTL(v_uint64x2, uint64)
1830 OPENCV_HAL_IMPL_C_SHIFTL(v_int64x2, int64)
1833 //! @brief Helper macro: defines template<int n> v_shr(const _Tpvec&) returning _Tpvec
1834 //! @ingroup core_hal_intrin_impl
// n is a compile-time template argument. Instantiated below for the 16-, 32- and 64-bit integer vectors.
1835 #define OPENCV_HAL_IMPL_C_SHIFTR(_Tpvec, _Tp) \
1836 template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
1839 //! @name Right shift
1841 //! @brief Shift right
1842 OPENCV_HAL_IMPL_C_SHIFTR(v_uint16x8, ushort)
1843 OPENCV_HAL_IMPL_C_SHIFTR(v_int16x8, short)
1844 OPENCV_HAL_IMPL_C_SHIFTR(v_uint32x4, unsigned)
1845 OPENCV_HAL_IMPL_C_SHIFTR(v_int32x4, int)
1846 OPENCV_HAL_IMPL_C_SHIFTR(v_uint64x2, uint64)
1847 OPENCV_HAL_IMPL_C_SHIFTR(v_int64x2, int64)
1850 //! @brief Helper macro: defines template<int n> v_rshr(const _Tpvec&), a right shift by n with rounding
1851 //! @ingroup core_hal_intrin_impl
// Per lane: adds the rounding term (_Tp)1 << (n - 1), i.e. half of 2^n, then shifts right by n
// and casts back to _Tp. n must be at least 1 because the rounding term shifts by n - 1.
1852 #define OPENCV_HAL_IMPL_C_RSHIFTR(_Tpvec, _Tp) \
1853 template<int n> inline _Tpvec v_rshr(const _Tpvec& a) \
1856 for( int i = 0; i < _Tpvec::nlanes; i++ ) \
1857 c.s[i] = (_Tp)((a.s[i] + ((_Tp)1 << (n - 1))) >> n); \
1861 //! @name Rounding shift
1863 //! @brief Rounding shift right
1864 OPENCV_HAL_IMPL_C_RSHIFTR(v_uint16x8, ushort)
1865 OPENCV_HAL_IMPL_C_RSHIFTR(v_int16x8, short)
1866 OPENCV_HAL_IMPL_C_RSHIFTR(v_uint32x4, unsigned)
1867 OPENCV_HAL_IMPL_C_RSHIFTR(v_int32x4, int)
1868 OPENCV_HAL_IMPL_C_RSHIFTR(v_uint64x2, uint64)
1869 OPENCV_HAL_IMPL_C_RSHIFTR(v_int64x2, int64)
1872 //! @brief Helper macro: defines v_##pack_suffix(a, b), narrowing two _Tpvec registers into one _Tpnvec
1873 //! @ingroup core_hal_intrin_impl
// Lanes of a fill result lanes [0, _Tpvec::nlanes) and lanes of b fill the following _Tpvec::nlanes
// lanes; each element is converted with cast<_Tpn> (saturate_cast or static_cast, chosen per instantiation).
1874 #define OPENCV_HAL_IMPL_C_PACK(_Tpvec, _Tpnvec, _Tpn, pack_suffix, cast) \
1875 inline _Tpnvec v_##pack_suffix(const _Tpvec& a, const _Tpvec& b) \
1878 for( int i = 0; i < _Tpvec::nlanes; i++ ) \
1880 c.s[i] = cast<_Tpn>(a.s[i]); \
1881 c.s[i+_Tpvec::nlanes] = cast<_Tpn>(b.s[i]); \
1888 //! @brief Pack values from two vectors to one
1890 //! The returned vector type has twice as many elements as the input vector type. Variant with _u_ suffix also
1891 //! converts to corresponding unsigned type.
1893 //! - pack: for 16-, 32- and 64-bit integer input types
1894 //! - pack_u: for 16- and 32-bit signed integer input types
1896 //! @note All variants except 64-bit use saturation.
1897 OPENCV_HAL_IMPL_C_PACK(v_uint16x8, v_uint8x16, uchar, pack, saturate_cast)
1898 OPENCV_HAL_IMPL_C_PACK(v_int16x8, v_int8x16, schar, pack, saturate_cast)
1899 OPENCV_HAL_IMPL_C_PACK(v_uint32x4, v_uint16x8, ushort, pack, saturate_cast)
1900 OPENCV_HAL_IMPL_C_PACK(v_int32x4, v_int16x8, short, pack, saturate_cast)
1901 OPENCV_HAL_IMPL_C_PACK(v_uint64x2, v_uint32x4, unsigned, pack, static_cast)
1902 OPENCV_HAL_IMPL_C_PACK(v_int64x2, v_int32x4, int, pack, static_cast)
1903 OPENCV_HAL_IMPL_C_PACK(v_int16x8, v_uint8x16, uchar, pack_u, saturate_cast)
1904 OPENCV_HAL_IMPL_C_PACK(v_int32x4, v_uint16x8, ushort, pack_u, saturate_cast)
1907 //! @brief Helper macro: defines template<int n> v_rshr_##pack_suffix(a, b), a rounding right shift followed by narrowing
1908 //! @ingroup core_hal_intrin_impl
// Per lane: adds (_Tp)1 << (n - 1), shifts right by n, then converts with cast<_Tpn>.
// Lanes of a fill result lanes [0, _Tpvec::nlanes); lanes of b fill the following _Tpvec::nlanes lanes.
// n must be at least 1 because the rounding term shifts by n - 1.
1909 #define OPENCV_HAL_IMPL_C_RSHR_PACK(_Tpvec, _Tp, _Tpnvec, _Tpn, pack_suffix, cast) \
1910 template<int n> inline _Tpnvec v_rshr_##pack_suffix(const _Tpvec& a, const _Tpvec& b) \
1913 for( int i = 0; i < _Tpvec::nlanes; i++ ) \
1915 c.s[i] = cast<_Tpn>((a.s[i] + ((_Tp)1 << (n - 1))) >> n); \
1916 c.s[i+_Tpvec::nlanes] = cast<_Tpn>((b.s[i] + ((_Tp)1 << (n - 1))) >> n); \
1921 //! @name Pack with rounding shift
1923 //! @brief Pack values from two vectors to one with rounding shift
1925 //! Values from the input vectors will be shifted right by _n_ bits with rounding, converted to narrower
1926 //! type and returned in the result vector. Variant with _u_ suffix converts to unsigned type.
1928 //! - pack: for 16-, 32- and 64-bit integer input types
1929 //! - pack_u: for 16- and 32-bit signed integer input types
1931 //! @note All variants except 64-bit use saturation.
1932 OPENCV_HAL_IMPL_C_RSHR_PACK(v_uint16x8, ushort, v_uint8x16, uchar, pack, saturate_cast)
1933 OPENCV_HAL_IMPL_C_RSHR_PACK(v_int16x8, short, v_int8x16, schar, pack, saturate_cast)
1934 OPENCV_HAL_IMPL_C_RSHR_PACK(v_uint32x4, unsigned, v_uint16x8, ushort, pack, saturate_cast)
1935 OPENCV_HAL_IMPL_C_RSHR_PACK(v_int32x4, int, v_int16x8, short, pack, saturate_cast)
1936 OPENCV_HAL_IMPL_C_RSHR_PACK(v_uint64x2, uint64, v_uint32x4, unsigned, pack, static_cast)
1937 OPENCV_HAL_IMPL_C_RSHR_PACK(v_int64x2, int64, v_int32x4, int, pack, static_cast)
1938 OPENCV_HAL_IMPL_C_RSHR_PACK(v_int16x8, short, v_uint8x16, uchar, pack_u, saturate_cast)
1939 OPENCV_HAL_IMPL_C_RSHR_PACK(v_int32x4, int, v_uint16x8, ushort, pack_u, saturate_cast)
1942 //! @brief Helper macro: defines v_##pack_suffix##_store(ptr, a), narrowing one _Tpvec register into memory
1943 //! @ingroup core_hal_intrin_impl
// Writes _Tpvec::nlanes elements to ptr, converting each lane with cast<_Tpn>.
// The _Tp and _Tpnvec arguments are not referenced by the visible part of the expansion.
1944 #define OPENCV_HAL_IMPL_C_PACK_STORE(_Tpvec, _Tp, _Tpnvec, _Tpn, pack_suffix, cast) \
1945 inline void v_##pack_suffix##_store(_Tpn* ptr, const _Tpvec& a) \
1947 for( int i = 0; i < _Tpvec::nlanes; i++ ) \
1948 ptr[i] = cast<_Tpn>(a.s[i]); \
1951 //! @name Pack and store
1953 //! @brief Store values from the input vector into memory with pack
1955 //! Values will be stored into memory with conversion to narrower type.
1956 //! Variant with _u_ suffix converts to corresponding unsigned type.
1958 //! - pack: for 16-, 32- and 64-bit integer input types
1959 //! - pack_u: for 16- and 32-bit signed integer input types
1961 //! @note All variants except 64-bit use saturation.
1962 OPENCV_HAL_IMPL_C_PACK_STORE(v_uint16x8, ushort, v_uint8x16, uchar, pack, saturate_cast)
1963 OPENCV_HAL_IMPL_C_PACK_STORE(v_int16x8, short, v_int8x16, schar, pack, saturate_cast)
1964 OPENCV_HAL_IMPL_C_PACK_STORE(v_uint32x4, unsigned, v_uint16x8, ushort, pack, saturate_cast)
1965 OPENCV_HAL_IMPL_C_PACK_STORE(v_int32x4, int, v_int16x8, short, pack, saturate_cast)
1966 OPENCV_HAL_IMPL_C_PACK_STORE(v_uint64x2, uint64, v_uint32x4, unsigned, pack, static_cast)
1967 OPENCV_HAL_IMPL_C_PACK_STORE(v_int64x2, int64, v_int32x4, int, pack, static_cast)
1968 OPENCV_HAL_IMPL_C_PACK_STORE(v_int16x8, short, v_uint8x16, uchar, pack_u, saturate_cast)
1969 OPENCV_HAL_IMPL_C_PACK_STORE(v_int32x4, int, v_uint16x8, ushort, pack_u, saturate_cast)
1972 //! @brief Helper macro: defines template<int n> v_rshr_##pack_suffix##_store(ptr, a), a rounding shift, narrowing and store
1973 //! @ingroup core_hal_intrin_impl
// Per lane: adds (_Tp)1 << (n - 1), shifts right by n, converts with cast<_Tpn> and writes to ptr[i].
// Writes _Tpvec::nlanes elements. n must be at least 1 because the rounding term shifts by n - 1.
1974 #define OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(_Tpvec, _Tp, _Tpnvec, _Tpn, pack_suffix, cast) \
1975 template<int n> inline void v_rshr_##pack_suffix##_store(_Tpn* ptr, const _Tpvec& a) \
1977 for( int i = 0; i < _Tpvec::nlanes; i++ ) \
1978 ptr[i] = cast<_Tpn>((a.s[i] + ((_Tp)1 << (n - 1))) >> n); \
1981 //! @name Pack and store with rounding shift
1983 //! @brief Store values from the input vector into memory with pack
1985 //! Values will be shifted _n_ bits right with rounding, converted to narrower type and stored into
1986 //! memory. Variant with _u_ suffix converts to unsigned type.
1988 //! - pack: for 16-, 32- and 64-bit integer input types
1989 //! - pack_u: for 16- and 32-bit signed integer input types
1991 //! @note All variants except 64-bit use saturation.
1992 OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_uint16x8, ushort, v_uint8x16, uchar, pack, saturate_cast)
1993 OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int16x8, short, v_int8x16, schar, pack, saturate_cast)
1994 OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_uint32x4, unsigned, v_uint16x8, ushort, pack, saturate_cast)
1995 OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int32x4, int, v_int16x8, short, pack, saturate_cast)
1996 OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_uint64x2, uint64, v_uint32x4, unsigned, pack, static_cast)
1997 OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int64x2, int64, v_int32x4, int, pack, static_cast)
1998 OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int16x8, short, v_uint8x16, uchar, pack_u, saturate_cast)
1999 OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int32x4, int, v_uint16x8, ushort, pack_u, saturate_cast)
2002 /** @brief Matrix multiplication
2009 {D0 D1 D2 D3} x |V3|
2010 ====================
2011 {R0 R1 R2 R3}, where:
2012 R0 = A0V0 + A1V1 + A2V2 + A3V3,
2013 R1 = B0V0 + B1V1 + B2V2 + B3V3
// Result lane k = v[0]*m0[k] + v[1]*m1[k] + v[2]*m2[k] + v[3]*m3[k] for k = 0..3,
// i.e. a linear combination of m0..m3 weighted by the four lanes of v.
2017 inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
2018 const v_float32x4& m1, const v_float32x4& m2,
2019 const v_float32x4& m3)
2021 return v_float32x4(v.s[0]*m0.s[0] + v.s[1]*m1.s[0] + v.s[2]*m2.s[0] + v.s[3]*m3.s[0],
2022 v.s[0]*m0.s[1] + v.s[1]*m1.s[1] + v.s[2]*m2.s[1] + v.s[3]*m3.s[1],
2023 v.s[0]*m0.s[2] + v.s[1]*m1.s[2] + v.s[2]*m2.s[2] + v.s[3]*m3.s[2],
2024 v.s[0]*m0.s[3] + v.s[1]*m1.s[3] + v.s[2]*m2.s[3] + v.s[3]*m3.s[3]);
2027 /** @brief Matrix multiplication and add
2031 {A0 A1 A2 } |V0| |D0|
2032 {B0 B1 B2 } |V1| |D1|
2033 {C0 C1 C2 } x |V2| + |D2|
2034 ====================
2035 {R0 R1 R2 R3}, where:
2036 R0 = A0V0 + A1V1 + A2V2 + D0,
2037 R1 = B0V0 + B1V1 + B2V2 + D1
// Result lane k = v[0]*m0[k] + v[1]*m1[k] + v[2]*m2[k] + m3[k] for k = 0..3.
// Only the first three lanes of v are read; m3 is added unscaled as the additive term.
2041 inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
2042 const v_float32x4& m1, const v_float32x4& m2,
2043 const v_float32x4& m3)
2045 return v_float32x4(v.s[0]*m0.s[0] + v.s[1]*m1.s[0] + v.s[2]*m2.s[0] + m3.s[0],
2046 v.s[0]*m0.s[1] + v.s[1]*m1.s[1] + v.s[2]*m2.s[1] + m3.s[1],
2047 v.s[0]*m0.s[2] + v.s[1]*m1.s[2] + v.s[2]*m2.s[2] + m3.s[2],
2048 v.s[0]*m0.s[3] + v.s[1]*m1.s[3] + v.s[2]*m2.s[3] + m3.s[3]);
// Intentionally empty: this implementation has nothing to clean up.
2051 inline void v_cleanup() {}
2055 //! @name Check SIMD support
2057 //! @brief Check CPU capability of SIMD operation
// Takes no arguments and returns bool. Declared static inline, so every translation unit that
// includes this header gets its own internal-linkage copy.
2058 static inline bool hasSIMD128()
2066 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END