1 /*M///////////////////////////////////////////////////////////////////////////////////////
3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
5 // By downloading, copying, installing or using the software you agree to this license.
6 // If you do not agree to this license, do not download, install,
7 // copy or use the software.
11 // For Open Source Computer Vision Library
13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
15 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
16 // Copyright (C) 2015, Itseez Inc., all rights reserved.
17 // Third party copyrights are property of their respective owners.
19 // Redistribution and use in source and binary forms, with or without modification,
20 // are permitted provided that the following conditions are met:
22 // * Redistribution's of source code must retain the above copyright notice,
23 // this list of conditions and the following disclaimer.
25 // * Redistribution's in binary form must reproduce the above copyright notice,
26 // this list of conditions and the following disclaimer in the documentation
27 // and/or other materials provided with the distribution.
29 // * The name of the copyright holders may not be used to endorse or promote products
30 // derived from this software without specific prior written permission.
32 // This software is provided by the copyright holders and contributors "as is" and
33 // any express or implied warranties, including, but not limited to, the implied
34 // warranties of merchantability and fitness for a particular purpose are disclaimed.
35 // In no event shall the Intel Corporation or contributors be liable for any direct,
36 // indirect, incidental, special, exemplary, or consequential damages
37 // (including, but not limited to, procurement of substitute goods or services;
38 // loss of use, data, or profits; or business interruption) however caused
39 // and on any theory of liability, whether in contract, strict liability,
40 // or tort (including negligence or otherwise) arising in any way out of
41 // the use of this software, even if advised of the possibility of such damage.
45 #ifndef OPENCV_HAL_INTRIN_HPP
46 #define OPENCV_HAL_INTRIN_HPP
51 #include "opencv2/core/cvdef.h"
// Helper macros that expand to simple, fully parenthesized expressions:
//   OPENCV_HAL_ADD(a, b) -> (a) + (b)
//   OPENCV_HAL_AND(a, b) -> (a) & (b)
//   OPENCV_HAL_NOP(a)    -> a, unchanged
//   OPENCV_HAL_1ST(a, b) -> a (the second argument is discarded)
53 #define OPENCV_HAL_ADD(a, b) ((a) + (b))
54 #define OPENCV_HAL_AND(a, b) ((a) & (b))
55 #define OPENCV_HAL_NOP(a) (a)
56 #define OPENCV_HAL_1ST(a, b) (a)
58 // Unlike the HAL API, which lives in cv::hal,
59 // the intrinsics are placed in the cv namespace so that
60 // they are easier to use from within OpenCV code.
// Primary V_TypeTraits template; per-lane-type specializations are generated
// by CV_INTRIN_DEF_TYPE_TRAITS below.
// NOTE(review): the body of the primary template is not visible in this listing.
63 template<typename _Tp> struct V_TypeTraits
// CV_INTRIN_DEF_TYPE_TRAITS generates the V_TypeTraits<type> specialization for
// one lane type. Argument order:
//   (type, int_type_, uint_type_, abs_type_, w_type_, q_type_, sum_type_, nlanes128_)
// Each *_type_ argument becomes the nested typedef of the same name without the
// trailing underscore, `type` becomes value_type, and nlanes128_ becomes the
// enum constant nlanes128.
// The specialization also declares reinterpret_int(type) -> int_type and
// reinterpret_from_int(int_type) -> type; both declare a
// union { type l; int_type i; }, presumably to reinterpret the bits between
// the two types (the statements using the union are not visible here - confirm).
// In the instantiations that follow, `void` is passed for some w_type_/q_type_
// slots - presumably where no wider type is provided; verify against users.
// NOTE(review): several lines of the macro body (braces, union accesses) are
// missing from this listing.
67 #define CV_INTRIN_DEF_TYPE_TRAITS(type, int_type_, uint_type_, abs_type_, w_type_, q_type_, sum_type_, nlanes128_) \
68 template<> struct V_TypeTraits<type> \
70 typedef type value_type; \
71 typedef int_type_ int_type; \
72 typedef abs_type_ abs_type; \
73 typedef uint_type_ uint_type; \
74 typedef w_type_ w_type; \
75 typedef q_type_ q_type; \
76 typedef sum_type_ sum_type; \
77 enum { nlanes128 = nlanes128_ }; \
79 static inline int_type reinterpret_int(type x) \
81 union { type l; int_type i; } v; \
86 static inline type reinterpret_from_int(int_type x) \
88 union { type l; int_type i; } v; \
94 CV_INTRIN_DEF_TYPE_TRAITS(uchar, schar, uchar, uchar, ushort, unsigned, unsigned, 16);
95 CV_INTRIN_DEF_TYPE_TRAITS(schar, schar, uchar, uchar, short, int, int, 16);
96 CV_INTRIN_DEF_TYPE_TRAITS(ushort, short, ushort, ushort, unsigned, uint64, unsigned, 8);
97 CV_INTRIN_DEF_TYPE_TRAITS(short, short, ushort, ushort, int, int64, int, 8);
98 CV_INTRIN_DEF_TYPE_TRAITS(unsigned, int, unsigned, unsigned, uint64, void, unsigned, 4);
99 CV_INTRIN_DEF_TYPE_TRAITS(int, int, unsigned, unsigned, int64, void, int, 4);
100 CV_INTRIN_DEF_TYPE_TRAITS(float, int, unsigned, float, double, void, float, 4);
101 CV_INTRIN_DEF_TYPE_TRAITS(uint64, int64, uint64, uint64, void, void, uint64, 2);
102 CV_INTRIN_DEF_TYPE_TRAITS(int64, int64, uint64, uint64, void, void, int64, 2);
103 CV_INTRIN_DEF_TYPE_TRAITS(double, int64, uint64, double, void, void, double, 2);
// Names the namespace that holds the intrinsics:
//   - hal_<CV_CPU_DISPATCH_MODE> (built with __CV_CAT) in the first set of defines,
//     guarded by #ifdef CV_CPU_DISPATCH_MODE;
//   - hal_baseline in the second set.
// NOTE(review): the #else/#endif separating the two sets is not visible in this
// listing; the second set is presumably the fallback branch - confirm.
// After the defines, the namespace is opened and immediately closed (empty) and
// then pulled into the enclosing scope with a using-directive.
107 #ifdef CV_CPU_DISPATCH_MODE
108 #define CV_CPU_OPTIMIZATION_HAL_NAMESPACE __CV_CAT(hal_, CV_CPU_DISPATCH_MODE)
109 #define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN namespace __CV_CAT(hal_, CV_CPU_DISPATCH_MODE) {
110 #define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END }
112 #define CV_CPU_OPTIMIZATION_HAL_NAMESPACE hal_baseline
113 #define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN namespace hal_baseline {
114 #define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END }
117 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
118 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
119 using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE;
133 #include "opencv2/core/hal/intrin_sse.hpp"
137 #include "opencv2/core/hal/intrin_neon.hpp"
141 #include "opencv2/core/hal/intrin_vsx.hpp"
145 #define CV_SIMD128_CPP 1
146 #include "opencv2/core/hal/intrin_cpp.hpp"
150 // AVX2 can be used together with SSE2, so
151 // we define those two sets of intrinsics at once.
152 // Most of the intrinsics do not conflict (the proper overloaded variant is
153 // resolved by the argument types, e.g. v_float32x4 ~ SSE2, v_float32x8 ~ AVX2),
154 // but some of AVX2 intrinsics get v256_ prefix instead of v_, e.g. v256_load() vs v_load().
155 // Correspondingly, the wide intrinsics (which are mapped to the "widest"
156 // available instruction set) will get vx_ prefix
157 // (and will be mapped to v256_ counterparts) (e.g. vx_load() => v256_load())
160 #include "opencv2/core/hal/intrin_avx.hpp"
169 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
// Defaults CV_SIMD128_64F / CV_SIMD256_64F / CV_SIMD512_64F to 0 when they have
// not been defined before this point, then defines CV_SIMD_64F and
// CV_SIMD_WIDTH for the selected register width. Three alternatives are
// visible: width 64 with CV_SIMD512_64F, width 32 with CV_SIMD256_64F, and
// width 16 with CV_SIMD128_64F (this last one also sets CV_SIMD to CV_SIMD128).
// NOTE(review): the #if/#elif/#endif lines choosing between the alternatives,
// and the matching #endif of each #ifndef, are not visible in this listing.
176 #ifndef CV_SIMD128_64F
177 #define CV_SIMD128_64F 0
184 #ifndef CV_SIMD256_64F
185 #define CV_SIMD256_64F 0
192 #ifndef CV_SIMD512_64F
193 #define CV_SIMD512_64F 0
198 #define CV_SIMD_64F CV_SIMD512_64F
199 #define CV_SIMD_WIDTH 64
202 #define CV_SIMD_64F CV_SIMD256_64F
203 #define CV_SIMD_WIDTH 32
205 #define CV_SIMD CV_SIMD128
206 #define CV_SIMD_64F CV_SIMD128_64F
207 #define CV_SIMD_WIDTH 16
210 //==================================================================================================
// Defines the vx_ wrapper functions for one lane type.
//   typ       - scalar element type (pointer/value parameter type)
//   vtyp      - vector register type returned / stored
//   short_typ - suffix pasted into vx_setall_<short_typ> / vx_setzero_<short_typ>
//   prefix    - prefix of the functions being wrapped (pasted as prefix##_...)
//   loadsfx   - name of the load function family (pasted as vx_<loadsfx>)
// vx_setall_*, vx_setzero_*, vx_<loadsfx> and vx_<loadsfx>_aligned forward to
// the prefix##_ function of the same name. vx_store / vx_store_aligned always
// forward to v_store / v_store_aligned, independent of prefix.
212 #define CV_INTRIN_DEFINE_WIDE_INTRIN(typ, vtyp, short_typ, prefix, loadsfx) \
213 inline vtyp vx_setall_##short_typ(typ v) { return prefix##_setall_##short_typ(v); } \
214 inline vtyp vx_setzero_##short_typ() { return prefix##_setzero_##short_typ(); } \
215 inline vtyp vx_##loadsfx(const typ* ptr) { return prefix##_##loadsfx(ptr); } \
216 inline vtyp vx_##loadsfx##_aligned(const typ* ptr) { return prefix##_##loadsfx##_aligned(ptr); } \
217 inline void vx_store(typ* ptr, const vtyp& v) { return v_store(ptr, v); } \
218 inline void vx_store_aligned(typ* ptr, const vtyp& v) { return v_store_aligned(ptr, v); }
// Defines vx_load_expand(const typ*) returning wtyp; it forwards to
// prefix##_load_expand(ptr).
220 #define CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(typ, wtyp, prefix) \
221 inline wtyp vx_load_expand(const typ* ptr) { return prefix##_load_expand(ptr); }
// Defines vx_load_expand_q(const typ*) returning qtyp; it forwards to
// prefix##_load_expand_q(ptr).
223 #define CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND_Q(typ, qtyp, prefix) \
224 inline qtyp vx_load_expand_q(const typ* ptr) { return prefix##_load_expand_q(ptr); }
// Convenience macro: emits the basic vx_ wrappers (CV_INTRIN_DEFINE_WIDE_INTRIN)
// plus vx_load_expand (result type wtyp) and vx_load_expand_q (result type qtyp)
// for one lane type.
226 #define CV_INTRIN_DEFINE_WIDE_INTRIN_WITH_EXPAND(typ, vtyp, short_typ, wtyp, qtyp, prefix, loadsfx) \
227 CV_INTRIN_DEFINE_WIDE_INTRIN(typ, vtyp, short_typ, prefix, loadsfx) \
228 CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(typ, wtyp, prefix) \
229 CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND_Q(typ, qtyp, prefix)
// Emits the vx_ wrappers for the given prefix, using the width-agnostic
// v_uint8 ... v_float32 type names:
//   - uchar / schar: basic wrappers + vx_load_expand + vx_load_expand_q
//   - ushort / short / int / unsigned: basic wrappers + vx_load_expand
//   - float / int64 / uint64: basic wrappers only
// double is not covered by this macro; it is instantiated separately further
// down in this file.
231 #define CV_INTRIN_DEFINE_WIDE_INTRIN_ALL_TYPES(prefix) \
232 CV_INTRIN_DEFINE_WIDE_INTRIN_WITH_EXPAND(uchar, v_uint8, u8, v_uint16, v_uint32, prefix, load) \
233 CV_INTRIN_DEFINE_WIDE_INTRIN_WITH_EXPAND(schar, v_int8, s8, v_int16, v_int32, prefix, load) \
234 CV_INTRIN_DEFINE_WIDE_INTRIN(ushort, v_uint16, u16, prefix, load) \
235 CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(ushort, v_uint32, prefix) \
236 CV_INTRIN_DEFINE_WIDE_INTRIN(short, v_int16, s16, prefix, load) \
237 CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(short, v_int32, prefix) \
238 CV_INTRIN_DEFINE_WIDE_INTRIN(int, v_int32, s32, prefix, load) \
239 CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(int, v_int64, prefix) \
240 CV_INTRIN_DEFINE_WIDE_INTRIN(unsigned, v_uint32, u32, prefix, load) \
241 CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(unsigned, v_uint64, prefix) \
242 CV_INTRIN_DEFINE_WIDE_INTRIN(float, v_float32, f32, prefix, load) \
243 CV_INTRIN_DEFINE_WIDE_INTRIN(int64, v_int64, s64, prefix, load) \
244 CV_INTRIN_DEFINE_WIDE_INTRIN(uint64, v_uint64, u64, prefix, load)
// Primary V_RegTraits template; per-register-type specializations are generated
// by CV_DEF_REG_TRAITS below.
// NOTE(review): the body of the primary template is not visible in this listing.
246 template<typename _Tp> struct V_RegTraits
// CV_DEF_REG_TRAITS generates the V_RegTraits<_reg> specialization for one
// register type. Argument order:
//   (prefix, _reg, lane_type, suffix, _u_reg, _w_reg, _q_reg, _int_reg, _round_reg)
// The visible part of the body turns _u_reg, _w_reg, _q_reg, _int_reg and
// _round_reg into the nested typedefs u_reg, w_reg, q_reg, int_reg, round_reg.
// NOTE(review): lines of the macro body are missing from this listing, so how
// prefix, lane_type and suffix are used cannot be seen here.
//
// Instantiations follow for the 128-bit register types (prefix v) and the
// 256-bit register types (prefix v256).
// NOTE(review): v_float32x4 is specialized twice (w_reg = v_float64x2 vs. void);
// these are presumably the two branches of a conditional whose #if/#else lines
// are not visible - confirm.
// NOTE(review): v_float16x8 passes v_int16x8 as _round_reg while v_float16x16
// passes void; this looks inconsistent - confirm which is intended.
250 #define CV_DEF_REG_TRAITS(prefix, _reg, lane_type, suffix, _u_reg, _w_reg, _q_reg, _int_reg, _round_reg) \
251 template<> struct V_RegTraits<_reg> \
254 typedef _u_reg u_reg; \
255 typedef _w_reg w_reg; \
256 typedef _q_reg q_reg; \
257 typedef _int_reg int_reg; \
258 typedef _round_reg round_reg; \
261 #if CV_SIMD128 || CV_SIMD128_CPP
262 CV_DEF_REG_TRAITS(v, v_uint8x16, uchar, u8, v_uint8x16, v_uint16x8, v_uint32x4, v_int8x16, void);
263 CV_DEF_REG_TRAITS(v, v_int8x16, schar, s8, v_uint8x16, v_int16x8, v_int32x4, v_int8x16, void);
264 CV_DEF_REG_TRAITS(v, v_uint16x8, ushort, u16, v_uint16x8, v_uint32x4, v_uint64x2, v_int16x8, void);
265 CV_DEF_REG_TRAITS(v, v_int16x8, short, s16, v_uint16x8, v_int32x4, v_int64x2, v_int16x8, void);
266 CV_DEF_REG_TRAITS(v, v_uint32x4, unsigned, u32, v_uint32x4, v_uint64x2, void, v_int32x4, void);
267 CV_DEF_REG_TRAITS(v, v_int32x4, int, s32, v_uint32x4, v_int64x2, void, v_int32x4, void);
269 CV_DEF_REG_TRAITS(v, v_float32x4, float, f32, v_float32x4, v_float64x2, void, v_int32x4, v_int32x4);
271 CV_DEF_REG_TRAITS(v, v_float32x4, float, f32, v_float32x4, void, void, v_int32x4, v_int32x4);
273 CV_DEF_REG_TRAITS(v, v_uint64x2, uint64, u64, v_uint64x2, void, void, v_int64x2, void);
274 CV_DEF_REG_TRAITS(v, v_int64x2, int64, s64, v_uint64x2, void, void, v_int64x2, void);
276 CV_DEF_REG_TRAITS(v, v_float64x2, double, f64, v_float64x2, void, void, v_int64x2, v_int32x4);
279 CV_DEF_REG_TRAITS(v, v_float16x8, short, f16, v_float32x4, void, void, v_int16x8, v_int16x8);
284 CV_DEF_REG_TRAITS(v256, v_uint8x32, uchar, u8, v_uint8x32, v_uint16x16, v_uint32x8, v_int8x32, void);
285 CV_DEF_REG_TRAITS(v256, v_int8x32, schar, s8, v_uint8x32, v_int16x16, v_int32x8, v_int8x32, void);
286 CV_DEF_REG_TRAITS(v256, v_uint16x16, ushort, u16, v_uint16x16, v_uint32x8, v_uint64x4, v_int16x16, void);
287 CV_DEF_REG_TRAITS(v256, v_int16x16, short, s16, v_uint16x16, v_int32x8, v_int64x4, v_int16x16, void);
288 CV_DEF_REG_TRAITS(v256, v_uint32x8, unsigned, u32, v_uint32x8, v_uint64x4, void, v_int32x8, void);
289 CV_DEF_REG_TRAITS(v256, v_int32x8, int, s32, v_uint32x8, v_int64x4, void, v_int32x8, void);
290 CV_DEF_REG_TRAITS(v256, v_float32x8, float, f32, v_float32x8, v_float64x4, void, v_int32x8, v_int32x8);
291 CV_DEF_REG_TRAITS(v256, v_uint64x4, uint64, u64, v_uint64x4, void, void, v_int64x4, void);
292 CV_DEF_REG_TRAITS(v256, v_int64x4, int64, s64, v_uint64x4, void, void, v_int64x4, void);
293 CV_DEF_REG_TRAITS(v256, v_float64x4, double, f64, v_float64x4, void, void, v_int64x4, v_int32x8);
295 CV_DEF_REG_TRAITS(v256, v_float16x16, short, f16, v_float32x8, void, void, v_int16x16, void);
// Width-agnostic aliases bound to the 256-bit register types (v_uint8 is
// v_uint8x32, ..., v_float64 is v_float64x4, v_float16 is v_float16x16),
// followed by the vx_ wrappers instantiated with the v256 prefix.
// vx_cleanup() forwards to v256_cleanup().
// NOTE(review): the preprocessor conditions that select this set (and that
// guard the v_float64 / v_float16 lines) are not visible in this listing.
300 typedef v_uint8x32 v_uint8;
301 typedef v_int8x32 v_int8;
302 typedef v_uint16x16 v_uint16;
303 typedef v_int16x16 v_int16;
304 typedef v_uint32x8 v_uint32;
305 typedef v_int32x8 v_int32;
306 typedef v_uint64x4 v_uint64;
307 typedef v_int64x4 v_int64;
308 typedef v_float32x8 v_float32;
310 typedef v_float64x4 v_float64;
313 typedef v_float16x16 v_float16;
314 CV_INTRIN_DEFINE_WIDE_INTRIN(short, v_float16, f16, v256, load_f16)
316 CV_INTRIN_DEFINE_WIDE_INTRIN_ALL_TYPES(v256)
317 CV_INTRIN_DEFINE_WIDE_INTRIN(double, v_float64, f64, v256, load)
318 inline void vx_cleanup() { v256_cleanup(); }
// Width-agnostic aliases bound to the 128-bit register types (v_uint8 is
// v_uint8x16, ..., v_float64 is v_float64x2, v_float16 is v_float16x8),
// followed by the vx_ wrappers instantiated with the v prefix.
// vx_cleanup() forwards to v_cleanup().
// NOTE(review): the preprocessor conditions that select this set (and that
// guard the v_float64 / v_float16 lines) are not visible in this listing.
320 typedef v_uint8x16 v_uint8;
321 typedef v_int8x16 v_int8;
322 typedef v_uint16x8 v_uint16;
323 typedef v_int16x8 v_int16;
324 typedef v_uint32x4 v_uint32;
325 typedef v_int32x4 v_int32;
326 typedef v_uint64x2 v_uint64;
327 typedef v_int64x2 v_int64;
328 typedef v_float32x4 v_float32;
330 typedef v_float64x2 v_float64;
333 typedef v_float16x8 v_float16;
334 CV_INTRIN_DEFINE_WIDE_INTRIN(short, v_float16, f16, v, load_f16)
336 CV_INTRIN_DEFINE_WIDE_INTRIN_ALL_TYPES(v)
338 CV_INTRIN_DEFINE_WIDE_INTRIN(double, v_float64, f64, v, load)
340 inline void vx_cleanup() { v_cleanup(); }
/** @brief Counts the trailing (least significant) zero bits of a 32-bit value.

Equivalently, returns the zero-based index of the lowest set bit.

@param value Input bit mask.
@return Number of consecutive zero bits starting at bit 0, in the range [0, 31];
        32 when @p value is 0 (same convention as the TZCNT instruction), so the
        result is well defined and identical on every compiler path.
*/
inline unsigned int trailingZeros32(unsigned int value) {
#if defined(_MSC_VER)
#if (_MSC_VER < 1700) || defined(_M_ARM) || defined(_M_ARM64)
    // _tzcnt_u32 is not available here. _BitScanForward returns 0 and leaves
    // `index` unspecified when no bit is set, so check its return value.
    unsigned long index = 0;
    return _BitScanForward(&index, value) ? (unsigned int)index : 32u;
#elif defined(__clang__)
    // clang-cl only provides _tzcnt_u32 when BMI code generation is enabled.
    return value ? (unsigned int)__builtin_ctz(value) : 32u;
#else
    return _tzcnt_u32(value);
#endif
#elif defined(__GNUC__) || defined(__GNUG__) || defined(__clang__)
    // __builtin_ctz(0) is undefined, hence the explicit zero check.
    return value ? (unsigned int)__builtin_ctz(value) : 32u;
#elif defined(__ICC) || defined(__INTEL_COMPILER)
    return value ? (unsigned int)_bit_scan_forward(value) : 32u;
#else
    // Portable fallback: isolate the lowest set bit and map it to its index
    // through a De Bruijn sequence lookup.
    if (value == 0)
        return 32u;
    static const int MultiplyDeBruijnBitPosition[32] = {
        0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
        31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9 };
    // (0u - value) instead of -value: avoids negating an unsigned operand.
    return MultiplyDeBruijnBitPosition[((uint32_t)((value & (0u - value)) * 0x077CB531U)) >> 27];
#endif
}
367 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END