endif()
if(WITH_INF_ENGINE OR HAVE_INF_ENGINE)
- status(" Inference Engine:" HAVE_INF_ENGINE THEN "YES (${INF_ENGINE_LIBRARIES} ${INF_ENGINE_INCLUDE_DIRS})" ELSE NO)
+ if(HAVE_INF_ENGINE)
+ set(__msg "YES")
+ if(DEFINED INF_ENGINE_VERSION)
+ set(__msg "YES (ver ${INF_ENGINE_VERSION})")
+ endif()
+ status(" Inference Engine:" "${__msg}")
+ status(" libs:" "${INF_ENGINE_LIBRARIES}")
+ status(" includes:" "${INF_ENGINE_INCLUDE_DIRS}")
+ else()
+ status(" Inference Engine:" "NO")
+ endif()
endif()
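With this change the configure summary reports the engine on three lines instead of one. A hypothetical excerpt, assuming IE was found under the default SDK location (the version and paths depend entirely on the local install):

    --   Inference Engine:     YES (ver 1.0)
    --         libs:           .../deployment_tools/inference_engine/lib/intel64/libinference_engine.so
    --         includes:       .../deployment_tools/inference_engine/include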
if(WITH_EIGEN OR HAVE_EIGEN)
return()
endif()
-set(CMAKE_MODULE_PATH "${OpenCV_SOURCE_DIR}/cmake" ${CMAKE_MODULE_PATH})
-if(ANDROID)
- set(CUDA_TARGET_OS_VARIANT "Android")
-endif()
-find_host_package(CUDA "${MIN_VER_CUDA}" QUIET)
+if(((NOT CMAKE_VERSION VERSION_LESS "3.9.0") # requires https://gitlab.kitware.com/cmake/cmake/merge_requests/663
+ OR OPENCV_CUDA_FORCE_EXTERNAL_CMAKE_MODULE)
+ AND NOT OPENCV_CUDA_FORCE_BUILTIN_CMAKE_MODULE)
+ ocv_update(CUDA_LINK_LIBRARIES_KEYWORD "LINK_PRIVATE")
+ find_host_package(CUDA "${MIN_VER_CUDA}" QUIET)
+else()
+ # Use OpenCV's patched "FindCUDA" module
+ set(CMAKE_MODULE_PATH "${OpenCV_SOURCE_DIR}/cmake" ${CMAKE_MODULE_PATH})
+
+ if(ANDROID)
+ set(CUDA_TARGET_OS_VARIANT "Android")
+ endif()
+ find_host_package(CUDA "${MIN_VER_CUDA}" QUIET)
-list(REMOVE_AT CMAKE_MODULE_PATH 0)
+ list(REMOVE_AT CMAKE_MODULE_PATH 0)
+endif()
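Selection between the two FindCUDA code paths can be forced from the command line; the option names come straight from the condition above:

    # prefer OpenCV's patched builtin module even on CMake >= 3.9
    cmake -DOPENCV_CUDA_FORCE_BUILTIN_CMAKE_MODULE=ON ..

    # prefer the upstream module (relies on the CMake 3.9 LINK_PRIVATE support)
    cmake -DOPENCV_CUDA_FORCE_EXTERNAL_CMAKE_MODULE=ON ..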
if(CUDA_FOUND)
set(HAVE_CUDA 1)
endmacro()
if(NOT HAVE_CXX11)
+  message(WARNING "DL Inference Engine requires C++11. You can turn it on via the ENABLE_CXX11=ON CMake flag.")
ie_fail()
endif()
+find_package(InferenceEngine QUIET)
+if(InferenceEngine_FOUND)
+ set(INF_ENGINE_LIBRARIES "${InferenceEngine_LIBRARIES}")
+ set(INF_ENGINE_INCLUDE_DIRS "${InferenceEngine_INCLUDE_DIRS}")
+ set(INF_ENGINE_VERSION "${InferenceEngine_VERSION}")
+ set(HAVE_INF_ENGINE TRUE)
+ return()
+endif()
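Package-config mode is now tried first, so an SDK that ships InferenceEngineConfig.cmake can be selected the standard CMake way; the share/ subdirectory below is an assumption that varies between releases:

    cmake -DInferenceEngine_DIR=/path/to/inference_engine/share ..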
+
+ocv_check_environment_variables(INTEL_CVSDK_DIR INF_ENGINE_ROOT_DIR IE_PLUGINS_PATH)
+
if(NOT INF_ENGINE_ROOT_DIR OR NOT EXISTS "${INF_ENGINE_ROOT_DIR}/include/inference_engine.hpp")
set(ie_root_paths "${INF_ENGINE_ROOT_DIR}")
- if(DEFINED ENV{INTEL_CVSDK_DIR})
- list(APPEND ie_root_paths "$ENV{INTEL_CVSDK_DIR}")
- list(APPEND ie_root_paths "$ENV{INTEL_CVSDK_DIR}/inference_engine")
- endif()
if(DEFINED INTEL_CVSDK_DIR)
- list(APPEND ie_root_paths "${INTEL_CVSDK_DIR}")
- list(APPEND ie_root_paths "${INTEL_CVSDK_DIR}/inference_engine")
+ list(APPEND ie_root_paths "${INTEL_CVSDK_DIR}/")
+ list(APPEND ie_root_paths "${INTEL_CVSDK_DIR}/deployment_tools/inference_engine")
endif()
if(NOT ie_root_paths)
- list(APPEND ie_root_paths "/opt/intel/deeplearning_deploymenttoolkit/deployment_tools/inference_engine")
+ list(APPEND ie_root_paths "/opt/intel/computer_vision_sdk/deployment_tools/inference_engine/")
endif()
find_path(INF_ENGINE_ROOT_DIR include/inference_engine.hpp PATHS ${ie_root_paths})
+ if(INF_ENGINE_ROOT_DIR MATCHES "-NOTFOUND$")
+ unset(INF_ENGINE_ROOT_DIR CACHE)
+ endif()
endif()
set(INF_ENGINE_INCLUDE_DIRS "${INF_ENGINE_ROOT_DIR}/include" CACHE PATH "Path to Inference Engine include directory")
OR NOT EXISTS "${INF_ENGINE_ROOT_DIR}"
OR NOT EXISTS "${INF_ENGINE_ROOT_DIR}/include/inference_engine.hpp"
)
+ message(WARNING "DL IE: Can't detect INF_ENGINE_ROOT_DIR location.")
ie_fail()
endif()
set(ie_lib_list inference_engine)
+if(NOT IS_ABSOLUTE "${IE_PLUGINS_PATH}")
+ set(IE_PLUGINS_PATH "${INF_ENGINE_ROOT_DIR}/${IE_PLUGINS_PATH}")
+endif()
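If package-config mode fails, detection falls back to the environment variables whitelisted by ocv_check_environment_variables() above. A hypothetical setup (the lib subdirectory is platform-specific and only an example); note that a relative IE_PLUGINS_PATH gets prefixed with INF_ENGINE_ROOT_DIR thanks to the IS_ABSOLUTE check:

    export INTEL_CVSDK_DIR=/opt/intel/computer_vision_sdk
    export IE_PLUGINS_PATH=deployment_tools/inference_engine/lib/ubuntu_16.04/intel64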
+
link_directories(
- ${INTEL_CVSDK_DIR}/inference_engine/external/mkltiny_lnx/lib
- ${INTEL_CVSDK_DIR}/inference_engine/external/cldnn/lib
+ ${INF_ENGINE_ROOT_DIR}/external/mkltiny_lnx/lib
+ ${INF_ENGINE_ROOT_DIR}/external/cldnn/lib
)
foreach(lib ${ie_lib_list})
- find_library(${lib}
- NAMES ${lib}
- # For inference_engine
- HINTS ${IE_PLUGINS_PATH}
- HINTS "$ENV{IE_PLUGINS_PATH}"
- )
+ find_library(${lib} NAMES ${lib} HINTS ${IE_PLUGINS_PATH})
if(NOT ${lib})
+ message(WARNING "DL IE: Can't find library: '${lib}'")
ie_fail()
endif()
list(APPEND INF_ENGINE_LIBRARIES ${${lib}})
]
}
},
+ "namespaces_dict": {
+ "cv.fisheye": "fisheye"
+ },
"func_arg_fix" : {
"findFundamentalMat" : { "points1" : {"ctype" : "vector_Point2f"},
"points2" : {"ctype" : "vector_Point2f"} },
{
cvtColor(img, img, COLOR_BGR2GRAY);
}
- else
- {
- img.clone();
- }
int prev_sqr_size = 0;
{
if (flags & CALIB_CB_NORMALIZE_IMAGE)
{
+ img = img.clone();
equalizeHist(img, img);
}
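Worth noting: the removed else-branch called img.clone() and silently discarded the result, a no-op, while the fix clones only under CALIB_CB_NORMALIZE_IMAGE and actually assigns the copy back before equalizeHist() runs in place. The CV_NODISCARD attribute introduced below exists to let the compiler flag exactly this kind of discarded return value.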
/****************************************************************************************\
+* CV_NODISCARD attribute *
+* encourages the compiler to issue a warning if the return value is discarded (C++17) *
+\****************************************************************************************/
+#ifndef CV_NODISCARD
+# if defined(__GNUC__)
+# define CV_NODISCARD __attribute__((__warn_unused_result__)) // at least available with GCC 3.4
+# elif defined(__clang__) && defined(__has_attribute)
+# if __has_attribute(__warn_unused_result__)
+# define CV_NODISCARD __attribute__((__warn_unused_result__))
+# endif
+# endif
+#endif
+#ifndef CV_NODISCARD
+# define CV_NODISCARD /* nothing by default */
+#endif
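A minimal sketch of the attribute at a call site (hypothetical helper; with GCC/Clang a discarded return now produces a warning):

    #include <opencv2/core.hpp>

    CV_NODISCARD cv::Mat cloned(const cv::Mat& src)   // hypothetical helper
    {
        return src.clone();
    }

    void use(const cv::Mat& img)
    {
        cloned(img);                  // warning: ignoring return value
        cv::Mat copy = cloned(img);   // OK
    }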
+
+
+/****************************************************************************************\
* C++ 11 *
\****************************************************************************************/
#ifndef CV_CXX11
// access from within opencv code more accessible
namespace cv {
-#ifndef CV_DOXYGEN
-
-#ifdef CV_CPU_DISPATCH_MODE
-#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE __CV_CAT(hal_, CV_CPU_DISPATCH_MODE)
-#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN namespace __CV_CAT(hal_, CV_CPU_DISPATCH_MODE) {
-#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END }
-#else
-#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE hal_baseline
-#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN namespace hal_baseline {
-#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END }
-#endif
-
-
-CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
-CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
-using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE;
-CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
-#endif
-
-//! @addtogroup core_hal_intrin
-//! @{
-
-//! @cond IGNORED
template<typename _Tp> struct V_TypeTraits
{
- typedef _Tp int_type;
- typedef _Tp uint_type;
- typedef _Tp abs_type;
- typedef _Tp sum_type;
-
- enum { delta = 0, shift = 0 };
-
- static int_type reinterpret_int(_Tp x) { return x; }
- static uint_type reinterpet_uint(_Tp x) { return x; }
- static _Tp reinterpret_from_int(int_type x) { return (_Tp)x; }
-};
-
-template<> struct V_TypeTraits<uchar>
-{
- typedef uchar value_type;
- typedef schar int_type;
- typedef uchar uint_type;
- typedef uchar abs_type;
- typedef int sum_type;
-
- typedef ushort w_type;
- typedef unsigned q_type;
-
- enum { delta = 128, shift = 8 };
-
- static int_type reinterpret_int(value_type x) { return (int_type)x; }
- static uint_type reinterpret_uint(value_type x) { return (uint_type)x; }
- static value_type reinterpret_from_int(int_type x) { return (value_type)x; }
-};
-
-template<> struct V_TypeTraits<schar>
-{
- typedef schar value_type;
- typedef schar int_type;
- typedef uchar uint_type;
- typedef uchar abs_type;
- typedef int sum_type;
-
- typedef short w_type;
- typedef int q_type;
-
- enum { delta = 128, shift = 8 };
-
- static int_type reinterpret_int(value_type x) { return (int_type)x; }
- static uint_type reinterpret_uint(value_type x) { return (uint_type)x; }
- static value_type reinterpret_from_int(int_type x) { return (value_type)x; }
-};
-
-template<> struct V_TypeTraits<ushort>
-{
- typedef ushort value_type;
- typedef short int_type;
- typedef ushort uint_type;
- typedef ushort abs_type;
- typedef int sum_type;
-
- typedef unsigned w_type;
- typedef uchar nu_type;
-
- enum { delta = 32768, shift = 16 };
-
- static int_type reinterpret_int(value_type x) { return (int_type)x; }
- static uint_type reinterpret_uint(value_type x) { return (uint_type)x; }
- static value_type reinterpret_from_int(int_type x) { return (value_type)x; }
-};
-
-template<> struct V_TypeTraits<short>
-{
- typedef short value_type;
- typedef short int_type;
- typedef ushort uint_type;
- typedef ushort abs_type;
- typedef int sum_type;
-
- typedef int w_type;
- typedef uchar nu_type;
- typedef schar n_type;
-
- enum { delta = 128, shift = 8 };
-
- static int_type reinterpret_int(value_type x) { return (int_type)x; }
- static uint_type reinterpret_uint(value_type x) { return (uint_type)x; }
- static value_type reinterpret_from_int(int_type x) { return (value_type)x; }
-};
-
-template<> struct V_TypeTraits<unsigned>
-{
- typedef unsigned value_type;
- typedef int int_type;
- typedef unsigned uint_type;
- typedef unsigned abs_type;
- typedef unsigned sum_type;
-
- typedef uint64 w_type;
- typedef ushort nu_type;
-
- static int_type reinterpret_int(value_type x) { return (int_type)x; }
- static uint_type reinterpret_uint(value_type x) { return (uint_type)x; }
- static value_type reinterpret_from_int(int_type x) { return (value_type)x; }
-};
-
-template<> struct V_TypeTraits<int>
-{
- typedef int value_type;
- typedef int int_type;
- typedef unsigned uint_type;
- typedef unsigned abs_type;
- typedef int sum_type;
-
- typedef int64 w_type;
- typedef short n_type;
- typedef ushort nu_type;
-
- static int_type reinterpret_int(value_type x) { return (int_type)x; }
- static uint_type reinterpret_uint(value_type x) { return (uint_type)x; }
- static value_type reinterpret_from_int(int_type x) { return (value_type)x; }
-};
-
-template<> struct V_TypeTraits<uint64>
-{
- typedef uint64 value_type;
- typedef int64 int_type;
- typedef uint64 uint_type;
- typedef uint64 abs_type;
- typedef uint64 sum_type;
-
- typedef unsigned nu_type;
-
- static int_type reinterpret_int(value_type x) { return (int_type)x; }
- static uint_type reinterpret_uint(value_type x) { return (uint_type)x; }
- static value_type reinterpret_from_int(int_type x) { return (value_type)x; }
-};
-
-template<> struct V_TypeTraits<int64>
-{
- typedef int64 value_type;
- typedef int64 int_type;
- typedef uint64 uint_type;
- typedef uint64 abs_type;
- typedef int64 sum_type;
-
- typedef int nu_type;
-
- static int_type reinterpret_int(value_type x) { return (int_type)x; }
- static uint_type reinterpret_uint(value_type x) { return (uint_type)x; }
- static value_type reinterpret_from_int(int_type x) { return (value_type)x; }
-};
-
-
-template<> struct V_TypeTraits<float>
-{
- typedef float value_type;
- typedef int int_type;
- typedef unsigned uint_type;
- typedef float abs_type;
- typedef float sum_type;
-
- typedef double w_type;
-
- static int_type reinterpret_int(value_type x)
- {
- Cv32suf u;
- u.f = x;
- return u.i;
- }
- static uint_type reinterpet_uint(value_type x)
- {
- Cv32suf u;
- u.f = x;
- return u.u;
- }
- static value_type reinterpret_from_int(int_type x)
- {
- Cv32suf u;
- u.i = x;
- return u.f;
- }
};
-template<> struct V_TypeTraits<double>
-{
- typedef double value_type;
- typedef int64 int_type;
- typedef uint64 uint_type;
- typedef double abs_type;
- typedef double sum_type;
- static int_type reinterpret_int(value_type x)
- {
- Cv64suf u;
- u.f = x;
- return u.i;
- }
- static uint_type reinterpet_uint(value_type x)
- {
- Cv64suf u;
- u.f = x;
- return u.u;
+#define CV_INTRIN_DEF_TYPE_TRAITS(type, int_type_, uint_type_, abs_type_, w_type_, q_type_, sum_type_, nlanes128_) \
+ template<> struct V_TypeTraits<type> \
+ { \
+ typedef type value_type; \
+ typedef int_type_ int_type; \
+ typedef abs_type_ abs_type; \
+ typedef uint_type_ uint_type; \
+ typedef w_type_ w_type; \
+ typedef q_type_ q_type; \
+ typedef sum_type_ sum_type; \
+ enum { nlanes128 = nlanes128_ }; \
+ \
+ static inline int_type reinterpret_int(type x) \
+ { \
+ union { type l; int_type i; } v; \
+ v.l = x; \
+ return v.i; \
+ } \
+ \
+ static inline type reinterpret_from_int(int_type x) \
+ { \
+ union { type l; int_type i; } v; \
+ v.i = x; \
+ return v.l; \
+ } \
}
- static value_type reinterpret_from_int(int_type x)
- {
- Cv64suf u;
- u.i = x;
- return u.f;
- }
-};
-template <typename T> struct V_SIMD128Traits
-{
- enum { nlanes = 16 / sizeof(T) };
-};
+CV_INTRIN_DEF_TYPE_TRAITS(uchar, schar, uchar, uchar, ushort, unsigned, unsigned, 16);
+CV_INTRIN_DEF_TYPE_TRAITS(schar, schar, uchar, uchar, short, int, int, 16);
+CV_INTRIN_DEF_TYPE_TRAITS(ushort, short, ushort, ushort, unsigned, uint64, unsigned, 8);
+CV_INTRIN_DEF_TYPE_TRAITS(short, short, ushort, ushort, int, int64, int, 8);
+CV_INTRIN_DEF_TYPE_TRAITS(unsigned, int, unsigned, unsigned, uint64, void, unsigned, 4);
+CV_INTRIN_DEF_TYPE_TRAITS(int, int, unsigned, unsigned, int64, void, int, 4);
+CV_INTRIN_DEF_TYPE_TRAITS(float, int, unsigned, float, double, void, float, 4);
+CV_INTRIN_DEF_TYPE_TRAITS(uint64, int64, uint64, uint64, void, void, uint64, 2);
+CV_INTRIN_DEF_TYPE_TRAITS(int64, int64, uint64, uint64, void, void, int64, 2);
+CV_INTRIN_DEF_TYPE_TRAITS(double, int64, uint64, double, void, void, double, 2);
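A sketch of how generic code consumes the traits table, assuming the expansions above (the float case round-trips the bit pattern through int, as the removed Cv32suf-based specialization did):

    #include <opencv2/core/hal/intrin.hpp>
    using namespace cv;

    void traits_demo()
    {
        // bit-exact float <-> int reinterpretation
        float x = 1.5f;
        V_TypeTraits<float>::int_type bits = V_TypeTraits<float>::reinterpret_int(x);
        float y = V_TypeTraits<float>::reinterpret_from_int(bits);   // y == 1.5f

        // widening chain used by the expand-style intrinsics
        typedef V_TypeTraits<uchar>::w_type wide_t;   // ushort
        typedef V_TypeTraits<uchar>::q_type quad_t;   // unsigned
        (void)y; (void)sizeof(wide_t); (void)sizeof(quad_t);
    }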
-//! @endcond
+#ifndef CV_DOXYGEN
-//! @}
+#ifdef CV_CPU_DISPATCH_MODE
+ #define CV_CPU_OPTIMIZATION_HAL_NAMESPACE __CV_CAT(hal_, CV_CPU_DISPATCH_MODE)
+ #define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN namespace __CV_CAT(hal_, CV_CPU_DISPATCH_MODE) {
+ #define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END }
+#else
+ #define CV_CPU_OPTIMIZATION_HAL_NAMESPACE hal_baseline
+ #define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN namespace hal_baseline {
+ #define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END }
+#endif
-#ifndef CV_DOXYGEN
+CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
+using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE;
#endif
}
#ifdef CV_DOXYGEN
+# undef CV_AVX2
# undef CV_SSE2
# undef CV_NEON
# undef CV_VSX
+# undef CV_FP16
#endif
#if CV_SSE2
#else
+#define CV_SIMD128_CPP 1
#include "opencv2/core/hal/intrin_cpp.hpp"
#endif
-//! @addtogroup core_hal_intrin
-//! @{
+// AVX2 can be used together with SSE2, so
+// we define those two sets of intrinsics at once.
+// Most of the intrinsics do not conflict (the proper overloaded variant is
+// resolved by the argument types, e.g. v_float32x4 ~ SSE2, v_float32x8 ~ AVX2),
+// but some AVX2 intrinsics get a v256_ prefix instead of v_, e.g. v256_load() vs v_load().
+// Correspondingly, the wide intrinsics (which are mapped to the widest
+// available instruction set) get a vx_ prefix and map to their v256_
+// counterparts (e.g. vx_load() => v256_load()).
+#if CV_AVX2
-#ifndef CV_SIMD128
-//! Set to 1 if current compiler supports vector extensions (NEON or SSE is enabled)
-#define CV_SIMD128 0
-#endif
+#include "opencv2/core/hal/intrin_avx.hpp"
-#ifndef CV_SIMD128_64F
-//! Set to 1 if current intrinsics implementation supports 64-bit float vectors
-#define CV_SIMD128_64F 0
#endif
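A sketch of the width-agnostic style this enables: the same loop compiles to 128- or 256-bit code depending on which backend the v_float32/vx_ aliases (defined below) resolve to; n is assumed to be a multiple of the lane count:

    #include <opencv2/core/hal/intrin.hpp>
    using namespace cv;

    void add_f32(const float* a, const float* b, float* dst, int n)
    {
    #if CV_SIMD
        const int step = v_float32::nlanes;    // 4 with SSE2/NEON, 8 with AVX2
        for (int i = 0; i < n; i += step)
            vx_store(dst + i, vx_load(a + i) + vx_load(b + i));
    #else
        for (int i = 0; i < n; i++)
            dst[i] = a[i] + b[i];
    #endif
    }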
-//! @}
-
-//==================================================================================================
-
//! @cond IGNORED
namespace cv {
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
#endif
-template <typename R> struct V_RegTrait128;
+#ifndef CV_SIMD128
+#define CV_SIMD128 0
+#endif
-template <> struct V_RegTrait128<uchar> {
- typedef v_uint8x16 reg;
- typedef v_uint16x8 w_reg;
- typedef v_uint32x4 q_reg;
- typedef v_uint8x16 u_reg;
- static v_uint8x16 zero() { return v_setzero_u8(); }
- static v_uint8x16 all(uchar val) { return v_setall_u8(val); }
-};
+#ifndef CV_SIMD128_64F
+#define CV_SIMD128_64F 0
+#endif
-template <> struct V_RegTrait128<schar> {
- typedef v_int8x16 reg;
- typedef v_int16x8 w_reg;
- typedef v_int32x4 q_reg;
- typedef v_uint8x16 u_reg;
- static v_int8x16 zero() { return v_setzero_s8(); }
- static v_int8x16 all(schar val) { return v_setall_s8(val); }
-};
+#ifndef CV_SIMD256
+#define CV_SIMD256 0
+#endif
-template <> struct V_RegTrait128<ushort> {
- typedef v_uint16x8 reg;
- typedef v_uint32x4 w_reg;
- typedef v_int16x8 int_reg;
- typedef v_uint16x8 u_reg;
- static v_uint16x8 zero() { return v_setzero_u16(); }
- static v_uint16x8 all(ushort val) { return v_setall_u16(val); }
-};
+#ifndef CV_SIMD256_64F
+#define CV_SIMD256_64F 0
+#endif
-template <> struct V_RegTrait128<short> {
- typedef v_int16x8 reg;
- typedef v_int32x4 w_reg;
- typedef v_uint16x8 u_reg;
- static v_int16x8 zero() { return v_setzero_s16(); }
- static v_int16x8 all(short val) { return v_setall_s16(val); }
-};
+#ifndef CV_SIMD512
+#define CV_SIMD512 0
+#endif
-template <> struct V_RegTrait128<unsigned> {
- typedef v_uint32x4 reg;
- typedef v_uint64x2 w_reg;
- typedef v_int32x4 int_reg;
- typedef v_uint32x4 u_reg;
- static v_uint32x4 zero() { return v_setzero_u32(); }
- static v_uint32x4 all(unsigned val) { return v_setall_u32(val); }
-};
+#ifndef CV_SIMD512_64F
+#define CV_SIMD512_64F 0
+#endif
-template <> struct V_RegTrait128<int> {
- typedef v_int32x4 reg;
- typedef v_int64x2 w_reg;
- typedef v_uint32x4 u_reg;
- static v_int32x4 zero() { return v_setzero_s32(); }
- static v_int32x4 all(int val) { return v_setall_s32(val); }
-};
+#if CV_SIMD512
+ #define CV_SIMD 1
+ #define CV_SIMD_64F CV_SIMD512_64F
+ #define CV_SIMD_WIDTH 64
+#elif CV_SIMD256
+ #define CV_SIMD 1
+ #define CV_SIMD_64F CV_SIMD256_64F
+ #define CV_SIMD_WIDTH 32
+#else
+ #define CV_SIMD CV_SIMD128
+ #define CV_SIMD_64F CV_SIMD128_64F
+ #define CV_SIMD_WIDTH 16
+#endif
-template <> struct V_RegTrait128<uint64> {
- typedef v_uint64x2 reg;
- static v_uint64x2 zero() { return v_setzero_u64(); }
- static v_uint64x2 all(uint64 val) { return v_setall_u64(val); }
-};
+//==================================================================================================
-template <> struct V_RegTrait128<int64> {
- typedef v_int64x2 reg;
- static v_int64x2 zero() { return v_setzero_s64(); }
- static v_int64x2 all(int64 val) { return v_setall_s64(val); }
+#define CV_INTRIN_DEFINE_WIDE_INTRIN(typ, vtyp, short_typ, prefix, loadsfx) \
+ inline vtyp vx_setall_##short_typ(typ v) { return prefix##_setall_##short_typ(v); } \
+ inline vtyp vx_setzero_##short_typ() { return prefix##_setzero_##short_typ(); } \
+ inline vtyp vx_##loadsfx(const typ* ptr) { return prefix##_##loadsfx(ptr); } \
+ inline vtyp vx_##loadsfx##_aligned(const typ* ptr) { return prefix##_##loadsfx##_aligned(ptr); } \
+ inline void vx_store(typ* ptr, const vtyp& v) { return v_store(ptr, v); } \
+ inline void vx_store_aligned(typ* ptr, const vtyp& v) { return v_store_aligned(ptr, v); }
+
+#define CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(typ, wtyp, prefix) \
+inline wtyp vx_load_expand(const typ* ptr) { return prefix##_load_expand(ptr); }
+
+#define CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND_Q(typ, qtyp, prefix) \
+inline qtyp vx_load_expand_q(const typ* ptr) { return prefix##_load_expand_q(ptr); }
+
+#define CV_INTRIN_DEFINE_WIDE_INTRIN_WITH_EXPAND(typ, vtyp, short_typ, wtyp, qtyp, prefix, loadsfx) \
+ CV_INTRIN_DEFINE_WIDE_INTRIN(typ, vtyp, short_typ, prefix, loadsfx) \
+ CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(typ, wtyp, prefix) \
+ CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND_Q(typ, qtyp, prefix)
+
+#define CV_INTRIN_DEFINE_WIDE_INTRIN_ALL_TYPES(prefix) \
+ CV_INTRIN_DEFINE_WIDE_INTRIN_WITH_EXPAND(uchar, v_uint8, u8, v_uint16, v_uint32, prefix, load) \
+ CV_INTRIN_DEFINE_WIDE_INTRIN_WITH_EXPAND(schar, v_int8, s8, v_int16, v_int32, prefix, load) \
+ CV_INTRIN_DEFINE_WIDE_INTRIN(ushort, v_uint16, u16, prefix, load) \
+ CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(ushort, v_uint32, prefix) \
+ CV_INTRIN_DEFINE_WIDE_INTRIN(short, v_int16, s16, prefix, load) \
+ CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(short, v_int32, prefix) \
+ CV_INTRIN_DEFINE_WIDE_INTRIN(int, v_int32, s32, prefix, load) \
+ CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(int, v_int64, prefix) \
+ CV_INTRIN_DEFINE_WIDE_INTRIN(unsigned, v_uint32, u32, prefix, load) \
+ CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(unsigned, v_uint64, prefix) \
+ CV_INTRIN_DEFINE_WIDE_INTRIN(float, v_float32, f32, prefix, load) \
+ CV_INTRIN_DEFINE_WIDE_INTRIN(int64, v_int64, s64, prefix, load) \
+ CV_INTRIN_DEFINE_WIDE_INTRIN(uint64, v_uint64, u64, prefix, load)
+
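For reference, one concrete expansion of the helpers above: with the 256-bit backend, the float line of CV_INTRIN_DEFINE_WIDE_INTRIN_ALL_TYPES(v256) produces plain forwarding wrappers. Note that the store wrappers forward to the overloaded v_store/v_store_aligned rather than to a prefixed name:

    // CV_INTRIN_DEFINE_WIDE_INTRIN(float, v_float32, f32, v256, load) yields:
    inline v_float32 vx_setall_f32(float v)            { return v256_setall_f32(v); }
    inline v_float32 vx_setzero_f32()                  { return v256_setzero_f32(); }
    inline v_float32 vx_load(const float* ptr)         { return v256_load(ptr); }
    inline v_float32 vx_load_aligned(const float* ptr) { return v256_load_aligned(ptr); }
    inline void vx_store(float* ptr, const v_float32& v)         { return v_store(ptr, v); }
    inline void vx_store_aligned(float* ptr, const v_float32& v) { return v_store_aligned(ptr, v); }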
+template<typename _Tp> struct V_RegTraits
+{
};
-template <> struct V_RegTrait128<float> {
- typedef v_float32x4 reg;
- typedef v_int32x4 int_reg;
- typedef v_float32x4 u_reg;
- static v_float32x4 zero() { return v_setzero_f32(); }
- static v_float32x4 all(float val) { return v_setall_f32(val); }
-};
+#define CV_DEF_REG_TRAITS(prefix, _reg, lane_type, suffix, _u_reg, _w_reg, _q_reg, _int_reg, _round_reg) \
+ template<> struct V_RegTraits<_reg> \
+ { \
+ typedef _reg reg; \
+ typedef _u_reg u_reg; \
+ typedef _w_reg w_reg; \
+ typedef _q_reg q_reg; \
+ typedef _int_reg int_reg; \
+ typedef _round_reg round_reg; \
+ }
+#if CV_SIMD128 || CV_SIMD128_CPP
+ CV_DEF_REG_TRAITS(v, v_uint8x16, uchar, u8, v_uint8x16, v_uint16x8, v_uint32x4, v_int8x16, void);
+ CV_DEF_REG_TRAITS(v, v_int8x16, schar, s8, v_uint8x16, v_int16x8, v_int32x4, v_int8x16, void);
+ CV_DEF_REG_TRAITS(v, v_uint16x8, ushort, u16, v_uint16x8, v_uint32x4, v_uint64x2, v_int16x8, void);
+ CV_DEF_REG_TRAITS(v, v_int16x8, short, s16, v_uint16x8, v_int32x4, v_int64x2, v_int16x8, void);
+ CV_DEF_REG_TRAITS(v, v_uint32x4, unsigned, u32, v_uint32x4, v_uint64x2, void, v_int32x4, void);
+ CV_DEF_REG_TRAITS(v, v_int32x4, int, s32, v_uint32x4, v_int64x2, void, v_int32x4, void);
#if CV_SIMD128_64F
-template <> struct V_RegTrait128<double> {
- typedef v_float64x2 reg;
- typedef v_int32x4 int_reg;
- typedef v_float64x2 u_reg;
- static v_float64x2 zero() { return v_setzero_f64(); }
- static v_float64x2 all(double val) { return v_setall_f64(val); }
-};
+ CV_DEF_REG_TRAITS(v, v_float32x4, float, f32, v_float32x4, v_float64x2, void, v_int32x4, v_int32x4);
+#else
+ CV_DEF_REG_TRAITS(v, v_float32x4, float, f32, v_float32x4, void, void, v_int32x4, v_int32x4);
+#endif
+ CV_DEF_REG_TRAITS(v, v_uint64x2, uint64, u64, v_uint64x2, void, void, v_int64x2, void);
+ CV_DEF_REG_TRAITS(v, v_int64x2, int64, s64, v_uint64x2, void, void, v_int64x2, void);
+#if CV_SIMD128_64F
+ CV_DEF_REG_TRAITS(v, v_float64x2, double, f64, v_float64x2, void, void, v_int64x2, v_int32x4);
+#endif
+#if CV_FP16
+ CV_DEF_REG_TRAITS(v, v_float16x8, short, f16, v_float32x4, void, void, v_int16x8, v_int16x8);
+#endif
+#endif
+
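A sketch of the dispatch these traits enable in generic kernels, assuming the specializations above plus the universal v_expand and operator+ intrinsics; the widened register type is looked up instead of being spelled out per width:

    // widen two 16-bit vectors and add their low halves
    template <typename Vec>
    typename V_RegTraits<Vec>::w_reg widen_add_lo(const Vec& a, const Vec& b)
    {
        typename V_RegTraits<Vec>::w_reg a0, a1, b0, b1;
        v_expand(a, a0, a1);   // e.g. v_int16x8 -> v_int32x4, v_int16x16 -> v_int32x8
        v_expand(b, b0, b1);
        return a0 + b0;
    }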
+#if CV_SIMD256
+ CV_DEF_REG_TRAITS(v256, v_uint8x32, uchar, u8, v_uint8x32, v_uint16x16, v_uint32x8, v_int8x32, void);
+ CV_DEF_REG_TRAITS(v256, v_int8x32, schar, s8, v_uint8x32, v_int16x16, v_int32x8, v_int8x32, void);
+ CV_DEF_REG_TRAITS(v256, v_uint16x16, ushort, u16, v_uint16x16, v_uint32x8, v_uint64x4, v_int16x16, void);
+ CV_DEF_REG_TRAITS(v256, v_int16x16, short, s16, v_uint16x16, v_int32x8, v_int64x4, v_int16x16, void);
+ CV_DEF_REG_TRAITS(v256, v_uint32x8, unsigned, u32, v_uint32x8, v_uint64x4, void, v_int32x8, void);
+ CV_DEF_REG_TRAITS(v256, v_int32x8, int, s32, v_uint32x8, v_int64x4, void, v_int32x8, void);
+ CV_DEF_REG_TRAITS(v256, v_float32x8, float, f32, v_float32x8, v_float64x4, void, v_int32x8, v_int32x8);
+ CV_DEF_REG_TRAITS(v256, v_uint64x4, uint64, u64, v_uint64x4, void, void, v_int64x4, void);
+ CV_DEF_REG_TRAITS(v256, v_int64x4, int64, s64, v_uint64x4, void, void, v_int64x4, void);
+ CV_DEF_REG_TRAITS(v256, v_float64x4, double, f64, v_float64x4, void, void, v_int64x4, v_int32x8);
+#if CV_FP16
+ CV_DEF_REG_TRAITS(v256, v_float16x16, short, f16, v_float32x8, void, void, v_int16x16, void);
+#endif
+#endif
+
+#if CV_SIMD256
+ typedef v_uint8x32 v_uint8;
+ typedef v_int8x32 v_int8;
+ typedef v_uint16x16 v_uint16;
+ typedef v_int16x16 v_int16;
+ typedef v_uint32x8 v_uint32;
+ typedef v_int32x8 v_int32;
+ typedef v_uint64x4 v_uint64;
+ typedef v_int64x4 v_int64;
+ typedef v_float32x8 v_float32;
+ #if CV_SIMD256_64F
+ typedef v_float64x4 v_float64;
+ #endif
+ #if CV_FP16
+ typedef v_float16x16 v_float16;
+ CV_INTRIN_DEFINE_WIDE_INTRIN(short, v_float16, f16, v256, load_f16)
+ #endif
+ CV_INTRIN_DEFINE_WIDE_INTRIN_ALL_TYPES(v256)
+ CV_INTRIN_DEFINE_WIDE_INTRIN(double, v_float64, f64, v256, load)
+ inline void vx_cleanup() { v256_cleanup(); }
+#elif CV_SIMD128
+ typedef v_uint8x16 v_uint8;
+ typedef v_int8x16 v_int8;
+ typedef v_uint16x8 v_uint16;
+ typedef v_int16x8 v_int16;
+ typedef v_uint32x4 v_uint32;
+ typedef v_int32x4 v_int32;
+ typedef v_uint64x2 v_uint64;
+ typedef v_int64x2 v_int64;
+ typedef v_float32x4 v_float32;
+ #if CV_SIMD128_64F
+ typedef v_float64x2 v_float64;
+ #endif
+ #if CV_FP16
+ typedef v_float16x8 v_float16;
+ CV_INTRIN_DEFINE_WIDE_INTRIN(short, v_float16, f16, v, load_f16)
+ #endif
+ CV_INTRIN_DEFINE_WIDE_INTRIN_ALL_TYPES(v)
+ #if CV_SIMD128_64F
+ CV_INTRIN_DEFINE_WIDE_INTRIN(double, v_float64, f64, v, load)
+ #endif
+ inline void vx_cleanup() { v_cleanup(); }
#endif
inline unsigned int trailingZeros32(unsigned int value) {
--- /dev/null
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+
+#ifndef OPENCV_HAL_INTRIN_AVX_HPP
+#define OPENCV_HAL_INTRIN_AVX_HPP
+
+#define CV_SIMD256 1
+#define CV_SIMD256_64F 1
+
+namespace cv
+{
+
+//! @cond IGNORED
+
+CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
+
+///////// Utils ////////////
+
+inline __m256i _v256_combine(const __m128i& lo, const __m128i& hi)
+{ return _mm256_inserti128_si256(_mm256_castsi128_si256(lo), hi, 1); }
+
+inline __m256 _v256_combine(const __m128& lo, const __m128& hi)
+{ return _mm256_insertf128_ps(_mm256_castps128_ps256(lo), hi, 1); }
+
+inline __m256d _v256_combine(const __m128d& lo, const __m128d& hi)
+{ return _mm256_insertf128_pd(_mm256_castpd128_pd256(lo), hi, 1); }
+
+inline int _v_cvtsi256_si32(const __m256i& a)
+{ return _mm_cvtsi128_si32(_mm256_castsi256_si128(a)); }
+
+inline __m256i _v256_shuffle_odd_64(const __m256i& v)
+{ return _mm256_permute4x64_epi64(v, _MM_SHUFFLE(3, 1, 2, 0)); }
+
+inline __m256d _v256_shuffle_odd_64(const __m256d& v)
+{ return _mm256_permute4x64_pd(v, _MM_SHUFFLE(3, 1, 2, 0)); }
+
+template<int imm>
+inline __m256i _v256_permute2x128(const __m256i& a, const __m256i& b)
+{ return _mm256_permute2x128_si256(a, b, imm); }
+
+template<int imm>
+inline __m256 _v256_permute2x128(const __m256& a, const __m256& b)
+{ return _mm256_permute2f128_ps(a, b, imm); }
+
+template<int imm>
+inline __m256d _v256_permute2x128(const __m256d& a, const __m256d& b)
+{ return _mm256_permute2f128_pd(a, b, imm); }
+
+template<int imm, typename _Tpvec>
+inline _Tpvec v256_permute2x128(const _Tpvec& a, const _Tpvec& b)
+{ return _Tpvec(_v256_permute2x128<imm>(a.val, b.val)); }
+
+template<int imm>
+inline __m256i _v256_permute4x64(const __m256i& a)
+{ return _mm256_permute4x64_epi64(a, imm); }
+
+template<int imm>
+inline __m256d _v256_permute4x64(const __m256d& a)
+{ return _mm256_permute4x64_pd(a, imm); }
+
+template<int imm, typename _Tpvec>
+inline _Tpvec v256_permute4x64(const _Tpvec& a)
+{ return _Tpvec(_v256_permute4x64<imm>(a.val)); }
+
+inline __m128i _v256_extract_high(const __m256i& v)
+{ return _mm256_extracti128_si256(v, 1); }
+
+inline __m128 _v256_extract_high(const __m256& v)
+{ return _mm256_extractf128_ps(v, 1); }
+
+inline __m128d _v256_extract_high(const __m256d& v)
+{ return _mm256_extractf128_pd(v, 1); }
+
+inline __m128i _v256_extract_low(const __m256i& v)
+{ return _mm256_castsi256_si128(v); }
+
+inline __m128 _v256_extract_low(const __m256& v)
+{ return _mm256_castps256_ps128(v); }
+
+inline __m128d _v256_extract_low(const __m256d& v)
+{ return _mm256_castpd256_pd128(v); }
+
+///////// Types ////////////
+
+struct v_uint8x32
+{
+ typedef uchar lane_type;
+ enum { nlanes = 32 };
+ __m256i val;
+
+ explicit v_uint8x32(__m256i v) : val(v) {}
+ v_uint8x32(uchar v0, uchar v1, uchar v2, uchar v3,
+ uchar v4, uchar v5, uchar v6, uchar v7,
+ uchar v8, uchar v9, uchar v10, uchar v11,
+ uchar v12, uchar v13, uchar v14, uchar v15,
+ uchar v16, uchar v17, uchar v18, uchar v19,
+ uchar v20, uchar v21, uchar v22, uchar v23,
+ uchar v24, uchar v25, uchar v26, uchar v27,
+ uchar v28, uchar v29, uchar v30, uchar v31)
+ {
+ val = _mm256_setr_epi8((char)v0, (char)v1, (char)v2, (char)v3,
+ (char)v4, (char)v5, (char)v6 , (char)v7, (char)v8, (char)v9,
+ (char)v10, (char)v11, (char)v12, (char)v13, (char)v14, (char)v15,
+ (char)v16, (char)v17, (char)v18, (char)v19, (char)v20, (char)v21,
+ (char)v22, (char)v23, (char)v24, (char)v25, (char)v26, (char)v27,
+ (char)v28, (char)v29, (char)v30, (char)v31);
+ }
+ v_uint8x32() : val(_mm256_setzero_si256()) {}
+ uchar get0() const { return (uchar)_v_cvtsi256_si32(val); }
+};
+
+struct v_int8x32
+{
+ typedef schar lane_type;
+ enum { nlanes = 32 };
+ __m256i val;
+
+ explicit v_int8x32(__m256i v) : val(v) {}
+ v_int8x32(schar v0, schar v1, schar v2, schar v3,
+ schar v4, schar v5, schar v6, schar v7,
+ schar v8, schar v9, schar v10, schar v11,
+ schar v12, schar v13, schar v14, schar v15,
+ schar v16, schar v17, schar v18, schar v19,
+ schar v20, schar v21, schar v22, schar v23,
+ schar v24, schar v25, schar v26, schar v27,
+ schar v28, schar v29, schar v30, schar v31)
+ {
+ val = _mm256_setr_epi8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9,
+ v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20,
+ v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31);
+ }
+ v_int8x32() : val(_mm256_setzero_si256()) {}
+ schar get0() const { return (schar)_v_cvtsi256_si32(val); }
+};
+
+struct v_uint16x16
+{
+ typedef ushort lane_type;
+ enum { nlanes = 16 };
+ __m256i val;
+
+ explicit v_uint16x16(__m256i v) : val(v) {}
+ v_uint16x16(ushort v0, ushort v1, ushort v2, ushort v3,
+ ushort v4, ushort v5, ushort v6, ushort v7,
+ ushort v8, ushort v9, ushort v10, ushort v11,
+ ushort v12, ushort v13, ushort v14, ushort v15)
+ {
+ val = _mm256_setr_epi16((short)v0, (short)v1, (short)v2, (short)v3,
+ (short)v4, (short)v5, (short)v6, (short)v7, (short)v8, (short)v9,
+ (short)v10, (short)v11, (short)v12, (short)v13, (short)v14, (short)v15);
+ }
+ v_uint16x16() : val(_mm256_setzero_si256()) {}
+ ushort get0() const { return (ushort)_v_cvtsi256_si32(val); }
+};
+
+struct v_int16x16
+{
+ typedef short lane_type;
+ enum { nlanes = 16 };
+ __m256i val;
+
+ explicit v_int16x16(__m256i v) : val(v) {}
+ v_int16x16(short v0, short v1, short v2, short v3,
+ short v4, short v5, short v6, short v7,
+ short v8, short v9, short v10, short v11,
+ short v12, short v13, short v14, short v15)
+ {
+ val = _mm256_setr_epi16(v0, v1, v2, v3, v4, v5, v6, v7,
+ v8, v9, v10, v11, v12, v13, v14, v15);
+ }
+ v_int16x16() : val(_mm256_setzero_si256()) {}
+ short get0() const { return (short)_v_cvtsi256_si32(val); }
+};
+
+struct v_uint32x8
+{
+ typedef unsigned lane_type;
+ enum { nlanes = 8 };
+ __m256i val;
+
+ explicit v_uint32x8(__m256i v) : val(v) {}
+ v_uint32x8(unsigned v0, unsigned v1, unsigned v2, unsigned v3,
+ unsigned v4, unsigned v5, unsigned v6, unsigned v7)
+ {
+ val = _mm256_setr_epi32((unsigned)v0, (unsigned)v1, (unsigned)v2,
+ (unsigned)v3, (unsigned)v4, (unsigned)v5, (unsigned)v6, (unsigned)v7);
+ }
+ v_uint32x8() : val(_mm256_setzero_si256()) {}
+ unsigned get0() const { return (unsigned)_v_cvtsi256_si32(val); }
+};
+
+struct v_int32x8
+{
+ typedef int lane_type;
+ enum { nlanes = 8 };
+ __m256i val;
+
+ explicit v_int32x8(__m256i v) : val(v) {}
+ v_int32x8(int v0, int v1, int v2, int v3,
+ int v4, int v5, int v6, int v7)
+ {
+ val = _mm256_setr_epi32(v0, v1, v2, v3, v4, v5, v6, v7);
+ }
+ v_int32x8() : val(_mm256_setzero_si256()) {}
+ int get0() const { return _v_cvtsi256_si32(val); }
+};
+
+struct v_float32x8
+{
+ typedef float lane_type;
+ enum { nlanes = 8 };
+ __m256 val;
+
+ explicit v_float32x8(__m256 v) : val(v) {}
+ v_float32x8(float v0, float v1, float v2, float v3,
+ float v4, float v5, float v6, float v7)
+ {
+ val = _mm256_setr_ps(v0, v1, v2, v3, v4, v5, v6, v7);
+ }
+ v_float32x8() : val(_mm256_setzero_ps()) {}
+ float get0() const { return _mm_cvtss_f32(_mm256_castps256_ps128(val)); }
+};
+
+struct v_uint64x4
+{
+ typedef uint64 lane_type;
+ enum { nlanes = 4 };
+ __m256i val;
+
+ explicit v_uint64x4(__m256i v) : val(v) {}
+ v_uint64x4(uint64 v0, uint64 v1, uint64 v2, uint64 v3)
+ { val = _mm256_setr_epi64x((int64)v0, (int64)v1, (int64)v2, (int64)v3); }
+ v_uint64x4() : val(_mm256_setzero_si256()) {}
+ uint64 get0() const
+ { return (uint64)_mm_cvtsi128_si64(_mm256_castsi256_si128(val)); }
+};
+
+struct v_int64x4
+{
+ typedef int64 lane_type;
+ enum { nlanes = 4 };
+ __m256i val;
+
+ explicit v_int64x4(__m256i v) : val(v) {}
+ v_int64x4(int64 v0, int64 v1, int64 v2, int64 v3)
+ { val = _mm256_setr_epi64x(v0, v1, v2, v3); }
+ v_int64x4() : val(_mm256_setzero_si256()) {}
+ int64 get0() const { return (int64)_mm_cvtsi128_si64(_mm256_castsi256_si128(val)); }
+};
+
+struct v_float64x4
+{
+ typedef double lane_type;
+ enum { nlanes = 4 };
+ __m256d val;
+
+ explicit v_float64x4(__m256d v) : val(v) {}
+ v_float64x4(double v0, double v1, double v2, double v3)
+ { val = _mm256_setr_pd(v0, v1, v2, v3); }
+ v_float64x4() : val(_mm256_setzero_pd()) {}
+ double get0() const { return _mm_cvtsd_f64(_mm256_castpd256_pd128(val)); }
+};
+
+struct v_float16x16
+{
+ typedef short lane_type;
+ enum { nlanes = 16 };
+ __m256i val;
+
+ explicit v_float16x16(__m256i v) : val(v) {}
+ v_float16x16(short v0, short v1, short v2, short v3,
+ short v4, short v5, short v6, short v7,
+ short v8, short v9, short v10, short v11,
+ short v12, short v13, short v14, short v15)
+ {
+ val = _mm256_setr_epi16(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15);
+ }
+ v_float16x16() : val(_mm256_setzero_si256()) {}
+ short get0() const { return (short)_v_cvtsi256_si32(val); }
+};
+inline v_float16x16 v256_setzero_f16() { return v_float16x16(_mm256_setzero_si256()); }
+inline v_float16x16 v256_setall_f16(short val) { return v_float16x16(_mm256_set1_epi16(val)); }
+
+//////////////// Load and store operations ///////////////
+
+#define OPENCV_HAL_IMPL_AVX_LOADSTORE(_Tpvec, _Tp) \
+ inline _Tpvec v256_load(const _Tp* ptr) \
+ { return _Tpvec(_mm256_loadu_si256((const __m256i*)ptr)); } \
+ inline _Tpvec v256_load_aligned(const _Tp* ptr) \
+ { return _Tpvec(_mm256_load_si256((const __m256i*)ptr)); } \
+ inline _Tpvec v256_load_low(const _Tp* ptr) \
+ { \
+ __m128i v128 = _mm_loadu_si128((const __m128i*)ptr); \
+ return _Tpvec(_mm256_castsi128_si256(v128)); \
+ } \
+ inline _Tpvec v256_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
+ { \
+ __m128i vlo = _mm_loadu_si128((const __m128i*)ptr0); \
+ __m128i vhi = _mm_loadu_si128((const __m128i*)ptr1); \
+ return _Tpvec(_v256_combine(vlo, vhi)); \
+ } \
+ inline void v_store(_Tp* ptr, const _Tpvec& a) \
+ { _mm256_storeu_si256((__m256i*)ptr, a.val); } \
+ inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
+ { _mm256_store_si256((__m256i*)ptr, a.val); } \
+ inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
+ { _mm_storeu_si128((__m128i*)ptr, _v256_extract_low(a.val)); } \
+ inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
+ { _mm_storeu_si128((__m128i*)ptr, _v256_extract_high(a.val)); }
+
+OPENCV_HAL_IMPL_AVX_LOADSTORE(v_uint8x32, uchar)
+OPENCV_HAL_IMPL_AVX_LOADSTORE(v_int8x32, schar)
+OPENCV_HAL_IMPL_AVX_LOADSTORE(v_uint16x16, ushort)
+OPENCV_HAL_IMPL_AVX_LOADSTORE(v_int16x16, short)
+OPENCV_HAL_IMPL_AVX_LOADSTORE(v_uint32x8, unsigned)
+OPENCV_HAL_IMPL_AVX_LOADSTORE(v_int32x8, int)
+OPENCV_HAL_IMPL_AVX_LOADSTORE(v_uint64x4, uint64)
+OPENCV_HAL_IMPL_AVX_LOADSTORE(v_int64x4, int64)
+
+#define OPENCV_HAL_IMPL_AVX_LOADSTORE_FLT(_Tpvec, _Tp, suffix, halfreg) \
+ inline _Tpvec v256_load(const _Tp* ptr) \
+ { return _Tpvec(_mm256_loadu_##suffix(ptr)); } \
+ inline _Tpvec v256_load_aligned(const _Tp* ptr) \
+ { return _Tpvec(_mm256_load_##suffix(ptr)); } \
+ inline _Tpvec v256_load_low(const _Tp* ptr) \
+ { \
+ return _Tpvec(_mm256_cast##suffix##128_##suffix##256 \
+ (_mm_loadu_##suffix(ptr))); \
+ } \
+ inline _Tpvec v256_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
+ { \
+ halfreg vlo = _mm_loadu_##suffix(ptr0); \
+ halfreg vhi = _mm_loadu_##suffix(ptr1); \
+ return _Tpvec(_v256_combine(vlo, vhi)); \
+ } \
+ inline void v_store(_Tp* ptr, const _Tpvec& a) \
+ { _mm256_storeu_##suffix(ptr, a.val); } \
+ inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
+ { _mm256_store_##suffix(ptr, a.val); } \
+ inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
+ { _mm_storeu_##suffix(ptr, _v256_extract_low(a.val)); } \
+ inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
+ { _mm_storeu_##suffix(ptr, _v256_extract_high(a.val)); }
+
+OPENCV_HAL_IMPL_AVX_LOADSTORE_FLT(v_float32x8, float, ps, __m128)
+OPENCV_HAL_IMPL_AVX_LOADSTORE_FLT(v_float64x4, double, pd, __m128d)
+
+#define OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, _Tpvecf, suffix, cast) \
+ inline _Tpvec v_reinterpret_as_##suffix(const _Tpvecf& a) \
+ { return _Tpvec(cast(a.val)); }
+
+#define OPENCV_HAL_IMPL_AVX_INIT(_Tpvec, _Tp, suffix, ssuffix, ctype_s) \
+ inline _Tpvec v256_setzero_##suffix() \
+ { return _Tpvec(_mm256_setzero_si256()); } \
+ inline _Tpvec v256_setall_##suffix(_Tp v) \
+ { return _Tpvec(_mm256_set1_##ssuffix((ctype_s)v)); } \
+ OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint8x32, suffix, OPENCV_HAL_NOP) \
+ OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int8x32, suffix, OPENCV_HAL_NOP) \
+ OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint16x16, suffix, OPENCV_HAL_NOP) \
+ OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int16x16, suffix, OPENCV_HAL_NOP) \
+ OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint32x8, suffix, OPENCV_HAL_NOP) \
+ OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int32x8, suffix, OPENCV_HAL_NOP) \
+ OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint64x4, suffix, OPENCV_HAL_NOP) \
+ OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int64x4, suffix, OPENCV_HAL_NOP) \
+ OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_float32x8, suffix, _mm256_castps_si256) \
+ OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_float64x4, suffix, _mm256_castpd_si256)
+
+OPENCV_HAL_IMPL_AVX_INIT(v_uint8x32, uchar, u8, epi8, char)
+OPENCV_HAL_IMPL_AVX_INIT(v_int8x32, schar, s8, epi8, char)
+OPENCV_HAL_IMPL_AVX_INIT(v_uint16x16, ushort, u16, epi16, short)
+OPENCV_HAL_IMPL_AVX_INIT(v_int16x16, short, s16, epi16, short)
+OPENCV_HAL_IMPL_AVX_INIT(v_uint32x8, unsigned, u32, epi32, int)
+OPENCV_HAL_IMPL_AVX_INIT(v_int32x8, int, s32, epi32, int)
+OPENCV_HAL_IMPL_AVX_INIT(v_uint64x4, uint64, u64, epi64x, int64)
+OPENCV_HAL_IMPL_AVX_INIT(v_int64x4, int64, s64, epi64x, int64)
+
+#define OPENCV_HAL_IMPL_AVX_INIT_FLT(_Tpvec, _Tp, suffix, zsuffix, cast) \
+ inline _Tpvec v256_setzero_##suffix() \
+ { return _Tpvec(_mm256_setzero_##zsuffix()); } \
+ inline _Tpvec v256_setall_##suffix(_Tp v) \
+ { return _Tpvec(_mm256_set1_##zsuffix(v)); } \
+ OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint8x32, suffix, cast) \
+ OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int8x32, suffix, cast) \
+ OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint16x16, suffix, cast) \
+ OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int16x16, suffix, cast) \
+ OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint32x8, suffix, cast) \
+ OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int32x8, suffix, cast) \
+ OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint64x4, suffix, cast) \
+ OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int64x4, suffix, cast)
+
+OPENCV_HAL_IMPL_AVX_INIT_FLT(v_float32x8, float, f32, ps, _mm256_castsi256_ps)
+OPENCV_HAL_IMPL_AVX_INIT_FLT(v_float64x4, double, f64, pd, _mm256_castsi256_pd)
+
+inline v_float32x8 v_reinterpret_as_f32(const v_float32x8& a)
+{ return a; }
+inline v_float32x8 v_reinterpret_as_f32(const v_float64x4& a)
+{ return v_float32x8(_mm256_castpd_ps(a.val)); }
+
+inline v_float64x4 v_reinterpret_as_f64(const v_float64x4& a)
+{ return a; }
+inline v_float64x4 v_reinterpret_as_f64(const v_float32x8& a)
+{ return v_float64x4(_mm256_castps_pd(a.val)); }
+
+inline v_float16x16 v256_load_f16(const short* ptr)
+{ return v_float16x16(_mm256_loadu_si256((const __m256i*)ptr)); }
+inline v_float16x16 v256_load_f16_aligned(const short* ptr)
+{ return v_float16x16(_mm256_load_si256((const __m256i*)ptr)); }
+
+inline void v_store(short* ptr, const v_float16x16& a)
+{ _mm256_storeu_si256((__m256i*)ptr, a.val); }
+inline void v_store_aligned(short* ptr, const v_float16x16& a)
+{ _mm256_store_si256((__m256i*)ptr, a.val); }
+
+/* Recombine */
+/*#define OPENCV_HAL_IMPL_AVX_COMBINE(_Tpvec, perm) \
+ inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \
+ { return _Tpvec(perm(a.val, b.val, 0x20)); } \
+ inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) \
+ { return _Tpvec(perm(a.val, b.val, 0x31)); } \
+ inline void v_recombine(const _Tpvec& a, const _Tpvec& b, \
+ _Tpvec& c, _Tpvec& d) \
+ { c = v_combine_low(a, b); d = v_combine_high(a, b); }
+
+#define OPENCV_HAL_IMPL_AVX_UNPACKS(_Tpvec, suffix) \
+ OPENCV_HAL_IMPL_AVX_COMBINE(_Tpvec, _mm256_permute2x128_si256) \
+ inline void v_zip(const _Tpvec& a0, const _Tpvec& a1, \
+ _Tpvec& b0, _Tpvec& b1) \
+ { \
+ __m256i v0 = _v256_shuffle_odd_64(a0.val); \
+ __m256i v1 = _v256_shuffle_odd_64(a1.val); \
+ b0.val = _mm256_unpacklo_##suffix(v0, v1); \
+ b1.val = _mm256_unpackhi_##suffix(v0, v1); \
+ }
+
+OPENCV_HAL_IMPL_AVX_UNPACKS(v_uint8x32, epi8)
+OPENCV_HAL_IMPL_AVX_UNPACKS(v_int8x32, epi8)
+OPENCV_HAL_IMPL_AVX_UNPACKS(v_uint16x16, epi16)
+OPENCV_HAL_IMPL_AVX_UNPACKS(v_int16x16, epi16)
+OPENCV_HAL_IMPL_AVX_UNPACKS(v_uint32x8, epi32)
+OPENCV_HAL_IMPL_AVX_UNPACKS(v_int32x8, epi32)
+OPENCV_HAL_IMPL_AVX_UNPACKS(v_uint64x4, epi64)
+OPENCV_HAL_IMPL_AVX_UNPACKS(v_int64x4, epi64)
+OPENCV_HAL_IMPL_AVX_COMBINE(v_float32x8, _mm256_permute2f128_ps)
+OPENCV_HAL_IMPL_AVX_COMBINE(v_float64x4, _mm256_permute2f128_pd)
+
+inline void v_zip(const v_float32x8& a0, const v_float32x8& a1, v_float32x8& b0, v_float32x8& b1)
+{
+ __m256 v0 = _mm256_unpacklo_ps(a0.val, a1.val);
+ __m256 v1 = _mm256_unpackhi_ps(a0.val, a1.val);
+ v_recombine(v_float32x8(v0), v_float32x8(v1), b0, b1);
+}
+
+inline void v_zip(const v_float64x4& a0, const v_float64x4& a1, v_float64x4& b0, v_float64x4& b1)
+{
+ __m256d v0 = _v_shuffle_odd_64(a0.val);
+ __m256d v1 = _v_shuffle_odd_64(a1.val);
+ b0.val = _mm256_unpacklo_pd(v0, v1);
+ b1.val = _mm256_unpackhi_pd(v0, v1);
+}*/
+
+//////////////// Variant Value reordering ///////////////
+
+// unpacks
+#define OPENCV_HAL_IMPL_AVX_UNPACK(_Tpvec, suffix) \
+ inline _Tpvec v256_unpacklo(const _Tpvec& a, const _Tpvec& b) \
+ { return _Tpvec(_mm256_unpacklo_##suffix(a.val, b.val)); } \
+ inline _Tpvec v256_unpackhi(const _Tpvec& a, const _Tpvec& b) \
+ { return _Tpvec(_mm256_unpackhi_##suffix(a.val, b.val)); }
+
+OPENCV_HAL_IMPL_AVX_UNPACK(v_uint8x32, epi8)
+OPENCV_HAL_IMPL_AVX_UNPACK(v_int8x32, epi8)
+OPENCV_HAL_IMPL_AVX_UNPACK(v_uint16x16, epi16)
+OPENCV_HAL_IMPL_AVX_UNPACK(v_int16x16, epi16)
+OPENCV_HAL_IMPL_AVX_UNPACK(v_uint32x8, epi32)
+OPENCV_HAL_IMPL_AVX_UNPACK(v_int32x8, epi32)
+OPENCV_HAL_IMPL_AVX_UNPACK(v_uint64x4, epi64)
+OPENCV_HAL_IMPL_AVX_UNPACK(v_int64x4, epi64)
+OPENCV_HAL_IMPL_AVX_UNPACK(v_float32x8, ps)
+OPENCV_HAL_IMPL_AVX_UNPACK(v_float64x4, pd)
+
+// blend
+#define OPENCV_HAL_IMPL_AVX_BLEND(_Tpvec, suffix) \
+ template<int m> \
+ inline _Tpvec v256_blend(const _Tpvec& a, const _Tpvec& b) \
+ { return _Tpvec(_mm256_blend_##suffix(a.val, b.val, m)); }
+
+OPENCV_HAL_IMPL_AVX_BLEND(v_uint16x16, epi16)
+OPENCV_HAL_IMPL_AVX_BLEND(v_int16x16, epi16)
+OPENCV_HAL_IMPL_AVX_BLEND(v_uint32x8, epi32)
+OPENCV_HAL_IMPL_AVX_BLEND(v_int32x8, epi32)
+OPENCV_HAL_IMPL_AVX_BLEND(v_float32x8, ps)
+OPENCV_HAL_IMPL_AVX_BLEND(v_float64x4, pd)
+
+template<int m>
+inline v_uint64x4 v256_blend(const v_uint64x4& a, const v_uint64x4& b)
+{
+ enum {M0 = m};
+ enum {M1 = (M0 | (M0 << 2)) & 0x33};
+ enum {M2 = (M1 | (M1 << 1)) & 0x55};
+ enum {MM = M2 | (M2 << 1)};
+ return v_uint64x4(_mm256_blend_epi32(a.val, b.val, MM));
+}
+template<int m>
+inline v_int64x4 v256_blend(const v_int64x4& a, const v_int64x4& b)
+{ return v_int64x4(v256_blend<m>(v_uint64x4(a.val), v_uint64x4(b.val)).val); }
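AVX2 has no 64-bit blend-by-immediate, so the 4-bit lane mask m is spread into the 8-bit mask _mm256_blend_epi32 expects (each 64-bit lane covers a pair of 32-bit lanes); bit i of m ends up at positions 2i and 2i+1. Worked through for m = 0b1010, i.e. take b in lanes 1 and 3:

    M0 = 0b00001010
    M1 = (M0 | M0 << 2) & 0x33  =  0b00100010
    M2 = (M1 | M1 << 1) & 0x55  =  0b01000100   // bit i now sits at position 2i
    MM =  M2 | M2 << 1          =  0b11001100   // each bit duplicated into its pair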
+
+// shuffle
+// TODO: emulate 64-bit
+#define OPENCV_HAL_IMPL_AVX_SHUFFLE(_Tpvec, intrin) \
+ template<int m> \
+ inline _Tpvec v256_shuffle(const _Tpvec& a) \
+ { return _Tpvec(_mm256_##intrin(a.val, m)); }
+
+OPENCV_HAL_IMPL_AVX_SHUFFLE(v_uint32x8, shuffle_epi32)
+OPENCV_HAL_IMPL_AVX_SHUFFLE(v_int32x8, shuffle_epi32)
+OPENCV_HAL_IMPL_AVX_SHUFFLE(v_float32x8, permute_ps)
+OPENCV_HAL_IMPL_AVX_SHUFFLE(v_float64x4, permute_pd)
+
+template<typename _Tpvec>
+inline void v256_zip(const _Tpvec& a, const _Tpvec& b, _Tpvec& ab0, _Tpvec& ab1)
+{
+ ab0 = v256_unpacklo(a, b);
+ ab1 = v256_unpackhi(a, b);
+}
+
+template<typename _Tpvec>
+inline _Tpvec v256_combine_diagonal(const _Tpvec& a, const _Tpvec& b)
+{ return _Tpvec(_mm256_blend_epi32(a.val, b.val, 0b11110000)); }
+
+inline v_float32x8 v256_combine_diagonal(const v_float32x8& a, const v_float32x8& b)
+{ return v256_blend<0b11110000>(a, b); }
+
+inline v_float64x4 v256_combine_diagonal(const v_float64x4& a, const v_float64x4& b)
+{ return v256_blend<0b1100>(a, b); }
+
+template<typename _Tpvec>
+inline _Tpvec v256_alignr_128(const _Tpvec& a, const _Tpvec& b)
+{ return v256_permute2x128<0x21>(a, b); }
+
+template<typename _Tpvec>
+inline _Tpvec v256_alignr_64(const _Tpvec& a, const _Tpvec& b)
+{ return _Tpvec(_mm256_alignr_epi8(a.val, b.val, 8)); }
+inline v_float64x4 v256_alignr_64(const v_float64x4& a, const v_float64x4& b)
+{ return v_float64x4(_mm256_shuffle_pd(b.val, a.val, _MM_SHUFFLE(0, 0, 1, 1))); }
+// todo: emulate float32
+
+template<typename _Tpvec>
+inline _Tpvec v256_swap_halves(const _Tpvec& a)
+{ return v256_permute2x128<1>(a, a); }
+
+template<typename _Tpvec>
+inline _Tpvec v256_reverse_64(const _Tpvec& a)
+{ return v256_permute4x64<_MM_SHUFFLE(0, 1, 2, 3)>(a); }
+
+// ZIP
+#define OPENCV_HAL_IMPL_AVX_ZIP(_Tpvec) \
+ inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \
+ { return v256_permute2x128<0x20>(a, b); } \
+ inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) \
+ { return v256_permute2x128<0x31>(a, b); } \
+ inline void v_recombine(const _Tpvec& a, const _Tpvec& b, \
+ _Tpvec& c, _Tpvec& d) \
+ { \
+ _Tpvec a1b0 = v256_alignr_128(a, b); \
+ c = v256_combine_diagonal(a, a1b0); \
+ d = v256_combine_diagonal(a1b0, b); \
+ } \
+ inline void v_zip(const _Tpvec& a, const _Tpvec& b, \
+ _Tpvec& ab0, _Tpvec& ab1) \
+ { \
+ _Tpvec ab0ab2, ab1ab3; \
+ v256_zip(a, b, ab0ab2, ab1ab3); \
+ v_recombine(ab0ab2, ab1ab3, ab0, ab1); \
+ }
+
+OPENCV_HAL_IMPL_AVX_ZIP(v_uint8x32)
+OPENCV_HAL_IMPL_AVX_ZIP(v_int8x32)
+OPENCV_HAL_IMPL_AVX_ZIP(v_uint16x16)
+OPENCV_HAL_IMPL_AVX_ZIP(v_int16x16)
+OPENCV_HAL_IMPL_AVX_ZIP(v_uint32x8)
+OPENCV_HAL_IMPL_AVX_ZIP(v_int32x8)
+OPENCV_HAL_IMPL_AVX_ZIP(v_uint64x4)
+OPENCV_HAL_IMPL_AVX_ZIP(v_int64x4)
+OPENCV_HAL_IMPL_AVX_ZIP(v_float32x8)
+OPENCV_HAL_IMPL_AVX_ZIP(v_float64x4)
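The AVX unpack instructions interleave within each 128-bit half, which is why v_zip needs the recombine fix-up above. Tracing 32-bit lanes for a = [a0..a7], b = [b0..b7]:

    v256_unpacklo(a, b)  =  [a0 b0 a1 b1 | a4 b4 a5 b5]    (ab0ab2)
    v256_unpackhi(a, b)  =  [a2 b2 a3 b3 | a6 b6 a7 b7]    (ab1ab3)
    v_recombine(...)     ->  ab0 = [a0 b0 a1 b1 a2 b2 a3 b3]
                             ab1 = [a4 b4 a5 b5 a6 b6 a7 b7]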
+
+////////// Arithmetic, bitwise and comparison operations /////////
+
+/* Element-wise binary and unary operations */
+
+/** Arithmetic **/
+#define OPENCV_HAL_IMPL_AVX_BIN_OP(bin_op, _Tpvec, intrin) \
+ inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
+ { return _Tpvec(intrin(a.val, b.val)); } \
+ inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
+ { a.val = intrin(a.val, b.val); return a; }
+
+OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint8x32, _mm256_adds_epu8)
+OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint8x32, _mm256_subs_epu8)
+OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int8x32, _mm256_adds_epi8)
+OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int8x32, _mm256_subs_epi8)
+OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint16x16, _mm256_adds_epu16)
+OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint16x16, _mm256_subs_epu16)
+OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_uint16x16, _mm256_mullo_epi16)
+OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int16x16, _mm256_adds_epi16)
+OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int16x16, _mm256_subs_epi16)
+OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_int16x16, _mm256_mullo_epi16)
+OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint32x8, _mm256_add_epi32)
+OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint32x8, _mm256_sub_epi32)
+OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_uint32x8, _mm256_mullo_epi32)
+OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int32x8, _mm256_add_epi32)
+OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int32x8, _mm256_sub_epi32)
+OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_int32x8, _mm256_mullo_epi32)
+OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint64x4, _mm256_add_epi64)
+OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint64x4, _mm256_sub_epi64)
+OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int64x4, _mm256_add_epi64)
+OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int64x4, _mm256_sub_epi64)
+
+OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_float32x8, _mm256_add_ps)
+OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_float32x8, _mm256_sub_ps)
+OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_float32x8, _mm256_mul_ps)
+OPENCV_HAL_IMPL_AVX_BIN_OP(/, v_float32x8, _mm256_div_ps)
+OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_float64x4, _mm256_add_pd)
+OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_float64x4, _mm256_sub_pd)
+OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_float64x4, _mm256_mul_pd)
+OPENCV_HAL_IMPL_AVX_BIN_OP(/, v_float64x4, _mm256_div_pd)
+
+inline void v_mul_expand(const v_int16x16& a, const v_int16x16& b,
+ v_int32x8& c, v_int32x8& d)
+{
+ v_int16x16 vhi = v_int16x16(_mm256_mulhi_epi16(a.val, b.val));
+
+ v_int16x16 v0, v1;
+ v_zip(a * b, vhi, v0, v1);
+
+ c = v_reinterpret_as_s32(v0);
+ d = v_reinterpret_as_s32(v1);
+}
+
+inline void v_mul_expand(const v_uint16x16& a, const v_uint16x16& b,
+ v_uint32x8& c, v_uint32x8& d)
+{
+ v_uint16x16 vhi = v_uint16x16(_mm256_mulhi_epu16(a.val, b.val));
+
+ v_uint16x16 v0, v1;
+ v_zip(a * b, vhi, v0, v1);
+
+ c = v_reinterpret_as_u32(v0);
+ d = v_reinterpret_as_u32(v1);
+}
+
+inline void v_mul_expand(const v_uint32x8& a, const v_uint32x8& b,
+ v_uint64x4& c, v_uint64x4& d)
+{
+ __m256i v0 = _mm256_mul_epu32(a.val, b.val);
+ __m256i v1 = _mm256_mul_epu32(_mm256_srli_epi64(a.val, 32), _mm256_srli_epi64(b.val, 32));
+ v_zip(v_uint64x4(v0), v_uint64x4(v1), c, d);
+}
+
+
+/** Non-saturating arithmetic **/
+#define OPENCV_HAL_IMPL_AVX_BIN_FUNC(func, _Tpvec, intrin) \
+ inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
+ { return _Tpvec(intrin(a.val, b.val)); }
+
+OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_uint8x32, _mm256_add_epi8)
+OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_int8x32, _mm256_add_epi8)
+OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_uint16x16, _mm256_add_epi16)
+OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_int16x16, _mm256_add_epi16)
+OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_uint8x32, _mm256_sub_epi8)
+OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_int8x32, _mm256_sub_epi8)
+OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_uint16x16, _mm256_sub_epi16)
+OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_int16x16, _mm256_sub_epi16)
+
+/** Bitwise shifts **/
+#define OPENCV_HAL_IMPL_AVX_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, srai) \
+ inline _Tpuvec operator << (const _Tpuvec& a, int imm) \
+ { return _Tpuvec(_mm256_slli_##suffix(a.val, imm)); } \
+ inline _Tpsvec operator << (const _Tpsvec& a, int imm) \
+ { return _Tpsvec(_mm256_slli_##suffix(a.val, imm)); } \
+ inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \
+ { return _Tpuvec(_mm256_srli_##suffix(a.val, imm)); } \
+ inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \
+ { return _Tpsvec(srai(a.val, imm)); } \
+ template<int imm> \
+ inline _Tpuvec v_shl(const _Tpuvec& a) \
+ { return _Tpuvec(_mm256_slli_##suffix(a.val, imm)); } \
+ template<int imm> \
+ inline _Tpsvec v_shl(const _Tpsvec& a) \
+ { return _Tpsvec(_mm256_slli_##suffix(a.val, imm)); } \
+ template<int imm> \
+ inline _Tpuvec v_shr(const _Tpuvec& a) \
+ { return _Tpuvec(_mm256_srli_##suffix(a.val, imm)); } \
+ template<int imm> \
+ inline _Tpsvec v_shr(const _Tpsvec& a) \
+ { return _Tpsvec(srai(a.val, imm)); }
+
+OPENCV_HAL_IMPL_AVX_SHIFT_OP(v_uint16x16, v_int16x16, epi16, _mm256_srai_epi16)
+OPENCV_HAL_IMPL_AVX_SHIFT_OP(v_uint32x8, v_int32x8, epi32, _mm256_srai_epi32)
+
+inline __m256i _mm256_srai_epi64xx(const __m256i a, int imm)
+{
+ __m256i d = _mm256_set1_epi64x((int64)1 << 63);
+ __m256i r = _mm256_srli_epi64(_mm256_add_epi64(a, d), imm);
+ return _mm256_sub_epi64(r, _mm256_srli_epi64(d, imm));
+}
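AVX2 provides no 64-bit arithmetic right shift, so the helper above synthesizes one from logical shifts via the usual sign-bias identity: with d = 2^63,

    sra(a, imm)  ==  srl(a + d, imm) - srl(d, imm)    (mod 2^64)

Adding d flips the sign bit so the logical shift treats the value as biased-unsigned, and subtracting d >> imm restores the sign extension of the shifted-in bits.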
+OPENCV_HAL_IMPL_AVX_SHIFT_OP(v_uint64x4, v_int64x4, epi64, _mm256_srai_epi64xx)
+
+
+/** Bitwise logic **/
+#define OPENCV_HAL_IMPL_AVX_LOGIC_OP(_Tpvec, suffix, not_const) \
+ OPENCV_HAL_IMPL_AVX_BIN_OP(&, _Tpvec, _mm256_and_##suffix) \
+ OPENCV_HAL_IMPL_AVX_BIN_OP(|, _Tpvec, _mm256_or_##suffix) \
+ OPENCV_HAL_IMPL_AVX_BIN_OP(^, _Tpvec, _mm256_xor_##suffix) \
+ inline _Tpvec operator ~ (const _Tpvec& a) \
+ { return _Tpvec(_mm256_xor_##suffix(a.val, not_const)); }
+
+OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_uint8x32, si256, _mm256_set1_epi32(-1))
+OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_int8x32, si256, _mm256_set1_epi32(-1))
+OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_uint16x16, si256, _mm256_set1_epi32(-1))
+OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_int16x16, si256, _mm256_set1_epi32(-1))
+OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_uint32x8, si256, _mm256_set1_epi32(-1))
+OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_int32x8, si256, _mm256_set1_epi32(-1))
+OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_uint64x4, si256, _mm256_set1_epi64x(-1))
+OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_int64x4, si256, _mm256_set1_epi64x(-1))
+OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_float32x8, ps, _mm256_castsi256_ps(_mm256_set1_epi32(-1)))
+OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_float64x4, pd, _mm256_castsi256_pd(_mm256_set1_epi32(-1)))
+
+/** Select **/
+#define OPENCV_HAL_IMPL_AVX_SELECT(_Tpvec, suffix) \
+ inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
+ { return _Tpvec(_mm256_blendv_##suffix(b.val, a.val, mask.val)); }
+
+OPENCV_HAL_IMPL_AVX_SELECT(v_uint8x32, epi8)
+OPENCV_HAL_IMPL_AVX_SELECT(v_int8x32, epi8)
+OPENCV_HAL_IMPL_AVX_SELECT(v_uint16x16, epi8)
+OPENCV_HAL_IMPL_AVX_SELECT(v_int16x16, epi8)
+OPENCV_HAL_IMPL_AVX_SELECT(v_uint32x8, epi8)
+OPENCV_HAL_IMPL_AVX_SELECT(v_int32x8, epi8)
+OPENCV_HAL_IMPL_AVX_SELECT(v_float32x8, ps)
+OPENCV_HAL_IMPL_AVX_SELECT(v_float64x4, pd)
+
+/** Comparison **/
+#define OPENCV_HAL_IMPL_AVX_CMP_OP_OV(_Tpvec) \
+ inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
+ { return ~(a == b); } \
+ inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
+ { return b > a; } \
+ inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
+ { return ~(a < b); } \
+ inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
+ { return b >= a; }
+
+#define OPENCV_HAL_IMPL_AVX_CMP_OP_INT(_Tpuvec, _Tpsvec, suffix, sbit) \
+ inline _Tpuvec operator == (const _Tpuvec& a, const _Tpuvec& b) \
+ { return _Tpuvec(_mm256_cmpeq_##suffix(a.val, b.val)); } \
+ inline _Tpuvec operator > (const _Tpuvec& a, const _Tpuvec& b) \
+ { \
+ __m256i smask = _mm256_set1_##suffix(sbit); \
+ return _Tpuvec(_mm256_cmpgt_##suffix( \
+ _mm256_xor_si256(a.val, smask), \
+ _mm256_xor_si256(b.val, smask))); \
+ } \
+ inline _Tpsvec operator == (const _Tpsvec& a, const _Tpsvec& b) \
+ { return _Tpsvec(_mm256_cmpeq_##suffix(a.val, b.val)); } \
+ inline _Tpsvec operator > (const _Tpsvec& a, const _Tpsvec& b) \
+ { return _Tpsvec(_mm256_cmpgt_##suffix(a.val, b.val)); } \
+ OPENCV_HAL_IMPL_AVX_CMP_OP_OV(_Tpuvec) \
+ OPENCV_HAL_IMPL_AVX_CMP_OP_OV(_Tpsvec)
+
+OPENCV_HAL_IMPL_AVX_CMP_OP_INT(v_uint8x32, v_int8x32, epi8, (char)-128)
+OPENCV_HAL_IMPL_AVX_CMP_OP_INT(v_uint16x16, v_int16x16, epi16, (short)-32768)
+OPENCV_HAL_IMPL_AVX_CMP_OP_INT(v_uint32x8, v_int32x8, epi32, (int)0x80000000)
+
+#define OPENCV_HAL_IMPL_AVX_CMP_OP_64BIT(_Tpvec) \
+ inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
+ { return _Tpvec(_mm256_cmpeq_epi64(a.val, b.val)); } \
+ inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
+ { return ~(a == b); }
+
+OPENCV_HAL_IMPL_AVX_CMP_OP_64BIT(v_uint64x4)
+OPENCV_HAL_IMPL_AVX_CMP_OP_64BIT(v_int64x4)
+
+#define OPENCV_HAL_IMPL_AVX_CMP_FLT(bin_op, imm8, _Tpvec, suffix) \
+ inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
+ { return _Tpvec(_mm256_cmp_##suffix(a.val, b.val, imm8)); }
+
+#define OPENCV_HAL_IMPL_AVX_CMP_OP_FLT(_Tpvec, suffix) \
+ OPENCV_HAL_IMPL_AVX_CMP_FLT(==, _CMP_EQ_OQ, _Tpvec, suffix) \
+ OPENCV_HAL_IMPL_AVX_CMP_FLT(!=, _CMP_NEQ_OQ, _Tpvec, suffix) \
+ OPENCV_HAL_IMPL_AVX_CMP_FLT(<, _CMP_LT_OQ, _Tpvec, suffix) \
+ OPENCV_HAL_IMPL_AVX_CMP_FLT(>, _CMP_GT_OQ, _Tpvec, suffix) \
+ OPENCV_HAL_IMPL_AVX_CMP_FLT(<=, _CMP_LE_OQ, _Tpvec, suffix) \
+ OPENCV_HAL_IMPL_AVX_CMP_FLT(>=, _CMP_GE_OQ, _Tpvec, suffix)
+
+OPENCV_HAL_IMPL_AVX_CMP_OP_FLT(v_float32x8, ps)
+OPENCV_HAL_IMPL_AVX_CMP_OP_FLT(v_float64x4, pd)
+
+/** min/max **/
+OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_uint8x32, _mm256_min_epu8)
+OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_uint8x32, _mm256_max_epu8)
+OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_int8x32, _mm256_min_epi8)
+OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_int8x32, _mm256_max_epi8)
+OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_uint16x16, _mm256_min_epu16)
+OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_uint16x16, _mm256_max_epu16)
+OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_int16x16, _mm256_min_epi16)
+OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_int16x16, _mm256_max_epi16)
+OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_uint32x8, _mm256_min_epu32)
+OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_uint32x8, _mm256_max_epu32)
+OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_int32x8, _mm256_min_epi32)
+OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_int32x8, _mm256_max_epi32)
+OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_float32x8, _mm256_min_ps)
+OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_float32x8, _mm256_max_ps)
+OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_float64x4, _mm256_min_pd)
+OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_float64x4, _mm256_max_pd)
+
+/** Rotate **/
+template<int imm>
+inline v_uint8x32 v_rotate_left(const v_uint8x32& a, const v_uint8x32& b)
+{
+ __m256i swap = _mm256_permute2x128_si256(a.val, b.val, 0x03);
+
+ switch(imm)
+ {
+ case 0: return a;
+ case 32: return b;
+ case 16: return v_uint8x32(swap);
+ }
+
+ if (imm < 16) return v_uint8x32(_mm256_alignr_epi8(a.val, swap, 16 - imm));
+ if (imm < 32) return v_uint8x32(_mm256_alignr_epi8(swap, b.val, 32 - imm));
+
+ return v_uint8x32();
+}
+
+template<int imm>
+inline v_uint8x32 v_rotate_right(const v_uint8x32& a, const v_uint8x32& b)
+{
+ __m256i swap = _mm256_permute2x128_si256(a.val, b.val, 0x21);
+
+ switch(imm)
+ {
+ case 0: return a;
+ case 32: return b;
+ case 16: return v_uint8x32(swap);
+ }
+
+ if (imm < 16) return v_uint8x32(_mm256_alignr_epi8(swap, a.val, imm));
+ if (imm < 32) return v_uint8x32(_mm256_alignr_epi8(b.val, swap, imm - 16));
+
+ return v_uint8x32();
+}
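+// note: the two-vector rotates above treat the pair a:b as one continuous
+// 64-byte sequence. _mm256_alignr_epi8 only shifts within each 128-bit lane,
+// so the cross-lane `swap` value built with _mm256_permute2x128_si256
+// supplies the bytes that move between lanes.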
+
+template<int imm>
+inline v_uint8x32 v_rotate_left(const v_uint8x32& a)
+{
+ v_uint8x32 res;
+ // Intel pseudocode ("ESAC"): when control[3] is set, the [127:0] half of the result is zeroed

+ __m256i swapz = _mm256_permute2x128_si256(a.val, a.val, _MM_SHUFFLE(0, 0, 2, 0));
+
+ if (imm == 0)
+ return a;
+ if (imm == 16)
+ res.val = swapz;
+ else if (imm < 16)
+ res.val = _mm256_alignr_epi8(a.val, swapz, 16 - imm);
+ else if (imm < 32)
+ res.val = _mm256_slli_si256(swapz, imm - 16);
+ else
+ return v_uint8x32();
+ return res;
+}
+
+template<int imm>
+inline v_uint8x32 v_rotate_right(const v_uint8x32& a)
+{
+ v_uint8x32 res;
+ // Intel pseudocode ("ESAC"): when control[3] is set, the [127:0] half of the result is zeroed
+ __m256i swapz = _mm256_permute2x128_si256(a.val, a.val, _MM_SHUFFLE(2, 0, 0, 1));
+
+ if (imm == 0)
+ return a;
+ if (imm == 16)
+ res.val = swapz;
+ else if (imm < 16)
+ res.val = _mm256_alignr_epi8(swapz, a.val, imm);
+ else if (imm < 32)
+ res.val = _mm256_srli_si256(swapz, imm - 16);
+ else
+ return v_uint8x32();
+ return res;
+}
+
+#define OPENCV_HAL_IMPL_AVX_ROTATE_CAST(intrin, _Tpvec, cast) \
+ template<int imm> \
+ inline _Tpvec intrin(const _Tpvec& a, const _Tpvec& b) \
+ { \
+ const int w = sizeof(typename _Tpvec::lane_type); \
+ v_uint8x32 ret = intrin<imm*w>(v_reinterpret_as_u8(a), \
+ v_reinterpret_as_u8(b)); \
+ return _Tpvec(cast(ret.val)); \
+ } \
+ template<int imm> \
+ inline _Tpvec intrin(const _Tpvec& a) \
+ { \
+ const int w = sizeof(typename _Tpvec::lane_type); \
+ v_uint8x32 ret = intrin<imm*w>(v_reinterpret_as_u8(a)); \
+ return _Tpvec(cast(ret.val)); \
+ }
+
+#define OPENCV_HAL_IMPL_AVX_ROTATE(_Tpvec) \
+ OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_left, _Tpvec, OPENCV_HAL_NOP) \
+ OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_right, _Tpvec, OPENCV_HAL_NOP)
+
+OPENCV_HAL_IMPL_AVX_ROTATE(v_int8x32)
+OPENCV_HAL_IMPL_AVX_ROTATE(v_uint16x16)
+OPENCV_HAL_IMPL_AVX_ROTATE(v_int16x16)
+OPENCV_HAL_IMPL_AVX_ROTATE(v_uint32x8)
+OPENCV_HAL_IMPL_AVX_ROTATE(v_int32x8)
+OPENCV_HAL_IMPL_AVX_ROTATE(v_uint64x4)
+OPENCV_HAL_IMPL_AVX_ROTATE(v_int64x4)
+
+OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_left, v_float32x8, _mm256_castsi256_ps)
+OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_right, v_float32x8, _mm256_castsi256_ps)
+OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_left, v_float64x4, _mm256_castsi256_pd)
+OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_right, v_float64x4, _mm256_castsi256_pd)
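+// usage sketch (illustrative only): rotate a v_int32x8 right by one lane,
+// pulling the vacated top lane from a second vector:
+//   v_int32x8 r = v_rotate_right<1>(a, b);  // r = {a1..a7, b0}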
+
+////////// Reduce and mask /////////
+
+/** Reduce **/
+#define OPENCV_HAL_IMPL_AVX_REDUCE_16(_Tpvec, sctype, func, intrin) \
+ inline sctype v_reduce_##func(const _Tpvec& a) \
+ { \
+ __m128i v0 = _v256_extract_low(a.val); \
+ __m128i v1 = _v256_extract_high(a.val); \
+ v0 = intrin(v0, v1); \
+ v0 = intrin(v0, _mm_srli_si128(v0, 8)); \
+ v0 = intrin(v0, _mm_srli_si128(v0, 4)); \
+ v0 = intrin(v0, _mm_srli_si128(v0, 2)); \
+ return (sctype) _mm_cvtsi128_si32(v0); \
+ }
+
+OPENCV_HAL_IMPL_AVX_REDUCE_16(v_uint16x16, ushort, min, _mm_min_epu16)
+OPENCV_HAL_IMPL_AVX_REDUCE_16(v_int16x16, short, min, _mm_min_epi16)
+OPENCV_HAL_IMPL_AVX_REDUCE_16(v_uint16x16, ushort, max, _mm_max_epu16)
+OPENCV_HAL_IMPL_AVX_REDUCE_16(v_int16x16, short, max, _mm_max_epi16)
+
+#define OPENCV_HAL_IMPL_AVX_REDUCE_8(_Tpvec, sctype, func, intrin) \
+ inline sctype v_reduce_##func(const _Tpvec& a) \
+ { \
+ __m128i v0 = _v256_extract_low(a.val); \
+ __m128i v1 = _v256_extract_high(a.val); \
+ v0 = intrin(v0, v1); \
+ v0 = intrin(v0, _mm_srli_si128(v0, 8)); \
+ v0 = intrin(v0, _mm_srli_si128(v0, 4)); \
+ return (sctype) _mm_cvtsi128_si32(v0); \
+ }
+
+OPENCV_HAL_IMPL_AVX_REDUCE_8(v_uint32x8, unsigned, min, _mm_min_epu32)
+OPENCV_HAL_IMPL_AVX_REDUCE_8(v_int32x8, int, min, _mm_min_epi32)
+OPENCV_HAL_IMPL_AVX_REDUCE_8(v_uint32x8, unsigned, max, _mm_max_epu32)
+OPENCV_HAL_IMPL_AVX_REDUCE_8(v_int32x8, int, max, _mm_max_epi32)
+
+#define OPENCV_HAL_IMPL_AVX_REDUCE_FLT(func, intrin) \
+ inline float v_reduce_##func(const v_float32x8& a) \
+ { \
+ __m128 v0 = _v256_extract_low(a.val); \
+ __m128 v1 = _v256_extract_high(a.val); \
+ v0 = intrin(v0, v1); \
+ v0 = intrin(v0, _mm_permute_ps(v0, _MM_SHUFFLE(0, 0, 3, 2))); \
+ v0 = intrin(v0, _mm_permute_ps(v0, _MM_SHUFFLE(0, 0, 0, 3))); \
+ return _mm_cvtss_f32(v0); \
+ }
+
+OPENCV_HAL_IMPL_AVX_REDUCE_FLT(min, _mm_min_ps)
+OPENCV_HAL_IMPL_AVX_REDUCE_FLT(max, _mm_max_ps)
+
+inline ushort v_reduce_sum(const v_uint16x16& a)
+{
+ __m128i a0 = _v256_extract_low(a.val);
+ __m128i a1 = _v256_extract_high(a.val);
+
+ __m128i s0 = _mm_adds_epu16(a0, a1);
+ s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 8));
+ s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 4));
+ s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 2));
+
+ return (ushort)_mm_cvtsi128_si32(s0);
+}
+
+inline short v_reduce_sum(const v_int16x16& a)
+{
+ __m256i s0 = _mm256_hadds_epi16(a.val, a.val);
+ s0 = _mm256_hadds_epi16(s0, s0);
+ s0 = _mm256_hadds_epi16(s0, s0);
+
+ __m128i s1 = _v256_extract_high(s0);
+ s1 = _mm_adds_epi16(_v256_extract_low(s0), s1);
+
+ return (short)_mm_cvtsi128_si32(s1);
+}
+
+inline int v_reduce_sum(const v_int32x8& a)
+{
+ __m256i s0 = _mm256_hadd_epi32(a.val, a.val);
+ s0 = _mm256_hadd_epi32(s0, s0);
+
+ __m128i s1 = _v256_extract_high(s0);
+ s1 = _mm_add_epi32(_v256_extract_low(s0), s1);
+
+ return _mm_cvtsi128_si32(s1);
+}
+
+inline unsigned v_reduce_sum(const v_uint32x8& a)
+{ return v_reduce_sum(v_reinterpret_as_s32(a)); }
+
+inline float v_reduce_sum(const v_float32x8& a)
+{
+ __m256 s0 = _mm256_hadd_ps(a.val, a.val);
+ s0 = _mm256_hadd_ps(s0, s0);
+
+ __m128 s1 = _v256_extract_high(s0);
+ s1 = _mm_add_ps(_v256_extract_low(s0), s1);
+
+ return _mm_cvtss_f32(s1);
+}
+
+inline v_float32x8 v_reduce_sum4(const v_float32x8& a, const v_float32x8& b,
+ const v_float32x8& c, const v_float32x8& d)
+{
+ __m256 ab = _mm256_hadd_ps(a.val, b.val);
+ __m256 cd = _mm256_hadd_ps(c.val, d.val);
+ return v_float32x8(_mm256_hadd_ps(ab, cd));
+}
+
+/** Popcount **/
+#define OPENCV_HAL_IMPL_AVX_POPCOUNT(_Tpvec) \
+ inline v_uint32x8 v_popcount(const _Tpvec& a) \
+ { \
+ const v_uint32x8 m1 = v256_setall_u32(0x55555555); \
+ const v_uint32x8 m2 = v256_setall_u32(0x33333333); \
+ const v_uint32x8 m4 = v256_setall_u32(0x0f0f0f0f); \
+ v_uint32x8 p = v_reinterpret_as_u32(a); \
+ p = ((p >> 1) & m1) + (p & m1); \
+ p = ((p >> 2) & m2) + (p & m2); \
+ p = ((p >> 4) & m4) + (p & m4); \
+ p.val = _mm256_sad_epu8(p.val, _mm256_setzero_si256()); \
+ return p; \
+ }
+
+OPENCV_HAL_IMPL_AVX_POPCOUNT(v_uint8x32)
+OPENCV_HAL_IMPL_AVX_POPCOUNT(v_int8x32)
+OPENCV_HAL_IMPL_AVX_POPCOUNT(v_uint16x16)
+OPENCV_HAL_IMPL_AVX_POPCOUNT(v_int16x16)
+OPENCV_HAL_IMPL_AVX_POPCOUNT(v_uint32x8)
+OPENCV_HAL_IMPL_AVX_POPCOUNT(v_int32x8)
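+// note: v_popcount uses the classic SWAR bit count (masks 0x55/0x33/0x0f),
+// after which every byte holds its own bit count; _mm256_sad_epu8 then sums
+// the bytes, so each total lands in the low 32 bits of its 64-bit chunk.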
+
+/** Mask **/
+inline int v_signmask(const v_int8x32& a)
+{ return _mm256_movemask_epi8(a.val); }
+inline int v_signmask(const v_uint8x32& a)
+{ return v_signmask(v_reinterpret_as_s8(a)); }
+
+inline int v_signmask(const v_int16x16& a)
+{
+ v_int8x32 v = v_int8x32(_mm256_packs_epi16(a.val, a.val));
+ return v_signmask(v) & 255;
+}
+inline int v_signmask(const v_uint16x16& a)
+{ return v_signmask(v_reinterpret_as_s16(a)); }
+
+inline int v_signmask(const v_int32x8& a)
+{
+ __m256i a16 = _mm256_packs_epi32(a.val, a.val);
+ v_int8x32 v = v_int8x32(_mm256_packs_epi16(a16, a16));
+ return v_signmask(v) & 15;
+}
+inline int v_signmask(const v_uint32x8& a)
+{ return v_signmask(v_reinterpret_as_s32(a)); }
+
+inline int v_signmask(const v_float32x8& a)
+{ return _mm256_movemask_ps(a.val); }
+inline int v_signmask(const v_float64x4& a)
+{ return _mm256_movemask_pd(a.val); }
+
+/** Checks **/
+#define OPENCV_HAL_IMPL_AVX_CHECK(_Tpvec, and_op, allmask) \
+ inline bool v_check_all(const _Tpvec& a) \
+ { \
+ int mask = v_signmask(v_reinterpret_as_s8(a)); \
+ return and_op(mask, allmask) == allmask; \
+ } \
+ inline bool v_check_any(const _Tpvec& a) \
+ { \
+ int mask = v_signmask(v_reinterpret_as_s8(a)); \
+ return and_op(mask, allmask) != 0; \
+ }
+
+OPENCV_HAL_IMPL_AVX_CHECK(v_uint8x32, OPENCV_HAL_1ST, -1)
+OPENCV_HAL_IMPL_AVX_CHECK(v_int8x32, OPENCV_HAL_1ST, -1)
+OPENCV_HAL_IMPL_AVX_CHECK(v_uint16x16, OPENCV_HAL_AND, (int)0xaaaa)
+OPENCV_HAL_IMPL_AVX_CHECK(v_int16x16, OPENCV_HAL_AND, (int)0xaaaa)
+OPENCV_HAL_IMPL_AVX_CHECK(v_uint32x8, OPENCV_HAL_AND, (int)0x8888)
+OPENCV_HAL_IMPL_AVX_CHECK(v_int32x8, OPENCV_HAL_AND, (int)0x8888)
+
+#define OPENCV_HAL_IMPL_AVX_CHECK_FLT(_Tpvec, allmask) \
+ inline bool v_check_all(const _Tpvec& a) \
+ { \
+ int mask = v_signmask(a); \
+ return mask == allmask; \
+ } \
+ inline bool v_check_any(const _Tpvec& a) \
+ { \
+ int mask = v_signmask(a); \
+ return mask != 0; \
+ }
+
+OPENCV_HAL_IMPL_AVX_CHECK_FLT(v_float32x8, 255)
+OPENCV_HAL_IMPL_AVX_CHECK_FLT(v_float64x4, 15)
+
+
+////////// Other math /////////
+
+/** Some frequent operations **/
+#define OPENCV_HAL_IMPL_AVX_MULADD(_Tpvec, suffix) \
+ inline _Tpvec v_fma(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
+ { return _Tpvec(_mm256_fmadd_##suffix(a.val, b.val, c.val)); } \
+ inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
+ { return _Tpvec(_mm256_fmadd_##suffix(a.val, b.val, c.val)); } \
+ inline _Tpvec v_sqrt(const _Tpvec& x) \
+ { return _Tpvec(_mm256_sqrt_##suffix(x.val)); } \
+ inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \
+ { return v_fma(a, a, b * b); } \
+ inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \
+ { return v_sqrt(v_fma(a, a, b*b)); }
+
+OPENCV_HAL_IMPL_AVX_MULADD(v_float32x8, ps)
+OPENCV_HAL_IMPL_AVX_MULADD(v_float64x4, pd)
+
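+// rsqrt estimate refined with one Newton-Raphson step:
+// t' = t * (1.5 - 0.5 * x * t^2)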
+inline v_float32x8 v_invsqrt(const v_float32x8& x)
+{
+ v_float32x8 half = x * v256_setall_f32(0.5);
+ v_float32x8 t = v_float32x8(_mm256_rsqrt_ps(x.val));
+ // todo: _mm256_fnmsub_ps
+ t *= v256_setall_f32(1.5) - ((t * t) * half);
+ return t;
+}
+
+inline v_float64x4 v_invsqrt(const v_float64x4& x)
+{
+ return v256_setall_f64(1.) / v_sqrt(x);
+}
+
+/** Absolute values **/
+#define OPENCV_HAL_IMPL_AVX_ABS(_Tpvec, suffix) \
+ inline v_u##_Tpvec v_abs(const v_##_Tpvec& x) \
+ { return v_u##_Tpvec(_mm256_abs_##suffix(x.val)); }
+
+OPENCV_HAL_IMPL_AVX_ABS(int8x32, epi8)
+OPENCV_HAL_IMPL_AVX_ABS(int16x16, epi16)
+OPENCV_HAL_IMPL_AVX_ABS(int32x8, epi32)
+
+inline v_float32x8 v_abs(const v_float32x8& x)
+{ return x & v_float32x8(_mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff))); }
+inline v_float64x4 v_abs(const v_float64x4& x)
+{ return x & v_float64x4(_mm256_castsi256_pd(_mm256_srli_epi64(_mm256_set1_epi64x(-1), 1))); }
+
+/** Absolute difference **/
+inline v_uint8x32 v_absdiff(const v_uint8x32& a, const v_uint8x32& b)
+{ return v_add_wrap(a - b, b - a); }
+inline v_uint16x16 v_absdiff(const v_uint16x16& a, const v_uint16x16& b)
+{ return v_add_wrap(a - b, b - a); }
+inline v_uint32x8 v_absdiff(const v_uint32x8& a, const v_uint32x8& b)
+{ return v_max(a, b) - v_min(a, b); }
+
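+// note: for signed inputs, m = (a < b) is all-ones where true, so
+// (d ^ m) - m conditionally negates d, yielding |a - b| without branches.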
+inline v_uint8x32 v_absdiff(const v_int8x32& a, const v_int8x32& b)
+{
+ v_int8x32 d = v_sub_wrap(a, b);
+ v_int8x32 m = a < b;
+ return v_reinterpret_as_u8(v_sub_wrap(d ^ m, m));
+}
+
+inline v_uint16x16 v_absdiff(const v_int16x16& a, const v_int16x16& b)
+{ return v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b))); }
+
+inline v_uint32x8 v_absdiff(const v_int32x8& a, const v_int32x8& b)
+{
+ v_int32x8 d = a - b;
+ v_int32x8 m = a < b;
+ return v_reinterpret_as_u32((d ^ m) - m);
+}
+
+inline v_float32x8 v_absdiff(const v_float32x8& a, const v_float32x8& b)
+{ return v_abs(a - b); }
+
+inline v_float64x4 v_absdiff(const v_float64x4& a, const v_float64x4& b)
+{ return v_abs(a - b); }
+
+////////// Conversions /////////
+
+/** Rounding **/
+inline v_int32x8 v_round(const v_float32x8& a)
+{ return v_int32x8(_mm256_cvtps_epi32(a.val)); }
+
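+// note: _mm256_cvtpd_epi32 produces four ints in a 128-bit register; after
+// _mm256_castsi128_si256 the upper four lanes of the result are undefined.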
+inline v_int32x8 v_round(const v_float64x4& a)
+{ return v_int32x8(_mm256_castsi128_si256(_mm256_cvtpd_epi32(a.val))); }
+
+inline v_int32x8 v_trunc(const v_float32x8& a)
+{ return v_int32x8(_mm256_cvttps_epi32(a.val)); }
+
+inline v_int32x8 v_trunc(const v_float64x4& a)
+{ return v_int32x8(_mm256_castsi128_si256(_mm256_cvttpd_epi32(a.val))); }
+
+inline v_int32x8 v_floor(const v_float32x8& a)
+{ return v_int32x8(_mm256_cvttps_epi32(_mm256_floor_ps(a.val))); }
+
+inline v_int32x8 v_floor(const v_float64x4& a)
+{ return v_trunc(v_float64x4(_mm256_floor_pd(a.val))); }
+
+inline v_int32x8 v_ceil(const v_float32x8& a)
+{ return v_int32x8(_mm256_cvttps_epi32(_mm256_ceil_ps(a.val))); }
+
+inline v_int32x8 v_ceil(const v_float64x4& a)
+{ return v_trunc(v_float64x4(_mm256_ceil_pd(a.val))); }
+
+/** To float **/
+inline v_float32x8 v_cvt_f32(const v_int32x8& a)
+{ return v_float32x8(_mm256_cvtepi32_ps(a.val)); }
+
+inline v_float32x8 v_cvt_f32(const v_float64x4& a)
+{ return v_float32x8(_mm256_castps128_ps256(_mm256_cvtpd_ps(a.val))); }
+
+inline v_float32x8 v_cvt_f32(const v_float64x4& a, const v_float64x4& b)
+{
+ __m128 af = _mm256_cvtpd_ps(a.val), bf = _mm256_cvtpd_ps(b.val);
+ return v_float32x8(_mm256_insertf128_ps(_mm256_castps128_ps256(af), bf, 1));
+}
+
+inline v_float64x4 v_cvt_f64(const v_int32x8& a)
+{ return v_float64x4(_mm256_cvtepi32_pd(_v256_extract_low(a.val))); }
+
+inline v_float64x4 v_cvt_f64_high(const v_int32x8& a)
+{ return v_float64x4(_mm256_cvtepi32_pd(_v256_extract_high(a.val))); }
+
+inline v_float64x4 v_cvt_f64(const v_float32x8& a)
+{ return v_float64x4(_mm256_cvtps_pd(_v256_extract_low(a.val))); }
+
+inline v_float64x4 v_cvt_f64_high(const v_float32x8& a)
+{ return v_float64x4(_mm256_cvtps_pd(_v256_extract_high(a.val))); }
+
+#if CV_FP16
+inline v_float32x8 v_cvt_f32(const v_float16x16& a)
+{ return v_float32x8(_mm256_cvtph_ps(_v256_extract_low(a.val))); }
+
+inline v_float32x8 v_cvt_f32_high(const v_float16x16& a)
+{ return v_float32x8(_mm256_cvtph_ps(_v256_extract_high(a.val))); }
+
+inline v_float16x16 v_cvt_f16(const v_float32x8& a, const v_float32x8& b)
+{
+ __m128i ah = _mm256_cvtps_ph(a.val, 0), bh = _mm256_cvtps_ph(b.val, 0);
+ return v_float16x16(_mm256_inserti128_si256(_mm256_castsi128_si256(ah), bh, 1));
+}
+#endif
+
+////////////// Lookup table access ////////////////////
+
+inline v_int32x8 v_lut(const int* tab, const v_int32x8& idxvec)
+{
+ int CV_DECL_ALIGNED(32) idx[8];
+ v_store_aligned(idx, idxvec);
+ return v_int32x8(_mm256_setr_epi32(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]],
+ tab[idx[4]], tab[idx[5]], tab[idx[6]], tab[idx[7]]));
+}
+
+inline v_float32x8 v_lut(const float* tab, const v_int32x8& idxvec)
+{
+ int CV_DECL_ALIGNED(32) idx[8];
+ v_store_aligned(idx, idxvec);
+ return v_float32x8(_mm256_setr_ps(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]],
+ tab[idx[4]], tab[idx[5]], tab[idx[6]], tab[idx[7]]));
+}
+
+inline v_float64x4 v_lut(const double* tab, const v_int32x8& idxvec)
+{
+ int CV_DECL_ALIGNED(32) idx[8];
+ v_store_aligned(idx, idxvec);
+ return v_float64x4(_mm256_setr_pd(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]));
+}
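+// note: the v_lut variants above gather with scalar loads plus a vector set
+// rather than AVX2 gather instructions.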
+
+inline void v_lut_deinterleave(const float* tab, const v_int32x8& idxvec, v_float32x8& x, v_float32x8& y)
+{
+ int CV_DECL_ALIGNED(32) idx[8];
+ v_store_aligned(idx, idxvec);
+ __m128 z = _mm_setzero_ps();
+ __m128 xy01, xy45, xy23, xy67;
+ xy01 = _mm_loadl_pi(z, (const __m64*)(tab + idx[0]));
+ xy01 = _mm_loadh_pi(xy01, (const __m64*)(tab + idx[1]));
+ xy45 = _mm_loadl_pi(z, (const __m64*)(tab + idx[4]));
+ xy45 = _mm_loadh_pi(xy45, (const __m64*)(tab + idx[5]));
+ __m256 xy0145 = _v256_combine(xy01, xy45);
+ xy23 = _mm_loadl_pi(z, (const __m64*)(tab + idx[2]));
+ xy23 = _mm_loadh_pi(xy23, (const __m64*)(tab + idx[3]));
+ xy67 = _mm_loadl_pi(z, (const __m64*)(tab + idx[6]));
+ xy67 = _mm_loadh_pi(xy67, (const __m64*)(tab + idx[7]));
+ __m256 xy2367 = _v256_combine(xy23, xy67);
+
+ __m256 xxyy0145 = _mm256_unpacklo_ps(xy0145, xy2367);
+ __m256 xxyy2367 = _mm256_unpackhi_ps(xy0145, xy2367);
+
+ x = v_float32x8(_mm256_unpacklo_ps(xxyy0145, xxyy2367));
+ y = v_float32x8(_mm256_unpackhi_ps(xxyy0145, xxyy2367));
+}
+
+inline void v_lut_deinterleave(const double* tab, const v_int32x8& idxvec, v_float64x4& x, v_float64x4& y)
+{
+ int CV_DECL_ALIGNED(32) idx[4];
+ v_store_low(idx, idxvec);
+ __m128d xy0 = _mm_loadu_pd(tab + idx[0]);
+ __m128d xy2 = _mm_loadu_pd(tab + idx[2]);
+ __m128d xy1 = _mm_loadu_pd(tab + idx[1]);
+ __m128d xy3 = _mm_loadu_pd(tab + idx[3]);
+ __m256d xy02 = _v256_combine(xy0, xy2);
+ __m256d xy13 = _v256_combine(xy1, xy3);
+
+ x = v_float64x4(_mm256_unpacklo_pd(xy02, xy13));
+ y = v_float64x4(_mm256_unpackhi_pd(xy02, xy13));
+}
+
+////////// Matrix operations /////////
+
+inline v_int32x8 v_dotprod(const v_int16x16& a, const v_int16x16& b)
+{ return v_int32x8(_mm256_madd_epi16(a.val, b.val)); }
+
+inline v_int32x8 v_dotprod(const v_int16x16& a, const v_int16x16& b, const v_int32x8& c)
+{ return v_dotprod(a, b) + c; }
+
+#define OPENCV_HAL_AVX_SPLAT2_PS(a, im) \
+ v_float32x8(_mm256_permute_ps(a.val, _MM_SHUFFLE(im, im, im, im)))
+
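+// note: _mm256_permute_ps broadcasts within each 128-bit lane, so v_matmul
+// below effectively performs two independent 4x4 matrix-vector products,
+// one per half of the 256-bit registers.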
+inline v_float32x8 v_matmul(const v_float32x8& v, const v_float32x8& m0,
+ const v_float32x8& m1, const v_float32x8& m2,
+ const v_float32x8& m3)
+{
+ v_float32x8 v04 = OPENCV_HAL_AVX_SPLAT2_PS(v, 0);
+ v_float32x8 v15 = OPENCV_HAL_AVX_SPLAT2_PS(v, 1);
+ v_float32x8 v26 = OPENCV_HAL_AVX_SPLAT2_PS(v, 2);
+ v_float32x8 v37 = OPENCV_HAL_AVX_SPLAT2_PS(v, 3);
+ return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, v37 * m3)));
+}
+
+inline v_float32x8 v_matmuladd(const v_float32x8& v, const v_float32x8& m0,
+ const v_float32x8& m1, const v_float32x8& m2,
+ const v_float32x8& a)
+{
+ v_float32x8 v04 = OPENCV_HAL_AVX_SPLAT2_PS(v, 0);
+ v_float32x8 v15 = OPENCV_HAL_AVX_SPLAT2_PS(v, 1);
+ v_float32x8 v26 = OPENCV_HAL_AVX_SPLAT2_PS(v, 2);
+ return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, a)));
+}
+
+#define OPENCV_HAL_IMPL_AVX_TRANSPOSE4x4(_Tpvec, suffix, cast_from, cast_to) \
+ inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, \
+ const _Tpvec& a2, const _Tpvec& a3, \
+ _Tpvec& b0, _Tpvec& b1, _Tpvec& b2, _Tpvec& b3) \
+ { \
+ __m256i t0 = cast_from(_mm256_unpacklo_##suffix(a0.val, a1.val)); \
+ __m256i t1 = cast_from(_mm256_unpacklo_##suffix(a2.val, a3.val)); \
+ __m256i t2 = cast_from(_mm256_unpackhi_##suffix(a0.val, a1.val)); \
+ __m256i t3 = cast_from(_mm256_unpackhi_##suffix(a2.val, a3.val)); \
+ b0.val = cast_to(_mm256_unpacklo_epi64(t0, t1)); \
+ b1.val = cast_to(_mm256_unpackhi_epi64(t0, t1)); \
+ b2.val = cast_to(_mm256_unpacklo_epi64(t2, t3)); \
+ b3.val = cast_to(_mm256_unpackhi_epi64(t2, t3)); \
+ }
+
+OPENCV_HAL_IMPL_AVX_TRANSPOSE4x4(v_uint32x8, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_AVX_TRANSPOSE4x4(v_int32x8, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_AVX_TRANSPOSE4x4(v_float32x8, ps, _mm256_castps_si256, _mm256_castsi256_ps)
+
+//////////////// Value reordering ///////////////
+
+/* Expand */
+#define OPENCV_HAL_IMPL_AVX_EXPAND(_Tpvec, _Tpwvec, _Tp, intrin) \
+ inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
+ { \
+ b0.val = intrin(_v256_extract_low(a.val)); \
+ b1.val = intrin(_v256_extract_high(a.val)); \
+ } \
+ inline _Tpwvec v256_load_expand(const _Tp* ptr) \
+ { \
+ __m128i a = _mm_loadu_si128((const __m128i*)ptr); \
+ return _Tpwvec(intrin(a)); \
+ }
+
+OPENCV_HAL_IMPL_AVX_EXPAND(v_uint8x32, v_uint16x16, uchar, _mm256_cvtepu8_epi16)
+OPENCV_HAL_IMPL_AVX_EXPAND(v_int8x32, v_int16x16, schar, _mm256_cvtepi8_epi16)
+OPENCV_HAL_IMPL_AVX_EXPAND(v_uint16x16, v_uint32x8, ushort, _mm256_cvtepu16_epi32)
+OPENCV_HAL_IMPL_AVX_EXPAND(v_int16x16, v_int32x8, short, _mm256_cvtepi16_epi32)
+OPENCV_HAL_IMPL_AVX_EXPAND(v_uint32x8, v_uint64x4, unsigned, _mm256_cvtepu32_epi64)
+OPENCV_HAL_IMPL_AVX_EXPAND(v_int32x8, v_int64x4, int, _mm256_cvtepi32_epi64)
+
+#define OPENCV_HAL_IMPL_AVX_EXPAND_Q(_Tpvec, _Tp, intrin) \
+ inline _Tpvec v256_load_expand_q(const _Tp* ptr) \
+ { \
+ __m128i a = _mm_loadl_epi64((const __m128i*)ptr); \
+ return _Tpvec(intrin(a)); \
+ }
+
+OPENCV_HAL_IMPL_AVX_EXPAND_Q(v_uint32x8, uchar, _mm256_cvtepu8_epi32)
+OPENCV_HAL_IMPL_AVX_EXPAND_Q(v_int32x8, schar, _mm256_cvtepi8_epi32)
+
+/* pack */
+// 16
+inline v_int8x32 v_pack(const v_int16x16& a, const v_int16x16& b)
+{ return v_int8x32(_v256_shuffle_odd_64(_mm256_packs_epi16(a.val, b.val))); }
+
+inline v_uint8x32 v_pack(const v_uint16x16& a, const v_uint16x16& b)
+{ return v_uint8x32(_v256_shuffle_odd_64(_mm256_packus_epi16(a.val, b.val))); }
+
+inline v_uint8x32 v_pack_u(const v_int16x16& a, const v_int16x16& b)
+{ return v_pack(v_reinterpret_as_u16(a), v_reinterpret_as_u16(b)); }
+
+inline void v_pack_store(schar* ptr, const v_int16x16& a)
+{ v_store_low(ptr, v_pack(a, a)); }
+
+inline void v_pack_store(uchar* ptr, const v_uint16x16& a)
+{ v_store_low(ptr, v_pack(a, a)); }
+
+inline void v_pack_u_store(uchar* ptr, const v_int16x16& a)
+{ v_store_low(ptr, v_pack_u(a, a)); }
+
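+// rounding right-shift pack: adding delta = 1 << (n-1) before `>> n`
+// rounds to nearest (e.g. n=2: (v + 2) >> 2).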
+template<int n> inline
+v_uint8x32 v_rshr_pack(const v_uint16x16& a, const v_uint16x16& b)
+{
+ // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers.
+ v_uint16x16 delta = v256_setall_u16((short)(1 << (n-1)));
+ return v_pack_u(v_reinterpret_as_s16((a + delta) >> n),
+ v_reinterpret_as_s16((b + delta) >> n));
+}
+
+template<int n> inline
+void v_rshr_pack_store(uchar* ptr, const v_uint16x16& a)
+{
+ v_uint16x16 delta = v256_setall_u16((short)(1 << (n-1)));
+ v_pack_u_store(ptr, v_reinterpret_as_s16((a + delta) >> n));
+}
+
+template<int n> inline
+v_uint8x32 v_rshr_pack_u(const v_int16x16& a, const v_int16x16& b)
+{
+ v_int16x16 delta = v256_setall_s16((short)(1 << (n-1)));
+ return v_pack_u((a + delta) >> n, (b + delta) >> n);
+}
+
+template<int n> inline
+void v_rshr_pack_u_store(uchar* ptr, const v_int16x16& a)
+{
+ v_int16x16 delta = v256_setall_s16((short)(1 << (n-1)));
+ v_pack_u_store(ptr, (a + delta) >> n);
+}
+
+template<int n> inline
+v_int8x32 v_rshr_pack(const v_int16x16& a, const v_int16x16& b)
+{
+ v_int16x16 delta = v256_setall_s16((short)(1 << (n-1)));
+ return v_pack((a + delta) >> n, (b + delta) >> n);
+}
+
+template<int n> inline
+void v_rshr_pack_store(schar* ptr, const v_int16x16& a)
+{
+ v_int16x16 delta = v256_setall_s16((short)(1 << (n-1)));
+ v_pack_store(ptr, (a + delta) >> n);
+}
+
+// 32
+inline v_int16x16 v_pack(const v_int32x8& a, const v_int32x8& b)
+{ return v_int16x16(_v256_shuffle_odd_64(_mm256_packs_epi32(a.val, b.val))); }
+
+inline v_uint16x16 v_pack(const v_uint32x8& a, const v_uint32x8& b)
+{ return v_uint16x16(_v256_shuffle_odd_64(_mm256_packus_epi32(a.val, b.val))); }
+
+inline v_uint16x16 v_pack_u(const v_int32x8& a, const v_int32x8& b)
+{ return v_pack(v_reinterpret_as_u32(a), v_reinterpret_as_u32(b)); }
+
+inline void v_pack_store(short* ptr, const v_int32x8& a)
+{ v_store_low(ptr, v_pack(a, a)); }
+
+inline void v_pack_store(ushort* ptr, const v_uint32x8& a)
+{ v_store_low(ptr, v_pack(a, a)); }
+
+inline void v_pack_u_store(ushort* ptr, const v_int32x8& a)
+{ v_store_low(ptr, v_pack_u(a, a)); }
+
+
+template<int n> inline
+v_uint16x16 v_rshr_pack(const v_uint32x8& a, const v_uint32x8& b)
+{
+ // we assume that n > 0, and so the shifted 32-bit values can be treated as signed numbers.
+ v_uint32x8 delta = v256_setall_u32(1 << (n-1));
+ return v_pack_u(v_reinterpret_as_s32((a + delta) >> n),
+ v_reinterpret_as_s32((b + delta) >> n));
+}
+
+template<int n> inline
+void v_rshr_pack_store(ushort* ptr, const v_uint32x8& a)
+{
+ v_uint32x8 delta = v256_setall_u32(1 << (n-1));
+ v_pack_u_store(ptr, v_reinterpret_as_s32((a + delta) >> n));
+}
+
+template<int n> inline
+v_uint16x16 v_rshr_pack_u(const v_int32x8& a, const v_int32x8& b)
+{
+ v_int32x8 delta = v256_setall_s32(1 << (n-1));
+ return v_pack_u((a + delta) >> n, (b + delta) >> n);
+}
+
+template<int n> inline
+void v_rshr_pack_u_store(ushort* ptr, const v_int32x8& a)
+{
+ v_int32x8 delta = v256_setall_s32(1 << (n-1));
+ v_pack_u_store(ptr, (a + delta) >> n);
+}
+
+template<int n> inline
+v_int16x16 v_rshr_pack(const v_int32x8& a, const v_int32x8& b)
+{
+ v_int32x8 delta = v256_setall_s32(1 << (n-1));
+ return v_pack((a + delta) >> n, (b + delta) >> n);
+}
+
+template<int n> inline
+void v_rshr_pack_store(short* ptr, const v_int32x8& a)
+{
+ v_int32x8 delta = v256_setall_s32(1 << (n-1));
+ v_pack_store(ptr, (a + delta) >> n);
+}
+
+// 64
+// Non-saturating pack
+inline v_uint32x8 v_pack(const v_uint64x4& a, const v_uint64x4& b)
+{
+ __m256i a0 = _mm256_shuffle_epi32(a.val, _MM_SHUFFLE(0, 0, 2, 0));
+ __m256i b0 = _mm256_shuffle_epi32(b.val, _MM_SHUFFLE(0, 0, 2, 0));
+ __m256i ab = _mm256_unpacklo_epi64(a0, b0); // a0, a1, b0, b1, a2, a3, b2, b3
+ return v_uint32x8(_v256_shuffle_odd_64(ab));
+}
+
+inline v_int32x8 v_pack(const v_int64x4& a, const v_int64x4& b)
+{ return v_reinterpret_as_s32(v_pack(v_reinterpret_as_u64(a), v_reinterpret_as_u64(b))); }
+
+inline void v_pack_store(unsigned* ptr, const v_uint64x4& a)
+{
+ __m256i a0 = _mm256_shuffle_epi32(a.val, _MM_SHUFFLE(0, 0, 2, 0));
+ v_store_low(ptr, v_uint32x8(_v256_shuffle_odd_64(a0)));
+}
+
+inline void v_pack_store(int* ptr, const v_int64x4& b)
+{ v_pack_store((unsigned*)ptr, v_reinterpret_as_u64(b)); }
+
+template<int n> inline
+v_uint32x8 v_rshr_pack(const v_uint64x4& a, const v_uint64x4& b)
+{
+ v_uint64x4 delta = v256_setall_u64((uint64)1 << (n-1));
+ return v_pack((a + delta) >> n, (b + delta) >> n);
+}
+
+template<int n> inline
+void v_rshr_pack_store(unsigned* ptr, const v_uint64x4& a)
+{
+ v_uint64x4 delta = v256_setall_u64((uint64)1 << (n-1));
+ v_pack_store(ptr, (a + delta) >> n);
+}
+
+template<int n> inline
+v_int32x8 v_rshr_pack(const v_int64x4& a, const v_int64x4& b)
+{
+ v_int64x4 delta = v256_setall_s64((int64)1 << (n-1));
+ return v_pack((a + delta) >> n, (b + delta) >> n);
+}
+
+template<int n> inline
+void v_rshr_pack_store(int* ptr, const v_int64x4& a)
+{
+ v_int64x4 delta = v256_setall_s64((int64)1 << (n-1));
+ v_pack_store(ptr, (a + delta) >> n);
+}
+
+/* Recombine */
+// it's implemented above, together with the load and store operations
+
+/* Extract */
+#define OPENCV_HAL_IMPL_AVX_EXTRACT(_Tpvec) \
+ template<int s> \
+ inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b) \
+ { return v_rotate_right<s>(a, b); }
+
+OPENCV_HAL_IMPL_AVX_EXTRACT(v_uint8x32)
+OPENCV_HAL_IMPL_AVX_EXTRACT(v_int8x32)
+OPENCV_HAL_IMPL_AVX_EXTRACT(v_uint16x16)
+OPENCV_HAL_IMPL_AVX_EXTRACT(v_int16x16)
+OPENCV_HAL_IMPL_AVX_EXTRACT(v_uint32x8)
+OPENCV_HAL_IMPL_AVX_EXTRACT(v_int32x8)
+OPENCV_HAL_IMPL_AVX_EXTRACT(v_uint64x4)
+OPENCV_HAL_IMPL_AVX_EXTRACT(v_int64x4)
+OPENCV_HAL_IMPL_AVX_EXTRACT(v_float32x8)
+OPENCV_HAL_IMPL_AVX_EXTRACT(v_float64x4)
+
+
+/** Reinterpret **/
+// it's implemented above, together with the load and store operations
+
+/* de&interleave */
+#define OPENCV_HAL_IMPL_AVX_INTERLEAVE_2CH(_Tpvec, _Tp, suffix) \
+ inline void v_load_deinterleave(const _Tp* ptr, _Tpvec& a, _Tpvec& b) \
+ { return v256_load_deinterleave_##suffix(ptr, a, b); } \
+ inline void v_store_interleave(_Tp* ptr, const _Tpvec& a, const _Tpvec& b) \
+ { return v256_store_interleave_2ch(ptr, a, b); }
+
+#define OPENCV_HAL_IMPL_AVX_INTERLEAVE_3CH(_Tpvec, _Tp, suffix) \
+ inline void v_load_deinterleave \
+ (const _Tp* ptr, _Tpvec& a, _Tpvec& b, _Tpvec& c) \
+ { return v256_load_deinterleave_##suffix(ptr, a, b, c); } \
+ inline void v_store_interleave \
+ (_Tp* ptr, const _Tpvec& a,const _Tpvec& b, const _Tpvec& c) \
+ { return v256_store_interleave_##suffix(ptr, a, b, c); }
+
+#define OPENCV_HAL_IMPL_AVX_INTERLEAVE_4CH(_Tpvec, _Tp, suffix) \
+ inline void v_load_deinterleave \
+ (const _Tp* ptr, _Tpvec& a, _Tpvec& b, _Tpvec& c, _Tpvec& d) \
+ { return v256_load_deinterleave_##suffix(ptr, a, b, c, d); } \
+ inline void v_store_interleave \
+ (_Tp* ptr, const _Tpvec& a, const _Tpvec& b, const _Tpvec& c, const _Tpvec& d) \
+ { return v256_store_interleave_##suffix(ptr, a, b, c, d); }
+
+#define OPENCV_HAL_IMPL_AVX_INTERLEAVE_3n4CH(_Tpvec, _Tp, suffix) \
+ OPENCV_HAL_IMPL_AVX_INTERLEAVE_3CH(_Tpvec, _Tp, suffix) \
+ OPENCV_HAL_IMPL_AVX_INTERLEAVE_4CH(_Tpvec, _Tp, suffix)
+
+#define OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(_Tpvec, _Tp, suffix) \
+ OPENCV_HAL_IMPL_AVX_INTERLEAVE_2CH(_Tpvec, _Tp, suffix) \
+ OPENCV_HAL_IMPL_AVX_INTERLEAVE_3n4CH(_Tpvec, _Tp, suffix)
+
+/* **** */
+//
+template<typename _Tp, typename _Tpvec>
+inline void v256_store_interleave_2ch(_Tp* ptr, const _Tpvec& a, const _Tpvec& b)
+{
+ _Tpvec ab0, ab1;
+ v_zip(a, b, ab0, ab1);
+ v_store(ptr, ab0);
+ v_store(ptr + _Tpvec::nlanes, ab1);
+}
+
+template<typename _Tp, typename _Tpvec>
+inline void v256_load_deinterleave_l4(const _Tp* ptr, _Tpvec& a, _Tpvec& b)
+{
+ _Tpvec ab0 = v256_load(ptr);
+ _Tpvec ab1 = v256_load(ptr + _Tpvec::nlanes);
+ _Tpvec ab00, ab11;
+ v_recombine(ab0, ab1, ab00, ab11);
+ v256_zip(ab00, ab11, a, b);
+}
+
+///
+template<typename _Tp, typename _Tpvec>
+inline void v256_load_deinterleave_l4(const _Tp* ptr, _Tpvec& a, _Tpvec& b, _Tpvec& c)
+{
+ _Tpvec abc0 = v256_load(ptr);
+ _Tpvec abc1 = v256_load(ptr + _Tpvec::nlanes);
+ _Tpvec abc2 = v256_load(ptr + _Tpvec::nlanes * 2);
+
+ _Tpvec ab0 = v256_combine_diagonal(abc0, abc1);
+ _Tpvec bc1 = v256_combine_diagonal(abc1, abc2);
+ _Tpvec ac1 = v256_reverse_64(v256_combine_diagonal(abc2, abc0));
+
+ a = v256_unpacklo(ab0, ac1);
+ c = v256_unpackhi(ac1, bc1);
+ b = v256_alignr_64(bc1, ab0);
+}
+
+
+template<typename _Tp, typename _Tpvec>
+inline void v256_store_interleave_l4(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, const _Tpvec& c)
+{
+ _Tpvec ab0 = v256_unpacklo(a, b);
+ _Tpvec bc1 = v256_unpackhi(b, c);
+ _Tpvec ca10 = v256_swap_halves(v256_blend<0b1010>(c, a));
+
+ v_store(ptr, v256_combine_diagonal(ab0, ca10));
+ v_store(ptr + _Tpvec::nlanes, v256_combine_diagonal(bc1, ab0));
+ v_store(ptr + _Tpvec::nlanes * 2, v256_combine_diagonal(ca10, bc1));
+}
+
+////
+template<typename _Tp, typename _Tpvec>
+inline void v256_load_deinterleave_l4(const _Tp* ptr, _Tpvec& a, _Tpvec& b, _Tpvec& c, _Tpvec& d)
+{
+ _Tpvec abcd0 = v256_load(ptr);
+ _Tpvec abcd1 = v256_load(ptr + _Tpvec::nlanes);
+ _Tpvec abcd2 = v256_load(ptr + _Tpvec::nlanes * 2);
+ _Tpvec abcd3 = v256_load(ptr + _Tpvec::nlanes * 3);
+
+ _Tpvec cd0ab0 = v256_alignr_128(abcd0, abcd2);
+ _Tpvec cd1ab1 = v256_alignr_128(abcd1, abcd3);
+
+ _Tpvec ab0 = v256_combine_diagonal(abcd0, cd0ab0);
+ _Tpvec ab1 = v256_combine_diagonal(abcd1, cd1ab1);
+ _Tpvec cd0 = v256_combine_diagonal(cd0ab0, abcd2);
+ _Tpvec cd1 = v256_combine_diagonal(cd1ab1, abcd3);
+
+ v256_zip(ab0, ab1, a, b);
+ v256_zip(cd0, cd1, c, d);
+}
+
+template<typename _Tp, typename _Tpvec>
+inline void v256_store_interleave_l4(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, const _Tpvec& c, const _Tpvec& d)
+{
+ _Tpvec ab0, ab1, cd0, cd1;
+ v256_zip(a, b, ab0, ab1);
+ v256_zip(c, d, cd0, cd1);
+
+ _Tpvec ab0cd0 = v256_alignr_128(ab0, cd0);
+ _Tpvec ab1cd1 = v256_alignr_128(ab1, cd1);
+
+ v_store(ptr, v256_combine_diagonal(ab0, ab0cd0));
+ v_store(ptr + _Tpvec::nlanes, v256_combine_diagonal(ab1, ab1cd1));
+ v_store(ptr + _Tpvec::nlanes * 2, v256_combine_diagonal(ab0cd0, cd0));
+ v_store(ptr + _Tpvec::nlanes * 3, v256_combine_diagonal(ab1cd1, cd1));
+}
+
+OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_uint64x4, uint64, l4)
+OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_int64x4, int64, l4)
+OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_float64x4, double, l4)
+
+/* **** **** */
+//
+inline void v256_load_deinterleave_l8(const float* ptr, v_float32x8& a, v_float32x8& b)
+{
+ v_float32x8 ab0 = v256_load(ptr);
+ v_float32x8 ab1 = v256_load(ptr + 8);
+
+ v_float32x8 ab0ab2, ab1ab3;
+ v_recombine(ab0, ab1, ab0ab2, ab1ab3);
+
+ a.val = _mm256_shuffle_ps(ab0ab2.val, ab1ab3.val, _MM_SHUFFLE(2, 0, 2, 0));
+ b.val = _mm256_shuffle_ps(ab0ab2.val, ab1ab3.val, _MM_SHUFFLE(3, 1, 3, 1));
+}
+
+template<typename _Tp, typename _Tpvec>
+inline void v256_load_deinterleave_l8(const _Tp* ptr, _Tpvec& a, _Tpvec& b)
+{
+ v_float32x8 fa, fb;
+ v256_load_deinterleave_l8((float*)ptr, fa, fb);
+ a.val = v_reinterpret_as_u32(fa).val;
+ b.val = v_reinterpret_as_u32(fb).val;
+}
+///
+template<typename _Tp, typename _Tpvec>
+inline void v256_store_interleave_l8(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, const _Tpvec& c)
+{
+ _Tpvec ab0, ab1, bc0, bc1;
+ v256_zip(a, b, ab0, ab1);
+ v256_zip(b, c, bc0, bc1);
+
+ _Tpvec cazg = v256_blend<0b10101010>(c, a);
+ _Tpvec abc0abc1(_mm256_unpacklo_epi64(ab0.val, cazg.val));
+ _Tpvec abc1abc2(_mm256_unpackhi_epi64(cazg.val, bc1.val));
+ _Tpvec abc2abc0 = v256_reverse_64(v256_blend<0b11001100>(ab1, bc0));
+
+ _Tpvec abc0 = v256_combine_diagonal(abc0abc1, abc2abc0);
+ _Tpvec abc1 = v256_combine_diagonal(abc1abc2, abc0abc1);
+ _Tpvec abc2 = v256_combine_diagonal(abc2abc0, abc1abc2);
+
+ v_store(ptr, abc0);
+ v_store(ptr + _Tpvec::nlanes, abc1);
+ v_store(ptr + _Tpvec::nlanes * 2, abc2);
+}
+
+inline void v256_store_interleave_l8(float* ptr, const v_float32x8& a, const v_float32x8& b, const v_float32x8& c)
+{
+ v_float32x8 ab0, ab1, bc0, bc1;
+ v256_zip(a, b, ab0, ab1);
+ v256_zip(b, c, bc0, bc1);
+
+ v_float32x8 cazg = v256_blend<0b10101010>(c, a);
+ v_float32x8 abc0abc1(_mm256_shuffle_ps(ab0.val, cazg.val, _MM_SHUFFLE(1, 0, 1, 0)));
+ v_float32x8 abc1abc2(_mm256_shuffle_ps(cazg.val, bc1.val, _MM_SHUFFLE(3, 2, 3, 2)));
+
+ v_float32x8 abc0abc2(_mm256_shuffle_ps(bc0.val, ab1.val, _MM_SHUFFLE(1, 0, 3, 2)));
+ v_float32x8 abc2abc0 = v256_swap_halves(abc0abc2);
+
+ v_float32x8 abc0 = v256_combine_diagonal(abc0abc1, abc2abc0);
+ v_float32x8 abc1 = v256_combine_diagonal(abc1abc2, abc0abc1);
+ v_float32x8 abc2 = v256_combine_diagonal(abc2abc0, abc1abc2);
+
+ v_store(ptr, abc0);
+ v_store(ptr + 8, abc1);
+ v_store(ptr + 16, abc2);
+}
+
+template<typename _Tp, typename _Tpvec>
+inline void v256_load_deinterleave_l8(const _Tp* ptr, _Tpvec& a, _Tpvec& b, _Tpvec& c)
+{
+ _Tpvec abc02 = v256_load(ptr);
+ _Tpvec abc1 = v256_load(ptr + _Tpvec::nlanes);
+ _Tpvec abc20 = v256_load(ptr + _Tpvec::nlanes * 2);
+
+ _Tpvec abc2 = v256_alignr_128(abc02, abc20);
+ _Tpvec abc0 = v256_combine_diagonal(abc02, abc20);
+
+ a = v256_blend<0b10010010>(abc0, abc1);
+ a = v256_blend<0b01000100>(a, abc2);
+
+ b = v256_blend<0b00100100>(abc0, abc1);
+ b = v256_blend<0b10011001>(b, abc2);
+
+ c = v256_blend<0b01001001>(abc0, abc1);
+ c = v256_blend<0b00100010>(c, abc2);
+
+ a = v256_shuffle<_MM_SHUFFLE(1, 2, 3, 0)>(a);
+ b = v256_shuffle<_MM_SHUFFLE(2, 3, 0, 1)>(b);
+ c = v256_shuffle<_MM_SHUFFLE(3, 0, 1, 2)>(c);
+}
+/////
+template<typename _Tp, typename _Tpvec>
+inline void v256_load_deinterleave_l8(const _Tp* ptr, _Tpvec& a, _Tpvec& b, _Tpvec& c, _Tpvec& d)
+{
+ _Tpvec ab0, ab1, cd0, cd1;
+ v256_load_deinterleave_l4(ptr, ab0, cd0, ab1, cd1);
+ v256_zip(ab0, ab1, a, b);
+ v256_zip(cd0, cd1, c, d);
+}
+
+template<typename _Tp, typename _Tpvec>
+inline void v256_store_interleave_l8(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, const _Tpvec& c, const _Tpvec& d)
+{
+ _Tpvec ac0, ac1, bd0, bd1;
+ v256_zip(a, c, ac0, ac1);
+ v256_zip(b, d, bd0, bd1);
+
+ _Tpvec abcd0, abcd1, abcd2, abcd3;
+ v256_zip(ac0, bd0, abcd0, abcd1);
+ v256_zip(ac1, bd1, abcd2, abcd3);
+
+ _Tpvec abcd01, abcd23, abcd45, abcd67;
+ v_recombine(abcd0, abcd1, abcd01, abcd45);
+ v_recombine(abcd2, abcd3, abcd23, abcd67);
+
+ v_store(ptr, abcd01);
+ v_store(ptr + _Tpvec::nlanes, abcd23);
+ v_store(ptr + _Tpvec::nlanes * 2, abcd45);
+ v_store(ptr + _Tpvec::nlanes * 3, abcd67);
+}
+
+OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_uint32x8, unsigned, l8)
+OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_int32x8, int, l8)
+OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_float32x8, float, l8)
+
+/* ******** ******** */
+//
+template<typename _Tp, typename _Tpvec>
+inline void v256_load_deinterleave_l16(const _Tp* ptr, _Tpvec& a, _Tpvec& b)
+{
+ const __m256i sep = _mm256_setr_epi8(
+ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
+ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15
+ );
+
+ _Tpvec ab0, ab1;
+ v_recombine(v256_load(ptr), v256_load(ptr + _Tpvec::nlanes), ab0, ab1);
+
+ __m256i a0b0 = _mm256_shuffle_epi8(ab0.val, sep);
+ __m256i a1b1 = _mm256_shuffle_epi8(ab1.val, sep);
+
+ a.val = _mm256_unpacklo_epi64(a0b0, a1b1);
+ b.val = _mm256_unpackhi_epi64(a0b0, a1b1);
+}
+///
+template<typename _Tp, typename _Tpvec>
+inline void v256_store_interleave_l16(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, const _Tpvec& c)
+{
+ v_uint32x8 ab0 = v_reinterpret_as_u32(v256_unpacklo(a, b));
+ v_uint32x8 ab1 = v_reinterpret_as_u32(v256_unpackhi(a, b));
+ v_uint32x8 bc0 = v_reinterpret_as_u32(v256_unpacklo(b, c));
+ v_uint32x8 bc1 = v_reinterpret_as_u32(v256_unpackhi(b, c));
+
+ v_uint32x8 cazg = v_reinterpret_as_u32(v256_blend<0b10101010>(c, a));
+ cazg = v256_shuffle<_MM_SHUFFLE(2, 1, 0, 3)>(cazg);
+
+ v_uint32x8 ac1ab1 = v256_blend<0b10101010>(ab1, bc1);
+ ac1ab1 = v256_shuffle<_MM_SHUFFLE(2, 1, 0, 3)>(ac1ab1);
+
+ v_uint32x8 abc001 = v256_blend<0b10101010>(ab0, cazg);
+ v_uint32x8 cabc0 = v256_blend<0b10101010>(cazg, bc0);
+
+ v_uint32x8 cabc1 = v256_unpacklo(cabc0, ac1ab1);
+ v_uint32x8 bcab0 = v256_unpackhi(cabc1, abc001);
+
+ v_uint64x4 abc01 = v256_unpacklo(v_reinterpret_as_u64(abc001), v_reinterpret_as_u64(bcab0));
+ v_uint64x4 abc21 = v256_unpackhi(v_reinterpret_as_u64(cabc0), v_reinterpret_as_u64(bcab0));
+ abc21 = v256_swap_halves(abc21);
+ v_uint64x4 abc12 = v_reinterpret_as_u64(v256_alignr_64(cabc1, ac1ab1));
+
+ v_uint64x4 abc0 = v256_combine_diagonal(abc01, abc21);
+ v_uint64x4 abc1 = v256_combine_diagonal(abc12, abc01);
+ v_uint64x4 abc2 = v256_combine_diagonal(abc21, abc12);
+
+ v_store(ptr, _Tpvec(abc0.val));
+ v_store(ptr + _Tpvec::nlanes, _Tpvec(abc1.val));
+ v_store(ptr + _Tpvec::nlanes * 2, _Tpvec(abc2.val));
+}
+// todo:
+template<typename _Tp, typename _Tpvec>
+inline void v256_load_deinterleave_l16(const _Tp*, _Tpvec&, _Tpvec&, _Tpvec&)
+{}
+////
+template<typename _Tp, typename _Tpvec>
+inline void v256_load_deinterleave_l16(const _Tp* ptr, _Tpvec& a, _Tpvec& b, _Tpvec& c, _Tpvec& d)
+{
+ _Tpvec ab0, ab1, cd0, cd1;
+ v256_load_deinterleave_l8(ptr, ab0, cd0, ab1, cd1);
+ v256_zip(ab0, ab1, a, b);
+ v256_zip(cd0, cd1, c, d);
+}
+
+template<typename _Tp, typename _Tpvec>
+inline void v256_store_interleave_l16(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, const _Tpvec& c, const _Tpvec& d)
+{ v256_store_interleave_l8(ptr, a, b, c, d); }
+
+OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_uint16x16, ushort, l16)
+OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_int16x16, short, l16)
+
+/* **************** **************** */
+//
+template<typename _Tp, typename _Tpvec>
+inline void v256_load_deinterleave_l32(const _Tp* ptr, _Tpvec& a, _Tpvec& b)
+{
+ const __m256i sep = _mm256_setr_epi8(
+ 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
+ 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
+ );
+
+ _Tpvec ab0, ab1;
+ v_recombine(v256_load(ptr), v256_load(ptr + _Tpvec::nlanes), ab0, ab1);
+
+ __m256i a0b0 = _mm256_shuffle_epi8(ab0.val, sep);
+ __m256i a1b1 = _mm256_shuffle_epi8(ab1.val, sep);
+
+ a.val = _mm256_unpacklo_epi64(a0b0, a1b1);
+ b.val = _mm256_unpackhi_epi64(a0b0, a1b1);
+}
+
+/// todo
+template<typename _Tp, typename _Tpvec>
+inline void v256_store_interleave_l32(_Tp*, const _Tpvec&, const _Tpvec&, const _Tpvec&)
+{}
+template<typename _Tp, typename _Tpvec>
+inline void v256_load_deinterleave_l32(const _Tp*, _Tpvec&, _Tpvec&, _Tpvec&)
+{}
+////
+template<typename _Tp, typename _Tpvec>
+inline void v256_load_deinterleave_l32(const _Tp* ptr, _Tpvec& a, _Tpvec& b, _Tpvec& c, _Tpvec& d)
+{
+ const __m256i sep = _mm256_setr_epi8(
+ 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
+ 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
+ );
+
+ _Tpvec abcd0, abcd1, abcd2, abcd3;
+ v_recombine(v256_load(ptr), v256_load(ptr + _Tpvec::nlanes * 2), abcd0, abcd1);
+ v_recombine(v256_load(ptr + _Tpvec::nlanes), v256_load(ptr + _Tpvec::nlanes * 3), abcd2, abcd3);
+
+ __m256i ab0cd0 = _mm256_shuffle_epi8(abcd0.val, sep);
+ __m256i ab1cd1 = _mm256_shuffle_epi8(abcd1.val, sep);
+ __m256i ab2cd2 = _mm256_shuffle_epi8(abcd2.val, sep);
+ __m256i ab3cd3 = _mm256_shuffle_epi8(abcd3.val, sep);
+
+ __m256i ab0 = _mm256_unpacklo_epi32(ab0cd0, ab1cd1);
+ __m256i ab1 = _mm256_unpacklo_epi32(ab2cd2, ab3cd3);
+ __m256i cd0 = _mm256_unpackhi_epi32(ab0cd0, ab1cd1);
+ __m256i cd1 = _mm256_unpackhi_epi32(ab2cd2, ab3cd3);
+
+ a.val = _mm256_unpacklo_epi64(ab0, ab1);
+ b.val = _mm256_unpackhi_epi64(ab0, ab1);
+ c.val = _mm256_unpacklo_epi64(cd0, cd1);
+ d.val = _mm256_unpackhi_epi64(cd0, cd1);
+}
+
+template<typename _Tp, typename _Tpvec>
+inline void v256_store_interleave_l32(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, const _Tpvec& c, const _Tpvec& d)
+{ v256_store_interleave_l8(ptr, a, b, c, d); }
+
+OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_uint8x32, uchar, l32)
+OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_int8x32, schar, l32)
+
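+// _mm256_zeroupper avoids AVX->SSE transition penalties when 256-bit code
+// is mixed with legacy 128-bit SSE code.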
+inline void v256_cleanup() { _mm256_zeroupper(); }
+
+//! @name Check SIMD256 support
+//! @{
+//! @brief Check CPU capability of SIMD operation
+static inline bool hasSIMD256()
+{
+ return (CV_CPU_HAS_SUPPORT_AVX2) ? true : false;
+}
+//! @}
+
+CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
+
+//! @endcond
+
+} // cv::
+
+#endif // OPENCV_HAL_INTRIN_AVX_HPP
{
//! @cond IGNORED
typedef _Tp lane_type;
- typedef v_reg<typename V_TypeTraits<_Tp>::int_type, n> int_vec;
- typedef v_reg<typename V_TypeTraits<_Tp>::abs_type, n> abs_vec;
enum { nlanes = n };
//! @endcond
/** @brief Multiply and add
-Returns \f$ a*b + c \f$
-For floating point types and signed 32bit int only. */
+ Returns \f$ a*b + c \f$
+ For floating-point types and signed 32-bit int only. */
template<typename _Tp, int n>
-inline v_reg<_Tp, n> v_muladd(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
- const v_reg<_Tp, n>& c)
+inline v_reg<_Tp, n> v_fma(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
+ const v_reg<_Tp, n>& c)
{
v_reg<_Tp, n> d;
for( int i = 0; i < n; i++ )
return d;
}
+/** @brief A synonym for v_fma */
+template<typename _Tp, int n>
+inline v_reg<_Tp, n> v_muladd(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
+ const v_reg<_Tp, n>& c)
+{
+ return v_fma(a, b, c);
+}
+
/** @brief Dot product of elements
Multiply values in two registers and sum adjacent result pairs.
@note The returned type is deduced from the passed pointer type, for example uchar ==> cv::v_uint8x16, int ==> cv::v_int32x4, etc.
*/
template<typename _Tp>
-inline v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes> v_load(const _Tp* ptr)
+inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_load(const _Tp* ptr)
{
- return v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes>(ptr);
+ return v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128>(ptr);
}
/** @brief Load register contents from memory (aligned)
similar to cv::v_load, but source memory block should be aligned (to 16-byte boundary)
*/
template<typename _Tp>
-inline v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes> v_load_aligned(const _Tp* ptr)
+inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_load_aligned(const _Tp* ptr)
{
- return v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes>(ptr);
+ return v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128>(ptr);
}
/** @brief Load 64 bits of data into the lower part (the high part is undefined).
@endcode
*/
template<typename _Tp>
-inline v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes> v_load_low(const _Tp* ptr)
+inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_load_low(const _Tp* ptr)
{
- v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes> c;
+ v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> c;
for( int i = 0; i < c.nlanes/2; i++ )
{
c.s[i] = ptr[i];
@endcode
*/
template<typename _Tp>
-inline v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes> v_load_halves(const _Tp* loptr, const _Tp* hiptr)
+inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_load_halves(const _Tp* loptr, const _Tp* hiptr)
{
- v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes> c;
+ v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> c;
for( int i = 0; i < c.nlanes/2; i++ )
{
c.s[i] = loptr[i];
@endcode
For 8-, 16-, 32-bit integer source types. */
template<typename _Tp>
-inline v_reg<typename V_TypeTraits<_Tp>::w_type, V_SIMD128Traits<_Tp>::nlanes / 2>
+inline v_reg<typename V_TypeTraits<_Tp>::w_type, V_TypeTraits<_Tp>::nlanes128 / 2>
v_load_expand(const _Tp* ptr)
{
typedef typename V_TypeTraits<_Tp>::w_type w_type;
- v_reg<w_type, V_SIMD128Traits<w_type>::nlanes> c;
+ v_reg<w_type, V_TypeTraits<w_type>::nlanes128> c;
for( int i = 0; i < c.nlanes; i++ )
{
c.s[i] = ptr[i];
@endcode
For 8-bit integer source types. */
template<typename _Tp>
-inline v_reg<typename V_TypeTraits<_Tp>::q_type, V_SIMD128Traits<_Tp>::nlanes / 4>
+inline v_reg<typename V_TypeTraits<_Tp>::q_type, V_TypeTraits<_Tp>::nlanes128 / 4>
v_load_expand_q(const _Tp* ptr)
{
typedef typename V_TypeTraits<_Tp>::q_type q_type;
- v_reg<q_type, V_SIMD128Traits<q_type>::nlanes> c;
+ v_reg<q_type, V_TypeTraits<q_type>::nlanes128> c;
for( int i = 0; i < c.nlanes; i++ )
{
c.s[i] = ptr[i];
return c;
}
+template<int n> inline v_reg<float, n*2> v_cvt_f32(const v_reg<double, n>& a, const v_reg<double, n>& b)
+{
+ v_reg<float, n*2> c;
+ for( int i = 0; i < n; i++ )
+ {
+ c.s[i] = (float)a.s[i];
+ c.s[i+n] = (float)b.s[i];
+ }
+ return c;
+}
+
/** @brief Convert to double
Supported input type is cv::v_int32x4. */
return c;
}
+template<int n> inline v_reg<int, n> v_lut(const int* tab, const v_reg<int, n>& idx)
+{
+ v_reg<int, n> c;
+ for( int i = 0; i < n; i++ )
+ c.s[i] = tab[idx.s[i]];
+ return c;
+}
+
+template<int n> inline v_reg<float, n> v_lut(const float* tab, const v_reg<int, n>& idx)
+{
+ v_reg<float, n> c;
+ for( int i = 0; i < n; i++ )
+ c.s[i] = tab[idx.s[i]];
+ return c;
+}
+
+template<int n> inline v_reg<double, n> v_lut(const double* tab, const v_reg<int, n*2>& idx)
+{
+ v_reg<double, n> c;
+ for( int i = 0; i < n; i++ )
+ c.s[i] = tab[idx.s[i]];
+ return c;
+}
+
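+// note: for v_lut_deinterleave, `tab` is expected to hold interleaved
+// (x, y) pairs and `idx` the element offsets of the x components.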
+template<int n> inline void v_lut_deinterleave(const float* tab, const v_reg<int, n>& idx,
+ v_reg<float, n>& x, v_reg<float, n>& y)
+{
+ for( int i = 0; i < n; i++ )
+ {
+ int j = idx.s[i];
+ x.s[i] = tab[j];
+ y.s[i] = tab[j+1];
+ }
+}
+
+template<int n> inline void v_lut_deinterleave(const double* tab, const v_reg<int, n*2>& idx,
+ v_reg<double, n>& x, v_reg<double, n>& y)
+{
+ for( int i = 0; i < n; i++ )
+ {
+ int j = idx.s[i];
+ x.s[i] = tab[j];
+ y.s[i] = tab[j+1];
+ }
+}
+
/** @brief Transpose 4x4 matrix
Scheme:
v.s[0]*m0.s[3] + v.s[1]*m1.s[3] + v.s[2]*m2.s[3] + m3.s[3]);
}
+inline void v_cleanup() {}
+
//! @}
//! @name Check SIMD support
#if CV_FP16
// Workaround for old compilers
-template <typename T> static inline int16x4_t vreinterpret_s16_f16(T a)
-{ return (int16x4_t)a; }
-template <typename T> static inline float16x4_t vreinterpret_f16_s16(T a)
-{ return (float16x4_t)a; }
-template <typename T> static inline float16x4_t cv_vld1_f16(const T* ptr)
+static inline int16x8_t vreinterpretq_s16_f16(float16x8_t a) { return (int16x8_t)a; }
+static inline float16x8_t vreinterpretq_f16_s16(int16x8_t a) { return (float16x8_t)a; }
+static inline int16x4_t vreinterpret_s16_f16(float16x4_t a) { return (int16x4_t)a; }
+static inline float16x4_t vreinterpret_f16_s16(int16x4_t a) { return (float16x4_t)a; }
+
+static inline float16x8_t cv_vld1q_f16(const void* ptr)
+{
+#ifndef vld1q_f16 // APPLE compiler defines vld1q_f16 as macro
+ return vreinterpretq_f16_s16(vld1q_s16((const short*)ptr));
+#else
+ return vld1q_f16((const __fp16*)ptr);
+#endif
+}
+static inline void cv_vst1q_f16(void* ptr, float16x8_t a)
+{
+#ifndef vst1q_f16 // APPLE compiler defines vst1q_f16 as macro
+ vst1q_s16((short*)ptr, vreinterpretq_s16_f16(a));
+#else
+ vst1q_f16((__fp16*)ptr, a);
+#endif
+}
+
+static inline float16x4_t cv_vld1_f16(const void* ptr)
{
#ifndef vld1_f16 // APPLE compiler defines vld1_f16 as macro
return vreinterpret_f16_s16(vld1_s16((const short*)ptr));
return vld1_f16((const __fp16*)ptr);
#endif
}
-template <typename T> static inline void cv_vst1_f16(T* ptr, float16x4_t a)
+static inline void cv_vst1_f16(void* ptr, float16x4_t a)
{
#ifndef vst1_f16 // APPLE compiler defines vst1_f16 as macro
vst1_s16((short*)ptr, vreinterpret_s16_f16(a));
#endif
}
-struct v_float16x4
+
+struct v_float16x8
{
typedef short lane_type;
- enum { nlanes = 4 };
+ enum { nlanes = 8 };
- v_float16x4() {}
- explicit v_float16x4(float16x4_t v) : val(v) {}
- v_float16x4(short v0, short v1, short v2, short v3)
+ v_float16x8() {}
+ explicit v_float16x8(float16x8_t v) : val(v) {}
+ v_float16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
{
- short v[] = {v0, v1, v2, v3};
- val = cv_vld1_f16(v);
+ short v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
+ val = cv_vld1q_f16(v);
}
short get0() const
{
- return vget_lane_s16(vreinterpret_s16_f16(val), 0);
+ return vgetq_lane_s16(vreinterpretq_s16_f16(val), 0);
}
- float16x4_t val;
+ float16x8_t val;
};
+
+inline v_float16x8 v_setzero_f16() { return v_float16x8(vreinterpretq_f16_s16(vdupq_n_s16((short)0))); }
+inline v_float16x8 v_setall_f16(short v) { return v_float16x8(vreinterpretq_f16_s16(vdupq_n_s16(v))); }
#endif
#define OPENCV_HAL_IMPL_NEON_INIT(_Tpv, _Tp, suffix) \
return v_float32x4(vmlaq_f32(vmulq_f32(a.val, a.val), b.val, b.val));
}
-inline v_float32x4 v_muladd(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
+inline v_float32x4 v_fma(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
{
+#if CV_SIMD128_64F
+ // ARMv8, which adds support for 64-bit floating-point (so CV_SIMD128_64F is defined),
+ // also adds FMA support both for single- and double-precision floating-point vectors
+ return v_float32x4(vfmaq_f32(c.val, a.val, b.val));
+#else
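+ // vmlaq_f32 is a plain multiply-accumulate; on ARMv7 it may not be fused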
return v_float32x4(vmlaq_f32(c.val, a.val, b.val));
+#endif
}
-inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
+inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
{
return v_int32x4(vmlaq_s32(c.val, a.val, b.val));
}
+inline v_float32x4 v_muladd(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
+{
+ return v_fma(a, b, c);
+}
+
+inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
+{
+ return v_fma(a, b, c);
+}
+
#if CV_SIMD128_64F
inline v_float64x2 v_magnitude(const v_float64x2& a, const v_float64x2& b)
{
return v_float64x2(vaddq_f64(vmulq_f64(a.val, a.val), vmulq_f64(b.val, b.val)));
}
+inline v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
+{
+ return v_float64x2(vfmaq_f64(c.val, a.val, b.val));
+}
+
inline v_float64x2 v_muladd(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
{
- return v_float64x2(vaddq_f64(c.val, vmulq_f64(a.val, b.val)));
+ return v_fma(a, b, c);
}
#endif
#if CV_FP16
// Workaround for old compilers
-inline v_float16x4 v_load_f16(const short* ptr)
-{ return v_float16x4(cv_vld1_f16(ptr)); }
-inline void v_store_f16(short* ptr, v_float16x4& a)
-{ cv_vst1_f16(ptr, a.val); }
+inline v_float16x8 v_load_f16(const short* ptr)
+{ return v_float16x8(cv_vld1q_f16(ptr)); }
+inline v_float16x8 v_load_f16_aligned(const short* ptr)
+{ return v_float16x8(cv_vld1q_f16(ptr)); }
+
+inline void v_store(short* ptr, const v_float16x8& a)
+{ cv_vst1q_f16(ptr, a.val); }
+inline void v_store_aligned(short* ptr, const v_float16x8& a)
+{ cv_vst1q_f16(ptr, a.val); }
#endif
#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
return v_float32x4(vcombine_f32(vcvt_f32_f64(a.val), zero));
}
+inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b)
+{
+ return v_float32x4(vcombine_f32(vcvt_f32_f64(a.val), vcvt_f32_f64(b.val)));
+}
+
inline v_float64x2 v_cvt_f64(const v_int32x4& a)
{
return v_float64x2(vcvt_f64_f32(vcvt_f32_s32(vget_low_s32(a.val))));
#endif
#if CV_FP16
-inline v_float32x4 v_cvt_f32(const v_float16x4& a)
+inline v_float32x4 v_cvt_f32(const v_float16x8& a)
+{
+ return v_float32x4(vcvt_f32_f16(vget_low_f16(a.val)));
+}
+inline v_float32x4 v_cvt_f32_high(const v_float16x8& a)
{
- return v_float32x4(vcvt_f32_f16(a.val));
+ return v_float32x4(vcvt_f32_f16(vget_high_f16(a.val)));
}
-inline v_float16x4 v_cvt_f16(const v_float32x4& a)
+inline v_float16x8 v_cvt_f16(const v_float32x4& a, const v_float32x4& b)
{
- return v_float16x4(vcvt_f16_f32(a.val));
+ return v_float16x8(vcombine_f16(vcvt_f16_f32(a.val), vcvt_f16_f32(b.val)));
}
#endif
+////////////// Lookup table access ////////////////////
+
+inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
+{
+ int CV_DECL_ALIGNED(32) elems[4] =
+ {
+ tab[vgetq_lane_s32(idxvec.val, 0)],
+ tab[vgetq_lane_s32(idxvec.val, 1)],
+ tab[vgetq_lane_s32(idxvec.val, 2)],
+ tab[vgetq_lane_s32(idxvec.val, 3)]
+ };
+ return v_int32x4(vld1q_s32(elems));
+}
+
+inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
+{
+ float CV_DECL_ALIGNED(32) elems[4] =
+ {
+ tab[vgetq_lane_s32(idxvec.val, 0)],
+ tab[vgetq_lane_s32(idxvec.val, 1)],
+ tab[vgetq_lane_s32(idxvec.val, 2)],
+ tab[vgetq_lane_s32(idxvec.val, 3)]
+ };
+ return v_float32x4(vld1q_f32(elems));
+}
+
+inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y)
+{
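+ // alternative vuzpq_f32-based implementation (commented out below):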
+ /*int CV_DECL_ALIGNED(32) idx[4];
+ v_store(idx, idxvec);
+
+ float32x4_t xy02 = vcombine_f32(vld1_f32(tab + idx[0]), vld1_f32(tab + idx[2]));
+ float32x4_t xy13 = vcombine_f32(vld1_f32(tab + idx[1]), vld1_f32(tab + idx[3]));
+
+ float32x4x2_t xxyy = vuzpq_f32(xy02, xy13);
+ x = v_float32x4(xxyy.val[0]);
+ y = v_float32x4(xxyy.val[1]);*/
+ int CV_DECL_ALIGNED(32) idx[4];
+ v_store_aligned(idx, idxvec);
+
+ x = v_float32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]);
+ y = v_float32x4(tab[idx[0]+1], tab[idx[1]+1], tab[idx[2]+1], tab[idx[3]+1]);
+}
+
+#if CV_SIMD128_64F
+inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
+{
+ double CV_DECL_ALIGNED(32) elems[2] =
+ {
+ tab[vgetq_lane_s32(idxvec.val, 0)],
+ tab[vgetq_lane_s32(idxvec.val, 1)],
+ };
+ return v_float64x2(vld1q_f64(elems));
+}
+
+inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y)
+{
+ int CV_DECL_ALIGNED(32) idx[4];
+ v_store_aligned(idx, idxvec);
+
+ x = v_float64x2(tab[idx[0]], tab[idx[1]]);
+ y = v_float64x2(tab[idx[0]+1], tab[idx[1]+1]);
+}
+#endif
+
+inline void v_cleanup() {}
+
//! @name Check SIMD support
//! @{
//! @brief Check CPU capability of SIMD operation
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
+struct v_uint8x16;
+struct v_int8x16;
+struct v_uint16x8;
+struct v_int16x8;
+struct v_uint32x4;
+struct v_int32x4;
+struct v_float32x4;
+struct v_uint64x2;
+struct v_int64x2;
+struct v_float64x2;
+
struct v_uint8x16
{
typedef uchar lane_type;
{
return (short)_mm_cvtsi128_si32(val);
}
+
__m128i val;
};
{
return (unsigned)_mm_cvtsi128_si32(val);
}
+
__m128i val;
};
{
return _mm_cvtsi128_si32(val);
}
+
__m128i val;
};
{
return _mm_cvtss_f32(val);
}
+
__m128 val;
};
int b = _mm_cvtsi128_si32(_mm_srli_epi64(val, 32));
return (unsigned)a | ((uint64)(unsigned)b << 32);
}
+
__m128i val;
};
int b = _mm_cvtsi128_si32(_mm_srli_epi64(val, 32));
return (int64)((unsigned)a | ((uint64)(unsigned)b << 32));
}
+
__m128i val;
};
{
return _mm_cvtsd_f64(val);
}
+
__m128d val;
};
-#if CV_FP16
-struct v_float16x4
+struct v_float16x8
{
typedef short lane_type;
typedef __m128i vector_type;
- enum { nlanes = 4 };
+ enum { nlanes = 8 };
- v_float16x4() : val(_mm_setzero_si128()) {}
- explicit v_float16x4(__m128i v) : val(v) {}
- v_float16x4(short v0, short v1, short v2, short v3)
+ v_float16x8() : val(_mm_setzero_si128()) {}
+ explicit v_float16x8(__m128i v) : val(v) {}
+ v_float16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
{
- val = _mm_setr_epi16(v0, v1, v2, v3, 0, 0, 0, 0);
+ val = _mm_setr_epi16(v0, v1, v2, v3, v4, v5, v6, v7);
}
short get0() const
{
return (short)_mm_cvtsi128_si32(val);
}
+
__m128i val;
};
-#endif
+inline v_float16x8 v_setzero_f16() { return v_float16x8(_mm_setzero_si128()); }
+inline v_float16x8 v_setall_f16(short val) { return v_float16x8(_mm_set1_epi16(val)); }
namespace hal_sse_internal
{
}
inline v_int32x4 operator * (const v_int32x4& a, const v_int32x4& b)
{
+#if CV_SSE4_1
+ return v_int32x4(_mm_mullo_epi32(a.val, b.val));
+#else
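+ // SSE2 fallback: _mm_mul_epu32 multiplies lanes 0 and 2; the odd lanes
+ // are obtained by shifting right 32 bits, then the unpacks restore order.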
__m128i c0 = _mm_mul_epu32(a.val, b.val);
__m128i c1 = _mm_mul_epu32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32));
__m128i d0 = _mm_unpacklo_epi32(c0, c1);
__m128i d1 = _mm_unpackhi_epi32(c0, c1);
return v_int32x4(_mm_unpacklo_epi64(d0, d1));
+#endif
}
inline v_uint32x4& operator *= (v_uint32x4& a, const v_uint32x4& b)
{
__m128i m = _mm_cmpgt_epi32(b.val, a.val);
return v_uint32x4(_mm_sub_epi32(_mm_xor_si128(d, m), m));
}
-inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
+
+inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
{
return a * b + c;
}
+inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
+{
+ return v_fma(a, b, c);
+}
+
+inline v_float32x4 v_fma(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
+{
+#if CV_FMA3
+ return v_float32x4(_mm_fmadd_ps(a.val, b.val, c.val));
+#else
+ return v_float32x4(_mm_add_ps(_mm_mul_ps(a.val, b.val), c.val));
+#endif
+}
+
+inline v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
+{
+#if CV_FMA3
+ return v_float64x2(_mm_fmadd_pd(a.val, b.val, c.val));
+#else
+ return v_float64x2(_mm_add_pd(_mm_mul_pd(a.val, b.val), c.val));
+#endif
+}
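A minimal usage sketch of the contract above (the function name fma_demo is illustrative, not part of the API): v_fma(a, b, c) yields a*b + c in every lane, fused into one instruction when CV_FMA3 is set and expanded into a multiply plus add otherwise, so the two paths may differ by one rounding step.

    void fma_demo()
    {
        v_float32x4 a = v_setall_f32(2.f), b = v_setall_f32(3.f), c = v_setall_f32(1.f);
        v_float32x4 r = v_fma(a, b, c);    // each lane holds 2*3 + 1 = 7
        float CV_DECL_ALIGNED(16) buf[4];
        v_store_aligned(buf, r);           // buf[0..3] == 7.f
    }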
+
#define OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(_Tpvec, _Tp, _Tpreg, suffix, absmask_vec) \
inline _Tpvec v_absdiff(const _Tpvec& a, const _Tpvec& b) \
{ \
} \
inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \
{ \
- _Tpreg res = _mm_add_##suffix(_mm_mul_##suffix(a.val, a.val), _mm_mul_##suffix(b.val, b.val)); \
- return _Tpvec(_mm_sqrt_##suffix(res)); \
+ _Tpvec res = v_fma(a, a, b*b); \
+ return _Tpvec(_mm_sqrt_##suffix(res.val)); \
} \
inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \
{ \
- _Tpreg res = _mm_add_##suffix(_mm_mul_##suffix(a.val, a.val), _mm_mul_##suffix(b.val, b.val)); \
- return _Tpvec(res); \
+ return v_fma(a, a, b*b); \
} \
inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
{ \
- return _Tpvec(_mm_add_##suffix(_mm_mul_##suffix(a.val, b.val), c.val)); \
+ return v_fma(a, b, c); \
}
OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(v_float32x4, float, __m128, ps, _mm_set1_epi32((int)0x7fffffff))
OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(v_float32x4, float, ps)
OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(v_float64x2, double, pd)
-#if CV_FP16
-inline v_float16x4 v_load_f16(const short* ptr)
-{ return v_float16x4(_mm_loadl_epi64((const __m128i*)ptr)); }
-inline void v_store_f16(short* ptr, v_float16x4& a)
-{ _mm_storel_epi64((__m128i*)ptr, a.val); }
-#endif
+inline v_float16x8 v_load_f16(const short* ptr)
+{ return v_float16x8(_mm_loadu_si128((const __m128i*)ptr)); }
+inline v_float16x8 v_load_f16_aligned(const short* ptr)
+{ return v_float16x8(_mm_load_si128((const __m128i*)ptr)); }
+
+inline void v_store(short* ptr, const v_float16x8& a)
+{ _mm_storeu_si128((__m128i*)ptr, a.val); }
+inline void v_store_aligned(short* ptr, const v_float16x8& a)
+{ _mm_store_si128((__m128i*)ptr, a.val); }
#define OPENCV_HAL_IMPL_SSE_REDUCE_OP_8(_Tpvec, scalartype, func, suffix, sbit) \
inline scalartype v_reduce_##func(const v_##_Tpvec& a) \
return v_float32x4(_mm_cvtpd_ps(a.val));
}
+inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b)
+{
+ return v_float32x4(_mm_movelh_ps(_mm_cvtpd_ps(a.val), _mm_cvtpd_ps(b.val)));
+}
+
inline v_float64x2 v_cvt_f64(const v_int32x4& a)
{
return v_float64x2(_mm_cvtepi32_pd(a.val));
inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
{
- return v_float64x2(_mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(a.val),8))));
+ return v_float64x2(_mm_cvtps_pd(_mm_movehl_ps(a.val, a.val)));
}
#if CV_FP16
-inline v_float32x4 v_cvt_f32(const v_float16x4& a)
+inline v_float32x4 v_cvt_f32(const v_float16x8& a)
{
return v_float32x4(_mm_cvtph_ps(a.val));
}
-inline v_float16x4 v_cvt_f16(const v_float32x4& a)
+inline v_float32x4 v_cvt_f32_high(const v_float16x8& a)
+{
+ return v_float32x4(_mm_cvtph_ps(_mm_unpackhi_epi64(a.val, a.val)));
+}
+
+inline v_float16x8 v_cvt_f16(const v_float32x4& a, const v_float32x4& b)
{
- return v_float16x4(_mm_cvtps_ph(a.val, 0));
+ return v_float16x8(_mm_unpacklo_epi64(_mm_cvtps_ph(a.val, 0), _mm_cvtps_ph(b.val, 0)));
}
#endif
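A hedged sketch of the intended FP16 round trip, valid only under CV_FP16 (process_fp16 and the squaring step are illustrative): load eight packed half floats, widen to two v_float32x4 halves, do the math in float, then narrow back with the two-argument v_cvt_f16.

    void process_fp16(const short* src, short* dst)
    {
        v_float16x8 h = v_load_f16(src);        // 8 packed half-precision lanes
        v_float32x4 lo = v_cvt_f32(h);          // lanes 0..3 widened to float
        v_float32x4 hi = v_cvt_f32_high(h);     // lanes 4..7 widened to float
        lo = lo * lo; hi = hi * hi;             // any float arithmetic here
        v_store(dst, v_cvt_f16(lo, hi));        // narrow both halves back
    }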
+////////////// Lookup table access ////////////////////
+
+inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
+{
+ int CV_DECL_ALIGNED(32) idx[4];
+ v_store_aligned(idx, idxvec);
+ return v_int32x4(_mm_setr_epi32(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]));
+}
+
+inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
+{
+ int CV_DECL_ALIGNED(32) idx[4];
+ v_store_aligned(idx, idxvec);
+ return v_float32x4(_mm_setr_ps(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]));
+}
+
+inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
+{
+ int idx[2];
+ v_store_low(idx, idxvec);
+ return v_float64x2(_mm_setr_pd(tab[idx[0]], tab[idx[1]]));
+}
+
+// loads pairs from the table and deinterleaves them, e.g. returns:
+// x = (tab[idxvec[0]], tab[idxvec[1]], tab[idxvec[2]], tab[idxvec[3]]),
+// y = (tab[idxvec[0]+1], tab[idxvec[1]+1], tab[idxvec[2]+1], tab[idxvec[3]+1])
+// note that the indices are float indices, not float-pair indices.
+// in theory, this function can be used to implement bilinear interpolation,
+// when idxvec contains the offsets within the image.
+inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y)
+{
+ int CV_DECL_ALIGNED(32) idx[4];
+ v_store_aligned(idx, idxvec);
+ __m128 z = _mm_setzero_ps();
+ __m128 xy01 = _mm_loadl_pi(z, (__m64*)(tab + idx[0]));
+ __m128 xy23 = _mm_loadl_pi(z, (__m64*)(tab + idx[2]));
+ xy01 = _mm_loadh_pi(xy01, (__m64*)(tab + idx[1]));
+ xy23 = _mm_loadh_pi(xy23, (__m64*)(tab + idx[3]));
+ __m128 xxyy02 = _mm_unpacklo_ps(xy01, xy23);
+ __m128 xxyy13 = _mm_unpackhi_ps(xy01, xy23);
+ x = v_float32x4(_mm_unpacklo_ps(xxyy02, xxyy13));
+ y = v_float32x4(_mm_unpackhi_ps(xxyy02, xxyy13));
+}
+
+inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y)
+{
+ int idx[2];
+ v_store_low(idx, idxvec);
+ __m128d xy0 = _mm_loadu_pd(tab + idx[0]);
+ __m128d xy1 = _mm_loadu_pd(tab + idx[1]);
+ x = v_float64x2(_mm_unpacklo_pd(xy0, xy1));
+ y = v_float64x2(_mm_unpackhi_pd(xy0, xy1));
+}
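Following the comment above, a rough sketch of how the float overload could drive horizontal bilinear interpolation (bilin_row, ofs and w are illustrative names; ofs holds per-lane offsets into the row, w the fractional weights):

    void bilin_row(const float* row, const v_int32x4& ofs, const v_float32x4& w, float* out)
    {
        v_float32x4 p0, p1;                    // left/right neighbours per lane
        v_lut_deinterleave(row, ofs, p0, p1);
        v_store(out, v_fma(p1 - p0, w, p0));   // p0 + w*(p1 - p0)
    }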
+
+inline void v_cleanup() {}
+
//! @name Check SIMD support
//! @{
//! @brief Check CPU capability of SIMD operation
{ return _Tpvec(vec_sqrt(vec_madd(a.val, a.val, vec_mul(b.val, b.val)))); } \
inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(vec_madd(a.val, a.val, vec_mul(b.val, b.val))); } \
+inline _Tpvec v_fma(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
+{ return _Tpvec(vec_madd(a.val, b.val, c.val)); } \
inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
{ return _Tpvec(vec_madd(a.val, b.val, c.val)); }
inline v_float32x4 v_cvt_f32(const v_float64x2& a)
{ return v_float32x4(vec_mergesqo(vec_cvfo(a.val), vec_float4_z)); }
+inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b)
+{ return v_float32x4(vec_mergesqo(vec_cvfo(a.val), vec_cvfo(b.val))); }
+
inline v_float64x2 v_cvt_f64(const v_int32x4& a)
{ return v_float64x2(vec_ctdo(vec_mergeh(a.val, a.val))); }
inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
{ return v_float64x2(vec_cvfo(vec_mergel(a.val, a.val))); }
+////////////// Lookup table access ////////////////////
+
+inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
+{
+ int CV_DECL_ALIGNED(32) idx[4];
+ v_store_aligned(idx, idxvec);
+ return v_int32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]);
+}
+
+inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
+{
+ int CV_DECL_ALIGNED(32) idx[4];
+ v_store_aligned(idx, idxvec);
+ return v_float32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]);
+}
+
+inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
+{
+ int CV_DECL_ALIGNED(32) idx[4];
+ v_store_aligned(idx, idxvec);
+ return v_float64x2(tab[idx[0]], tab[idx[1]]);
+}
+
+inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y)
+{
+ int CV_DECL_ALIGNED(32) idx[4];
+ v_store_aligned(idx, idxvec);
+ x = v_float32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]);
+ y = v_float32x4(tab[idx[0]+1], tab[idx[1]+1], tab[idx[2]+1], tab[idx[3]+1]);
+}
+
+inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y)
+{
+ int CV_DECL_ALIGNED(32) idx[4];
+ v_store_aligned(idx, idxvec);
+ x = v_float64x2(tab[idx[0]], tab[idx[1]]);
+ y = v_float64x2(tab[idx[0]+1], tab[idx[1]+1]);
+}
+
+inline void v_cleanup() {}
+
/** Reinterpret **/
/** it's up there with the load and store operations **/
The method creates a full copy of the array. The original step[] is not taken into account. So, the
array copy is a continuous array occupying total()*elemSize() bytes.
*/
- Mat clone() const;
+ Mat clone() const CV_NODISCARD;
/** @brief Copies the matrix to another one.
Mat_ row(int y) const;
Mat_ col(int x) const;
Mat_ diag(int d=0) const;
- Mat_ clone() const;
+ Mat_ clone() const CV_NODISCARD;
//! overridden forms of Mat::elemSize() etc.
size_t elemSize() const;
static UMat diag(const UMat& d);
//! returns deep copy of the matrix, i.e. the data is copied
- UMat clone() const;
+ UMat clone() const CV_NODISCARD;
//! copies the matrix content to "m".
// It calls m.create(this->size(), this->type()).
void copyTo( OutputArray m ) const;
SparseMat& operator = (const Mat& m);
//! creates full copy of the matrix
- SparseMat clone() const;
+ SparseMat clone() const CV_NODISCARD;
//! copies all the data to the destination matrix. All the previous content of m is erased
void copyTo( SparseMat& m ) const;
SparseMat_& operator = (const Mat& m);
//! makes full copy of the matrix. All the elements are duplicated
- SparseMat_ clone() const;
+ SparseMat_ clone() const CV_NODISCARD;
//! equivalent to cv::SparseMat::create(dims, _sizes, DataType<_Tp>::type)
void create(int dims, const int* _sizes);
//! converts sparse matrix to the old-style CvSparseMat. All the elements are copied
////////////////////////////// Small Matrix ///////////////////////////
//! @cond IGNORED
-struct CV_EXPORTS Matx_AddOp {};
-struct CV_EXPORTS Matx_SubOp {};
-struct CV_EXPORTS Matx_ScaleOp {};
-struct CV_EXPORTS Matx_MulOp {};
-struct CV_EXPORTS Matx_DivOp {};
-struct CV_EXPORTS Matx_MatMulOp {};
-struct CV_EXPORTS Matx_TOp {};
+// FIXIT Remove this (especially CV_EXPORTS modifier)
+struct CV_EXPORTS Matx_AddOp { Matx_AddOp() {} Matx_AddOp(const Matx_AddOp&) {} };
+struct CV_EXPORTS Matx_SubOp { Matx_SubOp() {} Matx_SubOp(const Matx_SubOp&) {} };
+struct CV_EXPORTS Matx_ScaleOp { Matx_ScaleOp() {} Matx_ScaleOp(const Matx_ScaleOp&) {} };
+struct CV_EXPORTS Matx_MulOp { Matx_MulOp() {} Matx_MulOp(const Matx_MulOp&) {} };
+struct CV_EXPORTS Matx_DivOp { Matx_DivOp() {} Matx_DivOp(const Matx_DivOp&) {} };
+struct CV_EXPORTS Matx_MatMulOp { Matx_MatMulOp() {} Matx_MatMulOp(const Matx_MatMulOp&) {} };
+struct CV_EXPORTS Matx_TOp { Matx_TOp() {} Matx_TOp(const Matx_TOp&) {} };
//! @endcond
/** @brief Template class for small matrices whose type and size are known at compilation time
//! default constructor
Matx();
- Matx(_Tp v0); //!< 1x1 matrix
+ explicit Matx(_Tp v0); //!< 1x1 matrix
Matx(_Tp v0, _Tp v1); //!< 1x2 or 2x1 matrix
Matx(_Tp v0, _Tp v1, _Tp v2); //!< 1x3 or 3x1 matrix
Matx(_Tp v0, _Tp v1, _Tp v2, _Tp v3); //!< 1x4, 2x2 or 4x1 matrix
template<typename _Tp, int m, int n> struct Matx_FastInvOp
{
- bool operator()(const Matx<_Tp, m, n>&, Matx<_Tp, n, m>&, int) const
+ bool operator()(const Matx<_Tp, m, n>& a, Matx<_Tp, n, m>& b, int method) const
{
- return false;
+ return invert(a, b, method) != 0;
}
};
{
bool operator()(const Matx<_Tp, m, m>& a, Matx<_Tp, m, m>& b, int method) const
{
- Matx<_Tp, m, m> temp = a;
+ if (method == DECOMP_LU || method == DECOMP_CHOLESKY)
+ {
+ Matx<_Tp, m, m> temp = a;
- // assume that b is all 0's on input => make it a unity matrix
- for( int i = 0; i < m; i++ )
- b(i, i) = (_Tp)1;
+ // assume that b is all 0's on input => make it an identity matrix
+ for (int i = 0; i < m; i++)
+ b(i, i) = (_Tp)1;
- if( method == DECOMP_CHOLESKY )
- return Cholesky(temp.val, m*sizeof(_Tp), m, b.val, m*sizeof(_Tp), m);
+ if (method == DECOMP_CHOLESKY)
+ return Cholesky(temp.val, m*sizeof(_Tp), m, b.val, m*sizeof(_Tp), m);
- return LU(temp.val, m*sizeof(_Tp), m, b.val, m*sizeof(_Tp), m) != 0;
+ return LU(temp.val, m*sizeof(_Tp), m, b.val, m*sizeof(_Tp), m) != 0;
+ }
+ else
+ {
+ return invert(a, b, method) != 0;
+ }
}
};
template<typename _Tp> struct Matx_FastInvOp<_Tp, 2, 2>
{
- bool operator()(const Matx<_Tp, 2, 2>& a, Matx<_Tp, 2, 2>& b, int) const
+ bool operator()(const Matx<_Tp, 2, 2>& a, Matx<_Tp, 2, 2>& b, int /*method*/) const
{
_Tp d = (_Tp)determinant(a);
- if( d == 0 )
+ if (d == 0)
return false;
d = 1/d;
b(1,1) = a(0,0)*d;
template<typename _Tp> struct Matx_FastInvOp<_Tp, 3, 3>
{
- bool operator()(const Matx<_Tp, 3, 3>& a, Matx<_Tp, 3, 3>& b, int) const
+ bool operator()(const Matx<_Tp, 3, 3>& a, Matx<_Tp, 3, 3>& b, int /*method*/) const
{
_Tp d = (_Tp)determinant(a);
- if( d == 0 )
+ if (d == 0)
return false;
d = 1/d;
b(0,0) = (a(1,1) * a(2,2) - a(1,2) * a(2,1)) * d;
template<typename _Tp, int m, int l, int n> struct Matx_FastSolveOp
{
- bool operator()(const Matx<_Tp, m, l>&, const Matx<_Tp, m, n>&,
- Matx<_Tp, l, n>&, int) const
+ bool operator()(const Matx<_Tp, m, l>& a, const Matx<_Tp, m, n>& b,
+ Matx<_Tp, l, n>& x, int method) const
{
- return false;
+ return cv::solve(a, b, x, method);
}
};
bool operator()(const Matx<_Tp, m, m>& a, const Matx<_Tp, m, n>& b,
Matx<_Tp, m, n>& x, int method) const
{
- Matx<_Tp, m, m> temp = a;
- x = b;
- if( method == DECOMP_CHOLESKY )
- return Cholesky(temp.val, m*sizeof(_Tp), m, x.val, n*sizeof(_Tp), n);
+ if (method == DECOMP_LU || method == DECOMP_CHOLESKY)
+ {
+ Matx<_Tp, m, m> temp = a;
+ x = b;
+ if( method == DECOMP_CHOLESKY )
+ return Cholesky(temp.val, m*sizeof(_Tp), m, x.val, n*sizeof(_Tp), n);
- return LU(temp.val, m*sizeof(_Tp), m, x.val, n*sizeof(_Tp), n) != 0;
+ return LU(temp.val, m*sizeof(_Tp), m, x.val, n*sizeof(_Tp), n) != 0;
+ }
+ else
+ {
+ return cv::solve(a, b, x, method);
+ }
}
};
Matx<_Tp, 2, 1>& x, int) const
{
_Tp d = (_Tp)determinant(a);
- if( d == 0 )
+ if (d == 0)
return false;
d = 1/d;
x(0) = (b(0)*a(1,1) - b(1)*a(0,1))*d;
Matx<_Tp, 3, 1>& x, int) const
{
_Tp d = (_Tp)determinant(a);
- if( d == 0 )
+ if (d == 0)
return false;
d = 1/d;
x(0) = d*(b(0)*(a(1,1)*a(2,2) - a(1,2)*a(2,1)) -
Matx<_Tp, n, m> Matx<_Tp, m, n>::inv(int method, bool *p_is_ok /*= NULL*/) const
{
Matx<_Tp, n, m> b;
- bool ok;
- if (method == DECOMP_LU || method == DECOMP_CHOLESKY)
- {
- CV_Assert(m == n);
- ok = cv::internal::Matx_FastInvOp<_Tp, m, n>()(*this, b, method);
- }
- else
- {
- Mat A(*this, false), B(b, false);
- ok = (invert(A, B, method) != 0);
- }
- if( NULL != p_is_ok ) { *p_is_ok = ok; }
+ bool ok = cv::internal::Matx_FastInvOp<_Tp, m, n>()(*this, b, method);
+ if (p_is_ok) *p_is_ok = ok;
return ok ? b : Matx<_Tp, n, m>::zeros();
}
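The method dispatch that Matx::inv() and Matx::solve() previously did inline now lives in the Matx_FastInvOp/Matx_FastSolveOp functors, so non-LU/Cholesky methods still reach the general cv::invert/cv::solve. A small illustrative use (values arbitrary):

    Matx22f A(4.f, 7.f,
              2.f, 6.f);
    bool ok = false;
    Matx22f Ai = A.inv(DECOMP_LU, &ok);   // dedicated 2x2 path
    Matx22f As = A.inv(DECOMP_SVD);       // routed through cv::invert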
Matx<_Tp, n, l> Matx<_Tp, m, n>::solve(const Matx<_Tp, m, l>& rhs, int method) const
{
Matx<_Tp, n, l> x;
- bool ok;
- if (method == DECOMP_LU || method == DECOMP_CHOLESKY)
- {
- CV_Assert(m == n);
- ok = cv::internal::Matx_FastSolveOp<_Tp, m, n, l>()(*this, rhs, x, method);
- }
- else
- {
- Mat A(*this, false), B(rhs, false), X(x, false);
- ok = cv::solve(A, B, X, method);
- }
-
+ bool ok = cv::internal::Matx_FastSolveOp<_Tp, m, n, l>()(*this, rhs, x, method);
return ok ? x : Matx<_Tp, n, l>::zeros();
}
TEST_CYCLE()
{
- source.clone();
+ Mat tmp = source.clone();
+ (void)tmp;
}
destination = source.clone();
TEST_CYCLE()
{
- roi.clone();
+ Mat tmp = roi.clone();
+ (void)tmp;
}
destination = roi.clone();
for ( ; x <= size.width - cVectorWidth ; x += cVectorWidth)
{
float32x4_t v_src = vld1q_f32(src + x);
-
float16x4_t v_dst = vcvt_f16_f32(v_src);
- cv_vst1_f16((__fp16*)dst + x, v_dst);
+ cv_vst1_f16(dst + x, v_dst);
}
for ( ; x < size.width; x++ )
void log64f(const double *src, double *dst, int n);
float fastAtan2(float y, float x);
-
#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
using namespace std;
using namespace cv;
-#if CV_SIMD128
-
-template <typename T>
-struct v_atan
+static inline float atan_f32(float y, float x)
{
- typedef V_RegTrait128<T> Trait;
- typedef typename Trait::reg VT; // vector type
- enum { WorkWidth = VT::nlanes * 2 };
-
- v_atan(const T & scale)
- : s(Trait::all(scale))
+ float ax = std::abs(x), ay = std::abs(y);
+ float a, c, c2;
+ if( ax >= ay )
{
- eps = Trait::all(DBL_EPSILON);
- z = Trait::zero();
- p7 = Trait::all(atan2_p7);
- p5 = Trait::all(atan2_p5);
- p3 = Trait::all(atan2_p3);
- p1 = Trait::all(atan2_p1);
- val90 = Trait::all(90.f);
- val180 = Trait::all(180.f);
- val360 = Trait::all(360.f);
+ c = ay/(ax + (float)DBL_EPSILON);
+ c2 = c*c;
+ a = (((atan2_p7*c2 + atan2_p5)*c2 + atan2_p3)*c2 + atan2_p1)*c;
}
+ else
+ {
+ c = ax/(ay + (float)DBL_EPSILON);
+ c2 = c*c;
+ a = 90.f - (((atan2_p7*c2 + atan2_p5)*c2 + atan2_p3)*c2 + atan2_p1)*c;
+ }
+ if( x < 0 )
+ a = 180.f - a;
+ if( y < 0 )
+ a = 360.f - a;
+ return a;
+}
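The coefficients atan2_p1..atan2_p7 form the standard odd minimax approximation of atan on [0, 1], scaled to degrees: with c = min(|x|, |y|)/max(|x|, |y|),

    atan(c) ~ p1*c + p3*c^3 + p5*c^5 + p7*c^7   (in degrees),

and the 90/180/360 corrections fold the octant result back into the full [0, 360) range; the DBL_EPSILON in the denominator only guards against 0/0.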
- inline int operator()(int len, const T * Y, const T * X, T * angle)
+#if CV_SIMD
+
+struct v_atan_f32
+{
+ explicit v_atan_f32(const float& scale)
{
- int i = 0;
- const int c = VT::nlanes;
- for ( ; i <= len - c * 2; i += c * 2)
- {
- VT x1 = v_load(X + i);
- VT x2 = v_load(X + i + c);
- VT y1 = v_load(Y + i);
- VT y2 = v_load(Y + i + c);
- v_store(&angle[i], s * one(x1, y1));
- v_store(&angle[i + c], s * one(x2, y2));
- }
- return i;
+ eps = vx_setall_f32((float)DBL_EPSILON);
+ z = vx_setzero_f32();
+ p7 = vx_setall_f32(atan2_p7);
+ p5 = vx_setall_f32(atan2_p5);
+ p3 = vx_setall_f32(atan2_p3);
+ p1 = vx_setall_f32(atan2_p1);
+ val90 = vx_setall_f32(90.f);
+ val180 = vx_setall_f32(180.f);
+ val360 = vx_setall_f32(360.f);
+ s = vx_setall_f32(scale);
}
-private:
- inline VT one(VT & x, VT & y)
+ v_float32 compute(const v_float32& y, const v_float32& x)
{
- VT ax = v_abs(x);
- VT ay = v_abs(y);
- VT c = v_min(ax, ay) / (v_max(ax, ay) + eps);
- VT cc = c * c;
- VT a = (((p7 * cc + p5) * cc + p3) * cc + p1) * c;
+ v_float32 ax = v_abs(x);
+ v_float32 ay = v_abs(y);
+ v_float32 c = v_min(ax, ay) / (v_max(ax, ay) + eps);
+ v_float32 cc = c * c;
+ v_float32 a = v_fma(v_fma(v_fma(cc, p7, p5), cc, p3), cc, p1)*c;
a = v_select(ax >= ay, a, val90 - a);
a = v_select(x < z, val180 - a, a);
a = v_select(y < z, val360 - a, a);
- return a;
+ return a * s;
}
-private:
- VT eps;
- VT z;
- VT p7;
- VT p5;
- VT p3;
- VT p1;
- VT val90;
- VT val180;
- VT val360;
- VT s;
+ v_float32 eps;
+ v_float32 z;
+ v_float32 p7;
+ v_float32 p5;
+ v_float32 p3;
+ v_float32 p1;
+ v_float32 val90;
+ v_float32 val180;
+ v_float32 val360;
+ v_float32 s;
};
-#if !CV_SIMD128_64F
+#endif
+
+} // anonymous::
+
+///////////////////////////////////// ATAN2 ////////////////////////////////////
-// emulation
-template <>
-struct v_atan<double>
+static void fastAtan32f_(const float *Y, const float *X, float *angle, int len, bool angleInDegrees )
{
- v_atan(double scale) : impl(static_cast<float>(scale)) {}
- inline int operator()(int len, const double * Y, const double * X, double * angle)
+ float scale = angleInDegrees ? 1.f : (float)(CV_PI/180);
+ int i = 0;
+#if CV_SIMD
+ const int VECSZ = v_float32::nlanes;
+ v_atan_f32 v(scale);
+
+ for( ; i < len; i += VECSZ*2 )
{
- int i = 0;
- const int c = v_atan<float>::WorkWidth;
- float bufY[c];
- float bufX[c];
- float bufA[c];
- for ( ; i <= len - c ; i += c)
+ if( i + VECSZ*2 > len )
{
- for (int j = 0; j < c; ++j)
- {
- bufY[j] = static_cast<float>(Y[i + j]);
- bufX[j] = static_cast<float>(X[i + j]);
- }
- impl(c, bufY, bufX, bufA);
- for (int j = 0; j < c; ++j)
- {
- angle[i + j] = bufA[j];
- }
+ // if it's an in-place operation, we cannot process the
+ // tail a second time, so we have to fall back to the
+ // scalar code
+ if( i == 0 || angle == X || angle == Y )
+ break;
+ i = len - VECSZ*2;
}
- return i;
- }
-private:
- v_atan<float> impl;
-};
-#endif
-
-#endif
-template <typename T>
-static inline T atanImpl(T y, T x)
-{
- T ax = std::abs(x), ay = std::abs(y);
- T a, c, c2;
- if( ax >= ay )
- {
- c = ay/(ax + static_cast<T>(DBL_EPSILON));
- c2 = c*c;
- a = (((atan2_p7*c2 + atan2_p5)*c2 + atan2_p3)*c2 + atan2_p1)*c;
- }
- else
- {
- c = ax/(ay + static_cast<T>(DBL_EPSILON));
- c2 = c*c;
- a = 90.f - (((atan2_p7*c2 + atan2_p5)*c2 + atan2_p3)*c2 + atan2_p1)*c;
- }
- if( x < 0 )
- a = 180.f - a;
- if( y < 0 )
- a = 360.f - a;
- return a;
-}
+ v_float32 y0 = vx_load(Y + i);
+ v_float32 x0 = vx_load(X + i);
+ v_float32 y1 = vx_load(Y + i + VECSZ);
+ v_float32 x1 = vx_load(X + i + VECSZ);
-template <typename T>
-static inline void atanImpl(const T *Y, const T *X, T *angle, int len, bool angleInDegrees)
-{
- int i = 0;
- T scale = angleInDegrees ? 1 : static_cast<T>(CV_PI/180);
+ v_float32 r0 = v.compute(y0, x0);
+ v_float32 r1 = v.compute(y1, x1);
-#if CV_SIMD128
- i = v_atan<T>(scale)(len, Y, X, angle);
+ v_store(angle + i, r0);
+ v_store(angle + i + VECSZ, r1);
+ }
+ vx_cleanup();
#endif
for( ; i < len; i++ )
- {
- angle[i] = atanImpl<T>(Y[i], X[i]) * scale;
- }
+ angle[i] = atan_f32(Y[i], X[i])*scale;
}
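The i = len - VECSZ*2 adjustment above is the tail idiom used throughout this file: instead of a scalar epilogue, the last partial block is handled by stepping back and re-running one full-width iteration over the final VECSZ*2 elements, overlapping work already done. That is only valid for out-of-place, purely element-wise loops, hence the bailout when the output aliases an input. Schematically (a generic loop, not a real function here):

    // len = 10, VECSZ*2 = 8: the first pass covers [0..7], then i is
    // pulled back to len - 8 = 2 and [2..9] is recomputed over the overlap.
    for (int i = 0; i < len; i += VECSZ*2)
    {
        if (i + VECSZ*2 > len)
        {
            if (i == 0 || inplace) break;  // too short, or results would be re-read
            i = len - VECSZ*2;             // overlap the previous block
        }
        // ... full-width SIMD body ...
    }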
-} // anonymous::
-
-///////////////////////////////////// ATAN2 ////////////////////////////////////
-
void fastAtan32f(const float *Y, const float *X, float *angle, int len, bool angleInDegrees )
{
CV_INSTRUMENT_REGION()
- atanImpl<float>(Y, X, angle, len, angleInDegrees);
+ fastAtan32f_(Y, X, angle, len, angleInDegrees );
}
void fastAtan64f(const double *Y, const double *X, double *angle, int len, bool angleInDegrees)
{
CV_INSTRUMENT_REGION()
- atanImpl<double>(Y, X, angle, len, angleInDegrees);
+
+ const int BLKSZ = 128;
+ float ybuf[BLKSZ], xbuf[BLKSZ], abuf[BLKSZ];
+ for( int i = 0; i < len; i += BLKSZ )
+ {
+ int j, blksz = std::min(BLKSZ, len - i);
+ for( j = 0; j < blksz; j++ )
+ {
+ ybuf[j] = (float)Y[i + j];
+ xbuf[j] = (float)X[i + j];
+ }
+ fastAtan32f_(ybuf, xbuf, abuf, blksz, angleInDegrees);
+ for( j = 0; j < blksz; j++ )
+ angle[i + j] = abuf[j];
+ }
}
// deprecated
int i = 0;
-#if CV_SIMD128
- for( ; i <= len - 8; i += 8 )
+#if CV_SIMD
+ const int VECSZ = v_float32::nlanes;
+ for( ; i < len; i += VECSZ*2 )
{
- v_float32x4 x0 = v_load(x + i), x1 = v_load(x + i + 4);
- v_float32x4 y0 = v_load(y + i), y1 = v_load(y + i + 4);
+ if( i + VECSZ*2 > len )
+ {
+ if( i == 0 || mag == x || mag == y )
+ break;
+ i = len - VECSZ*2;
+ }
+ v_float32 x0 = vx_load(x + i), x1 = vx_load(x + i + VECSZ);
+ v_float32 y0 = vx_load(y + i), y1 = vx_load(y + i + VECSZ);
x0 = v_sqrt(v_muladd(x0, x0, y0*y0));
x1 = v_sqrt(v_muladd(x1, x1, y1*y1));
v_store(mag + i, x0);
- v_store(mag + i + 4, x1);
+ v_store(mag + i + VECSZ, x1);
}
+ vx_cleanup();
#endif
for( ; i < len; i++ )
int i = 0;
-#if CV_SIMD128_64F
- for( ; i <= len - 4; i += 4 )
+#if CV_SIMD_64F
+ const int VECSZ = v_float64::nlanes;
+ for( ; i < len; i += VECSZ*2 )
{
- v_float64x2 x0 = v_load(x + i), x1 = v_load(x + i + 2);
- v_float64x2 y0 = v_load(y + i), y1 = v_load(y + i + 2);
+ if( i + VECSZ*2 > len )
+ {
+ if( i == 0 || mag == x || mag == y )
+ break;
+ i = len - VECSZ*2;
+ }
+ v_float64 x0 = vx_load(x + i), x1 = vx_load(x + i + VECSZ);
+ v_float64 y0 = vx_load(y + i), y1 = vx_load(y + i + VECSZ);
x0 = v_sqrt(v_muladd(x0, x0, y0*y0));
x1 = v_sqrt(v_muladd(x1, x1, y1*y1));
v_store(mag + i, x0);
- v_store(mag + i + 2, x1);
+ v_store(mag + i + VECSZ, x1);
}
+ vx_cleanup();
#endif
for( ; i < len; i++ )
int i = 0;
-#if CV_SIMD128
- for( ; i <= len - 8; i += 8 )
+#if CV_SIMD
+ const int VECSZ = v_float32::nlanes;
+ for( ; i < len; i += VECSZ*2 )
{
- v_float32x4 t0 = v_load(src + i), t1 = v_load(src + i + 4);
+ if( i + VECSZ*2 > len )
+ {
+ if( i == 0 || src == dst )
+ break;
+ i = len - VECSZ*2;
+ }
+ v_float32 t0 = vx_load(src + i), t1 = vx_load(src + i + VECSZ);
t0 = v_invsqrt(t0);
t1 = v_invsqrt(t1);
- v_store(dst + i, t0); v_store(dst + i + 4, t1);
+ v_store(dst + i, t0); v_store(dst + i + VECSZ, t1);
}
+ vx_cleanup();
#endif
for( ; i < len; i++ )
void invSqrt64f(const double* src, double* dst, int len)
{
CV_INSTRUMENT_REGION()
-
int i = 0;
-#if CV_SSE2
- __m128d v_1 = _mm_set1_pd(1.0);
- for ( ; i <= len - 2; i += 2)
- _mm_storeu_pd(dst + i, _mm_div_pd(v_1, _mm_sqrt_pd(_mm_loadu_pd(src + i))));
+#if CV_SIMD_64F
+ const int VECSZ = v_float64::nlanes;
+ for ( ; i < len; i += VECSZ*2)
+ {
+ if( i + VECSZ*2 > len )
+ {
+ if( i == 0 || src == dst )
+ break;
+ i = len - VECSZ*2;
+ }
+ v_float64 t0 = vx_load(src + i), t1 = vx_load(src + i + VECSZ);
+ t0 = v_invsqrt(t0);
+ t1 = v_invsqrt(t1);
+ v_store(dst + i, t0); v_store(dst + i + VECSZ, t1);
+ }
#endif
for( ; i < len; i++ )
int i = 0;
-#if CV_SIMD128
- for( ; i <= len - 8; i += 8 )
+#if CV_SIMD
+ const int VECSZ = v_float32::nlanes;
+ for( ; i < len; i += VECSZ*2 )
{
- v_float32x4 t0 = v_load(src + i), t1 = v_load(src + i + 4);
+ if( i + VECSZ*2 > len )
+ {
+ if( i == 0 || src == dst )
+ break;
+ i = len - VECSZ*2;
+ }
+ v_float32 t0 = vx_load(src + i), t1 = vx_load(src + i + VECSZ);
t0 = v_sqrt(t0);
t1 = v_sqrt(t1);
- v_store(dst + i, t0); v_store(dst + i + 4, t1);
+ v_store(dst + i, t0); v_store(dst + i + VECSZ, t1);
}
+ vx_cleanup();
#endif
for( ; i < len; i++ )
int i = 0;
-#if CV_SIMD128_64F
- for( ; i <= len - 4; i += 4 )
+#if CV_SIMD_64F
+ const int VECSZ = v_float64::nlanes;
+ for( ; i < len; i += VECSZ*2 )
{
- v_float64x2 t0 = v_load(src + i), t1 = v_load(src + i + 2);
+ if( i + VECSZ*2 > len )
+ {
+ if( i == 0 || src == dst )
+ break;
+ i = len - VECSZ*2;
+ }
+ v_float64 t0 = vx_load(src + i), t1 = vx_load(src + i + VECSZ);
t0 = v_sqrt(t0);
t1 = v_sqrt(t1);
- v_store(dst + i, t0); v_store(dst + i + 2, t1);
+ v_store(dst + i, t0); v_store(dst + i + VECSZ, t1);
}
+ vx_cleanup();
#endif
for( ; i < len; i++ )
////////////////////////////////////// EXP /////////////////////////////////////
-typedef union
-{
- struct {
-#if ( defined( WORDS_BIGENDIAN ) && !defined( OPENCV_UNIVERSAL_BUILD ) ) || defined( __BIG_ENDIAN__ )
- int hi;
- int lo;
-#else
- int lo;
- int hi;
-#endif
- } i;
- double d;
-}
-DBLINT;
-
#define EXPTAB_SCALE 6
#define EXPTAB_MASK ((1 << EXPTAB_SCALE) - 1)
1.9784560263879509682582499181312 * EXPPOLY_32F_A0,
};
+static float expTab_f[EXPTAB_MASK+1];
+static volatile bool expTab_f_initialized = false;
// the code below uses _mm_cast* intrinsics, which are not available on VS2005
#if (defined _MSC_VER && _MSC_VER < 1500) || \
{
CV_INSTRUMENT_REGION()
+ if( !expTab_f_initialized )
+ {
+ for( int j = 0; j <= EXPTAB_MASK; j++ )
+ expTab_f[j] = (float)expTab[j];
+ expTab_f_initialized = true;
+ }
+
static const float
A4 = (float)(1.000000000000002438532970795181890933776 / EXPPOLY_32F_A0),
A3 = (float)(.6931471805521448196800669615864773144641 / EXPPOLY_32F_A0),
A2 = (float)(.2402265109513301490103372422686535526573 / EXPPOLY_32F_A0),
A1 = (float)(.5550339366753125211915322047004666939128e-1 / EXPPOLY_32F_A0);
-#undef EXPPOLY
-#define EXPPOLY(x) \
-(((((x) + A1)*(x) + A2)*(x) + A3)*(x) + A4)
-
int i = 0;
const Cv32suf* x = (const Cv32suf*)_x;
- Cv32suf buf[4];
-
-#if CV_AVX2
- if( n >= 8 )
+ float minval = (float)(-exp_max_val/exp_prescale);
+ float maxval = (float)(exp_max_val/exp_prescale);
+ float postscale = (float)exp_postscale;
+
+#if CV_SIMD
+ const int VECSZ = v_float32::nlanes;
+ static const v_float32 vprescale = vx_setall_f32((float)exp_prescale);
+ static const v_float32 vpostscale = vx_setall_f32((float)exp_postscale);
+ static const v_float32 vminval = vx_setall_f32(minval);
+ static const v_float32 vmaxval = vx_setall_f32(maxval);
+
+ static const v_float32 vA1 = vx_setall_f32((float)A1);
+ static const v_float32 vA2 = vx_setall_f32((float)A2);
+ static const v_float32 vA3 = vx_setall_f32((float)A3);
+ static const v_float32 vA4 = vx_setall_f32((float)A4);
+
+ static const v_int32 vidxmask = vx_setall_s32(EXPTAB_MASK);
+ bool y_aligned = (size_t)(void*)y % 32 == 0;
+
+ for( ; i < n; i += VECSZ*2 )
{
- static const __m256d prescale4 = _mm256_set1_pd(exp_prescale);
- static const __m256 postscale8 = _mm256_set1_ps((float)exp_postscale);
- static const __m128 maxval4 = _mm_set1_ps((float)(exp_max_val/exp_prescale));
- static const __m128 minval4 = _mm_set1_ps((float)(-exp_max_val/exp_prescale));
-
- static const __m256 mA1 = _mm256_set1_ps(A1);
- static const __m256 mA2 = _mm256_set1_ps(A2);
- static const __m256 mA3 = _mm256_set1_ps(A3);
- static const __m256 mA4 = _mm256_set1_ps(A4);
- bool y_aligned = (size_t)(void*)y % 32 == 0;
-
- ushort CV_DECL_ALIGNED(32) tab_idx[16];
-
- for( ; i <= n - 8; i += 8 )
+ if( i + VECSZ*2 > n )
{
- __m128i xi0, xi1;
-
- __m256d xd0 = _mm256_cvtps_pd(_mm_min_ps(_mm_max_ps(_mm_loadu_ps(&x[i].f), minval4), maxval4));
- __m256d xd1 = _mm256_cvtps_pd(_mm_min_ps(_mm_max_ps(_mm_loadu_ps(&x[i+4].f), minval4), maxval4));
-
- xd0 = _mm256_mul_pd(xd0, prescale4);
- xd1 = _mm256_mul_pd(xd1, prescale4);
+ if( i == 0 || _x == y )
+ break;
+ i = n - VECSZ*2;
+ y_aligned = false;
+ }
- xi0 = _mm256_cvtpd_epi32(xd0);
- xi1 = _mm256_cvtpd_epi32(xd1);
+ v_float32 xf0 = vx_load(&x[i].f), xf1 = vx_load(&x[i + VECSZ].f);
- xd0 = _mm256_sub_pd(xd0, _mm256_cvtepi32_pd(xi0));
- xd1 = _mm256_sub_pd(xd1, _mm256_cvtepi32_pd(xi1));
+ xf0 = v_min(v_max(xf0, vminval), vmaxval);
+ xf1 = v_min(v_max(xf1, vminval), vmaxval);
- // gcc does not support _mm256_set_m128
- //xf = _mm256_set_m128(_mm256_cvtpd_ps(xd1), _mm256_cvtpd_ps(xd0));
- __m256 xf = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm256_cvtpd_ps(xd0)), _mm256_cvtpd_ps(xd1), 1);
+ xf0 *= vprescale;
+ xf1 *= vprescale;
- xf = _mm256_mul_ps(xf, postscale8);
+ v_int32 xi0 = v_round(xf0);
+ v_int32 xi1 = v_round(xf1);
+ xf0 = (xf0 - v_cvt_f32(xi0))*vpostscale;
+ xf1 = (xf1 - v_cvt_f32(xi1))*vpostscale;
- xi0 = _mm_packs_epi32(xi0, xi1);
+ v_float32 yf0 = v_lut(expTab_f, xi0 & vidxmask);
+ v_float32 yf1 = v_lut(expTab_f, xi1 & vidxmask);
- _mm_store_si128((__m128i*)tab_idx, _mm_and_si128(xi0, _mm_set1_epi16(EXPTAB_MASK)));
+ v_int32 v0 = vx_setzero_s32(), v127 = vx_setall_s32(127), v255 = vx_setall_s32(255);
+ xi0 = v_min(v_max(v_shr<EXPTAB_SCALE>(xi0) + v127, v0), v255);
+ xi1 = v_min(v_max(v_shr<EXPTAB_SCALE>(xi1) + v127, v0), v255);
- xi0 = _mm_add_epi16(_mm_srai_epi16(xi0, EXPTAB_SCALE), _mm_set1_epi16(127));
- xi0 = _mm_max_epi16(xi0, _mm_setzero_si128());
- xi0 = _mm_min_epi16(xi0, _mm_set1_epi16(255));
- xi1 = _mm_unpackhi_epi16(xi0, _mm_setzero_si128());
- xi0 = _mm_unpacklo_epi16(xi0, _mm_setzero_si128());
+ yf0 *= v_reinterpret_as_f32(v_shl<23>(xi0));
+ yf1 *= v_reinterpret_as_f32(v_shl<23>(xi1));
- __m256d yd0 = _mm256_set_pd(expTab[tab_idx[3]], expTab[tab_idx[2]], expTab[tab_idx[1]], expTab[tab_idx[0]]);
- __m256d yd1 = _mm256_set_pd(expTab[tab_idx[7]], expTab[tab_idx[6]], expTab[tab_idx[5]], expTab[tab_idx[4]]);
+ v_float32 zf0 = xf0 + vA1;
+ v_float32 zf1 = xf1 + vA1;
- // gcc does not support _mm256_set_m128
- //__m256 yf = _mm256_set_m128(_mm256_cvtpd_ps(yd1), _mm256_cvtpd_ps(yd0));
- __m256 yf = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm256_cvtpd_ps(yd0)), _mm256_cvtpd_ps(yd1), 1);
+ zf0 = v_fma(zf0, xf0, vA2);
+ zf1 = v_fma(zf1, xf1, vA2);
- //_mm256_set_m128i(xi1, xi0)
- __m256i temp = _mm256_castps_si256(_mm256_insertf128_ps(_mm256_castps128_ps256(_mm_castsi128_ps(xi0)), _mm_castsi128_ps(xi1), 1));
+ zf0 = v_fma(zf0, xf0, vA3);
+ zf1 = v_fma(zf1, xf1, vA3);
- yf = _mm256_mul_ps(yf, _mm256_castsi256_ps(_mm256_slli_epi32(temp, 23)));
+ zf0 = v_fma(zf0, xf0, vA4);
+ zf1 = v_fma(zf1, xf1, vA4);
- __m256 zf = _mm256_add_ps(xf, mA1);
+ zf0 *= yf0;
+ zf1 *= yf1;
-#if CV_FMA3
- zf = _mm256_fmadd_ps(zf, xf, mA2);
- zf = _mm256_fmadd_ps(zf, xf, mA3);
- zf = _mm256_fmadd_ps(zf, xf, mA4);
-#else
- zf = _mm256_add_ps(_mm256_mul_ps(zf, xf), mA2);
- zf = _mm256_add_ps(_mm256_mul_ps(zf, xf), mA3);
- zf = _mm256_add_ps(_mm256_mul_ps(zf, xf), mA4);
-#endif
- zf = _mm256_mul_ps(zf, yf);
-
- if( y_aligned )
- {
- _mm256_store_ps(y + i, zf);
- }
- else
- {
- _mm256_storeu_ps(y + i, zf);
- }
+ if( y_aligned )
+ {
+ v_store_aligned(y + i, zf0);
+ v_store_aligned(y + i + VECSZ, zf1);
}
- }
-#elif CV_SSE2
- if( n >= 8 )
- {
- static const __m128d prescale2 = _mm_set1_pd(exp_prescale);
- static const __m128 postscale4 = _mm_set1_ps((float)exp_postscale);
- static const __m128 maxval4 = _mm_set1_ps((float)(exp_max_val/exp_prescale));
- static const __m128 minval4 = _mm_set1_ps((float)(-exp_max_val/exp_prescale));
-
- static const __m128 mA1 = _mm_set1_ps(A1);
- static const __m128 mA2 = _mm_set1_ps(A2);
- static const __m128 mA3 = _mm_set1_ps(A3);
- static const __m128 mA4 = _mm_set1_ps(A4);
- bool y_aligned = (size_t)(void*)y % 16 == 0;
-
- ushort CV_DECL_ALIGNED(16) tab_idx[8];
-
- for( ; i <= n - 8; i += 8 )
+ else
{
- __m128 xf0, xf1;
- xf0 = _mm_loadu_ps(&x[i].f);
- xf1 = _mm_loadu_ps(&x[i+4].f);
- __m128i xi0, xi1, xi2, xi3;
-
- xf0 = _mm_min_ps(_mm_max_ps(xf0, minval4), maxval4);
- xf1 = _mm_min_ps(_mm_max_ps(xf1, minval4), maxval4);
-
- __m128d xd0 = _mm_cvtps_pd(xf0);
- __m128d xd2 = _mm_cvtps_pd(_mm_movehl_ps(xf0, xf0));
- __m128d xd1 = _mm_cvtps_pd(xf1);
- __m128d xd3 = _mm_cvtps_pd(_mm_movehl_ps(xf1, xf1));
-
- xd0 = _mm_mul_pd(xd0, prescale2);
- xd2 = _mm_mul_pd(xd2, prescale2);
- xd1 = _mm_mul_pd(xd1, prescale2);
- xd3 = _mm_mul_pd(xd3, prescale2);
-
- xi0 = _mm_cvtpd_epi32(xd0);
- xi2 = _mm_cvtpd_epi32(xd2);
-
- xi1 = _mm_cvtpd_epi32(xd1);
- xi3 = _mm_cvtpd_epi32(xd3);
-
- xd0 = _mm_sub_pd(xd0, _mm_cvtepi32_pd(xi0));
- xd2 = _mm_sub_pd(xd2, _mm_cvtepi32_pd(xi2));
- xd1 = _mm_sub_pd(xd1, _mm_cvtepi32_pd(xi1));
- xd3 = _mm_sub_pd(xd3, _mm_cvtepi32_pd(xi3));
-
- xf0 = _mm_movelh_ps(_mm_cvtpd_ps(xd0), _mm_cvtpd_ps(xd2));
- xf1 = _mm_movelh_ps(_mm_cvtpd_ps(xd1), _mm_cvtpd_ps(xd3));
-
- xf0 = _mm_mul_ps(xf0, postscale4);
- xf1 = _mm_mul_ps(xf1, postscale4);
-
- xi0 = _mm_unpacklo_epi64(xi0, xi2);
- xi1 = _mm_unpacklo_epi64(xi1, xi3);
- xi0 = _mm_packs_epi32(xi0, xi1);
-
- _mm_store_si128((__m128i*)tab_idx, _mm_and_si128(xi0, _mm_set1_epi16(EXPTAB_MASK)));
-
- xi0 = _mm_add_epi16(_mm_srai_epi16(xi0, EXPTAB_SCALE), _mm_set1_epi16(127));
- xi0 = _mm_max_epi16(xi0, _mm_setzero_si128());
- xi0 = _mm_min_epi16(xi0, _mm_set1_epi16(255));
- xi1 = _mm_unpackhi_epi16(xi0, _mm_setzero_si128());
- xi0 = _mm_unpacklo_epi16(xi0, _mm_setzero_si128());
-
- __m128d yd0 = _mm_unpacklo_pd(_mm_load_sd(expTab + tab_idx[0]), _mm_load_sd(expTab + tab_idx[1]));
- __m128d yd1 = _mm_unpacklo_pd(_mm_load_sd(expTab + tab_idx[2]), _mm_load_sd(expTab + tab_idx[3]));
- __m128d yd2 = _mm_unpacklo_pd(_mm_load_sd(expTab + tab_idx[4]), _mm_load_sd(expTab + tab_idx[5]));
- __m128d yd3 = _mm_unpacklo_pd(_mm_load_sd(expTab + tab_idx[6]), _mm_load_sd(expTab + tab_idx[7]));
-
- __m128 yf0 = _mm_movelh_ps(_mm_cvtpd_ps(yd0), _mm_cvtpd_ps(yd1));
- __m128 yf1 = _mm_movelh_ps(_mm_cvtpd_ps(yd2), _mm_cvtpd_ps(yd3));
-
- yf0 = _mm_mul_ps(yf0, _mm_castsi128_ps(_mm_slli_epi32(xi0, 23)));
- yf1 = _mm_mul_ps(yf1, _mm_castsi128_ps(_mm_slli_epi32(xi1, 23)));
-
- __m128 zf0 = _mm_add_ps(xf0, mA1);
- __m128 zf1 = _mm_add_ps(xf1, mA1);
-
- zf0 = _mm_add_ps(_mm_mul_ps(zf0, xf0), mA2);
- zf1 = _mm_add_ps(_mm_mul_ps(zf1, xf1), mA2);
-
- zf0 = _mm_add_ps(_mm_mul_ps(zf0, xf0), mA3);
- zf1 = _mm_add_ps(_mm_mul_ps(zf1, xf1), mA3);
-
- zf0 = _mm_add_ps(_mm_mul_ps(zf0, xf0), mA4);
- zf1 = _mm_add_ps(_mm_mul_ps(zf1, xf1), mA4);
-
- zf0 = _mm_mul_ps(zf0, yf0);
- zf1 = _mm_mul_ps(zf1, yf1);
-
- if( y_aligned )
- {
- _mm_store_ps(y + i, zf0);
- _mm_store_ps(y + i + 4, zf1);
- }
- else
- {
- _mm_storeu_ps(y + i, zf0);
- _mm_storeu_ps(y + i + 4, zf1);
- }
+ v_store(y + i, zf0);
+ v_store(y + i + VECSZ, zf1);
}
}
- else
+ vx_cleanup();
#endif
- for( ; i <= n - 4; i += 4 )
- {
- double x0 = x[i].f * exp_prescale;
- double x1 = x[i + 1].f * exp_prescale;
- double x2 = x[i + 2].f * exp_prescale;
- double x3 = x[i + 3].f * exp_prescale;
- int val0, val1, val2, val3, t;
-
- if( ((x[i].i >> 23) & 255) > 127 + 10 )
- x0 = x[i].i < 0 ? -exp_max_val : exp_max_val;
-
- if( ((x[i+1].i >> 23) & 255) > 127 + 10 )
- x1 = x[i+1].i < 0 ? -exp_max_val : exp_max_val;
-
- if( ((x[i+2].i >> 23) & 255) > 127 + 10 )
- x2 = x[i+2].i < 0 ? -exp_max_val : exp_max_val;
-
- if( ((x[i+3].i >> 23) & 255) > 127 + 10 )
- x3 = x[i+3].i < 0 ? -exp_max_val : exp_max_val;
-
- val0 = cvRound(x0);
- val1 = cvRound(x1);
- val2 = cvRound(x2);
- val3 = cvRound(x3);
-
- x0 = (x0 - val0)*exp_postscale;
- x1 = (x1 - val1)*exp_postscale;
- x2 = (x2 - val2)*exp_postscale;
- x3 = (x3 - val3)*exp_postscale;
-
- t = (val0 >> EXPTAB_SCALE) + 127;
- t = !(t & ~255) ? t : t < 0 ? 0 : 255;
- buf[0].i = t << 23;
-
- t = (val1 >> EXPTAB_SCALE) + 127;
- t = !(t & ~255) ? t : t < 0 ? 0 : 255;
- buf[1].i = t << 23;
-
- t = (val2 >> EXPTAB_SCALE) + 127;
- t = !(t & ~255) ? t : t < 0 ? 0 : 255;
- buf[2].i = t << 23;
-
- t = (val3 >> EXPTAB_SCALE) + 127;
- t = !(t & ~255) ? t : t < 0 ? 0 : 255;
- buf[3].i = t << 23;
-
- x0 = buf[0].f * expTab[val0 & EXPTAB_MASK] * EXPPOLY( x0 );
- x1 = buf[1].f * expTab[val1 & EXPTAB_MASK] * EXPPOLY( x1 );
-
- y[i] = (float)x0;
- y[i + 1] = (float)x1;
-
- x2 = buf[2].f * expTab[val2 & EXPTAB_MASK] * EXPPOLY( x2 );
- x3 = buf[3].f * expTab[val3 & EXPTAB_MASK] * EXPPOLY( x3 );
-
- y[i + 2] = (float)x2;
- y[i + 3] = (float)x3;
- }
for( ; i < n; i++ )
{
- double x0 = x[i].f * exp_prescale;
- int val0, t;
+ float x0 = x[i].f;
+ x0 = std::min(std::max(x0, minval), maxval);
+ x0 *= (float)exp_prescale;
+ Cv32suf buf;
- if( ((x[i].i >> 23) & 255) > 127 + 10 )
- x0 = x[i].i < 0 ? -exp_max_val : exp_max_val;
+ int xi = saturate_cast<int>(x0);
+ x0 = (x0 - xi)*postscale;
- val0 = cvRound(x0);
- t = (val0 >> EXPTAB_SCALE) + 127;
+ int t = (xi >> EXPTAB_SCALE) + 127;
t = !(t & ~255) ? t : t < 0 ? 0 : 255;
+ buf.i = t << 23;
- buf[0].i = t << 23;
- x0 = (x0 - val0)*exp_postscale;
-
- y[i] = (float)(buf[0].f * expTab[val0 & EXPTAB_MASK] * EXPPOLY(x0));
+ y[i] = buf.f * expTab_f[xi & EXPTAB_MASK] * ((((x0 + A1)*x0 + A2)*x0 + A3)*x0 + A4);
}
}
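For reference, the decomposition that both the vector and the scalar paths above implement (stated loosely): with exp_prescale ~ 2^EXPTAB_SCALE/ln 2 and exp_postscale = 2^-EXPTAB_SCALE, xi = round(x*exp_prescale) splits x so that

    e^x ~ 2^(xi >> EXPTAB_SCALE) * expTab_f[xi & EXPTAB_MASK] * P(x0),

where 2^(xi >> EXPTAB_SCALE) is built directly in the float exponent bits (buf.i = t << 23), the table supplies 2^(j/64) for the low index bits, and P is the degree-4 polynomial in A1..A4 evaluating the remaining e^x0 (the EXPPOLY_32F_A0 normalization is folded into the table and the coefficients). exp64f follows the same scheme with the 11-bit double exponent and the degree-5 polynomial A0..A5.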
A1 = .96180973140732918010002372686186e-2 / EXPPOLY_32F_A0,
A0 = .13369713757180123244806654839424e-2 / EXPPOLY_32F_A0;
-#undef EXPPOLY
-#define EXPPOLY(x) (((((A0*(x) + A1)*(x) + A2)*(x) + A3)*(x) + A4)*(x) + A5)
-
int i = 0;
- Cv64suf buf[4];
const Cv64suf* x = (const Cv64suf*)_x;
-
-#if CV_SSE2
- static const __m128d prescale2 = _mm_set1_pd(exp_prescale);
- static const __m128d postscale2 = _mm_set1_pd(exp_postscale);
- static const __m128d maxval2 = _mm_set1_pd(exp_max_val);
- static const __m128d minval2 = _mm_set1_pd(-exp_max_val);
-
- static const __m128d mA0 = _mm_set1_pd(A0);
- static const __m128d mA1 = _mm_set1_pd(A1);
- static const __m128d mA2 = _mm_set1_pd(A2);
- static const __m128d mA3 = _mm_set1_pd(A3);
- static const __m128d mA4 = _mm_set1_pd(A4);
- static const __m128d mA5 = _mm_set1_pd(A5);
-
- int CV_DECL_ALIGNED(16) tab_idx[4];
-
- for( ; i <= n - 4; i += 4 )
+ double minval = (-exp_max_val/exp_prescale);
+ double maxval = (exp_max_val/exp_prescale);
+
+#if CV_SIMD_64F
+ const int VECSZ = v_float64::nlanes;
+ static const v_float64 vprescale = vx_setall_f64(exp_prescale);
+ static const v_float64 vpostscale = vx_setall_f64(exp_postscale);
+ static const v_float64 vminval = vx_setall_f64(minval);
+ static const v_float64 vmaxval = vx_setall_f64(maxval);
+
+ static const v_float64 vA1 = vx_setall_f64(A1);
+ static const v_float64 vA2 = vx_setall_f64(A2);
+ static const v_float64 vA3 = vx_setall_f64(A3);
+ static const v_float64 vA4 = vx_setall_f64(A4);
+ static const v_float64 vA5 = vx_setall_f64(A5);
+
+ static const v_int32 vidxmask = vx_setall_s32(EXPTAB_MASK);
+ bool y_aligned = (size_t)(void*)y % 32 == 0;
+
+ for( ; i < n; i += VECSZ*2 )
{
- __m128d xf0 = _mm_loadu_pd(&x[i].f), xf1 = _mm_loadu_pd(&x[i+2].f);
- __m128i xi0, xi1;
- xf0 = _mm_min_pd(_mm_max_pd(xf0, minval2), maxval2);
- xf1 = _mm_min_pd(_mm_max_pd(xf1, minval2), maxval2);
- xf0 = _mm_mul_pd(xf0, prescale2);
- xf1 = _mm_mul_pd(xf1, prescale2);
-
- xi0 = _mm_cvtpd_epi32(xf0);
- xi1 = _mm_cvtpd_epi32(xf1);
- xf0 = _mm_mul_pd(_mm_sub_pd(xf0, _mm_cvtepi32_pd(xi0)), postscale2);
- xf1 = _mm_mul_pd(_mm_sub_pd(xf1, _mm_cvtepi32_pd(xi1)), postscale2);
-
- xi0 = _mm_unpacklo_epi64(xi0, xi1);
- _mm_store_si128((__m128i*)tab_idx, _mm_and_si128(xi0, _mm_set1_epi32(EXPTAB_MASK)));
-
- xi0 = _mm_add_epi32(_mm_srai_epi32(xi0, EXPTAB_SCALE), _mm_set1_epi32(1023));
- xi0 = _mm_packs_epi32(xi0, xi0);
- xi0 = _mm_max_epi16(xi0, _mm_setzero_si128());
- xi0 = _mm_min_epi16(xi0, _mm_set1_epi16(2047));
- xi0 = _mm_unpacklo_epi16(xi0, _mm_setzero_si128());
- xi1 = _mm_unpackhi_epi32(xi0, _mm_setzero_si128());
- xi0 = _mm_unpacklo_epi32(xi0, _mm_setzero_si128());
-
- __m128d yf0 = _mm_unpacklo_pd(_mm_load_sd(expTab + tab_idx[0]), _mm_load_sd(expTab + tab_idx[1]));
- __m128d yf1 = _mm_unpacklo_pd(_mm_load_sd(expTab + tab_idx[2]), _mm_load_sd(expTab + tab_idx[3]));
- yf0 = _mm_mul_pd(yf0, _mm_castsi128_pd(_mm_slli_epi64(xi0, 52)));
- yf1 = _mm_mul_pd(yf1, _mm_castsi128_pd(_mm_slli_epi64(xi1, 52)));
-
- __m128d zf0 = _mm_add_pd(_mm_mul_pd(mA0, xf0), mA1);
- __m128d zf1 = _mm_add_pd(_mm_mul_pd(mA0, xf1), mA1);
-
- zf0 = _mm_add_pd(_mm_mul_pd(zf0, xf0), mA2);
- zf1 = _mm_add_pd(_mm_mul_pd(zf1, xf1), mA2);
-
- zf0 = _mm_add_pd(_mm_mul_pd(zf0, xf0), mA3);
- zf1 = _mm_add_pd(_mm_mul_pd(zf1, xf1), mA3);
-
- zf0 = _mm_add_pd(_mm_mul_pd(zf0, xf0), mA4);
- zf1 = _mm_add_pd(_mm_mul_pd(zf1, xf1), mA4);
-
- zf0 = _mm_add_pd(_mm_mul_pd(zf0, xf0), mA5);
- zf1 = _mm_add_pd(_mm_mul_pd(zf1, xf1), mA5);
-
- zf0 = _mm_mul_pd(zf0, yf0);
- zf1 = _mm_mul_pd(zf1, yf1);
-
- _mm_storeu_pd(y + i, zf0);
- _mm_storeu_pd(y + i + 2, zf1);
- }
-#endif
- for( ; i <= n - 4; i += 4 )
- {
- double x0 = x[i].f * exp_prescale;
- double x1 = x[i + 1].f * exp_prescale;
- double x2 = x[i + 2].f * exp_prescale;
- double x3 = x[i + 3].f * exp_prescale;
+ if( i + VECSZ*2 > n )
+ {
+ if( i == 0 || _x == y )
+ break;
+ i = n - VECSZ*2;
+ y_aligned = false;
+ }
- double y0, y1, y2, y3;
- int val0, val1, val2, val3, t;
+ v_float64 xf0 = vx_load(&x[i].f), xf1 = vx_load(&x[i + VECSZ].f);
- t = (int)(x[i].i >> 52);
- if( (t & 2047) > 1023 + 10 )
- x0 = t < 0 ? -exp_max_val : exp_max_val;
+ xf0 = v_min(v_max(xf0, vminval), vmaxval);
+ xf1 = v_min(v_max(xf1, vminval), vmaxval);
- t = (int)(x[i+1].i >> 52);
- if( (t & 2047) > 1023 + 10 )
- x1 = t < 0 ? -exp_max_val : exp_max_val;
+ xf0 *= vprescale;
+ xf1 *= vprescale;
- t = (int)(x[i+2].i >> 52);
- if( (t & 2047) > 1023 + 10 )
- x2 = t < 0 ? -exp_max_val : exp_max_val;
+ v_int32 xi0 = v_round(xf0);
+ v_int32 xi1 = v_round(xf1);
+ xf0 = (xf0 - v_cvt_f64(xi0))*vpostscale;
+ xf1 = (xf1 - v_cvt_f64(xi1))*vpostscale;
- t = (int)(x[i+3].i >> 52);
- if( (t & 2047) > 1023 + 10 )
- x3 = t < 0 ? -exp_max_val : exp_max_val;
+ v_float64 yf0 = v_lut(expTab, xi0 & vidxmask);
+ v_float64 yf1 = v_lut(expTab, xi1 & vidxmask);
- val0 = cvRound(x0);
- val1 = cvRound(x1);
- val2 = cvRound(x2);
- val3 = cvRound(x3);
+ v_int32 v0 = vx_setzero_s32(), v1023 = vx_setall_s32(1023), v2047 = vx_setall_s32(2047);
+ xi0 = v_min(v_max(v_shr<EXPTAB_SCALE>(xi0) + v1023, v0), v2047);
+ xi1 = v_min(v_max(v_shr<EXPTAB_SCALE>(xi1) + v1023, v0), v2047);
- x0 = (x0 - val0)*exp_postscale;
- x1 = (x1 - val1)*exp_postscale;
- x2 = (x2 - val2)*exp_postscale;
- x3 = (x3 - val3)*exp_postscale;
+ v_int64 xq0, xq1, dummy;
+ v_expand(xi0, xq0, dummy);
+ v_expand(xi1, xq1, dummy);
- t = (val0 >> EXPTAB_SCALE) + 1023;
- t = !(t & ~2047) ? t : t < 0 ? 0 : 2047;
- buf[0].i = (int64)t << 52;
+ yf0 *= v_reinterpret_as_f64(v_shl<52>(xq0));
+ yf1 *= v_reinterpret_as_f64(v_shl<52>(xq1));
- t = (val1 >> EXPTAB_SCALE) + 1023;
- t = !(t & ~2047) ? t : t < 0 ? 0 : 2047;
- buf[1].i = (int64)t << 52;
+ v_float64 zf0 = xf0 + vA1;
+ v_float64 zf1 = xf1 + vA1;
- t = (val2 >> EXPTAB_SCALE) + 1023;
- t = !(t & ~2047) ? t : t < 0 ? 0 : 2047;
- buf[2].i = (int64)t << 52;
+ zf0 = v_fma(zf0, xf0, vA2);
+ zf1 = v_fma(zf1, xf1, vA2);
- t = (val3 >> EXPTAB_SCALE) + 1023;
- t = !(t & ~2047) ? t : t < 0 ? 0 : 2047;
- buf[3].i = (int64)t << 52;
+ zf0 = v_fma(zf0, xf0, vA3);
+ zf1 = v_fma(zf1, xf1, vA3);
- y0 = buf[0].f * expTab[val0 & EXPTAB_MASK] * EXPPOLY( x0 );
- y1 = buf[1].f * expTab[val1 & EXPTAB_MASK] * EXPPOLY( x1 );
+ zf0 = v_fma(zf0, xf0, vA4);
+ zf1 = v_fma(zf1, xf1, vA4);
- y[i] = y0;
- y[i + 1] = y1;
+ zf0 = v_fma(zf0, xf0, vA5);
+ zf1 = v_fma(zf1, xf1, vA5);
- y2 = buf[2].f * expTab[val2 & EXPTAB_MASK] * EXPPOLY( x2 );
- y3 = buf[3].f * expTab[val3 & EXPTAB_MASK] * EXPPOLY( x3 );
+ zf0 *= yf0;
+ zf1 *= yf1;
- y[i + 2] = y2;
- y[i + 3] = y3;
+ if( y_aligned )
+ {
+ v_store_aligned(y + i, zf0);
+ v_store_aligned(y + i + VECSZ, zf1);
+ }
+ else
+ {
+ v_store(y + i, zf0);
+ v_store(y + i + VECSZ, zf1);
+ }
}
+ vx_cleanup();
+#endif
for( ; i < n; i++ )
{
- double x0 = x[i].f * exp_prescale;
- int val0, t;
+ double x0 = x[i].f;
+ x0 = std::min(std::max(x0, minval), maxval);
+ x0 *= exp_prescale;
+ Cv64suf buf;
- t = (int)(x[i].i >> 52);
- if( (t & 2047) > 1023 + 10 )
- x0 = t < 0 ? -exp_max_val : exp_max_val;
+ int xi = saturate_cast<int>(x0);
+ x0 = (x0 - xi)*exp_postscale;
- val0 = cvRound(x0);
- t = (val0 >> EXPTAB_SCALE) + 1023;
+ int t = (xi >> EXPTAB_SCALE) + 1023;
t = !(t & ~2047) ? t : t < 0 ? 0 : 2047;
+ buf.i = (int64)t << 52;
- buf[0].i = (int64)t << 52;
- x0 = (x0 - val0)*exp_postscale;
-
- y[i] = buf[0].f * expTab[val0 & EXPTAB_MASK] * EXPPOLY( x0 );
+ y[i] = buf.f * expTab[xi & EXPTAB_MASK] * (((((A0*x0 + A1)*x0 + A2)*x0 + A3)*x0 + A4)*x0 + A5);
}
}
/////////////////////////////////////////// LOG ///////////////////////////////////////
-#define LOGTAB_SCALE 8
+#define LOGTAB_SCALE 8
#define LOGTAB_MASK ((1 << LOGTAB_SCALE) - 1)
-#define LOGTAB_MASK2 ((1 << (20 - LOGTAB_SCALE)) - 1)
-#define LOGTAB_MASK2_32F ((1 << (23 - LOGTAB_SCALE)) - 1)
-static const double CV_DECL_ALIGNED(16) icvLogTab[] = {
+static const double CV_DECL_ALIGNED(16) logTab[] = {
0.0000000000000000000000000000000000000000, 1.000000000000000000000000000000000000000,
.00389864041565732288852075271279318258166, .9961089494163424124513618677042801556420,
.00778214044205494809292034119607706088573, .9922480620155038759689922480620155038760,
.69314718055994530941723212145818, 5.0e-01,
};
+static float logTab_f[(LOGTAB_MASK+1)*2];
+static volatile bool logTab_f_initialized = false;
-
-#define LOGTAB_TRANSLATE(x,h) (((x) - 1.)*icvLogTab[(h)+1])
+#define LOGTAB_TRANSLATE(tab, x, h) (((x) - 1.f)*tab[(h)+1])
static const double ln_2 = 0.69314718055994530941723212145818;
void log32f( const float *_x, float *y, int n )
{
CV_INSTRUMENT_REGION()
- static const float shift[] = { 0, -1.f/512 };
+ if( !logTab_f_initialized )
+ {
+ for( int j = 0; j < (LOGTAB_MASK+1)*2; j++ )
+ logTab_f[j] = (float)logTab[j];
+ logTab_f_initialized = true;
+ }
+
+ static const int LOGTAB_MASK2_32F = (1 << (23 - LOGTAB_SCALE)) - 1;
static const float
A0 = 0.3333333333333333333333333f,
A1 = -0.5f,
A2 = 1.f;
-#undef LOGPOLY
-#define LOGPOLY(x) (((A0*(x) + A1)*(x) + A2)*(x))
-
int i = 0;
- Cv32suf buf[4];
const int* x = (const int*)_x;
-#if CV_SSE2
- static const __m128d ln2_2 = _mm_set1_pd(ln_2);
- static const __m128 _1_4 = _mm_set1_ps(1.f);
- static const __m128 shift4 = _mm_set1_ps(-1.f/512);
-
- static const __m128 mA0 = _mm_set1_ps(A0);
- static const __m128 mA1 = _mm_set1_ps(A1);
- static const __m128 mA2 = _mm_set1_ps(A2);
+#if CV_SIMD
+ const int VECSZ = v_float32::nlanes;
+ static const v_float32 vln2 = vx_setall_f32((float)ln_2);
+ static const v_float32 v1 = vx_setall_f32(1.f);
+ static const v_float32 vshift = vx_setall_f32(-1.f/512);
- int CV_DECL_ALIGNED(16) idx[4];
+ static const v_float32 vA0 = vx_setall_f32(A0);
+ static const v_float32 vA1 = vx_setall_f32(A1);
+ static const v_float32 vA2 = vx_setall_f32(A2);
- for( ; i <= n - 4; i += 4 )
+ for( ; i < n; i += VECSZ )
{
- __m128i h0 = _mm_loadu_si128((const __m128i*)(x + i));
- __m128i yi0 = _mm_sub_epi32(_mm_and_si128(_mm_srli_epi32(h0, 23), _mm_set1_epi32(255)), _mm_set1_epi32(127));
- __m128d yd0 = _mm_mul_pd(_mm_cvtepi32_pd(yi0), ln2_2);
- __m128d yd1 = _mm_mul_pd(_mm_cvtepi32_pd(_mm_unpackhi_epi64(yi0,yi0)), ln2_2);
-
- __m128i xi0 = _mm_or_si128(_mm_and_si128(h0, _mm_set1_epi32(LOGTAB_MASK2_32F)), _mm_set1_epi32(127 << 23));
-
- h0 = _mm_and_si128(_mm_srli_epi32(h0, 23 - LOGTAB_SCALE - 1), _mm_set1_epi32(LOGTAB_MASK*2));
- _mm_store_si128((__m128i*)idx, h0);
- h0 = _mm_cmpeq_epi32(h0, _mm_set1_epi32(510));
-
- __m128d t0, t1, t2, t3, t4;
- t0 = _mm_load_pd(icvLogTab + idx[0]);
- t2 = _mm_load_pd(icvLogTab + idx[1]);
- t1 = _mm_unpackhi_pd(t0, t2);
- t0 = _mm_unpacklo_pd(t0, t2);
- t2 = _mm_load_pd(icvLogTab + idx[2]);
- t4 = _mm_load_pd(icvLogTab + idx[3]);
- t3 = _mm_unpackhi_pd(t2, t4);
- t2 = _mm_unpacklo_pd(t2, t4);
-
- yd0 = _mm_add_pd(yd0, t0);
- yd1 = _mm_add_pd(yd1, t2);
-
- __m128 yf0 = _mm_movelh_ps(_mm_cvtpd_ps(yd0), _mm_cvtpd_ps(yd1));
-
- __m128 xf0 = _mm_sub_ps(_mm_castsi128_ps(xi0), _1_4);
- xf0 = _mm_mul_ps(xf0, _mm_movelh_ps(_mm_cvtpd_ps(t1), _mm_cvtpd_ps(t3)));
- xf0 = _mm_add_ps(xf0, _mm_and_ps(_mm_castsi128_ps(h0), shift4));
-
- __m128 zf0 = _mm_mul_ps(xf0, mA0);
- zf0 = _mm_mul_ps(_mm_add_ps(zf0, mA1), xf0);
- zf0 = _mm_mul_ps(_mm_add_ps(zf0, mA2), xf0);
- yf0 = _mm_add_ps(yf0, zf0);
-
- _mm_storeu_ps(y + i, yf0);
- }
-#endif
- for( ; i <= n - 4; i += 4 )
- {
- double x0, x1, x2, x3;
- double y0, y1, y2, y3;
- int h0, h1, h2, h3;
-
- h0 = x[i];
- h1 = x[i+1];
- buf[0].i = (h0 & LOGTAB_MASK2_32F) | (127 << 23);
- buf[1].i = (h1 & LOGTAB_MASK2_32F) | (127 << 23);
-
- y0 = (((h0 >> 23) & 0xff) - 127) * ln_2;
- y1 = (((h1 >> 23) & 0xff) - 127) * ln_2;
-
- h0 = (h0 >> (23 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2;
- h1 = (h1 >> (23 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2;
-
- y0 += icvLogTab[h0];
- y1 += icvLogTab[h1];
-
- h2 = x[i+2];
- h3 = x[i+3];
-
- x0 = LOGTAB_TRANSLATE( buf[0].f, h0 );
- x1 = LOGTAB_TRANSLATE( buf[1].f, h1 );
-
- buf[2].i = (h2 & LOGTAB_MASK2_32F) | (127 << 23);
- buf[3].i = (h3 & LOGTAB_MASK2_32F) | (127 << 23);
-
- y2 = (((h2 >> 23) & 0xff) - 127) * ln_2;
- y3 = (((h3 >> 23) & 0xff) - 127) * ln_2;
+ if( i + VECSZ > n )
+ {
+ if( i == 0 || _x == y )
+ break;
+ i = n - VECSZ;
+ }
- h2 = (h2 >> (23 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2;
- h3 = (h3 >> (23 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2;
+ v_int32 h0 = vx_load(x + i);
+ v_int32 yi0 = (v_shr<23>(h0) & vx_setall_s32(255)) - vx_setall_s32(127);
+ v_int32 xi0 = (h0 & vx_setall_s32(LOGTAB_MASK2_32F)) | vx_setall_s32(127 << 23);
- y2 += icvLogTab[h2];
- y3 += icvLogTab[h3];
+ h0 = v_shr<23 - LOGTAB_SCALE - 1>(h0) & vx_setall_s32(LOGTAB_MASK*2);
+ v_float32 yf0, xf0;
- x2 = LOGTAB_TRANSLATE( buf[2].f, h2 );
- x3 = LOGTAB_TRANSLATE( buf[3].f, h3 );
+ v_lut_deinterleave(logTab_f, h0, yf0, xf0);
- x0 += shift[h0 == 510];
- x1 += shift[h1 == 510];
- y0 += LOGPOLY( x0 );
- y1 += LOGPOLY( x1 );
+ yf0 = v_fma(v_cvt_f32(yi0), vln2, yf0);
- y[i] = (float) y0;
- y[i + 1] = (float) y1;
+ v_float32 delta = v_reinterpret_as_f32(h0 == vx_setall_s32(510)) & vshift;
+ xf0 = v_fma((v_reinterpret_as_f32(xi0) - v1), xf0, delta);
- x2 += shift[h2 == 510];
- x3 += shift[h3 == 510];
- y2 += LOGPOLY( x2 );
- y3 += LOGPOLY( x3 );
+ v_float32 zf0 = v_fma(xf0, vA0, vA1);
+ zf0 = v_fma(zf0, xf0, vA2);
+ zf0 = v_fma(zf0, xf0, yf0);
- y[i + 2] = (float) y2;
- y[i + 3] = (float) y3;
+ v_store(y + i, zf0);
}
+ vx_cleanup();
+#endif
for( ; i < n; i++ )
{
- int h0 = x[i];
- double y0;
- float x0;
-
- y0 = (((h0 >> 23) & 0xff) - 127) * ln_2;
+ Cv32suf buf;
+ int i0 = x[i];
- buf[0].i = (h0 & LOGTAB_MASK2_32F) | (127 << 23);
- h0 = (h0 >> (23 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2;
+ buf.i = (i0 & LOGTAB_MASK2_32F) | (127 << 23);
+ int idx = (i0 >> (23 - LOGTAB_SCALE - 1)) & (LOGTAB_MASK*2);
- y0 += icvLogTab[h0];
- x0 = (float)LOGTAB_TRANSLATE( buf[0].f, h0 );
- x0 += shift[h0 == 510];
- y0 += LOGPOLY( x0 );
-
- y[i] = (float)y0;
+ float y0 = (((i0 >> 23) & 0xff) - 127) * (float)ln_2 + logTab_f[idx];
+ float x0 = (buf.f - 1.f)*logTab_f[idx + 1] + (idx == 510 ? -1.f/512 : 0.f);
+ y[i] = ((A0*x0 + A1)*x0 + A2)*x0 + y0;
}
}
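Both log paths implement the same argument reduction (stated loosely): for x = 2^e * m with mantissa m in [1, 2), the top LOGTAB_SCALE mantissa bits pick k, giving idx = 2k, and

    ln x ~ e*ln 2 + ln(1 + k/256) + P(t),   t = m/(1 + k/256) - 1,

where logTab stores the (ln(1 + k/256), 1/(1 + k/256)) pairs side by side, which is exactly why v_lut_deinterleave fits here, and P is the short polynomial in A0..A2 (A0..A7 in the double version); the -1/512 shift keeps t small in the wrap-around bucket idx == 510.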
{
CV_INSTRUMENT_REGION()
- static const double shift[] = { 0, -1./512 };
+ static const int64 LOGTAB_MASK2_64F = ((int64)1 << (52 - LOGTAB_SCALE)) - 1;
static const double
A7 = 1.0,
A6 = -0.5,
A1 = 0.1428571428571428769682682968777953647077083587646484375,
A0 = -0.125;
-#undef LOGPOLY
-#define LOGPOLY(x,k) ((x)+=shift[k], xq = (x)*(x),\
-(((A0*xq + A2)*xq + A4)*xq + A6)*xq + \
-(((A1*xq + A3)*xq + A5)*xq + A7)*(x))
-
int i = 0;
- DBLINT buf[4];
- DBLINT *X = (DBLINT *) x;
-#if CV_SSE2
- static const __m128d ln2_2 = _mm_set1_pd(ln_2);
- static const __m128d _1_2 = _mm_set1_pd(1.);
- static const __m128d shift2 = _mm_set1_pd(-1./512);
+#if CV_SIMD_64F
+ const int VECSZ = v_float64::nlanes;
+ static const v_float64 vln2 = vx_setall_f64(ln_2);
- static const __m128i log_and_mask2 = _mm_set_epi32(LOGTAB_MASK2, 0xffffffff, LOGTAB_MASK2, 0xffffffff);
- static const __m128i log_or_mask2 = _mm_set_epi32(1023 << 20, 0, 1023 << 20, 0);
+ static const v_float64
+ vA0 = vx_setall_f64(A0), vA1 = vx_setall_f64(A1),
+ vA2 = vx_setall_f64(A2), vA3 = vx_setall_f64(A3),
+ vA4 = vx_setall_f64(A4), vA5 = vx_setall_f64(A5),
+ vA6 = vx_setall_f64(A6), vA7 = vx_setall_f64(A7);
- static const __m128d mA0 = _mm_set1_pd(A0);
- static const __m128d mA1 = _mm_set1_pd(A1);
- static const __m128d mA2 = _mm_set1_pd(A2);
- static const __m128d mA3 = _mm_set1_pd(A3);
- static const __m128d mA4 = _mm_set1_pd(A4);
- static const __m128d mA5 = _mm_set1_pd(A5);
- static const __m128d mA6 = _mm_set1_pd(A6);
- static const __m128d mA7 = _mm_set1_pd(A7);
-
- int CV_DECL_ALIGNED(16) idx[4];
-
- for( ; i <= n - 4; i += 4 )
+ for( ; i < n; i += VECSZ )
{
- __m128i h0 = _mm_loadu_si128((const __m128i*)(x + i));
- __m128i h1 = _mm_loadu_si128((const __m128i*)(x + i + 2));
-
- __m128d xd0 = _mm_castsi128_pd(_mm_or_si128(_mm_and_si128(h0, log_and_mask2), log_or_mask2));
- __m128d xd1 = _mm_castsi128_pd(_mm_or_si128(_mm_and_si128(h1, log_and_mask2), log_or_mask2));
-
- h0 = _mm_unpackhi_epi32(_mm_unpacklo_epi32(h0, h1), _mm_unpackhi_epi32(h0, h1));
-
- __m128i yi0 = _mm_sub_epi32(_mm_and_si128(_mm_srli_epi32(h0, 20),
- _mm_set1_epi32(2047)), _mm_set1_epi32(1023));
- __m128d yd0 = _mm_mul_pd(_mm_cvtepi32_pd(yi0), ln2_2);
- __m128d yd1 = _mm_mul_pd(_mm_cvtepi32_pd(_mm_unpackhi_epi64(yi0, yi0)), ln2_2);
-
- h0 = _mm_and_si128(_mm_srli_epi32(h0, 20 - LOGTAB_SCALE - 1), _mm_set1_epi32(LOGTAB_MASK * 2));
- _mm_store_si128((__m128i*)idx, h0);
- h0 = _mm_cmpeq_epi32(h0, _mm_set1_epi32(510));
-
- __m128d t0, t1, t2, t3, t4;
- t0 = _mm_load_pd(icvLogTab + idx[0]);
- t2 = _mm_load_pd(icvLogTab + idx[1]);
- t1 = _mm_unpackhi_pd(t0, t2);
- t0 = _mm_unpacklo_pd(t0, t2);
- t2 = _mm_load_pd(icvLogTab + idx[2]);
- t4 = _mm_load_pd(icvLogTab + idx[3]);
- t3 = _mm_unpackhi_pd(t2, t4);
- t2 = _mm_unpacklo_pd(t2, t4);
-
- yd0 = _mm_add_pd(yd0, t0);
- yd1 = _mm_add_pd(yd1, t2);
-
- xd0 = _mm_mul_pd(_mm_sub_pd(xd0, _1_2), t1);
- xd1 = _mm_mul_pd(_mm_sub_pd(xd1, _1_2), t3);
-
- xd0 = _mm_add_pd(xd0, _mm_and_pd(_mm_castsi128_pd(_mm_unpacklo_epi32(h0, h0)), shift2));
- xd1 = _mm_add_pd(xd1, _mm_and_pd(_mm_castsi128_pd(_mm_unpackhi_epi32(h0, h0)), shift2));
-
- __m128d zd0 = _mm_mul_pd(xd0, mA0);
- __m128d zd1 = _mm_mul_pd(xd1, mA0);
- zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA1), xd0);
- zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA1), xd1);
- zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA2), xd0);
- zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA2), xd1);
- zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA3), xd0);
- zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA3), xd1);
- zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA4), xd0);
- zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA4), xd1);
- zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA5), xd0);
- zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA5), xd1);
- zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA6), xd0);
- zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA6), xd1);
- zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA7), xd0);
- zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA7), xd1);
-
- yd0 = _mm_add_pd(yd0, zd0);
- yd1 = _mm_add_pd(yd1, zd1);
-
- _mm_storeu_pd(y + i, yd0);
- _mm_storeu_pd(y + i + 2, yd1);
- }
-#endif
- for( ; i <= n - 4; i += 4 )
- {
- double xq;
- double x0, x1, x2, x3;
- double y0, y1, y2, y3;
- int h0, h1, h2, h3;
-
- h0 = X[i].i.lo;
- h1 = X[i + 1].i.lo;
- buf[0].i.lo = h0;
- buf[1].i.lo = h1;
-
- h0 = X[i].i.hi;
- h1 = X[i + 1].i.hi;
- buf[0].i.hi = (h0 & LOGTAB_MASK2) | (1023 << 20);
- buf[1].i.hi = (h1 & LOGTAB_MASK2) | (1023 << 20);
-
- y0 = (((h0 >> 20) & 0x7ff) - 1023) * ln_2;
- y1 = (((h1 >> 20) & 0x7ff) - 1023) * ln_2;
-
- h2 = X[i + 2].i.lo;
- h3 = X[i + 3].i.lo;
- buf[2].i.lo = h2;
- buf[3].i.lo = h3;
-
- h0 = (h0 >> (20 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2;
- h1 = (h1 >> (20 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2;
-
- y0 += icvLogTab[h0];
- y1 += icvLogTab[h1];
-
- h2 = X[i + 2].i.hi;
- h3 = X[i + 3].i.hi;
-
- x0 = LOGTAB_TRANSLATE( buf[0].d, h0 );
- x1 = LOGTAB_TRANSLATE( buf[1].d, h1 );
-
- buf[2].i.hi = (h2 & LOGTAB_MASK2) | (1023 << 20);
- buf[3].i.hi = (h3 & LOGTAB_MASK2) | (1023 << 20);
-
- y2 = (((h2 >> 20) & 0x7ff) - 1023) * ln_2;
- y3 = (((h3 >> 20) & 0x7ff) - 1023) * ln_2;
-
- h2 = (h2 >> (20 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2;
- h3 = (h3 >> (20 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2;
+ if( i + VECSZ > n )
+ {
+ if( i == 0 || x == y )
+ break;
+ i = n - VECSZ;
+ }
- y2 += icvLogTab[h2];
- y3 += icvLogTab[h3];
+ v_int64 h0 = vx_load((const int64*)x + i);
+ v_int32 yi0 = v_pack(v_shr<52>(h0), vx_setzero_s64());
+ yi0 = (yi0 & vx_setall_s32(0x7ff)) - vx_setall_s32(1023);
- x2 = LOGTAB_TRANSLATE( buf[2].d, h2 );
- x3 = LOGTAB_TRANSLATE( buf[3].d, h3 );
+ v_int64 xi0 = (h0 & vx_setall_s64(LOGTAB_MASK2_64F)) | vx_setall_s64((int64)1023 << 52);
+ h0 = v_shr<52 - LOGTAB_SCALE - 1>(h0);
+ v_int32 idx = v_pack(h0, h0) & vx_setall_s32(LOGTAB_MASK*2);
- y0 += LOGPOLY( x0, h0 == 510 );
- y1 += LOGPOLY( x1, h1 == 510 );
+ v_float64 xf0, yf0;
+ v_lut_deinterleave(logTab, idx, yf0, xf0);
- y[i] = y0;
- y[i + 1] = y1;
+ yf0 = v_fma(v_cvt_f64(yi0), vln2, yf0);
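+ // Lanes where idx == 510 need an extra -1/512 correction (mirroring the
+ // scalar path below): the comparison mask is integer -1, so v_cvt_f64
+ // turns it into -1.0 before the 1/512 scaling.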
+ v_float64 delta = v_cvt_f64(idx == vx_setall_s32(510))*vx_setall_f64(1./512);
+ xf0 = v_fma(v_reinterpret_as_f64(xi0) - vx_setall_f64(1.), xf0, delta);
- y2 += LOGPOLY( x2, h2 == 510 );
- y3 += LOGPOLY( x3, h3 == 510 );
+ v_float64 xq = xf0*xf0;
+ v_float64 zf0 = v_fma(xq, vA0, vA2);
+ v_float64 zf1 = v_fma(xq, vA1, vA3);
+ zf0 = v_fma(zf0, xq, vA4);
+ zf1 = v_fma(zf1, xq, vA5);
+ zf0 = v_fma(zf0, xq, vA6);
+ zf1 = v_fma(zf1, xq, vA7);
+ zf1 = v_fma(zf1, xf0, yf0);
+ zf0 = v_fma(zf0, xq, zf1);
- y[i + 2] = y2;
- y[i + 3] = y3;
+ v_store(y + i, zf0);
}
+#endif
for( ; i < n; i++ )
{
- int h0 = X[i].i.hi;
- double xq;
- double x0, y0 = (((h0 >> 20) & 0x7ff) - 1023) * ln_2;
-
- buf[0].i.hi = (h0 & LOGTAB_MASK2) | (1023 << 20);
- buf[0].i.lo = X[i].i.lo;
- h0 = (h0 >> (20 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2;
-
- y0 += icvLogTab[h0];
- x0 = LOGTAB_TRANSLATE( buf[0].d, h0 );
- y0 += LOGPOLY( x0, h0 == 510 );
- y[i] = y0;
+ Cv64suf buf;
+ int64 i0 = ((const int64*)x)[i];
+
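+ // Scalar tail: split the IEEE-754 bits into the exponent (-> y0, scaled by
+ // ln 2) and the mantissa (-> table lookup at idx plus a polynomial in x0).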
+ buf.i = (i0 & LOGTAB_MASK2_64F) | ((int64)1023 << 52);
+ int idx = (int)(i0 >> (52 - LOGTAB_SCALE - 1)) & (LOGTAB_MASK*2);
+
+ double y0 = (((int)(i0 >> 52) & 0x7ff) - 1023) * ln_2 + logTab[idx];
+ double x0 = (buf.f - 1.)*logTab[idx + 1] + (idx == 510 ? -1./512 : 0.);
+
+ double xq = x0*x0;
+ y[i] = (((A0*xq + A2)*xq + A4)*xq + A6)*xq + (((A1*xq + A3)*xq + A5)*xq + A7)*x0 + y0;
}
}
float fastAtan2( float y, float x )
{
- return atanImpl<float>(y, x);
+ return atan_f32(y, x);
}
#endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
}
#endif
-TEST(hal_intrin,float16x4)
+TEST(hal_intrin,float16)
{
- CV_CPU_CALL_FP16_(test_hal_intrin_float16x4, ());
+ CV_CPU_CALL_FP16_(test_hal_intrin_float16, ());
throw SkipTestException("Unsupported hardware: FP16 is not available");
}
namespace opencv_test { namespace hal {
CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
-void test_hal_intrin_float16x4()
+void test_hal_intrin_float16()
{
- TheTest<v_float16x4>()
+ TheTest<v_float16x8>()
.test_loadstore_fp16()
.test_float_cvt_fp16()
;
namespace opencv_test { namespace hal {
CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
-void test_hal_intrin_float16x4();
+void test_hal_intrin_float16();
#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
template <typename R> struct Data
{
typedef typename R::lane_type LaneType;
+ typedef typename V_TypeTraits<LaneType>::int_type int_type;
+
Data()
{
for (int i = 0; i < R::nlanes; ++i)
CV_Assert(i >= 0 && i < R::nlanes);
return d[i];
}
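+ // Read a lane's bit pattern as the same-width integer type
+ // (union-based type punning, so mask comparisons stay bit-exact).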
+ int_type as_int(int i) const
+ {
+ CV_Assert(i >= 0 && i < R::nlanes);
+ union
+ {
+ LaneType l;
+ int_type i;
+ } v;
+ v.l = d[i];
+ return v.i;
+ }
const LaneType * mid() const
{
return d + R::nlanes / 2;
EXPECT_EQ(d, res);
// zero, all
- Data<R> resZ = V_RegTrait128<LaneType>::zero();
- Data<R> resV = V_RegTrait128<LaneType>::all(8);
+ Data<R> resZ, resV;
+ resZ.fill((LaneType)0);
+ resV.fill((LaneType)8);
for (int i = 0; i < R::nlanes; ++i)
{
EXPECT_EQ((LaneType)0, resZ[i]);
// v_expand and v_load_expand
TheTest & test_expand()
{
- typedef typename V_RegTrait128<LaneType>::w_reg Rx2;
+ typedef typename V_RegTraits<R>::w_reg Rx2;
Data<R> dataA;
R a = dataA;
TheTest & test_expand_q()
{
- typedef typename V_RegTrait128<LaneType>::q_reg Rx4;
+ typedef typename V_RegTraits<R>::q_reg Rx4;
Data<R> data;
Data<Rx4> out = v_load_expand_q(data.d);
const int n = Rx4::nlanes;
TheTest & test_mul_expand()
{
- typedef typename V_RegTrait128<LaneType>::w_reg Rx2;
+ typedef typename V_RegTraits<R>::w_reg Rx2;
Data<R> dataA, dataB(2);
R a = dataA, b = dataB;
Rx2 c, d;
TheTest & test_abs()
{
- typedef typename V_RegTrait128<LaneType>::u_reg Ru;
+ typedef typename V_RegTraits<R>::u_reg Ru;
typedef typename Ru::lane_type u_type;
Data<R> dataA, dataB(10);
R a = dataA, b = dataB;
TheTest & test_dot_prod()
{
- typedef typename V_RegTrait128<LaneType>::w_reg Rx2;
+ typedef typename V_RegTraits<R>::w_reg Rx2;
typedef typename Rx2::lane_type w_type;
Data<R> dataA, dataB(2);
TheTest & test_absdiff()
{
- typedef typename V_RegTrait128<LaneType>::u_reg Ru;
+ typedef typename V_RegTraits<R>::u_reg Ru;
typedef typename Ru::lane_type u_type;
Data<R> dataA(std::numeric_limits<LaneType>::max()),
dataB(std::numeric_limits<LaneType>::min());
TheTest & test_mask()
{
- typedef V_TypeTraits<LaneType> Traits;
- typedef typename Traits::int_type int_type;
+ typedef typename V_RegTraits<R>::int_reg int_reg;
+ typedef typename V_RegTraits<int_reg>::u_reg uint_reg;
+ typedef typename int_reg::lane_type int_type;
+ typedef typename uint_reg::lane_type uint_type;
Data<R> dataA, dataB(0), dataC, dataD(1), dataE(2);
dataA[1] *= (LaneType)-1;
- const LaneType mask_one = Traits::reinterpret_from_int(~(typename Traits::uint_type)(0));
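+ // Build an all-ones bit pattern of the lane width via union type punning
+ // (replaces the removed Traits::reinterpret_from_int helper).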
+ union
+ {
+ LaneType l;
+ uint_type ui;
+ }
+ all1s;
+ all1s.ui = (uint_type)-1;
+ LaneType mask_one = all1s.l;
dataB[1] = mask_one;
dataB[R::nlanes / 2] = mask_one;
dataB[R::nlanes - 1] = mask_one;
Data<R> resF = f;
for (int i = 0; i < R::nlanes; ++i)
{
- int_type m2 = Traits::reinterpret_int(dataB[i]);
- EXPECT_EQ((Traits::reinterpret_int(dataD[i]) & m2)
- | (Traits::reinterpret_int(dataE[i]) & ~m2),
- Traits::reinterpret_int(resF[i]));
+ int_type m2 = dataB.as_int(i);
+ EXPECT_EQ((dataD.as_int(i) & m2) | (dataE.as_int(i) & ~m2), resF.as_int(i));
}
return *this;
TheTest & test_pack()
{
SCOPED_TRACE(s);
- typedef typename V_RegTrait128<LaneType>::w_reg Rx2;
+ typedef typename V_RegTraits<R>::w_reg Rx2;
typedef typename Rx2::lane_type w_type;
Data<Rx2> dataA, dataB;
dataA += std::numeric_limits<LaneType>::is_signed ? -10 : 10;
TheTest & test_pack_u()
{
SCOPED_TRACE(s);
- typedef typename V_TypeTraits<LaneType>::w_type LaneType_w;
- typedef typename V_RegTrait128<LaneType_w>::int_reg Ri2;
+ typedef typename V_RegTraits<R>::w_reg R2;
+ typedef typename V_RegTraits<R2>::int_reg Ri2;
typedef typename Ri2::lane_type w_type;
Data<Ri2> dataA, dataB;
TheTest & test_float_math()
{
- typedef typename V_RegTrait128<LaneType>::int_reg Ri;
+ typedef typename V_RegTraits<R>::round_reg Ri;
Data<R> data1, data2, data3;
data1 *= 1.1;
data2 += 10;
TheTest & test_loadstore_fp16()
{
-#if CV_FP16 && CV_SIMD128
+#if CV_FP16 && CV_SIMD
AlignedData<R> data;
AlignedData<R> out;
- if(1 /* checkHardwareSupport(CV_CPU_FP16) */ )
- {
- // check if addresses are aligned and unaligned respectively
- EXPECT_EQ((size_t)0, (size_t)&data.a.d % 16);
- EXPECT_NE((size_t)0, (size_t)&data.u.d % 16);
- EXPECT_EQ((size_t)0, (size_t)&out.a.d % 16);
- EXPECT_NE((size_t)0, (size_t)&out.u.d % 16);
-
- // check some initialization methods
- R r1 = data.u;
- R r2 = v_load_f16(data.a.d);
- R r3(r2);
- EXPECT_EQ(data.u[0], r1.get0());
- EXPECT_EQ(data.a[0], r2.get0());
- EXPECT_EQ(data.a[0], r3.get0());
-
- // check some store methods
- out.a.clear();
- v_store_f16(out.a.d, r1);
- EXPECT_EQ(data.a, out.a);
- }
+ // check if addresses are aligned and unaligned respectively
+ EXPECT_EQ((size_t)0, (size_t)&data.a.d % 16);
+ EXPECT_NE((size_t)0, (size_t)&data.u.d % 16);
+ EXPECT_EQ((size_t)0, (size_t)&out.a.d % 16);
+ EXPECT_NE((size_t)0, (size_t)&out.u.d % 16);
+
+ // check some initialization methods
+ R r1 = data.u;
+ R r2 = v_load_f16(data.a.d);
+ R r3(r2);
+ EXPECT_EQ(data.u[0], r1.get0());
+ EXPECT_EQ(data.a[0], r2.get0());
+ EXPECT_EQ(data.a[0], r3.get0());
+
+ // check some store methods
+ out.a.clear();
+ v_store(out.a.d, r1);
+ EXPECT_EQ(data.a, out.a);
return *this;
#endif
TheTest & test_float_cvt_fp16()
{
-#if CV_FP16 && CV_SIMD128
- AlignedData<v_float32x4> data;
-
- if(1 /* checkHardwareSupport(CV_CPU_FP16) */)
- {
- // check conversion
- v_float32x4 r1 = v_load(data.a.d);
- v_float16x4 r2 = v_cvt_f16(r1);
- v_float32x4 r3 = v_cvt_f32(r2);
- EXPECT_EQ(0x3c00, r2.get0());
- EXPECT_EQ(r3.get0(), r1.get0());
- }
+#if CV_FP16 && CV_SIMD
+ AlignedData<v_float32> data;
+
+ // check conversion
+ v_float32 r1 = vx_load(data.a.d);
+ v_float16 r2 = v_cvt_f16(r1, vx_setzero_f32());
+ v_float32 r3 = v_cvt_f32(r2);
+ EXPECT_EQ(0x3c00, r2.get0());
+ EXPECT_EQ(r3.get0(), r1.get0());
return *this;
#endif
if( depth < CV_32F )
return power == cvRound(power) && power >= 0 ? 0 : 1;
else
- return Base::get_success_error_level( test_case_idx, i, j );
+ {
+ return depth != CV_64F ? Base::get_success_error_level( test_case_idx, i, j ) : DBL_EPSILON*1024*1.1;
+ }
}
cv::Vec<float, 3> b(4, 5, 7);
cv::Matx<float, 2, 1> xQR = A.solve(b, DECOMP_QR);
cv::Matx<float, 2, 1> xSVD = A.solve(b, DECOMP_SVD);
- EXPECT_LE(cvtest::norm(xQR, xSVD, CV_RELATIVE_L2), 0.001);
+ EXPECT_LE(cvtest::norm(xQR, xSVD, NORM_L2 | NORM_RELATIVE), 0.001);
cv::Matx<float, 2, 3> iA = A.inv(DECOMP_SVD);
- EXPECT_LE(cvtest::norm(A*iA, Matx<float, 3, 3>::eye(), CV_RELATIVE_L2), 0.6);
+ EXPECT_LE(cvtest::norm(iA*A, Matx<float, 2, 2>::eye(), NORM_L2), 1e-3);
+ EXPECT_ANY_THROW({
+ /*cv::Matx<float, 2, 1> xLU =*/ A.solve(b, DECOMP_LU);
+ std::cout << "FATAL ERROR" << std::endl;
+ });
+}
+
+TEST(Core_Solve, Matx_2_2)
+{
+ cv::Matx<float, 2, 2> A(
+ 2, 1,
+ 1, 1
+ );
+ cv::Vec<float, 2> b(4, 5);
+ cv::Matx<float, 2, 1> xLU = A.solve(b, DECOMP_LU);
+ cv::Matx<float, 2, 1> xQR = A.solve(b, DECOMP_QR);
+ cv::Matx<float, 2, 1> xSVD = A.solve(b, DECOMP_SVD);
+ EXPECT_LE(cvtest::norm(xQR, xSVD, NORM_L2 | NORM_RELATIVE), 1e-3);
+ EXPECT_LE(cvtest::norm(xQR, xLU, NORM_L2 | NORM_RELATIVE), 1e-3);
+ cv::Matx<float, 2, 2> iA = A.inv(DECOMP_SVD);
+ EXPECT_LE(cvtest::norm(iA*A, Matx<float, 2, 2>::eye(), NORM_L2), 1e-3);
+}
+TEST(Core_Solve, Matx_3_3)
+{
+ cv::Matx<float, 3, 3> A(
+ 2, 1, 0,
+ 0, 1, 1,
+ 1, 0, 1
+ );
+ cv::Vec<float, 3> b(4, 5, 6);
+ cv::Matx<float, 3, 1> xLU = A.solve(b, DECOMP_LU);
+ cv::Matx<float, 3, 1> xQR = A.solve(b, DECOMP_QR);
+ cv::Matx<float, 3, 1> xSVD = A.solve(b, DECOMP_SVD);
+ EXPECT_LE(cvtest::norm(xQR, xSVD, NORM_L2 | NORM_RELATIVE), 1e-3);
+ EXPECT_LE(cvtest::norm(xQR, xLU, NORM_L2 | NORM_RELATIVE), 1e-3);
+ cv::Matx<float, 3, 3> iA = A.inv(DECOMP_SVD);
+ EXPECT_LE(cvtest::norm(iA*A, Matx<float, 3, 3>::eye(), NORM_L2), 1e-3);
+}
+
+TEST(Core_Solve, Matx_4_4)
+{
+ cv::Matx<float, 4, 4> A(
+ 2, 1, 0, 4,
+ 0, 1, 1, 3,
+ 1, 0, 1, 2,
+ 2, 2, 0, 1
+ );
+ cv::Vec<float, 4> b(4, 5, 6, 7);
+ cv::Matx<float, 4, 1> xLU = A.solve(b, DECOMP_LU);
+ cv::Matx<float, 4, 1> xQR = A.solve(b, DECOMP_QR);
+ cv::Matx<float, 4, 1> xSVD = A.solve(b, DECOMP_SVD);
+ EXPECT_LE(cvtest::norm(xQR, xSVD, NORM_L2 | NORM_RELATIVE), 1e-3);
+ EXPECT_LE(cvtest::norm(xQR, xLU, NORM_L2 | NORM_RELATIVE), 1e-3);
+ cv::Matx<float, 4, 4> iA = A.inv(DECOMP_SVD);
+ EXPECT_LE(cvtest::norm(iA*A, Matx<float, 4, 4>::eye(), NORM_L2), 1e-3);
}
softdouble naiveExp(softdouble x)
Size size(2, 5);
TestType<float>(size, 1.f);
- cv::Vec3f val1 = 1.f;
+ cv::Vec3f val1(1.f);
TestType<cv::Vec3f>(size, val1);
- cv::Matx31f val2 = 1.f;
+ cv::Matx31f val2(1.f);
TestType<cv::Matx31f>(size, val2);
- cv::Matx41f val3 = 1.f;
+ cv::Matx41f val3(1.f);
TestType<cv::Matx41f>(size, val3);
- cv::Matx32f val4 = 1.f;
+ cv::Matx32f val4(1.f);
TestType<cv::Matx32f>(size, val4);
}
catch (const test_excep& e)
// TODO: OpenCL target support more fusion styles.
if ( preferableBackend == DNN_BACKEND_OPENCV && IS_DNN_OPENCL_TARGET(preferableTarget) &&
(!cv::ocl::useOpenCL() || (ld.layerInstance->type != "Convolution" &&
- ld.layerInstance->type != "MVN" && ld.layerInstance->type != "Pooling")) )
+ ld.layerInstance->type != "MVN" && ld.layerInstance->type != "Pooling" &&
+ ld.layerInstance->type != "Concat")) )
continue;
Ptr<Layer>& currLayer = ld.layerInstance;
ld.outputBlobs.size() == 1 )
{
Mat& output = ld.outputBlobs[0];
+ UMat umat_output;
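+ // For the OpenCL target, also grab the output UMat so the concat fusion
+ // below can update the GPU-side wrappers alongside the host Mat.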
+ if (!ld.outputBlobsWrappers.empty() &&
+ (preferableBackend == DNN_BACKEND_OPENCV && IS_DNN_OPENCL_TARGET(preferableTarget)))
+ {
+ size_t i, ninputs = ld.inputBlobsId.size();
+ bool conv_layer = true;
+ for( i = 0; i < ninputs; i++ )
+ {
+ LayerPin pin = ld.inputBlobsId[i];
+ LayerData* inp_i_data = &layers[pin.lid];
+ while(inp_i_data->skip &&
+ inp_i_data->inputBlobsId.size() == 1 &&
+ inp_i_data->consumers.size() == 1)
+ {
+ pin = inp_i_data->inputBlobsId[0];
+ inp_i_data = &layers[pin.lid];
+ }
+ conv_layer = conv_layer && (inp_i_data->getLayerInstance()->type == "Convolution");
+ }
+ if (!conv_layer)
+ continue;
+ std::vector<UMat> umat_outputBlobs;
+ umat_outputBlobs = OpenCLBackendWrapper::getUMatVector(ld.outputBlobsWrappers);
+ umat_output = umat_outputBlobs[0];
+ }
// TODO: in general, this optimization can always be done, but
// many layers currently check that the input/output blobs are
// Allocate new memory to prevent collisions during memory
// reuse (see https://github.com/opencv/opencv/pull/10456).
output = output.clone();
+ if (preferableBackend == DNN_BACKEND_OPENCV &&
+ IS_DNN_OPENCL_TARGET(preferableTarget))
+ {
+ std::vector<UMat> umats(1);
+ umat_output = umat_output.clone();
+ umats[0] = umat_output;
+ OpenCLBackendWrapper::update(ld.outputBlobsWrappers, umats);
+ }
Range chrange[] = { Range::all(), Range::all(), Range::all(), Range::all() };
int ofs = 0;
for( i = 0; i < ninputs; i++ )
CV_Assert(output_slice.isContinuous() && output_slice.size == curr_output.size);
Mat* oldPtr = &curr_output;
curr_output = output_slice;
+ if (preferableBackend == DNN_BACKEND_OPENCV && IS_DNN_OPENCL_TARGET(preferableTarget))
+ {
+ std::vector<UMat> umats(inp_i_data->outputBlobsWrappers.size());
+ umats[pin.oid] = umat_output(chrange);
+ OpenCLBackendWrapper::update(inp_i_data->outputBlobsWrappers, umats);
+ }
// Layers that refer old input Mat will refer to the
// new data but the same Mat object.
CV_Assert(curr_output.data == output_slice.data, oldPtr == &curr_output);
for (int i = 0; i < num; i++)
confPreds.push_back(Mat(2, shape, CV_32F));
- UMat umat = inp1.reshape(1, num * numPredsPerClass);
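+    // Build the 2D shape explicitly: rows = num * numPredsPerClass,
+    // cols = everything else.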
+ shape[0] = num * numPredsPerClass;
+ shape[1] = inp1.total() / shape[0];
+ UMat umat = inp1.reshape(1, 2, &shape[0]);
for (int i = 0; i < num; ++i)
{
Range ranges[] = { Range(i * numPredsPerClass, (i + 1) * numPredsPerClass), Range::all() };
// Decode all loc predictions to bboxes
bool ret = ocl_DecodeBBoxesAll(inputs[0], inputs[2], num, numPriors,
_shareLocation, _numLocClasses, _backgroundLabelId,
- _codeType, _varianceEncodedInTarget, false,
+ _codeType, _varianceEncodedInTarget, _clip,
allDecodedBBoxes);
if (!ret)
return false;
outputs_.getUMatVector(outputs);
internals_.getUMatVector(internals);
+ UMat& src = inputs[0];
+ UMat& dstMat = outputs[0];
+ int axis = clamp(axisRaw, src.dims);
+
if (softmaxOp.empty())
{
OCL4DNNSoftmaxConfig config;
-
config.in_shape = shape(inputs[0]);
- config.axis = axisRaw;
- config.channels = inputs[0].size[axisRaw];
+ config.axis = axis;
+ config.channels = inputs[0].size[axis];
config.logsoftmax = logSoftMax;
config.use_half = use_half;
softmaxOp = Ptr<OCL4DNNSoftmax<float> >(new OCL4DNNSoftmax<float>(config));
}
- UMat& src = inputs[0];
- UMat& dstMat = outputs[0];
-
if (softmaxOp->Forward(src, dstMat))
return true;
UMat& bufMat = internals[0];
- int axis = clamp(axisRaw, src.dims);
MatShape s = shape(src);
size_t outerSize = total(s, 0, axis);
size_t channels = src.size[axis];
ret = k.run(1, globalsize, localsize, false);
}
- if ((row_size % 4) != 0 && ret)
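+    // Also cover rows with fewer than 4 elements, which the vectorized
+    // kernel above does not handle, in addition to the remainder columns.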
+ if (row_size < 4 || ((row_size % 4) != 0 && ret))
{
String kname = format("matvec_mul1_%s", use_half ? "half" : "float");
ocl::Kernel k_1(kname.c_str(), cv::ocl::dnn::matvec_mul_oclsrc, opts);
cl_int err;
size_t element_size = (use_half_) ? sizeof(short) : sizeof(float);
- region.origin = offset * element_size;
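+    // Include the parent buffer's own offset: the source UMat may itself be
+    // a view into a larger allocation.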
+ region.origin = offset * element_size + buffer.offset;
region.size = size * element_size;
sub_mem = clCreateSubBuffer((cl_mem)buffer.handle(ACCESS_READ),
write_only ? CL_MEM_WRITE_ONLY : CL_MEM_READ_ONLY,
return false;
int32_t bias_offset;
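+  // The kernels now take an explicit element offset for their output, since
+  // the destination UMat may be a view with a non-zero byte offset.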
+ int32_t element_size = use_half_ ? sizeof(short) : sizeof(float);
if (config->kernelType == KERNEL_TYPE_INTEL_IDLF) {
if (!swizzleWeight(weight, config->workItem_output[2], false))
return false;
kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(out_buffer));
+ kernel.set(argIdx++, (int)(out_buffer.offset / element_size));
}
else
{
kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
+ kernel.set(argIdx++, (int)(top.offset / element_size));
}
kernel.set(argIdx++, (uint16_t)width_);
return false;
kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(out_buffer));
+ kernel.set(argIdx++, (int)(out_buffer.offset / element_size));
}
else
{
kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
+ kernel.set(argIdx++, (int)(top.offset / element_size));
}
kernel.set(argIdx++, (uint16_t)width_);
if (bias_term_)
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias));
kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
+ kernel.set(argIdx++, (int)(top.offset / element_size));
kernel.set(argIdx++, (uint16_t)width_);
kernel.set(argIdx++, (uint16_t)height_);
kernel.set(argIdx++, (uint16_t)output_w_);
kernel.set(argIdx++, (void *)NULL);
kernel.set(argIdx++, bias_offset);
kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
+ kernel.set(argIdx++, (int)(top.offset / element_size));
kernel.set(argIdx++, output_image_offset);
kernel.set(argIdx++, (uint16_t)width_);
kernel.set(argIdx++, (uint16_t)height_);
int kernel_offset,
__global Dtype* bias,
const int bias_offset,
- __global Dtype* convolved_image,
+ __global Dtype* convolved_image_base,
+ const int convolved_image_base_offset,
const int convolved_image_offset,
const ushort input_width,
const ushort input_height,
const ushort pad_h
)
{
+ __global Dtype* convolved_image = convolved_image_base + convolved_image_base_offset;
const int outputX = get_global_id(0);
const int outputY = get_global_id(1);
const int kernelNum = get_global_id(2) * ZPAR;
__global Dtype* inputs,
__global Dtype* weights,
BIAS_KERNEL_ARG
- __global Dtype* outputs,
+ __global Dtype* outputs_base,
+ const int outputs_offset,
const ushort input_width,
const ushort input_height,
const ushort output_width,
const ushort output_height)
{
+ __global Dtype* outputs = outputs_base + outputs_offset;
unsigned int oc = get_global_id(0) * OUT_BLOCK_WIDTH; // oc = Output Column
unsigned int or = get_global_id(1) * OUT_BLOCK_HEIGHT; // or = Output Row
unsigned int fm = get_global_id(2); // fm = Feature Map = od = Output Depth
const __global Dtype *src0, \
const __global Dtype *src1, \
BIAS_KERNEL_ARG \
- __global Dtype *dst, \
+ __global Dtype *dst_base, \
+ const int dst_offset, \
const ushort input_width, \
const ushort input_height, \
const ushort output_width, \
__attribute__((intel_reqd_sub_group_size(8)))
__kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
{
+ __global Dtype *dst = dst_base + dst_offset;
const int group_x = get_group_id(0);
const int group_y = get_group_id(1);
const int global_x = get_global_id(0);
__attribute__((intel_reqd_sub_group_size(8)))
__kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
{
+ __global Dtype *dst = dst_base + dst_offset;
const int group_x = get_group_id(0);
const int group_y = get_group_id(1);
const int global_x = get_global_id(0);
__attribute__((intel_reqd_sub_group_size(16)))
__kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
{
+ __global Dtype *dst = dst_base + dst_offset;
const int group_x = get_group_id(0);
const int group_y = get_group_id(1);
const int global_x = get_global_id(0);
__attribute__((intel_reqd_sub_group_size(16)))
__kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
{
+ __global Dtype *dst = dst_base + dst_offset;
const int group_x = get_group_id(0);
const int group_y = get_group_id(1);
const int global_x = get_global_id(0);
__global Dtype* image_data,
__global Dtype* kernel_data,
BIAS_KERNEL_ARG
- __global Dtype* convolved_image,
+ __global Dtype* convolved_image_base,
+ const int convolved_image_offset,
const ushort input_width,
const ushort input_height,
const ushort output_width,
const ushort output_height) {
-
+ __global Dtype* convolved_image = convolved_image_base + convolved_image_offset;
const int outputX = get_global_id(0);
const int outputY = get_global_id(1);
const int outputZ = get_global_id(2);
int backendId = get<0>(get<7>(GetParam()));
int targetId = get<1>(get<7>(GetParam()));
- if ((backendId == DNN_BACKEND_INFERENCE_ENGINE && targetId == DNN_TARGET_MYRIAD) ||
- (backendId == DNN_BACKEND_OPENCV && targetId == DNN_TARGET_OPENCL_FP16))
+ if (backendId == DNN_BACKEND_INFERENCE_ENGINE && targetId == DNN_TARGET_MYRIAD)
+ throw SkipTestException("");
+
+ // TODO: unstable test cases
+ if (backendId == DNN_BACKEND_OPENCV && (targetId == DNN_TARGET_OPENCL || targetId == DNN_TARGET_OPENCL_FP16) &&
+ inChannels == 6 && outChannels == 9 && group == 1 && inSize == Size(5, 6) &&
+ kernel == Size(3, 1) && stride == Size(1, 1) && pad == Size(0, 1) && dilation == Size(1, 1) &&
+ hasBias)
throw SkipTestException("");
int sz[] = {outChannels, inChannels / group, kernel.height, kernel.width};
bool hasBias = get<3>(GetParam());
int backendId = get<0>(get<4>(GetParam()));
int targetId = get<1>(get<4>(GetParam()));
- if (backendId == DNN_BACKEND_INFERENCE_ENGINE ||
- (backendId == DNN_BACKEND_OPENCV && targetId == DNN_TARGET_OPENCL_FP16))
+ if (backendId == DNN_BACKEND_INFERENCE_ENGINE)
throw SkipTestException("");
Mat weights(outChannels, inChannels * inSize.height * inSize.width, CV_32F);
int backendId = get<0>(get<4>(GetParam()));
int targetId = get<1>(get<4>(GetParam()));
- if (backendId == DNN_BACKEND_OPENCV &&
- (targetId == DNN_TARGET_OPENCL || targetId == DNN_TARGET_OPENCL_FP16))
- throw SkipTestException("");
-
Net net;
std::vector<int> convLayerIds(numConv);
runTensorFlowNet("l2_normalize_3d");
}
-typedef testing::TestWithParam<Target> Test_TensorFlow_nets;
+class Test_TensorFlow_nets : public DNNTestLayer {};
TEST_P(Test_TensorFlow_nets, MobileNet_SSD)
{
+ checkBackend();
+ if ((backend == DNN_BACKEND_INFERENCE_ENGINE && target != DNN_TARGET_CPU) ||
+ (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16))
+ throw SkipTestException("");
+
std::string netPath = findDataFile("dnn/ssd_mobilenet_v1_coco.pb", false);
std::string netConfig = findDataFile("dnn/ssd_mobilenet_v1_coco.pbtxt", false);
std::string imgPath = findDataFile("dnn/street.png", false);
outNames[1] = "concat_1";
outNames[2] = "detection_out";
- std::vector<Mat> target(outNames.size());
+ std::vector<Mat> refs(outNames.size());
for (int i = 0; i < outNames.size(); ++i)
{
std::string path = findDataFile("dnn/tensorflow/ssd_mobilenet_v1_coco." + outNames[i] + ".npy", false);
- target[i] = blobFromNPY(path);
+ refs[i] = blobFromNPY(path);
}
Net net = readNetFromTensorflow(netPath, netConfig);
- net.setPreferableBackend(DNN_BACKEND_OPENCV);
- net.setPreferableTarget(GetParam());
+ net.setPreferableBackend(backend);
+ net.setPreferableTarget(target);
net.setInput(inp);
std::vector<Mat> output;
net.forward(output, outNames);
- normAssert(target[0].reshape(1, 1), output[0].reshape(1, 1), "", 1e-5, 1.5e-4);
- normAssert(target[1].reshape(1, 1), output[1].reshape(1, 1), "", 1e-5, 3e-4);
- normAssertDetections(target[2], output[2], "", 0.2);
+ normAssert(refs[0].reshape(1, 1), output[0].reshape(1, 1), "", 1e-5, 1.5e-4);
+ normAssert(refs[1].reshape(1, 1), output[1].reshape(1, 1), "", 1e-5, 3e-4);
+ normAssertDetections(refs[2], output[2], "", 0.2);
}
TEST_P(Test_TensorFlow_nets, Inception_v2_SSD)
{
+ checkBackend();
std::string proto = findDataFile("dnn/ssd_inception_v2_coco_2017_11_17.pbtxt", false);
std::string model = findDataFile("dnn/ssd_inception_v2_coco_2017_11_17.pb", false);
Mat img = imread(findDataFile("dnn/street.png", false));
Mat blob = blobFromImage(img, 1.0f / 127.5, Size(300, 300), Scalar(127.5, 127.5, 127.5), true, false);
- net.setPreferableBackend(DNN_BACKEND_OPENCV);
- net.setPreferableTarget(GetParam());
+ net.setPreferableBackend(backend);
+ net.setPreferableTarget(target);
net.setInput(blob);
// Output has shape 1x1xNx7, where N is the number of detections.
0, 3, 0.75838411, 0.44668293, 0.45907149, 0.49459291, 0.52197015,
0, 10, 0.95932811, 0.38349164, 0.32528657, 0.40387636, 0.39165527,
0, 10, 0.93973452, 0.66561931, 0.37841269, 0.68074018, 0.42907384);
- normAssertDetections(ref, out, "", 0.5);
+ double scoreDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 5e-3 : default_l1;
+ double iouDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.025 : default_lInf;
+ normAssertDetections(ref, out, "", 0.5, scoreDiff, iouDiff);
}
TEST_P(Test_TensorFlow_nets, Inception_v2_Faster_RCNN)
{
+ checkBackend();
+ if (backend == DNN_BACKEND_INFERENCE_ENGINE ||
+ (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16))
+ throw SkipTestException("");
+
std::string proto = findDataFile("dnn/faster_rcnn_inception_v2_coco_2018_01_28.pbtxt", false);
std::string model = findDataFile("dnn/faster_rcnn_inception_v2_coco_2018_01_28.pb", false);
Net net = readNetFromTensorflow(model, proto);
- net.setPreferableBackend(DNN_BACKEND_OPENCV);
+ net.setPreferableBackend(backend);
+ net.setPreferableTarget(target);
Mat img = imread(findDataFile("dnn/dog416.png", false));
Mat blob = blobFromImage(img, 1.0f / 127.5, Size(800, 600), Scalar(127.5, 127.5, 127.5), true, false);
TEST_P(Test_TensorFlow_nets, opencv_face_detector_uint8)
{
+ checkBackend();
+ if (backend == DNN_BACKEND_INFERENCE_ENGINE &&
+ (target == DNN_TARGET_OPENCL || target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD))
+ throw SkipTestException("");
+
std::string proto = findDataFile("dnn/opencv_face_detector.pbtxt", false);
std::string model = findDataFile("dnn/opencv_face_detector_uint8.pb", false);
Mat img = imread(findDataFile("gpu/lbpcascade/er.png", false));
Mat blob = blobFromImage(img, 1.0, Size(), Scalar(104.0, 177.0, 123.0), false, false);
- net.setPreferableBackend(DNN_BACKEND_OPENCV);
- net.setPreferableTarget(GetParam());
-
+ net.setPreferableBackend(backend);
+ net.setPreferableTarget(target);
net.setInput(blob);
// Output has shape 1x1xNx7, where N is the number of detections.
// Each detection is a vector of values [id, classId, confidence, left, top, right, bottom]
0, 1, 0.98977017, 0.23901358, 0.09084064, 0.29902688, 0.1769477,
0, 1, 0.97203469, 0.67965847, 0.06876482, 0.73999709, 0.1513494,
0, 1, 0.95097077, 0.51901293, 0.45863652, 0.5777427, 0.5347801);
- normAssertDetections(ref, out, "", 0.9, 3.4e-3, 1e-2);
+ double scoreDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 4e-3 : 3.4e-3;
+ double iouDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.017 : 1e-2;
+ normAssertDetections(ref, out, "", 0.9, scoreDiff, iouDiff);
}
// inp = cv.imread('opencv_extra/testdata/cv/ximgproc/sources/08.png')
// np.save('east_text_detection.geometry.npy', geometry)
TEST_P(Test_TensorFlow_nets, EAST_text_detection)
{
+ checkBackend();
+ if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD)
+ throw SkipTestException("");
+
std::string netPath = findDataFile("dnn/frozen_east_text_detection.pb", false);
std::string imgPath = findDataFile("cv/ximgproc/sources/08.png", false);
std::string refScoresPath = findDataFile("dnn/east_text_detection.scores.npy", false);
Net net = readNet(findDataFile("dnn/frozen_east_text_detection.pb", false));
- net.setPreferableTarget(GetParam());
+ net.setPreferableBackend(backend);
+ net.setPreferableTarget(target);
Mat img = imread(imgPath);
Mat inp = blobFromImage(img, 1.0, Size(), Scalar(123.68, 116.78, 103.94), true, false);
normAssert(geometry, blobFromNPY(refGeometryPath), "geometry", 1e-4, 3e-3);
}
-INSTANTIATE_TEST_CASE_P(/**/, Test_TensorFlow_nets, availableDnnTargets());
+INSTANTIATE_TEST_CASE_P(/**/, Test_TensorFlow_nets, dnnBackendsAndTargets());
TEST_P(Test_TensorFlow_layers, fp16_weights)
{
"double[]": { "j_type" : "double[]", "jn_type" : "double[]", "jni_type" : "jdoubleArray", "suffix" : "_3D" }
}
+# Defines a rule to add extra prefixes to names from specific namespaces.
+# For example, cv::fisheye::stereoRectify from the fisheye namespace is wrapped as fisheye_stereoRectify.
+namespaces_dict = {}
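+# An illustrative sketch (the exact key format follows the parser's namespace
+# naming): namespaces_dict = {'fisheye' : 'fisheye'} would wrap every name
+# from that namespace with a "fisheye_" prefix.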
+
# { class : { func : {j_code, jn_code, cpp_code} } }
ManualFuncs = {}
self.cname = self.name.replace(".", "::")
self.value = decl[1]
self.addedManually = addedManually
+ if self.namespace in namespaces_dict:
+ self.name = '%s_%s' % (namespaces_dict[self.namespace], self.name)
def __repr__(self):
return Template("CONST $name=$value$manual").substitute(name=self.name,
class FuncInfo(GeneralInfo):
def __init__(self, decl, namespaces=[]): # [ funcname, return_ctype, [modifiers], [args] ]
GeneralInfo.__init__(self, "func", decl, namespaces)
- self.cname = self.name.replace(".", "::")
+ self.cname = decl[0].replace(".", "::")
self.jname = self.name
self.isconstructor = self.name == self.classname
if "[" in self.name:
self.jname = "getelem"
+ if self.namespace in namespaces_dict:
+ self.jname = '%s_%s' % (namespaces_dict[self.namespace], self.jname)
for m in decl[2]:
if m.startswith("="):
self.jname = m[1:]
# java part:
#java doc comment
- f_name = fi.name
+ f_name = fi.jname
if fi.classname:
- f_name = fi.classname + "::" + fi.name
+ f_name = fi.classname + "::" + fi.jname
java_doc = "//javadoc: " + f_name + "(%s)" % ", ".join([a.name for a in args if a.ctype])
j_code.write(" "*4 + java_doc + "\n")
j_signatures.append(j_signature)
# processing args with default values
- if not args or not args[-1].defval:
+ if args and args[-1].defval:
+ args.pop()
+ else:
break
- while args and args[-1].defval:
- # 'smart' overloads filtering
- a = args.pop()
- if a.name in ('mask', 'dtype', 'ddepth', 'lineType', 'borderType', 'borderMode', 'criteria'):
- break
type_dict.update(gen_type_dict.get("type_dict", {}))
ManualFuncs.update(gen_type_dict.get("ManualFuncs", {}))
func_arg_fix.update(gen_type_dict.get("func_arg_fix", {}))
+ namespaces_dict.update(gen_type_dict.get("namespaces_dict", {}))
if 'module_j_code' in gen_type_dict:
module_j_code = read_contents(checkFileRemap(os.path.join(misc_location, gen_type_dict['module_j_code'])))
if 'module_jn_code' in gen_type_dict:
switch( code )
{
case CMP_EPS_BIG_DIFF:
- sprintf( msg, "%s: Too big difference (=%g)", desc, diff );
+ sprintf( msg, "%s: Too big difference (=%g > %g)", desc, diff, success_err_level );
code = TS::FAIL_BAD_ACCURACY;
break;
case CMP_EPS_INVALID_TEST_DATA: