--- /dev/null
+ // This file is part of OpenCV project.
+ // It is subject to the license terms in the LICENSE file found in the top-level directory
+ // of this distribution and at http://opencv.org/license.html
+
+ #include "opencv2/core/hal/intrin.hpp"
+
+ //=========================================
+ // Declare & Define & Dispatch in one step
+ //=========================================
+
+ // ARITHM_DISPATCHING_ONLY is defined by the arithm dispatch file
+
+ #undef ARITHM_DECLARATIONS_ONLY
+ #ifdef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
+ #define ARITHM_DECLARATIONS_ONLY
+ #endif
+
+ #undef ARITHM_DEFINITIONS_ONLY
+ #if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && !defined(ARITHM_DISPATCHING_ONLY)
+ #define ARITHM_DEFINITIONS_ONLY
+ #endif
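+
+ // Note: this header is compiled in three modes, selected by the guards above:
+ //  1. CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY - emit forward declarations only,
+ //  2. per CPU-optimized translation unit    - emit the SIMD definitions,
+ //  3. ARITHM_DISPATCHING_ONLY               - emit the runtime dispatchers
+ //     that select an implementation via CV_CPU_DISPATCH.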
+
+ #ifdef ARITHM_DECLARATIONS_ONLY
+ #undef DEFINE_SIMD
+ #define DEFINE_SIMD(fun_name, c_type, ...) \
+ DECLARE_SIMD_FUN(fun_name, c_type)
+ #endif // ARITHM_DECLARATIONS_ONLY
+
+ #ifdef ARITHM_DEFINITIONS_ONLY
+ #undef DEFINE_SIMD
+ #define DEFINE_SIMD(fun_name, c_type, v_type, ...) \
+ DECLARE_SIMD_FUN(fun_name, c_type) \
+ DEFINE_SIMD_FUN(fun_name, c_type, v_type, __VA_ARGS__)
+ #endif // ARITHM_DEFINITIONS_ONLY
+
+ #ifdef ARITHM_DISPATCHING_ONLY
+ #undef DEFINE_SIMD
+ #define DEFINE_SIMD(fun_name, c_type, v_type, ...) \
+ DISPATCH_SIMD_FUN(fun_name, c_type, v_type, __VA_ARGS__)
+ #endif // ARITHM_DISPATCHING_ONLY
+
+ // workaround for targets where NEON lacks double-precision support
+ #undef DEFINE_NOSIMD
+ #ifdef ARITHM_DEFINITIONS_ONLY
+ #define DEFINE_NOSIMD(fun_name, c_type, ...) \
+ DECLARE_SIMD_FUN(fun_name, c_type) \
+ DEFINE_NOSIMD_FUN(fun_name, c_type, __VA_ARGS__)
+ #else
+ #define DEFINE_NOSIMD DEFINE_SIMD
+ #endif // ARITHM_DEFINITIONS_ONLY
+
+ #ifndef SIMD_GUARD
+
+ #define DEFINE_SIMD_U8(fun, ...) \
+ DEFINE_SIMD(__CV_CAT(fun, 8u), uchar, v_uint8, __VA_ARGS__)
+
+ #define DEFINE_SIMD_S8(fun, ...) \
+ DEFINE_SIMD(__CV_CAT(fun, 8s), schar, v_int8, __VA_ARGS__)
+
+ #define DEFINE_SIMD_U16(fun, ...) \
+ DEFINE_SIMD(__CV_CAT(fun, 16u), ushort, v_uint16, __VA_ARGS__)
+
+ #define DEFINE_SIMD_S16(fun, ...) \
+ DEFINE_SIMD(__CV_CAT(fun, 16s), short, v_int16, __VA_ARGS__)
+
+ #define DEFINE_SIMD_S32(fun, ...) \
+ DEFINE_SIMD(__CV_CAT(fun, 32s), int, v_int32, __VA_ARGS__)
+
+ #define DEFINE_SIMD_F32(fun, ...) \
+ DEFINE_SIMD(__CV_CAT(fun, 32f), float, v_float32, __VA_ARGS__)
+
+ #if CV_SIMD_64F
+ #define DEFINE_SIMD_F64(fun, ...) \
+ DEFINE_SIMD(__CV_CAT(fun, 64f), double, v_float64, __VA_ARGS__)
+ #else
+ #define DEFINE_SIMD_F64(fun, ...) \
+ DEFINE_NOSIMD(__CV_CAT(fun, 64f), double, __VA_ARGS__)
+ #endif
+
+ #define DEFINE_SIMD_SAT(fun, ...) \
+ DEFINE_SIMD_U8(fun, __VA_ARGS__) \
+ DEFINE_SIMD_S8(fun, __VA_ARGS__) \
+ DEFINE_SIMD_U16(fun, __VA_ARGS__) \
+ DEFINE_SIMD_S16(fun, __VA_ARGS__)
+
+ #define DEFINE_SIMD_NSAT(fun, ...) \
+ DEFINE_SIMD_S32(fun, __VA_ARGS__) \
+ DEFINE_SIMD_F32(fun, __VA_ARGS__) \
+ DEFINE_SIMD_F64(fun, __VA_ARGS__)
+
+ #define DEFINE_SIMD_ALL(fun, ...) \
+ DEFINE_SIMD_SAT(fun, __VA_ARGS__) \
+ DEFINE_SIMD_NSAT(fun, __VA_ARGS__)
+
+ #endif // SIMD_GUARD
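+
+ // For example, in definitions mode DEFINE_SIMD_ALL(add, op_add) expands to
+ // the whole family add8u, add8s, add16u, add16s, add32s, add32f and add64f,
+ // each a thin wrapper around bin_loop<op_add, ...> (see DEFINE_SIMD_FUN below).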
+
+ ///////////////////////////////////////////////////////////////////////////
+
+ namespace cv { namespace hal {
+
+ #ifndef ARITHM_DISPATCHING_ONLY
+ CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
+ #endif
+
+ #ifdef ARITHM_DEFINITIONS_ONLY
+
+ #if !CV_SIMD_64F
+ typedef int v_float64; // dummy
+ #endif
+
+ //=======================================
+ // Utility
+ //=======================================
+
+ /** add **/
+ template<typename T>
+ static inline T c_add(T a, T b)
+ { return saturate_cast<T>(a + b); }
+ template<>
+ inline uchar c_add<uchar>(uchar a, uchar b)
+ { return CV_FAST_CAST_8U(a + b); }
+ // scale
+ template<typename T1, typename T2>
+ static inline T1 c_add(T1 a, T1 b, T2 scalar)
+ { return saturate_cast<T1>((T2)a * scalar + b); }
+ template<>
+ inline uchar c_add<uchar, float>(uchar a, uchar b, float scalar)
+ { return saturate_cast<uchar>(CV_8TO32F(a) * scalar + b); }
+ // weight
+ template<typename T1, typename T2>
+ static inline T1 c_add(T1 a, T1 b, T2 alpha, T2 beta, T2 gamma)
+ { return saturate_cast<T1>(a * alpha + b * beta + gamma); }
+ template<>
+ inline uchar c_add<uchar, float>(uchar a, uchar b, float alpha, float beta, float gamma)
+ { return saturate_cast<uchar>(CV_8TO32F(a) * alpha + CV_8TO32F(b) * beta + gamma); }
+
+ /** sub **/
+ template<typename T>
+ static inline T c_sub(T a, T b)
+ { return saturate_cast<T>(a - b); }
+ template<>
+ inline uchar c_sub<uchar>(uchar a, uchar b)
+ { return CV_FAST_CAST_8U(a - b); }
+
+ /** max **/
+ template<typename T>
+ static inline T c_max(T a, T b)
+ { return std::max(a, b); }
+ template<>
+ inline uchar c_max<uchar>(uchar a, uchar b)
+ { return CV_MAX_8U(a, b); }
+
+ /** min **/
+ template<typename T>
+ static inline T c_min(T a, T b)
+ { return std::min(a, b); }
+ template<>
+ inline uchar c_min<uchar>(uchar a, uchar b)
+ { return CV_MIN_8U(a, b); }
+
+ /** absdiff **/
+ template<typename T>
+ static inline T c_absdiff(T a, T b)
+ { return a > b ? a - b : b - a; }
+ template<>
+ inline schar c_absdiff(schar a, schar b)
+ { return saturate_cast<schar>(std::abs(a - b)); }
+ template<>
+ inline short c_absdiff(short a, short b)
+ { return saturate_cast<short>(std::abs(a - b)); }
+ // specializations to prevent "-0" results
+ template<>
+ inline float c_absdiff<float>(float a, float b)
+ { return std::abs(a - b); }
+ template<>
+ inline double c_absdiff<double>(double a, double b)
+ { return std::abs(a - b); }
+
+ /** multiply **/
+ template<typename T>
+ static inline T c_mul(T a, T b)
+ { return saturate_cast<T>(a * b); }
+ template<>
+ inline uchar c_mul<uchar>(uchar a, uchar b)
+ { return CV_FAST_CAST_8U(a * b); }
+ // scale
+ template<typename T1, typename T2>
+ static inline T1 c_mul(T1 a, T1 b, T2 scalar)
+ { return saturate_cast<T1>(scalar * (T2)a * b); }
+ template<>
+ inline uchar c_mul<uchar, float>(uchar a, uchar b, float scalar)
+ { return saturate_cast<uchar>(scalar * CV_8TO32F(a) * CV_8TO32F(b)); }
+
+ /** divide & reciprocal **/
+ template<typename T1, typename T2>
+ static inline T2 c_div(T1 a, T2 b)
+ { return saturate_cast<T2>(a / b); }
+ // recip
+ template<>
+ inline uchar c_div<float, uchar>(float a, uchar b)
+ { return saturate_cast<uchar>(a / CV_8TO32F(b)); }
+ // scale
+ template<typename T1, typename T2>
+ static inline T1 c_div(T1 a, T1 b, T2 scalar)
+ { return saturate_cast<T1>(scalar * (T2)a / b); }
+ template<>
+ inline uchar c_div<uchar, float>(uchar a, uchar b, float scalar)
+ { return saturate_cast<uchar>(scalar * CV_8TO32F(a) / CV_8TO32F(b)); }
+
+ //=======================================
+ // Arithmetic and logical operations
+ // +, -, *, /, &, |, ^, ~, abs ...
+ //=======================================
+
+ ///////////////////////////// Operations //////////////////////////////////
+
+ // Add
+ template<typename T1, typename Tvec>
+ struct op_add
+ {
+ static inline Tvec r(const Tvec& a, const Tvec& b)
+ { return a + b; }
+ static inline T1 r(T1 a, T1 b)
+ { return c_add(a, b); }
+ };
+
+ // Subtract
+ template<typename T1, typename Tvec>
+ struct op_sub
+ {
+ static inline Tvec r(const Tvec& a, const Tvec& b)
+ { return a - b; }
+ static inline T1 r(T1 a, T1 b)
+ { return c_sub(a, b); }
+ };
+
+ // Max & Min
+ template<typename T1, typename Tvec>
+ struct op_max
+ {
+ static inline Tvec r(const Tvec& a, const Tvec& b)
+ { return v_max(a, b); }
+ static inline T1 r(T1 a, T1 b)
+ { return c_max(a, b); }
+ };
+
+ template<typename T1, typename Tvec>
+ struct op_min
+ {
+ static inline Tvec r(const Tvec& a, const Tvec& b)
+ { return v_min(a, b); }
+ static inline T1 r(T1 a, T1 b)
+ { return c_min(a, b); }
+ };
+
+ // Absolute difference
+ template<typename T1, typename Tvec>
+ struct op_absdiff
+ {
+ static inline Tvec r(const Tvec& a, const Tvec& b)
+ { return v_absdiff(a, b); }
+ static inline T1 r(T1 a, T1 b)
+ { return c_absdiff(a, b); }
+ };
+ // Saturating signed absolute difference ('s' as in v_absdiffs)
+ template<>
+ struct op_absdiff<schar, v_int8>
+ {
+ static inline v_int8 r(const v_int8& a, const v_int8& b)
+ { return v_absdiffs(a, b); }
+ static inline schar r(schar a, schar b)
+ { return c_absdiff(a, b); }
+ };
+ template<>
+ struct op_absdiff<short, v_int16>
+ {
+ static inline v_int16 r(const v_int16& a, const v_int16& b)
+ { return v_absdiffs(a, b); }
+ static inline short r(short a, short b)
+ { return c_absdiff(a, b); }
+ };
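+ // v_absdiff on signed 32-bit vectors yields an unsigned result,
+ // hence the reinterpret back to v_int32 below.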
+ template<>
+ struct op_absdiff<int, v_int32>
+ {
+ static inline v_int32 r(const v_int32& a, const v_int32& b)
+ { return v_reinterpret_as_s32(v_absdiff(a, b)); }
+ static inline int r(int a, int b)
+ { return c_absdiff(a, b); }
+ };
+
+ // Logical
+ template<typename T1, typename Tvec>
+ struct op_or
+ {
+ static inline Tvec r(const Tvec& a, const Tvec& b)
+ { return a | b; }
+ static inline T1 r(T1 a, T1 b)
+ { return a | b; }
+ };
+ template<typename T1, typename Tvec>
+ struct op_xor
+ {
+ static inline Tvec r(const Tvec& a, const Tvec& b)
+ { return a ^ b; }
+ static inline T1 r(T1 a, T1 b)
+ { return a ^ b; }
+ };
+ template<typename T1, typename Tvec>
+ struct op_and
+ {
+ static inline Tvec r(const Tvec& a, const Tvec& b)
+ { return a & b; }
+ static inline T1 r(T1 a, T1 b)
+ { return a & b; }
+ };
+ template<typename T1, typename Tvec>
+ struct op_not
+ {
+ // b is ignored at the loader level
+ static inline Tvec r(const Tvec& a)
+ { return ~a; }
+ static inline T1 r(T1 a, T1)
+ { return ~a; }
+ };
+
+ //////////////////////////// Loaders /////////////////////////////////
+
+ #if CV_SIMD
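+ // Each loader exposes three entry points: l() for unaligned vectors,
+ // la() for aligned loads/stores and l64() for a half-width (64-bit) tail.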
+
+ template< template<typename T1, typename Tvec> class OP, typename T1, typename Tvec>
+ struct bin_loader
+ {
+ typedef OP<T1, Tvec> op;
+
+ static inline void l(const T1* src1, const T1* src2, T1* dst)
+ {
+ Tvec a = vx_load(src1);
+ Tvec b = vx_load(src2);
+ v_store(dst, op::r(a, b));
+ }
+
+ static inline void la(const T1* src1, const T1* src2, T1* dst)
+ {
+ Tvec a = vx_load_aligned(src1);
+ Tvec b = vx_load_aligned(src2);
+ v_store_aligned(dst, op::r(a, b)); // TODO: try non-temporal (cache-bypassing) stores
+ }
+
+ static inline void l64(const T1* src1, const T1* src2, T1* dst)
+ {
+ Tvec a = vx_load_low(src1), b = vx_load_low(src2);
+ v_store_low(dst, op::r(a, b));
+ }
+ };
+
+ // src2 is unused by the unary "not" operation
+ template<typename T1, typename Tvec>
+ struct bin_loader<op_not, T1, Tvec>
+ {
+ typedef op_not<T1, Tvec> op;
+
+ static inline void l(const T1* src1, const T1*, T1* dst)
+ {
+ Tvec a = vx_load(src1);
+ v_store(dst, op::r(a));
+ }
+
+ static inline void la(const T1* src1, const T1*, T1* dst)
+ {
+ Tvec a = vx_load_aligned(src1);
+ v_store_aligned(dst, op::r(a));
+ }
+
+ static inline void l64(const T1* src1, const T1*, T1* dst)
+ {
+ Tvec a = vx_load_low(src1);
+ v_store_low(dst, op::r(a));
+ }
+ };
+
+ #endif // CV_SIMD
+
+ //////////////////////////// Loops /////////////////////////////////
+
+ template<typename T1, typename T2>
+ static inline bool is_aligned(const T1* src1, const T1* src2, const T2* dst)
+ { return (((size_t)src1|(size_t)src2|(size_t)dst) & (CV_SIMD_WIDTH - 1)) == 0; }
+
+ template<template<typename T1, typename Tvec> class OP, typename T1, typename Tvec>
+ static void bin_loop(const T1* src1, size_t step1, const T1* src2, size_t step2, T1* dst, size_t step, int width, int height)
+ {
+ typedef OP<T1, Tvec> op;
+ #if CV_SIMD
+ typedef bin_loader<OP, T1, Tvec> ldr;
+ enum {wide_step = Tvec::nlanes};
+ #if !CV_NEON && CV_SIMD_WIDTH == 16
+ enum {wide_step_l = wide_step * 2};
+ #else
+ enum {wide_step_l = wide_step};
+ #endif
+ #endif // CV_SIMD
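+
+ // On non-NEON 128-bit targets the main loop advances by two vectors per
+ // iteration (wide_step_l == 2 * wide_step), and an aligned fast path guarded
+ // by is_aligned() is taken when all three pointers are suitably aligned.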
+
+ step1 /= sizeof(T1);
+ step2 /= sizeof(T1);
+ step /= sizeof(T1);
+
+ for (; height--; src1 += step1, src2 += step2, dst += step)
+ {
+ int x = 0;
+
+ #if CV_SIMD
+ #if !CV_NEON
+ if (is_aligned(src1, src2, dst))
+ {
+ for (; x <= width - wide_step_l; x += wide_step_l)
+ {
+ ldr::la(src1 + x, src2 + x, dst + x);
+ #if !CV_NEON && CV_SIMD_WIDTH == 16
+ ldr::la(src1 + x + wide_step, src2 + x + wide_step, dst + x + wide_step);
+ #endif
+ }
+ }
+ else
+ #endif
+ for (; x <= width - wide_step_l; x += wide_step_l)
+ {
+ ldr::l(src1 + x, src2 + x, dst + x);
+ #if !CV_NEON && CV_SIMD_WIDTH == 16
+ ldr::l(src1 + x + wide_step, src2 + x + wide_step, dst + x + wide_step);
+ #endif
+ }
+
+ #if CV_SIMD_WIDTH == 16
+ for (; x <= width - 8/(int)sizeof(T1); x += 8/(int)sizeof(T1))
+ {
+ ldr::l64(src1 + x, src2 + x, dst + x);
+ }
+ #endif
+ #endif // CV_SIMD
+
+ #if CV_ENABLE_UNROLLED || CV_SIMD_WIDTH > 16
+ for (; x <= width - 4; x += 4)
+ {
+ T1 t0 = op::r(src1[x], src2[x]);
+ T1 t1 = op::r(src1[x + 1], src2[x + 1]);
+ dst[x] = t0; dst[x + 1] = t1;
+
+ t0 = op::r(src1[x + 2], src2[x + 2]);
+ t1 = op::r(src1[x + 3], src2[x + 3]);
+ dst[x + 2] = t0; dst[x + 3] = t1;
+ }
+ #endif
+
+ for (; x < width; x++)
+ dst[x] = op::r(src1[x], src2[x]);
+ }
+
+ vx_cleanup();
+ }
+
+ #if !CV_SIMD_64F
+ template<template<typename T1, typename Tvec> class OP, typename T1, typename Tvec>
+ static void bin_loop_nosimd(const T1* src1, size_t step1, const T1* src2, size_t step2, T1* dst, size_t step, int width, int height)
+ {
+ typedef OP<T1, Tvec/*dummy*/> op;
+
+ step1 /= sizeof(T1);
+ step2 /= sizeof(T1);
+ step /= sizeof(T1);
+
+ for (; height--; src1 += step1, src2 += step2, dst += step)
+ {
+ int x = 0;
+
+ for (; x <= width - 4; x += 4)
+ {
+ T1 t0 = op::r(src1[x], src2[x]);
+ T1 t1 = op::r(src1[x + 1], src2[x + 1]);
+ dst[x] = t0; dst[x + 1] = t1;
+
+ t0 = op::r(src1[x + 2], src2[x + 2]);
+ t1 = op::r(src1[x + 3], src2[x + 3]);
+ dst[x + 2] = t0; dst[x + 3] = t1;
+ }
+
+ for (; x < width; x++)
+ dst[x] = op::r(src1[x], src2[x]);
+ }
+ }
+ #define BIN_LOOP64F bin_loop_nosimd
+ #else
+ #define BIN_LOOP64F bin_loop
+ #endif //!CV_SIMD_64F
+
+ #endif // ARITHM_DEFINITIONS_ONLY
+
+ ////////////////////////////////////////////////////////////////////////////////////
+
+ #ifndef SIMD_GUARD
+ #define BIN_ARGS(_T1) const _T1* src1, size_t step1, const _T1* src2, size_t step2, \
+ _T1* dst, size_t step, int width, int height
+
+ #define BIN_ARGS_PASS src1, step1, src2, step2, dst, step, width, height
+ #endif // SIMD_GUARD
+
+ #undef DECLARE_SIMD_FUN
+ #define DECLARE_SIMD_FUN(fun, _T1) void fun(BIN_ARGS(_T1));
+
+ #undef DISPATCH_SIMD_FUN
+ #define DISPATCH_SIMD_FUN(fun, _T1, _Tvec, _OP) \
+ void fun(BIN_ARGS(_T1), void*) \
+ { \
+ CV_INSTRUMENT_REGION(); \
+ CALL_HAL(fun, __CV_CAT(cv_hal_, fun), BIN_ARGS_PASS) \
+ ARITHM_CALL_IPP(__CV_CAT(arithm_ipp_, fun), BIN_ARGS_PASS) \
+ CV_CPU_DISPATCH(fun, (BIN_ARGS_PASS), CV_CPU_DISPATCH_MODES_ALL); \
+ }
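+
+ // Dispatch order: a custom HAL replacement is tried first, then IPP
+ // (when enabled), and finally the CPU-dispatched SIMD implementation.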
+
+ #undef DEFINE_SIMD_FUN
+ #define DEFINE_SIMD_FUN(fun, _T1, _Tvec, _OP) \
+ void fun(BIN_ARGS(_T1)) \
+ { \
+ CV_INSTRUMENT_REGION(); \
+ bin_loop<_OP, _T1, _Tvec>(BIN_ARGS_PASS); \
+ }
+
+ #undef DEFINE_NOSIMD_FUN
+ #define DEFINE_NOSIMD_FUN(fun, _T1, _OP) \
+ void fun(BIN_ARGS(_T1)) \
+ { \
+ CV_INSTRUMENT_REGION(); \
+ bin_loop_nosimd<_OP, _T1, v_float64>(BIN_ARGS_PASS); \
+ }
+
+ DEFINE_SIMD_ALL(add, op_add)
+ DEFINE_SIMD_ALL(sub, op_sub)
+
+ DEFINE_SIMD_ALL(min, op_min)
+ DEFINE_SIMD_ALL(max, op_max)
+
+ DEFINE_SIMD_ALL(absdiff, op_absdiff)
+
+ DEFINE_SIMD_U8(or, op_or)
+ DEFINE_SIMD_U8(xor, op_xor)
+ DEFINE_SIMD_U8(and, op_and)
+
+ // "not" takes a single source, so it is an exception here.
+ // We could use the macros for it as well, but implementing it explicitly
+ // makes it clearer how the "DEFINE_SIMD_*" macros work.
+
+ #if defined(ARITHM_DECLARATIONS_ONLY) || defined(ARITHM_DEFINITIONS_ONLY)
+ void not8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height);
+ #endif
+ #ifdef ARITHM_DEFINITIONS_ONLY
+ void not8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height)
+ {
+ CV_INSTRUMENT_REGION();
+ bin_loop<op_not, uchar, v_uint8>(src1, step1, src2, step2, dst, step, width, height);
+ }
+ #endif
+ #ifdef ARITHM_DISPATCHING_ONLY
+ void not8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void*)
+ {
+ CV_INSTRUMENT_REGION();
+ CALL_HAL(not8u, cv_hal_not8u, src1, step1, dst, step, width, height)
+ ARITHM_CALL_IPP(arithm_ipp_not8u, src1, step1, dst, step, width, height)
+ CV_CPU_DISPATCH(not8u, (src1, step1, src2, step2, dst, step, width, height), CV_CPU_DISPATCH_MODES_ALL);
+ }
+ #endif
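+
+ // Note: the HAL/IPP entry points for "not" are unary, so src2/step2 are
+ // dropped when forwarding to them, while the SIMD kernel keeps the common
+ // binary signature.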
+
+ //=======================================
+ // Compare
+ //=======================================
+
+ #ifdef ARITHM_DEFINITIONS_ONLY
+
+ ///////////////////////////// Operations //////////////////////////////////
+
+ template<typename T1, typename Tvec>
+ struct op_cmplt
+ {
+ static inline Tvec r(const Tvec& a, const Tvec& b)
+ { return a < b; }
+ static inline uchar r(T1 a, T1 b)
+ { return (uchar)-(int)(a < b); }
+ };
+
+ template<typename T1, typename Tvec>
+ struct op_cmple
+ {
+ static inline Tvec r(const Tvec& a, const Tvec& b)
+ { return a <= b; }
+ static inline uchar r(T1 a, T1 b)
+ { return (uchar)-(int)(a <= b); }
+ };
+
+ template<typename T1, typename Tvec>
+ struct op_cmpeq
+ {
+ static inline Tvec r(const Tvec& a, const Tvec& b)
+ { return a == b; }
+ static inline uchar r(T1 a, T1 b)
+ { return (uchar)-(int)(a == b); }
+ };
+
+ template<typename T1, typename Tvec>
+ struct op_cmpne
+ {
+ static inline Tvec r(const Tvec& a, const Tvec& b)
+ { return a != b; }
+ static inline uchar r(T1 a, T1 b)
+ { return (uchar)-(int)(a != b); }
+ };
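+
+ // The scalar fallbacks return 0xFF for true and 0x00 for false via
+ // (uchar)-(int)(cond), matching the all-ones/all-zeros lane masks
+ // produced by the vector comparisons.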
+
+ //////////////////////////// Loaders /////////////////////////////////
+
+ #if CV_SIMD
+ // todo: add support for RW alignment & stream
+ template<int nload, template<typename T1, typename Tvec> class OP, typename T1, typename Tvec>
+ struct cmp_loader_n
+ {
+ void l(const T1* src1, const T1* src2, uchar* dst);
+ };
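+
+ // The specializations below are selected by sizeof(T1): each call consumes
+ // sizeof(T1) source vectors and collapses the wide comparison masks into a
+ // single uchar vector with v_pack_b.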
+
+ template<template<typename T1, typename Tvec> class OP, typename T1, typename Tvec>
+ struct cmp_loader_n<sizeof(uchar), OP, T1, Tvec>
+ {
+ typedef OP<T1, Tvec> op;
+
+ static inline void l(const T1* src1, const T1* src2, uchar* dst)
+ {
+ Tvec a = vx_load(src1);
+ Tvec b = vx_load(src2);
+ v_store(dst, v_reinterpret_as_u8(op::r(a, b)));
+ }
+ };
+
+ template<template<typename T1, typename Tvec> class OP, typename T1, typename Tvec>
+ struct cmp_loader_n<sizeof(ushort), OP, T1, Tvec>
+ {
+ typedef OP<T1, Tvec> op;
+ enum {step = Tvec::nlanes};
+
+ static inline void l(const T1* src1, const T1* src2, uchar* dst)
+ {
+ Tvec c0 = op::r(vx_load(src1), vx_load(src2));
+ Tvec c1 = op::r(vx_load(src1 + step), vx_load(src2 + step));
+ v_store(dst, v_pack_b(v_reinterpret_as_u16(c0), v_reinterpret_as_u16(c1)));
+ }
+ };
+
+ template<template<typename T1, typename Tvec> class OP, typename T1, typename Tvec>
+ struct cmp_loader_n<sizeof(unsigned), OP, T1, Tvec>
+ {
+ typedef OP<T1, Tvec> op;
+ enum {step = Tvec::nlanes};
+
+ static inline void l(const T1* src1, const T1* src2, uchar* dst)
+ {
+ v_uint32 c0 = v_reinterpret_as_u32(op::r(vx_load(src1), vx_load(src2)));
+ v_uint32 c1 = v_reinterpret_as_u32(op::r(vx_load(src1 + step), vx_load(src2 + step)));
+ v_uint32 c2 = v_reinterpret_as_u32(op::r(vx_load(src1 + step * 2), vx_load(src2 + step * 2)));
+ v_uint32 c3 = v_reinterpret_as_u32(op::r(vx_load(src1 + step * 3), vx_load(src2 + step * 3)));
+ v_store(dst, v_pack_b(c0, c1, c2, c3));
+ }
+ };
+
+ template<template<typename T1, typename Tvec> class OP, typename T1, typename Tvec>
+ struct cmp_loader_n<sizeof(double), OP, T1, Tvec>
+ {
+ typedef OP<T1, Tvec> op;
+ enum {step = Tvec::nlanes};
+
+ static inline void l(const T1* src1, const T1* src2, uchar* dst)
+ {
+ v_uint64 c0 = v_reinterpret_as_u64(op::r(vx_load(src1), vx_load(src2)));
+ v_uint64 c1 = v_reinterpret_as_u64(op::r(vx_load(src1 + step), vx_load(src2 + step)));
+ v_uint64 c2 = v_reinterpret_as_u64(op::r(vx_load(src1 + step * 2), vx_load(src2 + step * 2)));
+ v_uint64 c3 = v_reinterpret_as_u64(op::r(vx_load(src1 + step * 3), vx_load(src2 + step * 3)));
+
+ v_uint64 c4 = v_reinterpret_as_u64(op::r(vx_load(src1 + step * 4), vx_load(src2 + step * 4)));
+ v_uint64 c5 = v_reinterpret_as_u64(op::r(vx_load(src1 + step * 5), vx_load(src2 + step * 5)));
+ v_uint64 c6 = v_reinterpret_as_u64(op::r(vx_load(src1 + step * 6), vx_load(src2 + step * 6)));
+ v_uint64 c7 = v_reinterpret_as_u64(op::r(vx_load(src1 + step * 7), vx_load(src2 + step * 7)));
+ v_store(dst, v_pack_b(c0, c1, c2, c3, c4, c5, c6, c7));
+ }
+ };
+
+ #endif // CV_SIMD
+
+ //////////////////////////// Loops /////////////////////////////////
+
+ template<template<typename T1, typename Tvec> class OP, typename T1, typename Tvec>
+ static void cmp_loop(const T1* src1, size_t step1, const T1* src2, size_t step2, uchar* dst, size_t step, int width, int height)
+ {
+ typedef OP<T1, Tvec> op;
+ #if CV_SIMD
+ typedef cmp_loader_n<sizeof(T1), OP, T1, Tvec> ldr;
+ enum {wide_step = Tvec::nlanes * sizeof(T1)};
+ #endif // CV_SIMD
+
+ step1 /= sizeof(T1);
+ step2 /= sizeof(T1);
+
+ for (; height--; src1 += step1, src2 += step2, dst += step)
+ {
+ int x = 0;
+
+ #if CV_SIMD
+ for (; x <= width - wide_step; x += wide_step)
+ {
+ ldr::l(src1 + x, src2 + x, dst + x);
+ }
+ #endif // CV_SIMD
+
+ #if CV_ENABLE_UNROLLED || CV_SIMD_WIDTH > 16
+ for (; x <= width - 4; x += 4)
+ {
+ uchar t0 = op::r(src1[x], src2[x]);
+ uchar t1 = op::r(src1[x + 1], src2[x + 1]);
+ dst[x] = t0; dst[x + 1] = t1;
+
+ t0 = op::r(src1[x + 2], src2[x + 2]);
+ t1 = op::r(src1[x + 3], src2[x + 3]);
+ dst[x + 2] = t0; dst[x + 3] = t1;
+ }
+ #endif
+
+ for (; x < width; x++)
+ dst[x] = op::r(src1[x], src2[x]);
+ }
+
+ vx_cleanup();
+ }
+
+ template<typename T1, typename Tvec>
+ static void cmp_loop(const T1* src1, size_t step1, const T1* src2, size_t step2,
+ uchar* dst, size_t step, int width, int height, int cmpop)
+ {
+ switch(cmpop)
+ {
+ case CMP_LT:
+ cmp_loop<op_cmplt, T1, Tvec>(src1, step1, src2, step2, dst, step, width, height);
+ break;
+ case CMP_GT:
+ cmp_loop<op_cmplt, T1, Tvec>(src2, step2, src1, step1, dst, step, width, height);
+ break;
+ case CMP_LE:
+ cmp_loop<op_cmple, T1, Tvec>(src1, step1, src2, step2, dst, step, width, height);
+ break;
+ case CMP_GE:
+ cmp_loop<op_cmple, T1, Tvec>(src2, step2, src1, step1, dst, step, width, height);
+ break;
+ case CMP_EQ:
+ cmp_loop<op_cmpeq, T1, Tvec>(src1, step1, src2, step2, dst, step, width, height);
+ break;
+ default:
+ CV_Assert(cmpop == CMP_NE);
+ cmp_loop<op_cmpne, T1, Tvec>(src1, step1, src2, step2, dst, step, width, height);
+ break;
+ }
+ }
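+
+ // CMP_GT and CMP_GE are realized by swapping the two sources and reusing
+ // the "less than" / "less or equal" kernels.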
+
+ #if !CV_SIMD_64F
+ template< template<typename T1, typename Tvec> class OP, typename T1>
+ static void cmp_loop_nosimd(const T1* src1, size_t step1, const T1* src2, size_t step2, uchar* dst, size_t step, int width, int height)
+ {
+ typedef OP<T1, v_int32 /*dummy*/> op;
+
+ step1 /= sizeof(T1);
+ step2 /= sizeof(T1);
+
+ for (; height--; src1 += step1, src2 += step2, dst += step)
+ {
+ int x = 0;
+
+ for (; x <= width - 4; x += 4)
+ {
+ uchar t0 = op::r(src1[x], src2[x]);
+ uchar t1 = op::r(src1[x + 1], src2[x + 1]);
+ dst[x] = t0; dst[x + 1] = t1;
+
+ t0 = op::r(src1[x + 2], src2[x + 2]);
+ t1 = op::r(src1[x + 3], src2[x + 3]);
+ dst[x + 2] = t0; dst[x + 3] = t1;
+ }
+
+ for (; x < width; x++)
+ dst[x] = op::r(src1[x], src2[x]);
+ }
+ }
+ static void cmp_loop_nosimd(const double* src1, size_t step1, const double* src2, size_t step2,
+ uchar* dst, size_t step, int width, int height, int cmpop)
+ {
+ switch(cmpop)
+ {
+ case CMP_LT:
+ cmp_loop_nosimd<op_cmplt, double>(src1, step1, src2, step2, dst, step, width, height);
+ break;
+ case CMP_GT:
+ cmp_loop_nosimd<op_cmplt, double>(src2, step2, src1, step1, dst, step, width, height);
+ break;
+ case CMP_LE:
+ cmp_loop_nosimd<op_cmple, double>(src1, step1, src2, step2, dst, step, width, height);
+ break;
+ case CMP_GE:
+ cmp_loop_nosimd<op_cmple, double>(src2, step2, src1, step1, dst, step, width, height);
+ break;
+ case CMP_EQ:
+ cmp_loop_nosimd<op_cmpeq, double>(src1, step1, src2, step2, dst, step, width, height);
+ break;
+ default:
+ CV_Assert(cmpop == CMP_NE);
+ cmp_loop_nosimd<op_cmpne, double>(src1, step1, src2, step2, dst, step, width, height);
+ break;
+ }
+ }
+ #endif // !CV_SIMD_64F
+
+ #endif // ARITHM_DEFINITIONS_ONLY
+
+ /////////////////////////////////////////////////////////////////////////////////////////////
+
+ #ifndef SIMD_GUARD
+ #define CMP_ARGS(_T1) const _T1* src1, size_t step1, const _T1* src2, size_t step2, \
+ uchar* dst, size_t step, int width, int height
+
+ #define CMP_ARGS_PASS src1, step1, src2, step2, dst, step, width, height
+ #endif // SIMD_GUARD
+
+ #undef DECLARE_SIMD_FUN
+ #define DECLARE_SIMD_FUN(fun, _T1) void fun(CMP_ARGS(_T1), int cmpop);
+
+ #undef DISPATCH_SIMD_FUN
+ #define DISPATCH_SIMD_FUN(fun, _T1, _Tvec, ...) \
+ void fun(CMP_ARGS(_T1), void* _cmpop) \
+ { \
+ CV_INSTRUMENT_REGION(); \
+ CALL_HAL(fun, __CV_CAT(cv_hal_, fun), CMP_ARGS_PASS, *(int*)_cmpop) \
+ ARITHM_CALL_IPP(__CV_CAT(arithm_ipp_, fun), CMP_ARGS_PASS, *(int*)_cmpop) \
+ CV_CPU_DISPATCH(fun, (CMP_ARGS_PASS, *(int*)_cmpop), CV_CPU_DISPATCH_MODES_ALL); \
+ }
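+
+ // For compare, the generic void* extra parameter carries the comparison
+ // code rather than a scalar, hence the *(int*)_cmpop casts above.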
+
+ #undef DEFINE_SIMD_FUN
+ #define DEFINE_SIMD_FUN(fun, _T1, _Tvec, ...) \
+ void fun(CMP_ARGS(_T1), int cmpop) \
+ { \
+ CV_INSTRUMENT_REGION(); \
+ cmp_loop<_T1, _Tvec>(CMP_ARGS_PASS, cmpop); \
+ }
+
+ #undef DEFINE_NOSIMD_FUN
+ #define DEFINE_NOSIMD_FUN(fun, _T1, _Tvec, ...) \
+ void fun(CMP_ARGS(_T1), int cmpop) \
+ { \
+ CV_INSTRUMENT_REGION(); \
+ cmp_loop_nosimd(CMP_ARGS_PASS, cmpop); \
+ }
+
+ // TODO: avoid defining the dispatcher functions through macros for cases like this
+ DEFINE_SIMD_ALL(cmp)
+
+ //=========================================================================
+ // scaling helpers for single and dual source
+ //
+ // Dual: Multiply, Div, AddWeighted
+ //
+ // Single: Reciprocal
+ //
+ //=========================================================================
+
+ #ifdef ARITHM_DEFINITIONS_ONLY
+
+ //////////////////////////// Loaders ///////////////////////////////
+
+ #if CV_SIMD
+ // todo: add support for RW alignment & stream
+ template<int nload, template<typename T1, typename T2, typename Tvec> class OP, typename T1, typename T2, typename Tvec>
+ struct scalar_loader_n
+ {
+ void l(const T1* src1, const T1* src2, const T2* scalar, T1* dst);
+ // single source
+ void l(const T1* src1, const T2* scalar, T1* dst);
+ };
+
+ template<template<typename T1, typename T2, typename Tvec> class OP, typename T1, typename T2, typename Tvec>
+ struct scalar_loader_n<sizeof(uchar), OP, T1, T2, Tvec>
+ {
+ typedef OP<T1, T2, v_int16> op;
+
+ static inline void l(const T1* src1, const T1* src2, const T2* scalar, T1* dst)
+ {
+ v_int16 v_src1 = v_reinterpret_as_s16(vx_load_expand(src1));
+ v_int16 v_src2 = v_reinterpret_as_s16(vx_load_expand(src2));
+
+ v_int32 t0, t1, t2, t3;
+ v_expand(v_src1, t0, t2);
+ v_expand(v_src2, t1, t3);
+
+ v_float32 f0, f1, f2, f3;
+ f0 = v_cvt_f32(t0);
+ f1 = v_cvt_f32(t1);
+ f2 = v_cvt_f32(t2);
+ f3 = v_cvt_f32(t3);
+
+ f0 = op::r(f0, f1, scalar);
+ f2 = op::r(f2, f3, scalar);
+
+ v_int32 r0 = v_round(f0);
+ v_int32 r1 = v_round(f2);
+
+ store(dst, v_src2, r0, r1);
+ }
+
+ static inline void l(const T1* src1, const T2* scalar, T1* dst)
+ {
+ v_int16 v_src1 = v_reinterpret_as_s16(vx_load_expand(src1));
+
+ v_int32 t0, t1;
+ v_expand(v_src1, t0, t1);
+
+ v_float32 f0, f1;
+ f0 = v_cvt_f32(t0);
+ f1 = v_cvt_f32(t1);
+
+ f0 = op::r(f0, scalar);
+ f1 = op::r(f1, scalar);
+
+ v_int32 r0 = v_round(f0);
+ v_int32 r1 = v_round(f1);
+
+ store(dst, v_src1, r0, r1);
+ }
+
+ static inline void store(uchar* dst, const v_int16& src, const v_int32& a, const v_int32& b)
+ {
+ v_pack_u_store(dst, op::pre(src, v_pack(a, b)));
+ }
+ static inline void store(schar* dst, const v_int16& src, const v_int32& a, const v_int32& b)
+ {
+ v_pack_store(dst, op::pre(src, v_pack(a, b)));
+ }
+ };
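+
+ // Pipeline for 8-bit types: expand to s32, convert to f32, apply the
+ // scaled operation, round back to s32 and narrow with saturation.
+ // op::pre() lets division zero out lanes whose denominator was zero
+ // before the final pack.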
+
+ template<template<typename T1, typename T2, typename Tvec> class OP, typename T1, typename T2, typename Tvec>
+ struct scalar_loader_n<sizeof(ushort), OP, T1, T2, Tvec>
+ {
+ typedef typename V_RegTraits<Tvec>::w_reg Twvec;
+ typedef OP<T1, T2, Tvec> op;
+
+ static inline void l(const T1* src1, const T1* src2, const T2* scalar, T1* dst)
+ {
+ Tvec v_src1 = vx_load(src1);
+ Tvec v_src2 = vx_load(src2);
+
+ Twvec t0, t1, t2, t3;
+ v_expand(v_src1, t0, t2);
+ v_expand(v_src2, t1, t3);
+
+ v_float32 f0, f1, f2, f3;
+ f0 = v_cvt_f32(v_reinterpret_as_s32(t0));
+ f1 = v_cvt_f32(v_reinterpret_as_s32(t1));
+ f2 = v_cvt_f32(v_reinterpret_as_s32(t2));
+ f3 = v_cvt_f32(v_reinterpret_as_s32(t3));
+
+ f0 = op::r(f0, f1, scalar);
+ f2 = op::r(f2, f3, scalar);
+
+ v_int32 r0 = v_round(f0);
+ v_int32 r1 = v_round(f2);
+
+ store(dst, v_src2, r0, r1);
+ }
+
+ static inline void l(const T1* src1, const T2* scalar, T1* dst)
+ {
+ Tvec v_src1 = vx_load(src1);
+
+ Twvec t0, t1;
+ v_expand(v_src1, t0, t1);
+
+ v_float32 f0, f1;
+ f0 = v_cvt_f32(v_reinterpret_as_s32(t0));
+ f1 = v_cvt_f32(v_reinterpret_as_s32(t1));
+
+ f0 = op::r(f0, scalar);
+ f1 = op::r(f1, scalar);
+
+ v_int32 r0 = v_round(f0);
+ v_int32 r1 = v_round(f1);
+
+ store(dst, v_src1, r0, r1);
+ }
+
+ static inline void store(ushort* dst, const Tvec& src, const v_int32& a, const v_int32& b)
+ {
+ v_store(dst, op::pre(src, v_pack_u(a, b)));
+ }
+ static inline void store(short* dst, const Tvec& src, const v_int32& a, const v_int32& b)
+ {
+ v_store(dst, op::pre(src, v_pack(a, b)));
+ }
+ };
+
+ template<template<typename T1, typename T2, typename Tvec> class OP, typename T2>
+ struct scalar_loader_n<sizeof(int), OP, int, T2, v_int32>
+ {
+ typedef OP<int, T2, v_int32> op;
+ enum {step = v_int32::nlanes};
+
+ static inline void l(const int* src1, const int* src2, const T2* scalar, int* dst)
+ {
+ v_int32 v_src1 = vx_load(src1);
+ v_int32 v_src2 = vx_load(src2);
+ v_int32 v_src1s = vx_load(src1 + step);
+ v_int32 v_src2s = vx_load(src2 + step);
+
+ v_float32 f0, f1, f2, f3;
+ f0 = v_cvt_f32(v_reinterpret_as_s32(v_src1));
+ f1 = v_cvt_f32(v_reinterpret_as_s32(v_src2));
+ f2 = v_cvt_f32(v_reinterpret_as_s32(v_src1s));
+ f3 = v_cvt_f32(v_reinterpret_as_s32(v_src2s));
+
+ f0 = op::r(f0, f1, scalar);
+ f2 = op::r(f2, f3, scalar);
+
+ v_int32 r0 = v_round(f0);
+ v_int32 r1 = v_round(f2);
+
+ r0 = op::pre(v_src2, r0);
+ r1 = op::pre(v_src2s, r1);
+
+ v_store(dst, r0);
+ v_store(dst + step, r1);
+ }
+
+ static inline void l(const int* src1, const T2* scalar, int* dst)
+ {
+ v_int32 v_src1 = vx_load(src1);
+ v_int32 v_src1s = vx_load(src1 + step);
+
+ v_float32 f0, f1;
+ f0 = v_cvt_f32(v_src1);
+ f1 = v_cvt_f32(v_src1s);
+
+ f0 = op::r(f0, scalar);
+ f1 = op::r(f1, scalar);
+
+ v_int32 r0 = v_round(f0);
+ v_int32 r1 = v_round(f1);
+
+ r0 = op::pre(v_src1, r0);
+ r1 = op::pre(v_src1s, r1);
+
+ v_store(dst, r0);
+ v_store(dst + step, r1);
+ }
+ };
+
+ template<template<typename T1, typename T2, typename Tvec> class OP, typename T2>
+ struct scalar_loader_n<sizeof(float), OP, float, T2, v_float32>
+ {
+ typedef OP<float, T2, v_float32> op;
+ enum {step = v_float32::nlanes};
+
+ static inline void l(const float* src1, const float* src2, const T2* scalar, float* dst)
+ {
+ v_float32 v_src1 = vx_load(src1);
+ v_float32 v_src2 = vx_load(src2);
+ v_float32 v_src1s = vx_load(src1 + step);
+ v_float32 v_src2s = vx_load(src2 + step);
+
+ v_float32 r0 = op::r(v_src1, v_src2, scalar);
+ v_float32 r1 = op::r(v_src1s, v_src2s, scalar);
+
+ v_store(dst, r0);
+ v_store(dst + step, r1);
+ }
+
+ static inline void l(const float* src1, const T2* scalar, float* dst)
+ {
+ v_float32 v_src1 = vx_load(src1);
+ v_float32 v_src1s = vx_load(src1 + step);
+
+ v_float32 r0 = op::r(v_src1, scalar);
+ v_float32 r1 = op::r(v_src1s, scalar);
+
+ v_store(dst, r0);
+ v_store(dst + step, r1);
+ }
+ };
+ #endif // CV_SIMD
+
+ #if CV_SIMD_64F
+ template<template<typename T1, typename T2, typename Tvec> class OP>
+ struct scalar_loader_n<sizeof(int), OP, int, double, v_int32>
+ {
+ typedef OP<int, float, v_int32> op;
+ typedef OP<double, double, v_float64> op64;
+ enum {step = v_int32::nlanes};
+
+ static inline void l(const int* src1, const int* src2, const double* scalar, int* dst)
+ {
+ v_int32 v_src1 = vx_load(src1);
+ v_int32 v_src2 = vx_load(src2);
+ v_int32 v_src1s = vx_load(src1 + step);
+ v_int32 v_src2s = vx_load(src2 + step);
+
+ v_int32 r0 = r(v_src1, v_src2, scalar);
+ v_int32 r1 = r(v_src1s, v_src2s, scalar);
+
+ r0 = op::pre(v_src2, r0);
+ r1 = op::pre(v_src2s, r1);
+
+ v_store(dst, r0);
+ v_store(dst + step, r1);
+ }
+ static inline void l(const int* src1, const double* scalar, int* dst)
+ {
+ v_int32 v_src1 = vx_load(src1);
+ v_int32 v_src1s = vx_load(src1 + step);
+
+ v_int32 r0 = r(v_src1, scalar);
+ v_int32 r1 = r(v_src1s, scalar);
+
+ r0 = op::pre(v_src1, r0);
+ r1 = op::pre(v_src1s, r1);
+
+ v_store(dst, r0);
+ v_store(dst + step, r1);
+ }
+
+ static inline v_int32 r(const v_int32& a, const v_int32& b, const double* scalar)
+ {
+ v_float64 f0, f1, f2, f3;
+ f0 = v_cvt_f64(a);
+ f1 = v_cvt_f64_high(a);
+ f2 = v_cvt_f64(b);
+ f3 = v_cvt_f64_high(b);
+
+ v_float64 r0 = op64::r(f0, f2, scalar);
+ v_float64 r1 = op64::r(f1, f3, scalar);
+
+ return v_round(r0, r1);
+ }
+ static inline v_int32 r(const v_int32& a, const double* scalar)
+ {
+ v_float64 f0, f1;
+ f0 = v_cvt_f64(a);
+ f1 = v_cvt_f64_high(a);
+
+ v_float64 r0 = op64::r(f0, scalar);
+ v_float64 r1 = op64::r(f1, scalar);
+
+ return v_round(r0, r1);
+ }
+ };
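+
+ // In the CV_SIMD_64F loaders each 32-bit vector is processed as two
+ // v_float64 halves (v_cvt_f64 / v_cvt_f64_high) and recombined with the
+ // two-register forms of v_round / v_cvt_f32.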
+
+ template<template<typename T1, typename T2, typename Tvec> class OP>
+ struct scalar_loader_n<sizeof(float), OP, float, double, v_float32>
+ {
+ typedef OP<float, float, v_float32> op;
+ typedef OP<double, double, v_float64> op64;
+ enum {step = v_float32::nlanes};
+
+ static inline void l(const float* src1, const float* src2, const double* scalar, float* dst)
+ {
+ v_float32 v_src1 = vx_load(src1);
+ v_float32 v_src2 = vx_load(src2);
+ v_float32 v_src1s = vx_load(src1 + step);
+ v_float32 v_src2s = vx_load(src2 + step);
+
+ v_float32 r0 = r(v_src1, v_src2, scalar);
+ v_float32 r1 = r(v_src1s, v_src2s, scalar);
+
+ v_store(dst, r0);
+ v_store(dst + step, r1);
+ }
+ static inline void l(const float* src1, const double* scalar, float* dst)
+ {
+ v_float32 v_src1 = vx_load(src1);
+ v_float32 v_src1s = vx_load(src1 + step);
+
+ v_float32 r0 = r(v_src1, scalar);
+ v_float32 r1 = r(v_src1s, scalar);
+
+ v_store(dst, r0);
+ v_store(dst + step, r1);
+ }
+
+ static inline v_float32 r(const v_float32& a, const v_float32& b, const double* scalar)
+ {
+ v_float64 f0, f1, f2, f3;
+ f0 = v_cvt_f64(a);
+ f1 = v_cvt_f64_high(a);
+ f2 = v_cvt_f64(b);
+ f3 = v_cvt_f64_high(b);
+
+ v_float64 r0 = op64::r(f0, f2, scalar);
+ v_float64 r1 = op64::r(f1, f3, scalar);
+
+ return v_cvt_f32(r0, r1);
+ }
+ static inline v_float32 r(const v_float32& a, const double* scalar)
+ {
+ v_float64 f0, f1;
+ f0 = v_cvt_f64(a);
+ f1 = v_cvt_f64_high(a);
+
+ v_float64 r0 = op64::r(f0, scalar);
+ v_float64 r1 = op64::r(f1, scalar);
+
+ return v_cvt_f32(r0, r1);
+ }
+ };
+
+ template<template<typename T1, typename T2, typename Tvec> class OP>
+ struct scalar_loader_n<sizeof(double), OP, double, double, v_float64>
+ {
+ typedef OP<double, double, v_float64> op;
+ enum {step = v_float64::nlanes};
+
+ static inline void l(const double* src1, const double* src2, const double* scalar, double* dst)
+ {
+ v_float64 v_src1 = vx_load(src1);
+ v_float64 v_src2 = vx_load(src2);
+ v_float64 v_src1s = vx_load(src1 + step);
+ v_float64 v_src2s = vx_load(src2 + step);
+
+ v_float64 r0 = op::r(v_src1, v_src2, scalar);
+ v_float64 r1 = op::r(v_src1s, v_src2s, scalar);
+
+ v_store(dst, r0);
+ v_store(dst + step, r1);
+ }
+ static inline void l(const double* src1, const double* scalar, double* dst)
+ {
+ v_float64 v_src1 = vx_load(src1);
+ v_float64 v_src1s = vx_load(src1 + step);
+
+ v_float64 r0 = op::r(v_src1, scalar);
+ v_float64 r1 = op::r(v_src1s, scalar);
+
+ v_store(dst, r0);
+ v_store(dst + step, r1);
+ }
+ };
+ #endif // CV_SIMD_64F
+
+ //////////////////////////// Loops /////////////////////////////////
+
+ // dual source
+ template<template<typename T1, typename T2, typename Tvec> class OP, typename T1, typename T2, typename Tvec>
+ static void scalar_loop(const T1* src1, size_t step1, const T1* src2, size_t step2,
+ T1* dst, size_t step, int width, int height, const T2* scalar)
+ {
+ typedef OP<T1, T2, Tvec> op;
+ #if CV_SIMD
+ typedef scalar_loader_n<sizeof(T1), OP, T1, T2, Tvec> ldr;
+ const int wide_step = sizeof(T1) > sizeof(ushort) ? Tvec::nlanes * 2 :
+ sizeof(T1) == sizeof(uchar) ? Tvec::nlanes / 2 : Tvec::nlanes;
+ #endif // CV_SIMD
+
+ step1 /= sizeof(T1);
+ step2 /= sizeof(T1);
+ step /= sizeof(T1);
+
+ for (; height--; src1 += step1, src2 += step2, dst += step)
+ {
+ int x = 0;
+
+ #if CV_SIMD
+ for (; x <= width - wide_step; x += wide_step)
+ {
+ ldr::l(src1 + x, src2 + x, scalar, dst + x);
+ }
+ #endif // CV_SIMD
+
+ #if CV_ENABLE_UNROLLED || CV_SIMD_WIDTH > 16
+ for (; x <= width - 4; x += 4)
+ {
+ T1 t0 = op::r(src1[x], src2[x], scalar);
+ T1 t1 = op::r(src1[x + 1], src2[x + 1], scalar);
+ dst[x] = t0; dst[x + 1] = t1;
+
+ t0 = op::r(src1[x + 2], src2[x + 2], scalar);
+ t1 = op::r(src1[x + 3], src2[x + 3], scalar);
+ dst[x + 2] = t0; dst[x + 3] = t1;
+ }
+ #endif
+
+ for (; x < width; ++x)
+ dst[x] = op::r(src1[x], src2[x], scalar);
+ }
+
+ vx_cleanup();
+ }
+
+ // single source
+ template<template<typename T1, typename T2, typename Tvec> class OP, typename T1, typename T2, typename Tvec>
+ static void scalar_loop(const T1* src1, size_t step1, T1* dst, size_t step, int width, int height, const T2* scalar)
+ {
+ typedef OP<T1, T2, Tvec> op;
+ #if CV_SIMD
+ typedef scalar_loader_n<sizeof(T1), OP, T1, T2, Tvec> ldr;
+ const int wide_step = sizeof(T1) > sizeof(ushort) ? Tvec::nlanes * 2 :
+ sizeof(T1) == sizeof(uchar) ? Tvec::nlanes / 2 : Tvec::nlanes;
+ #endif // CV_SIMD
+
+ step1 /= sizeof(T1);
+ step /= sizeof(T1);
+
+ for (; height--; src1 += step1, dst += step)
+ {
+ int x = 0;
+
+ #if CV_SIMD
+ for (; x <= width - wide_step; x += wide_step)
+ {
+ ldr::l(src1 + x, scalar, dst + x);
+ }
+ #endif // CV_SIMD
+
+ #if CV_ENABLE_UNROLLED || CV_SIMD_WIDTH > 16
+ for (; x <= width - 4; x += 4)
+ {
+ T1 t0 = op::r(src1[x], scalar);
+ T1 t1 = op::r(src1[x + 1], scalar);
+ dst[x] = t0; dst[x + 1] = t1;
+
+ t0 = op::r(src1[x + 2], scalar);
+ t1 = op::r(src1[x + 3], scalar);
+ dst[x + 2] = t0; dst[x + 3] = t1;
+ }
+ #endif
+
+ for (; x < width; ++x)
+ dst[x] = op::r(src1[x], scalar);
+ }
+
+ vx_cleanup();
+ }
+
+ #if !CV_SIMD_64F
+ // dual source
+ template<template<typename T1, typename T2, typename Tvec> class OP, typename T1, typename T2, typename Tvec>
+ static void scalar_loop_nosimd(const T1* src1, size_t step1, const T1* src2, size_t step2,
+ T1* dst, size_t step, int width, int height, const T2* scalar)
+ {
+ typedef OP<T1, T2, Tvec> op;
+
+ step1 /= sizeof(T1);
+ step2 /= sizeof(T1);
+ step /= sizeof(T1);
+
+ for (; height--; src1 += step1, src2 += step2, dst += step)
+ {
+ int x = 0;
+
+ for (; x <= width - 4; x += 4)
+ {
+ T1 t0 = op::r(src1[x], src2[x], scalar);
+ T1 t1 = op::r(src1[x + 1], src2[x + 1], scalar);
+ dst[x] = t0; dst[x + 1] = t1;
+
+ t0 = op::r(src1[x + 2], src2[x + 2], scalar);
+ t1 = op::r(src1[x + 3], src2[x + 3], scalar);
+ dst[x + 2] = t0; dst[x + 3] = t1;
+ }
+
+ for (; x < width; ++x)
+ dst[x] = op::r(src1[x], src2[x], scalar);
+ }
+ }
+
+ // single source
+ template<template<typename T1, typename T2, typename Tvec> class OP, typename T1, typename T2, typename Tvec>
+ static void scalar_loop_nosimd(const T1* src1, size_t step1, T1* dst, size_t step, int width, int height, const T2* scalar)
+ {
+ typedef OP<T1, T2, Tvec> op;
+
+ step1 /= sizeof(T1);
+ step /= sizeof(T1);
+
+ for (; height--; src1 += step1, dst += step)
+ {
+ int x = 0;
+
+ for (; x <= width - 4; x += 4)
+ {
+ T1 t0 = op::r(src1[x], scalar);
+ T1 t1 = op::r(src1[x + 1], scalar);
+ dst[x] = t0; dst[x + 1] = t1;
+
+ t0 = op::r(src1[x + 2], scalar);
+ t1 = op::r(src1[x + 3], scalar);
+ dst[x + 2] = t0; dst[x + 3] = t1;
+ }
+
+ for (; x < width; ++x)
+ dst[x] = op::r(src1[x], scalar);
+ }
+ }
+
+ #define SCALAR_LOOP64F scalar_loop_nosimd
+ #else
+ #define SCALAR_LOOP64F scalar_loop
+ #endif // !CV_SIMD_64F
+
+ #endif // ARITHM_DEFINITIONS_ONLY
+
+ //=========================================================================
+ // Multiply
+ //=========================================================================
+
+ #ifdef ARITHM_DEFINITIONS_ONLY
+
+ ///////////////////////////// Operations //////////////////////////////////
+
+ template<typename T1, typename Tvec>
+ struct op_mul
+ {
+ static inline Tvec r(const Tvec& a, const Tvec& b)
+ { return a * b; }
+ static inline T1 r(T1 a, T1 b)
+ { return saturate_cast<T1>(a * b); }
+ };
+
+ template<typename T1, typename T2, typename Tvec>
+ struct op_mul_scale
+ {
+ static inline v_float32 r(const v_float32& a, const v_float32& b, const T2* scalar)
+ {
+ const v_float32 v_scalar = vx_setall_f32(*scalar);
+ return v_scalar * a * b;
+ }
+ static inline T1 r(T1 a, T1 b, const T2* scalar)
+ { return c_mul(a, b, *scalar); }
+ static inline Tvec pre(const Tvec&, const Tvec& res)
+ { return res; }
+ };
+
+ template<>
+ struct op_mul_scale<double, double, v_float64>
+ {
+ #if CV_SIMD_64F
+ static inline v_float64 r(const v_float64& a, const v_float64& b, const double* scalar)
+ {
+ const v_float64 v_scalar = vx_setall_f64(*scalar);
+ return v_scalar * a * b;
+ }
+ #endif
+ static inline double r(double a, double b, const double* scalar)
+ { return c_mul(a, b, *scalar); }
+ static inline v_float64 pre(const v_float64&, const v_float64& res)
+ { return res; }
+ };
+
+ //////////////////////////// Loops /////////////////////////////////
+
+ template<typename T1, typename Tvec>
+ static void mul_loop(const T1* src1, size_t step1, const T1* src2, size_t step2,
+ T1* dst, size_t step, int width, int height, const double* scalar)
+ {
+ float fscalar = (float)*scalar;
+ if (std::fabs(fscalar - 1.0f) <= FLT_EPSILON)
+ {
+ bin_loop<op_mul, T1, Tvec>(src1, step1, src2, step2, dst, step, width, height);
+ }
+ else
+ {
+ scalar_loop<op_mul_scale, T1, float, Tvec>(src1, step1, src2, step2,
+ dst, step, width, height, &fscalar);
+ }
+ }
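+
+ // For the 8/16-bit types above, the scale factor is applied in f32;
+ // mul_loop_d below keeps double precision for the 32-bit and 64-bit types.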
+
+ template<typename T1, typename Tvec>
+ static void mul_loop_d(const T1* src1, size_t step1, const T1* src2, size_t step2,
+ T1* dst, size_t step, int width, int height, const double* scalar)
+ {
+ if (std::fabs(*scalar - 1.0) <= FLT_EPSILON)
+ {
+ bin_loop<op_mul, T1, Tvec>(src1, step1, src2, step2, dst, step, width, height);
+ }
+ else
+ {
+ SCALAR_LOOP64F<op_mul_scale, T1, double, Tvec>(src1, step1, src2, step2,
+ dst, step, width, height, scalar);
+ }
+ }
+
+ template<>
+ void mul_loop_d<double, v_float64>(const double* src1, size_t step1, const double* src2, size_t step2,
+ double* dst, size_t step, int width, int height, const double* scalar)
+ {
+ if (*scalar == 1.0)
+ {
+ BIN_LOOP64F<op_mul, double, v_float64>(src1, step1, src2, step2, dst, step, width, height);
+ }
+ else
+ {
+ SCALAR_LOOP64F<op_mul_scale, double, double, v_float64>(src1, step1, src2, step2,
+ dst, step, width, height, scalar);
+ }
+ }
+
+ #endif // ARITHM_DEFINITIONS_ONLY
+
+ //////////////////////////////////////////////////////////////////////////
+
+ #undef SCALAR_ARGS
+ #define SCALAR_ARGS(_T1) const _T1* src1, size_t step1, const _T1* src2, size_t step2, \
+ _T1* dst, size_t step, int width, int height
+
+ #undef SCALAR_ARGS_PASS
+ #define SCALAR_ARGS_PASS src1, step1, src2, step2, dst, step, width, height
+
+ #undef DECLARE_SIMD_FUN
+ #define DECLARE_SIMD_FUN(fun, _T1) void fun(SCALAR_ARGS(_T1), const double* scalar);
+
+ #undef DISPATCH_SIMD_FUN
+ #define DISPATCH_SIMD_FUN(fun, _T1, _Tvec, ...) \
+ void fun(SCALAR_ARGS(_T1), void* scalar) \
+ { \
+ CV_INSTRUMENT_REGION(); \
+ CALL_HAL(fun, __CV_CAT(cv_hal_, fun), \
+ SCALAR_ARGS_PASS, *(const double*)scalar) \
+ ARITHM_CALL_IPP(__CV_CAT(arithm_ipp_, fun), \
+ SCALAR_ARGS_PASS, *(const double*)scalar) \
+ CV_CPU_DISPATCH(fun, (SCALAR_ARGS_PASS, (const double*)scalar), \
+ CV_CPU_DISPATCH_MODES_ALL); \
+ }
+
+ #undef DEFINE_SIMD_FUN
+ #define DEFINE_SIMD_FUN(fun, _T1, _Tvec, op) \
+ void fun(SCALAR_ARGS(_T1), const double* scalar) \
+ { \
+ CV_INSTRUMENT_REGION(); \
+ op<_T1, _Tvec>(SCALAR_ARGS_PASS, scalar); \
+ }
+
+ #undef DEFINE_NOSIMD_FUN
+ #define DEFINE_NOSIMD_FUN(fun, _T1, _OP) \
+ DEFINE_SIMD_FUN(fun, _T1, v_float64, _OP)
+
+ DEFINE_SIMD_SAT(mul, mul_loop)
+ DEFINE_SIMD_F32(mul, mul_loop_d)
+ DEFINE_SIMD_S32(mul, mul_loop_d)
+ DEFINE_SIMD_F64(mul, mul_loop_d)
+
+ //=========================================================================
+ // Div
+ //=========================================================================
+
+ #ifdef ARITHM_DEFINITIONS_ONLY
+
+ ///////////////////////////// Operations //////////////////////////////////
+
+ template<typename T1, typename Tvec>
+ struct op_div_f
+ {
+ static inline Tvec r(const Tvec& a, const Tvec& b)
+ { return a / b; }
+ static inline T1 r(T1 a, T1 b)
+ { return a / b; }
+ };
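+
+ // Plain IEEE division: float/double lanes are not zero-masked here.
+ // Integer division goes through op_div_scale, whose pre() zeroes the
+ // lanes with a zero denominator.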
+
+ template<typename T1, typename T2, typename Tvec>
+ struct op_div_scale
+ {
+ static inline v_float32 r(const v_float32& a, const v_float32& b, const T2* scalar)
+ {
+ const v_float32 v_scalar = vx_setall_f32(*scalar);
+ return a * v_scalar / b;
+ }
+ static inline Tvec pre(const Tvec& denom, const Tvec& res)
+ {
+ const Tvec v_zero = Tvec();
+ return v_select(denom == v_zero, v_zero, res);
+ }
+ static inline T1 r(T1 a, T1 denom, const T2* scalar)
+ {
+ CV_StaticAssert(std::numeric_limits<T1>::is_integer, "");
+ return denom != (T1)0 ? c_div(a, denom, *scalar) : (T1)0;
+ }
+ };
+
+ template<>
+ struct op_div_scale<float, float, v_float32>
+ {
+ static inline v_float32 r(const v_float32& a, const v_float32& b, const float* scalar)
+ {
+ const v_float32 v_scalar = vx_setall_f32(*scalar);
+ return a * v_scalar / b;
+ }
+ static inline float r(float a, float denom, const float* scalar)
+ { return c_div(a, denom, *scalar); }
+ };
+
+ template<>
+ struct op_div_scale<double, double, v_float64>
+ {
+ #if CV_SIMD_64F
+ static inline v_float64 r(const v_float64& a, const v_float64& b, const double* scalar)
+ {
+ const v_float64 v_scalar = vx_setall_f64(*scalar);
+ return a * v_scalar / b;
+ }
+ #endif
+ static inline double r(double a, double denom, const double* scalar)
+ { return c_div(a, denom, *scalar); }
+ };
+
+ //////////////////////////// Loops /////////////////////////////////
+
+ template<typename T1, typename Tvec>
+ static void div_loop(const T1* src1, size_t step1, const T1* src2, size_t step2,
+ T1* dst, size_t step, int width, int height, const double* scalar)
+ {
+ float fscalar = (float)*scalar;
+ // todo: add new intrinsics for integer divide
+ scalar_loop<op_div_scale, T1, float, Tvec>(src1, step1, src2, step2,
+ dst, step, width, height, &fscalar);
+ }
+
+ template<>
+ void div_loop<float, v_float32>(const float* src1, size_t step1, const float* src2, size_t step2,
+ float* dst, size_t step, int width, int height, const double* scalar)
+ {
+ float fscalar = (float)*scalar;
+ if (std::fabs(fscalar - 1.0f) <= FLT_EPSILON)
+ {
+ bin_loop<op_div_f, float, v_float32>(src1, step1, src2, step2, dst, step, width, height);
+ }
+ else
+ {
+ SCALAR_LOOP64F<op_div_scale, float, float, v_float32>(src1, step1, src2, step2,
+ dst, step, width, height, &fscalar);
+ }
+ }
+
+ template<>
+ void div_loop<double, v_float64>(const double* src1, size_t step1, const double* src2, size_t step2,
+ double* dst, size_t step, int width, int height, const double* scalar)
+ {
+ if (*scalar == 1.0)
+ {
+ BIN_LOOP64F<op_div_f, double, v_float64>(src1, step1, src2, step2, dst, step, width, height);
+ }
+ else
+ {
+ SCALAR_LOOP64F<op_div_scale, double, double, v_float64>(src1, step1, src2, step2,
+ dst, step, width, height, scalar);
+ }
+ }
+
+ #endif // ARITHM_DEFINITIONS_ONLY
+
+ //////////////////////////////////////////////////////////////////////////
+
+ DEFINE_SIMD_ALL(div, div_loop)
+
+ //=========================================================================
+ // AddWeighted
+ //=========================================================================
+
+ #ifdef ARITHM_DEFINITIONS_ONLY
+
+ ///////////////////////////// Operations //////////////////////////////////
+
+ ///// Add scale
+ template<typename T1, typename T2, typename Tvec>
+ struct op_add_scale
+ {
+ static inline v_float32 r(const v_float32& a, const v_float32& b, const T2* scalar)
+ {
+ const v_float32 v_alpha = vx_setall_f32(*scalar);
+ return v_fma(a, v_alpha, b);
+ }
+ static inline T1 r(T1 a, T1 b, const T2* scalar)
+ { return c_add(a, b, *scalar); }
+ static inline Tvec pre(const Tvec&, const Tvec& res)
+ { return res; }
+ };
+
+ template<>
+ struct op_add_scale<double, double, v_float64>
+ {
+ #if CV_SIMD_64F
+ static inline v_float64 r(const v_float64& a, const v_float64& b, const double* scalar)
+ {
+ const v_float64 v_alpha = vx_setall_f64(*scalar);
+ return v_fma(a, v_alpha, b);
+ }
+ #endif
+ static inline double r(double a, double b, const double* scalar)
+ { return c_add(a, b, *scalar); }
+ static inline v_float64 pre(const v_float64&, const v_float64& res)
+ { return res; }
+ };
+
+ ///// Weighted sum
+ template<typename T1, typename T2, typename Tvec>
+ struct op_add_weighted
+ {
+ static inline v_float32 r(const v_float32& a, const v_float32& b, const T2* scalars)
+ {
+ const v_float32 v_alpha = vx_setall_f32(scalars[0]);
+ const v_float32 v_beta = vx_setall_f32(scalars[1]);
+ const v_float32 v_gamma = vx_setall_f32(scalars[2]);
+ return v_fma(a, v_alpha, v_fma(b, v_beta, v_gamma));
+ }
+ static inline T1 r(T1 a, T1 b, const T2* scalars)
+ { return c_add(a, b, scalars[0], scalars[1], scalars[2]); }
+ static inline Tvec pre(const Tvec&, const Tvec& res)
+ { return res; }
+ };
+
+ template<>
+ struct op_add_weighted<double, double, v_float64>
+ {
+ #if CV_SIMD_64F
+ static inline v_float64 r(const v_float64& a, const v_float64& b, const double* scalars)
+ {
+ const v_float64 v_alpha = vx_setall_f64(scalars[0]);
+ const v_float64 v_beta = vx_setall_f64(scalars[1]);
+ const v_float64 v_gamma = vx_setall_f64(scalars[2]);
+ return v_fma(a, v_alpha, v_fma(b, v_beta, v_gamma));
+ }
+ #endif
+ static inline double r(double a, double b, const double* scalars)
+ { return c_add(a, b, scalars[0], scalars[1], scalars[2]); }
+ static inline v_float64 pre(const v_float64&, const v_float64& res)
+ { return res; }
+ };
+
+ //////////////////////////// Loops /////////////////////////////////
+
+ template<typename T1, typename Tvec>
+ static void add_weighted_loop(const T1* src1, size_t step1, const T1* src2, size_t step2,
+ T1* dst, size_t step, int width, int height, const double* scalars)
+ {
+ float fscalars[] = {(float)scalars[0], (float)scalars[1], (float)scalars[2]};
+ if (fscalars[1] == 1.0f && fscalars[2] == 0.0f)
+ {
+ scalar_loop<op_add_scale, T1, float, Tvec>(src1, step1, src2, step2,
+ dst, step, width, height, fscalars);
+ }
+ else
+ {
+ scalar_loop<op_add_weighted, T1, float, Tvec>(src1, step1, src2, step2,
+ dst, step, width, height, fscalars);
+ }
+ }
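+
+ // When beta == 1 and gamma == 0 the weighted sum reduces to
+ // dst = src1 * alpha + src2, so the cheaper op_add_scale kernel is used.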
+
+ template<typename T1, typename Tvec>
+ static void add_weighted_loop_d(const T1* src1, size_t step1, const T1* src2, size_t step2,
+ T1* dst, size_t step, int width, int height, const double* scalars)
+ {
+ if (scalars[1] == 1.0 && scalars[2] == 0.0)
+ {
+ SCALAR_LOOP64F<op_add_scale, T1, double, Tvec>(src1, step1, src2, step2,
+ dst, step, width, height, scalars);
+ }
+ else
+ {
+ SCALAR_LOOP64F<op_add_weighted, T1, double, Tvec>(src1, step1, src2, step2,
+ dst, step, width, height, scalars);
+ }
+ }
+
+ template<>
+ void add_weighted_loop_d<double, v_float64>(const double* src1, size_t step1, const double* src2, size_t step2,
+ double* dst, size_t step, int width, int height, const double* scalars)
+ {
+ if (scalars[1] == 1.0 && scalars[2] == 0.0)
+ {
+ SCALAR_LOOP64F<op_add_scale, double, double, v_float64>(src1, step1, src2, step2,
+ dst, step, width, height, scalars);
+ }
+ else
+ {
+ SCALAR_LOOP64F<op_add_weighted, double, double, v_float64>(src1, step1, src2, step2,
+ dst, step, width, height, scalars);
+ }
+ }
+
+ #endif // ARITHM_DEFINITIONS_ONLY
+
+ //////////////////////////////////////////////////////////////////////////
+
+ #undef DISPATCH_SIMD_FUN
+ #define DISPATCH_SIMD_FUN(fun, _T1, _Tvec, ...) \
+ void fun(SCALAR_ARGS(_T1), void* scalar) \
+ { \
+ CV_INSTRUMENT_REGION(); \
+ CALL_HAL(fun, __CV_CAT(cv_hal_, fun), \
+ SCALAR_ARGS_PASS, (const double*)scalar) \
+ ARITHM_CALL_IPP(__CV_CAT(arithm_ipp_, fun), \
+ SCALAR_ARGS_PASS, (const double*)scalar) \
+ CV_CPU_DISPATCH(fun, (SCALAR_ARGS_PASS, (const double*)scalar), \
+ CV_CPU_DISPATCH_MODES_ALL); \
+ }
+
+ DEFINE_SIMD_SAT(addWeighted, add_weighted_loop)
+ DEFINE_SIMD_S32(addWeighted, add_weighted_loop_d)
+ DEFINE_SIMD_F32(addWeighted, add_weighted_loop_d)
+ DEFINE_SIMD_F64(addWeighted, add_weighted_loop_d)
+
+ //=======================================
+ // Reciprocal
+ //=======================================
+
+ #ifdef ARITHM_DEFINITIONS_ONLY
+
+ ///////////////////////////// Operations //////////////////////////////////
+
+ template<typename T1, typename T2, typename Tvec>
+ struct op_recip
+ {
+ static inline v_float32 r(const v_float32& a, const T2* scalar)
+ {
+ const v_float32 v_scalar = vx_setall_f32(*scalar);
+ return v_scalar / a;
+ }
+ static inline Tvec pre(const Tvec& denom, const Tvec& res)
+ {
+ const Tvec v_zero = Tvec();
+ return v_select(denom == v_zero, v_zero, res);
+ }
+ static inline T1 r(T1 denom, const T2* scalar)
+ {
+ CV_StaticAssert(std::numeric_limits<T1>::is_integer, "");
+ return denom != (T1)0 ? c_div(*scalar, denom) : (T1)0;
+ }
+ };
+
+ template<>
+ struct op_recip<float, float, v_float32>
+ {
+ static inline v_float32 r(const v_float32& a, const float* scalar)
+ {
+ const v_float32 v_scalar = vx_setall_f32(*scalar);
+ return v_scalar / a;
+ }
+ static inline float r(float denom, const float* scalar)
+ { return c_div(*scalar, denom); }
+ };
+
+ template<>
+ struct op_recip<double, double, v_float64>
+ {
+ #if CV_SIMD_64F
+ static inline v_float64 r(const v_float64& a, const double* scalar)
+ {
+ const v_float64 v_scalar = vx_setall_f64(*scalar);
+ return v_scalar / a;
+ }
+ #endif
+ static inline double r(double denom, const double* scalar)
+ { return c_div(*scalar, denom); }
+ };
+
+ //////////////////////////// Loops /////////////////////////////////
+
+ template<typename T1, typename Tvec>
+ static void recip_loop(const T1* src1, size_t step1, T1* dst, size_t step, int width, int height, const double* scalar)
+ {
+ float fscalar = (float)*scalar;
+ scalar_loop<op_recip, T1, float, Tvec>(src1, step1, dst, step, width, height, &fscalar);
+ }
+
+ template<>
+ void recip_loop<double, v_float64>(const double* src1, size_t step1, double* dst, size_t step, int width, int height, const double* scalar)
+ {
+ SCALAR_LOOP64F<op_recip, double, double, v_float64>(src1, step1, dst, step, width, height, scalar);
+ }
+
+ #endif // ARITHM_DEFINITIONS_ONLY
+
+ //////////////////////////////////////////////////////////////////////////
+
+ #undef SCALAR_ARGS
+ #define SCALAR_ARGS(_T1) const _T1* src1, size_t step1, _T1* dst, size_t step, int width, int height
+
+ #undef SCALAR_ARGS_PASS
+ #define SCALAR_ARGS_PASS src1, step1, dst, step, width, height
+
+ #undef DISPATCH_SIMD_FUN
+ #define DISPATCH_SIMD_FUN(fun, _T1, _Tvec, ...) \
+ void fun(const _T1*, size_t, SCALAR_ARGS(_T1), void* scalar) \
+ { \
+ CV_INSTRUMENT_REGION(); \
+ CALL_HAL(fun, __CV_CAT(cv_hal_, fun), \
+ SCALAR_ARGS_PASS, *(const double*)scalar) \
+ ARITHM_CALL_IPP(__CV_CAT(arithm_ipp_, fun), \
+ SCALAR_ARGS_PASS, *(const double*)scalar) \
+ CV_CPU_DISPATCH(fun, (SCALAR_ARGS_PASS, (const double*)scalar), \
+ CV_CPU_DISPATCH_MODES_ALL); \
+ }
+
+ DEFINE_SIMD_ALL(recip, recip_loop)
+
+ #ifndef ARITHM_DISPATCHING_ONLY
+ CV_CPU_OPTIMIZATION_NAMESPACE_END
+ #endif
+
+ #ifndef SIMD_GUARD
+ #define SIMD_GUARD
+ #endif
+
+ }} // cv::hal::
--- /dev/null
+ /*M///////////////////////////////////////////////////////////////////////////////////////
+ //
+ // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+ //
+ // By downloading, copying, installing or using the software you agree to this license.
+ // If you do not agree to this license, do not download, install,
+ // copy or use the software.
+ //
+ //
+ // License Agreement
+ // For Open Source Computer Vision Library
+ //
+ // Copyright (C) 2000-2008, 2018, Intel Corporation, all rights reserved.
+ // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+ // Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
+ // Third party copyrights are property of their respective owners.
+ //
+ // Redistribution and use in source and binary forms, with or without modification,
+ // are permitted provided that the following conditions are met:
+ //
+ // * Redistribution's of source code must retain the above copyright notice,
+ // this list of conditions and the following disclaimer.
+ //
+ // * Redistribution's in binary form must reproduce the above copyright notice,
+ // this list of conditions and the following disclaimer in the documentation
+ // and/or other materials provided with the distribution.
+ //
+ // * The name of the copyright holders may not be used to endorse or promote products
+ // derived from this software without specific prior written permission.
+ //
+ // This software is provided by the copyright holders and contributors "as is" and
+ // any express or implied warranties, including, but not limited to, the implied
+ // warranties of merchantability and fitness for a particular purpose are disclaimed.
+ // In no event shall the Intel Corporation or contributors be liable for any direct,
+ // indirect, incidental, special, exemplary, or consequential damages
+ // (including, but not limited to, procurement of substitute goods or services;
+ // loss of use, data, or profits; or business interruption) however caused
+ // and on any theory of liability, whether in contract, strict liability,
+ // or tort (including negligence or otherwise) arising in any way out of
+ // the use of this software, even if advised of the possibility of such damage.
+ //
+ //M*/
+
+ #include "precomp.hpp"
+
+ #include <vector>
+
+ #include "opencv2/core/hal/intrin.hpp"
+ #include "opencl_kernels_imgproc.hpp"
+
+ #include "opencv2/core/openvx/ovx_defs.hpp"
+
+ namespace cv
+ {
+
+ /****************************************************************************************\
+ Box Filter
+ \****************************************************************************************/
+
+ template<typename T, typename ST>
+ struct RowSum :
+ public BaseRowFilter
+ {
+ RowSum( int _ksize, int _anchor ) :
+ BaseRowFilter()
+ {
+ ksize = _ksize;
+ anchor = _anchor;
+ }
+
+ virtual void operator()(const uchar* src, uchar* dst, int width, int cn) CV_OVERRIDE
+ {
+ const T* S = (const T*)src;
+ ST* D = (ST*)dst;
+ int i = 0, k, ksz_cn = ksize*cn;
+
+ width = (width - 1)*cn;
+ if( ksize == 3 )
+ {
+ for( i = 0; i < width + cn; i++ )
+ {
+ D[i] = (ST)S[i] + (ST)S[i+cn] + (ST)S[i+cn*2];
+ }
+ }
+ else if( ksize == 5 )
+ {
+ for( i = 0; i < width + cn; i++ )
+ {
+ D[i] = (ST)S[i] + (ST)S[i+cn] + (ST)S[i+cn*2] + (ST)S[i + cn*3] + (ST)S[i + cn*4];
+ }
+ }
+ else if( cn == 1 )
+ {
+ ST s = 0;
+ for( i = 0; i < ksz_cn; i++ )
+ s += (ST)S[i];
+ D[0] = s;
+ for( i = 0; i < width; i++ )
+ {
+ s += (ST)S[i + ksz_cn] - (ST)S[i];
+ D[i+1] = s;
+ }
+ }
+ else if( cn == 3 )
+ {
+ ST s0 = 0, s1 = 0, s2 = 0;
+ for( i = 0; i < ksz_cn; i += 3 )
+ {
+ s0 += (ST)S[i];
+ s1 += (ST)S[i+1];
+ s2 += (ST)S[i+2];
+ }
+ D[0] = s0;
+ D[1] = s1;
+ D[2] = s2;
+ for( i = 0; i < width; i += 3 )
+ {
+ s0 += (ST)S[i + ksz_cn] - (ST)S[i];
+ s1 += (ST)S[i + ksz_cn + 1] - (ST)S[i + 1];
+ s2 += (ST)S[i + ksz_cn + 2] - (ST)S[i + 2];
+ D[i+3] = s0;
+ D[i+4] = s1;
+ D[i+5] = s2;
+ }
+ }
+ else if( cn == 4 )
+ {
+ ST s0 = 0, s1 = 0, s2 = 0, s3 = 0;
+ for( i = 0; i < ksz_cn; i += 4 )
+ {
+ s0 += (ST)S[i];
+ s1 += (ST)S[i+1];
+ s2 += (ST)S[i+2];
+ s3 += (ST)S[i+3];
+ }
+ D[0] = s0;
+ D[1] = s1;
+ D[2] = s2;
+ D[3] = s3;
+ for( i = 0; i < width; i += 4 )
+ {
+ s0 += (ST)S[i + ksz_cn] - (ST)S[i];
+ s1 += (ST)S[i + ksz_cn + 1] - (ST)S[i + 1];
+ s2 += (ST)S[i + ksz_cn + 2] - (ST)S[i + 2];
+ s3 += (ST)S[i + ksz_cn + 3] - (ST)S[i + 3];
+ D[i+4] = s0;
+ D[i+5] = s1;
+ D[i+6] = s2;
+ D[i+7] = s3;
+ }
+ }
+ else
+ for( k = 0; k < cn; k++, S++, D++ )
+ {
+ ST s = 0;
+ for( i = 0; i < ksz_cn; i += cn )
+ s += (ST)S[i];
+ D[0] = s;
+ for( i = 0; i < width; i += cn )
+ {
+ s += (ST)S[i + ksz_cn] - (ST)S[i];
+ D[i+cn] = s;
+ }
+ }
+ }
+ };
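+
+ // RowSum is the classic O(1)-per-pixel sliding window: the first output is
+ // seeded with the full kernel sum, then every step adds the sample entering
+ // the window and subtracts the one leaving it (s += S[i + ksz_cn] - S[i]),
+ // so the cost per pixel is independent of the kernel width. The ksize == 3
+ // and ksize == 5 paths just unroll the sum outright, which is cheaper than
+ // maintaining running state for such short kernels.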
+
+
+ template<typename ST, typename T>
+ struct ColumnSum :
+ public BaseColumnFilter
+ {
+ ColumnSum( int _ksize, int _anchor, double _scale ) :
+ BaseColumnFilter()
+ {
+ ksize = _ksize;
+ anchor = _anchor;
+ scale = _scale;
+ sumCount = 0;
+ }
+
+ virtual void reset() CV_OVERRIDE { sumCount = 0; }
+
+ virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE
+ {
+ int i;
+ ST* SUM;
+ bool haveScale = scale != 1;
+ double _scale = scale;
+
+ if( width != (int)sum.size() )
+ {
+ sum.resize(width);
+ sumCount = 0;
+ }
+
+ SUM = &sum[0];
+ if( sumCount == 0 )
+ {
+ memset((void*)SUM, 0, width*sizeof(ST));
+
+ for( ; sumCount < ksize - 1; sumCount++, src++ )
+ {
+ const ST* Sp = (const ST*)src[0];
+
+ for( i = 0; i < width; i++ )
+ SUM[i] += Sp[i];
+ }
+ }
+ else
+ {
+ CV_Assert( sumCount == ksize-1 );
+ src += ksize-1;
+ }
+
+ for( ; count--; src++ )
+ {
+ const ST* Sp = (const ST*)src[0];
+ const ST* Sm = (const ST*)src[1-ksize];
+ T* D = (T*)dst;
+ if( haveScale )
+ {
+ for( i = 0; i <= width - 2; i += 2 )
+ {
+ ST s0 = SUM[i] + Sp[i], s1 = SUM[i+1] + Sp[i+1];
+ D[i] = saturate_cast<T>(s0*_scale);
+ D[i+1] = saturate_cast<T>(s1*_scale);
+ s0 -= Sm[i]; s1 -= Sm[i+1];
+ SUM[i] = s0; SUM[i+1] = s1;
+ }
+
+ for( ; i < width; i++ )
+ {
+ ST s0 = SUM[i] + Sp[i];
+ D[i] = saturate_cast<T>(s0*_scale);
+ SUM[i] = s0 - Sm[i];
+ }
+ }
+ else
+ {
+ for( i = 0; i <= width - 2; i += 2 )
+ {
+ ST s0 = SUM[i] + Sp[i], s1 = SUM[i+1] + Sp[i+1];
+ D[i] = saturate_cast<T>(s0);
+ D[i+1] = saturate_cast<T>(s1);
+ s0 -= Sm[i]; s1 -= Sm[i+1];
+ SUM[i] = s0; SUM[i+1] = s1;
+ }
+
+ for( ; i < width; i++ )
+ {
+ ST s0 = SUM[i] + Sp[i];
+ D[i] = saturate_cast<T>(s0);
+ SUM[i] = s0 - Sm[i];
+ }
+ }
+ dst += dststep;
+ }
+ }
+
+ double scale;
+ int sumCount;
+ std::vector<ST> sum;
+ };
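+
+ // ColumnSum is the vertical counterpart: it keeps one running sum per
+ // column in `sum`, seeded from the first ksize-1 source rows (sumCount
+ // tracks this so the state survives across calls on successive strips),
+ // then for each output row adds the incoming row Sp and subtracts the
+ // outgoing row Sm before scaling.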
+
+
+ template<>
+ struct ColumnSum<int, uchar> :
+ public BaseColumnFilter
+ {
+ ColumnSum( int _ksize, int _anchor, double _scale ) :
+ BaseColumnFilter()
+ {
+ ksize = _ksize;
+ anchor = _anchor;
+ scale = _scale;
+ sumCount = 0;
+ }
+
+ virtual void reset() CV_OVERRIDE { sumCount = 0; }
+
+ virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE
+ {
+ int* SUM;
+ bool haveScale = scale != 1;
+ double _scale = scale;
+
+ if( width != (int)sum.size() )
+ {
+ sum.resize(width);
+ sumCount = 0;
+ }
+
+ SUM = &sum[0];
+ if( sumCount == 0 )
+ {
+ memset((void*)SUM, 0, width*sizeof(int));
+ for( ; sumCount < ksize - 1; sumCount++, src++ )
+ {
+ const int* Sp = (const int*)src[0];
+ int i = 0;
+ #if CV_SIMD
+ for (; i <= width - v_int32::nlanes; i += v_int32::nlanes)
+ {
+ v_store(SUM + i, vx_load(SUM + i) + vx_load(Sp + i));
+ }
+ #if CV_SIMD_WIDTH > 16
+ for (; i <= width - v_int32x4::nlanes; i += v_int32x4::nlanes)
+ {
+ v_store(SUM + i, v_load(SUM + i) + v_load(Sp + i));
+ }
+ #endif
+ #endif
+ for( ; i < width; i++ )
+ SUM[i] += Sp[i];
+ }
+ }
+ else
+ {
+ CV_Assert( sumCount == ksize-1 );
+ src += ksize-1;
+ }
+
+ for( ; count--; src++ )
+ {
+ const int* Sp = (const int*)src[0];
+ const int* Sm = (const int*)src[1-ksize];
+ uchar* D = (uchar*)dst;
+ if( haveScale )
+ {
+ int i = 0;
+ #if CV_SIMD
+ v_float32 _v_scale = vx_setall_f32((float)_scale);
+ for( ; i <= width - v_uint16::nlanes; i += v_uint16::nlanes )
+ {
+ v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i);
+ v_int32 v_s01 = vx_load(SUM + i + v_int32::nlanes) + vx_load(Sp + i + v_int32::nlanes);
+
+ v_uint32 v_s0d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s0) * _v_scale));
+ v_uint32 v_s01d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s01) * _v_scale));
+
+ v_uint16 v_dst = v_pack(v_s0d, v_s01d);
+ v_pack_store(D + i, v_dst);
+
+ v_store(SUM + i, v_s0 - vx_load(Sm + i));
+ v_store(SUM + i + v_int32::nlanes, v_s01 - vx_load(Sm + i + v_int32::nlanes));
+ }
+ #if CV_SIMD_WIDTH > 16
+ v_float32x4 v_scale = v_setall_f32((float)_scale);
+ for( ; i <= width-v_uint16x8::nlanes; i+=v_uint16x8::nlanes )
+ {
+ v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i);
+ v_int32x4 v_s01 = v_load(SUM + i + v_int32x4::nlanes) + v_load(Sp + i + v_int32x4::nlanes);
+
+ v_uint32x4 v_s0d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s0) * v_scale));
+ v_uint32x4 v_s01d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s01) * v_scale));
+
+ v_uint16x8 v_dst = v_pack(v_s0d, v_s01d);
+ v_pack_store(D + i, v_dst);
+
+ v_store(SUM + i, v_s0 - v_load(Sm + i));
+ v_store(SUM + i + v_int32x4::nlanes, v_s01 - v_load(Sm + i + v_int32x4::nlanes));
+ }
+ #endif
+ #endif
+ for( ; i < width; i++ )
+ {
+ int s0 = SUM[i] + Sp[i];
+ D[i] = saturate_cast<uchar>(s0*_scale);
+ SUM[i] = s0 - Sm[i];
+ }
+ }
+ else
+ {
+ int i = 0;
+ #if CV_SIMD
+ for( ; i <= width-v_uint16::nlanes; i+=v_uint16::nlanes )
+ {
+ v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i);
+ v_int32 v_s01 = vx_load(SUM + i + v_int32::nlanes) + vx_load(Sp + i + v_int32::nlanes);
+
+ v_uint16 v_dst = v_pack(v_reinterpret_as_u32(v_s0), v_reinterpret_as_u32(v_s01));
+ v_pack_store(D + i, v_dst);
+
+ v_store(SUM + i, v_s0 - vx_load(Sm + i));
+ v_store(SUM + i + v_int32::nlanes, v_s01 - vx_load(Sm + i + v_int32::nlanes));
+ }
+ #if CV_SIMD_WIDTH > 16
+ for( ; i <= width-v_uint16x8::nlanes; i+=v_uint16x8::nlanes )
+ {
+ v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i);
+ v_int32x4 v_s01 = v_load(SUM + i + v_int32x4::nlanes) + v_load(Sp + i + v_int32x4::nlanes);
+
+ v_uint16x8 v_dst = v_pack(v_reinterpret_as_u32(v_s0), v_reinterpret_as_u32(v_s01));
+ v_pack_store(D + i, v_dst);
+
+ v_store(SUM + i, v_s0 - v_load(Sm + i));
+ v_store(SUM + i + v_int32x4::nlanes, v_s01 - v_load(Sm + i + v_int32x4::nlanes));
+ }
+ #endif
+ #endif
+ for( ; i < width; i++ )
+ {
+ int s0 = SUM[i] + Sp[i];
+ D[i] = saturate_cast<uchar>(s0);
+ SUM[i] = s0 - Sm[i];
+ }
+ }
+ dst += dststep;
+ }
+ #if CV_SIMD
+ vx_cleanup();
+ #endif
+ }
+
+ double scale;
+ int sumCount;
+ std::vector<int> sum;
+ };
+
+
+ template<>
+ struct ColumnSum<ushort, uchar> :
+ public BaseColumnFilter
+ {
+ enum { SHIFT = 23 };
+
+ ColumnSum( int _ksize, int _anchor, double _scale ) :
+ BaseColumnFilter()
+ {
+ ksize = _ksize;
+ anchor = _anchor;
+ scale = _scale;
+ sumCount = 0;
+ divDelta = 0;
+ divScale = 1;
+ if( scale != 1 )
+ {
+ int d = cvRound(1./scale);
+ double scalef = ((double)(1 << SHIFT))/d;
+ divScale = cvFloor(scalef);
+ scalef -= divScale;
+ divDelta = d/2;
+ if( scalef < 0.5 )
+ divDelta++;
+ else
+ divScale++;
+ }
+ }
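+
+ // The constructor turns the normalization 1/d (d = kernel area) into
+ // fixed-point arithmetic: D = (s + divDelta) * divScale >> SHIFT. For a
+ // 3x3 kernel, d = 9 and 2^23/9 = 932067.55..., so divScale becomes 932068
+ // and divDelta = 4; a window sum of 18 then gives
+ // (18 + 4) * 932068 >> 23 = 2, matching the rounded 18/9.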
+
+ virtual void reset() CV_OVERRIDE { sumCount = 0; }
+
+ virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE
+ {
+ const int ds = divScale;
+ const int dd = divDelta;
+ ushort* SUM;
+ const bool haveScale = scale != 1;
+
+ if( width != (int)sum.size() )
+ {
+ sum.resize(width);
+ sumCount = 0;
+ }
+
+ SUM = &sum[0];
+ if( sumCount == 0 )
+ {
+ memset((void*)SUM, 0, width*sizeof(SUM[0]));
+ for( ; sumCount < ksize - 1; sumCount++, src++ )
+ {
+ const ushort* Sp = (const ushort*)src[0];
+ int i = 0;
+ #if CV_SIMD
+ for( ; i <= width - v_uint16::nlanes; i += v_uint16::nlanes )
+ {
+ v_store(SUM + i, vx_load(SUM + i) + vx_load(Sp + i));
+ }
+ #if CV_SIMD_WIDTH > 16
+ for( ; i <= width - v_uint16x8::nlanes; i += v_uint16x8::nlanes )
+ {
+ v_store(SUM + i, v_load(SUM + i) + v_load(Sp + i));
+ }
+ #endif
+ #endif
+ for( ; i < width; i++ )
+ SUM[i] += Sp[i];
+ }
+ }
+ else
+ {
+ CV_Assert( sumCount == ksize-1 );
+ src += ksize-1;
+ }
+
+ for( ; count--; src++ )
+ {
+ const ushort* Sp = (const ushort*)src[0];
+ const ushort* Sm = (const ushort*)src[1-ksize];
+ uchar* D = (uchar*)dst;
+ if( haveScale )
+ {
+ int i = 0;
+ #if CV_SIMD
+ v_uint32 _ds4 = vx_setall_u32((unsigned)ds);
+ v_uint16 _dd8 = vx_setall_u16((ushort)dd);
+
+ for( ; i <= width-v_uint8::nlanes; i+=v_uint8::nlanes )
+ {
+ v_uint16 _sm0 = vx_load(Sm + i);
+ v_uint16 _sm1 = vx_load(Sm + i + v_uint16::nlanes);
+
+ v_uint16 _s0 = v_add_wrap(vx_load(SUM + i), vx_load(Sp + i));
+ v_uint16 _s1 = v_add_wrap(vx_load(SUM + i + v_uint16::nlanes), vx_load(Sp + i + v_uint16::nlanes));
+
+ v_uint32 _s00, _s01, _s10, _s11;
+
+ v_expand(_s0 + _dd8, _s00, _s01);
+ v_expand(_s1 + _dd8, _s10, _s11);
+
+ _s00 = v_shr<SHIFT>(_s00*_ds4);
+ _s01 = v_shr<SHIFT>(_s01*_ds4);
+ _s10 = v_shr<SHIFT>(_s10*_ds4);
+ _s11 = v_shr<SHIFT>(_s11*_ds4);
+
+ v_int16 r0 = v_pack(v_reinterpret_as_s32(_s00), v_reinterpret_as_s32(_s01));
+ v_int16 r1 = v_pack(v_reinterpret_as_s32(_s10), v_reinterpret_as_s32(_s11));
+
+ _s0 = v_sub_wrap(_s0, _sm0);
+ _s1 = v_sub_wrap(_s1, _sm1);
+
+ v_store(D + i, v_pack_u(r0, r1));
+ v_store(SUM + i, _s0);
+ v_store(SUM + i + v_uint16::nlanes, _s1);
+ }
+ #if CV_SIMD_WIDTH > 16
+ v_uint32x4 ds4 = v_setall_u32((unsigned)ds);
+ v_uint16x8 dd8 = v_setall_u16((ushort)dd);
+
+ for( ; i <= width-v_uint8x16::nlanes; i+=v_uint8x16::nlanes )
+ {
+ v_uint16x8 _sm0 = v_load(Sm + i);
+ v_uint16x8 _sm1 = v_load(Sm + i + v_uint16x8::nlanes);
+
+ v_uint16x8 _s0 = v_add_wrap(v_load(SUM + i), v_load(Sp + i));
+ v_uint16x8 _s1 = v_add_wrap(v_load(SUM + i + v_uint16x8::nlanes), v_load(Sp + i + v_uint16x8::nlanes));
+
+ v_uint32x4 _s00, _s01, _s10, _s11;
+
+ v_expand(_s0 + dd8, _s00, _s01);
+ v_expand(_s1 + dd8, _s10, _s11);
+
+ _s00 = v_shr<SHIFT>(_s00*ds4);
+ _s01 = v_shr<SHIFT>(_s01*ds4);
+ _s10 = v_shr<SHIFT>(_s10*ds4);
+ _s11 = v_shr<SHIFT>(_s11*ds4);
+
+ v_int16x8 r0 = v_pack(v_reinterpret_as_s32(_s00), v_reinterpret_as_s32(_s01));
+ v_int16x8 r1 = v_pack(v_reinterpret_as_s32(_s10), v_reinterpret_as_s32(_s11));
+
+ _s0 = v_sub_wrap(_s0, _sm0);
+ _s1 = v_sub_wrap(_s1, _sm1);
+
+ v_store(D + i, v_pack_u(r0, r1));
+ v_store(SUM + i, _s0);
+ v_store(SUM + i + v_uint16x8::nlanes, _s1);
+ }
+ #endif
+ #endif
+ for( ; i < width; i++ )
+ {
+ int s0 = SUM[i] + Sp[i];
+ D[i] = (uchar)((s0 + dd)*ds >> SHIFT);
+ SUM[i] = (ushort)(s0 - Sm[i]);
+ }
+ }
+ else
+ {
+ int i = 0;
+ for( ; i < width; i++ )
+ {
+ int s0 = SUM[i] + Sp[i];
+ D[i] = saturate_cast<uchar>(s0);
+ SUM[i] = (ushort)(s0 - Sm[i]);
+ }
+ }
+ dst += dststep;
+ }
+ #if CV_SIMD
+ vx_cleanup();
+ #endif
+ }
+
+ double scale;
+ int sumCount;
+ int divDelta;
+ int divScale;
+ std::vector<ushort> sum;
+ };
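+
+ // This specialization stores the column sums in 16 bits and uses
+ // v_add_wrap/v_sub_wrap (plain modulo-2^16 adds instead of the default
+ // saturating u16 ops). That is safe because this path is only selected
+ // (see createBoxFilter below) when ksize.area() <= 256, so a window sum of
+ // 8-bit pixels never exceeds 255*256 = 65280 and cannot actually wrap.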
+
+
+ template<>
+ struct ColumnSum<int, short> :
+ public BaseColumnFilter
+ {
+ ColumnSum( int _ksize, int _anchor, double _scale ) :
+ BaseColumnFilter()
+ {
+ ksize = _ksize;
+ anchor = _anchor;
+ scale = _scale;
+ sumCount = 0;
+ }
+
+ virtual void reset() CV_OVERRIDE { sumCount = 0; }
+
+ virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE
+ {
+ int i;
+ int* SUM;
+ bool haveScale = scale != 1;
+ double _scale = scale;
+
+ if( width != (int)sum.size() )
+ {
+ sum.resize(width);
+ sumCount = 0;
+ }
+
+ SUM = &sum[0];
+ if( sumCount == 0 )
+ {
+ memset((void*)SUM, 0, width*sizeof(int));
+ for( ; sumCount < ksize - 1; sumCount++, src++ )
+ {
+ const int* Sp = (const int*)src[0];
+ i = 0;
+ #if CV_SIMD
+ for( ; i <= width - v_int32::nlanes; i+=v_int32::nlanes )
+ {
+ v_store(SUM + i, vx_load(SUM + i) + vx_load(Sp + i));
+ }
+ #if CV_SIMD_WIDTH > 16
+ for( ; i <= width - v_int32x4::nlanes; i+=v_int32x4::nlanes )
+ {
+ v_store(SUM + i, v_load(SUM + i) + v_load(Sp + i));
+ }
+ #endif
+ #endif
+ for( ; i < width; i++ )
+ SUM[i] += Sp[i];
+ }
+ }
+ else
+ {
+ CV_Assert( sumCount == ksize-1 );
+ src += ksize-1;
+ }
+
+ for( ; count--; src++ )
+ {
+ const int* Sp = (const int*)src[0];
+ const int* Sm = (const int*)src[1-ksize];
+ short* D = (short*)dst;
+ if( haveScale )
+ {
+ i = 0;
+ #if CV_SIMD
+ v_float32 _v_scale = vx_setall_f32((float)_scale);
+ for( ; i <= width-v_int16::nlanes; i+=v_int16::nlanes )
+ {
+ v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i);
+ v_int32 v_s01 = vx_load(SUM + i + v_int32::nlanes) + vx_load(Sp + i + v_int32::nlanes);
+
+ v_int32 v_s0d = v_round(v_cvt_f32(v_s0) * _v_scale);
+ v_int32 v_s01d = v_round(v_cvt_f32(v_s01) * _v_scale);
+ v_store(D + i, v_pack(v_s0d, v_s01d));
+
+ v_store(SUM + i, v_s0 - vx_load(Sm + i));
+ v_store(SUM + i + v_int32::nlanes, v_s01 - vx_load(Sm + i + v_int32::nlanes));
+ }
+ #if CV_SIMD_WIDTH > 16
+ v_float32x4 v_scale = v_setall_f32((float)_scale);
+ for( ; i <= width-v_int16x8::nlanes; i+=v_int16x8::nlanes )
+ {
+ v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i);
+ v_int32x4 v_s01 = v_load(SUM + i + v_int32x4::nlanes) + v_load(Sp + i + v_int32x4::nlanes);
+
+ v_int32x4 v_s0d = v_round(v_cvt_f32(v_s0) * v_scale);
+ v_int32x4 v_s01d = v_round(v_cvt_f32(v_s01) * v_scale);
+ v_store(D + i, v_pack(v_s0d, v_s01d));
+
+ v_store(SUM + i, v_s0 - v_load(Sm + i));
+ v_store(SUM + i + v_int32x4::nlanes, v_s01 - v_load(Sm + i + v_int32x4::nlanes));
+ }
+ #endif
+ #endif
+ for( ; i < width; i++ )
+ {
+ int s0 = SUM[i] + Sp[i];
+ D[i] = saturate_cast<short>(s0*_scale);
+ SUM[i] = s0 - Sm[i];
+ }
+ }
+ else
+ {
+ i = 0;
+ #if CV_SIMD
+ for( ; i <= width-v_int16::nlanes; i+=v_int16::nlanes )
+ {
+ v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i);
+ v_int32 v_s01 = vx_load(SUM + i + v_int32::nlanes) + vx_load(Sp + i + v_int32::nlanes);
+
+ v_store(D + i, v_pack(v_s0, v_s01));
+
+ v_store(SUM + i, v_s0 - vx_load(Sm + i));
+ v_store(SUM + i + v_int32::nlanes, v_s01 - vx_load(Sm + i + v_int32::nlanes));
+ }
+ #if CV_SIMD_WIDTH > 16
+ for( ; i <= width-v_int16x8::nlanes; i+=v_int16x8::nlanes )
+ {
+ v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i);
+ v_int32x4 v_s01 = v_load(SUM + i + v_int32x4::nlanes) + v_load(Sp + i + v_int32x4::nlanes);
+
+ v_store(D + i, v_pack(v_s0, v_s01));
+
+ v_store(SUM + i, v_s0 - v_load(Sm + i));
+ v_store(SUM + i + v_int32x4::nlanes, v_s01 - v_load(Sm + i + v_int32x4::nlanes));
+ }
+ #endif
+ #endif
+
+ for( ; i < width; i++ )
+ {
+ int s0 = SUM[i] + Sp[i];
+ D[i] = saturate_cast<short>(s0);
+ SUM[i] = s0 - Sm[i];
+ }
+ }
+ dst += dststep;
+ }
+ #if CV_SIMD
+ vx_cleanup();
+ #endif
+ }
+
+ double scale;
+ int sumCount;
+ std::vector<int> sum;
+ };
+
+
+ template<>
+ struct ColumnSum<int, ushort> :
+ public BaseColumnFilter
+ {
+ ColumnSum( int _ksize, int _anchor, double _scale ) :
+ BaseColumnFilter()
+ {
+ ksize = _ksize;
+ anchor = _anchor;
+ scale = _scale;
+ sumCount = 0;
+ }
+
+ virtual void reset() CV_OVERRIDE { sumCount = 0; }
+
+ virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE
+ {
+ int* SUM;
+ bool haveScale = scale != 1;
+ double _scale = scale;
+
+ if( width != (int)sum.size() )
+ {
+ sum.resize(width);
+ sumCount = 0;
+ }
+
+ SUM = &sum[0];
+ if( sumCount == 0 )
+ {
+ memset((void*)SUM, 0, width*sizeof(int));
+ for( ; sumCount < ksize - 1; sumCount++, src++ )
+ {
+ const int* Sp = (const int*)src[0];
+ int i = 0;
+ #if CV_SIMD
+ for (; i <= width - v_int32::nlanes; i += v_int32::nlanes)
+ {
+ v_store(SUM + i, vx_load(SUM + i) + vx_load(Sp + i));
+ }
+ #if CV_SIMD_WIDTH > 16
+ for (; i <= width - v_int32x4::nlanes; i += v_int32x4::nlanes)
+ {
+ v_store(SUM + i, v_load(SUM + i) + v_load(Sp + i));
+ }
+ #endif
+ #endif
+ for( ; i < width; i++ )
+ SUM[i] += Sp[i];
+ }
+ }
+ else
+ {
+ CV_Assert( sumCount == ksize-1 );
+ src += ksize-1;
+ }
+
+ for( ; count--; src++ )
+ {
+ const int* Sp = (const int*)src[0];
+ const int* Sm = (const int*)src[1-ksize];
+ ushort* D = (ushort*)dst;
+ if( haveScale )
+ {
+ int i = 0;
+ #if CV_SIMD
+ v_float32 _v_scale = vx_setall_f32((float)_scale);
+ for( ; i <= width-v_uint16::nlanes; i+=v_uint16::nlanes )
+ {
+ v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i);
+ v_int32 v_s01 = vx_load(SUM + i + v_int32::nlanes) + vx_load(Sp + i + v_int32::nlanes);
+
+ v_uint32 v_s0d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s0) * _v_scale));
+ v_uint32 v_s01d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s01) * _v_scale));
+ v_store(D + i, v_pack(v_s0d, v_s01d));
+
+ v_store(SUM + i, v_s0 - vx_load(Sm + i));
+ v_store(SUM + i + v_int32::nlanes, v_s01 - vx_load(Sm + i + v_int32::nlanes));
+ }
+ #if CV_SIMD_WIDTH > 16
+ v_float32x4 v_scale = v_setall_f32((float)_scale);
+ for( ; i <= width-v_uint16x8::nlanes; i+=v_uint16x8::nlanes )
+ {
+ v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i);
+ v_int32x4 v_s01 = v_load(SUM + i + v_int32x4::nlanes) + v_load(Sp + i + v_int32x4::nlanes);
+
+ v_uint32x4 v_s0d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s0) * v_scale));
+ v_uint32x4 v_s01d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s01) * v_scale));
+ v_store(D + i, v_pack(v_s0d, v_s01d));
+
+ v_store(SUM + i, v_s0 - v_load(Sm + i));
+ v_store(SUM + i + v_int32x4::nlanes, v_s01 - v_load(Sm + i + v_int32x4::nlanes));
+ }
+ #endif
+ #endif
+ for( ; i < width; i++ )
+ {
+ int s0 = SUM[i] + Sp[i];
+ D[i] = saturate_cast<ushort>(s0*_scale);
+ SUM[i] = s0 - Sm[i];
+ }
+ }
+ else
+ {
+ int i = 0;
+ #if CV_SIMD
+ for( ; i <= width-v_uint16::nlanes; i+=v_uint16::nlanes )
+ {
+ v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i);
+ v_int32 v_s01 = vx_load(SUM + i + v_int32::nlanes) + vx_load(Sp + i + v_int32::nlanes);
+
+ v_store(D + i, v_pack(v_reinterpret_as_u32(v_s0), v_reinterpret_as_u32(v_s01)));
+
+ v_store(SUM + i, v_s0 - vx_load(Sm + i));
+ v_store(SUM + i + v_int32::nlanes, v_s01 - vx_load(Sm + i + v_int32::nlanes));
+ }
+ #if CV_SIMD_WIDTH > 16
+ for( ; i <= width-v_uint16x8::nlanes; i+=v_uint16x8::nlanes )
+ {
+ v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i);
+ v_int32x4 v_s01 = v_load(SUM + i + v_int32x4::nlanes) + v_load(Sp + i + v_int32x4::nlanes);
+
+ v_store(D + i, v_pack(v_reinterpret_as_u32(v_s0), v_reinterpret_as_u32(v_s01)));
+
+ v_store(SUM + i, v_s0 - v_load(Sm + i));
+ v_store(SUM + i + v_int32x4::nlanes, v_s01 - v_load(Sm + i + v_int32x4::nlanes));
+ }
+ #endif
+ #endif
+ for( ; i < width; i++ )
+ {
+ int s0 = SUM[i] + Sp[i];
+ D[i] = saturate_cast<ushort>(s0);
+ SUM[i] = s0 - Sm[i];
+ }
+ }
+ dst += dststep;
+ }
+ #if CV_SIMD
+ vx_cleanup();
+ #endif
+ }
+
+ double scale;
+ int sumCount;
+ std::vector<int> sum;
+ };
+
+ template<>
+ struct ColumnSum<int, int> :
+ public BaseColumnFilter
+ {
+ ColumnSum( int _ksize, int _anchor, double _scale ) :
+ BaseColumnFilter()
+ {
+ ksize = _ksize;
+ anchor = _anchor;
+ scale = _scale;
+ sumCount = 0;
+ }
+
+ virtual void reset() CV_OVERRIDE { sumCount = 0; }
+
+ virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE
+ {
+ int* SUM;
+ bool haveScale = scale != 1;
+ double _scale = scale;
+
+ if( width != (int)sum.size() )
+ {
+ sum.resize(width);
+ sumCount = 0;
+ }
+
+ SUM = &sum[0];
+ if( sumCount == 0 )
+ {
+ memset((void*)SUM, 0, width*sizeof(int));
+ for( ; sumCount < ksize - 1; sumCount++, src++ )
+ {
+ const int* Sp = (const int*)src[0];
+ int i = 0;
+ #if CV_SIMD
+ for( ; i <= width - v_int32::nlanes; i+=v_int32::nlanes )
+ {
+ v_store(SUM + i, vx_load(SUM + i) + vx_load(Sp + i));
+ }
+ #if CV_SIMD_WIDTH > 16
+ for( ; i <= width - v_int32x4::nlanes; i+=v_int32x4::nlanes )
+ {
+ v_store(SUM + i, v_load(SUM + i) + v_load(Sp + i));
+ }
+ #endif
+ #endif
+ for( ; i < width; i++ )
+ SUM[i] += Sp[i];
+ }
+ }
+ else
+ {
+ CV_Assert( sumCount == ksize-1 );
+ src += ksize-1;
+ }
+
+ for( ; count--; src++ )
+ {
+ const int* Sp = (const int*)src[0];
+ const int* Sm = (const int*)src[1-ksize];
+ int* D = (int*)dst;
+ if( haveScale )
+ {
+ int i = 0;
+ #if CV_SIMD
+ v_float32 _v_scale = vx_setall_f32((float)_scale);
+ for( ; i <= width-v_int32::nlanes; i+=v_int32::nlanes )
+ {
+ v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i);
+ v_int32 v_s0d = v_round(v_cvt_f32(v_s0) * _v_scale);
+
+ v_store(D + i, v_s0d);
+ v_store(SUM + i, v_s0 - vx_load(Sm + i));
+ }
+ #if CV_SIMD_WIDTH > 16
+ v_float32x4 v_scale = v_setall_f32((float)_scale);
+ for( ; i <= width-v_int32x4::nlanes; i+=v_int32x4::nlanes )
+ {
+ v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i);
+ v_int32x4 v_s0d = v_round(v_cvt_f32(v_s0) * v_scale);
+
+ v_store(D + i, v_s0d);
+ v_store(SUM + i, v_s0 - v_load(Sm + i));
+ }
+ #endif
+ #endif
+ for( ; i < width; i++ )
+ {
+ int s0 = SUM[i] + Sp[i];
+ D[i] = saturate_cast<int>(s0*_scale);
+ SUM[i] = s0 - Sm[i];
+ }
+ }
+ else
+ {
+ int i = 0;
+ #if CV_SIMD
+ for( ; i <= width-v_int32::nlanes; i+=v_int32::nlanes )
+ {
+ v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i);
+
+ v_store(D + i, v_s0);
+ v_store(SUM + i, v_s0 - vx_load(Sm + i));
+ }
+ #if CV_SIMD_WIDTH > 16
+ for( ; i <= width-v_int32x4::nlanes; i+=v_int32x4::nlanes )
+ {
+ v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i);
+
+ v_store(D + i, v_s0);
+ v_store(SUM + i, v_s0 - v_load(Sm + i));
+ }
+ #endif
+ #endif
+ for( ; i < width; i++ )
+ {
+ int s0 = SUM[i] + Sp[i];
+ D[i] = s0;
+ SUM[i] = s0 - Sm[i];
+ }
+ }
+ dst += dststep;
+ }
+ #if CV_SIMD
+ vx_cleanup();
+ #endif
+ }
+
+ double scale;
+ int sumCount;
+ std::vector<int> sum;
+ };
+
+
+ template<>
+ struct ColumnSum<int, float> :
+ public BaseColumnFilter
+ {
+ ColumnSum( int _ksize, int _anchor, double _scale ) :
+ BaseColumnFilter()
+ {
+ ksize = _ksize;
+ anchor = _anchor;
+ scale = _scale;
+ sumCount = 0;
+ }
+
+ virtual void reset() CV_OVERRIDE { sumCount = 0; }
+
+ virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE
+ {
+ int* SUM;
+ bool haveScale = scale != 1;
+ double _scale = scale;
+
+ if( width != (int)sum.size() )
+ {
+ sum.resize(width);
+ sumCount = 0;
+ }
+
+ SUM = &sum[0];
+ if( sumCount == 0 )
+ {
+ memset((void*)SUM, 0, width*sizeof(int));
+ for( ; sumCount < ksize - 1; sumCount++, src++ )
+ {
+ const int* Sp = (const int*)src[0];
+ int i = 0;
+ #if CV_SIMD
+ for( ; i <= width - v_int32::nlanes; i+=v_int32::nlanes )
+ {
+ v_store(SUM + i, vx_load(SUM + i) + vx_load(Sp + i));
+ }
+ #if CV_SIMD_WIDTH > 16
+ for( ; i <= width - v_int32x4::nlanes; i+=v_int32x4::nlanes )
+ {
+ v_store(SUM + i, v_load(SUM + i) + v_load(Sp + i));
+ }
+ #endif
+ #endif
+
+ for( ; i < width; i++ )
+ SUM[i] += Sp[i];
+ }
+ }
+ else
+ {
+ CV_Assert( sumCount == ksize-1 );
+ src += ksize-1;
+ }
+
+ for( ; count--; src++ )
+ {
+ const int * Sp = (const int*)src[0];
+ const int * Sm = (const int*)src[1-ksize];
+ float* D = (float*)dst;
+ if( haveScale )
+ {
+ int i = 0;
+
+ #if CV_SIMD
+ v_float32 _v_scale = vx_setall_f32((float)_scale);
+ for (; i <= width - v_int32::nlanes; i += v_int32::nlanes)
+ {
+ v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i);
+ v_store(D + i, v_cvt_f32(v_s0) * _v_scale);
+ v_store(SUM + i, v_s0 - vx_load(Sm + i));
+ }
+ #if CV_SIMD_WIDTH > 16
+ v_float32x4 v_scale = v_setall_f32((float)_scale);
+ for (; i <= width - v_int32x4::nlanes; i += v_int32x4::nlanes)
+ {
+ v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i);
+ v_store(D + i, v_cvt_f32(v_s0) * v_scale);
+ v_store(SUM + i, v_s0 - v_load(Sm + i));
+ }
+ #endif
+ #endif
+ for( ; i < width; i++ )
+ {
+ int s0 = SUM[i] + Sp[i];
+ D[i] = (float)(s0*_scale);
+ SUM[i] = s0 - Sm[i];
+ }
+ }
+ else
+ {
+ int i = 0;
+
+ #if CV_SIMD
+ for( ; i <= width-v_int32::nlanes; i+=v_int32::nlanes )
+ {
+ v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i);
+ v_store(D + i, v_cvt_f32(v_s0));
+ v_store(SUM + i, v_s0 - vx_load(Sm + i));
+ }
+ #if CV_SIMD_WIDTH > 16
+ for( ; i <= width-v_int32x4::nlanes; i+=v_int32x4::nlanes )
+ {
+ v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i);
+ v_store(D + i, v_cvt_f32(v_s0));
+ v_store(SUM + i, v_s0 - v_load(Sm + i));
+ }
+ #endif
+ #endif
+ for( ; i < width; i++ )
+ {
+ int s0 = SUM[i] + Sp[i];
+ D[i] = (float)(s0);
+ SUM[i] = s0 - Sm[i];
+ }
+ }
+ dst += dststep;
+ }
+ #if CV_SIMD
+ vx_cleanup();
+ #endif
+ }
+
+ double scale;
+ int sumCount;
+ std::vector<int> sum;
+ };
+
+ #ifdef HAVE_OPENCL
+
+ static bool ocl_boxFilter3x3_8UC1( InputArray _src, OutputArray _dst, int ddepth,
+ Size ksize, Point anchor, int borderType, bool normalize )
+ {
+ const ocl::Device & dev = ocl::Device::getDefault();
+ int type = _src.type(), sdepth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
+
+ if (ddepth < 0)
+ ddepth = sdepth;
+
+ if (anchor.x < 0)
+ anchor.x = ksize.width / 2;
+ if (anchor.y < 0)
+ anchor.y = ksize.height / 2;
+
+ if ( !(dev.isIntel() && (type == CV_8UC1) &&
+ (_src.offset() == 0) && (_src.step() % 4 == 0) &&
+ (_src.cols() % 16 == 0) && (_src.rows() % 2 == 0) &&
+ (anchor.x == 1) && (anchor.y == 1) &&
+ (ksize.width == 3) && (ksize.height == 3)) )
+ return false;
+
+ float alpha = 1.0f / (ksize.height * ksize.width);
+ Size size = _src.size();
+ size_t globalsize[2] = { 0, 0 };
+ size_t localsize[2] = { 0, 0 };
+ const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", 0, "BORDER_REFLECT_101" };
+
+ globalsize[0] = size.width / 16;
+ globalsize[1] = size.height / 2;
+
+ char build_opts[1024];
+ sprintf(build_opts, "-D %s %s", borderMap[borderType], normalize ? "-D NORMALIZE" : "");
+
+ ocl::Kernel kernel("boxFilter3x3_8UC1_cols16_rows2", cv::ocl::imgproc::boxFilter3x3_oclsrc, build_opts);
+ if (kernel.empty())
+ return false;
+
+ UMat src = _src.getUMat();
+ _dst.create(size, CV_MAKETYPE(ddepth, cn));
+ if (!(_dst.offset() == 0 && _dst.step() % 4 == 0))
+ return false;
+ UMat dst = _dst.getUMat();
+
+ int idxArg = kernel.set(0, ocl::KernelArg::PtrReadOnly(src));
+ idxArg = kernel.set(idxArg, (int)src.step);
+ idxArg = kernel.set(idxArg, ocl::KernelArg::PtrWriteOnly(dst));
+ idxArg = kernel.set(idxArg, (int)dst.step);
+ idxArg = kernel.set(idxArg, (int)dst.rows);
+ idxArg = kernel.set(idxArg, (int)dst.cols);
+ if (normalize)
+ idxArg = kernel.set(idxArg, (float)alpha);
+
+ return kernel.run(2, globalsize, (localsize[0] == 0) ? NULL : localsize, false);
+ }
+
+ static bool ocl_boxFilter( InputArray _src, OutputArray _dst, int ddepth,
+ Size ksize, Point anchor, int borderType, bool normalize, bool sqr = false )
+ {
+ const ocl::Device & dev = ocl::Device::getDefault();
+ int type = _src.type(), sdepth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), esz = CV_ELEM_SIZE(type);
+ bool doubleSupport = dev.doubleFPConfig() > 0;
+
+ if (ddepth < 0)
+ ddepth = sdepth;
+
+ if (cn > 4 || (!doubleSupport && (sdepth == CV_64F || ddepth == CV_64F)) ||
+ _src.offset() % esz != 0 || _src.step() % esz != 0)
+ return false;
+
+ if (anchor.x < 0)
+ anchor.x = ksize.width / 2;
+ if (anchor.y < 0)
+ anchor.y = ksize.height / 2;
+
+ int computeUnits = ocl::Device::getDefault().maxComputeUnits();
+ float alpha = 1.0f / (ksize.height * ksize.width);
+ Size size = _src.size(), wholeSize;
+ bool isolated = (borderType & BORDER_ISOLATED) != 0;
+ borderType &= ~BORDER_ISOLATED;
+ int wdepth = std::max(CV_32F, std::max(ddepth, sdepth)),
+ wtype = CV_MAKE_TYPE(wdepth, cn), dtype = CV_MAKE_TYPE(ddepth, cn);
+
+ const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", 0, "BORDER_REFLECT_101" };
+ size_t globalsize[2] = { (size_t)size.width, (size_t)size.height };
+ size_t localsize_general[2] = { 0, 1 }, * localsize = NULL;
+
+ UMat src = _src.getUMat();
+ if (!isolated)
+ {
+ Point ofs;
+ src.locateROI(wholeSize, ofs);
+ }
+
+ int h = isolated ? size.height : wholeSize.height;
+ int w = isolated ? size.width : wholeSize.width;
+
+ size_t maxWorkItemSizes[32];
+ ocl::Device::getDefault().maxWorkItemSizes(maxWorkItemSizes);
+ int tryWorkItems = (int)maxWorkItemSizes[0];
+
+ ocl::Kernel kernel;
+
+ if (dev.isIntel() && !(dev.type() & ocl::Device::TYPE_CPU) &&
+ ((ksize.width < 5 && ksize.height < 5 && esz <= 4) ||
+ (ksize.width == 5 && ksize.height == 5 && cn == 1)))
+ {
+ if (w < ksize.width || h < ksize.height)
+ return false;
+
+ // Figure out what vector size to use for loading the pixels.
+ int pxLoadNumPixels = cn != 1 || size.width % 4 ? 1 : 4;
+ int pxLoadVecSize = cn * pxLoadNumPixels;
+
+ // Figure out how many pixels per work item to compute in X and Y
+ // directions. Too many and we run out of registers.
+ int pxPerWorkItemX = 1, pxPerWorkItemY = 1;
+ if (cn <= 2 && ksize.width <= 4 && ksize.height <= 4)
+ {
+ pxPerWorkItemX = size.width % 8 ? size.width % 4 ? size.width % 2 ? 1 : 2 : 4 : 8;
+ pxPerWorkItemY = size.height % 2 ? 1 : 2;
+ }
+ else if (cn < 4 || (ksize.width <= 4 && ksize.height <= 4))
+ {
+ pxPerWorkItemX = size.width % 2 ? 1 : 2;
+ pxPerWorkItemY = size.height % 2 ? 1 : 2;
+ }
+ globalsize[0] = size.width / pxPerWorkItemX;
+ globalsize[1] = size.height / pxPerWorkItemY;
+
+ // Need some padding in the private array for pixels
+ int privDataWidth = roundUp(pxPerWorkItemX + ksize.width - 1, pxLoadNumPixels);
+
+ // Make the global size a nice round number so the runtime can pick
+ // from reasonable choices for the workgroup size
+ const int wgRound = 256;
+ globalsize[0] = roundUp(globalsize[0], wgRound);
+
+ char build_options[1024], cvt[2][40];
+ sprintf(build_options, "-D cn=%d "
+ "-D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d "
+ "-D PX_LOAD_VEC_SIZE=%d -D PX_LOAD_NUM_PX=%d "
+ "-D PX_PER_WI_X=%d -D PX_PER_WI_Y=%d -D PRIV_DATA_WIDTH=%d -D %s -D %s "
+ "-D PX_LOAD_X_ITERATIONS=%d -D PX_LOAD_Y_ITERATIONS=%d "
+ "-D srcT=%s -D srcT1=%s -D dstT=%s -D dstT1=%s -D WT=%s -D WT1=%s "
+ "-D convertToWT=%s -D convertToDstT=%s%s%s -D PX_LOAD_FLOAT_VEC_CONV=convert_%s -D OP_BOX_FILTER",
+ cn, anchor.x, anchor.y, ksize.width, ksize.height,
+ pxLoadVecSize, pxLoadNumPixels,
+ pxPerWorkItemX, pxPerWorkItemY, privDataWidth, borderMap[borderType],
+ isolated ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED",
+ privDataWidth / pxLoadNumPixels, pxPerWorkItemY + ksize.height - 1,
+ ocl::typeToStr(type), ocl::typeToStr(sdepth), ocl::typeToStr(dtype),
+ ocl::typeToStr(ddepth), ocl::typeToStr(wtype), ocl::typeToStr(wdepth),
+ ocl::convertTypeStr(sdepth, wdepth, cn, cvt[0]),
+ ocl::convertTypeStr(wdepth, ddepth, cn, cvt[1]),
+ normalize ? " -D NORMALIZE" : "", sqr ? " -D SQR" : "",
+ ocl::typeToStr(CV_MAKE_TYPE(wdepth, pxLoadVecSize)) //PX_LOAD_FLOAT_VEC_CONV
+ );
+
+
+ if (!kernel.create("filterSmall", cv::ocl::imgproc::filterSmall_oclsrc, build_options))
+ return false;
+ }
+ else
+ {
+ localsize = localsize_general;
+ for ( ; ; )
+ {
+ int BLOCK_SIZE_X = tryWorkItems, BLOCK_SIZE_Y = std::min(ksize.height * 10, size.height);
+
+ while (BLOCK_SIZE_X > 32 && BLOCK_SIZE_X >= ksize.width * 2 && BLOCK_SIZE_X > size.width * 2)
+ BLOCK_SIZE_X /= 2;
+ while (BLOCK_SIZE_Y < BLOCK_SIZE_X / 8 && BLOCK_SIZE_Y * computeUnits * 32 < size.height)
+ BLOCK_SIZE_Y *= 2;
+
+ if (ksize.width > BLOCK_SIZE_X || w < ksize.width || h < ksize.height)
+ return false;
+
+ char cvt[2][50];
+ String opts = format("-D LOCAL_SIZE_X=%d -D BLOCK_SIZE_Y=%d -D ST=%s -D DT=%s -D WT=%s -D convertToDT=%s -D convertToWT=%s"
+ " -D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d -D %s%s%s%s%s"
+ " -D ST1=%s -D DT1=%s -D cn=%d",
+ BLOCK_SIZE_X, BLOCK_SIZE_Y, ocl::typeToStr(type), ocl::typeToStr(CV_MAKE_TYPE(ddepth, cn)),
+ ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)),
+ ocl::convertTypeStr(wdepth, ddepth, cn, cvt[0]),
+ ocl::convertTypeStr(sdepth, wdepth, cn, cvt[1]),
+ anchor.x, anchor.y, ksize.width, ksize.height, borderMap[borderType],
+ isolated ? " -D BORDER_ISOLATED" : "", doubleSupport ? " -D DOUBLE_SUPPORT" : "",
+ normalize ? " -D NORMALIZE" : "", sqr ? " -D SQR" : "",
+ ocl::typeToStr(sdepth), ocl::typeToStr(ddepth), cn);
+
+ localsize[0] = BLOCK_SIZE_X;
+ globalsize[0] = divUp(size.width, BLOCK_SIZE_X - (ksize.width - 1)) * BLOCK_SIZE_X;
+ globalsize[1] = divUp(size.height, BLOCK_SIZE_Y);
+
+ kernel.create("boxFilter", cv::ocl::imgproc::boxFilter_oclsrc, opts);
+ if (kernel.empty())
+ return false;
+
+ size_t kernelWorkGroupSize = kernel.workGroupSize();
+ if (localsize[0] <= kernelWorkGroupSize)
+ break;
+ if (BLOCK_SIZE_X < (int)kernelWorkGroupSize)
+ return false;
+
+ tryWorkItems = (int)kernelWorkGroupSize;
+ }
+ }
+
+ _dst.create(size, CV_MAKETYPE(ddepth, cn));
+ UMat dst = _dst.getUMat();
+
+ int idxArg = kernel.set(0, ocl::KernelArg::PtrReadOnly(src));
+ idxArg = kernel.set(idxArg, (int)src.step);
+ int srcOffsetX = (int)((src.offset % src.step) / src.elemSize());
+ int srcOffsetY = (int)(src.offset / src.step);
+ int srcEndX = isolated ? srcOffsetX + size.width : wholeSize.width;
+ int srcEndY = isolated ? srcOffsetY + size.height : wholeSize.height;
+ idxArg = kernel.set(idxArg, srcOffsetX);
+ idxArg = kernel.set(idxArg, srcOffsetY);
+ idxArg = kernel.set(idxArg, srcEndX);
+ idxArg = kernel.set(idxArg, srcEndY);
+ idxArg = kernel.set(idxArg, ocl::KernelArg::WriteOnly(dst));
+ if (normalize)
+ idxArg = kernel.set(idxArg, (float)alpha);
+
+ return kernel.run(2, globalsize, localsize, false);
+ }
+
+ #endif
+
+ }
+
+
+ cv::Ptr<cv::BaseRowFilter> cv::getRowSumFilter(int srcType, int sumType, int ksize, int anchor)
+ {
+ int sdepth = CV_MAT_DEPTH(srcType), ddepth = CV_MAT_DEPTH(sumType);
+ CV_Assert( CV_MAT_CN(sumType) == CV_MAT_CN(srcType) );
+
+ if( anchor < 0 )
+ anchor = ksize/2;
+
+ if( sdepth == CV_8U && ddepth == CV_32S )
+ return makePtr<RowSum<uchar, int> >(ksize, anchor);
+ if( sdepth == CV_8U && ddepth == CV_16U )
+ return makePtr<RowSum<uchar, ushort> >(ksize, anchor);
+ if( sdepth == CV_8U && ddepth == CV_64F )
+ return makePtr<RowSum<uchar, double> >(ksize, anchor);
+ if( sdepth == CV_16U && ddepth == CV_32S )
+ return makePtr<RowSum<ushort, int> >(ksize, anchor);
+ if( sdepth == CV_16U && ddepth == CV_64F )
+ return makePtr<RowSum<ushort, double> >(ksize, anchor);
+ if( sdepth == CV_16S && ddepth == CV_32S )
+ return makePtr<RowSum<short, int> >(ksize, anchor);
+ if( sdepth == CV_32S && ddepth == CV_32S )
+ return makePtr<RowSum<int, int> >(ksize, anchor);
+ if( sdepth == CV_16S && ddepth == CV_64F )
+ return makePtr<RowSum<short, double> >(ksize, anchor);
+ if( sdepth == CV_32F && ddepth == CV_64F )
+ return makePtr<RowSum<float, double> >(ksize, anchor);
+ if( sdepth == CV_64F && ddepth == CV_64F )
+ return makePtr<RowSum<double, double> >(ksize, anchor);
+
+ CV_Error_( CV_StsNotImplemented,
+ ("Unsupported combination of source format (=%d), and buffer format (=%d)",
+ srcType, sumType));
+ }
+
+
+ cv::Ptr<cv::BaseColumnFilter> cv::getColumnSumFilter(int sumType, int dstType, int ksize,
+ int anchor, double scale)
+ {
+ int sdepth = CV_MAT_DEPTH(sumType), ddepth = CV_MAT_DEPTH(dstType);
+ CV_Assert( CV_MAT_CN(sumType) == CV_MAT_CN(dstType) );
+
+ if( anchor < 0 )
+ anchor = ksize/2;
+
+ if( ddepth == CV_8U && sdepth == CV_32S )
+ return makePtr<ColumnSum<int, uchar> >(ksize, anchor, scale);
+ if( ddepth == CV_8U && sdepth == CV_16U )
+ return makePtr<ColumnSum<ushort, uchar> >(ksize, anchor, scale);
+ if( ddepth == CV_8U && sdepth == CV_64F )
+ return makePtr<ColumnSum<double, uchar> >(ksize, anchor, scale);
+ if( ddepth == CV_16U && sdepth == CV_32S )
+ return makePtr<ColumnSum<int, ushort> >(ksize, anchor, scale);
+ if( ddepth == CV_16U && sdepth == CV_64F )
+ return makePtr<ColumnSum<double, ushort> >(ksize, anchor, scale);
+ if( ddepth == CV_16S && sdepth == CV_32S )
+ return makePtr<ColumnSum<int, short> >(ksize, anchor, scale);
+ if( ddepth == CV_16S && sdepth == CV_64F )
+ return makePtr<ColumnSum<double, short> >(ksize, anchor, scale);
+ if( ddepth == CV_32S && sdepth == CV_32S )
+ return makePtr<ColumnSum<int, int> >(ksize, anchor, scale);
+ if( ddepth == CV_32F && sdepth == CV_32S )
+ return makePtr<ColumnSum<int, float> >(ksize, anchor, scale);
+ if( ddepth == CV_32F && sdepth == CV_64F )
+ return makePtr<ColumnSum<double, float> >(ksize, anchor, scale);
+ if( ddepth == CV_64F && sdepth == CV_32S )
+ return makePtr<ColumnSum<int, double> >(ksize, anchor, scale);
+ if( ddepth == CV_64F && sdepth == CV_64F )
+ return makePtr<ColumnSum<double, double> >(ksize, anchor, scale);
+
+ CV_Error_( CV_StsNotImplemented,
+ ("Unsupported combination of sum format (=%d), and destination format (=%d)",
+ sumType, dstType));
+ }
+
+
+ cv::Ptr<cv::FilterEngine> cv::createBoxFilter( int srcType, int dstType, Size ksize,
+ Point anchor, bool normalize, int borderType )
+ {
+ int sdepth = CV_MAT_DEPTH(srcType);
+ int cn = CV_MAT_CN(srcType), sumType = CV_64F;
+ if( sdepth == CV_8U && CV_MAT_DEPTH(dstType) == CV_8U &&
+ ksize.width*ksize.height <= 256 )
+ sumType = CV_16U;
+ else if( sdepth <= CV_32S && (!normalize ||
+ ksize.width*ksize.height <= (sdepth == CV_8U ? (1<<23) :
+ sdepth == CV_16U ? (1 << 15) : (1 << 16))) )
+ sumType = CV_32S;
+ sumType = CV_MAKETYPE( sumType, cn );
+
+ Ptr<BaseRowFilter> rowFilter = getRowSumFilter(srcType, sumType, ksize.width, anchor.x );
+ Ptr<BaseColumnFilter> columnFilter = getColumnSumFilter(sumType,
+ dstType, ksize.height, anchor.y, normalize ? 1./(ksize.width*ksize.height) : 1);
+
+ return makePtr<FilterEngine>(Ptr<BaseFilter>(), rowFilter, columnFilter,
+ srcType, dstType, sumType, borderType );
+ }
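+
+ // The intermediate sum type is picked so the window sums cannot overflow:
+ // 16-bit sums for 8u -> 8u with kernel area <= 256 (255*256 = 65280 fits in
+ // ushort); 32-bit sums while the worst-case sum stays below 2^31 (for 8u,
+ // 255 * 2^23 = 2139095040 < INT_MAX); otherwise 64-bit floating point.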
+
+ #ifdef HAVE_OPENVX
+ namespace cv
+ {
+ namespace ovx {
+ template <> inline bool skipSmallImages<VX_KERNEL_BOX_3x3>(int w, int h) { return w*h < 640 * 480; }
+ }
+ static bool openvx_boxfilter(InputArray _src, OutputArray _dst, int ddepth,
+ Size ksize, Point anchor,
+ bool normalize, int borderType)
+ {
+ if (ddepth < 0)
+ ddepth = CV_8UC1;
+ if (_src.type() != CV_8UC1 || ddepth != CV_8U || !normalize ||
+ _src.cols() < 3 || _src.rows() < 3 ||
+ ksize.width != 3 || ksize.height != 3 ||
+ (anchor.x >= 0 && anchor.x != 1) ||
+ (anchor.y >= 0 && anchor.y != 1) ||
+ ovx::skipSmallImages<VX_KERNEL_BOX_3x3>(_src.cols(), _src.rows()))
+ return false;
+
+ Mat src = _src.getMat();
+
+ if ((borderType & BORDER_ISOLATED) == 0 && src.isSubmatrix())
+ return false; //Process isolated borders only
+ vx_enum border;
+ switch (borderType & ~BORDER_ISOLATED)
+ {
+ case BORDER_CONSTANT:
+ border = VX_BORDER_CONSTANT;
+ break;
+ case BORDER_REPLICATE:
+ border = VX_BORDER_REPLICATE;
+ break;
+ default:
+ return false;
+ }
+
+ _dst.create(src.size(), CV_8UC1);
+ Mat dst = _dst.getMat();
+
+ try
+ {
+ ivx::Context ctx = ovx::getOpenVXContext();
+
+ Mat a;
+ if (dst.data != src.data)
+ a = src;
+ else
+ src.copyTo(a);
+
+ ivx::Image
+ ia = ivx::Image::createFromHandle(ctx, VX_DF_IMAGE_U8,
+ ivx::Image::createAddressing(a.cols, a.rows, 1, (vx_int32)(a.step)), a.data),
+ ib = ivx::Image::createFromHandle(ctx, VX_DF_IMAGE_U8,
+ ivx::Image::createAddressing(dst.cols, dst.rows, 1, (vx_int32)(dst.step)), dst.data);
+
+ //ATTENTION: VX_CONTEXT_IMMEDIATE_BORDER attribute change could lead to strange issues in multi-threaded environments
+ //since the OpenVX standard says nothing about thread-safety for now
+ ivx::border_t prevBorder = ctx.immediateBorder();
+ ctx.setImmediateBorder(border, (vx_uint8)(0));
+ ivx::IVX_CHECK_STATUS(vxuBox3x3(ctx, ia, ib));
+ ctx.setImmediateBorder(prevBorder);
+ }
+ catch (const ivx::RuntimeError & e)
+ {
+ VX_DbgThrow(e.what());
+ }
+ catch (const ivx::WrapperError & e)
+ {
+ VX_DbgThrow(e.what());
+ }
+
+ return true;
+ }
+ }
+ #endif
+
+ #if defined(HAVE_IPP)
+ namespace cv
+ {
+ static bool ipp_boxfilter(Mat &src, Mat &dst, Size ksize, Point anchor, bool normalize, int borderType)
+ {
+ #ifdef HAVE_IPP_IW
+ CV_INSTRUMENT_REGION_IPP();
+
+ #if IPP_VERSION_X100 < 201801
+ // Problem with SSE42 optimization for 16s and some 8u modes
+ if(ipp::getIppTopFeatures() == ippCPUID_SSE42 && (((src.depth() == CV_16S || src.depth() == CV_16U) && (src.channels() == 3 || src.channels() == 4)) || (src.depth() == CV_8U && src.channels() == 3 && (ksize.width > 5 || ksize.height > 5))))
+ return false;
+
+ // Other optimizations have some degradations too
+ if((((src.depth() == CV_16S || src.depth() == CV_16U) && (src.channels() == 4)) || (src.depth() == CV_8U && src.channels() == 1 && (ksize.width > 5 || ksize.height > 5))))
+ return false;
+ #endif
+
+ if(!normalize)
+ return false;
+
+ if(!ippiCheckAnchor(anchor, ksize))
+ return false;
+
+ try
+ {
+ ::ipp::IwiImage iwSrc = ippiGetImage(src);
+ ::ipp::IwiImage iwDst = ippiGetImage(dst);
+ ::ipp::IwiSize iwKSize = ippiGetSize(ksize);
+ ::ipp::IwiBorderSize borderSize(iwKSize);
+ ::ipp::IwiBorderType ippBorder(ippiGetBorder(iwSrc, borderType, borderSize));
+ if(!ippBorder)
+ return false;
+
+ CV_INSTRUMENT_FUN_IPP(::ipp::iwiFilterBox, iwSrc, iwDst, iwKSize, ::ipp::IwDefault(), ippBorder);
+ }
+ catch (const ::ipp::IwException &)
+ {
+ return false;
+ }
+
+ return true;
+ #else
+ CV_UNUSED(src); CV_UNUSED(dst); CV_UNUSED(ksize); CV_UNUSED(anchor); CV_UNUSED(normalize); CV_UNUSED(borderType);
+ return false;
+ #endif
+ }
+ }
+ #endif
+
+
+ void cv::boxFilter( InputArray _src, OutputArray _dst, int ddepth,
+ Size ksize, Point anchor,
+ bool normalize, int borderType )
+ {
+ CV_INSTRUMENT_REGION();
+
+ CV_OCL_RUN(_dst.isUMat() &&
+ (borderType == BORDER_REPLICATE || borderType == BORDER_CONSTANT ||
+ borderType == BORDER_REFLECT || borderType == BORDER_REFLECT_101),
+ ocl_boxFilter3x3_8UC1(_src, _dst, ddepth, ksize, anchor, borderType, normalize))
+
+ CV_OCL_RUN(_dst.isUMat(), ocl_boxFilter(_src, _dst, ddepth, ksize, anchor, borderType, normalize))
+
+ Mat src = _src.getMat();
+ int stype = src.type(), sdepth = CV_MAT_DEPTH(stype), cn = CV_MAT_CN(stype);
+ if( ddepth < 0 )
+ ddepth = sdepth;
+ _dst.create( src.size(), CV_MAKETYPE(ddepth, cn) );
+ Mat dst = _dst.getMat();
+ if( borderType != BORDER_CONSTANT && normalize && (borderType & BORDER_ISOLATED) != 0 )
+ {
+ if( src.rows == 1 )
+ ksize.height = 1;
+ if( src.cols == 1 )
+ ksize.width = 1;
+ }
+
+ Point ofs;
+ Size wsz(src.cols, src.rows);
+ if(!(borderType&BORDER_ISOLATED))
+ src.locateROI( wsz, ofs );
+
+ CALL_HAL(boxFilter, cv_hal_boxFilter, src.ptr(), src.step, dst.ptr(), dst.step, src.cols, src.rows, sdepth, ddepth, cn,
+ ofs.x, ofs.y, wsz.width - src.cols - ofs.x, wsz.height - src.rows - ofs.y, ksize.width, ksize.height,
+ anchor.x, anchor.y, normalize, borderType&~BORDER_ISOLATED);
+
+ CV_OVX_RUN(true,
+ openvx_boxfilter(src, dst, ddepth, ksize, anchor, normalize, borderType))
+
+ CV_IPP_RUN_FAST(ipp_boxfilter(src, dst, ksize, anchor, normalize, borderType));
+
+ borderType = (borderType&~BORDER_ISOLATED);
+
+ Ptr<FilterEngine> f = createBoxFilter( src.type(), dst.type(),
+ ksize, anchor, normalize, borderType );
+
+ f->apply( src, dst, wsz, ofs );
+ }
+
+
+ void cv::blur( InputArray src, OutputArray dst,
+ Size ksize, Point anchor, int borderType )
+ {
+ CV_INSTRUMENT_REGION();
+
+ boxFilter( src, dst, -1, ksize, anchor, true, borderType );
+ }
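+
+ // Usage sketch (illustrative only, not part of this file): blur() is just
+ // boxFilter() with ddepth = -1 and normalize = true, so these two calls
+ // produce the same result:
+ //
+ // cv::Mat src = cv::imread("input.png"), dst1, dst2;
+ // cv::blur(src, dst1, cv::Size(5, 5));
+ // cv::boxFilter(src, dst2, -1, cv::Size(5, 5), cv::Point(-1, -1),
+ // true, cv::BORDER_DEFAULT);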
+
+
+ /****************************************************************************************\
+ Squared Box Filter
+ \****************************************************************************************/
+
+ namespace cv
+ {
+
+ template<typename T, typename ST>
+ struct SqrRowSum :
+ public BaseRowFilter
+ {
+ SqrRowSum( int _ksize, int _anchor ) :
+ BaseRowFilter()
+ {
+ ksize = _ksize;
+ anchor = _anchor;
+ }
+
+ virtual void operator()(const uchar* src, uchar* dst, int width, int cn) CV_OVERRIDE
+ {
+ const T* S = (const T*)src;
+ ST* D = (ST*)dst;
+ int i = 0, k, ksz_cn = ksize*cn;
+
+ width = (width - 1)*cn;
+ for( k = 0; k < cn; k++, S++, D++ )
+ {
+ ST s = 0;
+ for( i = 0; i < ksz_cn; i += cn )
+ {
+ ST val = (ST)S[i];
+ s += val*val;
+ }
+ D[0] = s;
+ for( i = 0; i < width; i += cn )
+ {
+ ST val0 = (ST)S[i], val1 = (ST)S[i + ksz_cn];
+ s += val1*val1 - val0*val0;
+ D[i+cn] = s;
+ }
+ }
+ }
+ };
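+
+ // Same sliding-window scheme as RowSum, but accumulating squares:
+ // s += S[i + ksz_cn]^2 - S[i]^2. Combined with a plain box filter this
+ // yields the two moments needed for local mean/variance statistics.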
+
+ static Ptr<BaseRowFilter> getSqrRowSumFilter(int srcType, int sumType, int ksize, int anchor)
+ {
+ int sdepth = CV_MAT_DEPTH(srcType), ddepth = CV_MAT_DEPTH(sumType);
+ CV_Assert( CV_MAT_CN(sumType) == CV_MAT_CN(srcType) );
+
+ if( anchor < 0 )
+ anchor = ksize/2;
+
+ if( sdepth == CV_8U && ddepth == CV_32S )
+ return makePtr<SqrRowSum<uchar, int> >(ksize, anchor);
+ if( sdepth == CV_8U && ddepth == CV_64F )
+ return makePtr<SqrRowSum<uchar, double> >(ksize, anchor);
+ if( sdepth == CV_16U && ddepth == CV_64F )
+ return makePtr<SqrRowSum<ushort, double> >(ksize, anchor);
+ if( sdepth == CV_16S && ddepth == CV_64F )
+ return makePtr<SqrRowSum<short, double> >(ksize, anchor);
+ if( sdepth == CV_32F && ddepth == CV_64F )
+ return makePtr<SqrRowSum<float, double> >(ksize, anchor);
+ if( sdepth == CV_64F && ddepth == CV_64F )
+ return makePtr<SqrRowSum<double, double> >(ksize, anchor);
+
+ CV_Error_( CV_StsNotImplemented,
+ ("Unsupported combination of source format (=%d), and buffer format (=%d)",
+ srcType, sumType));
+ }
+
+ }
+
+ void cv::sqrBoxFilter( InputArray _src, OutputArray _dst, int ddepth,
+ Size ksize, Point anchor,
+ bool normalize, int borderType )
+ {
+ CV_INSTRUMENT_REGION();
+
+ int srcType = _src.type(), sdepth = CV_MAT_DEPTH(srcType), cn = CV_MAT_CN(srcType);
+ Size size = _src.size();
+
+ if( ddepth < 0 )
+ ddepth = sdepth < CV_32F ? CV_32F : CV_64F;
+
+ if( borderType != BORDER_CONSTANT && normalize )
+ {
+ if( size.height == 1 )
+ ksize.height = 1;
+ if( size.width == 1 )
+ ksize.width = 1;
+ }
+
+ CV_OCL_RUN(_dst.isUMat() && _src.dims() <= 2,
+ ocl_boxFilter(_src, _dst, ddepth, ksize, anchor, borderType, normalize, true))
+
+ int sumDepth = CV_64F;
+ if( sdepth == CV_8U )
+ sumDepth = CV_32S;
+ int sumType = CV_MAKETYPE( sumDepth, cn ), dstType = CV_MAKETYPE(ddepth, cn);
+
+ Mat src = _src.getMat();
+ _dst.create( size, dstType );
+ Mat dst = _dst.getMat();
+
+ Ptr<BaseRowFilter> rowFilter = getSqrRowSumFilter(srcType, sumType, ksize.width, anchor.x );
+ Ptr<BaseColumnFilter> columnFilter = getColumnSumFilter(sumType,
+ dstType, ksize.height, anchor.y,
+ normalize ? 1./(ksize.width*ksize.height) : 1);
+
+ Ptr<FilterEngine> f = makePtr<FilterEngine>(Ptr<BaseFilter>(), rowFilter, columnFilter,
+ srcType, dstType, sumType, borderType );
+ Point ofs;
+ Size wsz(src.cols, src.rows);
+ src.locateROI( wsz, ofs );
+
+ f->apply( src, dst, wsz, ofs );
+ }
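+
+ // A typical use of sqrBoxFilter (a sketch, not part of this file) is local
+ // variance from the first two moments, var = E[x^2] - E[x]^2:
+ //
+ // cv::Mat mean, sqrMean;
+ // cv::boxFilter(src, mean, CV_32F, ksize);
+ // cv::sqrBoxFilter(src, sqrMean, CV_32F, ksize);
+ // cv::Mat variance = sqrMean - mean.mul(mean);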
+
+ /* End of file. */