TAPI: stiching: add custom OpenCL kernels for MultiBandBlender
authorAlexander Alekhin <alexander.alekhin@itseez.com>
Wed, 26 Feb 2014 15:02:36 +0000 (19:02 +0400)
committerAndrey Pavlenko <andrey.pavlenko@itseez.com>
Fri, 11 Apr 2014 09:01:12 +0000 (13:01 +0400)
modules/core/include/opencv2/core/ocl.hpp
modules/core/include/opencv2/core/utility.hpp
modules/core/src/ocl.cpp
modules/stitching/src/blenders.cpp
modules/stitching/src/opencl/multibandblend.cl [new file with mode: 0644]

index fdb6f9a..254cb10 100644 (file)
@@ -598,6 +598,8 @@ CV_EXPORTS int predictOptimalVectorWidth(InputArray src1, InputArray src2 = noAr
                                          InputArray src4 = noArray(), InputArray src5 = noArray(), InputArray src6 = noArray(),
                                          InputArray src7 = noArray(), InputArray src8 = noArray(), InputArray src9 = noArray());
 
+CV_EXPORTS void buildOptionsAddMatrixDescription(String& buildOptions, const String& name, InputArray _m);
+
 class CV_EXPORTS Image2D
 {
 public:
index 3e844cc..a8957f7 100644 (file)
@@ -495,6 +495,11 @@ template<> inline std::string CommandLineParser::get<std::string>(const String&
 }
 #endif // OPENCV_NOSTL
 
+#if !defined(OPENCV_SKIP_SUPPRESS_WARNING) || !OPENCV_SKIP_SUPPRESS_WARNING
+// Use this to bypass "warning C4127: conditional expression is constant"
+template <typename T> T SuppressWarning(T v) { return v; }
+#endif
+
 } //namespace cv
 
 #endif //__OPENCV_CORE_UTILITY_H__
index 24190c5..2017300 100644 (file)
@@ -4404,7 +4404,24 @@ int predictOptimalVectorWidth(InputArray src1, InputArray src2, InputArray src3,
 
 #undef PROCESS_SRC
 
-/////////////////////////////////////////// Image2D ////////////////////////////////////////////////////
+
+// TODO Make this as a method of OpenCL "BuildOptions" class
+void buildOptionsAddMatrixDescription(String& buildOptions, const String& name, InputArray _m)
+{
+    if (!buildOptions.empty())
+        buildOptions += " ";
+    int type = _m.type(), depth = CV_MAT_DEPTH(type);
+    buildOptions += format(
+            "-D %s_T=%s -D %s_T1=%s -D %s_CN=%d -D %s_TSIZE=%d -D %s_T1SIZE=%d -D %s_DEPTH=%d",
+            name.c_str(), ocl::typeToStr(type),
+            name.c_str(), ocl::typeToStr(CV_MAKE_TYPE(depth, 1)),
+            name.c_str(), (int)CV_MAT_CN(type),
+            name.c_str(), (int)CV_ELEM_SIZE(type),
+            name.c_str(), (int)CV_ELEM_SIZE1(type),
+            name.c_str(), (int)depth
+            );
+}
+
 
 struct Image2D::Impl
 {
index acb4987..a82da97 100644 (file)
@@ -41,6 +41,7 @@
 //M*/
 
 #include "precomp.hpp"
+#include "opencl_kernels.hpp"
 
 namespace cv {
 namespace detail {
@@ -245,6 +246,31 @@ void MultiBandBlender::prepare(Rect dst_roi)
     }
 }
 
+#ifdef HAVE_OPENCL
+static bool ocl_MultiBandBlender_feed(InputArray _src, InputArray _weight,
+        InputOutputArray _dst, InputOutputArray _dst_weight)
+{
+    String buildOptions = "-D DEFINE_feed";
+    ocl::buildOptionsAddMatrixDescription(buildOptions, "src", _src);
+    ocl::buildOptionsAddMatrixDescription(buildOptions, "weight", _weight);
+    ocl::buildOptionsAddMatrixDescription(buildOptions, "dst", _dst);
+    ocl::buildOptionsAddMatrixDescription(buildOptions, "dstWeight", _dst_weight);
+    ocl::Kernel k("feed", ocl::stitching::multibandblend_oclsrc, buildOptions);
+    if (k.empty())
+        return false;
+
+    UMat src = _src.getUMat();
+
+    k.args(ocl::KernelArg::ReadOnly(src),
+           ocl::KernelArg::ReadOnly(_weight.getUMat()),
+           ocl::KernelArg::ReadWrite(_dst.getUMat()),
+           ocl::KernelArg::ReadWrite(_dst_weight.getUMat())
+           );
+
+    size_t globalsize[2] = {src.cols, src.rows };
+    return k.run(2, globalsize, NULL, false);
+}
+#endif
 
 void MultiBandBlender::feed(InputArray _img, InputArray mask, Point tl)
 {
@@ -338,63 +364,61 @@ void MultiBandBlender::feed(InputArray _img, InputArray mask, Point tl)
     int x_br = br_new.x - dst_roi_.x;
 
     // Add weighted layer of the source image to the final Laplacian pyramid layer
-    if(weight_type_ == CV_32F)
+    for (int i = 0; i <= num_bands_; ++i)
     {
-        for (int i = 0; i <= num_bands_; ++i)
+        Rect rc(x_tl, y_tl, x_br - x_tl, y_br - y_tl);
+        CV_OPENCL_RUN(SuppressWarning(true),
+                ocl_MultiBandBlender_feed(src_pyr_laplace[i], weight_pyr_gauss[i],
+                        dst_pyr_laplace_[i](rc),
+                        dst_band_weights_[i](rc)),
+                goto next_band;)
         {
             Mat _src_pyr_laplace = src_pyr_laplace[i].getMat(ACCESS_READ);
-            Mat _dst_pyr_laplace = dst_pyr_laplace_[i].getMat(ACCESS_RW);
+            Mat _dst_pyr_laplace = dst_pyr_laplace_[i](rc).getMat(ACCESS_RW);
             Mat _weight_pyr_gauss = weight_pyr_gauss[i].getMat(ACCESS_READ);
-            Mat _dst_band_weights = dst_band_weights_[i].getMat(ACCESS_RW);
-            for (int y = y_tl; y < y_br; ++y)
+            Mat _dst_band_weights = dst_band_weights_[i](rc).getMat(ACCESS_RW);
+            if(weight_type_ == CV_32F)
             {
-                int y_ = y - y_tl;
-                const Point3_<short>* src_row = _src_pyr_laplace.ptr<Point3_<short> >(y_);
-                Point3_<short>* dst_row = _dst_pyr_laplace.ptr<Point3_<short> >(y);
-                const float* weight_row = _weight_pyr_gauss.ptr<float>(y_);
-                float* dst_weight_row = _dst_band_weights.ptr<float>(y);
-
-                for (int x = x_tl; x < x_br; ++x)
+                for (int y = 0; y < rc.height; ++y)
                 {
-                    int x_ = x - x_tl;
-                    dst_row[x].x += static_cast<short>(src_row[x_].x * weight_row[x_]);
-                    dst_row[x].y += static_cast<short>(src_row[x_].y * weight_row[x_]);
-                    dst_row[x].z += static_cast<short>(src_row[x_].z * weight_row[x_]);
-                    dst_weight_row[x] += weight_row[x_];
+                    const Point3_<short>* src_row = _src_pyr_laplace.ptr<Point3_<short> >(y);
+                    Point3_<short>* dst_row = _dst_pyr_laplace.ptr<Point3_<short> >(y);
+                    const float* weight_row = _weight_pyr_gauss.ptr<float>(y);
+                    float* dst_weight_row = _dst_band_weights.ptr<float>(y);
+
+                    for (int x = 0; x < rc.width; ++x)
+                    {
+                        dst_row[x].x += static_cast<short>(src_row[x].x * weight_row[x]);
+                        dst_row[x].y += static_cast<short>(src_row[x].y * weight_row[x]);
+                        dst_row[x].z += static_cast<short>(src_row[x].z * weight_row[x]);
+                        dst_weight_row[x] += weight_row[x];
+                    }
                 }
             }
-            x_tl /= 2; y_tl /= 2;
-            x_br /= 2; y_br /= 2;
-        }
-    }
-    else // weight_type_ == CV_16S
-    {
-        for (int i = 0; i <= num_bands_; ++i)
-        {
-            Mat _src_pyr_laplace = src_pyr_laplace[i].getMat(ACCESS_READ);
-            Mat _dst_pyr_laplace = dst_pyr_laplace_[i].getMat(ACCESS_RW);
-            Mat _weight_pyr_gauss = weight_pyr_gauss[i].getMat(ACCESS_READ);
-            Mat _dst_band_weights = dst_band_weights_[i].getMat(ACCESS_RW);
-            for (int y = y_tl; y < y_br; ++y)
+            else // weight_type_ == CV_16S
             {
-                int y_ = y - y_tl;
-                const Point3_<short>* src_row = _src_pyr_laplace.ptr<Point3_<short> >(y_);
-                Point3_<short>* dst_row = _dst_pyr_laplace.ptr<Point3_<short> >(y);
-                const short* weight_row = _weight_pyr_gauss.ptr<short>(y_);
-                short* dst_weight_row = _dst_band_weights.ptr<short>(y);
-
-                for (int x = x_tl; x < x_br; ++x)
+                for (int y = 0; y < y_br - y_tl; ++y)
                 {
-                    int x_ = x - x_tl;
-                    dst_row[x].x += short((src_row[x_].x * weight_row[x_]) >> 8);
-                    dst_row[x].y += short((src_row[x_].y * weight_row[x_]) >> 8);
-                    dst_row[x].z += short((src_row[x_].z * weight_row[x_]) >> 8);
-                    dst_weight_row[x] += weight_row[x_];
+                    const Point3_<short>* src_row = _src_pyr_laplace.ptr<Point3_<short> >(y);
+                    Point3_<short>* dst_row = _dst_pyr_laplace.ptr<Point3_<short> >(y);
+                    const short* weight_row = _weight_pyr_gauss.ptr<short>(y);
+                    short* dst_weight_row = _dst_band_weights.ptr<short>(y);
+
+                    for (int x = 0; x < x_br - x_tl; ++x)
+                    {
+                        dst_row[x].x += short((src_row[x].x * weight_row[x]) >> 8);
+                        dst_row[x].y += short((src_row[x].y * weight_row[x]) >> 8);
+                        dst_row[x].z += short((src_row[x].z * weight_row[x]) >> 8);
+                        dst_weight_row[x] += weight_row[x];
+                    }
                 }
             }
-            x_tl /= 2; y_tl /= 2;
-            x_br /= 2; y_br /= 2;
         }
+#ifdef HAVE_OPENCL
+next_band:
+#endif
+        x_tl /= 2; y_tl /= 2;
+        x_br /= 2; y_br /= 2;
     }
 
     LOGLN("  Add weighted layer of the source image to the final Laplacian pyramid layer, time: " << ((getTickCount() - t) / getTickFrequency()) << " sec");
@@ -411,10 +435,10 @@ void MultiBandBlender::blend(InputOutputArray dst, InputOutputArray dst_mask)
     else
         restoreImageFromLaplacePyr(dst_pyr_laplace_);
 
-    dst_ = dst_pyr_laplace_[0];
-    dst_ = dst_(Range(0, dst_roi_final_.height), Range(0, dst_roi_final_.width));
+    Rect dst_rc(0, 0, dst_roi_final_.width, dst_roi_final_.height);
+    dst_ = dst_pyr_laplace_[0](dst_rc);
     UMat _dst_mask;
-    compare(dst_band_weights_[0](Range(0, dst_roi_final_.height), Range(0, dst_roi_final_.width)), WEIGHT_EPS, dst_mask_, CMP_GT);
+    compare(dst_band_weights_[0](dst_rc), WEIGHT_EPS, dst_mask_, CMP_GT);
     dst_pyr_laplace_.clear();
     dst_band_weights_.clear();
 
@@ -425,47 +449,74 @@ void MultiBandBlender::blend(InputOutputArray dst, InputOutputArray dst_mask)
 //////////////////////////////////////////////////////////////////////////////
 // Auxiliary functions
 
+#ifdef HAVE_OPENCL
+static bool ocl_normalizeUsingWeightMap(InputArray _weight, InputOutputArray _mat)
+{
+    String buildOptions = "-D DEFINE_normalizeUsingWeightMap";
+    ocl::buildOptionsAddMatrixDescription(buildOptions, "mat", _mat);
+    ocl::buildOptionsAddMatrixDescription(buildOptions, "weight", _weight);
+    ocl::Kernel k("normalizeUsingWeightMap", ocl::stitching::multibandblend_oclsrc, buildOptions);
+    if (k.empty())
+        return false;
+
+    UMat mat = _mat.getUMat();
+
+    k.args(ocl::KernelArg::ReadWrite(mat),
+           ocl::KernelArg::ReadOnly(_weight.getUMat())
+           );
+
+    size_t globalsize[2] = {mat.cols, mat.rows };
+    return k.run(2, globalsize, NULL, false);
+}
+#endif
+
 void normalizeUsingWeightMap(InputArray _weight, InputOutputArray _src)
 {
 #ifdef HAVE_TEGRA_OPTIMIZATION
     if(tegra::normalizeUsingWeightMap(weight, src))
         return;
 #endif
-    Mat weight = _weight.getMat();
-    Mat src = _src.getMat();
-
-    CV_Assert(src.type() == CV_16SC3);
 
-    if(weight.type() == CV_32FC1)
+    CV_OPENCL_RUN(SuppressWarning(true),
+                  ocl_normalizeUsingWeightMap(_weight, _src),
+                  return;)
     {
-        for (int y = 0; y < src.rows; ++y)
-        {
-            Point3_<short> *row = src.ptr<Point3_<short> >(y);
-            const float *weight_row = weight.ptr<float>(y);
+        Mat weight = _weight.getMat();
+        Mat src = _src.getMat();
+
+        CV_Assert(src.type() == CV_16SC3);
 
-            for (int x = 0; x < src.cols; ++x)
+        if(weight.type() == CV_32FC1)
+        {
+            for (int y = 0; y < src.rows; ++y)
             {
-                row[x].x = static_cast<short>(row[x].x / (weight_row[x] + WEIGHT_EPS));
-                row[x].y = static_cast<short>(row[x].y / (weight_row[x] + WEIGHT_EPS));
-                row[x].z = static_cast<short>(row[x].z / (weight_row[x] + WEIGHT_EPS));
+                Point3_<short> *row = src.ptr<Point3_<short> >(y);
+                const float *weight_row = weight.ptr<float>(y);
+
+                for (int x = 0; x < src.cols; ++x)
+                {
+                    row[x].x = static_cast<short>(row[x].x / (weight_row[x] + WEIGHT_EPS));
+                    row[x].y = static_cast<short>(row[x].y / (weight_row[x] + WEIGHT_EPS));
+                    row[x].z = static_cast<short>(row[x].z / (weight_row[x] + WEIGHT_EPS));
+                }
             }
         }
-    }
-    else
-    {
-        CV_Assert(weight.type() == CV_16SC1);
-
-        for (int y = 0; y < src.rows; ++y)
+        else
         {
-            const short *weight_row = weight.ptr<short>(y);
-            Point3_<short> *row = src.ptr<Point3_<short> >(y);
+            CV_Assert(weight.type() == CV_16SC1);
 
-            for (int x = 0; x < src.cols; ++x)
+            for (int y = 0; y < src.rows; ++y)
             {
-                int w = weight_row[x] + 1;
-                row[x].x = static_cast<short>((row[x].x << 8) / w);
-                row[x].y = static_cast<short>((row[x].y << 8) / w);
-                row[x].z = static_cast<short>((row[x].z << 8) / w);
+                const short *weight_row = weight.ptr<short>(y);
+                Point3_<short> *row = src.ptr<Point3_<short> >(y);
+
+                for (int x = 0; x < src.cols; ++x)
+                {
+                    int w = weight_row[x] + 1;
+                    row[x].x = static_cast<short>((row[x].x << 8) / w);
+                    row[x].y = static_cast<short>((row[x].y << 8) / w);
+                    row[x].z = static_cast<short>((row[x].z << 8) / w);
+                }
             }
         }
     }
diff --git a/modules/stitching/src/opencl/multibandblend.cl b/modules/stitching/src/opencl/multibandblend.cl
new file mode 100644 (file)
index 0000000..d42ad82
--- /dev/null
@@ -0,0 +1,282 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2014, Itseez, Inc, all rights reserved.
+
+//
+// Common preprocessors macro
+//
+
+//
+// TODO: Move this common code into "header" file
+//
+
+#ifndef NL // New Line: for preprocessor debugging
+#define NL
+#endif
+
+#define REF(x) x
+#define __CAT(x, y) x##y
+#define CAT(x, y) __CAT(x, y)
+
+//
+// All matrixes are come with this description ("name" is a name of matrix):
+// * name_CN - number of channels (1,2,3,4)
+// * name_DEPTH - numeric value of CV_MAT_DEPTH(type). See CV_8U, CV_32S, etc macro below.
+//
+// Currently we also pass these attributes (to reduce this macro block):
+// * name_T - datatype (int, float, uchar4, float4)
+// * name_T1 - datatype for one channel (int, float, uchar).
+//   It is equal to result of "T1(name_T)" macro
+// * name_TSIZE - CV_ELEM_SIZE(type).
+//   We can't use sizeof(name_T) here, because sizeof(float3) is usually equal to 8, not 6.
+// * name_T1SIZE - CV_ELEM_SIZE1(type)
+//
+
+//
+// Usage sample:
+//
+// #define workType TYPE(float, src_CN)
+// #define convertToWorkType CONVERT_TO(workType)
+// #define convertWorkTypeToDstType CONVERT(workType, dst_T)
+//
+// __kernel void kernelFn(DECLARE_MAT_ARG(src), DECLARE_MAT_ARG(dst))
+// {
+//     const int x = get_global_id(0);
+//     const int y = get_global_id(1);
+//
+//     if (x < srcWidth && y < srcHeight)
+//     {
+//         int src_byteOffset = MAT_BYTE_OFFSET(src, x, y);
+//         int dst_byteOffset = MAT_BYTE_OFFSET(dst, x, y);
+//         workType value = convertToWorkType(LOAD_MAT_AT(src, src_byteOffset));
+//
+//         ... value processing ...
+//
+//         STORE_MAT_AT(dst, dst_byteOffset, convertWorkTypeToDstType(value));
+//     }
+// }
+//
+
+#define DECLARE_MAT_ARG(name) \
+    __global uchar* restrict name ## Ptr, \
+    int name ## StepBytes, \
+    int name ## Offset, \
+    int name ## Height, \
+    int name ## Width NL
+
+#define MAT_BYTE_OFFSET(name, x, y) mad24((y)/* + name ## OffsetY*/, name ## StepBytes, ((x)/* + name ## OffsetX*/) * (int)(name ## _TSIZE) + name ## Offset)
+#define MAT_RELATIVE_BYTE_OFFSET(name, x, y) mad24(y, name ## StepBytes, (x) * (int)(name ## _TSIZE))
+
+#define __LOAD_MAT_AT(name, byteOffset) *((const __global name ## _T*)(name ## Ptr + (byteOffset)))
+#define __vload_CN__(name_cn) vload ## name_cn
+#define __vload_CN_(name_cn) __vload_CN__(name_cn)
+#define __vload_CN(name) __vload_CN_(name ## _CN)
+#define __LOAD_MAT_AT_vload(name, byteOffset) __vload_CN(name)(0, ((const __global name ## _T1*)(name ## Ptr + (byteOffset))))
+#define __LOAD_MAT_AT_1 __LOAD_MAT_AT
+#define __LOAD_MAT_AT_2 __LOAD_MAT_AT
+#define __LOAD_MAT_AT_3 __LOAD_MAT_AT_vload
+#define __LOAD_MAT_AT_4 __LOAD_MAT_AT
+#define __LOAD_MAT_AT_CN__(name_cn) __LOAD_MAT_AT_ ## name_cn
+#define __LOAD_MAT_AT_CN_(name_cn) __LOAD_MAT_AT_CN__(name_cn)
+#define __LOAD_MAT_AT_CN(name) __LOAD_MAT_AT_CN_(name ## _CN)
+#define LOAD_MAT_AT(name, byteOffset) __LOAD_MAT_AT_CN(name)(name, byteOffset)
+
+#define __STORE_MAT_AT(name, byteOffset, v) *((__global name ## _T*)(name ## Ptr + (byteOffset))) = v
+#define __vstore_CN__(name_cn) vstore ## name_cn
+#define __vstore_CN_(name_cn) __vstore_CN__(name_cn)
+#define __vstore_CN(name) __vstore_CN_(name ## _CN)
+#define __STORE_MAT_AT_vstore(name, byteOffset, v) __vstore_CN(name)(v, 0, ((__global name ## _T1*)(name ## Ptr + (byteOffset))))
+#define __STORE_MAT_AT_1 __STORE_MAT_AT
+#define __STORE_MAT_AT_2 __STORE_MAT_AT
+#define __STORE_MAT_AT_3 __STORE_MAT_AT_vstore
+#define __STORE_MAT_AT_4 __STORE_MAT_AT
+#define __STORE_MAT_AT_CN__(name_cn) __STORE_MAT_AT_ ## name_cn
+#define __STORE_MAT_AT_CN_(name_cn) __STORE_MAT_AT_CN__(name_cn)
+#define __STORE_MAT_AT_CN(name) __STORE_MAT_AT_CN_(name ## _CN)
+#define STORE_MAT_AT(name, byteOffset, v) __STORE_MAT_AT_CN(name)(name, byteOffset, v)
+
+#define T1_uchar uchar
+#define T1_uchar2 uchar
+#define T1_uchar3 uchar
+#define T1_uchar4 uchar
+#define T1_char char
+#define T1_char2 char
+#define T1_char3 char
+#define T1_char4 char
+#define T1_ushort ushort
+#define T1_ushort2 ushort
+#define T1_ushort3 ushort
+#define T1_ushort4 ushort
+#define T1_short short
+#define T1_short2 short
+#define T1_short3 short
+#define T1_short4 short
+#define T1_int int
+#define T1_int2 int
+#define T1_int3 int
+#define T1_int4 int
+#define T1_float float
+#define T1_float2 float
+#define T1_float3 float
+#define T1_float4 float
+#define T1_double double
+#define T1_double2 double
+#define T1_double3 double
+#define T1_double4 double
+#define T1(type) REF(CAT(T1_, REF(type)))
+
+#define uchar1 uchar
+#define char1 char
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define float1 float
+#define double1 double
+#define TYPE(type, cn) REF(CAT(REF(type), REF(cn)))
+
+#define __CONVERT_MODE_uchar_uchar __NO_CONVERT
+#define __CONVERT_MODE_uchar_char __CONVERT_sat
+#define __CONVERT_MODE_uchar_ushort __CONVERT
+#define __CONVERT_MODE_uchar_short __CONVERT
+#define __CONVERT_MODE_uchar_int __CONVERT
+#define __CONVERT_MODE_uchar_float __CONVERT
+#define __CONVERT_MODE_uchar_double __CONVERT
+#define __CONVERT_MODE_char_uchar __CONVERT_sat
+#define __CONVERT_MODE_char_char __NO_CONVERT
+#define __CONVERT_MODE_char_ushort __CONVERT_sat
+#define __CONVERT_MODE_char_short __CONVERT
+#define __CONVERT_MODE_char_int __CONVERT
+#define __CONVERT_MODE_char_float __CONVERT
+#define __CONVERT_MODE_char_double __CONVERT
+#define __CONVERT_MODE_ushort_uchar __CONVERT_sat
+#define __CONVERT_MODE_ushort_char __CONVERT_sat
+#define __CONVERT_MODE_ushort_ushort __NO_CONVERT
+#define __CONVERT_MODE_ushort_short __CONVERT_sat
+#define __CONVERT_MODE_ushort_int __CONVERT
+#define __CONVERT_MODE_ushort_float __CONVERT
+#define __CONVERT_MODE_ushort_double __CONVERT
+#define __CONVERT_MODE_short_uchar __CONVERT_sat
+#define __CONVERT_MODE_short_char __CONVERT_sat
+#define __CONVERT_MODE_short_ushort __CONVERT_sat
+#define __CONVERT_MODE_short_short __NO_CONVERT
+#define __CONVERT_MODE_short_int __CONVERT
+#define __CONVERT_MODE_short_float __CONVERT
+#define __CONVERT_MODE_short_double __CONVERT
+#define __CONVERT_MODE_int_uchar __CONVERT_sat
+#define __CONVERT_MODE_int_char __CONVERT_sat
+#define __CONVERT_MODE_int_ushort __CONVERT_sat
+#define __CONVERT_MODE_int_short __CONVERT_sat
+#define __CONVERT_MODE_int_int __NO_CONVERT
+#define __CONVERT_MODE_int_float __CONVERT
+#define __CONVERT_MODE_int_double __CONVERT
+#define __CONVERT_MODE_float_uchar __CONVERT_sat_rte
+#define __CONVERT_MODE_float_char __CONVERT_sat_rte
+#define __CONVERT_MODE_float_ushort __CONVERT_sat_rte
+#define __CONVERT_MODE_float_short __CONVERT_sat_rte
+#define __CONVERT_MODE_float_int __CONVERT_rte
+#define __CONVERT_MODE_float_float __NO_CONVERT
+#define __CONVERT_MODE_float_double __CONVERT
+#define __CONVERT_MODE_double_uchar __CONVERT_sat_rte
+#define __CONVERT_MODE_double_char __CONVERT_sat_rte
+#define __CONVERT_MODE_double_ushort __CONVERT_sat_rte
+#define __CONVERT_MODE_double_short __CONVERT_sat_rte
+#define __CONVERT_MODE_double_int __CONVERT_rte
+#define __CONVERT_MODE_double_float __CONVERT
+#define __CONVERT_MODE_double_double __NO_CONVERT
+#define __CONVERT_MODE(srcType, dstType) CAT(__CONVERT_MODE_, CAT(REF(T1(srcType)), CAT(_, REF(T1(dstType)))))
+
+#define __ROUND_MODE__NO_CONVERT
+#define __ROUND_MODE__CONVERT // nothing
+#define __ROUND_MODE__CONVERT_rte _rte
+#define __ROUND_MODE__CONVERT_sat _sat
+#define __ROUND_MODE__CONVERT_sat_rte _sat_rte
+#define ROUND_MODE(srcType, dstType) CAT(__ROUND_MODE_, __CONVERT_MODE(srcType, dstType))
+
+#define __CONVERT_ROUND(dstType, roundMode) CAT(CAT(convert_, REF(dstType)), roundMode)
+#define __NO_CONVERT(dstType) // nothing
+#define __CONVERT(dstType) __CONVERT_ROUND(dstType,)
+#define __CONVERT_rte(dstType) __CONVERT_ROUND(dstType,_rte)
+#define __CONVERT_sat(dstType) __CONVERT_ROUND(dstType,_sat)
+#define __CONVERT_sat_rte(dstType) __CONVERT_ROUND(dstType,_sat_rte)
+#define CONVERT(srcType, dstType) REF(__CONVERT_MODE(srcType,dstType))(dstType)
+#define CONVERT_TO(dstType) __CONVERT_ROUND(dstType,)
+
+// OpenCV depths
+#define CV_8U   0
+#define CV_8S   1
+#define CV_16U  2
+#define CV_16S  3
+#define CV_32S  4
+#define CV_32F  5
+#define CV_64F  6
+
+//
+// End of common preprocessors macro
+//
+
+
+
+#if defined(DEFINE_feed)
+
+#define workType TYPE(weight_T1, src_CN)
+#define convertSrcToWorkType CONVERT_TO(workType)
+#define convertWorkTypeToDstType CONVERT(workType, dst_T)
+
+__kernel void feed(
+        DECLARE_MAT_ARG(src), DECLARE_MAT_ARG(weight),
+        DECLARE_MAT_ARG(dst), DECLARE_MAT_ARG(dstWeight)
+)
+{
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+
+    if (x < srcWidth && y < srcHeight)
+    {
+        int src_byteOffset = MAT_BYTE_OFFSET(src, x, y);
+        int weight_byteOffset = MAT_BYTE_OFFSET(weight, x, y);
+        int dst_byteOffset = MAT_BYTE_OFFSET(dst, x, y);
+        int dstWeight_byteOffset = MAT_BYTE_OFFSET(dstWeight, x, y);
+
+        weight_T w = LOAD_MAT_AT(weight, weight_byteOffset);
+        workType src_value = convertSrcToWorkType(LOAD_MAT_AT(src, src_byteOffset));
+        STORE_MAT_AT(dst, dst_byteOffset, LOAD_MAT_AT(dst, dst_byteOffset) + convertWorkTypeToDstType(src_value * w));
+        STORE_MAT_AT(dstWeight, dstWeight_byteOffset, LOAD_MAT_AT(dstWeight, dstWeight_byteOffset) + w);
+    }
+}
+
+#endif
+
+#if defined(DEFINE_normalizeUsingWeightMap)
+
+#define workType TYPE(weight_T1, mat_CN)
+#define convertSrcToWorkType CONVERT_TO(workType)
+#define convertWorkTypeToDstType CONVERT(workType, mat_T)
+
+#if weight_DEPTH >= CV_32F
+#define WEIGHT_EPS 1e-5f
+#else
+#define WEIGHT_EPS 0
+#endif
+
+__kernel void normalizeUsingWeightMap(
+        DECLARE_MAT_ARG(mat), DECLARE_MAT_ARG(weight)
+)
+{
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+
+    if (x < matWidth && y < matHeight)
+    {
+        int mat_byteOffset = MAT_BYTE_OFFSET(mat, x, y);
+        int weight_byteOffset = MAT_BYTE_OFFSET(weight, x, y);
+
+        weight_T w = LOAD_MAT_AT(weight, weight_byteOffset);
+        workType value = convertSrcToWorkType(LOAD_MAT_AT(mat, mat_byteOffset));
+        value = value / (w + WEIGHT_EPS);
+        STORE_MAT_AT(mat, mat_byteOffset, convertWorkTypeToDstType(value));
+    }
+}
+
+#endif