From: Alexander Alekhin Date: Wed, 26 Feb 2014 15:02:36 +0000 (+0400) Subject: TAPI: stiching: add custom OpenCL kernels for MultiBandBlender X-Git-Tag: accepted/tizen/6.0/unified/20201030.111113~3303^2~5 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=06738468af9b706e423a8ccbe98374f288f42c18;p=platform%2Fupstream%2Fopencv.git TAPI: stiching: add custom OpenCL kernels for MultiBandBlender --- diff --git a/modules/core/include/opencv2/core/ocl.hpp b/modules/core/include/opencv2/core/ocl.hpp index fdb6f9a..254cb10 100644 --- a/modules/core/include/opencv2/core/ocl.hpp +++ b/modules/core/include/opencv2/core/ocl.hpp @@ -598,6 +598,8 @@ CV_EXPORTS int predictOptimalVectorWidth(InputArray src1, InputArray src2 = noAr InputArray src4 = noArray(), InputArray src5 = noArray(), InputArray src6 = noArray(), InputArray src7 = noArray(), InputArray src8 = noArray(), InputArray src9 = noArray()); +CV_EXPORTS void buildOptionsAddMatrixDescription(String& buildOptions, const String& name, InputArray _m); + class CV_EXPORTS Image2D { public: diff --git a/modules/core/include/opencv2/core/utility.hpp b/modules/core/include/opencv2/core/utility.hpp index 3e844cc..a8957f7 100644 --- a/modules/core/include/opencv2/core/utility.hpp +++ b/modules/core/include/opencv2/core/utility.hpp @@ -495,6 +495,11 @@ template<> inline std::string CommandLineParser::get(const String& } #endif // OPENCV_NOSTL +#if !defined(OPENCV_SKIP_SUPPRESS_WARNING) || !OPENCV_SKIP_SUPPRESS_WARNING +// Use this to bypass "warning C4127: conditional expression is constant" +template T SuppressWarning(T v) { return v; } +#endif + } //namespace cv #endif //__OPENCV_CORE_UTILITY_H__ diff --git a/modules/core/src/ocl.cpp b/modules/core/src/ocl.cpp index 24190c5..2017300 100644 --- a/modules/core/src/ocl.cpp +++ b/modules/core/src/ocl.cpp @@ -4404,7 +4404,24 @@ int predictOptimalVectorWidth(InputArray src1, InputArray src2, InputArray src3, #undef PROCESS_SRC -/////////////////////////////////////////// Image2D //////////////////////////////////////////////////// + +// TODO Make this as a method of OpenCL "BuildOptions" class +void buildOptionsAddMatrixDescription(String& buildOptions, const String& name, InputArray _m) +{ + if (!buildOptions.empty()) + buildOptions += " "; + int type = _m.type(), depth = CV_MAT_DEPTH(type); + buildOptions += format( + "-D %s_T=%s -D %s_T1=%s -D %s_CN=%d -D %s_TSIZE=%d -D %s_T1SIZE=%d -D %s_DEPTH=%d", + name.c_str(), ocl::typeToStr(type), + name.c_str(), ocl::typeToStr(CV_MAKE_TYPE(depth, 1)), + name.c_str(), (int)CV_MAT_CN(type), + name.c_str(), (int)CV_ELEM_SIZE(type), + name.c_str(), (int)CV_ELEM_SIZE1(type), + name.c_str(), (int)depth + ); +} + struct Image2D::Impl { diff --git a/modules/stitching/src/blenders.cpp b/modules/stitching/src/blenders.cpp index acb4987..a82da97 100644 --- a/modules/stitching/src/blenders.cpp +++ b/modules/stitching/src/blenders.cpp @@ -41,6 +41,7 @@ //M*/ #include "precomp.hpp" +#include "opencl_kernels.hpp" namespace cv { namespace detail { @@ -245,6 +246,31 @@ void MultiBandBlender::prepare(Rect dst_roi) } } +#ifdef HAVE_OPENCL +static bool ocl_MultiBandBlender_feed(InputArray _src, InputArray _weight, + InputOutputArray _dst, InputOutputArray _dst_weight) +{ + String buildOptions = "-D DEFINE_feed"; + ocl::buildOptionsAddMatrixDescription(buildOptions, "src", _src); + ocl::buildOptionsAddMatrixDescription(buildOptions, "weight", _weight); + ocl::buildOptionsAddMatrixDescription(buildOptions, "dst", _dst); + ocl::buildOptionsAddMatrixDescription(buildOptions, "dstWeight", _dst_weight); + ocl::Kernel k("feed", ocl::stitching::multibandblend_oclsrc, buildOptions); + if (k.empty()) + return false; + + UMat src = _src.getUMat(); + + k.args(ocl::KernelArg::ReadOnly(src), + ocl::KernelArg::ReadOnly(_weight.getUMat()), + ocl::KernelArg::ReadWrite(_dst.getUMat()), + ocl::KernelArg::ReadWrite(_dst_weight.getUMat()) + ); + + size_t globalsize[2] = {src.cols, src.rows }; + return k.run(2, globalsize, NULL, false); +} +#endif void MultiBandBlender::feed(InputArray _img, InputArray mask, Point tl) { @@ -338,63 +364,61 @@ void MultiBandBlender::feed(InputArray _img, InputArray mask, Point tl) int x_br = br_new.x - dst_roi_.x; // Add weighted layer of the source image to the final Laplacian pyramid layer - if(weight_type_ == CV_32F) + for (int i = 0; i <= num_bands_; ++i) { - for (int i = 0; i <= num_bands_; ++i) + Rect rc(x_tl, y_tl, x_br - x_tl, y_br - y_tl); + CV_OPENCL_RUN(SuppressWarning(true), + ocl_MultiBandBlender_feed(src_pyr_laplace[i], weight_pyr_gauss[i], + dst_pyr_laplace_[i](rc), + dst_band_weights_[i](rc)), + goto next_band;) { Mat _src_pyr_laplace = src_pyr_laplace[i].getMat(ACCESS_READ); - Mat _dst_pyr_laplace = dst_pyr_laplace_[i].getMat(ACCESS_RW); + Mat _dst_pyr_laplace = dst_pyr_laplace_[i](rc).getMat(ACCESS_RW); Mat _weight_pyr_gauss = weight_pyr_gauss[i].getMat(ACCESS_READ); - Mat _dst_band_weights = dst_band_weights_[i].getMat(ACCESS_RW); - for (int y = y_tl; y < y_br; ++y) + Mat _dst_band_weights = dst_band_weights_[i](rc).getMat(ACCESS_RW); + if(weight_type_ == CV_32F) { - int y_ = y - y_tl; - const Point3_* src_row = _src_pyr_laplace.ptr >(y_); - Point3_* dst_row = _dst_pyr_laplace.ptr >(y); - const float* weight_row = _weight_pyr_gauss.ptr(y_); - float* dst_weight_row = _dst_band_weights.ptr(y); - - for (int x = x_tl; x < x_br; ++x) + for (int y = 0; y < rc.height; ++y) { - int x_ = x - x_tl; - dst_row[x].x += static_cast(src_row[x_].x * weight_row[x_]); - dst_row[x].y += static_cast(src_row[x_].y * weight_row[x_]); - dst_row[x].z += static_cast(src_row[x_].z * weight_row[x_]); - dst_weight_row[x] += weight_row[x_]; + const Point3_* src_row = _src_pyr_laplace.ptr >(y); + Point3_* dst_row = _dst_pyr_laplace.ptr >(y); + const float* weight_row = _weight_pyr_gauss.ptr(y); + float* dst_weight_row = _dst_band_weights.ptr(y); + + for (int x = 0; x < rc.width; ++x) + { + dst_row[x].x += static_cast(src_row[x].x * weight_row[x]); + dst_row[x].y += static_cast(src_row[x].y * weight_row[x]); + dst_row[x].z += static_cast(src_row[x].z * weight_row[x]); + dst_weight_row[x] += weight_row[x]; + } } } - x_tl /= 2; y_tl /= 2; - x_br /= 2; y_br /= 2; - } - } - else // weight_type_ == CV_16S - { - for (int i = 0; i <= num_bands_; ++i) - { - Mat _src_pyr_laplace = src_pyr_laplace[i].getMat(ACCESS_READ); - Mat _dst_pyr_laplace = dst_pyr_laplace_[i].getMat(ACCESS_RW); - Mat _weight_pyr_gauss = weight_pyr_gauss[i].getMat(ACCESS_READ); - Mat _dst_band_weights = dst_band_weights_[i].getMat(ACCESS_RW); - for (int y = y_tl; y < y_br; ++y) + else // weight_type_ == CV_16S { - int y_ = y - y_tl; - const Point3_* src_row = _src_pyr_laplace.ptr >(y_); - Point3_* dst_row = _dst_pyr_laplace.ptr >(y); - const short* weight_row = _weight_pyr_gauss.ptr(y_); - short* dst_weight_row = _dst_band_weights.ptr(y); - - for (int x = x_tl; x < x_br; ++x) + for (int y = 0; y < y_br - y_tl; ++y) { - int x_ = x - x_tl; - dst_row[x].x += short((src_row[x_].x * weight_row[x_]) >> 8); - dst_row[x].y += short((src_row[x_].y * weight_row[x_]) >> 8); - dst_row[x].z += short((src_row[x_].z * weight_row[x_]) >> 8); - dst_weight_row[x] += weight_row[x_]; + const Point3_* src_row = _src_pyr_laplace.ptr >(y); + Point3_* dst_row = _dst_pyr_laplace.ptr >(y); + const short* weight_row = _weight_pyr_gauss.ptr(y); + short* dst_weight_row = _dst_band_weights.ptr(y); + + for (int x = 0; x < x_br - x_tl; ++x) + { + dst_row[x].x += short((src_row[x].x * weight_row[x]) >> 8); + dst_row[x].y += short((src_row[x].y * weight_row[x]) >> 8); + dst_row[x].z += short((src_row[x].z * weight_row[x]) >> 8); + dst_weight_row[x] += weight_row[x]; + } } } - x_tl /= 2; y_tl /= 2; - x_br /= 2; y_br /= 2; } +#ifdef HAVE_OPENCL +next_band: +#endif + x_tl /= 2; y_tl /= 2; + x_br /= 2; y_br /= 2; } LOGLN(" Add weighted layer of the source image to the final Laplacian pyramid layer, time: " << ((getTickCount() - t) / getTickFrequency()) << " sec"); @@ -411,10 +435,10 @@ void MultiBandBlender::blend(InputOutputArray dst, InputOutputArray dst_mask) else restoreImageFromLaplacePyr(dst_pyr_laplace_); - dst_ = dst_pyr_laplace_[0]; - dst_ = dst_(Range(0, dst_roi_final_.height), Range(0, dst_roi_final_.width)); + Rect dst_rc(0, 0, dst_roi_final_.width, dst_roi_final_.height); + dst_ = dst_pyr_laplace_[0](dst_rc); UMat _dst_mask; - compare(dst_band_weights_[0](Range(0, dst_roi_final_.height), Range(0, dst_roi_final_.width)), WEIGHT_EPS, dst_mask_, CMP_GT); + compare(dst_band_weights_[0](dst_rc), WEIGHT_EPS, dst_mask_, CMP_GT); dst_pyr_laplace_.clear(); dst_band_weights_.clear(); @@ -425,47 +449,74 @@ void MultiBandBlender::blend(InputOutputArray dst, InputOutputArray dst_mask) ////////////////////////////////////////////////////////////////////////////// // Auxiliary functions +#ifdef HAVE_OPENCL +static bool ocl_normalizeUsingWeightMap(InputArray _weight, InputOutputArray _mat) +{ + String buildOptions = "-D DEFINE_normalizeUsingWeightMap"; + ocl::buildOptionsAddMatrixDescription(buildOptions, "mat", _mat); + ocl::buildOptionsAddMatrixDescription(buildOptions, "weight", _weight); + ocl::Kernel k("normalizeUsingWeightMap", ocl::stitching::multibandblend_oclsrc, buildOptions); + if (k.empty()) + return false; + + UMat mat = _mat.getUMat(); + + k.args(ocl::KernelArg::ReadWrite(mat), + ocl::KernelArg::ReadOnly(_weight.getUMat()) + ); + + size_t globalsize[2] = {mat.cols, mat.rows }; + return k.run(2, globalsize, NULL, false); +} +#endif + void normalizeUsingWeightMap(InputArray _weight, InputOutputArray _src) { #ifdef HAVE_TEGRA_OPTIMIZATION if(tegra::normalizeUsingWeightMap(weight, src)) return; #endif - Mat weight = _weight.getMat(); - Mat src = _src.getMat(); - - CV_Assert(src.type() == CV_16SC3); - if(weight.type() == CV_32FC1) + CV_OPENCL_RUN(SuppressWarning(true), + ocl_normalizeUsingWeightMap(_weight, _src), + return;) { - for (int y = 0; y < src.rows; ++y) - { - Point3_ *row = src.ptr >(y); - const float *weight_row = weight.ptr(y); + Mat weight = _weight.getMat(); + Mat src = _src.getMat(); + + CV_Assert(src.type() == CV_16SC3); - for (int x = 0; x < src.cols; ++x) + if(weight.type() == CV_32FC1) + { + for (int y = 0; y < src.rows; ++y) { - row[x].x = static_cast(row[x].x / (weight_row[x] + WEIGHT_EPS)); - row[x].y = static_cast(row[x].y / (weight_row[x] + WEIGHT_EPS)); - row[x].z = static_cast(row[x].z / (weight_row[x] + WEIGHT_EPS)); + Point3_ *row = src.ptr >(y); + const float *weight_row = weight.ptr(y); + + for (int x = 0; x < src.cols; ++x) + { + row[x].x = static_cast(row[x].x / (weight_row[x] + WEIGHT_EPS)); + row[x].y = static_cast(row[x].y / (weight_row[x] + WEIGHT_EPS)); + row[x].z = static_cast(row[x].z / (weight_row[x] + WEIGHT_EPS)); + } } } - } - else - { - CV_Assert(weight.type() == CV_16SC1); - - for (int y = 0; y < src.rows; ++y) + else { - const short *weight_row = weight.ptr(y); - Point3_ *row = src.ptr >(y); + CV_Assert(weight.type() == CV_16SC1); - for (int x = 0; x < src.cols; ++x) + for (int y = 0; y < src.rows; ++y) { - int w = weight_row[x] + 1; - row[x].x = static_cast((row[x].x << 8) / w); - row[x].y = static_cast((row[x].y << 8) / w); - row[x].z = static_cast((row[x].z << 8) / w); + const short *weight_row = weight.ptr(y); + Point3_ *row = src.ptr >(y); + + for (int x = 0; x < src.cols; ++x) + { + int w = weight_row[x] + 1; + row[x].x = static_cast((row[x].x << 8) / w); + row[x].y = static_cast((row[x].y << 8) / w); + row[x].z = static_cast((row[x].z << 8) / w); + } } } } diff --git a/modules/stitching/src/opencl/multibandblend.cl b/modules/stitching/src/opencl/multibandblend.cl new file mode 100644 index 0000000..d42ad82 --- /dev/null +++ b/modules/stitching/src/opencl/multibandblend.cl @@ -0,0 +1,282 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. +// +// Copyright (C) 2014, Itseez, Inc, all rights reserved. + +// +// Common preprocessors macro +// + +// +// TODO: Move this common code into "header" file +// + +#ifndef NL // New Line: for preprocessor debugging +#define NL +#endif + +#define REF(x) x +#define __CAT(x, y) x##y +#define CAT(x, y) __CAT(x, y) + +// +// All matrixes are come with this description ("name" is a name of matrix): +// * name_CN - number of channels (1,2,3,4) +// * name_DEPTH - numeric value of CV_MAT_DEPTH(type). See CV_8U, CV_32S, etc macro below. +// +// Currently we also pass these attributes (to reduce this macro block): +// * name_T - datatype (int, float, uchar4, float4) +// * name_T1 - datatype for one channel (int, float, uchar). +// It is equal to result of "T1(name_T)" macro +// * name_TSIZE - CV_ELEM_SIZE(type). +// We can't use sizeof(name_T) here, because sizeof(float3) is usually equal to 8, not 6. +// * name_T1SIZE - CV_ELEM_SIZE1(type) +// + +// +// Usage sample: +// +// #define workType TYPE(float, src_CN) +// #define convertToWorkType CONVERT_TO(workType) +// #define convertWorkTypeToDstType CONVERT(workType, dst_T) +// +// __kernel void kernelFn(DECLARE_MAT_ARG(src), DECLARE_MAT_ARG(dst)) +// { +// const int x = get_global_id(0); +// const int y = get_global_id(1); +// +// if (x < srcWidth && y < srcHeight) +// { +// int src_byteOffset = MAT_BYTE_OFFSET(src, x, y); +// int dst_byteOffset = MAT_BYTE_OFFSET(dst, x, y); +// workType value = convertToWorkType(LOAD_MAT_AT(src, src_byteOffset)); +// +// ... value processing ... +// +// STORE_MAT_AT(dst, dst_byteOffset, convertWorkTypeToDstType(value)); +// } +// } +// + +#define DECLARE_MAT_ARG(name) \ + __global uchar* restrict name ## Ptr, \ + int name ## StepBytes, \ + int name ## Offset, \ + int name ## Height, \ + int name ## Width NL + +#define MAT_BYTE_OFFSET(name, x, y) mad24((y)/* + name ## OffsetY*/, name ## StepBytes, ((x)/* + name ## OffsetX*/) * (int)(name ## _TSIZE) + name ## Offset) +#define MAT_RELATIVE_BYTE_OFFSET(name, x, y) mad24(y, name ## StepBytes, (x) * (int)(name ## _TSIZE)) + +#define __LOAD_MAT_AT(name, byteOffset) *((const __global name ## _T*)(name ## Ptr + (byteOffset))) +#define __vload_CN__(name_cn) vload ## name_cn +#define __vload_CN_(name_cn) __vload_CN__(name_cn) +#define __vload_CN(name) __vload_CN_(name ## _CN) +#define __LOAD_MAT_AT_vload(name, byteOffset) __vload_CN(name)(0, ((const __global name ## _T1*)(name ## Ptr + (byteOffset)))) +#define __LOAD_MAT_AT_1 __LOAD_MAT_AT +#define __LOAD_MAT_AT_2 __LOAD_MAT_AT +#define __LOAD_MAT_AT_3 __LOAD_MAT_AT_vload +#define __LOAD_MAT_AT_4 __LOAD_MAT_AT +#define __LOAD_MAT_AT_CN__(name_cn) __LOAD_MAT_AT_ ## name_cn +#define __LOAD_MAT_AT_CN_(name_cn) __LOAD_MAT_AT_CN__(name_cn) +#define __LOAD_MAT_AT_CN(name) __LOAD_MAT_AT_CN_(name ## _CN) +#define LOAD_MAT_AT(name, byteOffset) __LOAD_MAT_AT_CN(name)(name, byteOffset) + +#define __STORE_MAT_AT(name, byteOffset, v) *((__global name ## _T*)(name ## Ptr + (byteOffset))) = v +#define __vstore_CN__(name_cn) vstore ## name_cn +#define __vstore_CN_(name_cn) __vstore_CN__(name_cn) +#define __vstore_CN(name) __vstore_CN_(name ## _CN) +#define __STORE_MAT_AT_vstore(name, byteOffset, v) __vstore_CN(name)(v, 0, ((__global name ## _T1*)(name ## Ptr + (byteOffset)))) +#define __STORE_MAT_AT_1 __STORE_MAT_AT +#define __STORE_MAT_AT_2 __STORE_MAT_AT +#define __STORE_MAT_AT_3 __STORE_MAT_AT_vstore +#define __STORE_MAT_AT_4 __STORE_MAT_AT +#define __STORE_MAT_AT_CN__(name_cn) __STORE_MAT_AT_ ## name_cn +#define __STORE_MAT_AT_CN_(name_cn) __STORE_MAT_AT_CN__(name_cn) +#define __STORE_MAT_AT_CN(name) __STORE_MAT_AT_CN_(name ## _CN) +#define STORE_MAT_AT(name, byteOffset, v) __STORE_MAT_AT_CN(name)(name, byteOffset, v) + +#define T1_uchar uchar +#define T1_uchar2 uchar +#define T1_uchar3 uchar +#define T1_uchar4 uchar +#define T1_char char +#define T1_char2 char +#define T1_char3 char +#define T1_char4 char +#define T1_ushort ushort +#define T1_ushort2 ushort +#define T1_ushort3 ushort +#define T1_ushort4 ushort +#define T1_short short +#define T1_short2 short +#define T1_short3 short +#define T1_short4 short +#define T1_int int +#define T1_int2 int +#define T1_int3 int +#define T1_int4 int +#define T1_float float +#define T1_float2 float +#define T1_float3 float +#define T1_float4 float +#define T1_double double +#define T1_double2 double +#define T1_double3 double +#define T1_double4 double +#define T1(type) REF(CAT(T1_, REF(type))) + +#define uchar1 uchar +#define char1 char +#define short1 short +#define ushort1 ushort +#define int1 int +#define float1 float +#define double1 double +#define TYPE(type, cn) REF(CAT(REF(type), REF(cn))) + +#define __CONVERT_MODE_uchar_uchar __NO_CONVERT +#define __CONVERT_MODE_uchar_char __CONVERT_sat +#define __CONVERT_MODE_uchar_ushort __CONVERT +#define __CONVERT_MODE_uchar_short __CONVERT +#define __CONVERT_MODE_uchar_int __CONVERT +#define __CONVERT_MODE_uchar_float __CONVERT +#define __CONVERT_MODE_uchar_double __CONVERT +#define __CONVERT_MODE_char_uchar __CONVERT_sat +#define __CONVERT_MODE_char_char __NO_CONVERT +#define __CONVERT_MODE_char_ushort __CONVERT_sat +#define __CONVERT_MODE_char_short __CONVERT +#define __CONVERT_MODE_char_int __CONVERT +#define __CONVERT_MODE_char_float __CONVERT +#define __CONVERT_MODE_char_double __CONVERT +#define __CONVERT_MODE_ushort_uchar __CONVERT_sat +#define __CONVERT_MODE_ushort_char __CONVERT_sat +#define __CONVERT_MODE_ushort_ushort __NO_CONVERT +#define __CONVERT_MODE_ushort_short __CONVERT_sat +#define __CONVERT_MODE_ushort_int __CONVERT +#define __CONVERT_MODE_ushort_float __CONVERT +#define __CONVERT_MODE_ushort_double __CONVERT +#define __CONVERT_MODE_short_uchar __CONVERT_sat +#define __CONVERT_MODE_short_char __CONVERT_sat +#define __CONVERT_MODE_short_ushort __CONVERT_sat +#define __CONVERT_MODE_short_short __NO_CONVERT +#define __CONVERT_MODE_short_int __CONVERT +#define __CONVERT_MODE_short_float __CONVERT +#define __CONVERT_MODE_short_double __CONVERT +#define __CONVERT_MODE_int_uchar __CONVERT_sat +#define __CONVERT_MODE_int_char __CONVERT_sat +#define __CONVERT_MODE_int_ushort __CONVERT_sat +#define __CONVERT_MODE_int_short __CONVERT_sat +#define __CONVERT_MODE_int_int __NO_CONVERT +#define __CONVERT_MODE_int_float __CONVERT +#define __CONVERT_MODE_int_double __CONVERT +#define __CONVERT_MODE_float_uchar __CONVERT_sat_rte +#define __CONVERT_MODE_float_char __CONVERT_sat_rte +#define __CONVERT_MODE_float_ushort __CONVERT_sat_rte +#define __CONVERT_MODE_float_short __CONVERT_sat_rte +#define __CONVERT_MODE_float_int __CONVERT_rte +#define __CONVERT_MODE_float_float __NO_CONVERT +#define __CONVERT_MODE_float_double __CONVERT +#define __CONVERT_MODE_double_uchar __CONVERT_sat_rte +#define __CONVERT_MODE_double_char __CONVERT_sat_rte +#define __CONVERT_MODE_double_ushort __CONVERT_sat_rte +#define __CONVERT_MODE_double_short __CONVERT_sat_rte +#define __CONVERT_MODE_double_int __CONVERT_rte +#define __CONVERT_MODE_double_float __CONVERT +#define __CONVERT_MODE_double_double __NO_CONVERT +#define __CONVERT_MODE(srcType, dstType) CAT(__CONVERT_MODE_, CAT(REF(T1(srcType)), CAT(_, REF(T1(dstType))))) + +#define __ROUND_MODE__NO_CONVERT +#define __ROUND_MODE__CONVERT // nothing +#define __ROUND_MODE__CONVERT_rte _rte +#define __ROUND_MODE__CONVERT_sat _sat +#define __ROUND_MODE__CONVERT_sat_rte _sat_rte +#define ROUND_MODE(srcType, dstType) CAT(__ROUND_MODE_, __CONVERT_MODE(srcType, dstType)) + +#define __CONVERT_ROUND(dstType, roundMode) CAT(CAT(convert_, REF(dstType)), roundMode) +#define __NO_CONVERT(dstType) // nothing +#define __CONVERT(dstType) __CONVERT_ROUND(dstType,) +#define __CONVERT_rte(dstType) __CONVERT_ROUND(dstType,_rte) +#define __CONVERT_sat(dstType) __CONVERT_ROUND(dstType,_sat) +#define __CONVERT_sat_rte(dstType) __CONVERT_ROUND(dstType,_sat_rte) +#define CONVERT(srcType, dstType) REF(__CONVERT_MODE(srcType,dstType))(dstType) +#define CONVERT_TO(dstType) __CONVERT_ROUND(dstType,) + +// OpenCV depths +#define CV_8U 0 +#define CV_8S 1 +#define CV_16U 2 +#define CV_16S 3 +#define CV_32S 4 +#define CV_32F 5 +#define CV_64F 6 + +// +// End of common preprocessors macro +// + + + +#if defined(DEFINE_feed) + +#define workType TYPE(weight_T1, src_CN) +#define convertSrcToWorkType CONVERT_TO(workType) +#define convertWorkTypeToDstType CONVERT(workType, dst_T) + +__kernel void feed( + DECLARE_MAT_ARG(src), DECLARE_MAT_ARG(weight), + DECLARE_MAT_ARG(dst), DECLARE_MAT_ARG(dstWeight) +) +{ + const int x = get_global_id(0); + const int y = get_global_id(1); + + if (x < srcWidth && y < srcHeight) + { + int src_byteOffset = MAT_BYTE_OFFSET(src, x, y); + int weight_byteOffset = MAT_BYTE_OFFSET(weight, x, y); + int dst_byteOffset = MAT_BYTE_OFFSET(dst, x, y); + int dstWeight_byteOffset = MAT_BYTE_OFFSET(dstWeight, x, y); + + weight_T w = LOAD_MAT_AT(weight, weight_byteOffset); + workType src_value = convertSrcToWorkType(LOAD_MAT_AT(src, src_byteOffset)); + STORE_MAT_AT(dst, dst_byteOffset, LOAD_MAT_AT(dst, dst_byteOffset) + convertWorkTypeToDstType(src_value * w)); + STORE_MAT_AT(dstWeight, dstWeight_byteOffset, LOAD_MAT_AT(dstWeight, dstWeight_byteOffset) + w); + } +} + +#endif + +#if defined(DEFINE_normalizeUsingWeightMap) + +#define workType TYPE(weight_T1, mat_CN) +#define convertSrcToWorkType CONVERT_TO(workType) +#define convertWorkTypeToDstType CONVERT(workType, mat_T) + +#if weight_DEPTH >= CV_32F +#define WEIGHT_EPS 1e-5f +#else +#define WEIGHT_EPS 0 +#endif + +__kernel void normalizeUsingWeightMap( + DECLARE_MAT_ARG(mat), DECLARE_MAT_ARG(weight) +) +{ + const int x = get_global_id(0); + const int y = get_global_id(1); + + if (x < matWidth && y < matHeight) + { + int mat_byteOffset = MAT_BYTE_OFFSET(mat, x, y); + int weight_byteOffset = MAT_BYTE_OFFSET(weight, x, y); + + weight_T w = LOAD_MAT_AT(weight, weight_byteOffset); + workType value = convertSrcToWorkType(LOAD_MAT_AT(mat, mat_byteOffset)); + value = value / (w + WEIGHT_EPS); + STORE_MAT_AT(mat, mat_byteOffset, convertWorkTypeToDstType(value)); + } +} + +#endif