From 0ad03162dff9dc500fb1d80cdf33e7c9e00cc961 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Tue, 24 Sep 2013 13:34:55 +0400 Subject: [PATCH] refactored and extended arithm operations add/sub/mul/div/absdiff --- modules/ocl/include/opencv2/ocl/ocl.hpp | 19 +- modules/ocl/src/arithm.cpp | 427 +++--------- modules/ocl/src/opencl/arithm_add.cl | 806 ++--------------------- modules/ocl/src/opencl/arithm_add_mask.cl | 79 +++ modules/ocl/src/opencl/arithm_add_scalar.cl | 458 +------------ modules/ocl/src/opencl/arithm_add_scalar_mask.cl | 563 +--------------- modules/ocl/src/opencl/arithm_div.cl | 468 ------------- modules/ocl/src/opencl/arithm_mul.cl | 303 --------- 8 files changed, 287 insertions(+), 2836 deletions(-) create mode 100644 modules/ocl/src/opencl/arithm_add_mask.cl delete mode 100644 modules/ocl/src/opencl/arithm_div.cl delete mode 100644 modules/ocl/src/opencl/arithm_mul.cl diff --git a/modules/ocl/include/opencv2/ocl/ocl.hpp b/modules/ocl/include/opencv2/ocl/ocl.hpp index 361e292..2bfc7db 100644 --- a/modules/ocl/include/opencv2/ocl/ocl.hpp +++ b/modules/ocl/include/opencv2/ocl/ocl.hpp @@ -409,40 +409,37 @@ namespace cv CV_EXPORTS void split(const oclMat &src, vector &dst); ////////////////////////////// Arithmetics /////////////////////////////////// + //#if defined DOUBLE_SUPPORT //typedef double F; //#else //typedef float F; //#endif + // CV_EXPORTS void addWeighted(const oclMat& a,F alpha, const oclMat& b,F beta,F gama, oclMat& c); CV_EXPORTS void addWeighted(const oclMat &a, double alpha, const oclMat &b, double beta, double gama, oclMat &c); + //! adds one matrix to another (c = a + b) // supports all types except CV_8SC1,CV_8SC2,CV8SC3 and CV_8SC4 - CV_EXPORTS void add(const oclMat &a, const oclMat &b, oclMat &c); - //! adds one matrix to another (c = a + b) - // supports all types except CV_8SC1,CV_8SC2,CV8SC3 and CV_8SC4 - CV_EXPORTS void add(const oclMat &a, const oclMat &b, oclMat &c, const oclMat &mask); + CV_EXPORTS void add(const oclMat &a, const oclMat &b, oclMat &c, const oclMat &mask = oclMat()); //! adds scalar to a matrix (c = a + s) // supports all types except CV_8SC1,CV_8SC2,CV8SC3 and CV_8SC4 CV_EXPORTS void add(const oclMat &a, const Scalar &sc, oclMat &c, const oclMat &mask = oclMat()); + //! subtracts one matrix from another (c = a - b) // supports all types except CV_8SC1,CV_8SC2,CV8SC3 and CV_8SC4 - CV_EXPORTS void subtract(const oclMat &a, const oclMat &b, oclMat &c); - //! subtracts one matrix from another (c = a - b) - // supports all types except CV_8SC1,CV_8SC2,CV8SC3 and CV_8SC4 - CV_EXPORTS void subtract(const oclMat &a, const oclMat &b, oclMat &c, const oclMat &mask); + CV_EXPORTS void subtract(const oclMat &a, const oclMat &b, oclMat &c, const oclMat &mask = oclMat()); //! subtracts scalar from a matrix (c = a - s) // supports all types except CV_8SC1,CV_8SC2,CV8SC3 and CV_8SC4 CV_EXPORTS void subtract(const oclMat &a, const Scalar &sc, oclMat &c, const oclMat &mask = oclMat()); - //! subtracts scalar from a matrix (c = a - s) - // supports all types except CV_8SC1,CV_8SC2,CV8SC3 and CV_8SC4 - CV_EXPORTS void subtract(const Scalar &sc, const oclMat &a, oclMat &c, const oclMat &mask = oclMat()); + //! computes element-wise product of the two arrays (c = a * b) // supports all types except CV_8SC1,CV_8SC2,CV8SC3 and CV_8SC4 CV_EXPORTS void multiply(const oclMat &a, const oclMat &b, oclMat &c, double scale = 1); //! 
multiplies matrix to a number (dst = scalar * src) // supports CV_32FC1 only CV_EXPORTS void multiply(double scalar, const oclMat &src, oclMat &dst); + //! computes element-wise quotient of the two arrays (c = a / b) // supports all types except CV_8SC1,CV_8SC2,CV8SC3 and CV_8SC4 CV_EXPORTS void divide(const oclMat &a, const oclMat &b, oclMat &c, double scale = 1); diff --git a/modules/ocl/src/arithm.cpp b/modules/ocl/src/arithm.cpp index 0cc803d..03c314c 100644 --- a/modules/ocl/src/arithm.cpp +++ b/modules/ocl/src/arithm.cpp @@ -62,11 +62,11 @@ namespace cv { namespace ocl { - ////////////////////////////////OpenCL kernel strings///////////////////// + //////////////////////////////// OpenCL kernel strings ///////////////////// + extern const char *transpose_kernel; extern const char *arithm_nonzero; extern const char *arithm_sum; - extern const char *arithm_2_mat; extern const char *arithm_sum_3; extern const char *arithm_minMax; extern const char *arithm_minMax_mask; @@ -74,6 +74,7 @@ namespace cv extern const char *arithm_minMaxLoc_mask; extern const char *arithm_LUT; extern const char *arithm_add; + extern const char *arithm_add_mask; extern const char *arithm_add_scalar; extern const char *arithm_add_scalar_mask; extern const char *arithm_bitwise_binary; @@ -83,9 +84,7 @@ namespace cv extern const char *arithm_bitwise_not; extern const char *arithm_compare_eq; extern const char *arithm_compare_ne; - extern const char *arithm_mul; - extern const char *arithm_div; - extern const char *arithm_absdiff; + extern const char *arithm_magnitudeSqr; extern const char *arithm_transpose; extern const char *arithm_flip; extern const char *arithm_flip_rc; @@ -97,390 +96,176 @@ namespace cv extern const char *arithm_addWeighted; extern const char *arithm_phase; extern const char *arithm_pow; - extern const char *arithm_magnitudeSqr; extern const char *arithm_setidentity; - //extern const char * jhp_transpose_kernel; - int64 kernelrealtotal = 0; - int64 kernelalltotal = 0; - int64 reducetotal = 0; - int64 downloadtotal = 0; - int64 alltotal = 0; } } - ////////////////////////////////////////////////////////////////////////////// /////////////////////// add subtract multiply divide ///////////////////////// ////////////////////////////////////////////////////////////////////////////// -template -void arithmetic_run(const oclMat &src1, const oclMat &src2, oclMat &dst, - string kernelName, const char **kernelString, void *_scalar, int op_type = 0) -{ - if(!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F) - { - CV_Error(CV_GpuNotSupported, "Selected device don't support double\r\n"); - return; - } - - dst.create(src1.size(), src1.type()); - CV_Assert(src1.cols == src2.cols && src2.cols == dst.cols && - src1.rows == src2.rows && src2.rows == dst.rows); - CV_Assert(src1.type() == src2.type() && src1.type() == dst.type()); - CV_Assert(src1.depth() != CV_8S); - - Context *clCxt = src1.clCxt; - int channels = dst.oclchannels(); - int depth = dst.depth(); - - int vector_lengths[4][7] = {{4, 0, 4, 4, 1, 1, 1}, - {4, 0, 4, 4, 1, 1, 1}, - {4, 0, 4, 4, 1, 1, 1}, - {4, 0, 4, 4, 1, 1, 1} - }; - - size_t vector_length = vector_lengths[channels - 1][depth]; - int offset_cols = (dst.offset / dst.elemSize1()) & (vector_length - 1); - int cols = divUp(dst.cols * channels + offset_cols, vector_length); +////////////////////////////////////////////////////////////////////////////// +/////////////////////// add subtract multiply divide ///////////////////////// 
+////////////////////////////////////////////////////////////////////////////// - size_t localThreads[3] = { 64, 4, 1 }; - size_t globalThreads[3] = { cols, dst.rows, 1 }; +enum { ADD = 0, SUB, MUL, DIV, ABS_DIFF }; - int dst_step1 = dst.cols * dst.elemSize(); - vector > args; - args.push_back( make_pair( sizeof(cl_mem), (void *)&src1.data )); - args.push_back( make_pair( sizeof(cl_int), (void *)&src1.step )); - args.push_back( make_pair( sizeof(cl_int), (void *)&src1.offset )); - args.push_back( make_pair( sizeof(cl_mem), (void *)&src2.data )); - args.push_back( make_pair( sizeof(cl_int), (void *)&src2.step )); - args.push_back( make_pair( sizeof(cl_int), (void *)&src2.offset )); - args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data )); - args.push_back( make_pair( sizeof(cl_int), (void *)&dst.step )); - args.push_back( make_pair( sizeof(cl_int), (void *)&dst.offset )); - args.push_back( make_pair( sizeof(cl_int), (void *)&src1.rows )); - args.push_back( make_pair( sizeof(cl_int), (void *)&cols )); - args.push_back( make_pair( sizeof(cl_int), (void *)&dst_step1 )); - T scalar; - if(_scalar != NULL) - { - double scalar1 = *((double *)_scalar); - scalar = (T)scalar1; - args.push_back( make_pair( sizeof(T), (void *)&scalar )); - } - switch(op_type) - { - case MAT_ADD: - openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, -1, depth, "-D ARITHM_ADD"); - break; - case MAT_SUB: - openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, -1, depth, "-D ARITHM_SUB"); - break; - default: - openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, -1, depth); - } -} -static void arithmetic_run(const oclMat &src1, const oclMat &src2, oclMat &dst, - string kernelName, const char **kernelString, int op_type = 0) +static void arithmetic_run_generic(const oclMat &src1, const oclMat &src2, const Scalar & scalar, const oclMat & mask, + oclMat &dst, int op_type, bool use_scalar = false) { - arithmetic_run(src1, src2, dst, kernelName, kernelString, (void *)NULL, op_type); -} -static void arithmetic_run(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask, - string kernelName, const char **kernelString, int op_type = 0) -{ - if(!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F) + Context *clCxt = src1.clCxt; + bool hasDouble = clCxt->supportsFeature(Context::CL_DOUBLE); + if (!hasDouble && (src1.depth() == CV_64F || src2.depth() == CV_64F || dst.depth() == CV_64F)) { - CV_Error(CV_GpuNotSupported, "Selected device don't support double\r\n"); + CV_Error(CV_GpuNotSupported, "Selected device doesn't support double\r\n"); return; } - dst.create(src1.size(), src1.type()); - CV_Assert(src1.cols == src2.cols && src2.cols == dst.cols && - src1.rows == src2.rows && src2.rows == dst.rows && - src1.rows == mask.rows && src1.cols == mask.cols); - - CV_Assert(src1.type() == src2.type() && src1.type() == dst.type()); - CV_Assert(src1.depth() != CV_8S); - CV_Assert(mask.type() == CV_8U); + CV_Assert(src2.empty() || (!src2.empty() && src1.type() == src2.type() && src1.size() == src2.size())); + CV_Assert(mask.empty() || (!mask.empty() && mask.type() == CV_8UC1 && mask.size() == src1.size())); + CV_Assert(op_type >= ADD && op_type <= ABS_DIFF); - Context *clCxt = src1.clCxt; - int channels = dst.oclchannels(); - int depth = dst.depth(); + dst.create(src1.size(), src1.type()); - int vector_lengths[4][7] = {{4, 4, 2, 2, 1, 1, 1}, - {2, 2, 1, 1, 1, 1, 1}, - {4, 4, 2, 2 , 1, 1, 1}, - 
{1, 1, 1, 1, 1, 1, 1} - }; + int oclChannels = src1.oclchannels(), depth = src1.depth(); + int src1step1 = src1.step / src1.elemSize(), src1offset1 = src1.offset / src1.elemSize(); + int src2step1 = src2.step / src2.elemSize(), src2offset1 = src2.offset / src2.elemSize(); + int maskstep1 = mask.step, maskoffset1 = mask.offset / mask.elemSize(); + int dststep1 = dst.step / dst.elemSize(), dstoffset1 = dst.offset / dst.elemSize(); + oclMat m; - size_t vector_length = vector_lengths[channels - 1][depth]; - int offset_cols = ((dst.offset % dst.step) / dst.elemSize()) & (vector_length - 1); - int cols = divUp(dst.cols + offset_cols, vector_length); + size_t localThreads[3] = { 16, 16, 1 }; + size_t globalThreads[3] = { dst.cols, dst.rows, 1 }; - size_t localThreads[3] = { 64, 4, 1 }; - size_t globalThreads[3] = { cols, dst.rows, 1 }; + std::string kernelName = op_type == ABS_DIFF ? "arithm_absdiff" : "arithm_binary_op"; + + const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" }; + const char * const WTypeMap[] = { "short", "short", "int", "int", "int", "float", "double" }; + const char operationsMap[] = { '+', '-', '*', '/', '-' }; + const char * const channelMap[] = { "", "", "2", "4", "4" }; + bool haveScalar = use_scalar || src2.empty(); + + int WDepth = depth; + if (haveScalar) + WDepth = hasDouble && WDepth == CV_64F ? CV_64F : CV_32F; + if (op_type == DIV) + WDepth = hasDouble ? CV_64F : CV_32F; + else if (op_type == MUL) + WDepth = hasDouble && (depth == CV_32S || depth == CV_64F) ? CV_64F : CV_32F; + + std::string buildOptions = format("-D T=%s%s -D WT=%s%s -D convertToT=convert_%s%s%s -D Operation=%c" + " -D convertToWT=convert_%s%s", + typeMap[depth], channelMap[oclChannels], + WTypeMap[WDepth], channelMap[oclChannels], + typeMap[depth], channelMap[oclChannels], (depth >= CV_32F ? "" : (depth == CV_32S ? 
"_rte" : "_sat_rte")), + operationsMap[op_type], WTypeMap[WDepth], channelMap[oclChannels]); - int dst_step1 = dst.cols * dst.elemSize(); vector > args; args.push_back( make_pair( sizeof(cl_mem), (void *)&src1.data )); - args.push_back( make_pair( sizeof(cl_int), (void *)&src1.step )); - args.push_back( make_pair( sizeof(cl_int), (void *)&src1.offset )); - args.push_back( make_pair( sizeof(cl_mem), (void *)&src2.data )); - args.push_back( make_pair( sizeof(cl_int), (void *)&src2.step )); - args.push_back( make_pair( sizeof(cl_int), (void *)&src2.offset )); - args.push_back( make_pair( sizeof(cl_mem), (void *)&mask.data )); - args.push_back( make_pair( sizeof(cl_int), (void *)&mask.step )); - args.push_back( make_pair( sizeof(cl_int), (void *)&mask.offset )); - args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data )); - args.push_back( make_pair( sizeof(cl_int), (void *)&dst.step )); - args.push_back( make_pair( sizeof(cl_int), (void *)&dst.offset )); - args.push_back( make_pair( sizeof(cl_int), (void *)&src1.rows )); - args.push_back( make_pair( sizeof(cl_int), (void *)&cols )); - args.push_back( make_pair( sizeof(cl_int), (void *)&dst_step1 )); + args.push_back( make_pair( sizeof(cl_int), (void *)&src1step1 )); + args.push_back( make_pair( sizeof(cl_int), (void *)&src1offset1 )); - switch (op_type) + if (!src2.empty()) { - case MAT_ADD: - openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, channels, depth, "-D ARITHM_ADD"); - break; - case MAT_SUB: - openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, channels, depth, "-D ARITHM_SUB"); - break; - default: - openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, channels, depth); - } -} -void cv::ocl::add(const oclMat &src1, const oclMat &src2, oclMat &dst) -{ - arithmetic_run(src1, src2, dst, "arithm_add", &arithm_add, MAT_ADD); -} -void cv::ocl::add(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask) -{ - arithmetic_run(src1, src2, dst, mask, "arithm_add_with_mask", &arithm_add, MAT_ADD); -} - -void cv::ocl::subtract(const oclMat &src1, const oclMat &src2, oclMat &dst) -{ - arithmetic_run(src1, src2, dst, "arithm_add", &arithm_add, MAT_SUB); -} -void cv::ocl::subtract(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask) -{ - arithmetic_run(src1, src2, dst, mask, "arithm_add_with_mask", &arithm_add, MAT_SUB); -} -typedef void (*MulDivFunc)(const oclMat &src1, const oclMat &src2, oclMat &dst, string kernelName, - const char **kernelString, void *scalar); - -void cv::ocl::multiply(const oclMat &src1, const oclMat &src2, oclMat &dst, double scalar) -{ - if(src1.clCxt->supportsFeature(Context::CL_DOUBLE) && (src1.depth() == CV_64F)) - arithmetic_run(src1, src2, dst, "arithm_mul", &arithm_mul, (void *)(&scalar)); - else - arithmetic_run(src1, src2, dst, "arithm_mul", &arithm_mul, (void *)(&scalar)); -} - -void cv::ocl::divide(const oclMat &src1, const oclMat &src2, oclMat &dst, double scalar) -{ - - if(src1.clCxt->supportsFeature(Context::CL_DOUBLE)) - arithmetic_run(src1, src2, dst, "arithm_div", &arithm_div, (void *)(&scalar)); - else - arithmetic_run(src1, src2, dst, "arithm_div", &arithm_div, (void *)(&scalar)); + args.push_back( make_pair( sizeof(cl_mem), (void *)&src2.data )); + args.push_back( make_pair( sizeof(cl_int), (void *)&src2step1 )); + args.push_back( make_pair( sizeof(cl_int), (void *)&src2offset1 )); -} -template -void arithmetic_scalar_run(const oclMat &src1, const Scalar 
&src2, oclMat &dst, const oclMat &mask, string kernelName, const char **kernelString, int isMatSubScalar) -{ - if(!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F) - { - CV_Error(CV_GpuNotSupported, "Selected device don't support double\r\n"); - return; + kernelName += "_mat"; } - dst.create(src1.size(), src1.type()); - - CV_Assert(src1.cols == dst.cols && src1.rows == dst.rows && - src1.type() == dst.type()); - - //CV_Assert(src1.depth() != CV_8S); - - if(mask.data) + if (haveScalar) { - CV_Assert(mask.type() == CV_8U && src1.rows == mask.rows && src1.cols == mask.cols); - } - - Context *clCxt = src1.clCxt; - int channels = dst.oclchannels(); - int depth = dst.depth(); - - WT s[4] = { saturate_cast(src2.val[0]), saturate_cast(src2.val[1]), - saturate_cast(src2.val[2]), saturate_cast(src2.val[3]) - }; + const int WDepthMap[] = { CV_16S, CV_16S, CV_32S, CV_32S, CV_32S, CV_32F, CV_64F }; + m.create(1, 1, CV_MAKE_TYPE(WDepthMap[WDepth], oclChannels)); + m.setTo(scalar); - int vector_lengths[4][7] = {{4, 0, 2, 2, 1, 1, 1}, - {2, 0, 1, 1, 1, 1, 1}, - {4, 0, 2, 2 , 1, 1, 1}, - {1, 0, 1, 1, 1, 1, 1} - }; - - size_t vector_length = vector_lengths[channels - 1][depth]; - int offset_cols = ((dst.offset % dst.step) / dst.elemSize()) & (vector_length - 1); - int cols = divUp(dst.cols + offset_cols, vector_length); - - size_t localThreads[3] = { 64, 4, 1 }; - size_t globalThreads[3] = { cols, dst.rows, 1 }; + args.push_back( make_pair( sizeof(cl_mem), (void *)&m.data )); - int dst_step1 = dst.cols * dst.elemSize(); - vector > args; - args.push_back( make_pair( sizeof(cl_mem) , (void *)&src1.data )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&src1.step )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&src1.offset)); - args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst.data )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.step )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.offset)); - - if(mask.data) - { - args.push_back( make_pair( sizeof(cl_mem) , (void *)&mask.data )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&mask.step )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&mask.offset)); + kernelName += "_scalar"; } - args.push_back( make_pair( sizeof(CL_WT) , (void *)&s )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&src1.rows )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&cols )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_step1 )); - if(isMatSubScalar != 0) - openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, channels, depth, "-D ARITHM_SUB"); - else - openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, channels, depth, "-D ARITHM_ADD"); -} -static void arithmetic_scalar_run(const oclMat &src, oclMat &dst, string kernelName, const char **kernelString, double scalar) -{ - if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.type() == CV_64F) + if (!mask.empty()) { - CV_Error(CV_GpuNotSupported, "Selected device don't support double\r\n"); - return; - } - - dst.create(src.size(), src.type()); - CV_Assert(src.cols == dst.cols && src.rows == dst.rows); - - CV_Assert(src.type() == dst.type()); - CV_Assert(src.depth() != CV_8S); - - Context *clCxt = src.clCxt; - int channels = dst.oclchannels(); - int depth = dst.depth(); - - int vector_lengths[4][7] = {{4, 0, 4, 4, 1, 1, 1}, - {4, 0, 4, 4, 1, 1, 1}, - {4, 0, 4, 4 , 1, 1, 1}, - {4, 0, 4, 4, 1, 1, 1} - }; + args.push_back( make_pair( 
sizeof(cl_mem), (void *)&mask.data )); + args.push_back( make_pair( sizeof(cl_int), (void *)&maskstep1 )); + args.push_back( make_pair( sizeof(cl_int), (void *)&maskoffset1 )); - size_t vector_length = vector_lengths[channels - 1][depth]; - int offset_cols = (dst.offset / dst.elemSize1()) & (vector_length - 1); - int cols = divUp(dst.cols * channels + offset_cols, vector_length); + kernelName += "_mask"; + } - size_t localThreads[3] = { 64, 4, 1 }; - size_t globalThreads[3] = { cols, dst.rows, 1 }; + if (op_type == DIV) + kernelName += "_div"; - int dst_step1 = dst.cols * dst.elemSize(); - vector > args; - args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data )); - args.push_back( make_pair( sizeof(cl_int), (void *)&src.step )); - args.push_back( make_pair( sizeof(cl_int), (void *)&src.offset )); args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data )); - args.push_back( make_pair( sizeof(cl_int), (void *)&dst.step )); - args.push_back( make_pair( sizeof(cl_int), (void *)&dst.offset )); - args.push_back( make_pair( sizeof(cl_int), (void *)&src.rows )); - args.push_back( make_pair( sizeof(cl_int), (void *)&cols )); - args.push_back( make_pair( sizeof(cl_int), (void *)&dst_step1 )); + args.push_back( make_pair( sizeof(cl_int), (void *)&dststep1 )); + args.push_back( make_pair( sizeof(cl_int), (void *)&dstoffset1 )); - float f_scalar = (float)scalar; - if(src.clCxt->supportsFeature(Context::CL_DOUBLE)) - args.push_back( make_pair( sizeof(cl_double), (void *)&scalar )); - else - { - args.push_back( make_pair( sizeof(cl_float), (void *)&f_scalar)); - } + args.push_back( make_pair( sizeof(cl_int), (void *)&src1.cols )); + args.push_back( make_pair( sizeof(cl_int), (void *)&src1.rows )); - openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, -1, depth); + openCLExecuteKernel(clCxt, mask.empty() ? + (!src2.empty() ? &arithm_add : &arithm_add_scalar) : + (!src2.empty() ? &arithm_add_mask : &arithm_add_scalar_mask), + kernelName, globalThreads, localThreads, + args, -1, -1, buildOptions.c_str()); } -typedef void (*ArithmeticFuncS)(const oclMat &src1, const Scalar &src2, oclMat &dst, const oclMat &mask, string kernelName, const char **kernelString, int isMatSubScalar); - - -static void arithmetic_scalar(const oclMat &src1, const Scalar &src2, oclMat &dst, const oclMat &mask, string kernelName, const char **kernelString, int isMatSubScalar) -{ - static ArithmeticFuncS tab[8] = - { - arithmetic_scalar_run, - arithmetic_scalar_run, - arithmetic_scalar_run, - arithmetic_scalar_run, - arithmetic_scalar_run, - arithmetic_scalar_run, - arithmetic_scalar_run, - 0 - }; - ArithmeticFuncS func = tab[src1.depth()]; - if(func == 0) - cv::ocl::error("Unsupported arithmetic operation", __FILE__, __LINE__); - func(src1, src2, dst, mask, kernelName, kernelString, isMatSubScalar); -} -static void arithmetic_scalar(const oclMat &src1, const Scalar &src2, oclMat &dst, const oclMat &mask, string kernelName, const char **kernelString) +void cv::ocl::add(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask) { - arithmetic_scalar(src1, src2, dst, mask, kernelName, kernelString, 0); + arithmetic_run_generic(src1, src2, Scalar(), mask, dst, ADD); } void cv::ocl::add(const oclMat &src1, const Scalar &src2, oclMat &dst, const oclMat &mask) { - string kernelName = mask.data ? "arithm_s_add_with_mask" : "arithm_s_add"; - const char **kernelString = mask.data ? 
&arithm_add_scalar_mask : &arithm_add_scalar; + arithmetic_run_generic(src1, oclMat(), src2, mask, dst, ADD); +} - arithmetic_scalar( src1, src2, dst, mask, kernelName, kernelString); +void cv::ocl::subtract(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask) +{ + arithmetic_run_generic(src1, src2, Scalar(), mask, dst, SUB); } void cv::ocl::subtract(const oclMat &src1, const Scalar &src2, oclMat &dst, const oclMat &mask) { - string kernelName = mask.data ? "arithm_s_add_with_mask" : "arithm_s_add"; - const char **kernelString = mask.data ? &arithm_add_scalar_mask : &arithm_add_scalar; - arithmetic_scalar( src1, src2, dst, mask, kernelName, kernelString, 1); + arithmetic_run_generic(src1, oclMat(), src2, mask, dst, SUB); } -void cv::ocl::subtract(const Scalar &src2, const oclMat &src1, oclMat &dst, const oclMat &mask) + +void cv::ocl::multiply(const oclMat &src1, const oclMat &src2, oclMat &dst, double scalar) { - string kernelName = mask.data ? "arithm_s_add_with_mask" : "arithm_s_add"; - const char **kernelString = mask.data ? &arithm_add_scalar_mask : &arithm_add_scalar; - arithmetic_scalar( src1, src2, dst, mask, kernelName, kernelString, -1); + const bool use_scalar = !(std::abs(scalar - 1.0) < std::numeric_limits::epsilon()); + arithmetic_run_generic(src1, src2, Scalar::all(scalar), oclMat(), dst, MUL, use_scalar); } + void cv::ocl::multiply(double scalar, const oclMat &src, oclMat &dst) { - string kernelName = "arithm_muls"; - arithmetic_scalar_run( src, dst, kernelName, &arithm_mul, scalar); + arithmetic_run_generic(src, oclMat(), Scalar::all(scalar), oclMat(), dst, MUL); } -void cv::ocl::divide(double scalar, const oclMat &src, oclMat &dst) + +void cv::ocl::divide(const oclMat &src1, const oclMat &src2, oclMat &dst, double scalar) { - if(!src.clCxt->supportsFeature(Context::CL_DOUBLE)) - { - CV_Error(CV_GpuNotSupported, "Selected device don't support double\r\n"); - return; - } + const bool use_scalar = !(std::abs(scalar - 1.0) < std::numeric_limits::epsilon()); + arithmetic_run_generic(src1, src2, Scalar::all(scalar), oclMat(), dst, DIV, use_scalar); +} - string kernelName = "arithm_s_div"; - arithmetic_scalar_run(src, dst, kernelName, &arithm_div, scalar); +void cv::ocl::divide(double scalar, const oclMat &src, oclMat &dst) +{ + arithmetic_run_generic(src, oclMat(), Scalar::all(scalar), oclMat(), dst, DIV); } + ////////////////////////////////////////////////////////////////////////////// ///////////////////////////////// Absdiff /////////////////////////////////// ////////////////////////////////////////////////////////////////////////////// + void cv::ocl::absdiff(const oclMat &src1, const oclMat &src2, oclMat &dst) { - arithmetic_run(src1, src2, dst, "arithm_absdiff", &arithm_absdiff); + arithmetic_run_generic(src1, src2, Scalar(), oclMat(), dst, ABS_DIFF); } + void cv::ocl::absdiff(const oclMat &src1, const Scalar &src2, oclMat &dst) { - string kernelName = "arithm_s_absdiff"; - oclMat mask; - arithmetic_scalar( src1, src2, dst, mask, kernelName, &arithm_absdiff); + arithmetic_run_generic(src1, oclMat(), src2, oclMat(), dst, ABS_DIFF); } + ////////////////////////////////////////////////////////////////////////////// ///////////////////////////////// compare /////////////////////////////////// ////////////////////////////////////////////////////////////////////////////// diff --git a/modules/ocl/src/opencl/arithm_add.cl b/modules/ocl/src/opencl/arithm_add.cl index 070ced4..38834e7 100644 --- a/modules/ocl/src/opencl/arithm_add.cl +++ 
b/modules/ocl/src/opencl/arithm_add.cl @@ -52,809 +52,105 @@ #endif #endif -#ifdef ARITHM_ADD - #define ARITHM_OP(A,B) ((A)+(B)) -#elif defined ARITHM_SUB - #define ARITHM_OP(A,B) ((A)-(B)) -#endif ////////////////////////////////////////////////////////////////////////////////////////////////////// -/////////////////////////////////////////////ADD//////////////////////////////////////////////////// -/////////////////////////////////////////////////////////////////////////////////////////////////////// -/**************************************add without mask**************************************/ -__kernel void arithm_add_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *src2, int src2_step, int src2_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (dst_offset & 3) - int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); - int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 0 : src2_index; - uchar4 src1_data = vload4(0, src1 + src1_index_fix); - uchar4 src2_data = vload4(0, src2 + src2_index_fix); - if(src1_index < 0) - { - uchar4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - uchar4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; - } - uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); - short4 tmp = ARITHM_OP(convert_short4_sat(src1_data), convert_short4_sat(src2_data)); - uchar4 tmp_data = convert_uchar4_sat(tmp); - - dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; - dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y; - dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z; - dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? 
tmp_data.w : dst_data.w; - - *((__global uchar4 *)(dst + dst_index)) = dst_data; - } -} -__kernel void arithm_add_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *src2, int src2_step, int src2_offset, - __global ushort *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) - -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset / 2) & 3) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8); - - ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index)); - ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index)); - - ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index)); - int4 tmp = ARITHM_OP(convert_int4_sat(src1_data), convert_int4_sat(src2_data)); - ushort4 tmp_data = convert_ushort4_sat(tmp); - - dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; - dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y; - dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z; - dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w; - - *((__global ushort4 *)((__global char *)dst + dst_index)) = dst_data; - } -} -__kernel void arithm_add_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *src2, int src2_step, int src2_offset, - __global short *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset / 2) & 3) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8); - - short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index)); - short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index)); - - short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index)); - int4 tmp = ARITHM_OP(convert_int4_sat(src1_data), convert_int4_sat(src2_data)); - short4 tmp_data = convert_short4_sat(tmp); - - dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; - dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y; - dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z; - dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? 
tmp_data.w : dst_data.w; - - *((__global short4 *)((__global char *)dst + dst_index)) = dst_data; - } -} - -__kernel void arithm_add_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *src2, int src2_step, int src2_offset, - __global int *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - int data1 = *((__global int *)((__global char *)src1 + src1_index)); - int data2 = *((__global int *)((__global char *)src2 + src2_index)); - long tmp = ARITHM_OP((long)(data1), (long)(data2)); - - *((__global int *)((__global char *)dst + dst_index)) = convert_int_sat(tmp); - } -} -__kernel void arithm_add_D5 (__global float *src1, int src1_step, int src1_offset, - __global float *src2, int src2_step, int src2_offset, - __global float *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - float data1 = *((__global float *)((__global char *)src1 + src1_index)); - float data2 = *((__global float *)((__global char *)src2 + src2_index)); - float tmp = ARITHM_OP(data1, data2); - - *((__global float *)((__global char *)dst + dst_index)) = tmp; - } -} - -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_add_D6 (__global double *src1, int src1_step, int src1_offset, - __global double *src2, int src2_step, int src2_offset, - __global double *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 3) + src2_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - double data1 = *((__global double *)((__global char *)src1 + src1_index)); - double data2 = *((__global double *)((__global char *)src2 + src2_index)); - - *((__global double *)((__global char *)dst + dst_index)) = ARITHM_OP(data1, data2); - } -} -#endif - -/**************************************add with mask**************************************/ -__kernel void arithm_add_with_mask_C1_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (dst_offset & 3) - int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); - int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 
0 : src2_index; - int mask_index_fix = mask_index < 0 ? 0 : mask_index; - uchar4 src1_data = vload4(0, src1 + src1_index_fix); - uchar4 src2_data = vload4(0, src2 + src2_index_fix); - uchar4 mask_data = vload4(0, mask + mask_index_fix); - if(src1_index < 0) - { - uchar4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - uchar4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; - } - if(mask_index < 0) - { - uchar4 tmp; - tmp.xyzw = (mask_index == -2) ? mask_data.zwxy:mask_data.yzwx; - mask_data.xyzw = (mask_index == -1) ? mask_data.wxyz:tmp.xyzw; - } - - uchar4 data = *((__global uchar4 *)(dst + dst_index)); - short4 tmp = ARITHM_OP(convert_short4_sat(src1_data), convert_short4_sat(src2_data)); - uchar4 tmp_data = convert_uchar4_sat(tmp); - - data.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x; - data.y = ((mask_data.y) && (dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y; - data.z = ((mask_data.z) && (dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : data.z; - data.w = ((mask_data.w) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : data.w; - - *((__global uchar4 *)(dst + dst_index)) = data; - } -} -__kernel void arithm_add_with_mask_C1_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global ushort *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset / 2) & 1) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc); - - ushort2 src1_data = vload2(0, (__global ushort *)((__global char *)src1 + src1_index)); - ushort2 src2_data = vload2(0, (__global ushort *)((__global char *)src2 + src2_index)); - uchar2 mask_data = vload2(0, mask + mask_index); - - ushort2 data = *((__global ushort2 *)((__global uchar *)dst + dst_index)); - int2 tmp = ARITHM_OP(convert_int2_sat(src1_data), convert_int2_sat(src2_data)); - ushort2 tmp_data = convert_ushort2_sat(tmp); - - data.x = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.x : data.x; - data.y = ((mask_data.y) && (dst_index + 2 < dst_end )) ? 
tmp_data.y : data.y; - - *((__global ushort2 *)((__global uchar *)dst + dst_index)) = data; - } -} -__kernel void arithm_add_with_mask_C1_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global short *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset / 2) & 1) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc); - - short2 src1_data = vload2(0, (__global short *)((__global char *)src1 + src1_index)); - short2 src2_data = vload2(0, (__global short *)((__global char *)src2 + src2_index)); - uchar2 mask_data = vload2(0, mask + mask_index); - - short2 data = *((__global short2 *)((__global uchar *)dst + dst_index)); - int2 tmp = ARITHM_OP(convert_int2_sat(src1_data), convert_int2_sat(src2_data)); - short2 tmp_data = convert_short2_sat(tmp); - - data.x = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.x : data.x; - data.y = ((mask_data.y) && (dst_index + 2 < dst_end )) ? tmp_data.y : data.y; - - *((__global short2 *)((__global uchar *)dst + dst_index)) = data; - } -} -__kernel void arithm_add_with_mask_C1_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global int *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - int src_data1 = *((__global int *)((__global char *)src1 + src1_index)); - int src_data2 = *((__global int *)((__global char *)src2 + src2_index)); - int dst_data = *((__global int *)((__global char *)dst + dst_index)); - - int data = convert_int_sat(ARITHM_OP((long)src_data1, (long)src_data2)); - data = mask_data ? 
data : dst_data; - - *((__global int *)((__global char *)dst + dst_index)) = data; - } -} - -__kernel void arithm_add_with_mask_C1_D5 (__global float *src1, int src1_step, int src1_offset, - __global float *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global float *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - float src_data1 = *((__global float *)((__global char *)src1 + src1_index)); - float src_data2 = *((__global float *)((__global char *)src2 + src2_index)); - float dst_data = *((__global float *)((__global char *)dst + dst_index)); - - float data = ARITHM_OP(src_data1, src_data2); - data = mask_data ? data : dst_data; - - *((__global float *)((__global char *)dst + dst_index)) = data; - } -} - -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_add_with_mask_C1_D6 (__global double *src1, int src1_step, int src1_offset, - __global double *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global double *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 3) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - double src_data1 = *((__global double *)((__global char *)src1 + src1_index)); - double src_data2 = *((__global double *)((__global char *)src2 + src2_index)); - double dst_data = *((__global double *)((__global char *)dst + dst_index)); - - double data = ARITHM_OP(src_data1, src_data2); - data = mask_data ? 
data : dst_data; - - *((__global double *)((__global char *)dst + dst_index)) = data; - } -} -#endif - -__kernel void arithm_add_with_mask_C2_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset / 2) & 1) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc); - - uchar4 src1_data = vload4(0, src1 + src1_index); - uchar4 src2_data = vload4(0, src2 + src2_index); - uchar2 mask_data = vload2(0, mask + mask_index); - - uchar4 data = *((__global uchar4 *)(dst + dst_index)); - short4 tmp = ARITHM_OP(convert_short4_sat(src1_data), convert_short4_sat(src2_data)); - uchar4 tmp_data = convert_uchar4_sat(tmp); - - data.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.xy : data.xy; - data.zw = ((mask_data.y) && (dst_index + 2 < dst_end )) ? tmp_data.zw : data.zw; - - *((__global uchar4 *)(dst + dst_index)) = data; - } -} -__kernel void arithm_add_with_mask_C2_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global ushort *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - ushort2 src_data1 = *((__global ushort2 *)((__global char *)src1 + src1_index)); - ushort2 src_data2 = *((__global ushort2 *)((__global char *)src2 + src2_index)); - ushort2 dst_data = *((__global ushort2 *)((__global char *)dst + dst_index)); - - int2 tmp = ARITHM_OP(convert_int2_sat(src_data1), convert_int2_sat(src_data2)); - ushort2 data = convert_ushort2_sat(tmp); - data = mask_data ? 
data : dst_data; - - *((__global ushort2 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_add_with_mask_C2_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global short *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - short2 src_data1 = *((__global short2 *)((__global char *)src1 + src1_index)); - short2 src_data2 = *((__global short2 *)((__global char *)src2 + src2_index)); - short2 dst_data = *((__global short2 *)((__global char *)dst + dst_index)); - - int2 tmp = ARITHM_OP(convert_int2_sat(src_data1), convert_int2_sat(src_data2)); - short2 data = convert_short2_sat(tmp); - data = mask_data ? data : dst_data; - - *((__global short2 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_add_with_mask_C2_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global int *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 3) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - int2 src_data1 = *((__global int2 *)((__global char *)src1 + src1_index)); - int2 src_data2 = *((__global int2 *)((__global char *)src2 + src2_index)); - int2 dst_data = *((__global int2 *)((__global char *)dst + dst_index)); - - int2 data = convert_int2_sat(ARITHM_OP(convert_long2_sat(src_data1), convert_long2_sat(src_data2))); - data = mask_data ? data : dst_data; - - *((__global int2 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_add_with_mask_C2_D5 (__global float *src1, int src1_step, int src1_offset, - __global float *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global float *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 3) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - float2 src_data1 = *((__global float2 *)((__global char *)src1 + src1_index)); - float2 src_data2 = *((__global float2 *)((__global char *)src2 + src2_index)); - float2 dst_data = *((__global float2 *)((__global char *)dst + dst_index)); - - float2 data = ARITHM_OP(src_data1, src_data2); - data = mask_data ? 
data : dst_data; - - *((__global float2 *)((__global char *)dst + dst_index)) = data; - } -} +///////////////////////////////////////////// ADD //////////////////////////////////////////////////// +////////////////////////////////////////////////////////////////////////////////////////////////////// -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_add_with_mask_C2_D6 (__global double *src1, int src1_step, int src1_offset, - __global double *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global double *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) +__kernel void arithm_binary_op_mat(__global T *src1, int src1_step, int src1_offset, + __global T *src2, int src2_step, int src2_offset, + __global T *dst, int dst_step, int dst_offset, + int cols, int rows) { - int x = get_global_id(0); int y = get_global_id(1); if (x < cols && y < rows) { - int src1_index = mad24(y, src1_step, (x << 4) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 4) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 4) + dst_offset); - - uchar mask_data = *(mask + mask_index); + int src1_index = mad24(y, src1_step, x + src1_offset); + int src2_index = mad24(y, src2_step, x + src2_offset); + int dst_index = mad24(y, dst_step, x + dst_offset); - double2 src_data1 = *((__global double2 *)((__global char *)src1 + src1_index)); - double2 src_data2 = *((__global double2 *)((__global char *)src2 + src2_index)); - double2 dst_data = *((__global double2 *)((__global char *)dst + dst_index)); - - double2 data = ARITHM_OP(src_data1, src_data2); - data = mask_data ? data : dst_data; - - *((__global double2 *)((__global char *)dst + dst_index)) = data; + dst[dst_index] = convertToT(convertToWT(src1[src1_index]) Operation convertToWT(src2[src2_index])); } } -#endif -__kernel void arithm_add_with_mask_C4_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) +__kernel void arithm_binary_op_mat_div(__global T *src1, int src1_step, int src1_offset, + __global T *src2, int src2_step, int src2_offset, + __global T *dst, int dst_step, int dst_offset, + int cols, int rows) { - int x = get_global_id(0); int y = get_global_id(1); if (x < cols && y < rows) { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - uchar mask_data = *(mask + mask_index); + int src1_index = mad24(y, src1_step, x + src1_offset); + int src2_index = mad24(y, src2_step, x + src2_offset); + int dst_index = mad24(y, dst_step, x + dst_offset); - uchar4 src_data1 = *((__global uchar4 *)(src1 + src1_index)); - uchar4 src_data2 = *((__global uchar4 *)(src2 + src2_index)); - uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); - - uchar4 data = convert_uchar4_sat(ARITHM_OP(convert_short4_sat(src_data1), convert_short4_sat(src_data2))); - data = mask_data ? data : dst_data; - - *((__global uchar4 *)(dst + dst_index)) = data; + T zero = (T)(0); + dst[dst_index] = src2[src2_index] == zero ? 
zero : convertToT(convertToWT(src1[src1_index]) / convertToWT(src2[src2_index])); } } -__kernel void arithm_add_with_mask_C4_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global ushort *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 3) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - uchar mask_data = *(mask + mask_index); - - ushort4 src_data1 = *((__global ushort4 *)((__global char *)src1 + src1_index)); - ushort4 src_data2 = *((__global ushort4 *)((__global char *)src2 + src2_index)); - ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index)); - - ushort4 data = convert_ushort4_sat(ARITHM_OP(convert_int4_sat(src_data1), convert_int4_sat(src_data2))); - data = mask_data ? data : dst_data; - - *((__global ushort4 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_add_with_mask_C4_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global short *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) +__kernel void arithm_absdiff_mat(__global T *src1, int src1_step, int src1_offset, + __global T *src2, int src2_step, int src2_offset, + __global T *dst, int dst_step, int dst_offset, + int cols, int rows) { - int x = get_global_id(0); int y = get_global_id(1); if (x < cols && y < rows) { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 3) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - short4 src_data1 = *((__global short4 *)((__global char *)src1 + src1_index)); - short4 src_data2 = *((__global short4 *)((__global char *)src2 + src2_index)); - short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index)); - - short4 data = convert_short4_sat(ARITHM_OP(convert_int4_sat(src_data1), convert_int4_sat(src_data2))); - data = mask_data ? data : dst_data; + int src1_index = mad24(y, src1_step, x + src1_offset); + int src2_index = mad24(y, src2_step, x + src2_offset); + int dst_index = mad24(y, dst_step, x + dst_offset); - *((__global short4 *)((__global char *)dst + dst_index)) = data; + WT value = convertToWT(src1[src1_index]) - convertToWT(src2[src2_index]); + value = value > (WT)(0) ? 
value : -value; + dst[dst_index] = convertToT(value); } } -__kernel void arithm_add_with_mask_C4_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global int *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 4) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 4) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 4) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - int4 src_data1 = *((__global int4 *)((__global char *)src1 + src1_index)); - int4 src_data2 = *((__global int4 *)((__global char *)src2 + src2_index)); - int4 dst_data = *((__global int4 *)((__global char *)dst + dst_index)); - - int4 data = convert_int4_sat(ARITHM_OP(convert_long4_sat(src_data1), convert_long4_sat(src_data2))); - data = mask_data ? data : dst_data; - - *((__global int4 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_add_with_mask_C4_D5 (__global float *src1, int src1_step, int src1_offset, - __global float *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global float *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) +// add mat with scale for multiply +__kernel void arithm_binary_op_mat_scalar(__global T *src1, int src1_step, int src1_offset, + __global T *src2, int src2_step, int src2_offset, + __global WT *scalar, + __global T *dst, int dst_step, int dst_offset, + int cols, int rows) { - int x = get_global_id(0); int y = get_global_id(1); if (x < cols && y < rows) { - int src1_index = mad24(y, src1_step, (x << 4) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 4) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 4) + dst_offset); + int src1_index = mad24(y, src1_step, x + src1_offset); + int src2_index = mad24(y, src2_step, x + src2_offset); + int dst_index = mad24(y, dst_step, x + dst_offset); - uchar mask_data = *(mask + mask_index); - - float4 src_data1 = *((__global float4 *)((__global char *)src1 + src1_index)); - float4 src_data2 = *((__global float4 *)((__global char *)src2 + src2_index)); - float4 dst_data = *((__global float4 *)((__global char *)dst + dst_index)); - - float4 data = ARITHM_OP(src_data1, src_data2); - data = mask_data ? 
diff --git a/modules/ocl/src/opencl/arithm_add_mask.cl b/modules/ocl/src/opencl/arithm_add_mask.cl
new file mode 100644
index 0000000..52dbfc4
--- /dev/null
+++ b/modules/ocl/src/opencl/arithm_add_mask.cl
@@ -0,0 +1,79 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Jia Haipeng, jiahaipeng95@gmail.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors as is and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#if defined (DOUBLE_SUPPORT) +#ifdef cl_khr_fp64 +#pragma OPENCL EXTENSION cl_khr_fp64:enable +#elif defined (cl_amd_fp64) +#pragma OPENCL EXTENSION cl_amd_fp64:enable +#endif +#endif + +////////////////////////////////////////////////////////////////////////////////// +///////////////////////////////// add with mask ////////////////////////////////// +////////////////////////////////////////////////////////////////////////////////// + +__kernel void arithm_binary_op_mat_mask(__global T * src1, int src1_step, int src1_offset, + __global T * src2, int src2_step, int src2_offset, + __global uchar * mask, int mask_step, int mask_offset, + __global T * dst, int dst_step, int dst_offset, + int cols, int rows) +{ + int x = get_global_id(0); + int y = get_global_id(1); + + if (x < cols && y < rows) + { + int mask_index = mad24(y, mask_step, x + mask_offset); + if (mask[mask_index]) + { + int src1_index = mad24(y, src1_step, x + src1_offset); + int src2_index = mad24(y, src2_step, x + src2_offset); + int dst_index = mad24(y, dst_step, dst_offset + x); + + dst[dst_index] = convertToT(convertToWT(src1[src1_index]) Operation convertToWT(src2[src2_index])); + } + } +} diff --git a/modules/ocl/src/opencl/arithm_add_scalar.cl b/modules/ocl/src/opencl/arithm_add_scalar.cl index cdb79f3..4e0c7fc 100644 --- a/modules/ocl/src/opencl/arithm_add_scalar.cl +++ b/modules/ocl/src/opencl/arithm_add_scalar.cl @@ -51,463 +51,61 @@ #endif #endif -#ifdef ARITHM_ADD - #define ARITHM_OP(A,B) ((A)+(B)) -#elif defined ARITHM_SUB - #define ARITHM_OP(A,B) ((A)-(B)) -#endif -/**************************************add with scalar without mask**************************************/ -__kernel void arithm_s_add_C1_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (dst_offset & 3) - int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - uchar4 src1_data = vload4(0, src1 + src1_index_fix); - int4 src2_data = (int4)(src2.x, src2.x, src2.x, src2.x); - if(src1_index < 0) - { - uchar4 tmp; - tmp.xyzw = (src1_index == -2) ? 
src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - - uchar4 data = *((__global uchar4 *)(dst + dst_index)); - int4 tmp = ARITHM_OP(convert_int4_sat(src1_data), src2_data); - uchar4 tmp_data = convert_uchar4_sat(tmp); - - data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x; - data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y; - data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : data.z; - data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : data.w; - - *((__global uchar4 *)(dst + dst_index)) = data; - } -} -__kernel void arithm_s_add_C1_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset >> 1) & 1) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc); - - ushort2 src1_data = vload2(0, (__global ushort *)((__global char *)src1 + src1_index)); - int2 src2_data = (int2)(src2.x, src2.x); - - ushort2 data = *((__global ushort2 *)((__global uchar *)dst + dst_index)); - int2 tmp = ARITHM_OP(convert_int2_sat(src1_data), src2_data); - ushort2 tmp_data = convert_ushort2_sat(tmp); - - data.x = (dst_index + 0 >= dst_start) ? tmp_data.x : data.x; - data.y = (dst_index + 2 < dst_end ) ? tmp_data.y : data.y; - - *((__global ushort2 *)((__global uchar *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_add_C1_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset >> 1) & 1) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc); - - short2 src1_data = vload2(0, (__global short *)((__global char *)src1 + src1_index)); - int2 src2_data = (int2)(src2.x, src2.x); - short2 data = *((__global short2 *)((__global uchar *)dst + dst_index)); - - int2 tmp = ARITHM_OP(convert_int2_sat(src1_data), src2_data); - short2 tmp_data = convert_short2_sat(tmp); - - data.x = (dst_index + 0 >= dst_start) ? tmp_data.x : data.x; - data.y = (dst_index + 2 < dst_end ) ? 
tmp_data.y : data.y; - - *((__global short2 *)((__global uchar *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_add_C1_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - int src_data1 = *((__global int *)((__global char *)src1 + src1_index)); - int src_data2 = src2.x; - int dst_data = *((__global int *)((__global char *)dst + dst_index)); - - int data = convert_int_sat(ARITHM_OP((long)src_data1, (long)src_data2)); - - *((__global int *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_add_C1_D5 (__global float *src1, int src1_step, int src1_offset, - __global float *dst, int dst_step, int dst_offset, - float4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - float src_data1 = *((__global float *)((__global char *)src1 + src1_index)); - float src_data2 = src2.x; - float dst_data = *((__global float *)((__global char *)dst + dst_index)); - - float data = ARITHM_OP(src_data1, src_data2); - - *((__global float *)((__global char *)dst + dst_index)) = data; - } -} - -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_add_C1_D6 (__global double *src1, int src1_step, int src1_offset, - __global double *dst, int dst_step, int dst_offset, - double4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - double src_data1 = *((__global double *)((__global char *)src1 + src1_index)); - double src2_data = src2.x; - double dst_data = *((__global double *)((__global char *)dst + dst_index)); - - double data = ARITHM_OP(src_data1, src2_data); - - *((__global double *)((__global char *)dst + dst_index)) = data; - } -} -#endif - -__kernel void arithm_s_add_C2_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset >> 1) & 1) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc); - - uchar4 src1_data = vload4(0, src1 + src1_index); - int4 src2_data = (int4)(src2.x, src2.y, src2.x, src2.y); - - uchar4 data = *((__global uchar4 *)(dst + dst_index)); - int4 tmp = ARITHM_OP(convert_int4_sat(src1_data), src2_data); - uchar4 tmp_data = convert_uchar4_sat(tmp); - - data.xy = (dst_index + 0 >= dst_start) ? tmp_data.xy : data.xy; - data.zw = (dst_index + 2 < dst_end ) ? 
tmp_data.zw : data.zw; - - *((__global uchar4 *)(dst + dst_index)) = data; - } -} -__kernel void arithm_s_add_C2_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - ushort2 src_data1 = *((__global ushort2 *)((__global char *)src1 + src1_index)); - int2 src_data2 = (int2)(src2.x, src2.y); - ushort2 dst_data = *((__global ushort2 *)((__global char *)dst + dst_index)); - - int2 tmp = ARITHM_OP(convert_int2_sat(src_data1), src_data2); - ushort2 data = convert_ushort2_sat(tmp); +/////////////////////////////////////////////////////////////////////////////////// +///////////////////////////////// Add with scalar ///////////////////////////////// +/////////////////////////////////////////////////////////////////////////////////// - *((__global ushort2 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_add_C2_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_binary_op_scalar (__global T *src1, int src1_step, int src1_offset, + __global WT *scalar, + __global T *dst, int dst_step, int dst_offset, + int cols, int rows) { - int x = get_global_id(0); int y = get_global_id(1); if (x < cols && y < rows) { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - short2 src_data1 = *((__global short2 *)((__global char *)src1 + src1_index)); - int2 src_data2 = (int2)(src2.x, src2.y); - short2 dst_data = *((__global short2 *)((__global char *)dst + dst_index)); + int src1_index = mad24(y, src1_step, x + src1_offset); + int dst_index = mad24(y, dst_step, x + dst_offset); - int2 tmp = ARITHM_OP(convert_int2_sat(src_data1), src_data2); - short2 data = convert_short2_sat(tmp); - - *((__global short2 *)((__global char *)dst + dst_index)) = data; + dst[dst_index] = convertToT(convertToWT(src1[src1_index]) Operation scalar[0]); } } -__kernel void arithm_s_add_C2_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - int2 src_data1 = *((__global int2 *)((__global char *)src1 + src1_index)); - int2 src_data2 = (int2)(src2.x, src2.y); - int2 dst_data = *((__global int2 *)((__global char *)dst + dst_index)); - - int2 data = convert_int2_sat(ARITHM_OP(convert_long2_sat(src_data1), convert_long2_sat(src_data2))); - *((__global int2 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_add_C2_D5 (__global float *src1, int src1_step, int src1_offset, - __global float *dst, int dst_step, int dst_offset, - float4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_absdiff_scalar(__global T *src1, int src1_step, int src1_offset, + __global WT *src2, + __global T *dst, int dst_step, int dst_offset, + int cols, int rows) { - int x = get_global_id(0); int y = get_global_id(1); if (x < cols && y < rows) { - int src1_index = 
mad24(y, src1_step, (x << 3) + src1_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);
-
-        float2 src_data1 = *((__global float2 *)((__global char *)src1 + src1_index));
-        float2 src_data2 = (float2)(src2.x, src2.y);
-        float2 dst_data  = *((__global float2 *)((__global char *)dst + dst_index));
+        int src1_index = mad24(y, src1_step, x + src1_offset);
+        int dst_index  = mad24(y, dst_step,  x + dst_offset);

-        float2 data = ARITHM_OP(src_data1, src_data2);
-        *((__global float2 *)((__global char *)dst + dst_index)) = data;
+        WT value = convertToWT(src1[src1_index]) - src2[0];
+        value = value > (WT)(0) ? value : -value;
+        dst[dst_index] = convertToT(value);
     }
 }

-#if defined (DOUBLE_SUPPORT)
-__kernel void arithm_s_add_C2_D6 (__global double *src1, int src1_step, int src1_offset,
-                                  __global double *dst,  int dst_step,  int dst_offset,
-                                  double4 src2, int rows, int cols, int dst_step1)
+// divides a scalar by the matrix elements (dst = scalar / src); a zero element yields zero
+__kernel void arithm_binary_op_scalar_div(__global T *src1, int src1_step, int src1_offset,
+                                          __global WT *scalar,
+                                          __global T *dst, int dst_step, int dst_offset,
+                                          int cols, int rows)
 {
-    int x = get_global_id(0);
     int y = get_global_id(1);
     if (x < cols && y < rows)
     {
-        int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 4) + dst_offset);
-
-        double2 src_data1 = *((__global double2 *)((__global char *)src1 + src1_index));
-        double2 src_data2 = (double2)(src2.x, src2.y);
-        double2 dst_data  = *((__global double2 *)((__global char *)dst + dst_index));
+        int src1_index = mad24(y, src1_step, x + src1_offset);
+        int dst_index  = mad24(y, dst_step,  x + dst_offset);

-        double2 data = ARITHM_OP(src_data1, src_data2);
-
-        *((__global double2 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-#endif
-
-__kernel void arithm_s_add_C4_D0 (__global uchar *src1, int src1_step, int src1_offset,
-                                  __global uchar *dst,  int dst_step,  int dst_offset,
-                                  int4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
-
-        uchar4 src_data1 = *((__global uchar4 *)(src1 + src1_index));
-
-        uchar4 data = convert_uchar4_sat(ARITHM_OP(convert_int4_sat(src_data1), src2));
-
-        *((__global uchar4 *)(dst + dst_index)) = data;
+        T zero = (T)(0);
+        dst[dst_index] = src1[src1_index] == zero ?
zero : convertToT(scalar[0] / convertToWT(src1[src1_index])); } } -__kernel void arithm_s_add_C4_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - ushort4 src_data1 = *((__global ushort4 *)((__global char *)src1 + src1_index)); - - ushort4 data = convert_ushort4_sat(ARITHM_OP(convert_int4_sat(src_data1), src2)); - - *((__global ushort4 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_add_C4_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - short4 src_data1 = *((__global short4 *)((__global char *)src1 + src1_index)); - - short4 data = convert_short4_sat(ARITHM_OP(convert_int4_sat(src_data1), src2)); - - *((__global short4 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_add_C4_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 4) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 4) + dst_offset); - - int4 src_data1 = *((__global int4 *)((__global char *)src1 + src1_index)); - - int4 data = convert_int4_sat(ARITHM_OP(convert_long4_sat(src_data1), convert_long4_sat(src2))); - - *((__global int4 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_add_C4_D5 (__global float *src1, int src1_step, int src1_offset, - __global float *dst, int dst_step, int dst_offset, - float4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 4) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 4) + dst_offset); - - float4 src_data1 = *((__global float4 *)((__global char *)src1 + src1_index)); - - float4 data = ARITHM_OP(src_data1, src2); - - *((__global float4 *)((__global char *)dst + dst_index)) = data; - } -} - -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_add_C4_D6 (__global double *src1, int src1_step, int src1_offset, - __global double *dst, int dst_step, int dst_offset, - double4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 5) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 5) + dst_offset); - - double4 src_data1 = *((__global double4 *)((__global char *)src1 + src1_index)); - - double4 data = ARITHM_OP(src_data1, src2); - - *((__global double4 *)((__global char *)dst + dst_index)) = data; - } -} -#endif diff --git a/modules/ocl/src/opencl/arithm_add_scalar_mask.cl b/modules/ocl/src/opencl/arithm_add_scalar_mask.cl index a0cb7da..5c34080 100644 --- a/modules/ocl/src/opencl/arithm_add_scalar_mask.cl +++ 
b/modules/ocl/src/opencl/arithm_add_scalar_mask.cl @@ -51,561 +51,28 @@ #endif #endif -#ifdef ARITHM_ADD - #define ARITHM_OP(A,B) ((A)+(B)) -#elif defined ARITHM_SUB - #define ARITHM_OP(A,B) ((A)-(B)) -#endif -/**************************************add with scalar with mask**************************************/ -__kernel void arithm_s_add_with_mask_C1_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - int4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (dst_offset & 3) - int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int mask_index_fix = mask_index < 0 ? 0 : mask_index; - uchar4 src1_data = vload4(0, src1 + src1_index_fix); - int4 src2_data = (int4)(src2.x, src2.x, src2.x, src2.x); - uchar4 mask_data = vload4(0, mask + mask_index_fix); - if(src1_index < 0) - { - uchar4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(mask_index < 0) - { - uchar4 tmp; - tmp.xyzw = (mask_index == -2) ? mask_data.zwxy:mask_data.yzwx; - mask_data.xyzw = (mask_index == -1) ? mask_data.wxyz:tmp.xyzw; - } - - uchar4 data = *((__global uchar4 *)(dst + dst_index)); - int4 tmp = ARITHM_OP(convert_int4_sat(src1_data), src2_data); - uchar4 tmp_data = convert_uchar4_sat(tmp); - - data.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x; - data.y = ((mask_data.y) && (dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y; - data.z = ((mask_data.z) && (dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : data.z; - data.w = ((mask_data.w) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? 
tmp_data.w : data.w; - - *((__global uchar4 *)(dst + dst_index)) = data; - } -} -__kernel void arithm_s_add_with_mask_C1_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - int4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset / 2) & 1) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc); - - ushort2 src1_data = vload2(0, (__global ushort *)((__global char *)src1 + src1_index)); - int2 src2_data = (int2)(src2.x, src2.x); - uchar2 mask_data = vload2(0, mask + mask_index); - - ushort2 data = *((__global ushort2 *)((__global uchar *)dst + dst_index)); - int2 tmp = ARITHM_OP(convert_int2_sat(src1_data), src2_data); - ushort2 tmp_data = convert_ushort2_sat(tmp); +/////////////////////////////////////////////////////////////////////////////////// +//////////////////////////// Add with scalar with mask //////////////////////////// +/////////////////////////////////////////////////////////////////////////////////// - data.x = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.x : data.x; - data.y = ((mask_data.y) && (dst_index + 2 < dst_end )) ? tmp_data.y : data.y; - - *((__global ushort2 *)((__global uchar *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_add_with_mask_C1_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - int4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset / 2) & 1) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc); - - short2 src1_data = vload2(0, (__global short *)((__global char *)src1 + src1_index)); - int2 src2_data = (int2)(src2.x, src2.x); - uchar2 mask_data = vload2(0, mask + mask_index); - - short2 data = *((__global short2 *)((__global uchar *)dst + dst_index)); - int2 tmp = ARITHM_OP(convert_int2_sat(src1_data), src2_data); - short2 tmp_data = convert_short2_sat(tmp); - - data.x = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.x : data.x; - data.y = ((mask_data.y) && (dst_index + 2 < dst_end )) ? 
tmp_data.y : data.y; - - *((__global short2 *)((__global uchar *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_add_with_mask_C1_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - int4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_binary_op_scalar_mask(__global T *src1, int src1_step, int src1_offset, + __global WT *scalar, + __global uchar *mask, int mask_step, int mask_offset, + __global T *dst, int dst_step, int dst_offset, + int cols, int rows) { - int x = get_global_id(0); int y = get_global_id(1); if (x < cols && y < rows) { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - int src_data1 = *((__global int *)((__global char *)src1 + src1_index)); - int src_data2 = src2.x; - int dst_data = *((__global int *)((__global char *)dst + dst_index)); - - int data = convert_int_sat(ARITHM_OP((long)src_data1, (long)src_data2)); - data = mask_data ? data : dst_data; - - *((__global int *)((__global char *)dst + dst_index)) = data; - } -} - -__kernel void arithm_s_add_with_mask_C1_D5 (__global float *src1, int src1_step, int src1_offset, - __global float *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - float4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - float src_data1 = *((__global float *)((__global char *)src1 + src1_index)); - float src_data2 = src2.x; - float dst_data = *((__global float *)((__global char *)dst + dst_index)); - - float data = ARITHM_OP(src_data1, src_data2); - data = mask_data ? data : dst_data; - - *((__global float *)((__global char *)dst + dst_index)) = data; - } -} - - -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_add_with_mask_C1_D6 (__global double *src1, int src1_step, int src1_offset, - __global double *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - double4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - double src_data1 = *((__global double *)((__global char *)src1 + src1_index)); - double src_data2 = src2.x; - double dst_data = *((__global double *)((__global char *)dst + dst_index)); - - double data = ARITHM_OP(src_data1, src_data2); - data = mask_data ? 
data : dst_data; - - *((__global double *)((__global char *)dst + dst_index)) = data; - } -} -#endif -__kernel void arithm_s_add_with_mask_C2_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - int4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset / 2) & 1) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc); - - uchar4 src1_data = vload4(0, src1 + src1_index); - int4 src2_data = (int4)(src2.x, src2.y, src2.x, src2.y); - uchar2 mask_data = vload2(0, mask + mask_index); - - uchar4 data = *((__global uchar4 *)(dst + dst_index)); - int4 tmp = ARITHM_OP(convert_int4_sat(src1_data), src2_data); - uchar4 tmp_data = convert_uchar4_sat(tmp); - - data.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.xy : data.xy; - data.zw = ((mask_data.y) && (dst_index + 2 < dst_end )) ? tmp_data.zw : data.zw; - - *((__global uchar4 *)(dst + dst_index)) = data; - } -} -__kernel void arithm_s_add_with_mask_C2_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - int4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - ushort2 src_data1 = *((__global ushort2 *)((__global char *)src1 + src1_index)); - int2 src_data2 = (int2)(src2.x, src2.y); - ushort2 dst_data = *((__global ushort2 *)((__global char *)dst + dst_index)); - - int2 tmp = ARITHM_OP(convert_int2_sat(src_data1), src_data2); - ushort2 data = convert_ushort2_sat(tmp); - data = mask_data ? data : dst_data; - - *((__global ushort2 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_add_with_mask_C2_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - int4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - short2 src_data1 = *((__global short2 *)((__global char *)src1 + src1_index)); - int2 src_data2 = (int2)(src2.x, src2.y); - short2 dst_data = *((__global short2 *)((__global char *)dst + dst_index)); - - int2 tmp = ARITHM_OP(convert_int2_sat(src_data1), src_data2); - short2 data = convert_short2_sat(tmp); - data = mask_data ? 
data : dst_data; - - *((__global short2 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_add_with_mask_C2_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - int4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - int2 src_data1 = *((__global int2 *)((__global char *)src1 + src1_index)); - int2 src_data2 = (int2)(src2.x, src2.y); - int2 dst_data = *((__global int2 *)((__global char *)dst + dst_index)); - - int2 data = convert_int2_sat(ARITHM_OP(convert_long2_sat(src_data1), convert_long2_sat(src_data2))); - data = mask_data ? data : dst_data; - - *((__global int2 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_add_with_mask_C2_D5 (__global float *src1, int src1_step, int src1_offset, - __global float *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - float4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - float2 src_data1 = *((__global float2 *)((__global char *)src1 + src1_index)); - float2 src_data2 = (float2)(src2.x, src2.y); - float2 dst_data = *((__global float2 *)((__global char *)dst + dst_index)); - - float2 data = ARITHM_OP(src_data1, src_data2); - data = mask_data ? data : dst_data; - - *((__global float2 *)((__global char *)dst + dst_index)) = data; - } -} - -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_add_with_mask_C2_D6 (__global double *src1, int src1_step, int src1_offset, - __global double *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - double4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 4) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 4) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - double2 src_data1 = *((__global double2 *)((__global char *)src1 + src1_index)); - double2 src_data2 = (double2)(src2.x, src2.y); - double2 dst_data = *((__global double2 *)((__global char *)dst + dst_index)); - - double2 data = ARITHM_OP(src_data1, src_data2); - data = mask_data ? 
data : dst_data; - - *((__global double2 *)((__global char *)dst + dst_index)) = data; - } -} -#endif - -__kernel void arithm_s_add_with_mask_C4_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - int4 src2, int rows, int cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - uchar4 src_data1 = *((__global uchar4 *)(src1 + src1_index)); - uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); - - uchar4 data = convert_uchar4_sat(ARITHM_OP(convert_int4_sat(src_data1), src2)); - data = mask_data ? data : dst_data; - - *((__global uchar4 *)(dst + dst_index)) = data; - } -} -__kernel void arithm_s_add_with_mask_C4_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - int4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - ushort4 src_data1 = *((__global ushort4 *)((__global char *)src1 + src1_index)); - ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index)); - - ushort4 data = convert_ushort4_sat(ARITHM_OP(convert_int4_sat(src_data1), src2)); - data = mask_data ? data : dst_data; - - *((__global ushort4 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_add_with_mask_C4_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - int4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - short4 src_data1 = *((__global short4 *)((__global char *)src1 + src1_index)); - short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index)); - - short4 data = convert_short4_sat(ARITHM_OP(convert_int4_sat(src_data1), src2)); - data = mask_data ? 
data : dst_data;
-
-        *((__global short4 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_add_with_mask_C4_D4 (__global int *src1, int src1_step, int src1_offset,
-                                            __global int *dst,  int dst_step,  int dst_offset,
-                                            __global uchar *mask, int mask_step, int mask_offset,
-                                            int4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 4) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        int4 src_data1 = *((__global int4 *)((__global char *)src1 + src1_index));
-        int4 dst_data  = *((__global int4 *)((__global char *)dst  + dst_index));
-
-        int4 data = convert_int4_sat(ARITHM_OP(convert_long4_sat(src_data1), convert_long4_sat(src2)));
-        data = mask_data ? data : dst_data;
-
-        *((__global int4 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_add_with_mask_C4_D5 (__global float *src1, int src1_step, int src1_offset,
-                                            __global float *dst,  int dst_step,  int dst_offset,
-                                            __global uchar *mask, int mask_step, int mask_offset,
-                                            float4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 4) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        float4 src_data1 = *((__global float4 *)((__global char *)src1 + src1_index));
-        float4 dst_data  = *((__global float4 *)((__global char *)dst  + dst_index));
-
-        float4 data = ARITHM_OP(src_data1, src2);
-        data = mask_data ? data : dst_data;
-
-        *((__global float4 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-
-#if defined (DOUBLE_SUPPORT)
-__kernel void arithm_s_add_with_mask_C4_D6 (__global double *src1, int src1_step, int src1_offset,
-                                            __global double *dst,  int dst_step,  int dst_offset,
-                                            __global uchar *mask, int mask_step, int mask_offset,
-                                            double4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 5) + src1_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 5) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        double4 src_data1 = *((__global double4 *)((__global char *)src1 + src1_index));
-        double4 dst_data  = *((__global double4 *)((__global char *)dst  + dst_index));
-
-        double4 data = ARITHM_OP(src_data1, src2);
-        data = mask_data ? data : dst_data;
+        int mask_index = mad24(y, mask_step, x + mask_offset);
+        if (mask[mask_index])
+        {
+            int src1_index = mad24(y, src1_step, x + src1_offset);
+            int dst_index  = mad24(y, dst_step, dst_offset + x);

-        *((__global double4 *)((__global char *)dst + dst_index)) = data;
+            dst[dst_index] = convertToT(convertToWT(src1[src1_index]) Operation scalar[0]);
+        }
     }
 }
-#endif
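Two things are worth noting at this boundary. First, in every scalar variant the operand arrives as a one-element __global WT buffer and is read as scalar[0], rather than being passed by value, so a single kernel signature serves all depths; the host is expected to fill that buffer, already scaled where applicable, before each call. Second, the stand-alone arithm_div.cl and arithm_mul.cl sources below are deleted outright: division and multiplication are now obtained from the generic kernels above by choosing the build-time macros, for example -D Operation=/ together with suitable T and WT (illustrative values; the real option strings are built in arithm.cpp).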
diff --git a/modules/ocl/src/opencl/arithm_div.cl b/modules/ocl/src/opencl/arithm_div.cl
deleted file mode 100644
index 1dce385..0000000
--- a/modules/ocl/src/opencl/arithm_div.cl
+++ /dev/null
@@ -1,468 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Jia Haipeng, jiahaipeng95@gmail.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other oclMaterials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-// -//M*/ - -#if defined (DOUBLE_SUPPORT) -#ifdef cl_khr_fp64 -#pragma OPENCL EXTENSION cl_khr_fp64:enable -#elif defined (cl_amd_fp64) -#pragma OPENCL EXTENSION cl_amd_fp64:enable -#endif -typedef double F ; -typedef double4 F4; -#define convert_F4 convert_double4 -#define convert_F double -#else -typedef float F; -typedef float4 F4; -#define convert_F4 convert_float4 -#define convert_F float -#endif - -inline uchar round2_uchar(F v) -{ - return convert_uchar_sat(round(v)); -} - -inline ushort round2_ushort(F v) -{ - return convert_ushort_sat(round(v)); -} - -inline short round2_short(F v) -{ - return convert_short_sat(round(v)); -} - -inline int round2_int(F v) -{ - return convert_int_sat(round(v)); -} -/////////////////////////////////////////////////////////////////////////////////////// -////////////////////////////divide/////////////////////////////////////////////////// -////////////////////////////////////////////////////////////////////////////////////// -/**********************************div*********************************************/ -__kernel void arithm_div_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *src2, int src2_step, int src2_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1, F scalar) -{ - int2 coor = (int2)(get_global_id(0), get_global_id(1)); - - if (coor.x < cols && coor.y < rows) - { - coor.x = coor.x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (dst_offset & 3) - int2 src_index = (int2)(mad24(coor.y, src1_step, coor.x + src1_offset - dst_align), - mad24(coor.y, src2_step, coor.x + src2_offset - dst_align)); - - int4 dst_args = (int4)(mad24(coor.y, dst_step, dst_offset), - mad24(coor.y, dst_step, dst_offset + dst_step1), - mad24(coor.y, dst_step, dst_offset + coor.x & (int)0xfffffffc), - 0); - - uchar4 src1_data = vload4(0, src1 + src_index.x); - uchar4 src2_data = vload4(0, src2 + src_index.y); - uchar4 dst_data = *((__global uchar4 *)(dst + dst_args.z)); - - F4 tmp = convert_F4(src1_data) * scalar; - uchar4 tmp_data; - tmp_data.x = ((tmp.x == 0) || (src2_data.x == 0)) ? 0 : round2_uchar(tmp.x / src2_data.x); - tmp_data.y = ((tmp.y == 0) || (src2_data.y == 0)) ? 0 : round2_uchar(tmp.y / src2_data.y); - tmp_data.z = ((tmp.z == 0) || (src2_data.z == 0)) ? 0 : round2_uchar(tmp.z / src2_data.z); - tmp_data.w = ((tmp.w == 0) || (src2_data.w == 0)) ? 0 : round2_uchar(tmp.w / src2_data.w); - - dst_data.x = ((dst_args.z + 0 >= dst_args.x) && (dst_args.z + 0 < dst_args.y)) ? tmp_data.x : dst_data.x; - dst_data.y = ((dst_args.z + 1 >= dst_args.x) && (dst_args.z + 1 < dst_args.y)) ? tmp_data.y : dst_data.y; - dst_data.z = ((dst_args.z + 2 >= dst_args.x) && (dst_args.z + 2 < dst_args.y)) ? tmp_data.z : dst_data.z; - dst_data.w = ((dst_args.z + 3 >= dst_args.x) && (dst_args.z + 3 < dst_args.y)) ? 
tmp_data.w : dst_data.w; - - *((__global uchar4 *)(dst + dst_args.z)) = dst_data; - } -} - -__kernel void arithm_div_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *src2, int src2_step, int src2_offset, - __global ushort *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1, F scalar) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset >> 1) & 3) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8); - - ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index)); - ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index)); - ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index)); - - F4 tmp = convert_F4(src1_data) * scalar; - - ushort4 tmp_data; - tmp_data.x = ((tmp.x == 0) || (src2_data.x == 0)) ? 0 : round2_ushort(tmp.x / (F)src2_data.x); - tmp_data.y = ((tmp.y == 0) || (src2_data.y == 0)) ? 0 : round2_ushort(tmp.y / (F)src2_data.y); - tmp_data.z = ((tmp.z == 0) || (src2_data.z == 0)) ? 0 : round2_ushort(tmp.z / (F)src2_data.z); - tmp_data.w = ((tmp.w == 0) || (src2_data.w == 0)) ? 0 : round2_ushort(tmp.w / (F)src2_data.w); - - dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; - dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y; - dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z; - dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w; - - *((__global ushort4 *)((__global char *)dst + dst_index)) = dst_data; - } -} -__kernel void arithm_div_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *src2, int src2_step, int src2_offset, - __global short *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1, F scalar) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset >> 1) & 3) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8); - - short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index)); - short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index)); - short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index)); - - F4 tmp = convert_F4(src1_data) * scalar; - - short4 tmp_data; - tmp_data.x = ((tmp.x == 0) || (src2_data.x == 0)) ? 0 : round2_short(tmp.x / (F)src2_data.x); - tmp_data.y = ((tmp.y == 0) || (src2_data.y == 0)) ? 0 : round2_short(tmp.y / (F)src2_data.y); - tmp_data.z = ((tmp.z == 0) || (src2_data.z == 0)) ? 0 : round2_short(tmp.z / (F)src2_data.z); - tmp_data.w = ((tmp.w == 0) || (src2_data.w == 0)) ? 
0 : round2_short(tmp.w / (F)src2_data.w); - - - dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; - dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y; - dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z; - dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w; - - *((__global short4 *)((__global char *)dst + dst_index)) = dst_data; - } -} - -__kernel void arithm_div_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *src2, int src2_step, int src2_offset, - __global int *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1, F scalar) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - int data1 = *((__global int *)((__global char *)src1 + src1_index)); - int data2 = *((__global int *)((__global char *)src2 + src2_index)); - - F tmp = (convert_F)(data1) * scalar; - int tmp_data = (tmp == 0 || data2 == 0) ? 0 : round2_int(tmp / (convert_F)(data2)); - - *((__global int *)((__global char *)dst + dst_index)) =tmp_data; - } -} - -__kernel void arithm_div_D5 (__global float *src1, int src1_step, int src1_offset, - __global float *src2, int src2_step, int src2_offset, - __global float *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1, F scalar) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - float data1 = *((__global float *)((__global char *)src1 + src1_index)); - float data2 = *((__global float *)((__global char *)src2 + src2_index)); - - F tmp = (convert_F)(data1) * scalar; - float tmp_data = (tmp == 0 || data2 == 0) ? 0 : convert_float(tmp / (convert_F)(data2)); - - *((__global float *)((__global char *)dst + dst_index)) = tmp_data; - } -} - -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_div_D6 (__global double *src1, int src1_step, int src1_offset, - __global double *src2, int src2_step, int src2_offset, - __global double *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1, double scalar) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 3) + src2_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - double data1 = *((__global double *)((__global char *)src1 + src1_index)); - double data2 = *((__global double *)((__global char *)src2 + src2_index)); - - double tmp = data1 * scalar; - double tmp_data = (tmp == 0 || data2 == 0) ? 
0 : (tmp / data2); - - *((__global double *)((__global char *)dst + dst_index)) = tmp_data; - } -} -#endif -/************************************div with scalar************************************/ -__kernel void arithm_s_div_D0 (__global uchar *src, int src_step, int src_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1, F scalar) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (dst_offset & 3) - int src_index = mad24(y, src_step, x + src_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - - uchar4 src_data = vload4(0, src + src_index); - uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); - - uchar4 tmp_data; - tmp_data.x = ((scalar == 0) || (src_data.x == 0)) ? 0 : round2_uchar(scalar / (F)src_data.x); - tmp_data.y = ((scalar == 0) || (src_data.y == 0)) ? 0 : round2_uchar(scalar / (F)src_data.y); - tmp_data.z = ((scalar == 0) || (src_data.z == 0)) ? 0 : round2_uchar(scalar / (F)src_data.z); - tmp_data.w = ((scalar == 0) || (src_data.w == 0)) ? 0 : round2_uchar(scalar / (F)src_data.w); - - dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; - dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y; - dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z; - dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w; - - *((__global uchar4 *)(dst + dst_index)) = dst_data; - } -} - -__kernel void arithm_s_div_D2 (__global ushort *src, int src_step, int src_offset, - __global ushort *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1, F scalar) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset >> 1) & 3) - int src_index = mad24(y, src_step, (x << 1) + src_offset - (dst_align << 1)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8); - - ushort4 src_data = vload4(0, (__global ushort *)((__global char *)src + src_index)); - ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index)); - - ushort4 tmp_data; - tmp_data.x = ((scalar == 0) || (src_data.x == 0)) ? 0 : round2_ushort(scalar / (F)src_data.x); - tmp_data.y = ((scalar == 0) || (src_data.y == 0)) ? 0 : round2_ushort(scalar / (F)src_data.y); - tmp_data.z = ((scalar == 0) || (src_data.z == 0)) ? 0 : round2_ushort(scalar / (F)src_data.z); - tmp_data.w = ((scalar == 0) || (src_data.w == 0)) ? 0 : round2_ushort(scalar / (F)src_data.w); - - dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; - dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y; - dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z; - dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? 
-
-        *((__global ushort4 *)((__global char *)dst + dst_index)) = dst_data;
-    }
-}
-
-__kernel void arithm_s_div_D3 (__global short *src, int src_step, int src_offset,
-                               __global short *dst, int dst_step, int dst_offset,
-                               int rows, int cols, int dst_step1, F scalar)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset >> 1) & 3)
-        int src_index = mad24(y, src_step, (x << 1) + src_offset - (dst_align << 1));
-
-        int dst_start = mad24(y, dst_step, dst_offset);
-        int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);
-
-        short4 src_data = vload4(0, (__global short *)((__global char *)src + src_index));
-        short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
-
-        short4 tmp_data;
-        tmp_data.x = ((scalar == 0) || (src_data.x == 0)) ? 0 : round2_short(scalar / (F)src_data.x);
-        tmp_data.y = ((scalar == 0) || (src_data.y == 0)) ? 0 : round2_short(scalar / (F)src_data.y);
-        tmp_data.z = ((scalar == 0) || (src_data.z == 0)) ? 0 : round2_short(scalar / (F)src_data.z);
-        tmp_data.w = ((scalar == 0) || (src_data.w == 0)) ? 0 : round2_short(scalar / (F)src_data.w);
-
-        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
-        dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y;
-        dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z;
-        dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w;
-
-        *((__global short4 *)((__global char *)dst + dst_index)) = dst_data;
-    }
-}
-
-__kernel void arithm_s_div_D4 (__global int *src, int src_step, int src_offset,
-                               __global int *dst, int dst_step, int dst_offset,
-                               int rows, int cols, int dst_step1, F scalar)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src_index = mad24(y, src_step, (x << 2) + src_offset);
-        int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
-
-        int data = *((__global int *)((__global char *)src + src_index));
-
-        int tmp_data = (scalar == 0 || data == 0) ? 0 : round2_int(scalar / (convert_F)(data));
-
-        *((__global int *)((__global char *)dst + dst_index)) = tmp_data;
-    }
-}
-
-__kernel void arithm_s_div_D5 (__global float *src, int src_step, int src_offset,
-                               __global float *dst, int dst_step, int dst_offset,
-                               int rows, int cols, int dst_step1, F scalar)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src_index = mad24(y, src_step, (x << 2) + src_offset);
-        int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
-
-        float data = *((__global float *)((__global char *)src + src_index));
-
-        float tmp_data = (scalar == 0 || data == 0) ? 0 : convert_float(scalar / (convert_F)(data));
-
-        *((__global float *)((__global char *)dst + dst_index)) = tmp_data;
-    }
-}
-
-#if defined (DOUBLE_SUPPORT)
-__kernel void arithm_s_div_D6 (__global double *src, int src_step, int src_offset,
-                               __global double *dst, int dst_step, int dst_offset,
-                               int rows, int cols, int dst_step1, double scalar)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src_index = mad24(y, src_step, (x << 3) + src_offset);
-        int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
-
-        double data = *((__global double *)((__global char *)src + src_index));
-
-        double tmp_data = (scalar == 0 || data == 0) ? 0 : (scalar / data);
-
-        *((__global double *)((__global char *)dst + dst_index)) = tmp_data;
-    }
-}
-#endif
diff --git a/modules/ocl/src/opencl/arithm_mul.cl b/modules/ocl/src/opencl/arithm_mul.cl
deleted file mode 100644
index bfbb594..0000000
--- a/modules/ocl/src/opencl/arithm_mul.cl
+++ /dev/null
@@ -1,303 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Jia Haipeng, jiahaipeng95@gmail.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other GpuMaterials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if defined (DOUBLE_SUPPORT)
-#ifdef cl_khr_fp64
-#pragma OPENCL EXTENSION cl_khr_fp64:enable
-#elif defined (cl_amd_fp64)
-#pragma OPENCL EXTENSION cl_amd_fp64:enable
-#endif
-#endif
-
-int4 round_int4(float4 v)
-{
-    v.s0 = v.s0 + (v.s0 > 0 ? 0.5 : -0.5);
-    v.s1 = v.s1 + (v.s1 > 0 ? 0.5 : -0.5);
-    v.s2 = v.s2 + (v.s2 > 0 ? 0.5 : -0.5);
-    v.s3 = v.s3 + (v.s3 > 0 ? 0.5 : -0.5);
-
-    return convert_int4_sat(v);
-}
-
-uint4 round_uint4(float4 v)
-{
-    v.s0 = v.s0 + (v.s0 > 0 ? 0.5 : -0.5);
-    v.s1 = v.s1 + (v.s1 > 0 ? 0.5 : -0.5);
-    v.s2 = v.s2 + (v.s2 > 0 ? 0.5 : -0.5);
-    v.s3 = v.s3 + (v.s3 > 0 ? 0.5 : -0.5);
-
-    return convert_uint4_sat(v);
-}
-
-int round_int(float v)
-{
-    v = v + (v > 0 ? 0.5 : -0.5);
-
-    return convert_int_sat(v);
-}
-
-//////////////////////////////////////////////////////////////////////////////////////////////////////
-///////////////////////////////////////////// multiply ////////////////////////////////////////////////
-//////////////////////////////////////////////////////////////////////////////////////////////////////
-/**************************************mul without mask**************************************/
-__kernel void arithm_mul_D0 (__global uchar *src1, int src1_step, int src1_offset,
-                             __global uchar *src2, int src2_step, int src2_offset,
-                             __global uchar *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1, float scalar)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align (dst_offset & 3)
-        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
-        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
-
-        int dst_start = mad24(y, dst_step, dst_offset);
-        int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-
-        uchar4 src1_data, src2_data;
-
-        src1_data.x = src1_index + 0 >= 0 ? src1[src1_index + 0] : 0;
-        src1_data.y = src1_index + 1 >= 0 ? src1[src1_index + 1] : 0;
-        src1_data.z = src1_index + 2 >= 0 ? src1[src1_index + 2] : 0;
-        src1_data.w = src1_index + 3 >= 0 ? src1[src1_index + 3] : 0;
-
-        src2_data.x = src2_index + 0 >= 0 ? src2[src2_index + 0] : 0;
-        src2_data.y = src2_index + 1 >= 0 ? src2[src2_index + 1] : 0;
-        src2_data.z = src2_index + 2 >= 0 ? src2[src2_index + 2] : 0;
-        src2_data.w = src2_index + 3 >= 0 ? src2[src2_index + 3] : 0;
-
-        uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
-        int4 tmp = convert_int4_sat(src1_data) * convert_int4_sat(src2_data);
-        tmp = round_int4(convert_float4(tmp) * scalar);
-        uchar4 tmp_data = convert_uchar4_sat(tmp);
-
-        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
-        dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
-        dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
-        dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
-
-        *((__global uchar4 *)(dst + dst_index)) = dst_data;
-    }
-}
-
-__kernel void arithm_mul_D2 (__global ushort *src1, int src1_step, int src1_offset,
-                             __global ushort *src2, int src2_step, int src2_offset,
-                             __global ushort *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1, float scalar)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset >> 1) & 3)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
-        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
-
-        int dst_start = mad24(y, dst_step, dst_offset);
-        int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);
-
-        ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index));
-        ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
-
-        ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
-        uint4 tmp = convert_uint4_sat(src1_data) * convert_uint4_sat(src2_data);
-        tmp = round_uint4(convert_float4(tmp) * scalar);
-        ushort4 tmp_data = convert_ushort4_sat(tmp);
-
-        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
-        dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y;
-        dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z;
-        dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w;
-
-        *((__global ushort4 *)((__global char *)dst + dst_index)) = dst_data;
-    }
-}
-
-__kernel void arithm_mul_D3 (__global short *src1, int src1_step, int src1_offset,
-                             __global short *src2, int src2_step, int src2_offset,
-                             __global short *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1, float scalar)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset >> 1) & 3)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
-        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
-
-        int dst_start = mad24(y, dst_step, dst_offset);
-        int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);
-
-        short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index));
-        short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
-
-        short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
-        int4 tmp = convert_int4_sat(src1_data) * convert_int4_sat(src2_data);
-        tmp = round_int4(convert_float4(tmp) * scalar);
-        short4 tmp_data = convert_short4_sat(tmp);
-
-        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
-        dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y;
-        dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z;
-        dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w;
-
-        *((__global short4 *)((__global char *)dst + dst_index)) = dst_data;
-    }
-}
-
-__kernel void arithm_mul_D4 (__global int *src1, int src1_step, int src1_offset,
-                             __global int *src2, int src2_step, int src2_offset,
-                             __global int *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1, float scalar)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
-        int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
-
-        int data1 = *((__global int *)((__global char *)src1 + src1_index));
-        int data2 = *((__global int *)((__global char *)src2 + src2_index));
-        int tmp = data1 * data2;
-        tmp = round_int((float)tmp * scalar);
-
-        *((__global int *)((__global char *)dst + dst_index)) = convert_int_sat(tmp);
-    }
-}
-
-__kernel void arithm_mul_D5 (__global float *src1, int src1_step, int src1_offset,
-                             __global float *src2, int src2_step, int src2_offset,
-                             __global float *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1, float scalar)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
-        int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
-
-        float data1 = *((__global float *)((__global char *)src1 + src1_index));
-        float data2 = *((__global float *)((__global char *)src2 + src2_index));
-        float tmp = data1 * data2;
-        tmp = tmp * scalar;
-
-        *((__global float *)((__global char *)dst + dst_index)) = tmp;
    }
-}
-
-#if defined (DOUBLE_SUPPORT)
-__kernel void arithm_mul_D6 (__global double *src1, int src1_step, int src1_offset,
-                             __global double *src2, int src2_step, int src2_offset,
-                             __global double *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1, double scalar)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
-        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset);
-        int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
-
-        double data1 = *((__global double *)((__global char *)src1 + src1_index));
-        double data2 = *((__global double *)((__global char *)src2 + src2_index));
-
-        double tmp = data1 * data2;
-        tmp = tmp * scalar;
-
-        *((__global double *)((__global char *)dst + dst_index)) = tmp;
-    }
-}
-#endif
-
-#ifdef DOUBLE_SUPPORT
-#define SCALAR_TYPE double
-#else
-#define SCALAR_TYPE float
-#endif
-
-__kernel void arithm_muls_D5 (__global float *src1, int src1_step, int src1_offset,
-                              __global float *dst, int dst_step, int dst_offset,
-                              int rows, int cols, int dst_step1, SCALAR_TYPE scalar)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
-
-        float data1 = *((__global float *)((__global char *)src1 + src1_index));
-        float tmp = data1 * scalar;
-
-        *((__global float *)((__global char *)dst + dst_index)) = tmp;
    }
-}
-- 
2.7.4
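
Note for reviewers: every deleted arithm_div_* / arithm_s_div_* kernel above applies the
same per-element rule -- scale the numerator, return 0 when the scaled numerator or the
denominator is 0, round half away from zero, and saturate to the destination type. Below
is a minimal plain-C sketch of that rule for the uchar case; the helper names
(round_half_away, saturate_uchar, div_elem_uchar) are illustrative stand-ins for the
kernels' round2_* and convert_*_sat helpers, not identifiers from the patch.

    #include <stdio.h>

    /* Round half away from zero, mirroring the deleted round2_* helpers. */
    static int round_half_away(float v)
    {
        return (int)(v + (v > 0 ? 0.5f : -0.5f));
    }

    /* Clamp to [0, 255]; stand-in for OpenCL's convert_uchar_sat. */
    static unsigned char saturate_uchar(int v)
    {
        return (unsigned char)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }

    /* Per-element rule of the div kernels:
       dst = saturate(round(src1 * scalar / src2)),
       with 0 whenever the scaled numerator or the denominator is 0. */
    static unsigned char div_elem_uchar(unsigned char a, unsigned char b, float scalar)
    {
        float tmp = (float)a * scalar;
        if (tmp == 0 || b == 0)
            return 0;
        return saturate_uchar(round_half_away(tmp / (float)b));
    }

    int main(void)
    {
        printf("%d\n", div_elem_uchar(10, 3, 1.0f)); /* 3: 10/3 = 3.33 rounds down */
        printf("%d\n", div_elem_uchar(11, 3, 1.0f)); /* 4: 11/3 = 3.67 rounds up   */
        printf("%d\n", div_elem_uchar(10, 0, 1.0f)); /* 0: zero-denominator guard  */
        return 0;
    }

The zero guard is why these kernels return 0 rather than trapping or producing
implementation-defined values on division by zero, and the half-away rounding matches
what the replacement kernels obtain from OpenCL's convert_T_sat_rte-style conversions.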