From b18101b15aab3611696c14d9cbd9c445f73e328d Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Sat, 12 Oct 2013 00:58:58 +0400 Subject: [PATCH] some optimization of binary ocl::bitwise operations --- modules/ocl/src/arithm.cpp | 54 +++++++++++++++------- modules/ocl/src/opencl/arithm_add.cl | 6 +-- modules/ocl/src/opencl/arithm_add_scalar.cl | 12 ++--- modules/ocl/src/opencl/arithm_add_scalar_mask.cl | 8 ++-- modules/ocl/src/opencl/arithm_bitwise_binary.cl | 21 +++++++-- .../ocl/src/opencl/arithm_bitwise_binary_mask.cl | 19 +++++++- .../ocl/src/opencl/arithm_bitwise_binary_scalar.cl | 24 ++++++++-- .../opencl/arithm_bitwise_binary_scalar_mask.cl | 23 +++++++-- 8 files changed, 124 insertions(+), 43 deletions(-) diff --git a/modules/ocl/src/arithm.cpp b/modules/ocl/src/arithm.cpp index 2d54385..5a3820e 100644 --- a/modules/ocl/src/arithm.cpp +++ b/modules/ocl/src/arithm.cpp @@ -56,6 +56,23 @@ using namespace cv; using namespace cv::ocl; +static std::vector scalarToVector(const cv::Scalar & sc, int depth, int ocn, int cn) +{ + CV_Assert(ocn == cn || (ocn == 4 && cn == 3)); + + static const int sizeMap[] = { sizeof(uchar), sizeof(char), sizeof(ushort), + sizeof(short), sizeof(int), sizeof(float), sizeof(double) }; + + int elemSize1 = sizeMap[depth]; + int bufSize = elemSize1 * ocn; + std::vector _buf(bufSize); + uchar * buf = &_buf[0]; + scalarToRawData(sc, buf, CV_MAKE_TYPE(depth, cn)); + memset(buf + elemSize1 * cn, 0, (ocn - cn) * elemSize1); + + return _buf; +} + ////////////////////////////////////////////////////////////////////////////// /////////////// add subtract multiply divide min max ///////////////////////// ////////////////////////////////////////////////////////////////////////////// @@ -84,7 +101,7 @@ static void arithmetic_run_generic(const oclMat &src1, const oclMat &src2, const int src2step1 = src2.step / src2.elemSize(), src2offset1 = src2.offset / src2.elemSize(); int maskstep1 = mask.step, maskoffset1 = mask.offset / mask.elemSize(); int dststep1 = dst.step / dst.elemSize(), dstoffset1 = dst.offset / dst.elemSize(); - oclMat m; + std::vector m; size_t localThreads[3] = { 16, 16, 1 }; size_t globalThreads[3] = { dst.cols, dst.rows, 1 }; @@ -132,10 +149,9 @@ static void arithmetic_run_generic(const oclMat &src1, const oclMat &src2, const if (haveScalar) { const int WDepthMap[] = { CV_16S, CV_16S, CV_32S, CV_32S, CV_32S, CV_32F, CV_64F }; - m.create(1, 1, CV_MAKE_TYPE(WDepthMap[WDepth], oclChannels)); - m.setTo(scalar); + m = scalarToVector(scalar, WDepthMap[WDepth], oclChannels, src1.channels()); - args.push_back( make_pair( sizeof(cl_mem), (void *)&m.data )); + args.push_back( make_pair( m.size(), (void *)&m[0])); kernelName += "_scalar"; } @@ -1329,6 +1345,13 @@ static void bitwise_unary_run(const oclMat &src1, oclMat &dst, string kernelName enum { AND = 0, OR, XOR }; +static std::string to_string(int value) +{ + std::ostringstream stream; + stream << value; + return stream.str(); +} + static void bitwise_binary_run(const oclMat &src1, const oclMat &src2, const Scalar& src3, const oclMat &mask, oclMat &dst, int operationType) { @@ -1337,17 +1360,20 @@ static void bitwise_binary_run(const oclMat &src1, const oclMat &src2, const Sca CV_Assert(mask.empty() || (!mask.empty() && mask.type() == CV_8UC1 && mask.size() == src1.size())); dst.create(src1.size(), src1.type()); - - int elemSize = dst.elemSize(); - int cols1 = dst.cols * elemSize; oclMat m; const char operationMap[] = { '&', '|', '^' }; std::string kernelName("arithm_bitwise_binary"); - std::string buildOptions = format("-D Operation=%c", operationMap[operationType]); + + int vlen = std::min(8, src1.elemSize1() * src1.oclchannels()); + std::string vlenstr = vlen > 1 ? to_string(vlen) : ""; + std::string buildOptions = format("-D Operation=%c -D vloadn=vload%s -D vstoren=vstore%s -D elemSize=%d -D vlen=%d" + " -D ucharv=uchar%s", + operationMap[operationType], vlenstr.c_str(), vlenstr.c_str(), + (int)src1.elemSize(), vlen, vlenstr.c_str()); size_t localThreads[3] = { 16, 16, 1 }; - size_t globalThreads[3] = { cols1, dst.rows, 1 }; + size_t globalThreads[3] = { dst.cols, dst.rows, 1 }; vector > args; args.push_back( make_pair( sizeof(cl_mem), (void *)&src1.data )); @@ -1360,7 +1386,6 @@ static void bitwise_binary_run(const oclMat &src1, const oclMat &src2, const Sca m.setTo(src3); args.push_back( make_pair( sizeof(cl_mem), (void *)&m.data )); - args.push_back( make_pair( sizeof(cl_int), (void *)&elemSize ) ); kernelName += "_scalar"; } @@ -1377,9 +1402,6 @@ static void bitwise_binary_run(const oclMat &src1, const oclMat &src2, const Sca args.push_back( make_pair( sizeof(cl_int), (void *)&mask.step )); args.push_back( make_pair( sizeof(cl_int), (void *)&mask.offset )); - if (!src2.empty()) - args.push_back( make_pair( sizeof(cl_int), (void *)&elemSize )); - kernelName += "_mask"; } @@ -1387,7 +1409,7 @@ static void bitwise_binary_run(const oclMat &src1, const oclMat &src2, const Sca args.push_back( make_pair( sizeof(cl_int), (void *)&dst.step )); args.push_back( make_pair( sizeof(cl_int), (void *)&dst.offset )); - args.push_back( make_pair( sizeof(cl_int), (void *)&cols1 )); + args.push_back( make_pair( sizeof(cl_int), (void *)&src1.cols )); args.push_back( make_pair( sizeof(cl_int), (void *)&src1.rows )); openCLExecuteKernel(src1.clCxt, mask.empty() ? (!src2.empty() ? &arithm_bitwise_binary : &arithm_bitwise_binary_scalar) : @@ -1400,12 +1422,12 @@ void cv::ocl::bitwise_not(const oclMat &src, oclMat &dst) { if (!src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.depth() == CV_64F) { - CV_Error(CV_OpenCLDoubleNotSupported, "selected device doesn't support double"); + CV_Error(CV_OpenCLDoubleNotSupported, "Selected device doesn't support double"); return; } dst.create(src.size(), src.type()); - bitwise_unary_run(src, dst, "arithm_bitwise_not", &arithm_bitwise_not); + bitwise_unary_run(src, dst, "arithm_bitwise_not", &arithm_bitwise_not); } void cv::ocl::bitwise_or(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask) diff --git a/modules/ocl/src/opencl/arithm_add.cl b/modules/ocl/src/opencl/arithm_add.cl index cd9fae6..2f34bbb 100644 --- a/modules/ocl/src/opencl/arithm_add.cl +++ b/modules/ocl/src/opencl/arithm_add.cl @@ -62,7 +62,7 @@ #if defined (FUNC_MUL) #if defined (HAVE_SCALAR) -#define EXPRESSION dst[dst_index] = convertToT(convertToWT(src1[src1_index]) * scalar[0] * convertToWT(src2[src2_index])); +#define EXPRESSION dst[dst_index] = convertToT(convertToWT(src1[src1_index]) * scalar * convertToWT(src2[src2_index])); #else #define EXPRESSION dst[dst_index] = convertToT(convertToWT(src1[src1_index]) * convertToWT(src2[src2_index])); #endif @@ -72,7 +72,7 @@ #if defined (HAVE_SCALAR) #define EXPRESSION T zero = (T)(0); \ dst[dst_index] = src2[src2_index] == zero ? zero : \ - convertToT(convertToWT(src1[src1_index]) * scalar[0] / convertToWT(src2[src2_index])); + convertToT(convertToWT(src1[src1_index]) * scalar / convertToWT(src2[src2_index])); #else #define EXPRESSION T zero = (T)(0); \ dst[dst_index] = src2[src2_index] == zero ? zero : \ @@ -123,7 +123,7 @@ __kernel void arithm_binary_op_mat(__global T *src1, int src1_step, int src1_off // add mat with scale __kernel void arithm_binary_op_mat_scalar(__global T *src1, int src1_step, int src1_offset, __global T *src2, int src2_step, int src2_offset, - __global WT *scalar, + WT scalar, __global T *dst, int dst_step, int dst_offset, int cols, int rows) { diff --git a/modules/ocl/src/opencl/arithm_add_scalar.cl b/modules/ocl/src/opencl/arithm_add_scalar.cl index 671bd12..7f4e413 100644 --- a/modules/ocl/src/opencl/arithm_add_scalar.cl +++ b/modules/ocl/src/opencl/arithm_add_scalar.cl @@ -52,20 +52,20 @@ #endif #if defined (FUNC_ADD) -#define EXPRESSION dst[dst_index] = convertToT(convertToWT(src1[src1_index]) + scalar[0]); +#define EXPRESSION dst[dst_index] = convertToT(convertToWT(src1[src1_index]) + scalar); #endif #if defined (FUNC_SUB) -#define EXPRESSION dst[dst_index] = convertToT(convertToWT(src1[src1_index]) - scalar[0]); +#define EXPRESSION dst[dst_index] = convertToT(convertToWT(src1[src1_index]) - scalar); #endif #if defined (FUNC_MUL) -#define EXPRESSION dst[dst_index] = convertToT(convertToWT(src1[src1_index]) * scalar[0]); +#define EXPRESSION dst[dst_index] = convertToT(convertToWT(src1[src1_index]) * scalar); #endif #if defined (FUNC_DIV) #define EXPRESSION T zero = (T)(0); \ - dst[dst_index] = src1[src1_index] == zero ? zero : convertToT(scalar[0] / convertToWT(src1[src1_index])); + dst[dst_index] = src1[src1_index] == zero ? zero : convertToT(scalar / convertToWT(src1[src1_index])); #endif #if defined (FUNC_ABS) @@ -75,7 +75,7 @@ #endif #if defined (FUNC_ABS_DIFF) -#define EXPRESSION WT value = convertToWT(src1[src1_index]) - scalar[0]; \ +#define EXPRESSION WT value = convertToWT(src1[src1_index]) - scalar; \ value = value > (WT)(0) ? value : -value; \ dst[dst_index] = convertToT(value); #endif @@ -85,7 +85,7 @@ /////////////////////////////////////////////////////////////////////////////////// __kernel void arithm_binary_op_scalar (__global T *src1, int src1_step, int src1_offset, - __global WT *scalar, + WT scalar, __global T *dst, int dst_step, int dst_offset, int cols, int rows) { diff --git a/modules/ocl/src/opencl/arithm_add_scalar_mask.cl b/modules/ocl/src/opencl/arithm_add_scalar_mask.cl index d472b3c..b93de0c 100644 --- a/modules/ocl/src/opencl/arithm_add_scalar_mask.cl +++ b/modules/ocl/src/opencl/arithm_add_scalar_mask.cl @@ -52,15 +52,15 @@ #endif #if defined (FUNC_ADD) -#define EXPRESSION dst[dst_index] = convertToT(convertToWT(src1[src1_index]) + scalar[0]); +#define EXPRESSION dst[dst_index] = convertToT(convertToWT(src1[src1_index]) + scalar); #endif #if defined (FUNC_SUB) -#define EXPRESSION dst[dst_index] = convertToT(convertToWT(src1[src1_index]) - scalar[0]); +#define EXPRESSION dst[dst_index] = convertToT(convertToWT(src1[src1_index]) - scalar); #endif #if defined (FUNC_MUL) -#define EXPRESSION dst[dst_index] = convertToT(convertToWT(src1[src1_index]) * scalar[0]); +#define EXPRESSION dst[dst_index] = convertToT(convertToWT(src1[src1_index]) * scalar); #endif #if defined (FUNC_DIV) @@ -74,7 +74,7 @@ /////////////////////////////////////////////////////////////////////////////////// __kernel void arithm_binary_op_scalar_mask(__global T *src1, int src1_step, int src1_offset, - __global WT *scalar, + WT scalar, __global uchar *mask, int mask_step, int mask_offset, __global T *dst, int dst_step, int dst_offset, int cols, int rows) diff --git a/modules/ocl/src/opencl/arithm_bitwise_binary.cl b/modules/ocl/src/opencl/arithm_bitwise_binary.cl index 898b40a..a4fa205 100644 --- a/modules/ocl/src/opencl/arithm_bitwise_binary.cl +++ b/modules/ocl/src/opencl/arithm_bitwise_binary.cl @@ -51,17 +51,32 @@ __kernel void arithm_bitwise_binary(__global uchar * src1, int src1_step, int src1_offset, __global uchar * src2, int src2_step, int src2_offset, __global uchar * dst, int dst_step, int dst_offset, - int cols1, int rows) + int cols, int rows) { int x = get_global_id(0); int y = get_global_id(1); - if (x < cols1 && y < rows) + if (x < cols && y < rows) { +#if elemSize > 1 + x *= elemSize; +#endif int src1_index = mad24(y, src1_step, x + src1_offset); int src2_index = mad24(y, src2_step, x + src2_offset); - int dst_index = mad24(y, dst_step, dst_offset + x); + int dst_index = mad24(y, dst_step, x + dst_offset); +#if elemSize > 1 + #pragma unroll + for (int i = 0; i < elemSize; i += vlen) + { + ucharv t0 = vloadn(0, src1 + src1_index + i); + ucharv t1 = vloadn(0, src2 + src2_index + i); + ucharv t2 = t0 Operation t1; + + vstoren(t2, 0, dst + dst_index + i); + } +#else dst[dst_index] = src1[src1_index] Operation src2[src2_index]; +#endif } } diff --git a/modules/ocl/src/opencl/arithm_bitwise_binary_mask.cl b/modules/ocl/src/opencl/arithm_bitwise_binary_mask.cl index 622ab5b..d244e57 100644 --- a/modules/ocl/src/opencl/arithm_bitwise_binary_mask.cl +++ b/modules/ocl/src/opencl/arithm_bitwise_binary_mask.cl @@ -50,7 +50,7 @@ __kernel void arithm_bitwise_binary_mask(__global uchar * src1, int src1_step, int src1_offset, __global uchar * src2, int src2_step, int src2_offset, - __global uchar * mask, int mask_step, int mask_offset, int elemSize, + __global uchar * mask, int mask_step, int mask_offset, __global uchar * dst, int dst_step, int dst_offset, int cols1, int rows) { @@ -59,15 +59,30 @@ __kernel void arithm_bitwise_binary_mask(__global uchar * src1, int src1_step, i if (x < cols1 && y < rows) { - int mask_index = mad24(y, mask_step, mask_offset + (x / elemSize)); + int mask_index = mad24(y, mask_step, mask_offset + x); if (mask[mask_index]) { +#if elemSize > 1 + x *= elemSize; +#endif int src1_index = mad24(y, src1_step, x + src1_offset); int src2_index = mad24(y, src2_step, x + src2_offset); int dst_index = mad24(y, dst_step, x + dst_offset); +#if elemSize > 1 + #pragma unroll + for (int i = 0; i < elemSize; i += vlen) + { + ucharv t0 = vloadn(0, src1 + src1_index + i); + ucharv t1 = vloadn(0, src2 + src2_index + i); + ucharv t2 = t0 Operation t1; + + vstoren(t2, 0, dst + dst_index + i); + } +#else dst[dst_index] = src1[src1_index] Operation src2[src2_index]; +#endif } } } diff --git a/modules/ocl/src/opencl/arithm_bitwise_binary_scalar.cl b/modules/ocl/src/opencl/arithm_bitwise_binary_scalar.cl index c17b412..5a7d593 100644 --- a/modules/ocl/src/opencl/arithm_bitwise_binary_scalar.cl +++ b/modules/ocl/src/opencl/arithm_bitwise_binary_scalar.cl @@ -50,19 +50,33 @@ __kernel void arithm_bitwise_binary_scalar( __global uchar *src1, int src1_step, int src1_offset, - __global uchar *src2, int elemSize, + __global uchar *src2, __global uchar *dst, int dst_step, int dst_offset, - int cols1, int rows) + int cols, int rows) { int x = get_global_id(0); int y = get_global_id(1); - if (x < cols1 && y < rows) + if (x < cols && y < rows) { +#if elemSize > 1 + x *= elemSize; +#endif int src1_index = mad24(y, src1_step, src1_offset + x); - int src2_index = x % elemSize; int dst_index = mad24(y, dst_step, dst_offset + x); - dst[dst_index] = src1[src1_index] Operation src2[src2_index]; +#if elemSize > 1 + #pragma unroll + for (int i = 0; i < elemSize; i += vlen) + { + ucharv t0 = vloadn(0, src1 + src1_index + i); + ucharv t1 = vloadn(0, src2 + i); + ucharv t2 = t0 Operation t1; + + vstoren(t2, 0, dst + dst_index + i); + } +#else + dst[dst_index] = src1[src1_index] Operation src2[0]; +#endif } } diff --git a/modules/ocl/src/opencl/arithm_bitwise_binary_scalar_mask.cl b/modules/ocl/src/opencl/arithm_bitwise_binary_scalar_mask.cl index bae1699..a1876b5 100644 --- a/modules/ocl/src/opencl/arithm_bitwise_binary_scalar_mask.cl +++ b/modules/ocl/src/opencl/arithm_bitwise_binary_scalar_mask.cl @@ -56,7 +56,7 @@ ////////////////////////////////////////////////////////////////////////////////////////////////////// __kernel void arithm_bitwise_binary_scalar_mask(__global uchar *src1, int src1_step, int src1_offset, - __global uchar *src2, int elemSize, + __global uchar *src2, __global uchar *mask, int mask_step, int mask_offset, __global uchar *dst, int dst_step, int dst_offset, int cols, int rows) @@ -66,14 +66,29 @@ __kernel void arithm_bitwise_binary_scalar_mask(__global uchar *src1, int src1_s if (x < cols && y < rows) { - int mask_index = mad24(y, mask_step, (x / elemSize) + mask_offset); + int mask_index = mad24(y, mask_step, x + mask_offset); + if (mask[mask_index]) { +#if elemSize > 1 + x *= elemSize; +#endif int src1_index = mad24(y, src1_step, x + src1_offset); - int src2_index = x % elemSize; int dst_index = mad24(y, dst_step, x + dst_offset); - dst[dst_index] = src1[src1_index] Operation src2[src2_index]; +#if elemSize > 1 + #pragma unroll + for (int i = 0; i < elemSize; i += vlen) + { + ucharv t0 = vloadn(0, src1 + src1_index + i); + ucharv t1 = vloadn(0, src2 + i); + ucharv t2 = t0 Operation t1; + + vstoren(t2, 0, dst + dst_index + i); + } +#else + dst[dst_index] = src1[src1_index] Operation src2[0]; +#endif } } } -- 2.7.4