From f7620dc7d184a8a26a0cd6317566821035c3abc2 Mon Sep 17 00:00:00 2001 From: Vadim Pisarevsky Date: Wed, 12 Feb 2014 12:18:55 +0400 Subject: [PATCH] added 3-channel support to arithmetic operations --- modules/core/src/arithm.cpp | 34 +++++--- modules/core/src/opencl/arithm.cl | 164 ++++++++++++++++++++++++-------------- 2 files changed, 129 insertions(+), 69 deletions(-) diff --git a/modules/core/src/arithm.cpp b/modules/core/src/arithm.cpp index dbf05a3..706d346 100644 --- a/modules/core/src/arithm.cpp +++ b/modules/core/src/arithm.cpp @@ -934,16 +934,23 @@ static bool ocl_binary_op(InputArray _src1, InputArray _src2, OutputArray _dst, bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0; - if( oclop < 0 || ((haveMask || haveScalar) && (cn > 4 || cn == 3)) || + if( oclop < 0 || ((haveMask || haveScalar) && cn > 4) || (!doubleSupport && srcdepth == CV_64F)) return false; char opts[1024]; int kercn = haveMask || haveScalar ? cn : 1; - sprintf(opts, "-D %s%s -D %s -D dstT=%s%s", + int scalarcn = kercn == 3 ? 4 : kercn; + + sprintf(opts, "-D %s%s -D %s -D dstT=%s%s -D dstT_C1=%s -D workST=%s -D cn=%d", (haveMask ? "MASK_" : ""), (haveScalar ? "UNARY_OP" : "BINARY_OP"), oclop2str[oclop], bitwise ? ocl::memopTypeToStr(CV_MAKETYPE(srcdepth, kercn)) : - ocl::typeToStr(CV_MAKETYPE(srcdepth, kercn)), doubleSupport ? " -D DOUBLE_SUPPORT" : ""); + ocl::typeToStr(CV_MAKETYPE(srcdepth, kercn)), doubleSupport ? " -D DOUBLE_SUPPORT" : "", + bitwise ? ocl::memopTypeToStr(CV_MAKETYPE(srcdepth, 1)) : + ocl::typeToStr(CV_MAKETYPE(srcdepth, 1)), + bitwise ? ocl::memopTypeToStr(CV_MAKETYPE(srcdepth, scalarcn)) : + ocl::typeToStr(CV_MAKETYPE(srcdepth, scalarcn)), + kercn); ocl::Kernel k("KF", ocl::core::arithm_oclsrc, opts); if( k.empty() ) @@ -960,7 +967,7 @@ static bool ocl_binary_op(InputArray _src1, InputArray _src2, OutputArray _dst, if( haveScalar ) { - size_t esz = CV_ELEM_SIZE(srctype); + size_t esz = CV_ELEM_SIZE1(srctype)*scalarcn; double buf[4] = {0,0,0,0}; if( oclop != OCL_OP_NOT ) @@ -1294,7 +1301,7 @@ static bool ocl_arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst, int type1 = _src1.type(), depth1 = CV_MAT_DEPTH(type1), cn = CV_MAT_CN(type1); bool haveMask = !_mask.empty(); - if( ((haveMask || haveScalar) && (cn > 4 || cn == 3)) ) + if( ((haveMask || haveScalar) && cn > 4) ) return false; int dtype = _dst.type(), ddepth = CV_MAT_DEPTH(dtype), wdepth = std::max(CV_32S, CV_MAT_DEPTH(wtype)); @@ -1307,21 +1314,26 @@ static bool ocl_arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst, return false; int kercn = haveMask || haveScalar ? cn : 1; + int scalarcn = kercn == 3 ? 4 : kercn; char cvtstr[4][32], opts[1024]; - sprintf(opts, "-D %s%s -D %s -D srcT1=%s -D srcT2=%s " - "-D dstT=%s -D workT=%s -D scaleT=%s -D convertToWT1=%s " - "-D convertToWT2=%s -D convertToDT=%s%s", + sprintf(opts, "-D %s%s -D %s -D srcT1=%s -D srcT1_C1=%s -D srcT2=%s -D srcT2_C1=%s " + "-D dstT=%s -D dstT_C1=%s -D workT=%s -D workST=%s -D scaleT=%s -D convertToWT1=%s " + "-D convertToWT2=%s -D convertToDT=%s%s -D cn=%d", (haveMask ? "MASK_" : ""), (haveScalar ? "UNARY_OP" : "BINARY_OP"), oclop2str[oclop], ocl::typeToStr(CV_MAKETYPE(depth1, kercn)), + ocl::typeToStr(CV_MAKETYPE(depth1, 1)), ocl::typeToStr(CV_MAKETYPE(depth2, kercn)), + ocl::typeToStr(CV_MAKETYPE(depth2, 1)), ocl::typeToStr(CV_MAKETYPE(ddepth, kercn)), + ocl::typeToStr(CV_MAKETYPE(ddepth, 1)), ocl::typeToStr(CV_MAKETYPE(wdepth, kercn)), + ocl::typeToStr(CV_MAKETYPE(wdepth, scalarcn)), ocl::typeToStr(CV_MAKETYPE(wdepth, 1)), ocl::convertTypeStr(depth1, wdepth, kercn, cvtstr[0]), ocl::convertTypeStr(depth2, wdepth, kercn, cvtstr[1]), ocl::convertTypeStr(wdepth, ddepth, kercn, cvtstr[2]), - doubleSupport ? " -D DOUBLE_SUPPORT" : ""); + doubleSupport ? " -D DOUBLE_SUPPORT" : "", kercn); size_t usrdata_esz = CV_ELEM_SIZE(wdepth); const uchar* usrdata_p = (const uchar*)usrdata; @@ -1352,7 +1364,7 @@ static bool ocl_arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst, if( haveScalar ) { - size_t esz = CV_ELEM_SIZE(wtype); + size_t esz = CV_ELEM_SIZE(wtype)*scalarcn; double buf[4]={0,0,0,0}; Mat src2sc = _src2.getMat(); @@ -2621,7 +2633,7 @@ static bool ocl_compare(InputArray _src1, InputArray _src2, OutputArray _dst, in const char * const operationMap[] = { "==", ">", ">=", "<", "<=", "!=" }; ocl::Kernel k("KF", ocl::core::arithm_oclsrc, - format("-D BINARY_OP -D srcT1=%s -D workT=srcT1" + format("-D BINARY_OP -D srcT1=%s -D workT=srcT1 -D cn=1" " -D OP_CMP -D CMP_OPERATOR=%s%s", ocl::typeToStr(CV_MAKE_TYPE(depth, 1)), operationMap[op], diff --git a/modules/core/src/opencl/arithm.cl b/modules/core/src/opencl/arithm.cl index ed08384..a7dacc4 100644 --- a/modules/core/src/opencl/arithm.cl +++ b/modules/core/src/opencl/arithm.cl @@ -70,21 +70,47 @@ #define CV_PI M_PI_F #endif -#define dstelem *(__global dstT*)(dstptr + dst_index) -#define dstelem2 *(__global dstT*)(dstptr2 + dst_index2) +#ifndef cn +#define cn 1 +#endif + +#if cn == 1 +#undef srcT1_C1 +#undef srcT2_C1 +#undef dstT_C1 +#define srcT1_C1 srcT1 +#define srcT2_C1 srcT2 +#define dstT_C1 dstT +#endif + +#if cn != 3 + #define storedst(val) *(__global dstT*)(dstptr + dst_index) = val + #define storedst2(val) *(__global dstT*)(dstptr2 + dst_index2) = val +#else + #define storedst(val) vstore3(val, 0, (__global dstT_C1*)(dstptr + dst_index)) + #define storedst2(val) vstore3(val, 0, (__global dstT_C1*)(dstptr2 + dst_index2)) +#endif + #define noconvert #ifndef workT #ifndef srcT1 #define srcT1 dstT + #define srcT1_C1 dstT_C1 #endif #ifndef srcT2 #define srcT2 dstT + #define srcT2_C1 dstT_C1 #endif #define workT dstT - #define srcelem1 *(__global srcT1*)(srcptr1 + src1_index) - #define srcelem2 *(__global srcT2*)(srcptr2 + src2_index) + #if cn != 3 + #define srcelem1 *(__global srcT1*)(srcptr1 + src1_index) + #define srcelem2 *(__global srcT2*)(srcptr2 + src2_index) + #else + #define srcelem1 vload3(0, (__global srcT1_C1*)(srcptr1 + src1_index)) + #define srcelem2 vload3(0, (__global srcT2_C1*)(srcptr2 + src2_index)) + #endif #ifndef convertToDT #define convertToDT noconvert #endif @@ -94,153 +120,168 @@ #ifndef convertToWT2 #define convertToWT2 convertToWT1 #endif - #define srcelem1 convertToWT1(*(__global srcT1*)(srcptr1 + src1_index)) - #define srcelem2 convertToWT2(*(__global srcT2*)(srcptr2 + src2_index)) + #if cn != 3 + #define srcelem1 convertToWT1(*(__global srcT1*)(srcptr1 + src1_index)) + #define srcelem2 convertToWT2(*(__global srcT2*)(srcptr2 + src2_index)) + #else + #define srcelem1 convertToWT1(vload3(0, (__global srcT1_C1*)(srcptr1 + src1_index))) + #define srcelem2 convertToWT2(vload3(0, (__global srcT2_C1*)(srcptr2 + src2_index))) + #endif + +#endif +#ifndef workST +#define workST workT #endif #define EXTRA_PARAMS #define EXTRA_INDEX #if defined OP_ADD -#define PROCESS_ELEM dstelem = convertToDT(srcelem1 + srcelem2) +#define PROCESS_ELEM storedst(convertToDT(srcelem1 + srcelem2)) #elif defined OP_SUB -#define PROCESS_ELEM dstelem = convertToDT(srcelem1 - srcelem2) +#define PROCESS_ELEM storedst(convertToDT(srcelem1 - srcelem2)) #elif defined OP_RSUB -#define PROCESS_ELEM dstelem = convertToDT(srcelem2 - srcelem1) +#define PROCESS_ELEM storedst(convertToDT(srcelem2 - srcelem1)) #elif defined OP_ABSDIFF #define PROCESS_ELEM \ workT v = srcelem1 - srcelem2; \ - dstelem = convertToDT(v >= (workT)(0) ? v : -v); + storedst(convertToDT(v >= (workT)(0) ? v : -v)) #elif defined OP_AND -#define PROCESS_ELEM dstelem = srcelem1 & srcelem2 +#define PROCESS_ELEM storedst(srcelem1 & srcelem2) #elif defined OP_OR -#define PROCESS_ELEM dstelem = srcelem1 | srcelem2 +#define PROCESS_ELEM storedst(srcelem1 | srcelem2) #elif defined OP_XOR -#define PROCESS_ELEM dstelem = srcelem1 ^ srcelem2 +#define PROCESS_ELEM storedst(srcelem1 ^ srcelem2) #elif defined OP_NOT -#define PROCESS_ELEM dstelem = ~srcelem1 +#define PROCESS_ELEM storedst(~srcelem1) #elif defined OP_MIN -#define PROCESS_ELEM dstelem = min(srcelem1, srcelem2) +#define PROCESS_ELEM storedst(min(srcelem1, srcelem2)) #elif defined OP_MAX -#define PROCESS_ELEM dstelem = max(srcelem1, srcelem2) +#define PROCESS_ELEM storedst(max(srcelem1, srcelem2)) #elif defined OP_MUL -#define PROCESS_ELEM dstelem = convertToDT(srcelem1 * srcelem2) +#define PROCESS_ELEM storedst(convertToDT(srcelem1 * srcelem2)) #elif defined OP_MUL_SCALE #undef EXTRA_PARAMS #ifdef UNARY_OP -#define EXTRA_PARAMS , workT srcelem2, scaleT scale +#define EXTRA_PARAMS , workST srcelem2_, scaleT scale +#undef srcelem2 +#define srcelem2 srcelem2_ #else #define EXTRA_PARAMS , scaleT scale #endif -#define PROCESS_ELEM dstelem = convertToDT(srcelem1 * scale * srcelem2) +#define PROCESS_ELEM storedst(convertToDT(srcelem1 * scale * srcelem2)) #elif defined OP_DIV #define PROCESS_ELEM \ workT e2 = srcelem2, zero = (workT)(0); \ - dstelem = convertToDT(e2 != zero ? srcelem1 / e2 : zero) + storedst(convertToDT(e2 != zero ? srcelem1 / e2 : zero)) #elif defined OP_DIV_SCALE #undef EXTRA_PARAMS #ifdef UNARY_OP -#define EXTRA_PARAMS , workT srcelem2, scaleT scale +#define EXTRA_PARAMS , workST srcelem2_, scaleT scale +#undef srcelem2 +#define srcelem2 srcelem2_ #else #define EXTRA_PARAMS , scaleT scale #endif #define PROCESS_ELEM \ workT e2 = srcelem2, zero = (workT)(0); \ - dstelem = convertToDT(e2 == zero ? zero : (srcelem1 * (workT)(scale) / e2)) + storedst(convertToDT(e2 == zero ? zero : (srcelem1 * (workT)(scale) / e2))) #elif defined OP_RDIV_SCALE #undef EXTRA_PARAMS #ifdef UNARY_OP -#define EXTRA_PARAMS , workT srcelem2, scaleT scale +#define EXTRA_PARAMS , workST srcelem2_, scaleT scale +#undef srcelem2 +#define srcelem2 srcelem2_ #else #define EXTRA_PARAMS , scaleT scale #endif #define PROCESS_ELEM \ workT e1 = srcelem1, zero = (workT)(0); \ - dstelem = convertToDT(e1 == zero ? zero : (srcelem2 * (workT)(scale) / e1)) + storedst(convertToDT(e1 == zero ? zero : (srcelem2 * (workT)(scale) / e1))) #elif defined OP_RECIP_SCALE #undef EXTRA_PARAMS #define EXTRA_PARAMS , scaleT scale #define PROCESS_ELEM \ workT e1 = srcelem1, zero = (workT)(0); \ - dstelem = convertToDT(e1 != zero ? scale / e1 : zero) + storedst(convertToDT(e1 != zero ? scale / e1 : zero)) #elif defined OP_ADDW #undef EXTRA_PARAMS #define EXTRA_PARAMS , scaleT alpha, scaleT beta, scaleT gamma -#define PROCESS_ELEM dstelem = convertToDT(srcelem1*alpha + srcelem2*beta + gamma) +#define PROCESS_ELEM storedst(convertToDT(srcelem1*alpha + srcelem2*beta + gamma)) #elif defined OP_MAG -#define PROCESS_ELEM dstelem = hypot(srcelem1, srcelem2) +#define PROCESS_ELEM storedst(hypot(srcelem1, srcelem2)) #elif defined OP_ABS_NOSAT #define PROCESS_ELEM \ dstT v = convertToDT(srcelem1); \ - dstelem = v >= 0 ? v : -v + storedst(v >= 0 ? v : -v) #elif defined OP_PHASE_RADIANS #define PROCESS_ELEM \ workT tmp = atan2(srcelem2, srcelem1); \ if(tmp < 0) tmp += 6.283185307179586232f; \ - dstelem = tmp + storedst(tmp) #elif defined OP_PHASE_DEGREES #define PROCESS_ELEM \ workT tmp = atan2(srcelem2, srcelem1)*57.29577951308232286465f; \ if(tmp < 0) tmp += 360; \ - dstelem = tmp + storedst(tmp) #elif defined OP_EXP -#define PROCESS_ELEM dstelem = exp(srcelem1) +#define PROCESS_ELEM storedst(exp(srcelem1)) #elif defined OP_POW -#define PROCESS_ELEM dstelem = pow(srcelem1, srcelem2) +#define PROCESS_ELEM storedst(pow(srcelem1, srcelem2)) #elif defined OP_POWN #undef workT #define workT int -#define PROCESS_ELEM dstelem = pown(srcelem1, srcelem2) +#define PROCESS_ELEM storedst(pown(srcelem1, srcelem2)) #elif defined OP_SQRT -#define PROCESS_ELEM dstelem = sqrt(srcelem1) +#define PROCESS_ELEM storedst(sqrt(srcelem1)) #elif defined OP_LOG #define PROCESS_ELEM \ -dstT v = (dstT)(srcelem1);\ -dstelem = v > (dstT)(0) ? log(v) : log(-v) + dstT v = (dstT)(srcelem1);\ + storedst(v > (dstT)(0) ? log(v) : log(-v)) #elif defined OP_CMP #define dstT uchar #define srcT2 srcT1 #define convertToWT1 -#define PROCESS_ELEM dstelem = convert_uchar(srcelem1 CMP_OPERATOR srcelem2 ? 255 : 0) +#define PROCESS_ELEM storedst(convert_uchar(srcelem1 CMP_OPERATOR srcelem2 ? 255 : 0)) #elif defined OP_CONVERT_SCALE_ABS #undef EXTRA_PARAMS #define EXTRA_PARAMS , workT alpha, workT beta #define PROCESS_ELEM \ workT value = srcelem1 * alpha + beta; \ - dstelem = convertToDT(value >= 0 ? value : -value) + storedst(convertToDT(value >= 0 ? value : -value)) #elif defined OP_SCALE_ADD #undef EXTRA_PARAMS #define EXTRA_PARAMS , workT alpha -#define PROCESS_ELEM dstelem = convertToDT(srcelem1 * alpha + srcelem2) +#define PROCESS_ELEM storedst(convertToDT(srcelem1 * alpha + srcelem2)) #elif defined OP_CTP_AD || defined OP_CTP_AR #ifdef OP_CTP_AD @@ -257,8 +298,8 @@ dstelem = v > (dstT)(0) ? log(v) : log(-v) dstT tmp1 = y >= 0 ? CV_PI * 0.5f : CV_PI * 1.5f; \ dstT cartToPolar = y2 <= x2 ? x * y / (x2 + 0.28f * y2 + CV_EPSILON) + tmp : (tmp1 - x * y / (y2 + 0.28f * x2 + CV_EPSILON)); \ TO_DEGREE \ - dstelem = magnitude; \ - dstelem2 = cartToPolar + storedst(magnitude); \ + storedst2(cartToPolar) #elif defined OP_PTC_AD || defined OP_PTC_AR #ifdef OP_PTC_AD @@ -272,15 +313,15 @@ dstelem = v > (dstT)(0) ? log(v) : log(-v) #define PROCESS_ELEM \ dstT x = srcelem1, y = srcelem2; \ FROM_DEGREE; \ - dstelem = cos(alpha) * x; \ - dstelem2 = sin(alpha) * x + storedst(cos(alpha) * x); \ + storedst2(sin(alpha) * x) #elif defined OP_PATCH_NANS #undef EXTRA_PARAMS #define EXTRA_PARAMS , int val #define PROCESS_ELEM \ if (( srcelem1 & 0x7fffffff) > 0x7f800000 ) \ - dstelem = val + storedst(val) #else #error "unknown op type" @@ -290,18 +331,26 @@ dstelem = v > (dstT)(0) ? log(v) : log(-v) #undef EXTRA_PARAMS #define EXTRA_PARAMS , __global uchar* dstptr2, int dststep2, int dstoffset2 #undef EXTRA_INDEX - #define EXTRA_INDEX int dst_index2 = mad24(y, dststep2, x*(int)sizeof(dstT) + dstoffset2) + #define EXTRA_INDEX int dst_index2 = mad24(y, dststep2, x*(int)sizeof(dstT_C1)*cn + dstoffset2) #endif #if defined UNARY_OP || defined MASK_UNARY_OP -#undef srcelem2 + #if defined OP_AND || defined OP_OR || defined OP_XOR || defined OP_ADD || defined OP_SAT_ADD || \ defined OP_SUB || defined OP_SAT_SUB || defined OP_RSUB || defined OP_SAT_RSUB || \ defined OP_ABSDIFF || defined OP_CMP || defined OP_MIN || defined OP_MAX || defined OP_POW || \ defined OP_MUL || defined OP_DIV || defined OP_POWN #undef EXTRA_PARAMS - #define EXTRA_PARAMS , workT srcelem2 + #define EXTRA_PARAMS , workST srcelem2_ + #undef srcelem2 + #define srcelem2 srcelem2_ #endif + +#if cn == 3 +#undef srcelem2 +#define srcelem2 (workT)(srcelem2_.x, srcelem2_.y, srcelem2_.z) +#endif + #endif #if defined BINARY_OP @@ -316,11 +365,11 @@ __kernel void KF(__global const uchar* srcptr1, int srcstep1, int srcoffset1, if (x < cols && y < rows) { - int src1_index = mad24(y, srcstep1, x*(int)sizeof(srcT1) + srcoffset1); + int src1_index = mad24(y, srcstep1, x*(int)sizeof(srcT1_C1)*cn + srcoffset1); #if !(defined(OP_RECIP_SCALE) || defined(OP_NOT)) - int src2_index = mad24(y, srcstep2, x*(int)sizeof(srcT2) + srcoffset2); + int src2_index = mad24(y, srcstep2, x*(int)sizeof(srcT2_C1)*cn + srcoffset2); #endif - int dst_index = mad24(y, dststep, x*(int)sizeof(dstT) + dstoffset); + int dst_index = mad24(y, dststep, x*(int)sizeof(dstT_C1)*cn + dstoffset); EXTRA_INDEX; PROCESS_ELEM; @@ -343,9 +392,9 @@ __kernel void KF(__global const uchar* srcptr1, int srcstep1, int srcoffset1, int mask_index = mad24(y, maskstep, x + maskoffset); if( mask[mask_index] ) { - int src1_index = mad24(y, srcstep1, x*(int)sizeof(srcT1) + srcoffset1); - int src2_index = mad24(y, srcstep2, x*(int)sizeof(srcT2) + srcoffset2); - int dst_index = mad24(y, dststep, x*(int)sizeof(dstT) + dstoffset); + int src1_index = mad24(y, srcstep1, x*(int)sizeof(srcT1_C1)*cn + srcoffset1); + int src2_index = mad24(y, srcstep2, x*(int)sizeof(srcT2_C1)*cn + srcoffset2); + int dst_index = mad24(y, dststep, x*(int)sizeof(dstT_C1)*cn + dstoffset); PROCESS_ELEM; } @@ -363,9 +412,8 @@ __kernel void KF(__global const uchar* srcptr1, int srcstep1, int srcoffset1, if (x < cols && y < rows) { - int src1_index = mad24(y, srcstep1, x*(int)sizeof(srcT1) + srcoffset1); - int dst_index = mad24(y, dststep, x*(int)sizeof(dstT) + dstoffset); - EXTRA_INDEX; + int src1_index = mad24(y, srcstep1, x*(int)sizeof(srcT1_C1)*cn + srcoffset1); + int dst_index = mad24(y, dststep, x*(int)sizeof(dstT_C1)*cn + dstoffset); PROCESS_ELEM; } @@ -386,8 +434,8 @@ __kernel void KF(__global const uchar* srcptr1, int srcstep1, int srcoffset1, int mask_index = mad24(y, maskstep, x + maskoffset); if( mask[mask_index] ) { - int src1_index = mad24(y, srcstep1, x*(int)sizeof(srcT1) + srcoffset1); - int dst_index = mad24(y, dststep, x*(int)sizeof(dstT) + dstoffset); + int src1_index = mad24(y, srcstep1, x*(int)sizeof(srcT1_C1)*cn + srcoffset1); + int dst_index = mad24(y, dststep, x*(int)sizeof(dstT_C1)*cn + dstoffset); PROCESS_ELEM; } -- 2.7.4