optimized cv::norm with 2 args

author Ilya Lavrenov <ilya.lavrenov@itseez.com>

Sat, 7 Jun 2014 11:51:41 +0000 (15:51 +0400)

committer Ilya Lavrenov <ilya.lavrenov@itseez.com>

Sat, 7 Jun 2014 16:45:28 +0000 (20:45 +0400)
author Ilya Lavrenov <ilya.lavrenov@itseez.com>
Sat, 7 Jun 2014 11:51:41 +0000 (15:51 +0400)
committer Ilya Lavrenov <ilya.lavrenov@itseez.com>
Sat, 7 Jun 2014 16:45:28 +0000 (20:45 +0400)
diff --git a/modules/core/src/opencl/minmaxloc.cl b/modules/core/src/opencl/minmaxloc.cl

index 2e48387..e3d87b0 100644 (file)
--- a/modules/core/src/opencl/minmaxloc.cl
+++ b/modules/core/src/opencl/minmaxloc.cl
@@ -73,15 +73,27 @@
  #define CALC_MAX(p, inc)
  #endif
  
+#ifdef OP_CALC2
+#define CALC_MAX2(p) \
+    if (maxval2 < temp.p) \
+        maxval2 = temp.p
+#else
+#define CALC_MAX2(p)
+#endif
+
  #define CALC_P(p, inc) \
      CALC_MIN(p, inc) \
-    CALC_MAX(p, inc)
+    CALC_MAX(p, inc) \
+    CALC_MAX2(p)
  
  __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_offset, int cols,
                          int total, int groupnum, __global uchar * dstptr
  #ifdef HAVE_MASK
                          , __global const uchar * mask, int mask_step, int mask_offset
  #endif
+#ifdef HAVE_SRC2
+                        , __global const uchar * src2ptr, int src2_step, int src2_offset
+#endif
                          )
  {
      int lid = get_local_id(0);
@@ -92,36 +104,46 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off
  #ifdef HAVE_MASK
      mask += mask_offset;
  #endif
+#ifdef HAVE_SRC2
+    src2ptr += src2_offset;
+#endif
  
  #ifdef NEED_MINVAL
      __local dstT1 localmem_min[WGS2_ALIGNED];
+    dstT1 minval = MAX_VAL;
  #ifdef NEED_MINLOC
      __local uint localmem_minloc[WGS2_ALIGNED];
+    uint minloc = INDEX_MAX;
  #endif
  #endif
  #ifdef NEED_MAXVAL
+    dstT1 maxval = MIN_VAL;
      __local dstT1 localmem_max[WGS2_ALIGNED];
  #ifdef NEED_MAXLOC
      __local uint localmem_maxloc[WGS2_ALIGNED];
+    uint maxloc = INDEX_MAX;
+#endif
  #endif
+#ifdef OP_CALC2
+    __local dstT1 localmem_max2[WGS2_ALIGNED];
+    dstT1 maxval2 = MIN_VAL;
  #endif
  
-    dstT1 minval = MAX_VAL, maxval = MIN_VAL;
-    dstT temp;
-    uint minloc = INDEX_MAX, maxloc = INDEX_MAX;
      int src_index;
  #ifdef HAVE_MASK
      int mask_index;
  #endif
+#ifdef HAVE_SRC2
+    int src2_index;
+#endif
  
-    for (int grain = groupnum * WGS * kercn; id < total; id += grain)
-    {
-#ifdef HAVE_SRC_CONT
-        src_index = mul24(id, (int)sizeof(srcT1));
-#else
-        src_index = mad24(id / cols, src_step, mul24(id % cols, (int)sizeof(srcT1)));
+    dstT temp;
+#ifdef HAVE_SRC2
+    dstT temp2;
  #endif
  
+    for (int grain = groupnum * WGS * kercn; id < total; id += grain)
+    {
  #ifdef HAVE_MASK
  #ifdef HAVE_MASK_CONT
          mask_index = id;
@@ -131,7 +153,26 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off
          if (mask[mask_index])
  #endif
          {
+#ifdef HAVE_SRC_CONT
+            src_index = mul24(id, (int)sizeof(srcT1));
+#else
+            src_index = mad24(id / cols, src_step, mul24(id % cols, (int)sizeof(srcT1)));
+#endif
              temp = convertToDT(*(__global const srcT *)(srcptr + src_index));
+#ifdef OP_ABS
+            temp = temp >= (dstT)(0) ? temp : -temp;
+#endif
+
+#ifdef HAVE_SRC2
+#ifdef HAVE_SRC2_CONT
+            src2_index = mul24(id, (int)sizeof(srcT1));
+#else
+            src2_index = mad24(id / cols, src2_step, mul24(id % cols, (int)sizeof(srcT1)));
+#endif
+            temp2 = convertToDT(*(__global const srcT *)(src2ptr + src2_index));
+            temp = temp > temp2 ? temp - temp2 : (temp2 - temp);
+#endif
+
  #if kercn == 1
  #ifdef NEED_MINVAL
              if (minval > temp)
@@ -150,6 +191,11 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off
                  maxloc = id;
  #endif
              }
+#ifdef OP_CALC2
+            temp2 = temp2 >= (dstT)(0) ? temp2 : -temp2;
+            if (maxval2 < temp2)
+                maxval2 = temp2;
+#endif
  #endif
  #elif kercn >= 2
              CALC_P(s0, 0)
@@ -192,6 +238,9 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off
  #ifdef NEED_MAXLOC
          localmem_maxloc[lid] = maxloc;
  #endif
+#ifdef OP_CALC2
+        localmem_max2[lid] = maxval2;
+#endif
      }
      barrier(CLK_LOCAL_MEM_FENCE);
  
@@ -222,6 +271,10 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off
                  localmem_max[lid3] = maxval;
          }
  #endif
+#ifdef OP_CALC2
+        if (localmem_max2[lid3] < maxval2)
+            localmem_max2[lid3] = maxval2;
+#endif
      }
      barrier(CLK_LOCAL_MEM_FENCE);
  
@@ -255,6 +308,10 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off
                      localmem_max[lid] = localmem_max[lid2];
              }
  #endif
+#ifdef OP_CALC2
+            if (localmem_max2[lid] < localmem_max2[lid2])
+                localmem_max2[lid] = localmem_max2[lid2];
+#endif
          }
          barrier(CLK_LOCAL_MEM_FENCE);
      }
@@ -277,5 +334,9 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off
  #ifdef NEED_MAXLOC
          *(__global uint *)(dstptr + mad24(gid, (int)sizeof(uint), pos)) = localmem_maxloc[0];
  #endif
+#ifdef OP_CALC2
+        pos = mad24(groupnum, (int)sizeof(uint), pos);
+        *(__global dstT1 *)(dstptr + mad24(gid, (int)sizeof(dstT1), pos)) = localmem_max2[0];
+#endif
      }
  }
diff --git a/modules/core/src/opencl/reduce.cl b/modules/core/src/opencl/reduce.cl

index 21a5518..d535079 100644 (file)
--- a/modules/core/src/opencl/reduce.cl
+++ b/modules/core/src/opencl/reduce.cl
@@ -109,13 +109,22 @@
  #endif
  
  #ifdef HAVE_MASK
+#ifdef HAVE_SRC2
+#define EXTRA_PARAMS , __global const uchar * mask, int mask_step, int mask_offset, __global const uchar * src2ptr, int src2_step, int src2_offset
+#else
  #define EXTRA_PARAMS , __global const uchar * mask, int mask_step, int mask_offset
+#endif
+#else
+#ifdef HAVE_SRC2
+#define EXTRA_PARAMS , __global const uchar * src2ptr, int src2_step, int src2_offset
  #else
  #define EXTRA_PARAMS
  #endif
+#endif
  
  // accumulative reduction stuff
  #if defined OP_SUM || defined OP_SUM_ABS || defined OP_SUM_SQR || defined OP_DOT
+
  #ifdef OP_DOT
  #if ddepth <= 4
  #define FUNC(a, b, c) a = mad24(b, c, a)
@@ -137,18 +146,48 @@
  #endif
  #endif
  
+#ifdef OP_CALC2
+#define DECLARE_LOCAL_MEM \
+    __local dstT localmem[WGS2_ALIGNED]; \
+    __local dstT localmem2[WGS2_ALIGNED]
+#define DEFINE_ACCUMULATOR \
+    dstT accumulator = (dstT)(0); \
+    dstT accumulator2 = (dstT)(0)
+#else
  #define DECLARE_LOCAL_MEM \
      __local dstT localmem[WGS2_ALIGNED]
  #define DEFINE_ACCUMULATOR \
      dstT accumulator = (dstT)(0)
+#endif
+
+#ifdef HAVE_SRC2
+#ifdef OP_CALC2
+#define PROCESS_ELEMS \
+    dstT temp = convertToDT(loadpix(srcptr + src_index)) - convertToDT(loadpix(src2ptr + src2_index)); \
+    dstT temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
+    temp -= temp2; \
+    temp = temp > (dstT)(0) ? temp : -temp; \
+    FUNC(accumulator2, temp2); \
+    FUNC(accumulator, temp)
+#else
+#define PROCESS_ELEMS \
+    dstT temp = convertToDT(loadpix(srcptr + src_index)); \
+    dstT temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
+    temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
+    FUNC(accumulator, temp)
+#endif
+#else
+#define PROCESS_ELEMS \
+    dstT temp = convertToDT(loadpix(srcptr + src_index)); \
+    FUNC(accumulator, temp)
+#endif
  
  #ifdef HAVE_MASK
  #define REDUCE_GLOBAL \
      MASK_INDEX; \
      if (mask[mask_index]) \
      { \
-        dstT temp = convertToDT(loadpix(srcptr + src_index)); \
-        FUNC(accumulator, temp); \
+        PROCESS_ELEMS; \
      }
  #elif defined OP_DOT
  
@@ -211,7 +250,158 @@
      FUNC(accumulator, temp.sF, temp2.sF)
  #endif
  
-#else
+#else // sum or norm with 2 args
+#ifdef HAVE_SRC2
+#ifdef OP_CALC2 // norm relative
+#if kercn == 1
+#define REDUCE_GLOBAL \
+    dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
+    dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
+    temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
+    FUNC(accumulator, temp); \
+    FUNC(accumulator2, temp2)
+#elif kercn == 2
+#define REDUCE_GLOBAL \
+    dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
+    dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
+    temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
+    FUNC(accumulator, temp.s0); \
+    FUNC(accumulator, temp.s1); \
+    FUNC(accumulator2, temp2.s0); \
+    FUNC(accumulator2, temp2.s1)
+#elif kercn == 4
+#define REDUCE_GLOBAL \
+    dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
+    dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
+    temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
+    FUNC(accumulator, temp.s0); \
+    FUNC(accumulator, temp.s1); \
+    FUNC(accumulator, temp.s2); \
+    FUNC(accumulator, temp.s3); \
+    FUNC(accumulator2, temp2.s0); \
+    FUNC(accumulator2, temp2.s1); \
+    FUNC(accumulator2, temp2.s2); \
+    FUNC(accumulator2, temp2.s3)
+#elif kercn == 8
+#define REDUCE_GLOBAL \
+    dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
+    dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
+    temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
+    FUNC(accumulator, temp.s0); \
+    FUNC(accumulator, temp.s1); \
+    FUNC(accumulator, temp.s2); \
+    FUNC(accumulator, temp.s3); \
+    FUNC(accumulator, temp.s4); \
+    FUNC(accumulator, temp.s5); \
+    FUNC(accumulator, temp.s6); \
+    FUNC(accumulator, temp.s7); \
+    FUNC(accumulator2, temp2.s0); \
+    FUNC(accumulator2, temp2.s1); \
+    FUNC(accumulator2, temp2.s2); \
+    FUNC(accumulator2, temp2.s3); \
+    FUNC(accumulator2, temp2.s4); \
+    FUNC(accumulator2, temp2.s5); \
+    FUNC(accumulator2, temp2.s6); \
+    FUNC(accumulator2, temp2.s7)
+#elif kercn == 16
+#define REDUCE_GLOBAL \
+    dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
+    dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
+    temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
+    FUNC(accumulator, temp.s0); \
+    FUNC(accumulator, temp.s1); \
+    FUNC(accumulator, temp.s2); \
+    FUNC(accumulator, temp.s3); \
+    FUNC(accumulator, temp.s4); \
+    FUNC(accumulator, temp.s5); \
+    FUNC(accumulator, temp.s6); \
+    FUNC(accumulator, temp.s7); \
+    FUNC(accumulator, temp.s8); \
+    FUNC(accumulator, temp.s9); \
+    FUNC(accumulator, temp.sA); \
+    FUNC(accumulator, temp.sB); \
+    FUNC(accumulator, temp.sC); \
+    FUNC(accumulator, temp.sD); \
+    FUNC(accumulator, temp.sE); \
+    FUNC(accumulator, temp.sF); \
+    FUNC(accumulator2, temp2.s0); \
+    FUNC(accumulator2, temp2.s1); \
+    FUNC(accumulator2, temp2.s2); \
+    FUNC(accumulator2, temp2.s3); \
+    FUNC(accumulator2, temp2.s4); \
+    FUNC(accumulator2, temp2.s5); \
+    FUNC(accumulator2, temp2.s6); \
+    FUNC(accumulator2, temp2.s7); \
+    FUNC(accumulator2, temp2.s8); \
+    FUNC(accumulator2, temp2.s9); \
+    FUNC(accumulator2, temp2.sA); \
+    FUNC(accumulator2, temp2.sB); \
+    FUNC(accumulator2, temp2.sC); \
+    FUNC(accumulator2, temp2.sD); \
+    FUNC(accumulator2, temp2.sE); \
+    FUNC(accumulator2, temp2.sF)
+#endif
+#else // norm with 2 args
+#if kercn == 1
+#define REDUCE_GLOBAL \
+    dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
+    dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
+    temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
+    FUNC(accumulator, temp)
+#elif kercn == 2
+#define REDUCE_GLOBAL \
+    dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
+    dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
+    temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
+    FUNC(accumulator, temp.s0); \
+    FUNC(accumulator, temp.s1)
+#elif kercn == 4
+#define REDUCE_GLOBAL \
+    dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
+    dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
+    temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
+    FUNC(accumulator, temp.s0); \
+    FUNC(accumulator, temp.s1); \
+    FUNC(accumulator, temp.s2); \
+    FUNC(accumulator, temp.s3)
+#elif kercn == 8
+#define REDUCE_GLOBAL \
+    dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
+    dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
+    temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
+    FUNC(accumulator, temp.s0); \
+    FUNC(accumulator, temp.s1); \
+    FUNC(accumulator, temp.s2); \
+    FUNC(accumulator, temp.s3); \
+    FUNC(accumulator, temp.s4); \
+    FUNC(accumulator, temp.s5); \
+    FUNC(accumulator, temp.s6); \
+    FUNC(accumulator, temp.s7)
+#elif kercn == 16
+#define REDUCE_GLOBAL \
+    dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
+    dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
+    temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
+    FUNC(accumulator, temp.s0); \
+    FUNC(accumulator, temp.s1); \
+    FUNC(accumulator, temp.s2); \
+    FUNC(accumulator, temp.s3); \
+    FUNC(accumulator, temp.s4); \
+    FUNC(accumulator, temp.s5); \
+    FUNC(accumulator, temp.s6); \
+    FUNC(accumulator, temp.s7); \
+    FUNC(accumulator, temp.s8); \
+    FUNC(accumulator, temp.s9); \
+    FUNC(accumulator, temp.sA); \
+    FUNC(accumulator, temp.sB); \
+    FUNC(accumulator, temp.sC); \
+    FUNC(accumulator, temp.sD); \
+    FUNC(accumulator, temp.sE); \
+    FUNC(accumulator, temp.sF)
+#endif
+#endif
+
+#else // sum
  #if kercn == 1
  #define REDUCE_GLOBAL \
      dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
@@ -260,6 +450,7 @@
      FUNC(accumulator, temp.sF)
  #endif
  #endif
+#endif
  
  #define SET_LOCAL_1 \
      localmem[lid] = accumulator
@@ -325,6 +516,20 @@
      accumulator += value.sF == zero ? zero : one
  #endif
  
+#ifdef OP_CALC2
+#define SET_LOCAL_1 \
+    localmem[lid] = accumulator; \
+    localmem2[lid] = accumulator2; \
+#define REDUCE_LOCAL_1 \
+    localmem[lid - WGS2_ALIGNED] += accumulator; \
+    localmem2[lid - WGS2_ALIGNED] += accumulator2
+#define REDUCE_LOCAL_2 \
+    localmem[lid] += localmem[lid2]; \
+    localmem2[lid] += localmem2[lid2]
+#define CALC_RESULT \
+    storepix(localmem[0], dstptr + dstTSIZE * gid); \
+    storepix(localmem2[0], dstptr + mad24(groupnum, srcTSIZE, dstTSIZE * gid))
+#else
  #define SET_LOCAL_1 \
      localmem[lid] = accumulator
  #define REDUCE_LOCAL_1 \
@@ -333,6 +538,7 @@
      localmem[lid] += localmem[lid2]
  #define CALC_RESULT \
      storepix(localmem[0], dstptr + dstTSIZE * gid)
+#endif
  
  // norm (NORM_INF) with cn > 1 and mask
  #elif defined OP_NORM_INF_MASK
@@ -385,6 +591,13 @@ __kernel void reduce(__global const uchar * srcptr, int src_step, int src_offset
  #else
          int src_index = mad24(id / cols, src_step, mul24(id % cols, srcTSIZE));
  #endif
+#ifdef HAVE_SRC2
+#ifdef HAVE_SRC2_CONT
+        int src2_index = mul24(id, srcTSIZE);
+#else
+        int src2_index = mad24(id / cols, src2_step, mul24(id % cols, srcTSIZE));
+#endif
+#endif
          REDUCE_GLOBAL;
      }
  
diff --git a/modules/core/src/stat.cpp b/modules/core/src/stat.cpp

index 8996c48..b405d6f 100644 (file)
--- a/modules/core/src/stat.cpp
+++ b/modules/core/src/stat.cpp
@@ -469,21 +469,25 @@ template <typename T> Scalar ocl_part_sum(Mat m)
  
  enum { OCL_OP_SUM = 0, OCL_OP_SUM_ABS =  1, OCL_OP_SUM_SQR = 2 };
  
-static bool ocl_sum( InputArray _src, Scalar & res, int sum_op, InputArray _mask = noArray() )
+static bool ocl_sum( InputArray _src, Scalar & res, int sum_op, InputArray _mask = noArray(),
+                     InputArray _src2 = noArray(), bool calc2 = false, const Scalar & res2 = Scalar() )
  {
      CV_Assert(sum_op == OCL_OP_SUM || sum_op == OCL_OP_SUM_ABS || sum_op == OCL_OP_SUM_SQR);
  
-    bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0,
-            haveMask = _mask.kind() != _InputArray::NONE;
+    const ocl::Device & dev = ocl::Device::getDefault();
+    bool doubleSupport = dev.doubleFPConfig() > 0,
+        haveMask = _mask.kind() != _InputArray::NONE,
+        haveSrc2 = _src2.kind() != _InputArray::NONE;
      int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type),
              kercn = cn == 1 && !haveMask ? ocl::predictOptimalVectorWidth(_src) : 1,
              mcn = std::max(cn, kercn);
+    CV_Assert(!haveSrc2 || _src2.type() == type);
  
      if ( (!doubleSupport && depth == CV_64F) || cn > 4 )
          return false;
  
-    int dbsize = ocl::Device::getDefault().maxComputeUnits();
-    size_t wgs = ocl::Device::getDefault().maxWorkGroupSize();
+    int ngroups = dev.maxComputeUnits(), dbsize = ngroups * (calc2 ? 2 : 1);
+    size_t wgs = dev.maxWorkGroupSize();
  
      int ddepth = std::max(sum_op == OCL_OP_SUM_SQR ? CV_32F : CV_32S, depth),
              dtype = CV_MAKE_TYPE(ddepth, cn);
@@ -497,7 +501,7 @@ static bool ocl_sum( InputArray _src, Scalar & res, int sum_op, InputArray _mask
      static const char * const opMap[3] = { "OP_SUM", "OP_SUM_ABS", "OP_SUM_SQR" };
      char cvt[40];
      String opts = format("-D srcT=%s -D srcT1=%s -D dstT=%s -D dstTK=%s -D dstT1=%s -D ddepth=%d -D cn=%d"
-                         " -D convertToDT=%s -D %s -D WGS=%d -D WGS2_ALIGNED=%d%s%s%s%s -D kercn=%d",
+                         " -D convertToDT=%s -D %s -D WGS=%d -D WGS2_ALIGNED=%d%s%s%s%s -D kercn=%d%s%s%s",
                           ocl::typeToStr(CV_MAKE_TYPE(depth, mcn)), ocl::typeToStr(depth),
                           ocl::typeToStr(dtype), ocl::typeToStr(CV_MAKE_TYPE(ddepth, mcn)),
                           ocl::typeToStr(ddepth), ddepth, cn,
@@ -506,30 +510,49 @@ static bool ocl_sum( InputArray _src, Scalar & res, int sum_op, InputArray _mask
                           doubleSupport ? " -D DOUBLE_SUPPORT" : "",
                           haveMask ? " -D HAVE_MASK" : "",
                           _src.isContinuous() ? " -D HAVE_SRC_CONT" : "",
-                         _mask.isContinuous() ? " -D HAVE_MASK_CONT" : "", kercn);
+                         haveMask && _mask.isContinuous() ? " -D HAVE_MASK_CONT" : "", kercn,
+                         haveSrc2 ? " -D HAVE_SRC2" : "", calc2 ? " -D OP_CALC2" : "",
+                         haveSrc2 && _src2.isContinuous() ? " -D HAVE_SRC2_CONT" : "");
  
      ocl::Kernel k("reduce", ocl::core::reduce_oclsrc, opts);
      if (k.empty())
          return false;
  
-    UMat src = _src.getUMat(), db(1, dbsize, dtype), mask = _mask.getUMat();
+    UMat src = _src.getUMat(), src2 = _src2.getUMat(),
+        db(1, dbsize, dtype), mask = _mask.getUMat();
  
      ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src),
              dbarg = ocl::KernelArg::PtrWriteOnly(db),
-            maskarg = ocl::KernelArg::ReadOnlyNoSize(mask);
+            maskarg = ocl::KernelArg::ReadOnlyNoSize(mask),
+            src2arg = ocl::KernelArg::ReadOnlyNoSize(src2);
  
      if (haveMask)
-        k.args(srcarg, src.cols, (int)src.total(), dbsize, dbarg, maskarg);
+    {
+        if (haveSrc2)
+            k.args(srcarg, src.cols, (int)src.total(), ngroups, dbarg, maskarg, src2arg);
+        else
+            k.args(srcarg, src.cols, (int)src.total(), ngroups, dbarg, maskarg);
+    }
      else
-        k.args(srcarg, src.cols, (int)src.total(), dbsize, dbarg);
+    {
+        if (haveSrc2)
+            k.args(srcarg, src.cols, (int)src.total(), ngroups, dbarg, src2arg);
+        else
+            k.args(srcarg, src.cols, (int)src.total(), ngroups, dbarg);
+    }
  
-    size_t globalsize = dbsize * wgs;
+    size_t globalsize = ngroups * wgs;
      if (k.run(1, &globalsize, &wgs, false))
      {
          typedef Scalar (*part_sum)(Mat m);
          part_sum funcs[3] = { ocl_part_sum<int>, ocl_part_sum<float>, ocl_part_sum<double> },
                  func = funcs[ddepth - CV_32S];
-        res = func(db.getMat(ACCESS_READ));
+
+        Mat mres = db.getMat(ACCESS_READ);
+        if (calc2)
+            const_cast<Scalar &>(res2) = func(mres.colRange(dbsize, dbsize));
+
+        res = func(mres.colRange(0, dbsize));
          return true;
      }
      return false;
@@ -1396,18 +1419,21 @@ typedef void (*getMinMaxResFunc)(const Mat & db, double *minVal, double *maxVal,
                                   int *minLoc, int *maxLoc, int gropunum, int cols);
  
  static bool ocl_minMaxIdx( InputArray _src, double* minVal, double* maxVal, int* minLoc, int* maxLoc, InputArray _mask,
-                           int ddepth = -1, bool absValues = false)
+                           int ddepth = -1, bool absValues = false, InputArray _src2 = noArray(), bool calc2 = false)
  {
      CV_Assert( (_src.channels() == 1 && (_mask.empty() || _mask.type() == CV_8U)) ||
          (_src.channels() >= 1 && _mask.empty() && !minLoc && !maxLoc) );
  
      const ocl::Device & dev = ocl::Device::getDefault();
-    bool doubleSupport = dev.doubleFPConfig() > 0, haveMask = !_mask.empty();
+    bool doubleSupport = dev.doubleFPConfig() > 0, haveMask = !_mask.empty(),
+        haveSrc2 = _src2.kind() != _InputArray::NONE;
      int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type),
              kercn = haveMask ? 1 : std::min(4, ocl::predictOptimalVectorWidth(_src));
      if (ddepth < 0)
          ddepth = depth;
  
+    CV_Assert(!haveSrc2 || _src2.type() == type);
+
      if ((depth == CV_64F || ddepth == CV_64F) && !doubleSupport)
          return false;
  
@@ -1435,7 +1461,7 @@ static bool ocl_minMaxIdx( InputArray _src, double* minVal, double* maxVal, int*
      char cvt[40];
      String opts = format("-D DEPTH_%d -D srcT1=%s%s -D WGS=%d -D srcT=%s"
                           " -D WGS2_ALIGNED=%d%s%s%s -D kercn=%d%s%s%s%s"
-                         " -D dstT1=%s -D dstT=%s -D convertToDT=%s%s",
+                         " -D dstT1=%s -D dstT=%s -D convertToDT=%s%s%s%s%s",
                           depth, ocl::typeToStr(depth), haveMask ? " -D HAVE_MASK" : "", (int)wgs,
                           ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)), wgs2_aligned,
                           doubleSupport ? " -D DOUBLE_SUPPORT" : "",
@@ -1444,7 +1470,9 @@ static bool ocl_minMaxIdx( InputArray _src, double* minVal, double* maxVal, int*
                           needMinVal ? " -D NEED_MINVAL" : "", needMaxVal ? " -D NEED_MAXVAL" : "",
                           needMinLoc ? " -D NEED_MINLOC" : "", needMaxLoc ? " -D NEED_MAXLOC" : "",
                           ocl::typeToStr(ddepth), ocl::typeToStr(CV_MAKE_TYPE(ddepth, kercn)),
-                         ocl::convertTypeStr(depth, ddepth, kercn, cvt), absValues ? " -D OP_ABS" : "");
+                         ocl::convertTypeStr(depth, ddepth, kercn, cvt), absValues ? " -D OP_ABS" : "",
+                         haveSrc2 ? " -D HAVE_SRC2" : "", calc2 ? " -D OP_CALC2" : "",
+                         haveSrc2 && _src2.isContinuous() ? " -D HAVE_SRC2_CONT" : "");
  
      ocl::Kernel k("minmaxloc", ocl::core::minmaxloc_oclsrc, opts);
      if (k.empty())
@@ -1452,18 +1480,35 @@ static bool ocl_minMaxIdx( InputArray _src, double* minVal, double* maxVal, int*
  
      int esz = CV_ELEM_SIZE(ddepth), esz32s = CV_ELEM_SIZE1(CV_32S),
              dbsize = groupnum * ((needMinVal ? esz : 0) + (needMaxVal ? esz : 0) +
-                                 (needMinLoc ? esz32s : 0) + (needMaxLoc ? esz32s : 0));
-    UMat src = _src.getUMat(), db(1, dbsize, CV_8UC1), mask = _mask.getUMat();
+                                 (needMinLoc ? esz32s : 0) + (needMaxLoc ? esz32s : 0) +
+                                 (calc2 ? esz : 0));
+    UMat src = _src.getUMat(), src2 = _src2.getUMat(), db(1, dbsize, CV_8UC1), mask = _mask.getUMat();
  
      if (cn > 1)
+    {
          src = src.reshape(1);
+        src2 = src2.reshape(1);
+    }
  
-    if (!haveMask)
-        k.args(ocl::KernelArg::ReadOnlyNoSize(src), src.cols, (int)src.total(),
-               groupnum, ocl::KernelArg::PtrWriteOnly(db));
+    if (haveSrc2)
+    {
+        if (!haveMask)
+            k.args(ocl::KernelArg::ReadOnlyNoSize(src), src.cols, (int)src.total(),
+                   groupnum, ocl::KernelArg::PtrWriteOnly(db), ocl::KernelArg::ReadOnlyNoSize(src2));
+        else
+            k.args(ocl::KernelArg::ReadOnlyNoSize(src), src.cols, (int)src.total(),
+                   groupnum, ocl::KernelArg::PtrWriteOnly(db), ocl::KernelArg::ReadOnlyNoSize(mask),
+                   ocl::KernelArg::ReadOnlyNoSize(src2));
+    }
      else
-        k.args(ocl::KernelArg::ReadOnlyNoSize(src), src.cols, (int)src.total(),
-               groupnum, ocl::KernelArg::PtrWriteOnly(db), ocl::KernelArg::ReadOnlyNoSize(mask));
+    {
+        if (!haveMask)
+            k.args(ocl::KernelArg::ReadOnlyNoSize(src), src.cols, (int)src.total(),
+                   groupnum, ocl::KernelArg::PtrWriteOnly(db));
+        else
+            k.args(ocl::KernelArg::ReadOnlyNoSize(src), src.cols, (int)src.total(),
+                   groupnum, ocl::KernelArg::PtrWriteOnly(db), ocl::KernelArg::ReadOnlyNoSize(mask));
+    }
  
      size_t globalsize = groupnum * wgs;
      if (!k.run(1, &globalsize, &wgs, false))
@@ -2498,38 +2543,45 @@ namespace cv {
  
  static bool ocl_norm( InputArray _src1, InputArray _src2, int normType, InputArray _mask, double & result )
  {
-    const ocl::Device & d = ocl::Device::getDefault();
-    int type = _src1.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), rowsPerWI = d.isIntel() ? 4 : 1;
-    bool doubleSupport = d.doubleFPConfig() > 0;
-    bool relative = (normType & NORM_RELATIVE) != 0;
+    Scalar sc1, sc2;
+    int type = _src1.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
+    bool relative = (normType & NORM_RELATIVE) != 0,
+        normsum = normType == NORM_L1 || normType == NORM_L2 || normType == NORM_L2SQR;
      normType &= ~NORM_RELATIVE;
  
-    if ( !(normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2 || normType == NORM_L2SQR) ||
-         (!doubleSupport && depth == CV_64F))
+    if ( !(normType == NORM_INF || normsum) )
          return false;
  
-    int wdepth = std::max(CV_32S, depth);
-    char cvt[50];
-    ocl::Kernel k("KF", ocl::core::arithm_oclsrc,
-                  format("-D BINARY_OP -D OP_ABSDIFF -D dstT=%s -D workT=dstT -D srcT1=%s -D srcT2=srcT1"
-                         " -D convertToDT=%s -D convertToWT1=convertToDT -D convertToWT2=convertToDT -D rowsPerWI=%d%s",
-                         ocl::typeToStr(wdepth), ocl::typeToStr(depth),
-                         ocl::convertTypeStr(depth, wdepth, 1, cvt), rowsPerWI,
-                         doubleSupport ? " -D DOUBLE_SUPPORT" : ""));
-    if (k.empty())
-        return false;
+    if (normsum)
+    {
+        if (!ocl_sum(_src1, sc1, normType == NORM_L2 || normType == NORM_L2SQR ?
+                     OCL_OP_SUM_SQR : OCL_OP_SUM, _mask, _src2, relative, sc2))
+            return false;
+    }
+    else
+    {
+        if (!ocl_minMaxIdx(_src1, NULL, &result, NULL, NULL, _mask, std::max(CV_32S, depth),
+                           false, _src2, relative))
+            return false;
+    }
  
-    UMat src1 = _src1.getUMat(), src2 = _src2.getUMat(), diff(src1.size(), CV_MAKE_TYPE(wdepth, cn));
-    k.args(ocl::KernelArg::ReadOnlyNoSize(src1), ocl::KernelArg::ReadOnlyNoSize(src2),
-           ocl::KernelArg::WriteOnly(diff, cn));
+    double s2 = 0;
+    for (int i = 0; i < cn; ++i)
+    {
+        result += sc1[i];
+        if (relative)
+            s2 += sc2[i];
+    }
  
-    size_t globalsize[2] = { diff.cols * cn, (diff.rows + rowsPerWI - 1) / rowsPerWI };
-    if (!k.run(2, globalsize, NULL, false))
-        return false;
+    if (normType == NORM_L2)
+    {
+        result = std::sqrt(result);
+        if (relative)
+            s2 = std::sqrt(s2);
+    }
  
-    result = cv::norm(diff, normType, _mask);
      if (relative)
-        result /= cv::norm(src2, normType, _mask) + DBL_EPSILON;
+        result /= (s2 + DBL_EPSILON);
  
      return true;
  }
author	Ilya Lavrenov <ilya.lavrenov@itseez.com>
	Sat, 7 Jun 2014 11:51:41 +0000 (15:51 +0400)
committer	Ilya Lavrenov <ilya.lavrenov@itseez.com>
	Sat, 7 Jun 2014 16:45:28 +0000 (20:45 +0400)
modules/core/src/opencl/minmaxloc.cl		patch \| blob \| history
modules/core/src/opencl/reduce.cl		patch \| blob \| history
modules/core/src/stat.cpp		patch \| blob \| history