multiple rows in KF kernel

author Ilya Lavrenov <ilya.lavrenov@itseez.com>

Wed, 14 May 2014 11:42:30 +0000 (15:42 +0400)

committer Ilya Lavrenov <ilya.lavrenov@itseez.com>

Mon, 26 May 2014 09:19:06 +0000 (12:19 +0300)
author Ilya Lavrenov <ilya.lavrenov@itseez.com>
Wed, 14 May 2014 11:42:30 +0000 (15:42 +0400)
committer Ilya Lavrenov <ilya.lavrenov@itseez.com>
Mon, 26 May 2014 09:19:06 +0000 (12:19 +0300)
diff --git a/modules/core/src/arithm.cpp b/modules/core/src/arithm.cpp

index 8e01de1..87e2514 100644 (file)
--- a/modules/core/src/arithm.cpp
+++ b/modules/core/src/arithm.cpp
@@ -1008,7 +1008,8 @@ static bool ocl_binary_op(InputArray _src1, InputArray _src2, OutputArray _dst,
      int srcdepth = CV_MAT_DEPTH(srctype);
      int cn = CV_MAT_CN(srctype);
  
-    bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
+    const ocl::Device d = ocl::Device::getDefault();
+    bool doubleSupport = d.doubleFPConfig() > 0;
      if( oclop < 0 || ((haveMask || haveScalar) && cn > 4) ||
              (!doubleSupport && srcdepth == CV_64F && !bitwise))
          return false;
@@ -1016,8 +1017,9 @@ static bool ocl_binary_op(InputArray _src1, InputArray _src2, OutputArray _dst,
      char opts[1024];
      int kercn = haveMask || haveScalar ? cn : ocl::predictOptimalVectorWidth(_src1, _src2, _dst);
      int scalarcn = kercn == 3 ? 4 : kercn;
+    int rowsPerWI = d.isIntel() ? 4 : 1;
  
-    sprintf(opts, "-D %s%s -D %s -D dstT=%s%s -D dstT_C1=%s -D workST=%s -D cn=%d",
+    sprintf(opts, "-D %s%s -D %s -D dstT=%s%s -D dstT_C1=%s -D workST=%s -D cn=%d -D rowsPerWI=%d",
              haveMask ? "MASK_" : "", haveScalar ? "UNARY_OP" : "BINARY_OP", oclop2str[oclop],
              bitwise ? ocl::memopTypeToStr(CV_MAKETYPE(srcdepth, kercn)) :
                  ocl::typeToStr(CV_MAKETYPE(srcdepth, kercn)), doubleSupport ? " -D DOUBLE_SUPPORT" : "",
@@ -1025,7 +1027,7 @@ static bool ocl_binary_op(InputArray _src1, InputArray _src2, OutputArray _dst,
                  ocl::typeToStr(CV_MAKETYPE(srcdepth, 1)),
              bitwise ? ocl::memopTypeToStr(CV_MAKETYPE(srcdepth, scalarcn)) :
                  ocl::typeToStr(CV_MAKETYPE(srcdepth, scalarcn)),
-            kercn);
+            kercn, rowsPerWI);
  
      ocl::Kernel k("KF", ocl::core::arithm_oclsrc, opts);
      if (k.empty())
@@ -1068,7 +1070,7 @@ static bool ocl_binary_op(InputArray _src1, InputArray _src2, OutputArray _dst,
              k.args(src1arg, src2arg, maskarg, dstarg);
      }
  
-    size_t globalsize[] = { src1.cols * cn / kercn, src1.rows };
+    size_t globalsize[] = { src1.cols * cn / kercn, (src1.rows + rowsPerWI - 1) / rowsPerWI };
      return k.run(2, globalsize, 0, false);
  }
  
@@ -1371,7 +1373,8 @@ static bool ocl_arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
                            void* usrdata, int oclop,
                            bool haveScalar )
  {
-    bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
+    const ocl::Device d = ocl::Device::getDefault();
+    bool doubleSupport = d.doubleFPConfig() > 0;
      int type1 = _src1.type(), depth1 = CV_MAT_DEPTH(type1), cn = CV_MAT_CN(type1);
      bool haveMask = !_mask.empty();
  
@@ -1388,12 +1391,12 @@ static bool ocl_arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
          return false;
  
      int kercn = haveMask || haveScalar ? cn : ocl::predictOptimalVectorWidth(_src1, _src2, _dst);
-    int scalarcn = kercn == 3 ? 4 : kercn;
+    int scalarcn = kercn == 3 ? 4 : kercn, rowsPerWI = d.isIntel() ? 4 : 1;
  
      char cvtstr[4][32], opts[1024];
      sprintf(opts, "-D %s%s -D %s -D srcT1=%s -D srcT1_C1=%s -D srcT2=%s -D srcT2_C1=%s "
              "-D dstT=%s -D dstT_C1=%s -D workT=%s -D workST=%s -D scaleT=%s -D wdepth=%d -D convertToWT1=%s "
-            "-D convertToWT2=%s -D convertToDT=%s%s -D cn=%d",
+            "-D convertToWT2=%s -D convertToDT=%s%s -D cn=%d -D rowsPerWI=%d",
              (haveMask ? "MASK_" : ""), (haveScalar ? "UNARY_OP" : "BINARY_OP"),
              oclop2str[oclop], ocl::typeToStr(CV_MAKETYPE(depth1, kercn)),
              ocl::typeToStr(depth1), ocl::typeToStr(CV_MAKETYPE(depth2, kercn)),
@@ -1404,7 +1407,7 @@ static bool ocl_arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
              ocl::convertTypeStr(depth1, wdepth, kercn, cvtstr[0]),
              ocl::convertTypeStr(depth2, wdepth, kercn, cvtstr[1]),
              ocl::convertTypeStr(wdepth, ddepth, kercn, cvtstr[2]),
-            doubleSupport ? " -D DOUBLE_SUPPORT" : "", kercn);
+            doubleSupport ? " -D DOUBLE_SUPPORT" : "", kercn, rowsPerWI);
  
      size_t usrdata_esz = CV_ELEM_SIZE(wdepth);
      const uchar* usrdata_p = (const uchar*)usrdata;
@@ -1478,7 +1481,7 @@ static bool ocl_arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
              k.args(src1arg, src2arg, maskarg, dstarg);
      }
  
-    size_t globalsize[] = { src1.cols * cn / kercn, src1.rows };
+    size_t globalsize[] = { src1.cols * cn / kercn, (src1.rows + rowsPerWI - 1) / rowsPerWI };
      return k.run(2, globalsize, NULL, false);
  }
  
@@ -2764,7 +2767,7 @@ static bool ocl_compare(InputArray _src1, InputArray _src2, OutputArray _dst, in
      if (!haveScalar && (!_src1.sameSize(_src2) || type1 != type2))
              return false;
  
-    int kercn = haveScalar ? cn : ocl::predictOptimalVectorWidth(_src1, _src2, _dst);
+    int kercn = haveScalar ? cn : ocl::predictOptimalVectorWidth(_src1, _src2, _dst), rowsPerWI = dev.isIntel() ? 4 : 1;
      // Workaround for bug with "?:" operator in AMD OpenCL compiler
      if (depth1 >= CV_16U)
          kercn = 1;
@@ -2775,14 +2778,14 @@ static bool ocl_compare(InputArray _src1, InputArray _src2, OutputArray _dst, in
  
      String opts = format("-D %s -D srcT1=%s -D dstT=%s -D workT=srcT1 -D cn=%d"
                           " -D convertToDT=%s -D OP_CMP -D CMP_OPERATOR=%s -D srcT1_C1=%s"
-                         " -D srcT2_C1=%s -D dstT_C1=%s -D workST=%s%s",
+                         " -D srcT2_C1=%s -D dstT_C1=%s -D workST=%s -D rowsPerWI=%d%s",
                           haveScalar ? "UNARY_OP" : "BINARY_OP",
                           ocl::typeToStr(CV_MAKE_TYPE(depth1, kercn)),
                           ocl::typeToStr(CV_8UC(kercn)), kercn,
                           ocl::convertTypeStr(depth1, CV_8U, kercn, cvt),
                           operationMap[op], ocl::typeToStr(depth1),
                           ocl::typeToStr(depth1), ocl::typeToStr(CV_8U),
-                         ocl::typeToStr(CV_MAKE_TYPE(depth1, scalarcn)),
+                         ocl::typeToStr(CV_MAKE_TYPE(depth1, scalarcn)), rowsPerWI,
                           doubleSupport ? " -D DOUBLE_SUPPORT" : "");
  
      ocl::Kernel k("KF", ocl::core::arithm_oclsrc, opts);
@@ -2839,7 +2842,7 @@ static bool ocl_compare(InputArray _src1, InputArray _src2, OutputArray _dst, in
                 ocl::KernelArg::WriteOnly(dst, cn, kercn));
      }
  
-    size_t globalsize[2] = { dst.cols * cn / kercn, dst.rows };
+    size_t globalsize[2] = { dst.cols * cn / kercn, (dst.rows + rowsPerWI - 1) / rowsPerWI };
      return k.run(2, globalsize, NULL, false);
  }
  
diff --git a/modules/core/src/convert.cpp b/modules/core/src/convert.cpp

index d88e422..7fc3176 100644 (file)
--- a/modules/core/src/convert.cpp
+++ b/modules/core/src/convert.cpp
@@ -1357,9 +1357,10 @@ static BinaryFunc getConvertScaleFunc(int sdepth, int ddepth)
  
  static bool ocl_convertScaleAbs( InputArray _src, OutputArray _dst, double alpha, double beta )
  {
+    const ocl::Device & d = ocl::Device::getDefault();
      int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type),
-        kercn = ocl::predictOptimalVectorWidth(_src, _dst);
-    bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
+        kercn = ocl::predictOptimalVectorWidth(_src, _dst), rowsPerWI = d.isIntel() ? 4 : 1;
+    bool doubleSupport = d.doubleFPConfig() > 0;
  
      if (!doubleSupport && depth == CV_64F)
          return false;
@@ -1368,13 +1369,14 @@ static bool ocl_convertScaleAbs( InputArray _src, OutputArray _dst, double alpha
      int wdepth = std::max(depth, CV_32F);
      ocl::Kernel k("KF", ocl::core::arithm_oclsrc,
                    format("-D OP_CONVERT_SCALE_ABS -D UNARY_OP -D dstT=%s -D srcT1=%s"
-                         " -D workT=%s -D wdepth=%d -D convertToWT1=%s -D convertToDT=%s -D workT1=%s%s",
+                         " -D workT=%s -D wdepth=%d -D convertToWT1=%s -D convertToDT=%s"
+                         " -D workT1=%s -D rowsPerWI=%d%s",
                           ocl::typeToStr(CV_8UC(kercn)),
                           ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)),
                           ocl::typeToStr(CV_MAKE_TYPE(wdepth, kercn)), wdepth,
                           ocl::convertTypeStr(depth, wdepth, kercn, cvt[0]),
                           ocl::convertTypeStr(wdepth, CV_8U, kercn, cvt[1]),
-                         ocl::typeToStr(wdepth),
+                         ocl::typeToStr(wdepth), rowsPerWI,
                           doubleSupport ? " -D DOUBLE_SUPPORT" : ""));
      if (k.empty())
          return false;
@@ -1391,7 +1393,7 @@ static bool ocl_convertScaleAbs( InputArray _src, OutputArray _dst, double alpha
      else if (wdepth == CV_64F)
          k.args(srcarg, dstarg, alpha, beta);
  
-    size_t globalsize[2] = { src.cols * cn / kercn, src.rows };
+    size_t globalsize[2] = { src.cols * cn / kercn, (src.rows + rowsPerWI - 1) / rowsPerWI };
      return k.run(2, globalsize, NULL, false);
  }
  
diff --git a/modules/core/src/mathfuncs.cpp b/modules/core/src/mathfuncs.cpp

index f045eec..1893214 100644 (file)
--- a/modules/core/src/mathfuncs.cpp
+++ b/modules/core/src/mathfuncs.cpp
@@ -65,13 +65,15 @@ static bool ocl_math_op(InputArray _src1, InputArray _src2, OutputArray _dst, in
      int kercn = oclop == OCL_OP_PHASE_DEGREES ||
              oclop == OCL_OP_PHASE_RADIANS ? 1 : ocl::predictOptimalVectorWidth(_src1, _src2, _dst);
  
-    bool double_support = ocl::Device::getDefault().doubleFPConfig() > 0;
+    const ocl::Device d = ocl::Device::getDefault();
+    bool double_support = d.doubleFPConfig() > 0;
      if (!double_support && depth == CV_64F)
          return false;
+    int rowsPerWI = d.isIntel() ? 4 : 1;
  
      ocl::Kernel k("KF", ocl::core::arithm_oclsrc,
-                  format("-D %s -D %s -D dstT=%s%s", _src2.empty() ? "UNARY_OP" : "BINARY_OP",
-                         oclop2str[oclop], ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)),
+                  format("-D %s -D %s -D dstT=%s -D rowsPerWI=%d%s", _src2.empty() ? "UNARY_OP" : "BINARY_OP",
+                         oclop2str[oclop], ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)), rowsPerWI,
                           double_support ? " -D DOUBLE_SUPPORT" : ""));
      if (k.empty())
          return false;
@@ -89,7 +91,7 @@ static bool ocl_math_op(InputArray _src1, InputArray _src2, OutputArray _dst, in
      else
          k.args(src1arg, src2arg, dstarg);
  
-    size_t globalsize[] = { src1.cols * cn / kercn, src1.rows };
+    size_t globalsize[] = { src1.cols * cn / kercn, (src1.rows + rowsPerWI - 1) / rowsPerWI };
      return k.run(2, globalsize, 0, false);
  }
  
@@ -524,8 +526,10 @@ void phase( InputArray src1, InputArray src2, OutputArray dst, bool angleInDegre
  static bool ocl_cartToPolar( InputArray _src1, InputArray _src2,
                               OutputArray _dst1, OutputArray _dst2, bool angleInDegrees )
  {
-    int type = _src1.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
-    bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
+    const ocl::Device & d = ocl::Device::getDefault();
+    int type = _src1.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type),
+            rowsPerWI = d.isIntel() ? 4 : 1;
+    bool doubleSupport = d.doubleFPConfig() > 0;
  
      if ( !(_src1.dims() <= 2 && _src2.dims() <= 2 &&
             (depth == CV_32F || depth == CV_64F) && type == _src2.type()) ||
@@ -533,9 +537,9 @@ static bool ocl_cartToPolar( InputArray _src1, InputArray _src2,
          return false;
  
      ocl::Kernel k("KF", ocl::core::arithm_oclsrc,
-                  format("-D BINARY_OP -D dstT=%s -D depth=%d -D OP_CTP_%s%s",
+                  format("-D BINARY_OP -D dstT=%s -D depth=%d -D rowsPerWI=%d -D OP_CTP_%s%s",
                           ocl::typeToStr(CV_MAKE_TYPE(depth, 1)),
-                         depth, angleInDegrees ? "AD" : "AR",
+                         depth, rowsPerWI, angleInDegrees ? "AD" : "AR",
                           doubleSupport ? " -D DOUBLE_SUPPORT" : ""));
      if (k.empty())
          return false;
@@ -553,7 +557,7 @@ static bool ocl_cartToPolar( InputArray _src1, InputArray _src2,
             ocl::KernelArg::WriteOnly(dst1, cn),
             ocl::KernelArg::WriteOnlyNoSize(dst2));
  
-    size_t globalsize[2] = { dst1.cols * cn, dst1.rows };
+    size_t globalsize[2] = { dst1.cols * cn, (dst1.rows + rowsPerWI - 1) / rowsPerWI };
      return k.run(2, globalsize, NULL, false);
  }
  
@@ -713,16 +717,18 @@ static void SinCos_32f( const float *angle, float *sinval, float* cosval,
  static bool ocl_polarToCart( InputArray _mag, InputArray _angle,
                               OutputArray _dst1, OutputArray _dst2, bool angleInDegrees )
  {
-    int type = _angle.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
-    bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
+    const ocl::Device & d = ocl::Device::getDefault();
+    int type = _angle.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type),
+            rowsPerWI = d.isIntel() ? 4 : 1;
+    bool doubleSupport = d.doubleFPConfig() > 0;
  
      if ( !doubleSupport && depth == CV_64F )
          return false;
  
      ocl::Kernel k("KF", ocl::core::arithm_oclsrc,
-                  format("-D dstT=%s -D depth=%d -D BINARY_OP -D OP_PTC_%s%s",
-                         ocl::typeToStr(CV_MAKE_TYPE(depth, 1)), depth,
-                         angleInDegrees ? "AD" : "AR",
+                  format("-D dstT=%s -D rowsPerWI=%d -D depth=%d -D BINARY_OP -D OP_PTC_%s%s",
+                         ocl::typeToStr(CV_MAKE_TYPE(depth, 1)), rowsPerWI,
+                         depth, angleInDegrees ? "AD" : "AR",
                           doubleSupport ? " -D DOUBLE_SUPPORT" : ""));
      if (k.empty())
          return false;
@@ -738,7 +744,7 @@ static bool ocl_polarToCart( InputArray _mag, InputArray _angle,
      k.args(ocl::KernelArg::ReadOnlyNoSize(mag), ocl::KernelArg::ReadOnlyNoSize(angle),
             ocl::KernelArg::WriteOnly(dst1, cn), ocl::KernelArg::WriteOnlyNoSize(dst2));
  
-    size_t globalsize[2] = { dst1.cols * cn, dst1.rows };
+    size_t globalsize[2] = { dst1.cols * cn, (dst1.rows + rowsPerWI - 1) / rowsPerWI };
      return k.run(2, globalsize, NULL, false);
  }
  
@@ -2103,8 +2109,10 @@ static IPowFunc ipowTab[] =
  static bool ocl_pow(InputArray _src, double power, OutputArray _dst,
                      bool is_ipower, int ipower)
  {
-    int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
-    bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
+    const ocl::Device & d = ocl::Device::getDefault();
+    int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type),
+            rowsPerWI = d.isIntel() ? 4 : 1;
+    bool doubleSupport = d.doubleFPConfig() > 0;
  
      if (depth == CV_64F && !doubleSupport)
          return false;
@@ -2113,8 +2121,8 @@ static bool ocl_pow(InputArray _src, double power, OutputArray _dst,
      const char * const op = issqrt ? "OP_SQRT" : is_ipower ? "OP_POWN" : "OP_POW";
  
      ocl::Kernel k("KF", ocl::core::arithm_oclsrc,
-                  format("-D dstT=%s -D %s -D UNARY_OP%s", ocl::typeToStr(depth),
-                         op, doubleSupport ? " -D DOUBLE_SUPPORT" : ""));
+                  format("-D dstT=%s -D rowsPerWI=%d -D %s -D UNARY_OP%s", ocl::typeToStr(depth),
+                         rowsPerWI, op, doubleSupport ? " -D DOUBLE_SUPPORT" : ""));
      if (k.empty())
          return false;
  
@@ -2137,7 +2145,7 @@ static bool ocl_pow(InputArray _src, double power, OutputArray _dst,
              k.args(srcarg, dstarg, power);
      }
  
-    size_t globalsize[2] = { dst.cols *  cn, dst.rows };
+    size_t globalsize[2] = { dst.cols *  cn, (dst.rows + rowsPerWI - 1) / rowsPerWI };
      return k.run(2, globalsize, NULL, false);
  }
  
@@ -2491,8 +2499,10 @@ bool checkRange(InputArray _src, bool quiet, Point* pt, double minVal, double ma
  
  static bool ocl_patchNaNs( InputOutputArray _a, float value )
  {
+    int rowsPerWI = ocl::Device::getDefault().isIntel() ? 4 : 1;
      ocl::Kernel k("KF", ocl::core::arithm_oclsrc,
-                     format("-D UNARY_OP -D OP_PATCH_NANS -D dstT=int"));
+                     format("-D UNARY_OP -D OP_PATCH_NANS -D dstT=int -D rowsPerWI=%d",
+                            rowsPerWI));
      if (k.empty())
          return false;
  
@@ -2502,7 +2512,7 @@ static bool ocl_patchNaNs( InputOutputArray _a, float value )
      k.args(ocl::KernelArg::ReadOnlyNoSize(a),
             ocl::KernelArg::WriteOnly(a, cn), (float)value);
  
-    size_t globalsize[2] = { a.cols * cn, a.rows };
+    size_t globalsize[2] = { a.cols * cn, (a.rows + rowsPerWI - 1) / rowsPerWI };
      return k.run(2, globalsize, NULL, false);
  }
  
diff --git a/modules/core/src/matmul.cpp b/modules/core/src/matmul.cpp

index bf1428e..30ce75b 100644 (file)
--- a/modules/core/src/matmul.cpp
+++ b/modules/core/src/matmul.cpp
@@ -2153,9 +2153,10 @@ typedef void (*ScaleAddFunc)(const uchar* src1, const uchar* src2, uchar* dst, i
  
  static bool ocl_scaleAdd( InputArray _src1, double alpha, InputArray _src2, OutputArray _dst, int type )
  {
+    const ocl::Device & d = ocl::Device::getDefault();
      int depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), wdepth = std::max(depth, CV_32F),
-            kercn = ocl::predictOptimalVectorWidth(_src1, _src2, _dst);
-    bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
+            kercn = ocl::predictOptimalVectorWidth(_src1, _src2, _dst), rowsPerWI = d.isIntel() ? 4 : 1;
+    bool doubleSupport = d.doubleFPConfig() > 0;
      Size size = _src1.size();
  
      if ( (!doubleSupport && depth == CV_64F) || size != _src2.size() )
@@ -2164,13 +2165,14 @@ static bool ocl_scaleAdd( InputArray _src1, double alpha, InputArray _src2, Outp
      char cvt[2][50];
      ocl::Kernel k("KF", ocl::core::arithm_oclsrc,
                    format("-D OP_SCALE_ADD -D BINARY_OP -D dstT=%s -D workT=%s -D convertToWT1=%s"
-                         " -D srcT1=dstT -D srcT2=dstT -D convertToDT=%s -D workT1=%s -D wdepth=%d%s",
+                         " -D srcT1=dstT -D srcT2=dstT -D convertToDT=%s -D workT1=%s"
+                         " -D wdepth=%d%s -D rowsPerWI=%d",
                           ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)),
                           ocl::typeToStr(CV_MAKE_TYPE(wdepth, kercn)),
                           ocl::convertTypeStr(depth, wdepth, kercn, cvt[0]),
                           ocl::convertTypeStr(wdepth, depth, kercn, cvt[1]),
                           ocl::typeToStr(wdepth), wdepth,
-                         doubleSupport ? " -D DOUBLE_SUPPORT" : ""));
+                         doubleSupport ? " -D DOUBLE_SUPPORT" : "", rowsPerWI));
      if (k.empty())
          return false;
  
@@ -2187,7 +2189,7 @@ static bool ocl_scaleAdd( InputArray _src1, double alpha, InputArray _src2, Outp
      else
          k.args(src1arg, src2arg, dstarg, alpha);
  
-    size_t globalsize[2] = { dst.cols * cn / kercn, dst.rows };
+    size_t globalsize[2] = { dst.cols * cn / kercn, (dst.rows + rowsPerWI - 1) / rowsPerWI };
      return k.run(2, globalsize, NULL, false);
  }
  
diff --git a/modules/core/src/opencl/arithm.cl b/modules/core/src/opencl/arithm.cl

index 5faf7de..1f15d76 100644 (file)
--- a/modules/core/src/opencl/arithm.cl
+++ b/modules/core/src/opencl/arithm.cl
@@ -145,6 +145,7 @@
  
  #define EXTRA_PARAMS
  #define EXTRA_INDEX
+#define EXTRA_INDEX_ADD
  
  #if defined OP_ADD
  #define PROCESS_ELEM storedst(convertToDT(srcelem1 + srcelem2))
@@ -363,7 +364,9 @@
      #undef EXTRA_PARAMS
      #define EXTRA_PARAMS , __global uchar* dstptr2, int dststep2, int dstoffset2
      #undef EXTRA_INDEX
-    #define EXTRA_INDEX int dst_index2 = mad24(y, dststep2, mad24(x, (int)sizeof(dstT_C1) * cn, dstoffset2))
+    #define EXTRA_INDEX int dst_index2 = mad24(y0, dststep2, mad24(x, (int)sizeof(dstT_C1) * cn, dstoffset2))
+    #undef EXTRA_INDEX_ADD
+    #define EXTRA_INDEX_ADD dst_index2 += dststep2
  #endif
  
  #if defined UNARY_OP || defined MASK_UNARY_OP
@@ -393,18 +396,25 @@ __kernel void KF(__global const uchar * srcptr1, int srcstep1, int srcoffset1,
                   int rows, int cols EXTRA_PARAMS )
  {
      int x = get_global_id(0);
-    int y = get_global_id(1);
+    int y0 = get_global_id(1) * rowsPerWI;
  
-    if (x < cols && y < rows)
+    if (x < cols)
      {
-        int src1_index = mad24(y, srcstep1, mad24(x, (int)sizeof(srcT1_C1) * cn, srcoffset1));
+        int src1_index = mad24(y0, srcstep1, mad24(x, (int)sizeof(srcT1_C1) * cn, srcoffset1));
  #if !(defined(OP_RECIP_SCALE) || defined(OP_NOT))
-        int src2_index = mad24(y, srcstep2, mad24(x, (int)sizeof(srcT2_C1) * cn, srcoffset2));
+        int src2_index = mad24(y0, srcstep2, mad24(x, (int)sizeof(srcT2_C1) * cn, srcoffset2));
  #endif
-        int dst_index  = mad24(y, dststep, mad24(x, (int)sizeof(dstT_C1) * cn, dstoffset));
+        int dst_index  = mad24(y0, dststep, mad24(x, (int)sizeof(dstT_C1) * cn, dstoffset));
          EXTRA_INDEX;
  
-        PROCESS_ELEM;
+        for (int y = y0, y1 = min(rows, y0 + rowsPerWI); y < y1; ++y, src1_index += srcstep1, dst_index += dststep)
+        {
+            PROCESS_ELEM;
+#if !(defined(OP_RECIP_SCALE) || defined(OP_NOT))
+            src2_index += srcstep2;
+#endif
+            EXTRA_INDEX_ADD;
+        }
      }
  }
  
@@ -417,19 +427,21 @@ __kernel void KF(__global const uchar * srcptr1, int srcstep1, int srcoffset1,
                   int rows, int cols EXTRA_PARAMS )
  {
      int x = get_global_id(0);
-    int y = get_global_id(1);
+    int y0 = get_global_id(1) * rowsPerWI;
  
-    if (x < cols && y < rows)
+    if (x < cols)
      {
-        int mask_index = mad24(y, maskstep, x + maskoffset);
-        if( mask[mask_index] )
-        {
-            int src1_index = mad24(y, srcstep1, mad24(x, (int)sizeof(srcT1_C1) * cn, srcoffset1));
-            int src2_index = mad24(y, srcstep2, mad24(x, (int)sizeof(srcT2_C1) * cn, srcoffset2));
-            int dst_index  = mad24(y, dststep, mad24(x, (int)sizeof(dstT_C1) * cn, dstoffset));
-
-            PROCESS_ELEM;
-        }
+        int mask_index = mad24(y0, maskstep, x + maskoffset);
+        int src1_index = mad24(y0, srcstep1, mad24(x, (int)sizeof(srcT1_C1) * cn, srcoffset1));
+        int src2_index = mad24(y0, srcstep2, mad24(x, (int)sizeof(srcT2_C1) * cn, srcoffset2));
+        int dst_index  = mad24(y0, dststep, mad24(x, (int)sizeof(dstT_C1) * cn, dstoffset));
+
+        for (int y = y0, y1 = min(rows, y0 + rowsPerWI); y < y1; ++y, src1_index += srcstep1, src2_index += srcstep2,
+                                                                mask_index += maskstep, dst_index += dststep)
+            if (mask[mask_index])
+            {
+                PROCESS_ELEM;
+            }
      }
  }
  
@@ -440,14 +452,17 @@ __kernel void KF(__global const uchar * srcptr1, int srcstep1, int srcoffset1,
                   int rows, int cols EXTRA_PARAMS )
  {
      int x = get_global_id(0);
-    int y = get_global_id(1);
+    int y0 = get_global_id(1) * rowsPerWI;
  
-    if (x < cols && y < rows)
+    if (x < cols)
      {
-        int src1_index = mad24(y, srcstep1, mad24(x, (int)sizeof(srcT1_C1) * cn, srcoffset1));
-        int dst_index  = mad24(y, dststep, mad24(x, (int)sizeof(dstT_C1) * cn, dstoffset));
+        int src1_index = mad24(y0, srcstep1, mad24(x, (int)sizeof(srcT1_C1) * cn, srcoffset1));
+        int dst_index  = mad24(y0, dststep, mad24(x, (int)sizeof(dstT_C1) * cn, dstoffset));
  
-        PROCESS_ELEM;
+        for (int y = y0, y1 = min(rows, y0 + rowsPerWI); y < y1; ++y, src1_index += srcstep1, dst_index += dststep)
+        {
+            PROCESS_ELEM;
+        }
      }
  }
  
@@ -459,18 +474,19 @@ __kernel void KF(__global const uchar * srcptr1, int srcstep1, int srcoffset1,
                   int rows, int cols EXTRA_PARAMS )
  {
      int x = get_global_id(0);
-    int y = get_global_id(1);
+    int y0 = get_global_id(1);
  
-    if (x < cols && y < rows)
+    if (x < cols)
      {
-        int mask_index = mad24(y, maskstep, x + maskoffset);
-        if( mask[mask_index] )
-        {
-            int src1_index = mad24(y, srcstep1, mad24(x, (int)sizeof(srcT1_C1) * cn, srcoffset1));
-            int dst_index  = mad24(y, dststep, mad24(x, (int)sizeof(dstT_C1) * cn, dstoffset));
-
-            PROCESS_ELEM;
-        }
+        int mask_index = mad24(y0, maskstep, x + maskoffset);
+        int src1_index = mad24(y0, srcstep1, mad24(x, (int)sizeof(srcT1_C1) * cn, srcoffset1));
+        int dst_index  = mad24(y0, dststep, mad24(x, (int)sizeof(dstT_C1) * cn, dstoffset));
+
+        for (int y = y0, y1 = min(rows, y0 + rowsPerWI); y < y1; ++y, src1_index += srcstep1, mask_index += maskstep, dst_index += dststep)
+            if (mask[mask_index])
+            {
+                PROCESS_ELEM;
+            }
      }
  }
  
diff --git a/modules/core/src/stat.cpp b/modules/core/src/stat.cpp

index 0584496..8877fb5 100644 (file)
--- a/modules/core/src/stat.cpp
+++ b/modules/core/src/stat.cpp
@@ -1973,8 +1973,9 @@ static NormDiffFunc getNormDiffFunc(int normType, int depth)
  
  static bool ocl_norm( InputArray _src, int normType, InputArray _mask, double & result )
  {
+    const ocl::Device & d = ocl::Device::getDefault();
      int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
-    bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0,
+    bool doubleSupport = d.doubleFPConfig() > 0,
              haveMask = _mask.kind() != _InputArray::NONE;
  
      if ( !(normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2 || normType == NORM_L2SQR) ||
@@ -1991,13 +1992,14 @@ static bool ocl_norm( InputArray _src, int normType, InputArray _mask, double &
  
              if (depth != CV_8U && depth != CV_16U)
              {
-                int wdepth = std::max(CV_32S, depth);
+                int wdepth = std::max(CV_32S, depth), rowsPerWI = d.isIntel() ? 4 : 1;
                  char cvt[50];
  
                  ocl::Kernel kabs("KF", ocl::core::arithm_oclsrc,
-                                 format("-D UNARY_OP -D OP_ABS_NOSAT -D dstT=%s -D srcT1=%s -D convertToDT=%s%s",
+                                 format("-D UNARY_OP -D OP_ABS_NOSAT -D dstT=%s -D srcT1=%s"
+                                        " -D convertToDT=%s -D rowsPerWI=%d%s",
                                          ocl::typeToStr(wdepth), ocl::typeToStr(depth),
-                                        ocl::convertTypeStr(depth, wdepth, 1, cvt),
+                                        ocl::convertTypeStr(depth, wdepth, 1, cvt), rowsPerWI,
                                          doubleSupport ? " -D DOUBLE_SUPPORT" : ""));
                  if (kabs.empty())
                      return false;
@@ -2005,7 +2007,7 @@ static bool ocl_norm( InputArray _src, int normType, InputArray _mask, double &
                  abssrc.create(src.size(), CV_MAKE_TYPE(wdepth, cn));
                  kabs.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(abssrc, cn));
  
-                size_t globalsize[2] = { src.cols * cn, src.rows };
+                size_t globalsize[2] = { src.cols * cn, (src.rows + rowsPerWI - 1) / rowsPerWI };
                  if (!kabs.run(2, globalsize, NULL, false))
                      return false;
              }
@@ -2016,8 +2018,8 @@ static bool ocl_norm( InputArray _src, int normType, InputArray _mask, double &
          }
          else
          {
-            int dbsize = ocl::Device::getDefault().maxComputeUnits();
-            size_t wgs = ocl::Device::getDefault().maxWorkGroupSize();
+            int dbsize = d.maxComputeUnits();
+            size_t wgs = d.maxWorkGroupSize();
  
              int wgs2_aligned = 1;
              while (wgs2_aligned < (int)wgs)
@@ -2384,8 +2386,9 @@ namespace cv {
  
  static bool ocl_norm( InputArray _src1, InputArray _src2, int normType, InputArray _mask, double & result )
  {
-    int type = _src1.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
-    bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
+    const ocl::Device & d = ocl::Device::getDefault();
+    int type = _src1.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), rowsPerWI = d.isIntel() ? 4 : 1;
+    bool doubleSupport = d.doubleFPConfig() > 0;
      bool relative = (normType & NORM_RELATIVE) != 0;
      normType &= ~NORM_RELATIVE;
  
@@ -2397,9 +2400,9 @@ static bool ocl_norm( InputArray _src1, InputArray _src2, int normType, InputArr
      char cvt[50];
      ocl::Kernel k("KF", ocl::core::arithm_oclsrc,
                    format("-D BINARY_OP -D OP_ABSDIFF -D dstT=%s -D workT=dstT -D srcT1=%s -D srcT2=srcT1"
-                         " -D convertToDT=%s -D convertToWT1=convertToDT -D convertToWT2=convertToDT%s",
+                         " -D convertToDT=%s -D convertToWT1=convertToDT -D convertToWT2=convertToDT -D rowsPerWI=%d%s",
                           ocl::typeToStr(wdepth), ocl::typeToStr(depth),
-                         ocl::convertTypeStr(depth, wdepth, 1, cvt),
+                         ocl::convertTypeStr(depth, wdepth, 1, cvt), rowsPerWI,
                           doubleSupport ? " -D DOUBLE_SUPPORT" : ""));
      if (k.empty())
          return false;
@@ -2408,7 +2411,7 @@ static bool ocl_norm( InputArray _src1, InputArray _src2, int normType, InputArr
      k.args(ocl::KernelArg::ReadOnlyNoSize(src1), ocl::KernelArg::ReadOnlyNoSize(src2),
             ocl::KernelArg::WriteOnly(diff, cn));
  
-    size_t globalsize[2] = { diff.cols * cn, diff.rows };
+    size_t globalsize[2] = { diff.cols * cn, (diff.rows + rowsPerWI - 1) / rowsPerWI };
      if (!k.run(2, globalsize, NULL, false))
          return false;
author	Ilya Lavrenov <ilya.lavrenov@itseez.com>
	Wed, 14 May 2014 11:42:30 +0000 (15:42 +0400)
committer	Ilya Lavrenov <ilya.lavrenov@itseez.com>
	Mon, 26 May 2014 09:19:06 +0000 (12:19 +0300)
modules/core/src/arithm.cpp		patch \| blob \| history
modules/core/src/convert.cpp		patch \| blob \| history
modules/core/src/mathfuncs.cpp		patch \| blob \| history
modules/core/src/matmul.cpp		patch \| blob \| history
modules/core/src/opencl/arithm.cl		patch \| blob \| history
modules/core/src/stat.cpp		patch \| blob \| history