core tapi optimization

author Ilya Lavrenov <ilya.lavrenov@itseez.com>

Mon, 24 Feb 2014 20:29:17 +0000 (00:29 +0400)

committer Ilya Lavrenov <ilya.lavrenov@itseez.com>

Tue, 25 Feb 2014 10:14:42 +0000 (14:14 +0400)
author Ilya Lavrenov <ilya.lavrenov@itseez.com>
Mon, 24 Feb 2014 20:29:17 +0000 (00:29 +0400)
committer Ilya Lavrenov <ilya.lavrenov@itseez.com>
Tue, 25 Feb 2014 10:14:42 +0000 (14:14 +0400)
diff --git a/modules/core/include/opencv2/core/opencl/ocl_defs.hpp b/modules/core/include/opencv2/core/opencl/ocl_defs.hpp

index 4acfa7a..55f8849 100644 (file)
--- a/modules/core/include/opencv2/core/opencl/ocl_defs.hpp
+++ b/modules/core/include/opencv2/core/opencl/ocl_defs.hpp
@@ -5,7 +5,7 @@
  // Copyright (C) 2014, Advanced Micro Devices, Inc., all rights reserved.
  // Third party copyrights are property of their respective owners.
  
-//#define CV_OPENCL_RUN_VERBOSE
+//#define CV_OPENCL_RUN_ASSERT
  
  #ifdef HAVE_OPENCL
  
diff --git a/modules/core/src/arithm.cpp b/modules/core/src/arithm.cpp

index a837452..623f9d0 100644 (file)
--- a/modules/core/src/arithm.cpp
+++ b/modules/core/src/arithm.cpp
@@ -1318,7 +1318,7 @@ static bool ocl_arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
  
      char cvtstr[4][32], opts[1024];
      sprintf(opts, "-D %s%s -D %s -D srcT1=%s -D srcT1_C1=%s -D srcT2=%s -D srcT2_C1=%s "
-            "-D dstT=%s -D dstT_C1=%s -D workT=%s -D workST=%s -D scaleT=%s -D convertToWT1=%s "
+            "-D dstT=%s -D dstT_C1=%s -D workT=%s -D workST=%s -D scaleT=%s -D wdepth=%d -D convertToWT1=%s "
              "-D convertToWT2=%s -D convertToDT=%s%s -D cn=%d",
              (haveMask ? "MASK_" : ""), (haveScalar ? "UNARY_OP" : "BINARY_OP"),
              oclop2str[oclop], ocl::typeToStr(CV_MAKETYPE(depth1, kercn)),
@@ -1329,7 +1329,7 @@ static bool ocl_arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
              ocl::typeToStr(CV_MAKETYPE(ddepth, 1)),
              ocl::typeToStr(CV_MAKETYPE(wdepth, kercn)),
              ocl::typeToStr(CV_MAKETYPE(wdepth, scalarcn)),
-            ocl::typeToStr(CV_MAKETYPE(wdepth, 1)),
+            ocl::typeToStr(CV_MAKETYPE(wdepth, 1)), wdepth,
              ocl::convertTypeStr(depth1, wdepth, kercn, cvtstr[0]),
              ocl::convertTypeStr(depth2, wdepth, kercn, cvtstr[1]),
              ocl::convertTypeStr(wdepth, ddepth, kercn, cvtstr[2]),
diff --git a/modules/core/src/convert.cpp b/modules/core/src/convert.cpp

index e64d099..c314823 100644 (file)
--- a/modules/core/src/convert.cpp
+++ b/modules/core/src/convert.cpp
@@ -1320,8 +1320,8 @@ static bool ocl_convertScaleAbs( InputArray _src, OutputArray _dst, double alpha
      int wdepth = std::max(depth, CV_32F);
      ocl::Kernel k("KF", ocl::core::arithm_oclsrc,
                    format("-D OP_CONVERT_SCALE_ABS -D UNARY_OP -D dstT=uchar -D srcT1=%s"
-                         " -D workT=%s -D convertToWT1=%s -D convertToDT=%s%s",
-                         ocl::typeToStr(depth), ocl::typeToStr(wdepth),
+                         " -D workT=%s -D wdepth=%d -D convertToWT1=%s -D convertToDT=%s%s",
+                         ocl::typeToStr(depth), ocl::typeToStr(wdepth), wdepth,
                           ocl::convertTypeStr(depth, wdepth, 1, cvt[0]),
                           ocl::convertTypeStr(wdepth, CV_8U, 1, cvt[1]),
                           doubleSupport ? " -D DOUBLE_SUPPORT" : ""));
@@ -1492,19 +1492,14 @@ static LUTFunc lutTab[] =
  static bool ocl_LUT(InputArray _src, InputArray _lut, OutputArray _dst)
  {
      int dtype = _dst.type(), lcn = _lut.channels(), dcn = CV_MAT_CN(dtype), ddepth = CV_MAT_DEPTH(dtype);
-    bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
-
-    if (_src.dims() > 2 || (!doubleSupport && ddepth == CV_64F))
-        return false;
  
      UMat src = _src.getUMat(), lut = _lut.getUMat();
      _dst.create(src.size(), dtype);
      UMat dst = _dst.getUMat();
  
      ocl::Kernel k("LUT", ocl::core::lut_oclsrc,
-                  format("-D dcn=%d -D lcn=%d -D srcT=%s -D dstT=%s%s", dcn, lcn,
-                         ocl::typeToStr(src.depth()), ocl::typeToStr(ddepth),
-                         doubleSupport ? " -D DOUBLE_SUPPORT" : ""));
+                  format("-D dcn=%d -D lcn=%d -D srcT=%s -D dstT=%s", dcn, lcn,
+                         ocl::typeToStr(src.depth()), ocl::memopTypeToStr(ddepth)));
      if (k.empty())
          return false;
  
@@ -1528,7 +1523,7 @@ void cv::LUT( InputArray _src, InputArray _lut, OutputArray _dst )
          _lut.total() == 256 && _lut.isContinuous() &&
          (depth == CV_8U || depth == CV_8S) );
  
-    CV_OCL_RUN(_dst.isUMat(),
+    CV_OCL_RUN(_dst.isUMat() && _src.dims() <= 2,
                 ocl_LUT(_src, _lut, _dst))
  
      Mat src = _src.getMat(), lut = _lut.getMat();
diff --git a/modules/core/src/mathfuncs.cpp b/modules/core/src/mathfuncs.cpp

index f81e835..d7fad62 100644 (file)
--- a/modules/core/src/mathfuncs.cpp
+++ b/modules/core/src/mathfuncs.cpp
@@ -508,9 +508,9 @@ static bool ocl_cartToPolar( InputArray _src1, InputArray _src2,
          return false;
  
      ocl::Kernel k("KF", ocl::core::arithm_oclsrc,
-                  format("-D BINARY_OP -D dstT=%s -D OP_CTP_%s%s",
+                  format("-D BINARY_OP -D dstT=%s -D depth=%d -D OP_CTP_%s%s",
                           ocl::typeToStr(CV_MAKE_TYPE(depth, 1)),
-                         angleInDegrees ? "AD" : "AR",
+                         depth, angleInDegrees ? "AD" : "AR",
                           doubleSupport ? " -D DOUBLE_SUPPORT" : ""));
      if (k.empty())
          return false;
@@ -695,8 +695,8 @@ static bool ocl_polarToCart( InputArray _mag, InputArray _angle,
          return false;
  
      ocl::Kernel k("KF", ocl::core::arithm_oclsrc,
-                  format("-D dstT=%s -D BINARY_OP -D OP_PTC_%s%s",
-                         ocl::typeToStr(CV_MAKE_TYPE(depth, 1)),
+                  format("-D dstT=%s -D depth=%d -D BINARY_OP -D OP_PTC_%s%s",
+                         ocl::typeToStr(CV_MAKE_TYPE(depth, 1)), depth,
                           angleInDegrees ? "AD" : "AR",
                           doubleSupport ? " -D DOUBLE_SUPPORT" : ""));
      if (k.empty())
diff --git a/modules/core/src/matmul.cpp b/modules/core/src/matmul.cpp

index c6dde65..022f88e 100644 (file)
--- a/modules/core/src/matmul.cpp
+++ b/modules/core/src/matmul.cpp
@@ -2166,9 +2166,9 @@ static bool ocl_scaleAdd( InputArray _src1, double alpha, InputArray _src2, Outp
  
      char cvt[2][50];
      ocl::Kernel k("KF", ocl::core::arithm_oclsrc,
-                  format("-D OP_SCALE_ADD -D BINARY_OP -D dstT=%s -D workT=%s -D convertToWT1=%s"
+                  format("-D OP_SCALE_ADD -D BINARY_OP -D dstT=%s -D workT=%s -D wdepth=%d -D convertToWT1=%s"
                           " -D srcT1=dstT -D srcT2=dstT -D convertToDT=%s%s", ocl::typeToStr(depth),
-                         ocl::typeToStr(wdepth), ocl::convertTypeStr(depth, wdepth, 1, cvt[0]),
+                         ocl::typeToStr(wdepth), wdepth, ocl::convertTypeStr(depth, wdepth, 1, cvt[0]),
                           ocl::convertTypeStr(wdepth, depth, 1, cvt[1]),
                           doubleSupport ? " -D DOUBLE_SUPPORT" : ""));
      if (k.empty())
diff --git a/modules/core/src/opencl/arithm.cl b/modules/core/src/opencl/arithm.cl

index a7dacc4..bac72d3 100644 (file)
--- a/modules/core/src/opencl/arithm.cl
+++ b/modules/core/src/opencl/arithm.cl
@@ -63,11 +63,12 @@
  #elif defined cl_khr_fp64
  #pragma OPENCL EXTENSION cl_khr_fp64:enable
  #endif
-#define CV_EPSILON DBL_EPSILON
-#define CV_PI M_PI
-#else
-#define CV_EPSILON FLT_EPSILON
+#endif
+
+#if depth <= 5
  #define CV_PI M_PI_F
+#else
+#define CV_PI M_PI
  #endif
  
  #ifndef cn
@@ -84,11 +85,11 @@
  #endif
  
  #if cn != 3
-    #define storedst(val) *(__global dstT*)(dstptr + dst_index) = val
-    #define storedst2(val) *(__global dstT*)(dstptr2 + dst_index2) = val
+    #define storedst(val) *(__global dstT *)(dstptr + dst_index) = val
+    #define storedst2(val) *(__global dstT *)(dstptr2 + dst_index2) = val
  #else
-    #define storedst(val) vstore3(val, 0, (__global dstT_C1*)(dstptr + dst_index))
-    #define storedst2(val) vstore3(val, 0, (__global dstT_C1*)(dstptr2 + dst_index2))
+    #define storedst(val) vstore3(val, 0, (__global dstT_C1 *)(dstptr + dst_index))
+    #define storedst2(val) vstore3(val, 0, (__global dstT_C1 *)(dstptr2 + dst_index2))
  #endif
  
  #define noconvert
@@ -97,19 +98,27 @@
  
      #ifndef srcT1
      #define srcT1 dstT
+    #endif
+
+    #ifndef srcT1_C1
      #define srcT1_C1 dstT_C1
      #endif
+
      #ifndef srcT2
      #define srcT2 dstT
+    #endif
+
+    #ifndef srcT2_C1
      #define srcT2_C1 dstT_C1
      #endif
+
      #define workT dstT
      #if cn != 3
-        #define srcelem1 *(__global srcT1*)(srcptr1 + src1_index)
-        #define srcelem2 *(__global srcT2*)(srcptr2 + src2_index)
+        #define srcelem1 *(__global srcT1 *)(srcptr1 + src1_index)
+        #define srcelem2 *(__global srcT2 *)(srcptr2 + src2_index)
      #else
-        #define srcelem1 vload3(0, (__global srcT1_C1*)(srcptr1 + src1_index))
-        #define srcelem2 vload3(0, (__global srcT2_C1*)(srcptr2 + src2_index))
+        #define srcelem1 vload3(0, (__global srcT1_C1 *)(srcptr1 + src1_index))
+        #define srcelem2 vload3(0, (__global srcT2_C1 *)(srcptr2 + src2_index))
      #endif
      #ifndef convertToDT
      #define convertToDT noconvert
@@ -121,11 +130,11 @@
      #define convertToWT2 convertToWT1
      #endif
      #if cn != 3
-        #define srcelem1 convertToWT1(*(__global srcT1*)(srcptr1 + src1_index))
-        #define srcelem2 convertToWT2(*(__global srcT2*)(srcptr2 + src2_index))
+        #define srcelem1 convertToWT1(*(__global srcT1 *)(srcptr1 + src1_index))
+        #define srcelem2 convertToWT2(*(__global srcT2 *)(srcptr2 + src2_index))
      #else
-        #define srcelem1 convertToWT1(vload3(0, (__global srcT1_C1*)(srcptr1 + src1_index)))
-        #define srcelem2 convertToWT2(vload3(0, (__global srcT2_C1*)(srcptr2 + src2_index)))
+        #define srcelem1 convertToWT1(vload3(0, (__global srcT1_C1 *)(srcptr1 + src1_index)))
+        #define srcelem2 convertToWT2(vload3(0, (__global srcT2_C1 *)(srcptr2 + src2_index)))
      #endif
  
  #endif
@@ -224,7 +233,11 @@
  #elif defined OP_ADDW
  #undef EXTRA_PARAMS
  #define EXTRA_PARAMS , scaleT alpha, scaleT beta, scaleT gamma
-#define PROCESS_ELEM storedst(convertToDT(srcelem1*alpha + srcelem2*beta + gamma))
+#if wdepth <= 4
+#define PROCESS_ELEM storedst(convertToDT(mad24(srcelem1, alpha, mad24(srcelem2, beta, gamma))))
+#else
+#define PROCESS_ELEM storedst(convertToDT(mad(srcelem1, alpha, mad(srcelem2, beta, gamma))))
+#endif
  
  #elif defined OP_MAG
  #define PROCESS_ELEM storedst(hypot(srcelem1, srcelem2))
@@ -274,16 +287,31 @@
  #elif defined OP_CONVERT_SCALE_ABS
  #undef EXTRA_PARAMS
  #define EXTRA_PARAMS , workT alpha, workT beta
+#if wdepth <= 4
  #define PROCESS_ELEM \
-    workT value = srcelem1 * alpha + beta; \
+    workT value = mad24(srcelem1, alpha, beta); \
      storedst(convertToDT(value >= 0 ? value : -value))
+#else
+#define PROCESS_ELEM \
+    workT value = mad(srcelem1, alpha, beta); \
+    storedst(convertToDT(value >= 0 ? value : -value))
+#endif
  
  #elif defined OP_SCALE_ADD
  #undef EXTRA_PARAMS
  #define EXTRA_PARAMS , workT alpha
-#define PROCESS_ELEM storedst(convertToDT(srcelem1 * alpha + srcelem2))
+#if wdepth <= 4
+#define PROCESS_ELEM storedst(convertToDT(mad24(srcelem1, alpha, srcelem2)))
+#else
+#define PROCESS_ELEM storedst(convertToDT(mad(srcelem1, alpha, srcelem2)))
+#endif
  
  #elif defined OP_CTP_AD || defined OP_CTP_AR
+#if depth <= 5
+#define CV_EPSILON FLT_EPSILON
+#else
+#define CV_EPSILON DBL_EPSILON
+#endif
  #ifdef OP_CTP_AD
  #define TO_DEGREE cartToPolar *= (180 / CV_PI);
  #elif defined OP_CTP_AR
@@ -296,7 +324,7 @@
      dstT tmp = y >= 0 ? 0 : CV_PI * 2; \
      tmp = x < 0 ? CV_PI : tmp; \
      dstT tmp1 = y >= 0 ? CV_PI * 0.5f : CV_PI * 1.5f; \
-    dstT cartToPolar = y2 <= x2 ? x * y / (x2 + 0.28f * y2 + CV_EPSILON) + tmp : (tmp1 - x * y / (y2 + 0.28f * x2 + CV_EPSILON)); \
+    dstT cartToPolar = y2 <= x2 ? x * y / mad((dstT)(0.28f), y2, x2 + CV_EPSILON) + tmp : (tmp1 - x * y / mad((dstT)(0.28f), x2, y2 + CV_EPSILON)); \
      TO_DEGREE \
      storedst(magnitude); \
      storedst2(cartToPolar)
@@ -331,7 +359,7 @@
      #undef EXTRA_PARAMS
      #define EXTRA_PARAMS , __global uchar* dstptr2, int dststep2, int dstoffset2
      #undef EXTRA_INDEX
-    #define EXTRA_INDEX int dst_index2 = mad24(y, dststep2, x*(int)sizeof(dstT_C1)*cn + dstoffset2)
+    #define EXTRA_INDEX int dst_index2 = mad24(y, dststep2, mad24(x, (int)sizeof(dstT_C1) * cn, dstoffset2))
  #endif
  
  #if defined UNARY_OP || defined MASK_UNARY_OP
@@ -355,9 +383,9 @@
  
  #if defined BINARY_OP
  
-__kernel void KF(__global const uchar* srcptr1, int srcstep1, int srcoffset1,
-                 __global const uchar* srcptr2, int srcstep2, int srcoffset2,
-                 __global uchar* dstptr, int dststep, int dstoffset,
+__kernel void KF(__global const uchar * srcptr1, int srcstep1, int srcoffset1,
+                 __global const uchar * srcptr2, int srcstep2, int srcoffset2,
+                 __global uchar * dstptr, int dststep, int dstoffset,
                   int rows, int cols EXTRA_PARAMS )
  {
      int x = get_global_id(0);
@@ -365,11 +393,11 @@ __kernel void KF(__global const uchar* srcptr1, int srcstep1, int srcoffset1,
  
      if (x < cols && y < rows)
      {
-        int src1_index = mad24(y, srcstep1, x*(int)sizeof(srcT1_C1)*cn + srcoffset1);
+        int src1_index = mad24(y, srcstep1, mad24(x, (int)sizeof(srcT1_C1) * cn, srcoffset1));
  #if !(defined(OP_RECIP_SCALE) || defined(OP_NOT))
-        int src2_index = mad24(y, srcstep2, x*(int)sizeof(srcT2_C1)*cn + srcoffset2);
+        int src2_index = mad24(y, srcstep2, mad24(x, (int)sizeof(srcT2_C1) * cn, srcoffset2));
  #endif
-        int dst_index  = mad24(y, dststep, x*(int)sizeof(dstT_C1)*cn + dstoffset);
+        int dst_index  = mad24(y, dststep, mad24(x, (int)sizeof(dstT_C1) * cn, dstoffset));
          EXTRA_INDEX;
  
          PROCESS_ELEM;
@@ -378,10 +406,10 @@ __kernel void KF(__global const uchar* srcptr1, int srcstep1, int srcoffset1,
  
  #elif defined MASK_BINARY_OP
  
-__kernel void KF(__global const uchar* srcptr1, int srcstep1, int srcoffset1,
-                 __global const uchar* srcptr2, int srcstep2, int srcoffset2,
-                 __global const uchar* mask, int maskstep, int maskoffset,
-                 __global uchar* dstptr, int dststep, int dstoffset,
+__kernel void KF(__global const uchar * srcptr1, int srcstep1, int srcoffset1,
+                 __global const uchar * srcptr2, int srcstep2, int srcoffset2,
+                 __global const uchar * mask, int maskstep, int maskoffset,
+                 __global uchar * dstptr, int dststep, int dstoffset,
                   int rows, int cols EXTRA_PARAMS )
  {
      int x = get_global_id(0);
@@ -392,9 +420,9 @@ __kernel void KF(__global const uchar* srcptr1, int srcstep1, int srcoffset1,
          int mask_index = mad24(y, maskstep, x + maskoffset);
          if( mask[mask_index] )
          {
-            int src1_index = mad24(y, srcstep1, x*(int)sizeof(srcT1_C1)*cn + srcoffset1);
-            int src2_index = mad24(y, srcstep2, x*(int)sizeof(srcT2_C1)*cn + srcoffset2);
-            int dst_index  = mad24(y, dststep, x*(int)sizeof(dstT_C1)*cn + dstoffset);
+            int src1_index = mad24(y, srcstep1, mad24(x, (int)sizeof(srcT1_C1) * cn, srcoffset1));
+            int src2_index = mad24(y, srcstep2, mad24(x, (int)sizeof(srcT2_C1) * cn, srcoffset2));
+            int dst_index  = mad24(y, dststep, mad24(x, (int)sizeof(dstT_C1) * cn, dstoffset));
  
              PROCESS_ELEM;
          }
@@ -403,8 +431,8 @@ __kernel void KF(__global const uchar* srcptr1, int srcstep1, int srcoffset1,
  
  #elif defined UNARY_OP
  
-__kernel void KF(__global const uchar* srcptr1, int srcstep1, int srcoffset1,
-                 __global uchar* dstptr, int dststep, int dstoffset,
+__kernel void KF(__global const uchar * srcptr1, int srcstep1, int srcoffset1,
+                 __global uchar * dstptr, int dststep, int dstoffset,
                   int rows, int cols EXTRA_PARAMS )
  {
      int x = get_global_id(0);
@@ -412,8 +440,8 @@ __kernel void KF(__global const uchar* srcptr1, int srcstep1, int srcoffset1,
  
      if (x < cols && y < rows)
      {
-        int src1_index = mad24(y, srcstep1, x*(int)sizeof(srcT1_C1)*cn + srcoffset1);
-        int dst_index  = mad24(y, dststep, x*(int)sizeof(dstT_C1)*cn + dstoffset);
+        int src1_index = mad24(y, srcstep1, mad24(x, (int)sizeof(srcT1_C1) * cn, srcoffset1));
+        int dst_index  = mad24(y, dststep, mad24(x, (int)sizeof(dstT_C1) * cn, dstoffset));
  
          PROCESS_ELEM;
      }
@@ -421,9 +449,9 @@ __kernel void KF(__global const uchar* srcptr1, int srcstep1, int srcoffset1,
  
  #elif defined MASK_UNARY_OP
  
-__kernel void KF(__global const uchar* srcptr1, int srcstep1, int srcoffset1,
-                 __global const uchar* mask, int maskstep, int maskoffset,
-                 __global uchar* dstptr, int dststep, int dstoffset,
+__kernel void KF(__global const uchar * srcptr1, int srcstep1, int srcoffset1,
+                 __global const uchar * mask, int maskstep, int maskoffset,
+                 __global uchar * dstptr, int dststep, int dstoffset,
                   int rows, int cols EXTRA_PARAMS )
  {
      int x = get_global_id(0);
@@ -434,8 +462,8 @@ __kernel void KF(__global const uchar* srcptr1, int srcstep1, int srcoffset1,
          int mask_index = mad24(y, maskstep, x + maskoffset);
          if( mask[mask_index] )
          {
-            int src1_index = mad24(y, srcstep1, x*(int)sizeof(srcT1_C1)*cn + srcoffset1);
-            int dst_index  = mad24(y, dststep, x*(int)sizeof(dstT_C1)*cn + dstoffset);
+            int src1_index = mad24(y, srcstep1, mad24(x, (int)sizeof(srcT1_C1) * cn, srcoffset1));
+            int dst_index  = mad24(y, dststep, mad24(x, (int)sizeof(dstT_C1) * cn, dstoffset));
  
              PROCESS_ELEM;
          }
diff --git a/modules/core/src/opencl/convert.cl b/modules/core/src/opencl/convert.cl

index 6c2d16c..b801409 100644 (file)
--- a/modules/core/src/opencl/convert.cl
+++ b/modules/core/src/opencl/convert.cl
@@ -53,19 +53,19 @@
  
  __kernel void convertTo(__global const uchar * srcptr, int src_step, int src_offset,
                          __global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
-                        float alpha, float beta )
+                        WT alpha, WT beta)
  {
      int x = get_global_id(0);
      int y = get_global_id(1);
  
      if (x < dst_cols && y < dst_rows)
      {
-        int src_index = mad24(y, src_step, src_offset + x * (int)sizeof(srcT) );
-        int dst_index = mad24(y, dst_step, dst_offset + x * (int)sizeof(dstT) );
+        int src_index = mad24(y, src_step, mad24(x, (int)sizeof(srcT), src_offset));
+        int dst_index = mad24(y, dst_step, mad24(x, (int)sizeof(dstT), dst_offset));
  
          __global const srcT * src = (__global const srcT *)(srcptr + src_index);
          __global dstT * dst = (__global dstT *)(dstptr + dst_index);
  
-        dst[0] = convertToDT( src[0] * alpha + beta );
+        dst[0] = convertToDT(mad(convertToWT(src[0]), alpha, beta));
      }
  }
diff --git a/modules/core/src/opencl/copymakeborder.cl b/modules/core/src/opencl/copymakeborder.cl

index bb26442..dbb00b9 100644 (file)
--- a/modules/core/src/opencl/copymakeborder.cl
+++ b/modules/core/src/opencl/copymakeborder.cl
@@ -47,9 +47,9 @@
  #elif defined BORDER_REPLICATE
  #define EXTRAPOLATE(x, y, v) \
      { \
-        x = max(min(x, src_cols - 1), 0); \
-        y = max(min(y, src_rows - 1), 0); \
-        v = *(__global const T *)(srcptr + mad24(y, src_step, x * (int)sizeof(T) + src_offset)); \
+        x = clamp(x, 0, src_cols - 1); \
+        y = clamp(y, 0, src_rows - 1); \
+        v = *(__global const T *)(srcptr + mad24(y, src_step, mad24(x, (int)sizeof(T), src_offset))); \
      }
  #elif defined BORDER_WRAP
  #define EXTRAPOLATE(x, y, v) \
@@ -63,7 +63,7 @@
              y -= ((y - src_rows + 1) / src_rows) * src_rows; \
          if( y >= src_rows ) \
              y %= src_rows; \
-        v = *(__global const T *)(srcptr + mad24(y, src_step, x * (int)sizeof(T) + src_offset)); \
+        v = *(__global const T *)(srcptr + mad24(y, src_step, mad24(x, (int)sizeof(T), src_offset))); \
      }
  #elif defined(BORDER_REFLECT) || defined(BORDER_REFLECT_101)
  #ifdef BORDER_REFLECT
@@ -97,7 +97,7 @@
                      y = src_rows - 1 - (y - src_rows) - delta; \
              } \
              while (y >= src_rows || y < 0); \
-        v = *(__global const T *)(srcptr + mad24(y, src_step, x * (int)sizeof(T) + src_offset)); \
+        v = *(__global const T *)(srcptr + mad24(y, src_step, mad24(x, (int)sizeof(T), src_offset))); \
      }
  #else
  #error No extrapolation method
@@ -117,14 +117,14 @@ __kernel void copyMakeBorder(__global const uchar * srcptr, int src_step, int sr
          int src_x = x - left;
          int src_y = y - top;
  
-        int dst_index = mad24(y, dst_step, x * (int)sizeof(T) + dst_offset);
+        int dst_index = mad24(y, dst_step, mad24(x, (int)sizeof(T), dst_offset));
          __global T * dst = (__global T *)(dstptr + dst_index);
  
          if (NEED_EXTRAPOLATION(src_x, src_y))
              EXTRAPOLATE(src_x, src_y, dst[0])
          else
          {
-            int src_index = mad24(src_y, src_step, src_x * (int)sizeof(T) + src_offset);
+            int src_index = mad24(src_y, src_step, mad24(src_x, (int)sizeof(T), src_offset));
              __global const T * src = (__global const T *)(srcptr + src_index);
              dst[0] = src[0];
          }
diff --git a/modules/core/src/opencl/copyset.cl b/modules/core/src/opencl/copyset.cl

index cbafe67..42796ea 100644 (file)
--- a/modules/core/src/opencl/copyset.cl
+++ b/modules/core/src/opencl/copyset.cl
@@ -44,8 +44,8 @@
  #ifdef COPY_TO_MASK
  
  #define DEFINE_DATA \
-    int src_index = mad24(y, src_step, x*(int)sizeof(T)*scn + src_offset); \
-    int dst_index = mad24(y, dst_step, x*(int)sizeof(T)*scn + dst_offset); \
+    int src_index = mad24(y, src_step, mad24(x, (int)sizeof(T) * scn, src_offset)); \
+    int dst_index = mad24(y, dst_step, mad24(x, (int)sizeof(T) * scn, dst_offset)); \
       \
      __global const T * src = (__global const T *)(srcptr + src_index); \
      __global T * dst = (__global T *)(dstptr + dst_index)
@@ -60,7 +60,7 @@ __kernel void copyToMask(__global const uchar * srcptr, int src_step, int src_of
  
      if (x < dst_cols && y < dst_rows)
      {
-        int mask_index = mad24(y, mask_step, x * mcn + mask_offset);
+        int mask_index = mad24(y, mask_step, mad24(x, mcn, mask_offset));
          __global const uchar * mask = (__global const uchar *)(maskptr + mask_index);
  
  #if mcn == 1
@@ -93,10 +93,10 @@ __kernel void copyToMask(__global const uchar * srcptr, int src_step, int src_of
  
  #if cn != 3
  #define value value_
-#define storedst(val) *(__global dstT*)(dstptr + dst_index) = val
+#define storedst(val) *(__global dstT *)(dstptr + dst_index) = val
  #else
  #define value (dstT)(value_.x, value_.y, value_.z)
-#define storedst(val) vstore3(val, 0, (__global dstT1*)(dstptr + dst_index))
+#define storedst(val) vstore3(val, 0, (__global dstT1 *)(dstptr + dst_index))
  #endif
  
  __kernel void setMask(__global const uchar* mask, int maskstep, int maskoffset,
@@ -111,7 +111,7 @@ __kernel void setMask(__global const uchar* mask, int maskstep, int maskoffset,
          int mask_index = mad24(y, maskstep, x + maskoffset);
          if( mask[mask_index] )
          {
-            int dst_index  = mad24(y, dststep, x*(int)sizeof(dstT1)*cn + dstoffset);
+            int dst_index  = mad24(y, dststep, mad24(x, (int)sizeof(dstT1) * cn, dstoffset));
              storedst(value);
          }
      }
@@ -125,7 +125,7 @@ __kernel void set(__global uchar* dstptr, int dststep, int dstoffset,
  
      if (x < cols && y < rows)
      {
-        int dst_index  = mad24(y, dststep, x*(int)sizeof(dstT1)*cn + dstoffset);
+        int dst_index  = mad24(y, dststep, mad24(x, (int)sizeof(dstT1) * cn, dstoffset));
          storedst(value);
      }
  }
diff --git a/modules/core/src/opencl/flip.cl b/modules/core/src/opencl/flip.cl

index 4e53041..0c874db 100644 (file)
--- a/modules/core/src/opencl/flip.cl
+++ b/modules/core/src/opencl/flip.cl
@@ -50,11 +50,11 @@ __kernel void arithm_flip_rows(__global const uchar* srcptr, int srcstep, int sr
  
      if (x < cols && y < thread_rows)
      {
-        __global const type* src0 = (__global const type*)(srcptr + mad24(y, srcstep, srcoffset + x * sizeoftype));
-        __global const type* src1 = (__global const type*)(srcptr + mad24(rows - y - 1, srcstep, srcoffset + x * sizeoftype));
+        __global const type* src0 = (__global const type*)(srcptr + mad24(y, srcstep, mad24(x, sizeoftype, srcoffset)));
+        __global const type* src1 = (__global const type*)(srcptr + mad24(rows - y - 1, srcstep, mad24(x, sizeoftype, srcoffset)));
  
-        __global type* dst0 = (__global type*)(dstptr + mad24(y, dststep, dstoffset + x  * sizeoftype));
-        __global type* dst1 = (__global type*)(dstptr + mad24(rows - y - 1, dststep, dstoffset + x  * sizeoftype));
+        __global type* dst0 = (__global type*)(dstptr + mad24(y, dststep, mad24(x, sizeoftype, dstoffset)));
+        __global type* dst1 = (__global type*)(dstptr + mad24(rows - y - 1, dststep, mad24(x, sizeoftype, dstoffset)));
  
          dst0[0] = src1[0];
          dst1[0] = src0[0];
@@ -70,11 +70,12 @@ __kernel void arithm_flip_rows_cols(__global const uchar* srcptr, int srcstep, i
  
      if (x < cols && y < thread_rows)
      {
-        __global const type* src0 = (__global const type*)(srcptr + mad24(y, srcstep, x*sizeoftype + srcoffset));
-        __global const type* src1 = (__global const type*)(srcptr + mad24(rows - y - 1, srcstep, (cols - x - 1)*sizeoftype + srcoffset));
+        int x1 = cols - x - 1;
+        __global const type* src0 = (__global const type*)(srcptr + mad24(y, srcstep, mad24(x, sizeoftype, srcoffset)));
+        __global const type* src1 = (__global const type*)(srcptr + mad24(rows - y - 1, srcstep, mad24(x1, sizeoftype, srcoffset)));
  
-        __global type* dst0 = (__global type*)(dstptr + mad24(rows - y - 1, dststep, (cols - x - 1)*sizeoftype + dstoffset));
-        __global type* dst1 = (__global type*)(dstptr + mad24(y, dststep, x * sizeoftype + dstoffset));
+        __global type* dst0 = (__global type*)(dstptr + mad24(rows - y - 1, dststep, mad24(x1, sizeoftype, dstoffset)));
+        __global type* dst1 = (__global type*)(dstptr + mad24(y, dststep, mad24(x, sizeoftype, dstoffset)));
  
          dst0[0] = src0[0];
          dst1[0] = src1[0];
@@ -90,11 +91,12 @@ __kernel void arithm_flip_cols(__global const uchar* srcptr, int srcstep, int sr
  
      if (x < thread_cols && y < rows)
      {
-        __global const type* src0 = (__global const type*)(srcptr + mad24(y, srcstep, x * sizeoftype + srcoffset));
-        __global const type* src1 = (__global const type*)(srcptr + mad24(y, srcstep, (cols - x - 1)*sizeoftype + srcoffset));
+        int x1 = cols - x - 1;
+        __global const type* src0 = (__global const type*)(srcptr + mad24(y, srcstep, mad24(x, sizeoftype, srcoffset)));
+        __global const type* src1 = (__global const type*)(srcptr + mad24(y, srcstep, mad24(x1, sizeoftype, srcoffset)));
  
-        __global type* dst0 = (__global type*)(dstptr + mad24(y, dststep, (cols - x - 1)*sizeoftype + dstoffset));
-        __global type* dst1 = (__global type*)(dstptr + mad24(y, dststep, x * sizeoftype + dstoffset));
+        __global type* dst0 = (__global type*)(dstptr + mad24(y, dststep, mad24(x1, sizeoftype, dstoffset)));
+        __global type* dst1 = (__global type*)(dstptr + mad24(y, dststep, mad24(x, sizeoftype, dstoffset)));
  
          dst1[0] = src1[0];
          dst0[0] = src0[0];
diff --git a/modules/core/src/opencl/inrange.cl b/modules/core/src/opencl/inrange.cl

index 7549cf3..b113859 100644 (file)
--- a/modules/core/src/opencl/inrange.cl
+++ b/modules/core/src/opencl/inrange.cl
@@ -64,23 +64,22 @@ __kernel void inrange(__global const uchar * src1ptr, int src1_step, int src1_of
  
      if (x < dst_cols && y < dst_rows)
      {
-        int src1_index = mad24(y, src1_step, x*(int)sizeof(T)*cn + src1_offset);
+        int src1_index = mad24(y, src1_step, mad24(x, (int)sizeof(T) * cn, src1_offset));
          int dst_index = mad24(y, dst_step, x + dst_offset);
          __global const T * src1 = (__global const T *)(src1ptr + src1_index);
          __global uchar * dst = dstptr + dst_index;
  
  #ifndef HAVE_SCALAR
-        int src2_index = mad24(y, src2_step, x*(int)sizeof(T)*cn + src2_offset);
-        int src3_index = mad24(y, src3_step, x*(int)sizeof(T)*cn + src3_offset);
+        int src2_index = mad24(y, src2_step, mad24(x, (int)sizeof(T) * cn, src2_offset));
+        int src3_index = mad24(y, src3_step, mad24(x, (int)sizeof(T) * cn, src3_offset));
          __global const T * src2 = (__global const T *)(src2ptr + src2_index);
          __global const T * src3 = (__global const T *)(src3ptr + src3_index);
  #endif
  
          dst[0] = 255;
  
-        #pragma unroll
          for (int c = 0; c < cn; ++c)
-            if ( src2[c] > src1[c] || src3[c] < src1[c] )
+            if (src2[c] > src1[c] || src3[c] < src1[c])
              {
                  dst[0] = 0;
                  break;
diff --git a/modules/core/src/opencl/lut.cl b/modules/core/src/opencl/lut.cl

index 8d58742..da92c2f 100644 (file)
--- a/modules/core/src/opencl/lut.cl
+++ b/modules/core/src/opencl/lut.cl
@@ -34,14 +34,6 @@
  //
  //
  
-#ifdef DOUBLE_SUPPORT
-#ifdef cl_amd_fp64
-#pragma OPENCL EXTENSION cl_amd_fp64:enable
-#elif defined (cl_khr_fp64)
-#pragma OPENCL EXTENSION cl_khr_fp64:enable
-#endif
-#endif
-
  __kernel void LUT(__global const uchar * srcptr, int src_step, int src_offset,
                    __global const uchar * lutptr, int lut_step, int lut_offset,
                    __global uchar * dstptr, int dst_step, int dst_offset, int rows, int cols)
@@ -51,8 +43,8 @@ __kernel void LUT(__global const uchar * srcptr, int src_step, int src_offset,
  
      if (x < cols && y < rows)
      {
-        int src_index = mad24(y, src_step, src_offset + x * (int)sizeof(srcT) * dcn);
-        int dst_index = mad24(y, dst_step, dst_offset + x * (int)sizeof(dstT) * dcn);
+        int src_index = mad24(y, src_step, mad24(x, (int)sizeof(srcT) * dcn, src_offset));
+        int dst_index = mad24(y, dst_step, mad24(x, (int)sizeof(dstT) * dcn, dst_offset));
  
          __global const srcT * src = (__global const srcT *)(srcptr + src_index);
          __global const dstT * lut = (__global const dstT *)(lutptr + lut_offset);
@@ -65,7 +57,7 @@ __kernel void LUT(__global const uchar * srcptr, int src_step, int src_offset,
  #else
          #pragma unroll
          for (int cn = 0; cn < dcn; ++cn)
-            dst[cn] = lut[src[cn] * dcn + cn];
+            dst[cn] = lut[mad24(src[cn], dcn, cn)];
  #endif
      }
  }
diff --git a/modules/core/src/opencl/mixchannels.cl b/modules/core/src/opencl/mixchannels.cl

index 7abd60a..bede20c 100644 (file)
--- a/modules/core/src/opencl/mixchannels.cl
+++ b/modules/core/src/opencl/mixchannels.cl
@@ -46,9 +46,9 @@
  #define DECLARE_OUTPUT_MAT(i) \
      __global uchar * dst##i##ptr, int dst##i##_step, int dst##i##_offset,
  #define PROCESS_ELEM(i) \
-    int src##i##_index = mad24(src##i##_step, y, x * (int)sizeof(T) * scn##i + src##i##_offset); \
+    int src##i##_index = mad24(src##i##_step, y, mad24(x, (int)sizeof(T) * scn##i, src##i##_offset)); \
      __global const T * src##i = (__global const T *)(src##i##ptr + src##i##_index); \
-    int dst##i##_index = mad24(dst##i##_step, y, x * (int)sizeof(T) * dcn##i + dst##i##_offset); \
+    int dst##i##_index = mad24(dst##i##_step, y, mad24(x, (int)sizeof(T) * dcn##i, dst##i##_offset)); \
      __global T * dst##i = (__global T *)(dst##i##ptr + dst##i##_index); \
      dst##i[0] = src##i[0];
  
diff --git a/modules/core/src/opencl/mulspectrums.cl b/modules/core/src/opencl/mulspectrums.cl

index 248ff00..817331e 100644 (file)
--- a/modules/core/src/opencl/mulspectrums.cl
+++ b/modules/core/src/opencl/mulspectrums.cl
@@ -45,7 +45,7 @@
  
  inline float2 cmulf(float2 a, float2 b)
  {
-    return (float2)(a.x * b.x - a.y * b.y, a.x * b.y + a.y * b.x);
+    return (float2)(mad(a.x, b.x, - a.y * b.y), mad(a.x, b.y, a.y * b.x));
  }
  
  inline float2 conjf(float2 a)
@@ -63,9 +63,9 @@ __kernel void mulAndScaleSpectrums(__global const uchar * src1ptr, int src1_step
  
      if (x < dst_cols && y < dst_rows)
      {
-        int src1_index = mad24(y, src1_step, x * (int)sizeof(float2) + src1_offset);
-        int src2_index = mad24(y, src2_step, x * (int)sizeof(float2) + src2_offset);
-        int dst_index = mad24(y, dst_step, x * (int)sizeof(float2) + dst_offset);
+        int src1_index = mad24(y, src1_step, mad24(x, (int)sizeof(float2), src1_offset));
+        int src2_index = mad24(y, src2_step, mad24(x, (int)sizeof(float2), src2_offset));
+        int dst_index = mad24(y, dst_step, mad24(x, (int)sizeof(float2), dst_offset));
  
          float2 src0 = *(__global const float2 *)(src1ptr + src1_index);
          float2 src1 = *(__global const float2 *)(src2ptr + src2_index);
diff --git a/modules/core/src/opencl/reduce.cl b/modules/core/src/opencl/reduce.cl

index febc1cb..0a0538e 100644 (file)
--- a/modules/core/src/opencl/reduce.cl
+++ b/modules/core/src/opencl/reduce.cl
@@ -58,20 +58,34 @@
  #define EXTRA_PARAMS
  #endif
  
+// accumulative reduction stuff
  #if defined OP_SUM || defined OP_SUM_ABS || defined OP_SUM_SQR || defined OP_DOT
  #ifdef OP_DOT
-#define FUNC(a, b, c) a += b * c
+#if ddepth <= 4
+#define FUNC(a, b, c) a = mad24(b, c, a)
+#else
+#define FUNC(a, b, c) a = mad(b, c, a)
+#endif
+
  #elif defined OP_SUM
  #define FUNC(a, b) a += b
+
  #elif defined OP_SUM_ABS
  #define FUNC(a, b) a += b >= (dstT)(0) ? b : -b
+
  #elif defined OP_SUM_SQR
-#define FUNC(a, b) a += b * b
+#if ddepth <= 4
+#define FUNC(a, b) a = mad24(b, b, a)
+#else
+#define FUNC(a, b) a = mad(b, b, a)
+#endif
  #endif
+
  #define DECLARE_LOCAL_MEM \
      __local dstT localmem[WGS2_ALIGNED]
  #define DEFINE_ACCUMULATOR \
      dstT accumulator = (dstT)(0)
+
  #ifdef HAVE_MASK
  #define REDUCE_GLOBAL \
      dstT temp = convertToDT(src[0]); \
@@ -80,7 +94,7 @@
          FUNC(accumulator, temp)
  #elif defined OP_DOT
  #define REDUCE_GLOBAL \
-    int src2_index = mad24(id / cols, src2_step, src2_offset + (id % cols) * (int)sizeof(srcT)); \
+    int src2_index = mad24(id / cols, src2_step, mad24(id % cols, (int)sizeof(srcT), src2_offset)); \
      __global const srcT * src2 = (__global const srcT *)(src2ptr + src2_index); \
      dstT temp = convertToDT(src[0]), temp2 = convertToDT(src2[0]); \
      FUNC(accumulator, temp, temp2)
@@ -89,6 +103,7 @@
      dstT temp = convertToDT(src[0]); \
      FUNC(accumulator, temp)
  #endif
+
  #define SET_LOCAL_1 \
      localmem[lid] = accumulator
  #define REDUCE_LOCAL_1 \
@@ -99,6 +114,7 @@
      __global dstT * dst = (__global dstT *)(dstptr + (int)sizeof(dstT) * gid); \
      dst[0] = localmem[0]
  
+// countNonZero stuff
  #elif defined OP_COUNT_NON_ZERO
  #define dstT int
  #define DECLARE_LOCAL_MEM \
@@ -118,6 +134,7 @@
      __global dstT * dst = (__global dstT *)(dstptr + (int)sizeof(dstT) * gid); \
      dst[0] = localmem[0]
  
+// minMaxLoc stuff
  #elif defined OP_MIN_MAX_LOC || defined OP_MIN_MAX_LOC_MASK
  
  #ifdef DEPTH_0
@@ -179,8 +196,8 @@
  #define REDUCE_LOCAL_1 \
      srcT oldmin = localmem_min[lid-WGS2_ALIGNED]; \
      srcT oldmax = localmem_max[lid-WGS2_ALIGNED]; \
-    localmem_min[lid - WGS2_ALIGNED] = min(minval,localmem_min[lid-WGS2_ALIGNED]); \
-    localmem_max[lid - WGS2_ALIGNED] = max(maxval,localmem_max[lid-WGS2_ALIGNED]); \
+    localmem_min[lid - WGS2_ALIGNED] = min(minval, localmem_min[lid-WGS2_ALIGNED]); \
+    localmem_max[lid - WGS2_ALIGNED] = max(maxval, localmem_max[lid-WGS2_ALIGNED]); \
      srcT minv = localmem_min[lid - WGS2_ALIGNED], maxv = localmem_max[lid - WGS2_ALIGNED]; \
      localmem_minloc[lid - WGS2_ALIGNED] = (minv == minval) ? (minv == oldmin) ? \
          min(minloc, localmem_minloc[lid-WGS2_ALIGNED]) : minloc : localmem_minloc[lid-WGS2_ALIGNED]; \
@@ -233,15 +250,17 @@
  
  #else
  #error "No operation"
-#endif
+#endif // end of minMaxLoc stuff
  
  #ifdef OP_MIN_MAX_LOC
  #undef EXTRA_PARAMS
  #define EXTRA_PARAMS , __global uchar * dstptr2, __global int * dstlocptr, __global int * dstlocptr2
+
  #elif defined OP_MIN_MAX_LOC_MASK
  #undef EXTRA_PARAMS
  #define EXTRA_PARAMS , __global uchar * dstptr2, __global int * dstlocptr, __global int * dstlocptr2, \
      __global const uchar * maskptr, int mask_step, int mask_offset
+
  #elif defined OP_DOT
  #undef EXTRA_PARAMS
  #define EXTRA_PARAMS , __global uchar * src2ptr, int src2_step, int src2_offset
@@ -259,7 +278,7 @@ __kernel void reduce(__global const uchar * srcptr, int src_step, int src_offset
  
      for (int grain = groupnum * WGS; id < total; id += grain)
      {
-        int src_index = mad24(id / cols, src_step, src_offset + (id % cols) * (int)sizeof(srcT));
+        int src_index = mad24(id / cols, src_step, mad24(id % cols, (int)sizeof(srcT), src_offset));
          __global const srcT * src = (__global const srcT *)(srcptr + src_index);
          REDUCE_GLOBAL;
      }
diff --git a/modules/core/src/opencl/reduce2.cl b/modules/core/src/opencl/reduce2.cl

index f8ff6a2..ef6a860 100644 (file)
--- a/modules/core/src/opencl/reduce2.cl
+++ b/modules/core/src/opencl/reduce2.cl
@@ -98,7 +98,7 @@ __kernel void reduce(__global const uchar * srcptr, int src_step, int src_offset
      int x = get_global_id(0);
      if (x < cols)
      {
-        int src_index = x * (int)sizeof(srcT) * cn + src_offset;
+        int src_index = mad24(x, (int)sizeof(srcT) * cn, src_offset);
          __global dstT * dst = (__global dstT *)(dstptr + dst_offset) + x * cn;
          dstT tmp[cn] = { INIT_VALUE };
  
diff --git a/modules/core/src/opencl/set_identity.cl b/modules/core/src/opencl/set_identity.cl

index de8caaf..d63ce79 100644 (file)
--- a/modules/core/src/opencl/set_identity.cl
+++ b/modules/core/src/opencl/set_identity.cl
@@ -51,7 +51,7 @@ __kernel void setIdentity(__global uchar * srcptr, int src_step, int src_offset,
  
      if (x < cols && y < rows)
      {
-        int src_index = mad24(y, src_step, src_offset + x * (int)sizeof(T));
+        int src_index = mad24(y, src_step, mad24(x, (int)sizeof(T), src_offset));
          __global T * src = (__global T *)(srcptr + src_index);
  
          src[0] = x == y ? scalar : (T)(0);
diff --git a/modules/core/src/opencl/split_merge.cl b/modules/core/src/opencl/split_merge.cl

index d246275..8a1bc49 100644 (file)
--- a/modules/core/src/opencl/split_merge.cl
+++ b/modules/core/src/opencl/split_merge.cl
@@ -45,7 +45,7 @@
  
  #define DECLARE_SRC_PARAM(index) __global const uchar * src##index##ptr, int src##index##_step, int src##index##_offset,
  #define DECLARE_DATA(index) __global const T * src##index = \
-    (__global T *)(src##index##ptr + mad24(src##index##_step, y, x * (int)sizeof(T) + src##index##_offset));
+    (__global T *)(src##index##ptr + mad24(src##index##_step, y, mad24(x, (int)sizeof(T), src##index##_offset)));
  #define PROCESS_ELEM(index) dst[index] = src##index[0];
  
  __kernel void merge(DECLARE_SRC_PARAMS_N
@@ -58,7 +58,7 @@ __kernel void merge(DECLARE_SRC_PARAMS_N
      if (x < cols && y < rows)
      {
          DECLARE_DATA_N
-        __global T * dst = (__global T *)(dstptr + mad24(dst_step, y, x * (int)sizeof(T) * cn + dst_offset));
+        __global T * dst = (__global T *)(dstptr + mad24(dst_step, y, mad24(x, (int)sizeof(T) * cn, dst_offset)));
          PROCESS_ELEMS_N
      }
  }
@@ -67,7 +67,7 @@ __kernel void merge(DECLARE_SRC_PARAMS_N
  
  #define DECLARE_DST_PARAM(index) , __global uchar * dst##index##ptr, int dst##index##_step, int dst##index##_offset
  #define DECLARE_DATA(index) __global T * dst##index = \
-    (__global T *)(dst##index##ptr + mad24(y, dst##index##_step, x * (int)sizeof(T) + dst##index##_offset));
+    (__global T *)(dst##index##ptr + mad24(y, dst##index##_step, mad24(x, (int)sizeof(T), dst##index##_offset)));
  #define PROCESS_ELEM(index) dst##index[0] = src[index];
  
  __kernel void split(__global uchar* srcptr, int src_step, int src_offset, int rows, int cols DECLARE_DST_PARAMS)
@@ -78,7 +78,7 @@ __kernel void split(__global uchar* srcptr, int src_step, int src_offset, int ro
      if (x < cols && y < rows)
      {
          DECLARE_DATA_N
-        __global const T * src = (__global const T *)(srcptr + mad24(y, src_step, x *  cn * (int)sizeof(T) + src_offset));
+        __global const T * src = (__global const T *)(srcptr + mad24(y, src_step, mad24(x, cn * (int)sizeof(T), src_offset)));
          PROCESS_ELEMS_N
      }
  }
diff --git a/modules/core/src/opencl/transpose.cl b/modules/core/src/opencl/transpose.cl

index da9608c..575cdab 100644 (file)
--- a/modules/core/src/opencl/transpose.cl
+++ b/modules/core/src/opencl/transpose.cl
@@ -60,7 +60,7 @@ __kernel void transpose(__global const uchar * srcptr, int src_step, int src_off
      }
      else
      {
-        int bid = gp_x + gs_x * gp_y;
+        int bid = mad24(gs_x, gp_y, gp_x);
          groupId_y =  bid % gs_y;
          groupId_x = ((bid / gs_y) + groupId_y) % gs_x;
      }
@@ -68,23 +68,23 @@ __kernel void transpose(__global const uchar * srcptr, int src_step, int src_off
      int lx = get_local_id(0);
      int ly = get_local_id(1);
  
-    int x = groupId_x * TILE_DIM + lx;
-    int y = groupId_y * TILE_DIM + ly;
+    int x = mad24(groupId_x, TILE_DIM, lx);
+    int y = mad24(groupId_y, TILE_DIM, ly);
  
-    int x_index = groupId_y * TILE_DIM + lx;
-    int y_index = groupId_x * TILE_DIM + ly;
+    int x_index = mad24(groupId_y, TILE_DIM, lx);
+    int y_index = mad24(groupId_x, TILE_DIM, ly);
  
      __local T title[TILE_DIM * LDS_STEP];
  
      if (x < src_cols && y < src_rows)
      {
-        int index_src = mad24(y, src_step, x * (int)sizeof(T) + src_offset);
+        int index_src = mad24(y, src_step, mad24(x, (int)sizeof(T), src_offset));
  
          for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
              if (y + i < src_rows)
              {
                  __global const T * src = (__global const T *)(srcptr + index_src);
-                title[(ly + i) * LDS_STEP + lx] = src[0];
+                title[mad24(ly + i, LDS_STEP, lx)] = src[0];
                  index_src = mad24(BLOCK_ROWS, src_step, index_src);
              }
      }
@@ -92,13 +92,13 @@ __kernel void transpose(__global const uchar * srcptr, int src_step, int src_off
  
      if (x_index < src_rows && y_index < src_cols)
      {
-        int index_dst = mad24(y_index, dst_step, x_index * (int)sizeof(T) + dst_offset);
+        int index_dst = mad24(y_index, dst_step, mad24(x_index, (int)sizeof(T), dst_offset));
  
          for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
              if ((y_index + i) < src_cols)
              {
                  __global T * dst = (__global T *)(dstptr + index_dst);
-                dst[0] = title[lx * LDS_STEP + ly + i];
+                dst[0] = title[mad24(lx, LDS_STEP, ly + i)];
                  index_dst = mad24(BLOCK_ROWS, dst_step, index_dst);
              }
      }
@@ -111,8 +111,8 @@ __kernel void transpose_inplace(__global uchar * srcptr, int src_step, int src_o
  
      if (y < src_rows && x < y)
      {
-        int src_index = mad24(y, src_step, src_offset + x * (int)sizeof(T));
-        int dst_index = mad24(x, src_step, src_offset + y * (int)sizeof(T));
+        int src_index = mad24(y, src_step, mad24(x, (int)sizeof(T), src_offset));
+        int dst_index = mad24(x, src_step, mad24(y, (int)sizeof(T), src_offset));
  
          __global T * src = (__global T *)(srcptr + src_index);
          __global T * dst = (__global T *)(srcptr + dst_index);
diff --git a/modules/core/src/stat.cpp b/modules/core/src/stat.cpp

index 2830bd1..ad07f93 100644 (file)
--- a/modules/core/src/stat.cpp
+++ b/modules/core/src/stat.cpp
@@ -494,8 +494,8 @@ static bool ocl_sum( InputArray _src, Scalar & res, int sum_op, InputArray _mask
      static const char * const opMap[3] = { "OP_SUM", "OP_SUM_ABS", "OP_SUM_SQR" };
      char cvt[40];
      ocl::Kernel k("reduce", ocl::core::reduce_oclsrc,
-                  format("-D srcT=%s -D dstT=%s -D convertToDT=%s -D %s -D WGS=%d -D WGS2_ALIGNED=%d%s%s",
-                         ocl::typeToStr(type), ocl::typeToStr(dtype), ocl::convertTypeStr(depth, ddepth, cn, cvt),
+                  format("-D srcT=%s -D dstT=%s -D ddepth=%d -D convertToDT=%s -D %s -D WGS=%d -D WGS2_ALIGNED=%d%s%s",
+                         ocl::typeToStr(type), ocl::typeToStr(dtype), ddepth, ocl::convertTypeStr(depth, ddepth, cn, cvt),
                           opMap[sum_op], (int)wgs, wgs2_aligned,
                           doubleSupport ? " -D DOUBLE_SUPPORT" : "",
                           haveMask ? " -D HAVE_MASK" : ""));
diff --git a/modules/core/src/umatrix.cpp b/modules/core/src/umatrix.cpp

index 578fefb..7ace751 100644 (file)
--- a/modules/core/src/umatrix.cpp
+++ b/modules/core/src/umatrix.cpp
@@ -719,10 +719,14 @@ void UMat::convertTo(OutputArray _dst, int _type, double alpha, double beta) con
      if( dims <= 2 && cn && _dst.isUMat() && ocl::useOpenCL() &&
              ((needDouble && doubleSupport) || !needDouble) )
      {
-        char cvt[40];
+        int wdepth = std::max(CV_32F, sdepth);
+
+        char cvt[2][40];
          ocl::Kernel k("convertTo", ocl::core::convert_oclsrc,
-                      format("-D srcT=%s -D dstT=%s -D convertToDT=%s%s", ocl::typeToStr(sdepth),
-                             ocl::typeToStr(ddepth), ocl::convertTypeStr(CV_32F, ddepth, 1, cvt),
+                      format("-D srcT=%s -D WT=%s -D dstT=%s -D convertToWT=%s -D convertToDT=%s%s",
+                             ocl::typeToStr(sdepth), ocl::typeToStr(wdepth), ocl::typeToStr(ddepth),
+                             ocl::convertTypeStr(sdepth, wdepth, 1, cvt[0]),
+                             ocl::convertTypeStr(wdepth, ddepth, 1, cvt[1]),
                               doubleSupport ? " -D DOUBLE_SUPPORT" : ""));
          if (!k.empty())
          {
@@ -731,7 +735,13 @@ void UMat::convertTo(OutputArray _dst, int _type, double alpha, double beta) con
              UMat dst = _dst.getUMat();
  
              float alphaf = (float)alpha, betaf = (float)beta;
-            k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst, cn), alphaf, betaf);
+            ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src),
+                    dstarg = ocl::KernelArg::WriteOnly(dst, cn);
+
+            if (wdepth == CV_32F)
+                k.args(srcarg, dstarg, alphaf, betaf);
+            else
+                k.args(srcarg, dstarg, alpha, beta);
  
              size_t globalsize[2] = { dst.cols * cn, dst.rows };
              if (k.run(2, globalsize, NULL, false))
@@ -838,8 +848,8 @@ static bool ocl_dot( InputArray _src1, InputArray _src2, double & res )
  
      char cvt[40];
      ocl::Kernel k("reduce", ocl::core::reduce_oclsrc,
-                  format("-D srcT=%s -D dstT=%s -D convertToDT=%s -D OP_DOT -D WGS=%d -D WGS2_ALIGNED=%d%s",
-                         ocl::typeToStr(depth), ocl::typeToStr(ddepth), ocl::convertTypeStr(depth, ddepth, 1, cvt),
+                  format("-D srcT=%s -D dstT=%s -D ddepth=%d -D convertToDT=%s -D OP_DOT -D WGS=%d -D WGS2_ALIGNED=%d%s",
+                         ocl::typeToStr(depth), ocl::typeToStr(ddepth), ddepth, ocl::convertTypeStr(depth, ddepth, 1, cvt),
                           (int)wgs, wgs2_aligned, doubleSupport ? " -D DOUBLE_SUPPORT" : ""));
      if (k.empty())
          return false;
author	Ilya Lavrenov <ilya.lavrenov@itseez.com>
	Mon, 24 Feb 2014 20:29:17 +0000 (00:29 +0400)
committer	Ilya Lavrenov <ilya.lavrenov@itseez.com>
	Tue, 25 Feb 2014 10:14:42 +0000 (14:14 +0400)
modules/core/include/opencv2/core/opencl/ocl_defs.hpp		patch \| blob \| history
modules/core/src/arithm.cpp		patch \| blob \| history
modules/core/src/convert.cpp		patch \| blob \| history
modules/core/src/mathfuncs.cpp		patch \| blob \| history
modules/core/src/matmul.cpp		patch \| blob \| history
modules/core/src/opencl/arithm.cl		patch \| blob \| history
modules/core/src/opencl/convert.cl		patch \| blob \| history
modules/core/src/opencl/copymakeborder.cl		patch \| blob \| history
modules/core/src/opencl/copyset.cl		patch \| blob \| history
modules/core/src/opencl/flip.cl		patch \| blob \| history
modules/core/src/opencl/inrange.cl		patch \| blob \| history
modules/core/src/opencl/lut.cl		patch \| blob \| history
modules/core/src/opencl/mixchannels.cl		patch \| blob \| history
modules/core/src/opencl/mulspectrums.cl		patch \| blob \| history
modules/core/src/opencl/reduce.cl		patch \| blob \| history
modules/core/src/opencl/reduce2.cl		patch \| blob \| history
modules/core/src/opencl/set_identity.cl		patch \| blob \| history
modules/core/src/opencl/split_merge.cl		patch \| blob \| history
modules/core/src/opencl/transpose.cl		patch \| blob \| history
modules/core/src/stat.cpp		patch \| blob \| history
modules/core/src/umatrix.cpp		patch \| blob \| history