From 7f2662b310489d3336cadf46c386d271ebf09ae0 Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov <ilya.lavrenov@itseez.com>
Date: Mon, 9 Jun 2014 00:50:14 +0400
Subject: [PATCH] fixes

---
 .../core/include/opencv2/core/opencl/ocl_defs.hpp  |  2 +-
 modules/core/src/opencl/minmaxloc.cl               | 40 +++++++++++-------
 modules/core/src/opencl/reduce.cl                  | 47 ++++++++++++----------
 modules/core/src/stat.cpp                          | 19 ++++-----
 4 files changed, 62 insertions(+), 46 deletions(-)

diff --git a/modules/core/include/opencv2/core/opencl/ocl_defs.hpp b/modules/core/include/opencv2/core/opencl/ocl_defs.hpp
index 55f8849..76d4f84 100644
--- a/modules/core/include/opencv2/core/opencl/ocl_defs.hpp
+++ b/modules/core/include/opencv2/core/opencl/ocl_defs.hpp
@@ -5,7 +5,7 @@
 // Copyright (C) 2014, Advanced Micro Devices, Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 
-//#define CV_OPENCL_RUN_ASSERT
+#define CV_OPENCL_RUN_ASSERT
 
 #ifdef HAVE_OPENCL
 
diff --git a/modules/core/src/opencl/minmaxloc.cl b/modules/core/src/opencl/minmaxloc.cl
index 11b6da9..56de655 100644
--- a/modules/core/src/opencl/minmaxloc.cl
+++ b/modules/core/src/opencl/minmaxloc.cl
@@ -15,16 +15,16 @@
 
 #ifdef DEPTH_0
 #define MIN_VAL 0
-#define MAX_VAL 255
+#define MAX_VAL UCHAR_MAX
 #elif defined DEPTH_1
-#define MIN_VAL -128
-#define MAX_VAL 127
+#define MIN_VAL SCHAR_MIN
+#define MAX_VAL SCHAR_MAX
 #elif defined DEPTH_2
 #define MIN_VAL 0
-#define MAX_VAL 65535
+#define MAX_VAL USHRT_MAX
 #elif defined DEPTH_3
-#define MIN_VAL -32768
-#define MAX_VAL 32767
+#define MIN_VAL SHRT_MIN
+#define MAX_VAL SHRT_MAX
 #elif defined DEPTH_4
 #define MIN_VAL INT_MIN
 #define MAX_VAL INT_MAX
@@ -39,6 +39,14 @@
 #define noconvert
 #define INDEX_MAX UINT_MAX
 
+#if kercn != 3
+#define loadpix(addr) *(__global const srcT *)(addr)
+#define srcTSIZE (int)sizeof(srcT1)
+#else
+#define loadpix(addr) vload3(0, (__global const srcT1 *)(addr))
+#define srcTSIZE ((int)sizeof(srcT1))
+#endif
+
 #ifdef NEED_MINLOC
 #define CALC_MINLOC(inc) minloc = id + inc
 #else
@@ -154,22 +162,22 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off
 #endif
         {
 #ifdef HAVE_SRC_CONT
-            src_index = mul24(id, (int)sizeof(srcT1));
+            src_index = mul24(id, srcTSIZE);
 #else
-            src_index = mad24(id / cols, src_step, mul24(id % cols, (int)sizeof(srcT1)));
+            src_index = mad24(id / cols, src_step, mul24(id % cols, srcTSIZE));
 #endif
-            temp = convertToDT(*(__global const srcT *)(srcptr + src_index));
+            temp = convertToDT(loadpix(srcptr + src_index));
 #ifdef OP_ABS
             temp = temp >= (dstT)(0) ? temp : -temp;
 #endif
 
 #ifdef HAVE_SRC2
 #ifdef HAVE_SRC2_CONT
-            src2_index = mul24(id, (int)sizeof(srcT1));
+            src2_index = mul24(id, srcTSIZE);
 #else
-            src2_index = mad24(id / cols, src2_step, mul24(id % cols, (int)sizeof(srcT1)));
+            src2_index = mad24(id / cols, src2_step, mul24(id % cols, srcTSIZE));
 #endif
-            temp2 = convertToDT(*(__global const srcT *)(src2ptr + src2_index));
+            temp2 = convertToDT(loadpix(src2ptr + src2_index));
             temp = temp > temp2 ? temp - temp2 : (temp2 - temp);
 #ifdef OP_CALC2
             temp2 = temp2 >= (dstT)(0) ? temp2 : -temp2;
@@ -202,8 +210,9 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off
 #elif kercn >= 2
             CALC_P(s0, 0)
             CALC_P(s1, 1)
-#if kercn >= 4
+#if kercn >= 3
             CALC_P(s2, 2)
+#if kercn >= 4
             CALC_P(s3, 3)
 #if kercn >= 8
             CALC_P(s4, 4)
@@ -223,6 +232,7 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off
 #endif
 #endif
 #endif
+#endif
         }
     }
 
@@ -335,9 +345,11 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off
 #endif
 #ifdef NEED_MAXLOC
         *(__global uint *)(dstptr + mad24(gid, (int)sizeof(uint), pos)) = localmem_maxloc[0];
-#endif
 #ifdef OP_CALC2
         pos = mad24(groupnum, (int)sizeof(uint), pos);
+#endif
+#endif
+#ifdef OP_CALC2
         *(__global dstT1 *)(dstptr + mad24(gid, (int)sizeof(dstT1), pos)) = localmem_max2[0];
 #endif
     }
diff --git a/modules/core/src/opencl/reduce.cl b/modules/core/src/opencl/reduce.cl
index 92818e3..9418cec 100644
--- a/modules/core/src/opencl/reduce.cl
+++ b/modules/core/src/opencl/reduce.cl
@@ -148,11 +148,9 @@
 
 #ifdef OP_CALC2
 #define DECLARE_LOCAL_MEM \
-    __local dstT localmem[WGS2_ALIGNED]; \
-    __local dstT localmem2[WGS2_ALIGNED]
+    __local dstT localmem[WGS2_ALIGNED], localmem2[WGS2_ALIGNED]
 #define DEFINE_ACCUMULATOR \
-    dstT accumulator = (dstT)(0); \
-    dstT accumulator2 = (dstT)(0)
+    dstT accumulator = (dstT)(0), accumulator2 = (dstT)(0)
 #else
 #define DECLARE_LOCAL_MEM \
     __local dstT localmem[WGS2_ALIGNED]
@@ -163,10 +161,10 @@
 #ifdef HAVE_SRC2
 #ifdef OP_CALC2
 #define PROCESS_ELEMS \
-    dstT temp = convertToDT(loadpix(srcptr + src_index)) - convertToDT(loadpix(src2ptr + src2_index)); \
+    dstT temp = convertToDT(loadpix(srcptr + src_index)); \
     dstT temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
-    temp -= temp2; \
-    temp = temp > (dstT)(0) ? temp : -temp; \
+    temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
+    temp2 = temp2 >= (dstT)(0) ? temp2 : -temp2; \
     FUNC(accumulator2, temp2); \
     FUNC(accumulator, temp)
 #else
@@ -258,6 +256,7 @@
     dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
     dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
     temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
+    temp2 = temp2 >= (dstT)(0) ? temp2 : -temp2; \
     FUNC(accumulator, temp); \
     FUNC(accumulator2, temp2)
 #elif kercn == 2
@@ -265,6 +264,7 @@
     dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
     dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
     temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
+    temp2 = temp2 >= (dstT)(0) ? temp2 : -temp2; \
     FUNC(accumulator, temp.s0); \
     FUNC(accumulator, temp.s1); \
     FUNC(accumulator2, temp2.s0); \
@@ -274,6 +274,7 @@
     dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
     dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
     temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
+    temp2 = temp2 >= (dstT)(0) ? temp2 : -temp2; \
     FUNC(accumulator, temp.s0); \
     FUNC(accumulator, temp.s1); \
     FUNC(accumulator, temp.s2); \
@@ -287,6 +288,7 @@
     dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
     dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
     temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
+    temp2 = temp2 >= (dstT)(0) ? temp2 : -temp2; \
     FUNC(accumulator, temp.s0); \
     FUNC(accumulator, temp.s1); \
     FUNC(accumulator, temp.s2); \
@@ -308,6 +310,7 @@
     dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
     dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
     temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
+    temp2 = temp2 >= (dstT)(0) ? temp2 : -temp2; \
     FUNC(accumulator, temp.s0); \
     FUNC(accumulator, temp.s1); \
     FUNC(accumulator, temp.s2); \
@@ -452,6 +455,20 @@
 #endif
 #endif
 
+#ifdef OP_CALC2
+#define SET_LOCAL_1 \
+    localmem[lid] = accumulator; \
+    localmem2[lid] = accumulator2
+#define REDUCE_LOCAL_1 \
+    localmem[lid - WGS2_ALIGNED] += accumulator; \
+    localmem2[lid - WGS2_ALIGNED] += accumulator2
+#define REDUCE_LOCAL_2 \
+    localmem[lid] += localmem[lid2]; \
+    localmem2[lid] += localmem2[lid2]
+#define CALC_RESULT \
+    storepix(localmem[0], dstptr + dstTSIZE * gid); \
+    storepix(localmem2[0], dstptr + mad24(groupnum, dstTSIZE, dstTSIZE * gid))
+#else
 #define SET_LOCAL_1 \
     localmem[lid] = accumulator
 #define REDUCE_LOCAL_1 \
@@ -460,6 +477,7 @@
     localmem[lid] += localmem[lid2]
 #define CALC_RESULT \
     storepix(localmem[0], dstptr + dstTSIZE * gid)
+#endif
 
 // countNonZero stuff
 #elif defined OP_COUNT_NON_ZERO
@@ -516,20 +534,6 @@
     accumulator += value.sF == zero ? zero : one
 #endif
 
-#ifdef OP_CALC2
-#define SET_LOCAL_1 \
-    localmem[lid] = accumulator; \
-    localmem2[lid] = accumulator2; \
-#define REDUCE_LOCAL_1 \
-    localmem[lid - WGS2_ALIGNED] += accumulator; \
-    localmem2[lid - WGS2_ALIGNED] += accumulator2
-#define REDUCE_LOCAL_2 \
-    localmem[lid] += localmem[lid2]; \
-    localmem2[lid] += localmem2[lid2]
-#define CALC_RESULT \
-    storepix(localmem[0], dstptr + dstTSIZE * gid); \
-    storepix(localmem2[0], dstptr + mad24(groupnum, srcTSIZE, dstTSIZE * gid))
-#else
 #define SET_LOCAL_1 \
     localmem[lid] = accumulator
 #define REDUCE_LOCAL_1 \
@@ -538,7 +542,6 @@
     localmem[lid] += localmem[lid2]
 #define CALC_RESULT \
     storepix(localmem[0], dstptr + dstTSIZE * gid)
-#endif
 
 // norm (NORM_INF) with cn > 1 and mask
 #elif defined OP_NORM_INF_MASK
diff --git a/modules/core/src/stat.cpp b/modules/core/src/stat.cpp
index 34c487a..01f50fa 100644
--- a/modules/core/src/stat.cpp
+++ b/modules/core/src/stat.cpp
@@ -550,9 +550,9 @@ static bool ocl_sum( InputArray _src, Scalar & res, int sum_op, InputArray _mask
 
         Mat mres = db.getMat(ACCESS_READ);
         if (calc2)
-            const_cast<Scalar &>(res2) = func(mres.colRange(dbsize, dbsize));
+            const_cast<Scalar &>(res2) = func(mres.colRange(ngroups, dbsize));
 
-        res = func(mres.colRange(0, dbsize));
+        res = func(mres.colRange(0, ngroups));
         return true;
     }
     return false;
@@ -1434,10 +1434,10 @@ static bool ocl_minMaxIdx( InputArray _src, double* minVal, double* maxVal, int*
     bool doubleSupport = dev.doubleFPConfig() > 0, haveMask = !_mask.empty(),
         haveSrc2 = _src2.kind() != _InputArray::NONE;
     int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type),
-            kercn = haveMask ? 1 : std::min(4, ocl::predictOptimalVectorWidth(_src));
+            kercn = haveMask ? cn : std::min(4, ocl::predictOptimalVectorWidth(_src));
 
-    CV_Assert( (cn == 1 && (_mask.empty() || _mask.type() == CV_8U)) ||
-              (cn >= 1 && _mask.empty() && !minLoc && !maxLoc) );
+    CV_Assert( (cn == 1 && (!haveMask || _mask.type() == CV_8U)) ||
+              (cn >= 1 && (!haveMask || haveSrc2) && !minLoc && !maxLoc) );
 
     if (ddepth < 0)
         ddepth = depth;
@@ -1484,6 +1484,8 @@ static bool ocl_minMaxIdx( InputArray _src, double* minVal, double* maxVal, int*
                          haveSrc2 ? " -D HAVE_SRC2" : "", maxVal2 ? " -D OP_CALC2" : "",
                          haveSrc2 && _src2.isContinuous() ? " -D HAVE_SRC2_CONT" : "");
 
+    printf("%s\n", opts.c_str());
+
     ocl::Kernel k("minmaxloc", ocl::core::minmaxloc_oclsrc, opts);
     if (k.empty())
         return false;
@@ -2556,9 +2558,9 @@ static bool ocl_norm( InputArray _src1, InputArray _src2, int normType, InputArr
 {
     Scalar sc1, sc2;
     int type = _src1.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
-    bool relative = (normType & NORM_RELATIVE) != 0,
-        normsum = normType == NORM_L1 || normType == NORM_L2 || normType == NORM_L2SQR;
+    bool relative = (normType & NORM_RELATIVE) != 0;
     normType &= ~NORM_RELATIVE;
+    bool normsum = normType == NORM_L1 || normType == NORM_L2 || normType == NORM_L2SQR;
 
     if ( !(normType == NORM_INF || normsum) )
         return false;
@@ -2608,8 +2610,7 @@ double cv::norm( InputArray _src1, InputArray _src2, int normType, InputArray _m
 
 #ifdef HAVE_OPENCL
     double _result = 0;
-    CV_OCL_RUN_(_src1.isUMat() && _src2.isUMat() &&
-                _src1.dims() <= 2 && _src2.dims() <= 2,
+    CV_OCL_RUN_(_src1.isUMat(),
                 ocl_norm(_src1, _src2, normType, _mask, _result),
                 _result)
 #endif
-- 
2.7.4