From b18101b15aab3611696c14d9cbd9c445f73e328d Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov <ilya.lavrenov@itseez.com>
Date: Sat, 12 Oct 2013 00:58:58 +0400
Subject: [PATCH] some optimization of binary ocl::bitwise operations

---
 modules/ocl/src/arithm.cpp                         | 54 +++++++++++++++-------
 modules/ocl/src/opencl/arithm_add.cl               |  6 +--
 modules/ocl/src/opencl/arithm_add_scalar.cl        | 12 ++---
 modules/ocl/src/opencl/arithm_add_scalar_mask.cl   |  8 ++--
 modules/ocl/src/opencl/arithm_bitwise_binary.cl    | 21 +++++++--
 .../ocl/src/opencl/arithm_bitwise_binary_mask.cl   | 19 +++++++-
 .../ocl/src/opencl/arithm_bitwise_binary_scalar.cl | 24 ++++++++--
 .../opencl/arithm_bitwise_binary_scalar_mask.cl    | 23 +++++++--
 8 files changed, 124 insertions(+), 43 deletions(-)
diff --git a/modules/ocl/src/arithm.cpp b/modules/ocl/src/arithm.cpp
index 2d54385..5a3820e 100644
--- a/modules/ocl/src/arithm.cpp
+++ b/modules/ocl/src/arithm.cpp
@@ -56,6 +56,23 @@
 using namespace cv;
 using namespace cv::ocl;
 
+static std::vector<uchar> scalarToVector(const cv::Scalar & sc, int depth, int ocn, int cn)
+{
+    CV_Assert(ocn == cn || (ocn == 4 && cn == 3));
+
+    static const int sizeMap[] = { sizeof(uchar), sizeof(char), sizeof(ushort),
+                               sizeof(short), sizeof(int), sizeof(float), sizeof(double) };
+
+    int elemSize1 = sizeMap[depth];
+    int bufSize = elemSize1 * ocn;
+    std::vector<uchar> _buf(bufSize);
+    uchar * buf = &_buf[0];
+    scalarToRawData(sc, buf, CV_MAKE_TYPE(depth, cn));
+    memset(buf + elemSize1 * cn, 0, (ocn - cn) * elemSize1);
+
+    return _buf;
+}
+
 //////////////////////////////////////////////////////////////////////////////
 /////////////// add subtract multiply divide min max /////////////////////////
 //////////////////////////////////////////////////////////////////////////////
@@ -84,7 +101,7 @@ static void arithmetic_run_generic(const oclMat &src1, const oclMat &src2, const
     int src2step1 = src2.step / src2.elemSize(), src2offset1 = src2.offset / src2.elemSize();
     int maskstep1 = mask.step, maskoffset1 = mask.offset / mask.elemSize();
     int dststep1 = dst.step / dst.elemSize(), dstoffset1 = dst.offset / dst.elemSize();
-    oclMat m;
+    std::vector<uchar> m;
 
     size_t localThreads[3]  = { 16, 16, 1 };
     size_t globalThreads[3] = { dst.cols, dst.rows, 1 };
@@ -132,10 +149,9 @@ static void arithmetic_run_generic(const oclMat &src1, const oclMat &src2, const
     if (haveScalar)
     {
         const int WDepthMap[] = { CV_16S, CV_16S, CV_32S, CV_32S, CV_32S, CV_32F, CV_64F };
-        m.create(1, 1, CV_MAKE_TYPE(WDepthMap[WDepth], oclChannels));
-        m.setTo(scalar);
+        m = scalarToVector(scalar, WDepthMap[WDepth], oclChannels, src1.channels());
 
-        args.push_back( make_pair( sizeof(cl_mem), (void *)&m.data ));
+        args.push_back( make_pair( m.size(), (void *)&m[0]));
 
         kernelName += "_scalar";
     }
@@ -1329,6 +1345,13 @@ static void bitwise_unary_run(const oclMat &src1, oclMat &dst, string kernelName
 
 enum { AND = 0, OR, XOR };
 
+static std::string to_string(int value)
+{
+    std::ostringstream stream;
+    stream << value;
+    return stream.str();
+}
+
 static void bitwise_binary_run(const oclMat &src1, const oclMat &src2, const Scalar& src3, const oclMat &mask,
                                oclMat &dst, int operationType)
 {
@@ -1337,17 +1360,20 @@ static void bitwise_binary_run(const oclMat &src1, const oclMat &src2, const Sca
     CV_Assert(mask.empty() || (!mask.empty() && mask.type() == CV_8UC1 && mask.size() == src1.size()));
 
     dst.create(src1.size(), src1.type());
-
-    int elemSize = dst.elemSize();
-    int cols1 = dst.cols * elemSize;
     oclMat m;
 
     const char operationMap[] = { '&', '|', '^' };
     std::string kernelName("arithm_bitwise_binary");
-    std::string buildOptions = format("-D Operation=%c", operationMap[operationType]);
+
+    int vlen = std::min<int>(8, src1.elemSize1() * src1.oclchannels());
+    std::string vlenstr = vlen > 1 ? to_string(vlen) : "";
+    std::string buildOptions = format("-D Operation=%c -D vloadn=vload%s -D vstoren=vstore%s -D elemSize=%d -D vlen=%d"
+                                      " -D ucharv=uchar%s",
+                                      operationMap[operationType], vlenstr.c_str(), vlenstr.c_str(),
+                                      (int)src1.elemSize(), vlen, vlenstr.c_str());
 
     size_t localThreads[3]  = { 16, 16, 1 };
-    size_t globalThreads[3] = { cols1, dst.rows, 1 };
+    size_t globalThreads[3] = { dst.cols, dst.rows, 1 };
 
     vector<pair<size_t , const void *> > args;
     args.push_back( make_pair( sizeof(cl_mem), (void *)&src1.data ));
@@ -1360,7 +1386,6 @@ static void bitwise_binary_run(const oclMat &src1, const oclMat &src2, const Sca
         m.setTo(src3);
 
         args.push_back( make_pair( sizeof(cl_mem), (void *)&m.data ));
-        args.push_back( make_pair( sizeof(cl_int), (void *)&elemSize ) );
 
         kernelName += "_scalar";
     }
@@ -1377,9 +1402,6 @@ static void bitwise_binary_run(const oclMat &src1, const oclMat &src2, const Sca
         args.push_back( make_pair( sizeof(cl_int), (void *)&mask.step ));
         args.push_back( make_pair( sizeof(cl_int), (void *)&mask.offset ));
 
-        if (!src2.empty())
-            args.push_back( make_pair( sizeof(cl_int), (void *)&elemSize ));
-
         kernelName += "_mask";
     }
 
@@ -1387,7 +1409,7 @@ static void bitwise_binary_run(const oclMat &src1, const oclMat &src2, const Sca
     args.push_back( make_pair( sizeof(cl_int), (void *)&dst.step ));
     args.push_back( make_pair( sizeof(cl_int), (void *)&dst.offset ));
 
-    args.push_back( make_pair( sizeof(cl_int), (void *)&cols1 ));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&src1.cols ));
     args.push_back( make_pair( sizeof(cl_int), (void *)&src1.rows ));
 
     openCLExecuteKernel(src1.clCxt, mask.empty() ? (!src2.empty() ? &arithm_bitwise_binary : &arithm_bitwise_binary_scalar) :
@@ -1400,12 +1422,12 @@ void cv::ocl::bitwise_not(const oclMat &src, oclMat &dst)
 {
     if (!src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.depth() == CV_64F)
     {
-        CV_Error(CV_OpenCLDoubleNotSupported, "selected device doesn't support double");
+        CV_Error(CV_OpenCLDoubleNotSupported, "Selected device doesn't support double");
         return;
     }
 
     dst.create(src.size(), src.type());
-    bitwise_unary_run(src, dst,  "arithm_bitwise_not", &arithm_bitwise_not);
+    bitwise_unary_run(src, dst, "arithm_bitwise_not", &arithm_bitwise_not);
 }
 
 void cv::ocl::bitwise_or(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask)
diff --git a/modules/ocl/src/opencl/arithm_add.cl b/modules/ocl/src/opencl/arithm_add.cl
index cd9fae6..2f34bbb 100644
--- a/modules/ocl/src/opencl/arithm_add.cl
+++ b/modules/ocl/src/opencl/arithm_add.cl
@@ -62,7 +62,7 @@
 
 #if defined (FUNC_MUL)
 #if defined (HAVE_SCALAR)
-#define EXPRESSION dst[dst_index] = convertToT(convertToWT(src1[src1_index]) * scalar[0] * convertToWT(src2[src2_index]));
+#define EXPRESSION dst[dst_index] = convertToT(convertToWT(src1[src1_index]) * scalar * convertToWT(src2[src2_index]));
 #else
 #define EXPRESSION dst[dst_index] = convertToT(convertToWT(src1[src1_index]) * convertToWT(src2[src2_index]));
 #endif
@@ -72,7 +72,7 @@
 #if defined (HAVE_SCALAR)
 #define EXPRESSION T zero = (T)(0); \
     dst[dst_index] = src2[src2_index] == zero ? zero : \
-    convertToT(convertToWT(src1[src1_index]) * scalar[0] / convertToWT(src2[src2_index]));
+    convertToT(convertToWT(src1[src1_index]) * scalar / convertToWT(src2[src2_index]));
 #else
 #define EXPRESSION T zero = (T)(0); \
     dst[dst_index] = src2[src2_index] == zero ? zero : \
@@ -123,7 +123,7 @@ __kernel void arithm_binary_op_mat(__global T *src1, int src1_step, int src1_off
 // add mat with scale
 __kernel void arithm_binary_op_mat_scalar(__global T *src1, int src1_step, int src1_offset,
                                           __global T *src2, int src2_step, int src2_offset,
-                                          __global WT *scalar,
+                                          WT scalar,
                                           __global T *dst, int dst_step,  int dst_offset,
                                           int cols, int rows)
 {
diff --git a/modules/ocl/src/opencl/arithm_add_scalar.cl b/modules/ocl/src/opencl/arithm_add_scalar.cl
index 671bd12..7f4e413 100644
--- a/modules/ocl/src/opencl/arithm_add_scalar.cl
+++ b/modules/ocl/src/opencl/arithm_add_scalar.cl
@@ -52,20 +52,20 @@
 #endif
 
 #if defined (FUNC_ADD)
-#define EXPRESSION dst[dst_index] = convertToT(convertToWT(src1[src1_index]) + scalar[0]);
+#define EXPRESSION dst[dst_index] = convertToT(convertToWT(src1[src1_index]) + scalar);
 #endif
 
 #if defined (FUNC_SUB)
-#define EXPRESSION dst[dst_index] = convertToT(convertToWT(src1[src1_index]) - scalar[0]);
+#define EXPRESSION dst[dst_index] = convertToT(convertToWT(src1[src1_index]) - scalar);
 #endif
 
 #if defined (FUNC_MUL)
-#define EXPRESSION dst[dst_index] = convertToT(convertToWT(src1[src1_index]) * scalar[0]);
+#define EXPRESSION dst[dst_index] = convertToT(convertToWT(src1[src1_index]) * scalar);
 #endif
 
 #if defined (FUNC_DIV)
 #define EXPRESSION T zero = (T)(0); \
-    dst[dst_index] = src1[src1_index] == zero ? zero : convertToT(scalar[0] / convertToWT(src1[src1_index]));
+    dst[dst_index] = src1[src1_index] == zero ? zero : convertToT(scalar / convertToWT(src1[src1_index]));
 #endif
 
 #if defined (FUNC_ABS)
@@ -75,7 +75,7 @@
 #endif
 
 #if defined (FUNC_ABS_DIFF)
-#define EXPRESSION WT value = convertToWT(src1[src1_index]) - scalar[0]; \
+#define EXPRESSION WT value = convertToWT(src1[src1_index]) - scalar; \
     value = value > (WT)(0) ? value : -value; \
     dst[dst_index] = convertToT(value);
 #endif
@@ -85,7 +85,7 @@
 ///////////////////////////////////////////////////////////////////////////////////
 
 __kernel void arithm_binary_op_scalar (__global T *src1, int src1_step, int src1_offset,
-                                 __global WT *scalar,
+                                 WT scalar,
                                  __global T *dst,  int dst_step,  int dst_offset,
                                  int cols, int rows)
 {
diff --git a/modules/ocl/src/opencl/arithm_add_scalar_mask.cl b/modules/ocl/src/opencl/arithm_add_scalar_mask.cl
index d472b3c..b93de0c 100644
--- a/modules/ocl/src/opencl/arithm_add_scalar_mask.cl
+++ b/modules/ocl/src/opencl/arithm_add_scalar_mask.cl
@@ -52,15 +52,15 @@
 #endif
 
 #if defined (FUNC_ADD)
-#define EXPRESSION dst[dst_index] = convertToT(convertToWT(src1[src1_index]) + scalar[0]);
+#define EXPRESSION dst[dst_index] = convertToT(convertToWT(src1[src1_index]) + scalar);
 #endif
 
 #if defined (FUNC_SUB)
-#define EXPRESSION dst[dst_index] = convertToT(convertToWT(src1[src1_index]) - scalar[0]);
+#define EXPRESSION dst[dst_index] = convertToT(convertToWT(src1[src1_index]) - scalar);
 #endif
 
 #if defined (FUNC_MUL)
-#define EXPRESSION dst[dst_index] = convertToT(convertToWT(src1[src1_index]) * scalar[0]);
+#define EXPRESSION dst[dst_index] = convertToT(convertToWT(src1[src1_index]) * scalar);
 #endif
 
 #if defined (FUNC_DIV)
@@ -74,7 +74,7 @@
 ///////////////////////////////////////////////////////////////////////////////////
 
 __kernel void arithm_binary_op_scalar_mask(__global T *src1, int src1_step, int src1_offset,
-                                     __global WT *scalar,
+                                     WT scalar,
                                      __global uchar *mask, int mask_step, int mask_offset,
                                      __global T *dst,  int dst_step,  int dst_offset,
                                      int cols, int rows)
diff --git a/modules/ocl/src/opencl/arithm_bitwise_binary.cl b/modules/ocl/src/opencl/arithm_bitwise_binary.cl
index 898b40a..a4fa205 100644
--- a/modules/ocl/src/opencl/arithm_bitwise_binary.cl
+++ b/modules/ocl/src/opencl/arithm_bitwise_binary.cl
@@ -51,17 +51,32 @@
 __kernel void arithm_bitwise_binary(__global uchar * src1, int src1_step, int src1_offset,
                                     __global uchar * src2, int src2_step, int src2_offset,
                                     __global uchar * dst, int dst_step, int dst_offset,
-                                    int cols1, int rows)
+                                    int cols, int rows)
 {
     int x = get_global_id(0);
     int y = get_global_id(1);
 
-    if (x < cols1 && y < rows)
+    if (x < cols && y < rows)
     {
+#if elemSize > 1
+        x *= elemSize;
+#endif
         int src1_index = mad24(y, src1_step, x + src1_offset);
         int src2_index = mad24(y, src2_step, x + src2_offset);
-        int dst_index = mad24(y, dst_step, dst_offset + x);
+        int dst_index = mad24(y, dst_step, x + dst_offset);
 
+#if elemSize > 1
+        #pragma unroll
+        for (int i = 0; i < elemSize; i += vlen)
+        {
+            ucharv t0 = vloadn(0, src1 + src1_index + i);
+            ucharv t1 = vloadn(0, src2 + src2_index + i);
+            ucharv t2 = t0 Operation t1;
+
+            vstoren(t2, 0, dst + dst_index + i);
+        }
+#else
         dst[dst_index] = src1[src1_index] Operation src2[src2_index];
+#endif
     }
 }
diff --git a/modules/ocl/src/opencl/arithm_bitwise_binary_mask.cl b/modules/ocl/src/opencl/arithm_bitwise_binary_mask.cl
index 622ab5b..d244e57 100644
--- a/modules/ocl/src/opencl/arithm_bitwise_binary_mask.cl
+++ b/modules/ocl/src/opencl/arithm_bitwise_binary_mask.cl
@@ -50,7 +50,7 @@
 
 __kernel void arithm_bitwise_binary_mask(__global uchar * src1, int src1_step, int src1_offset,
                                     __global uchar * src2, int src2_step, int src2_offset,
-                                    __global uchar * mask, int mask_step, int mask_offset, int elemSize,
+                                    __global uchar * mask, int mask_step, int mask_offset,
                                     __global uchar * dst, int dst_step, int dst_offset,
                                     int cols1, int rows)
 {
@@ -59,15 +59,30 @@ __kernel void arithm_bitwise_binary_mask(__global uchar * src1, int src1_step, i
 
     if (x < cols1 && y < rows)
     {
-        int mask_index = mad24(y, mask_step, mask_offset + (x / elemSize));
+        int mask_index = mad24(y, mask_step, mask_offset + x);
 
         if (mask[mask_index])
         {
+#if elemSize > 1
+                x *= elemSize;
+#endif
             int src1_index = mad24(y, src1_step, x + src1_offset);
             int src2_index = mad24(y, src2_step, x + src2_offset);
             int dst_index = mad24(y, dst_step, x + dst_offset);
 
+#if elemSize > 1
+            #pragma unroll
+            for (int i = 0; i < elemSize; i += vlen)
+            {
+                ucharv t0 = vloadn(0, src1 + src1_index + i);
+                ucharv t1 = vloadn(0, src2 + src2_index + i);
+                ucharv t2 = t0 Operation t1;
+
+                vstoren(t2, 0, dst + dst_index + i);
+            }
+#else
             dst[dst_index] = src1[src1_index] Operation src2[src2_index];
+#endif
         }
     }
 }
diff --git a/modules/ocl/src/opencl/arithm_bitwise_binary_scalar.cl b/modules/ocl/src/opencl/arithm_bitwise_binary_scalar.cl
index c17b412..5a7d593 100644
--- a/modules/ocl/src/opencl/arithm_bitwise_binary_scalar.cl
+++ b/modules/ocl/src/opencl/arithm_bitwise_binary_scalar.cl
@@ -50,19 +50,33 @@
 
 __kernel void arithm_bitwise_binary_scalar(
         __global uchar *src1, int src1_step, int src1_offset,
-        __global uchar *src2, int elemSize,
+        __global uchar *src2,
         __global uchar *dst, int dst_step, int dst_offset,
-        int cols1, int rows)
+        int cols, int rows)
 {
     int x = get_global_id(0);
     int y = get_global_id(1);
 
-    if (x < cols1 && y < rows)
+    if (x < cols && y < rows)
     {
+#if elemSize > 1
+        x *= elemSize;
+#endif
         int src1_index = mad24(y, src1_step, src1_offset + x);
-        int src2_index = x % elemSize;
         int dst_index  = mad24(y, dst_step, dst_offset + x);
 
-        dst[dst_index] = src1[src1_index] Operation src2[src2_index];
+#if elemSize > 1
+        #pragma unroll
+        for (int i = 0; i < elemSize; i += vlen)
+        {
+            ucharv t0 = vloadn(0, src1 + src1_index + i);
+            ucharv t1 = vloadn(0, src2 + i);
+            ucharv t2 = t0 Operation t1;
+
+            vstoren(t2, 0, dst + dst_index + i);
+        }
+#else
+        dst[dst_index] = src1[src1_index] Operation src2[0];
+#endif
     }
 }
diff --git a/modules/ocl/src/opencl/arithm_bitwise_binary_scalar_mask.cl b/modules/ocl/src/opencl/arithm_bitwise_binary_scalar_mask.cl
index bae1699..a1876b5 100644
--- a/modules/ocl/src/opencl/arithm_bitwise_binary_scalar_mask.cl
+++ b/modules/ocl/src/opencl/arithm_bitwise_binary_scalar_mask.cl
@@ -56,7 +56,7 @@
 //////////////////////////////////////////////////////////////////////////////////////////////////////
 
 __kernel void arithm_bitwise_binary_scalar_mask(__global uchar *src1, int src1_step, int src1_offset,
-        __global uchar *src2, int elemSize,
+        __global uchar *src2,
         __global uchar *mask, int mask_step, int mask_offset,
         __global uchar *dst,  int dst_step,  int dst_offset,
         int cols, int rows)
@@ -66,14 +66,29 @@ __kernel void arithm_bitwise_binary_scalar_mask(__global uchar *src1, int src1_s
 
     if (x < cols && y < rows)
     {
-        int mask_index = mad24(y, mask_step, (x / elemSize) + mask_offset);
+        int mask_index = mad24(y, mask_step, x + mask_offset);
+
         if (mask[mask_index])
         {
+#if elemSize > 1
+            x *= elemSize;
+#endif
             int src1_index = mad24(y, src1_step, x + src1_offset);
-            int src2_index = x % elemSize;
             int dst_index = mad24(y, dst_step, x + dst_offset);
 
-            dst[dst_index] = src1[src1_index] Operation src2[src2_index];
+#if elemSize > 1
+            #pragma unroll
+            for (int i = 0; i < elemSize; i += vlen)
+            {
+                ucharv t0 = vloadn(0, src1 + src1_index + i);
+                ucharv t1 = vloadn(0, src2 + i);
+                ucharv t2 = t0 Operation t1;
+
+                vstoren(t2, 0, dst + dst_index + i);
+            }
+#else
+            dst[dst_index] = src1[src1_index] Operation src2[0];
+#endif
         }
     }
 }
-- 
2.7.4