From f3ee1c3d2fb11c10acabfd2b4993fca6dbe50f71 Mon Sep 17 00:00:00 2001
From: krodyush <konstantin.rodyushkin@intel.com>
Date: Tue, 17 Dec 2013 14:06:14 +0400
Subject: [PATCH] Changes the datatype of the angle of the gradient for Intel
 platforms.

---
 modules/ocl/src/hog.cpp                 | 24 ++++++++++++++++++++----
 modules/ocl/src/opencl/objdetect_hog.cl | 18 +++++++++++++-----
 2 files changed, 33 insertions(+), 9 deletions(-)

diff --git a/modules/ocl/src/hog.cpp b/modules/ocl/src/hog.cpp
index 68f3949..1f8afe5 100644
--- a/modules/ocl/src/hog.cpp
+++ b/modules/ocl/src/hog.cpp
@@ -76,6 +76,11 @@ namespace cv
                 int cdescr_width;
                 int cdescr_height;
 
+                // A shift value and type that allows qangle to be different
+                // sizes on different hardware
+                int qangle_step_shift;
+                int qangle_type;
+
                 void set_up_constants(int nbins, int block_stride_x, int block_stride_y,
                                       int nblocks_win_x, int nblocks_win_y);
 
@@ -153,6 +158,7 @@ cv::ocl::HOGDescriptor::HOGDescriptor(Size win_size_, Size block_size_, Size blo
         hog_device_cpu = true;
     else
         hog_device_cpu = false;
+
 }
 
 size_t cv::ocl::HOGDescriptor::getDescriptorSize() const
@@ -213,7 +219,7 @@ void cv::ocl::HOGDescriptor::init_buffer(const oclMat &img, Size win_stride)
         effect_size = img.size();
 
     grad.create(img.size(), CV_32FC2);
-    qangle.create(img.size(), CV_8UC2);
+    qangle.create(img.size(), hog::qangle_type);
 
     const size_t block_hist_size = getBlockHistogramSize();
     const Size blocks_per_img = numPartsWithin(img.size(), block_size, block_stride);
@@ -1607,6 +1613,16 @@ void cv::ocl::device::hog::set_up_constants(int nbins,
 
     int descr_size = descr_width * nblocks_win_y;
     cdescr_size = descr_size;
+
+    qangle_type = CV_8UC2;
+    qangle_step_shift = 0;
+    // Some Intel devices have low single-byte access performance,
+    // so we change the datatype here.
+    if (Context::getContext()->supportsFeature(FEATURE_CL_INTEL_DEVICE))
+    {
+        qangle_type = CV_32SC2;
+        qangle_step_shift = 2;
+    }
 }
 
 void cv::ocl::device::hog::compute_hists(int nbins,
@@ -1628,7 +1644,7 @@ void cv::ocl::device::hog::compute_hists(int nbins,
     int blocks_total = img_block_width * img_block_height;
 
     int grad_quadstep = grad.step >> 2;
-    int qangle_step = qangle.step;
+    int qangle_step = qangle.step >> qangle_step_shift;
 
     int blocks_in_group = 4;
     size_t localThreads[3] = { blocks_in_group * 24, 2, 1 };
@@ -1892,7 +1908,7 @@ void cv::ocl::device::hog::compute_gradients_8UC1(int height, int width,
     char correctGamma = (correct_gamma) ? 1 : 0;
     int img_step = img.step;
     int grad_quadstep = grad.step >> 3;
-    int qangle_step = qangle.step >> 1;
+    int qangle_step = qangle.step >> (1 + qangle_step_shift);
 
     args.push_back( make_pair( sizeof(cl_int), (void *)&height));
     args.push_back( make_pair( sizeof(cl_int), (void *)&width));
@@ -1927,7 +1943,7 @@ void cv::ocl::device::hog::compute_gradients_8UC4(int height, int width,
     char correctGamma = (correct_gamma) ? 1 : 0;
     int img_step = img.step >> 2;
     int grad_quadstep = grad.step >> 3;
-    int qangle_step = qangle.step >> 1;
+    int qangle_step = qangle.step >> (1 + qangle_step_shift);
 
     args.push_back( make_pair( sizeof(cl_int), (void *)&height));
     args.push_back( make_pair( sizeof(cl_int), (void *)&width));
diff --git a/modules/ocl/src/opencl/objdetect_hog.cl b/modules/ocl/src/opencl/objdetect_hog.cl
index 0d2f26f..60d7346 100644
--- a/modules/ocl/src/opencl/objdetect_hog.cl
+++ b/modules/ocl/src/opencl/objdetect_hog.cl
@@ -50,6 +50,14 @@
 #define NTHREADS 256
 #define CV_PI_F 3.1415926535897932384626433832795f
 
+#ifdef INTEL_DEVICE
+#define QANGLE_TYPE		int
+#define QANGLE_TYPE2	int2
+#else
+#define QANGLE_TYPE		uchar
+#define QANGLE_TYPE2	uchar2
+#endif
+
 //----------------------------------------------------------------------------
 // Histogram computation
 // 12 threads for a cell, 12x4 threads per block
@@ -59,7 +67,7 @@ __kernel void compute_hists_lut_kernel(
     const int cnbins, const int cblock_hist_size, const int img_block_width,
     const int blocks_in_group, const int blocks_total,
     const int grad_quadstep, const int qangle_step,
-    __global const float* grad, __global const uchar* qangle,
+    __global const float* grad, __global const QANGLE_TYPE* qangle,
     __global const float* gauss_w_lut,
     __global float* block_hists, __local float* smem)
 {
@@ -86,7 +94,7 @@ __kernel void compute_hists_lut_kernel(
 
     __global const float* grad_ptr = (gid < blocks_total) ?
         grad + offset_y * grad_quadstep + (offset_x << 1) : grad;
-    __global const uchar* qangle_ptr = (gid < blocks_total) ?
+    __global const QANGLE_TYPE* qangle_ptr = (gid < blocks_total) ?
         qangle + offset_y * qangle_step + (offset_x << 1) : qangle;
 
     __local float* hist = hists + 12 * (cell_y * CELLS_PER_BLOCK_Y + cell_x) +
@@ -101,7 +109,7 @@ __kernel void compute_hists_lut_kernel(
     for (int dist_y = dist_y_begin; dist_y < dist_y_begin + 12; ++dist_y)
     {
         float2 vote = (float2) (grad_ptr[0], grad_ptr[1]);
-        uchar2 bin = (uchar2) (qangle_ptr[0], qangle_ptr[1]);
+        QANGLE_TYPE2 bin = (QANGLE_TYPE2) (qangle_ptr[0], qangle_ptr[1]);
 
         grad_ptr += grad_quadstep;
         qangle_ptr += qangle_step;
@@ -558,7 +566,7 @@ __kernel void extract_descrs_by_cols_kernel(
 __kernel void compute_gradients_8UC4_kernel(
     const int height, const int width,
     const int img_step, const int grad_quadstep, const int qangle_step,
-    const __global uchar4 * img, __global float * grad, __global uchar * qangle,
+    const __global uchar4 * img, __global float * grad, __global QANGLE_TYPE * qangle,
     const float angle_scale, const char correct_gamma, const int cnbins)
 {
     const int x = get_global_id(0);
@@ -660,7 +668,7 @@ __kernel void compute_gradients_8UC4_kernel(
 __kernel void compute_gradients_8UC1_kernel(
     const int height, const int width,
     const int img_step, const int grad_quadstep, const int qangle_step,
-    __global const uchar * img, __global float * grad, __global uchar * qangle,
+    __global const uchar * img, __global float * grad, __global QANGLE_TYPE * qangle,
     const float angle_scale, const char correct_gamma, const int cnbins)
 {
     const int x = get_global_id(0);
-- 
2.7.4