From 26c246140a31556fd116bb53044575a0f9b02b84 Mon Sep 17 00:00:00 2001
From: yao <bitwangyaoyao@gmail.com>
Date: Wed, 19 Jun 2013 11:20:45 +0800
Subject: [PATCH] optimize hog

---
 modules/ocl/src/hog.cpp                 | 512 ++++++++++++++++++++-----------
 modules/ocl/src/opencl/objdetect_hog.cl | 520 ++++++++++++++++++++++----------
 2 files changed, 709 insertions(+), 323 deletions(-)

diff --git a/modules/ocl/src/hog.cpp b/modules/ocl/src/hog.cpp
index a351458..3533cce 100644
--- a/modules/ocl/src/hog.cpp
+++ b/modules/ocl/src/hog.cpp
@@ -15,7 +15,7 @@
 // Third party copyrights are property of their respective owners.
 //
 // @Authors
-//		Wenju He, wenju@multicorewareinc.com
+//     Wenju He, wenju@multicorewareinc.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
@@ -48,13 +48,107 @@ using namespace cv;
 using namespace cv::ocl;
 using namespace std;
 
-
 #define CELL_WIDTH 8
 #define CELL_HEIGHT 8
 #define CELLS_PER_BLOCK_X 2
 #define CELLS_PER_BLOCK_Y 2
 #define NTHREADS 256
 
+static oclMat gauss_w_lut;
+static bool hog_device_cpu;
+/* pre-compute gaussian and interp_weight lookup tables if sigma is 4.0f */
+static const float gaussian_interp_lut[] = 
+{
+    /* gaussian lut */
+    0.01831564f, 0.02926831f, 0.04393693f, 0.06196101f, 0.08208500f, 0.10215643f, 
+    0.11943297f, 0.13117145f, 0.13533528f, 0.13117145f, 0.11943297f, 0.10215643f, 
+    0.08208500f, 0.06196101f, 0.04393693f, 0.02926831f, 0.02926831f, 0.04677062f, 
+    0.07021102f, 0.09901341f, 0.13117145f, 0.16324551f, 0.19085334f, 0.20961139f, 
+    0.21626517f, 0.20961139f, 0.19085334f, 0.16324551f, 0.13117145f, 0.09901341f, 
+    0.07021102f, 0.04677062f, 0.04393693f, 0.07021102f, 0.10539922f, 0.14863673f, 
+    0.19691168f, 0.24506053f, 0.28650481f, 0.31466395f, 0.32465246f, 0.31466395f, 
+    0.28650481f, 0.24506053f, 0.19691168f, 0.14863673f, 0.10539922f, 0.07021102f, 
+    0.06196101f, 0.09901341f, 0.14863673f, 0.20961139f, 0.27768996f, 0.34559074f, 
+    0.40403652f, 0.44374731f, 0.45783335f, 0.44374731f, 0.40403652f, 0.34559074f, 
+    0.27768996f, 0.20961139f, 0.14863673f, 0.09901341f, 0.08208500f, 0.13117145f, 
+    0.19691168f, 0.27768996f, 0.36787945f, 0.45783335f, 0.53526145f, 0.58786964f, 
+    0.60653067f, 0.58786964f, 0.53526145f, 0.45783335f, 0.36787945f, 0.27768996f, 
+    0.19691168f, 0.13117145f, 0.10215643f, 0.16324551f, 0.24506053f, 0.34559074f, 
+    0.45783335f, 0.56978285f, 0.66614360f, 0.73161560f, 0.75483960f, 0.73161560f, 
+    0.66614360f, 0.56978285f, 0.45783335f, 0.34559074f, 0.24506053f, 0.16324551f, 
+    0.11943297f, 0.19085334f, 0.28650481f, 0.40403652f, 0.53526145f, 0.66614360f, 
+    0.77880079f, 0.85534531f, 0.88249689f, 0.85534531f, 0.77880079f, 0.66614360f, 
+    0.53526145f, 0.40403652f, 0.28650481f, 0.19085334f, 0.13117145f, 0.20961139f, 
+    0.31466395f, 0.44374731f, 0.58786964f, 0.73161560f, 0.85534531f, 0.93941307f, 
+    0.96923321f, 0.93941307f, 0.85534531f, 0.73161560f, 0.58786964f, 0.44374731f, 
+    0.31466395f, 0.20961139f, 0.13533528f, 0.21626517f, 0.32465246f, 0.45783335f, 
+    0.60653067f, 0.75483960f, 0.88249689f, 0.96923321f, 1.00000000f, 0.96923321f, 
+    0.88249689f, 0.75483960f, 0.60653067f, 0.45783335f, 0.32465246f, 0.21626517f, 
+    0.13117145f, 0.20961139f, 0.31466395f, 0.44374731f, 0.58786964f, 0.73161560f, 
+    0.85534531f, 0.93941307f, 0.96923321f, 0.93941307f, 0.85534531f, 0.73161560f, 
+    0.58786964f, 0.44374731f, 0.31466395f, 0.20961139f, 0.11943297f, 0.19085334f, 
+    0.28650481f, 0.40403652f, 0.53526145f, 0.66614360f, 0.77880079f, 0.85534531f, 
+    0.88249689f, 0.85534531f, 0.77880079f, 0.66614360f, 0.53526145f, 0.40403652f, 
+    0.28650481f, 0.19085334f, 0.10215643f, 0.16324551f, 0.24506053f, 0.34559074f, 
+    0.45783335f, 0.56978285f, 0.66614360f, 0.73161560f, 0.75483960f, 0.73161560f, 
+    0.66614360f, 0.56978285f, 0.45783335f, 0.34559074f, 0.24506053f, 0.16324551f, 
+    0.08208500f, 0.13117145f, 0.19691168f, 0.27768996f, 0.36787945f, 0.45783335f, 
+    0.53526145f, 0.58786964f, 0.60653067f, 0.58786964f, 0.53526145f, 0.45783335f, 
+    0.36787945f, 0.27768996f, 0.19691168f, 0.13117145f, 0.06196101f, 0.09901341f, 
+    0.14863673f, 0.20961139f, 0.27768996f, 0.34559074f, 0.40403652f, 0.44374731f, 
+    0.45783335f, 0.44374731f, 0.40403652f, 0.34559074f, 0.27768996f, 0.20961139f, 
+    0.14863673f, 0.09901341f, 0.04393693f, 0.07021102f, 0.10539922f, 0.14863673f, 
+    0.19691168f, 0.24506053f, 0.28650481f, 0.31466395f, 0.32465246f, 0.31466395f, 
+    0.28650481f, 0.24506053f, 0.19691168f, 0.14863673f, 0.10539922f, 0.07021102f, 
+    0.02926831f, 0.04677062f, 0.07021102f, 0.09901341f, 0.13117145f, 0.16324551f, 
+    0.19085334f, 0.20961139f, 0.21626517f, 0.20961139f, 0.19085334f, 0.16324551f, 
+    0.13117145f, 0.09901341f, 0.07021102f, 0.04677062f, 
+    /* interp_weight lut */
+    0.00390625f, 0.01171875f, 0.01953125f, 0.02734375f, 0.03515625f, 0.04296875f, 
+    0.05078125f, 0.05859375f, 0.05859375f, 0.05078125f, 0.04296875f, 0.03515625f, 
+    0.02734375f, 0.01953125f, 0.01171875f, 0.00390625f, 0.01171875f, 0.03515625f, 
+    0.05859375f, 0.08203125f, 0.10546875f, 0.12890625f, 0.15234375f, 0.17578125f, 
+    0.17578125f, 0.15234375f, 0.12890625f, 0.10546875f, 0.08203125f, 0.05859375f, 
+    0.03515625f, 0.01171875f, 0.01953125f, 0.05859375f, 0.09765625f, 0.13671875f, 
+    0.17578125f, 0.21484375f, 0.25390625f, 0.29296875f, 0.29296875f, 0.25390625f, 
+    0.21484375f, 0.17578125f, 0.13671875f, 0.09765625f, 0.05859375f, 0.01953125f, 
+    0.02734375f, 0.08203125f, 0.13671875f, 0.19140625f, 0.24609375f, 0.30078125f, 
+    0.35546875f, 0.41015625f, 0.41015625f, 0.35546875f, 0.30078125f, 0.24609375f, 
+    0.19140625f, 0.13671875f, 0.08203125f, 0.02734375f, 0.03515625f, 0.10546875f, 
+    0.17578125f, 0.24609375f, 0.31640625f, 0.38671875f, 0.45703125f, 0.52734375f, 
+    0.52734375f, 0.45703125f, 0.38671875f, 0.31640625f, 0.24609375f, 0.17578125f, 
+    0.10546875f, 0.03515625f, 0.04296875f, 0.12890625f, 0.21484375f, 0.30078125f, 
+    0.38671875f, 0.47265625f, 0.55859375f, 0.64453125f, 0.64453125f, 0.55859375f, 
+    0.47265625f, 0.38671875f, 0.30078125f, 0.21484375f, 0.12890625f, 0.04296875f, 
+    0.05078125f, 0.15234375f, 0.25390625f, 0.35546875f, 0.45703125f, 0.55859375f, 
+    0.66015625f, 0.76171875f, 0.76171875f, 0.66015625f, 0.55859375f, 0.45703125f, 
+    0.35546875f, 0.25390625f, 0.15234375f, 0.05078125f, 0.05859375f, 0.17578125f, 
+    0.29296875f, 0.41015625f, 0.52734375f, 0.64453125f, 0.76171875f, 0.87890625f, 
+    0.87890625f, 0.76171875f, 0.64453125f, 0.52734375f, 0.41015625f, 0.29296875f, 
+    0.17578125f, 0.05859375f, 0.05859375f, 0.17578125f, 0.29296875f, 0.41015625f, 
+    0.52734375f, 0.64453125f, 0.76171875f, 0.87890625f, 0.87890625f, 0.76171875f, 
+    0.64453125f, 0.52734375f, 0.41015625f, 0.29296875f, 0.17578125f, 0.05859375f, 
+    0.05078125f, 0.15234375f, 0.25390625f, 0.35546875f, 0.45703125f, 0.55859375f, 
+    0.66015625f, 0.76171875f, 0.76171875f, 0.66015625f, 0.55859375f, 0.45703125f, 
+    0.35546875f, 0.25390625f, 0.15234375f, 0.05078125f, 0.04296875f, 0.12890625f, 
+    0.21484375f, 0.30078125f, 0.38671875f, 0.47265625f, 0.55859375f, 0.64453125f, 
+    0.64453125f, 0.55859375f, 0.47265625f, 0.38671875f, 0.30078125f, 0.21484375f, 
+    0.12890625f, 0.04296875f, 0.03515625f, 0.10546875f, 0.17578125f, 0.24609375f, 
+    0.31640625f, 0.38671875f, 0.45703125f, 0.52734375f, 0.52734375f, 0.45703125f, 
+    0.38671875f, 0.31640625f, 0.24609375f, 0.17578125f, 0.10546875f, 0.03515625f, 
+    0.02734375f, 0.08203125f, 0.13671875f, 0.19140625f, 0.24609375f, 0.30078125f, 
+    0.35546875f, 0.41015625f, 0.41015625f, 0.35546875f, 0.30078125f, 0.24609375f, 
+    0.19140625f, 0.13671875f, 0.08203125f, 0.02734375f, 0.01953125f, 0.05859375f, 
+    0.09765625f, 0.13671875f, 0.17578125f, 0.21484375f, 0.25390625f, 0.29296875f, 
+    0.29296875f, 0.25390625f, 0.21484375f, 0.17578125f, 0.13671875f, 0.09765625f, 
+    0.05859375f, 0.01953125f, 0.01171875f, 0.03515625f, 0.05859375f, 0.08203125f, 
+    0.10546875f, 0.12890625f, 0.15234375f, 0.17578125f, 0.17578125f, 0.15234375f, 
+    0.12890625f, 0.10546875f, 0.08203125f, 0.05859375f, 0.03515625f, 0.01171875f, 
+    0.00390625f, 0.01171875f, 0.01953125f, 0.02734375f, 0.03515625f, 0.04296875f, 
+    0.05078125f, 0.05859375f, 0.05859375f, 0.05078125f, 0.04296875f, 0.03515625f, 
+    0.02734375f, 0.01953125f, 0.01171875f, 0.00390625f
+};
+
 namespace cv
 {
     namespace ocl
@@ -78,38 +172,43 @@ namespace cv
                 int cnblocks_win_x;
                 int cnblocks_win_y;
                 int cblock_hist_size;
-                int cblock_hist_size_2up;
                 int cdescr_size;
                 int cdescr_width;
+                int cdescr_height;
 
                 void set_up_constants(int nbins, int block_stride_x, int block_stride_y,
                                       int nblocks_win_x, int nblocks_win_y);
 
                 void compute_hists(int nbins, int block_stride_x, int blovck_stride_y,
-                                   int height, int width, const cv::ocl::oclMat &grad,
-                                   const cv::ocl::oclMat &qangle, float sigma, cv::ocl::oclMat &block_hists);
+                                   int height, int width, float sigma, const cv::ocl::oclMat &grad,
+                                   const cv::ocl::oclMat &qangle, 
+                                   const cv::ocl::oclMat &gauss_w_lut, cv::ocl::oclMat &block_hists);
 
                 void normalize_hists(int nbins, int block_stride_x, int block_stride_y,
-                                     int height, int width, cv::ocl::oclMat &block_hists, float threshold);
+                                     int height, int width, cv::ocl::oclMat &block_hists, 
+                                     float threshold);
 
                 void classify_hists(int win_height, int win_width, int block_stride_y,
-                                    int block_stride_x, int win_stride_y, int win_stride_x, int height,
-                                    int width, const cv::ocl::oclMat &block_hists, const cv::ocl::oclMat &coefs, float free_coef,
+                                    int block_stride_x, int win_stride_y, int win_stride_x, 
+                                    int height, int width, const cv::ocl::oclMat &block_hists, 
+                                    const cv::ocl::oclMat &coefs, float free_coef,
                                     float threshold, cv::ocl::oclMat &labels);
 
-                void extract_descrs_by_rows(int win_height, int win_width, int block_stride_y, int block_stride_x,
-                                            int win_stride_y, int win_stride_x, int height, int width, const cv::ocl::oclMat &block_hists,
+                void extract_descrs_by_rows(int win_height, int win_width, int block_stride_y, 
+                                            int block_stride_x, int win_stride_y, int win_stride_x, 
+                                            int height, int width, const cv::ocl::oclMat &block_hists,
                                             cv::ocl::oclMat &descriptors);
-                void extract_descrs_by_cols(int win_height, int win_width, int block_stride_y, int block_stride_x,
-                                            int win_stride_y, int win_stride_x, int height, int width, const cv::ocl::oclMat &block_hists,
+                void extract_descrs_by_cols(int win_height, int win_width, int block_stride_y, 
+                                            int block_stride_x, int win_stride_y, int win_stride_x, 
+                                            int height, int width, const cv::ocl::oclMat &block_hists,
                                             cv::ocl::oclMat &descriptors);
 
                 void compute_gradients_8UC1(int height, int width, const cv::ocl::oclMat &img,
-                                            float angle_scale, cv::ocl::oclMat &grad, cv::ocl::oclMat &qangle, bool correct_gamma);
+                                            float angle_scale, cv::ocl::oclMat &grad, 
+                                            cv::ocl::oclMat &qangle, bool correct_gamma);
                 void compute_gradients_8UC4(int height, int width, const cv::ocl::oclMat &img,
-                                            float angle_scale, cv::ocl::oclMat &grad, cv::ocl::oclMat &qangle, bool correct_gamma);
-
-                void resize( const oclMat &src, oclMat &dst, const Size sz);
+                                            float angle_scale, cv::ocl::oclMat &grad, 
+                                            cv::ocl::oclMat &qangle, bool correct_gamma);
             }
         }
     }
@@ -117,8 +216,14 @@ namespace cv
 
 using namespace ::cv::ocl::device;
 
-cv::ocl::HOGDescriptor::HOGDescriptor(Size win_size_, Size block_size_, Size block_stride_, Size cell_size_,
-                                      int nbins_, double win_sigma_, double threshold_L2hys_, bool gamma_correction_, int nlevels_)
+static inline int divUp(int total, int grain)
+{
+    return (total + grain - 1) / grain;
+}
+
+cv::ocl::HOGDescriptor::HOGDescriptor(Size win_size_, Size block_size_, Size block_stride_, 
+                                      Size cell_size_, int nbins_, double win_sigma_, 
+                                      double threshold_L2hys_, bool gamma_correction_, int nlevels_)
     : win_size(win_size_),
       block_size(block_size_),
       block_stride(block_stride_),
@@ -132,19 +237,27 @@ cv::ocl::HOGDescriptor::HOGDescriptor(Size win_size_, Size block_size_, Size blo
     CV_Assert((win_size.width  - block_size.width ) % block_stride.width  == 0 &&
               (win_size.height - block_size.height) % block_stride.height == 0);
 
-    CV_Assert(block_size.width % cell_size.width == 0 && block_size.height % cell_size.height == 0);
+    CV_Assert(block_size.width % cell_size.width == 0 && 
+        block_size.height % cell_size.height == 0);
 
     CV_Assert(block_stride == cell_size);
 
     CV_Assert(cell_size == Size(8, 8));
 
-    Size cells_per_block = Size(block_size.width / cell_size.width, block_size.height / cell_size.height);
+    Size cells_per_block(block_size.width / cell_size.width, 
+        block_size.height / cell_size.height);
     CV_Assert(cells_per_block == Size(2, 2));
 
     cv::Size blocks_per_win = numPartsWithin(win_size, block_size, block_stride);
-    hog::set_up_constants(nbins, block_stride.width, block_stride.height, blocks_per_win.width, blocks_per_win.height);
+    hog::set_up_constants(nbins, block_stride.width, block_stride.height, 
+        blocks_per_win.width, blocks_per_win.height);
 
     effect_size = Size(0, 0);
+
+	if (queryDeviceInfo<IS_CPU_DEVICE, bool>())
+        hog_device_cpu = true;
+    else
+        hog_device_cpu = false;
 }
 
 size_t cv::ocl::HOGDescriptor::getDescriptorSize() const
@@ -154,7 +267,8 @@ size_t cv::ocl::HOGDescriptor::getDescriptorSize() const
 
 size_t cv::ocl::HOGDescriptor::getBlockHistogramSize() const
 {
-    Size cells_per_block = Size(block_size.width / cell_size.width, block_size.height / cell_size.height);
+    Size cells_per_block = Size(block_size.width / cell_size.width, 
+        block_size.height / cell_size.height);
     return (size_t)(nbins * cells_per_block.area());
 }
 
@@ -167,7 +281,8 @@ bool cv::ocl::HOGDescriptor::checkDetectorSize() const
 {
     size_t detector_size = detector.rows * detector.cols;
     size_t descriptor_size = getDescriptorSize();
-    return detector_size == 0 || detector_size == descriptor_size || detector_size == descriptor_size + 1;
+    return detector_size == 0 || detector_size == descriptor_size || 
+        detector_size == descriptor_size + 1;
 }
 
 void cv::ocl::HOGDescriptor::setSVMDetector(const vector<float> &_detector)
@@ -207,10 +322,16 @@ void cv::ocl::HOGDescriptor::init_buffer(const oclMat &img, Size win_stride)
 
     const size_t block_hist_size = getBlockHistogramSize();
     const Size blocks_per_img = numPartsWithin(img.size(), block_size, block_stride);
-    block_hists.create(1, static_cast<int>(block_hist_size * blocks_per_img.area()), CV_32F);
+    block_hists.create(1, 
+        static_cast<int>(block_hist_size * blocks_per_img.area()) + 256, CV_32F);
 
     Size wins_per_img = numPartsWithin(img.size(), win_size, win_stride);
     labels.create(1, wins_per_img.area(), CV_8U);
+
+    vector<float> v_lut = vector<float>(gaussian_interp_lut, gaussian_interp_lut + 
+        sizeof(gaussian_interp_lut) / sizeof(gaussian_interp_lut[0]));
+    Mat m_lut(v_lut);
+    gauss_w_lut.upload(m_lut.reshape(1,1));
 }
 
 void cv::ocl::HOGDescriptor::computeGradient(const oclMat &img, oclMat &grad, oclMat &qangle)
@@ -221,29 +342,34 @@ void cv::ocl::HOGDescriptor::computeGradient(const oclMat &img, oclMat &grad, oc
     switch (img.type())
     {
     case CV_8UC1:
-        hog::compute_gradients_8UC1(effect_size.height, effect_size.width, img, angleScale, grad, qangle, gamma_correction);
+        hog::compute_gradients_8UC1(effect_size.height, effect_size.width, img, 
+            angleScale, grad, qangle, gamma_correction);
         break;
     case CV_8UC4:
-        hog::compute_gradients_8UC4(effect_size.height, effect_size.width, img, angleScale, grad, qangle, gamma_correction);
+        hog::compute_gradients_8UC4(effect_size.height, effect_size.width, img, 
+            angleScale, grad, qangle, gamma_correction);
         break;
     }
 }
 
+
 void cv::ocl::HOGDescriptor::computeBlockHistograms(const oclMat &img)
 {
-    computeGradient(img, grad, qangle);
+    computeGradient(img, this->grad, this->qangle);
 
-    hog::compute_hists(nbins, block_stride.width, block_stride.height, effect_size.height, effect_size.width,
-                       grad, qangle, (float)getWinSigma(), block_hists);
+    hog::compute_hists(nbins, block_stride.width, block_stride.height, effect_size.height, 
+        effect_size.width, (float)getWinSigma(), grad, qangle, gauss_w_lut, block_hists);
 
-    hog::normalize_hists(nbins, block_stride.width, block_stride.height, effect_size.height, effect_size.width,
-                         block_hists, (float)threshold_L2hys);
+    hog::normalize_hists(nbins, block_stride.width, block_stride.height, effect_size.height, 
+        effect_size.width, block_hists, (float)threshold_L2hys);
 }
 
 
-void cv::ocl::HOGDescriptor::getDescriptors(const oclMat &img, Size win_stride, oclMat &descriptors, int descr_format)
+void cv::ocl::HOGDescriptor::getDescriptors(const oclMat &img, Size win_stride, 
+                                            oclMat &descriptors, int descr_format)
 {
-    CV_Assert(win_stride.width % block_stride.width == 0 && win_stride.height % block_stride.height == 0);
+    CV_Assert(win_stride.width % block_stride.width == 0 && 
+        win_stride.height % block_stride.height == 0);
 
     init_buffer(img, win_stride);
 
@@ -253,17 +379,20 @@ void cv::ocl::HOGDescriptor::getDescriptors(const oclMat &img, Size win_stride,
     Size blocks_per_win = numPartsWithin(win_size, block_size, block_stride);
     Size wins_per_img   = numPartsWithin(effect_size, win_size, win_stride);
 
-    descriptors.create(wins_per_img.area(), static_cast<int>(blocks_per_win.area() * block_hist_size), CV_32F);
+    descriptors.create(wins_per_img.area(), 
+        static_cast<int>(blocks_per_win.area() * block_hist_size), CV_32F);
 
     switch (descr_format)
     {
     case DESCR_FORMAT_ROW_BY_ROW:
-        hog::extract_descrs_by_rows(win_size.height, win_size.width, block_stride.height, block_stride.width,
-                                    win_stride.height, win_stride.width, effect_size.height, effect_size.width, block_hists, descriptors);
+        hog::extract_descrs_by_rows(win_size.height, win_size.width, 
+            block_stride.height, block_stride.width, win_stride.height, win_stride.width, 
+            effect_size.height, effect_size.width, block_hists, descriptors);
         break;
     case DESCR_FORMAT_COL_BY_COL:
-        hog::extract_descrs_by_cols(win_size.height, win_size.width, block_stride.height, block_stride.width,
-                                    win_stride.height, win_stride.width, effect_size.height, effect_size.width, block_hists, descriptors);
+        hog::extract_descrs_by_cols(win_size.height, win_size.width, 
+            block_stride.height, block_stride.width, win_stride.height, win_stride.width, 
+            effect_size.height, effect_size.width, block_hists, descriptors);
         break;
     default:
         CV_Error(CV_StsBadArg, "Unknown descriptor format");
@@ -271,7 +400,8 @@ void cv::ocl::HOGDescriptor::getDescriptors(const oclMat &img, Size win_stride,
 }
 
 
-void cv::ocl::HOGDescriptor::detect(const oclMat &img, vector<Point> &hits, double hit_threshold, Size win_stride, Size padding)
+void cv::ocl::HOGDescriptor::detect(const oclMat &img, vector<Point> &hits, 
+                                    double hit_threshold, Size win_stride, Size padding)
 {
     CV_Assert(img.type() == CV_8UC1 || img.type() == CV_8UC4);
     CV_Assert(padding == Size(0, 0));
@@ -283,14 +413,16 @@ void cv::ocl::HOGDescriptor::detect(const oclMat &img, vector<Point> &hits, doub
     if (win_stride == Size())
         win_stride = block_stride;
     else
-        CV_Assert(win_stride.width % block_stride.width == 0 && win_stride.height % block_stride.height == 0);
+        CV_Assert(win_stride.width % block_stride.width == 0 && 
+            win_stride.height % block_stride.height == 0);
     init_buffer(img, win_stride);
 
     computeBlockHistograms(img);
 
-    hog::classify_hists(win_size.height, win_size.width, block_stride.height, block_stride.width,
-                        win_stride.height, win_stride.width, effect_size.height, effect_size.width, block_hists,
-                        detector, (float)free_coef, (float)hit_threshold, labels);
+    hog::classify_hists(win_size.height, win_size.width, block_stride.height, 
+        block_stride.width, win_stride.height, win_stride.width, 
+        effect_size.height, effect_size.width, block_hists, detector, 
+        (float)free_coef, (float)hit_threshold, labels);
 
     labels.download(labels_host);
     unsigned char *vec = labels_host.ptr();
@@ -306,8 +438,9 @@ void cv::ocl::HOGDescriptor::detect(const oclMat &img, vector<Point> &hits, doub
 
 
 
-void cv::ocl::HOGDescriptor::detectMultiScale(const oclMat &img, vector<Rect> &found_locations, double hit_threshold,
-        Size win_stride, Size padding, double scale0, int group_threshold)
+void cv::ocl::HOGDescriptor::detectMultiScale(const oclMat &img, vector<Rect> &found_locations, 
+                                              double hit_threshold, Size win_stride, Size padding, 
+                                              double scale0, int group_threshold)
 {
     CV_Assert(img.type() == CV_8UC1 || img.type() == CV_8UC4);
     CV_Assert(scale0 > 1);
@@ -333,7 +466,8 @@ void cv::ocl::HOGDescriptor::detectMultiScale(const oclMat &img, vector<Rect> &f
     if (win_stride == Size())
         win_stride = block_stride;
     else
-        CV_Assert(win_stride.width % block_stride.width == 0 && win_stride.height % block_stride.height == 0);
+        CV_Assert(win_stride.width % block_stride.width == 0 && 
+            win_stride.height % block_stride.height == 0);
     init_buffer(img, win_stride);
     image_scale.create(img.size(), img.type());
 
@@ -347,16 +481,18 @@ void cv::ocl::HOGDescriptor::detectMultiScale(const oclMat &img, vector<Rect> &f
         }
         else
         {
-            hog::resize( img, image_scale, effect_size);
+            resize(img, image_scale, effect_size);
             detect(image_scale, locations, hit_threshold, win_stride, padding);
         }
-        Size scaled_win_size(cvRound(win_size.width * scale), cvRound(win_size.height * scale));
+        Size scaled_win_size(cvRound(win_size.width * scale), 
+            cvRound(win_size.height * scale));
         for (size_t j = 0; j < locations.size(); j++)
-            all_candidates.push_back(Rect(Point2d((CvPoint)locations[j]) * scale, scaled_win_size));
+            all_candidates.push_back(Rect(Point2d((CvPoint)locations[j]) * scale, 
+              scaled_win_size));
     }
 
     found_locations.assign(all_candidates.begin(), all_candidates.end());
-    groupRectangles(found_locations, group_threshold, 0.2/*magic number copied from CPU version*/);
+    groupRectangles(found_locations, group_threshold, 0.2);
 }
 
 int cv::ocl::HOGDescriptor::numPartsWithin(int size, int part_size, int stride)
@@ -364,9 +500,11 @@ int cv::ocl::HOGDescriptor::numPartsWithin(int size, int part_size, int stride)
     return (size - part_size + stride) / stride;
 }
 
-cv::Size cv::ocl::HOGDescriptor::numPartsWithin(cv::Size size, cv::Size part_size, cv::Size stride)
+cv::Size cv::ocl::HOGDescriptor::numPartsWithin(cv::Size size, cv::Size part_size, 
+                                                cv::Size stride)
 {
-    return Size(numPartsWithin(size.width, part_size.width, stride.width), numPartsWithin(size.height, part_size.height, stride.height));
+    return Size(numPartsWithin(size.width, part_size.width, stride.width), 
+        numPartsWithin(size.height, part_size.height, stride.height));
 }
 
 std::vector<float> cv::ocl::HOGDescriptor::getDefaultPeopleDetector()
@@ -1547,8 +1685,9 @@ static int power_2up(unsigned int n)
     return -1; // Input is too big
 }
 
-void cv::ocl::device::hog::set_up_constants(int nbins, int block_stride_x, int block_stride_y,
-        int nblocks_win_x, int nblocks_win_y)
+void cv::ocl::device::hog::set_up_constants(int nbins, 
+                                            int block_stride_x, int block_stride_y, 
+                                            int nblocks_win_x, int nblocks_win_y)
 {
     cnbins = nbins;
     cblock_stride_x = block_stride_x;
@@ -1559,53 +1698,32 @@ void cv::ocl::device::hog::set_up_constants(int nbins, int block_stride_x, int b
     int block_hist_size = nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y;
     cblock_hist_size = block_hist_size;
 
-    int block_hist_size_2up = power_2up(block_hist_size);
-    cblock_hist_size_2up = block_hist_size_2up;
-
     int descr_width = nblocks_win_x * block_hist_size;
     cdescr_width = descr_width;
+    cdescr_height = nblocks_win_y;
 
     int descr_size = descr_width * nblocks_win_y;
     cdescr_size = descr_size;
 }
 
-static inline int divUp(int total, int grain)
-{
-    return (total + grain - 1) / grain;
-}
-
-static void openCLExecuteKernel_hog(Context *clCxt , const char **source, string kernelName, 
-                                    size_t globalThreads[3], size_t localThreads[3], 
-                                    vector< pair<size_t, const void *> > &args)
-{
-    cl_kernel kernel = openCLGetKernelFromSource(clCxt, source, kernelName);
-    size_t wave_size = queryDeviceInfo<WAVEFRONT_SIZE, size_t>(kernel);
-    openCLSafeCall(clReleaseKernel(kernel));
-    if (wave_size <= 16)
-    {
-        char build_options[64];
-        sprintf(build_options, (wave_size == 16) ? "-D WAVE_SIZE_16" : "-D WAVE_SIZE_1");
-        openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads, args, -1, -1, build_options);
-    }
-    else
-        openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads, args, -1, -1);
-}
-
-void cv::ocl::device::hog::compute_hists(int nbins, int block_stride_x, int block_stride_y,
-        int height, int width, const cv::ocl::oclMat &grad,
-        const cv::ocl::oclMat &qangle, float sigma, cv::ocl::oclMat &block_hists)
+void cv::ocl::device::hog::compute_hists(int nbins, 
+                                         int block_stride_x, int block_stride_y,
+                                         int height, int width, float sigma, 
+                                         const cv::ocl::oclMat &grad, 
+                                         const cv::ocl::oclMat &qangle, 
+                                         const cv::ocl::oclMat &gauss_w_lut, 
+                                         cv::ocl::oclMat &block_hists)
 {
     Context *clCxt = Context::getContext();
-    string kernelName = "compute_hists_kernel";
     vector< pair<size_t, const void *> > args;
+    string kernelName = (sigma == 4.0f) ? "compute_hists_lut_kernel" : 
+        "compute_hists_kernel";
 
-    int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / block_stride_x;
-    int img_block_height = (height - CELLS_PER_BLOCK_Y * CELL_HEIGHT + block_stride_y) / block_stride_y;
-
+    int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) 
+        / block_stride_x;
+    int img_block_height = (height - CELLS_PER_BLOCK_Y * CELL_HEIGHT + block_stride_y) 
+        / block_stride_y;
     int blocks_total = img_block_width * img_block_height;
-    int blocks_in_group = 4;
-    size_t localThreads[3] = { blocks_in_group * 24, 2, 1 };
-    size_t globalThreads[3] = { divUp(blocks_total, blocks_in_group) * localThreads[0], 2, 1 };
 
     int grad_quadstep = grad.step >> 2;
     int qangle_step = qangle.step;
@@ -1613,6 +1731,11 @@ void cv::ocl::device::hog::compute_hists(int nbins, int block_stride_x, int bloc
     // Precompute gaussian spatial window parameter
     float scale = 1.f / (2.f * sigma * sigma);
 
+    int blocks_in_group = 4;
+    size_t localThreads[3] = { blocks_in_group * 24, 2, 1 };
+    size_t globalThreads[3] = { 
+        divUp(img_block_width * img_block_height, blocks_in_group) * localThreads[0], 2, 1 };
+
     int hists_size = (nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y * 12) * sizeof(float);
     int final_hists_size = (nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y) * sizeof(float);
     int smem = (hists_size + final_hists_size) * blocks_in_group;
@@ -1628,62 +1751,120 @@ void cv::ocl::device::hog::compute_hists(int nbins, int block_stride_x, int bloc
     args.push_back( make_pair( sizeof(cl_int), (void *)&qangle_step));
     args.push_back( make_pair( sizeof(cl_mem), (void *)&grad.data));
     args.push_back( make_pair( sizeof(cl_mem), (void *)&qangle.data));
-    args.push_back( make_pair( sizeof(cl_float), (void *)&scale));
+    if (kernelName.compare("compute_hists_lut_kernel") == 0)
+        args.push_back( make_pair( sizeof(cl_mem), (void *)&gauss_w_lut.data));
+    else
+        args.push_back( make_pair( sizeof(cl_float), (void *)&scale));
     args.push_back( make_pair( sizeof(cl_mem), (void *)&block_hists.data));
     args.push_back( make_pair( smem, (void *)NULL));
 
-    openCLExecuteKernel_hog(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args);
+    openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, 
+        localThreads, args, -1, -1);
 }
 
-void cv::ocl::device::hog::normalize_hists(int nbins, int block_stride_x, int block_stride_y,
-        int height, int width, cv::ocl::oclMat &block_hists, float threshold)
+void cv::ocl::device::hog::normalize_hists(int nbins, 
+                                           int block_stride_x, int block_stride_y,
+                                           int height, int width, 
+                                           cv::ocl::oclMat &block_hists, 
+                                           float threshold)
 {
     Context *clCxt = Context::getContext();
-    string kernelName = "normalize_hists_kernel";
     vector< pair<size_t, const void *> > args;
+    string kernelName;
 
     int block_hist_size = nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y;
-    int nthreads = power_2up(block_hist_size);
-
-    int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / block_stride_x;
-    int img_block_height = (height - CELLS_PER_BLOCK_Y * CELL_HEIGHT + block_stride_y) / block_stride_y;
-    size_t globalThreads[3] = { img_block_width * nthreads, img_block_height, 1 };
-    size_t localThreads[3] = { nthreads, 1, 1  };
-
-    if ((nthreads < 32) || (nthreads > 512) )
-        cv::ocl::error("normalize_hists: histogram's size is too small or too big", __FILE__, __LINE__, "normalize_hists");
+    int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) 
+        / block_stride_x;
+    int img_block_height = (height - CELLS_PER_BLOCK_Y * CELL_HEIGHT + block_stride_y) 
+        / block_stride_y;
+    int nthreads;
+    size_t globalThreads[3] = { 1, 1, 1  };
+    size_t localThreads[3] = { 1, 1, 1  };
+    
+    if ( nbins == 9 )
+    {
+        /* optimized for the case of 9 bins */
+        kernelName = "normalize_hists_36_kernel";
+        int blocks_in_group = NTHREADS / block_hist_size;
+        nthreads = blocks_in_group * block_hist_size;
+        int num_groups = divUp( img_block_width * img_block_height, blocks_in_group);
+        globalThreads[0] = nthreads * num_groups;
+        localThreads[0] = nthreads;
+    }
+    else
+    {
+        kernelName = "normalize_hists_kernel";
+        nthreads = power_2up(block_hist_size);
+        globalThreads[0] = img_block_width * nthreads;
+        globalThreads[1] = img_block_height;
+        localThreads[0] = nthreads;
+
+        if ((nthreads < 32) || (nthreads > 512) )
+            cv::ocl::error("normalize_hists: histogram's size is too small or too big", 
+                __FILE__, __LINE__, "normalize_hists");
+
+        args.push_back( make_pair( sizeof(cl_int), (void *)&nthreads));
+        args.push_back( make_pair( sizeof(cl_int), (void *)&block_hist_size));
+        args.push_back( make_pair( sizeof(cl_int), (void *)&img_block_width));
+    }
 
-    args.push_back( make_pair( sizeof(cl_int), (void *)&nthreads));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&block_hist_size));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&img_block_width));
     args.push_back( make_pair( sizeof(cl_mem), (void *)&block_hists.data));
     args.push_back( make_pair( sizeof(cl_float), (void *)&threshold));
     args.push_back( make_pair( nthreads * sizeof(float), (void *)NULL));
 
-    openCLExecuteKernel_hog(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args);
+    if(hog_device_cpu)
+        openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, 
+                             localThreads, args, -1, -1, "-D CPU");
+    else
+        openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, 
+                             localThreads, args, -1, -1);
 }
 
-void cv::ocl::device::hog::classify_hists(int win_height, int win_width, int block_stride_y,
-        int block_stride_x, int win_stride_y, int win_stride_x, int height,
-        int width, const cv::ocl::oclMat &block_hists, const cv::ocl::oclMat &coefs, float free_coef,
-        float threshold, cv::ocl::oclMat &labels)
+void cv::ocl::device::hog::classify_hists(int win_height, int win_width, 
+                                          int block_stride_y, int block_stride_x, 
+                                          int win_stride_y, int win_stride_x, 
+                                          int height, int width, 
+                                          const cv::ocl::oclMat &block_hists, 
+                                          const cv::ocl::oclMat &coefs, 
+                                          float free_coef, float threshold, 
+                                          cv::ocl::oclMat &labels)
 {
     Context *clCxt = Context::getContext();
-    string kernelName = "classify_hists_kernel";
     vector< pair<size_t, const void *> > args;
 
+    int nthreads;
+    string kernelName;
+    switch (cdescr_width)
+    {
+    case 180:
+        nthreads = 180;
+        kernelName = "classify_hists_180_kernel";
+        args.push_back( make_pair( sizeof(cl_int), (void *)&cdescr_width));
+        args.push_back( make_pair( sizeof(cl_int), (void *)&cdescr_height));
+        break;
+    case 252:
+        nthreads = 256;
+        kernelName = "classify_hists_252_kernel";
+        args.push_back( make_pair( sizeof(cl_int), (void *)&cdescr_width));
+        args.push_back( make_pair( sizeof(cl_int), (void *)&cdescr_height));
+        break;
+    default:
+        nthreads = 256;
+        kernelName = "classify_hists_kernel";
+        args.push_back( make_pair( sizeof(cl_int), (void *)&cdescr_size));
+        args.push_back( make_pair( sizeof(cl_int), (void *)&cdescr_width));
+    }
+
     int win_block_stride_x = win_stride_x / block_stride_x;
     int win_block_stride_y = win_stride_y / block_stride_y;
     int img_win_width = (width - win_width + win_stride_x) / win_stride_x;
     int img_win_height = (height - win_height + win_stride_y) / win_stride_y;
-    int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / block_stride_x;
-
-    size_t globalThreads[3] = { img_win_width * NTHREADS, img_win_height, 1 };
-    size_t localThreads[3] = { NTHREADS, 1, 1 };
+    int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / 
+        block_stride_x;
 
+    size_t globalThreads[3] = { img_win_width * nthreads, img_win_height, 1 };
+    size_t localThreads[3] = { nthreads, 1, 1 };
     args.push_back( make_pair( sizeof(cl_int), (void *)&cblock_hist_size));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&cdescr_size));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&cdescr_width));
     args.push_back( make_pair( sizeof(cl_int), (void *)&img_win_width));
     args.push_back( make_pair( sizeof(cl_int), (void *)&img_block_width));
     args.push_back( make_pair( sizeof(cl_int), (void *)&win_block_stride_x));
@@ -1694,12 +1875,20 @@ void cv::ocl::device::hog::classify_hists(int win_height, int win_width, int blo
     args.push_back( make_pair( sizeof(cl_float), (void *)&threshold));
     args.push_back( make_pair( sizeof(cl_mem), (void *)&labels.data));
 
-    openCLExecuteKernel_hog(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args);
+    if(hog_device_cpu)
+        openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, 
+                             localThreads, args, -1, -1, "-D CPU");
+    else
+        openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, 
+                             localThreads, args, -1, -1);
 }
 
-void cv::ocl::device::hog::extract_descrs_by_rows(int win_height, int win_width, int block_stride_y, int block_stride_x,
-        int win_stride_y, int win_stride_x, int height, int width,
-        const cv::ocl::oclMat &block_hists, cv::ocl::oclMat &descriptors)
+void cv::ocl::device::hog::extract_descrs_by_rows(int win_height, int win_width, 
+                                                  int block_stride_y, int block_stride_x,
+                                                  int win_stride_y, int win_stride_x, 
+                                                  int height, int width,
+                                                  const cv::ocl::oclMat &block_hists, 
+                                                  cv::ocl::oclMat &descriptors)
 {
     Context *clCxt = Context::getContext();
     string kernelName = "extract_descrs_by_rows_kernel";
@@ -1709,7 +1898,8 @@ void cv::ocl::device::hog::extract_descrs_by_rows(int win_height, int win_width,
     int win_block_stride_y = win_stride_y / block_stride_y;
     int img_win_width = (width - win_width + win_stride_x) / win_stride_x;
     int img_win_height = (height - win_height + win_stride_y) / win_stride_y;
-    int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / block_stride_x;
+    int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / 
+        block_stride_x;
     int descriptors_quadstep = descriptors.step >> 2;
 
     size_t globalThreads[3] = { img_win_width * NTHREADS, img_win_height, 1 };
@@ -1725,12 +1915,16 @@ void cv::ocl::device::hog::extract_descrs_by_rows(int win_height, int win_width,
     args.push_back( make_pair( sizeof(cl_mem), (void *)&block_hists.data));
     args.push_back( make_pair( sizeof(cl_mem), (void *)&descriptors.data));
 
-    openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1);
+    openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, 
+        localThreads, args, -1, -1);
 }
 
-void cv::ocl::device::hog::extract_descrs_by_cols(int win_height, int win_width, int block_stride_y, int block_stride_x,
-        int win_stride_y, int win_stride_x, int height, int width,
-        const cv::ocl::oclMat &block_hists, cv::ocl::oclMat &descriptors)
+void cv::ocl::device::hog::extract_descrs_by_cols(int win_height, int win_width, 
+                                                  int block_stride_y, int block_stride_x,
+                                                  int win_stride_y, int win_stride_x, 
+                                                  int height, int width,
+                                                  const cv::ocl::oclMat &block_hists, 
+                                                  cv::ocl::oclMat &descriptors)
 {
     Context *clCxt = Context::getContext();
     string kernelName = "extract_descrs_by_cols_kernel";
@@ -1740,7 +1934,8 @@ void cv::ocl::device::hog::extract_descrs_by_cols(int win_height, int win_width,
     int win_block_stride_y = win_stride_y / block_stride_y;
     int img_win_width = (width - win_width + win_stride_x) / win_stride_x;
     int img_win_height = (height - win_height + win_stride_y) / win_stride_y;
-    int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / block_stride_x;
+    int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / 
+        block_stride_x;
     int descriptors_quadstep = descriptors.step >> 2;
 
     size_t globalThreads[3] = { img_win_width * NTHREADS, img_win_height, 1 };
@@ -1757,11 +1952,16 @@ void cv::ocl::device::hog::extract_descrs_by_cols(int win_height, int win_width,
     args.push_back( make_pair( sizeof(cl_mem), (void *)&block_hists.data));
     args.push_back( make_pair( sizeof(cl_mem), (void *)&descriptors.data));
 
-    openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1);
+    openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, 
+        localThreads, args, -1, -1);
 }
 
-void cv::ocl::device::hog::compute_gradients_8UC1(int height, int width, const cv::ocl::oclMat &img,
-        float angle_scale, cv::ocl::oclMat &grad, cv::ocl::oclMat &qangle, bool correct_gamma)
+void cv::ocl::device::hog::compute_gradients_8UC1(int height, int width, 
+                                                  const cv::ocl::oclMat &img,
+                                                  float angle_scale, 
+                                                  cv::ocl::oclMat &grad, 
+                                                  cv::ocl::oclMat &qangle, 
+                                                  bool correct_gamma)
 {
     Context *clCxt = Context::getContext();
     string kernelName = "compute_gradients_8UC1_kernel";
@@ -1786,11 +1986,16 @@ void cv::ocl::device::hog::compute_gradients_8UC1(int height, int width, const c
     args.push_back( make_pair( sizeof(cl_char), (void *)&correctGamma));
     args.push_back( make_pair( sizeof(cl_int), (void *)&cnbins));
 
-    openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1);
+    openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, 
+        localThreads, args, -1, -1);
 }
 
-void cv::ocl::device::hog::compute_gradients_8UC4(int height, int width, const cv::ocl::oclMat &img,
-        float angle_scale, cv::ocl::oclMat &grad, cv::ocl::oclMat &qangle, bool correct_gamma)
+void cv::ocl::device::hog::compute_gradients_8UC4(int height, int width, 
+                                                  const cv::ocl::oclMat &img,
+                                                  float angle_scale, 
+                                                  cv::ocl::oclMat &grad, 
+                                                  cv::ocl::oclMat &qangle, 
+                                                  bool correct_gamma)
 {
     Context *clCxt = Context::getContext();
     string kernelName = "compute_gradients_8UC4_kernel";
@@ -1816,39 +2021,6 @@ void cv::ocl::device::hog::compute_gradients_8UC4(int height, int width, const c
     args.push_back( make_pair( sizeof(cl_char), (void *)&correctGamma));
     args.push_back( make_pair( sizeof(cl_int), (void *)&cnbins));
 
-    openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1);
-}
-
-void cv::ocl::device::hog::resize( const oclMat &src, oclMat &dst, const Size sz)
-{
-    CV_Assert( (src.channels() == dst.channels()) );
-    Context *clCxt = Context::getContext();
-
-    string kernelName = (src.type() == CV_8UC1) ? "resize_8UC1_kernel" : "resize_8UC4_kernel";
-    size_t blkSizeX = 16, blkSizeY = 16;
-    size_t glbSizeX = sz.width % blkSizeX == 0 ? sz.width : (sz.width / blkSizeX + 1) * blkSizeX;
-    size_t glbSizeY = sz.height % blkSizeY == 0 ? sz.height : (sz.height / blkSizeY + 1) * blkSizeY;
-    size_t globalThreads[3] = {glbSizeX, glbSizeY, 1};
-    size_t localThreads[3] = {blkSizeX, blkSizeY, 1};
-
-    float ifx = (float)src.cols / sz.width;
-    float ify = (float)src.rows / sz.height;
-    int src_step = static_cast<int>(src.step);
-    int dst_step = static_cast<int>(dst.step);
-
-    vector< pair<size_t, const void *> > args;
-    args.push_back( make_pair(sizeof(cl_mem), (void *)&dst.data));
-    args.push_back( make_pair(sizeof(cl_mem), (void *)&src.data));
-    args.push_back( make_pair(sizeof(cl_int), (void *)&dst.offset));
-    args.push_back( make_pair(sizeof(cl_int), (void *)&src.offset));
-    args.push_back( make_pair(sizeof(cl_int), (void *)&dst_step));
-    args.push_back( make_pair(sizeof(cl_int), (void *)&src_step));
-    args.push_back( make_pair(sizeof(cl_int), (void *)&src.cols));
-    args.push_back( make_pair(sizeof(cl_int), (void *)&src.rows));
-    args.push_back( make_pair(sizeof(cl_int), (void *)&sz.width));
-    args.push_back( make_pair(sizeof(cl_int), (void *)&sz.height));
-    args.push_back( make_pair(sizeof(cl_float), (void *)&ifx));
-    args.push_back( make_pair(sizeof(cl_float), (void *)&ify));
-
-    openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1);
-}
+    openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, 
+        localThreads, args, -1, -1);
+}
\ No newline at end of file
diff --git a/modules/ocl/src/opencl/objdetect_hog.cl b/modules/ocl/src/opencl/objdetect_hog.cl
index 8852fac..05d5383 100644
--- a/modules/ocl/src/opencl/objdetect_hog.cl
+++ b/modules/ocl/src/opencl/objdetect_hog.cl
@@ -43,7 +43,6 @@
 //
 //M*/
 
-
 #define CELL_WIDTH 8
 #define CELL_HEIGHT 8
 #define CELLS_PER_BLOCK_X 2
@@ -54,6 +53,100 @@
 //----------------------------------------------------------------------------
 // Histogram computation
 // 12 threads for a cell, 12x4 threads per block
+// Use pre-computed gaussian and interp_weight lookup tables if sigma is 4.0f
+__kernel void compute_hists_lut_kernel(
+    const int cblock_stride_x, const int cblock_stride_y,
+    const int cnbins, const int cblock_hist_size, const int img_block_width, 
+    const int blocks_in_group, const int blocks_total,
+    const int grad_quadstep, const int qangle_step,
+    __global const float* grad, __global const uchar* qangle,
+    __global const float* gauss_w_lut,
+    __global float* block_hists, __local float* smem)
+{
+    const int lx = get_local_id(0);
+    const int lp = lx / 24; /* local group id */
+    const int gid = get_group_id(0) * blocks_in_group + lp;/* global group id */
+    const int gidY = gid / img_block_width;
+    const int gidX = gid - gidY * img_block_width;
+
+    const int lidX = lx - lp * 24;
+    const int lidY = get_local_id(1);
+
+    const int cell_x = lidX / 12;
+    const int cell_y = lidY;
+    const int cell_thread_x = lidX - cell_x * 12;
+
+    __local float* hists = smem + lp * cnbins * (CELLS_PER_BLOCK_X * 
+        CELLS_PER_BLOCK_Y * 12 + CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y);
+    __local float* final_hist = hists + cnbins * 
+        (CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y * 12);
+
+    const int offset_x = gidX * cblock_stride_x + (cell_x << 2) + cell_thread_x;
+    const int offset_y = gidY * cblock_stride_y + (cell_y << 2);
+
+    __global const float* grad_ptr = (gid < blocks_total) ? 
+        grad + offset_y * grad_quadstep + (offset_x << 1) : grad;
+    __global const uchar* qangle_ptr = (gid < blocks_total) ?
+        qangle + offset_y * qangle_step + (offset_x << 1) : qangle;
+
+    __local float* hist = hists + 12 * (cell_y * CELLS_PER_BLOCK_Y + cell_x) + 
+        cell_thread_x;
+    for (int bin_id = 0; bin_id < cnbins; ++bin_id)
+        hist[bin_id * 48] = 0.f;
+
+    const int dist_x = -4 + cell_thread_x - 4 * cell_x;
+    const int dist_center_x = dist_x - 4 * (1 - 2 * cell_x);
+
+    const int dist_y_begin = -4 - 4 * lidY;
+    for (int dist_y = dist_y_begin; dist_y < dist_y_begin + 12; ++dist_y)
+    {
+        float2 vote = (float2) (grad_ptr[0], grad_ptr[1]);
+        uchar2 bin = (uchar2) (qangle_ptr[0], qangle_ptr[1]);
+
+        grad_ptr += grad_quadstep;
+        qangle_ptr += qangle_step;
+
+        int dist_center_y = dist_y - 4 * (1 - 2 * cell_y);
+
+        int idx = (dist_center_y + 8) * 16 + (dist_center_x + 8);
+        float gaussian = gauss_w_lut[idx];
+        idx = (dist_y + 8) * 16 + (dist_x + 8);
+        float interp_weight = gauss_w_lut[256+idx];
+
+        hist[bin.x * 48] += gaussian * interp_weight * vote.x;
+        hist[bin.y * 48] += gaussian * interp_weight * vote.y;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    volatile __local float* hist_ = hist;
+    for (int bin_id = 0; bin_id < cnbins; ++bin_id, hist_ += 48)
+    {
+        if (cell_thread_x < 6)
+            hist_[0] += hist_[6];
+        barrier(CLK_LOCAL_MEM_FENCE);
+        if (cell_thread_x < 3)
+            hist_[0] += hist_[3];
+#ifdef CPU
+        barrier(CLK_LOCAL_MEM_FENCE);
+#endif
+        if (cell_thread_x == 0)
+            final_hist[(cell_x * 2 + cell_y) * cnbins + bin_id] = 
+                hist_[0] + hist_[1] + hist_[2];
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    int tid = (cell_y * CELLS_PER_BLOCK_Y + cell_x) * 12 + cell_thread_x;
+    if ((tid < cblock_hist_size) && (gid < blocks_total))
+    {
+        __global float* block_hist = block_hists + 
+            (gidY * img_block_width + gidX) * cblock_hist_size;
+        block_hist[tid] = final_hist[tid];
+    }
+}
+
+//----------------------------------------------------------------------------
+// Histogram computation
+// 12 threads for a cell, 12x4 threads per block
 __kernel void compute_hists_kernel(
     const int cblock_stride_x, const int cblock_stride_y,
     const int cnbins, const int cblock_hist_size, const int img_block_width, 
@@ -125,16 +218,14 @@ __kernel void compute_hists_kernel(
         barrier(CLK_LOCAL_MEM_FENCE);
         if (cell_thread_x < 3)
             hist_[0] += hist_[3];
-#ifdef WAVE_SIZE_1
+#ifdef CPU
         barrier(CLK_LOCAL_MEM_FENCE);
 #endif
         if (cell_thread_x == 0)
             final_hist[(cell_x * 2 + cell_y) * cnbins + bin_id] = 
                 hist_[0] + hist_[1] + hist_[2];
     }
-#ifdef WAVE_SIZE_1
     barrier(CLK_LOCAL_MEM_FENCE);
-#endif
 
     int tid = (cell_y * CELLS_PER_BLOCK_Y + cell_x) * 12 + cell_thread_x;
     if ((tid < cblock_hist_size) && (gid < blocks_total))
@@ -147,82 +238,107 @@ __kernel void compute_hists_kernel(
 
 //-------------------------------------------------------------
 //  Normalization of histograms via L2Hys_norm
+//  optimized for the case of 9 bins
+__kernel void normalize_hists_36_kernel(__global float* block_hists, 
+                                        const float threshold, __local float *squares)
+{
+    const int tid = get_local_id(0);
+    const int gid = get_global_id(0);
+    const int bid = tid / 36;      /* block-hist id, (0 - 6) */
+    const int boffset = bid * 36;  /* block-hist offset in the work-group */
+    const int hid = tid - boffset; /* histogram bin id, (0 - 35) */
+
+    float elem = block_hists[gid];
+    squares[tid] = elem * elem;
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    __local float* smem = squares + boffset;
+    float sum = smem[hid];
+    if (hid < 18)
+        smem[hid] = sum = sum + smem[hid + 18];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (hid < 9)
+        smem[hid] = sum = sum + smem[hid + 9];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (hid < 4)
+        smem[hid] = sum + smem[hid + 4];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    sum = smem[0] + smem[1] + smem[2] + smem[3] + smem[8];
+
+    elem = elem / (sqrt(sum) + 3.6f);
+    elem = min(elem, threshold);
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+    squares[tid] = elem * elem;
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    sum = smem[hid];
+    if (hid < 18)
+      smem[hid] = sum = sum + smem[hid + 18];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (hid < 9)
+        smem[hid] = sum = sum + smem[hid + 9];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (hid < 4)
+        smem[hid] = sum + smem[hid + 4];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    sum = smem[0] + smem[1] + smem[2] + smem[3] + smem[8];
+
+    block_hists[gid] = elem / (sqrt(sum) + 1e-3f);
+}
+
+//-------------------------------------------------------------
+//  Normalization of histograms via L2Hys_norm
 //
 float reduce_smem(volatile __local float* smem, int size)
 {
     unsigned int tid = get_local_id(0);
     float sum = smem[tid];
 
-    if (size >= 512)
-    {
-        if (tid < 256) smem[tid] = sum = sum + smem[tid + 256];
-        barrier(CLK_LOCAL_MEM_FENCE);
-    }
-    if (size >= 256)
-    {
-        if (tid < 128) smem[tid] = sum = sum + smem[tid + 128];
-        barrier(CLK_LOCAL_MEM_FENCE);
-    }
-    if (size >= 128)
-    {
-        if (tid < 64) smem[tid] = sum = sum + smem[tid + 64];
-        barrier(CLK_LOCAL_MEM_FENCE);
-    }
-
+    if (size >= 512) { if (tid < 256) smem[tid] = sum = sum + smem[tid + 256]; 
+        barrier(CLK_LOCAL_MEM_FENCE); }
+    if (size >= 256) { if (tid < 128) smem[tid] = sum = sum + smem[tid + 128]; 
+        barrier(CLK_LOCAL_MEM_FENCE); }
+    if (size >= 128) { if (tid < 64) smem[tid] = sum = sum + smem[tid + 64]; 
+        barrier(CLK_LOCAL_MEM_FENCE); }
+#ifdef CPU
+    if (size >= 64) { if (tid < 32) smem[tid] = sum = sum + smem[tid + 32]; 
+        barrier(CLK_LOCAL_MEM_FENCE); }
+    if (size >= 32) { if (tid < 16) smem[tid] = sum = sum + smem[tid + 16]; 
+        barrier(CLK_LOCAL_MEM_FENCE); }	
+    if (size >= 16) { if (tid < 8) smem[tid] = sum = sum + smem[tid + 8]; 
+        barrier(CLK_LOCAL_MEM_FENCE); }
+    if (size >= 8) { if (tid < 4) smem[tid] = sum = sum + smem[tid + 4]; 
+        barrier(CLK_LOCAL_MEM_FENCE); }
+    if (size >= 4) { if (tid < 2) smem[tid] = sum = sum + smem[tid + 2]; 
+        barrier(CLK_LOCAL_MEM_FENCE); }		
+    if (size >= 2) { if (tid < 1) smem[tid] = sum = sum + smem[tid + 1]; 
+        barrier(CLK_LOCAL_MEM_FENCE); }
+#else
     if (tid < 32)
     {
         if (size >= 64) smem[tid] = sum = sum + smem[tid + 32];
-#if defined(WAVE_SIZE_16) || defined(WAVE_SIZE_1)
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-    if (tid < 16)
-    {
-#endif
         if (size >= 32) smem[tid] = sum = sum + smem[tid + 16];
-#ifdef WAVE_SIZE_1
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-    if (tid < 8)
-    {
-#endif
         if (size >= 16) smem[tid] = sum = sum + smem[tid + 8];
-#ifdef WAVE_SIZE_1
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-    if (tid < 4)
-    {
-#endif
         if (size >= 8) smem[tid] = sum = sum + smem[tid + 4];
-#ifdef WAVE_SIZE_1
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-    if (tid < 2)
-    {
-#endif
         if (size >= 4) smem[tid] = sum = sum + smem[tid + 2];
-#ifdef WAVE_SIZE_1
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-    if (tid < 1)
-    {
-#endif
         if (size >= 2) smem[tid] = sum = sum + smem[tid + 1];
     }
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-    sum = smem[0];
+#endif
 
     return sum;
 }
 
-__kernel void normalize_hists_kernel(const int nthreads, const int block_hist_size, const int img_block_width,
-                                     __global float* block_hists, const float threshold, __local float *squares)
+__kernel void normalize_hists_kernel(
+    const int nthreads, const int block_hist_size, const int img_block_width,
+    __global float* block_hists, const float threshold, __local float *squares)
 {
     const int tid = get_local_id(0);
     const int gidX = get_group_id(0);
     const int gidY = get_group_id(1);
 
-    __global float* hist = block_hists + (gidY * img_block_width + gidX) * block_hist_size + tid;
+    __global float* hist = block_hists + (gidY * img_block_width + gidX) * 
+        block_hist_size + tid;
 
     float elem = 0.f;
     if (tid < block_hist_size)
@@ -249,100 +365,226 @@ __kernel void normalize_hists_kernel(const int nthreads, const int block_hist_si
 
 //---------------------------------------------------------------------
 //  Linear SVM based classification
-//
-__kernel void classify_hists_kernel(const int cblock_hist_size, const int cdescr_size, const int cdescr_width,
-                                    const int img_win_width, const int img_block_width,
-                                    const int win_block_stride_x, const int win_block_stride_y,
-                                    __global const float * block_hists, __global const float* coefs,
-                                    float free_coef, float threshold, __global uchar* labels)
+//  48x96 window, 9 bins and default parameters
+//  180 threads, each thread corresponds to a bin in a row
+__kernel void classify_hists_180_kernel(
+    const int cdescr_width, const int cdescr_height, const int cblock_hist_size,
+    const int img_win_width, const int img_block_width,
+    const int win_block_stride_x, const int win_block_stride_y,
+    __global const float * block_hists, __global const float* coefs,
+    float free_coef, float threshold, __global uchar* labels)
 {
     const int tid = get_local_id(0);
     const int gidX = get_group_id(0);
     const int gidY = get_group_id(1);
 
-    __global const float* hist = block_hists + (gidY * win_block_stride_y * img_block_width + gidX * win_block_stride_x) * cblock_hist_size;
+    __global const float* hist = block_hists + (gidY * win_block_stride_y * 
+        img_block_width + gidX * win_block_stride_x) * cblock_hist_size;
 
     float product = 0.f;
-    for (int i = tid; i < cdescr_size; i += NTHREADS)
+
+    for (int i = 0; i < cdescr_height; i++)
     {
-        int offset_y = i / cdescr_width;
-        int offset_x = i - offset_y * cdescr_width;
-        product += coefs[i] * hist[offset_y * img_block_width * cblock_hist_size + offset_x];
+        product += coefs[i * cdescr_width + tid] * 
+            hist[i * img_block_width * cblock_hist_size + tid];
     }
 
-    __local float products[NTHREADS];
+    __local float products[180];
 
     products[tid] = product;
 
     barrier(CLK_LOCAL_MEM_FENCE);
 
-    if (tid < 128) products[tid] = product = product + products[tid + 128];
+    if (tid < 90) products[tid] = product = product + products[tid + 90];
     barrier(CLK_LOCAL_MEM_FENCE);
 
-    if (tid < 64) products[tid] = product = product + products[tid + 64];
+    if (tid < 45) products[tid] = product = product + products[tid + 45];
     barrier(CLK_LOCAL_MEM_FENCE);
 
     volatile __local float* smem = products;
-    if (tid < 32)
+#ifdef CPU
+    if (tid < 13) smem[tid] = product = product + smem[tid + 32];
+	barrier(CLK_LOCAL_MEM_FENCE);
+    if (tid < 16) smem[tid] = product = product + smem[tid + 16];
+	barrier(CLK_LOCAL_MEM_FENCE);
+	if(tid<8) smem[tid] = product = product + smem[tid + 8];
+	barrier(CLK_LOCAL_MEM_FENCE);
+	if(tid<4) smem[tid] = product = product + smem[tid + 4];
+	barrier(CLK_LOCAL_MEM_FENCE);
+	if(tid<2) smem[tid] = product = product + smem[tid + 2];
+	barrier(CLK_LOCAL_MEM_FENCE);
+#else
+    if (tid < 13)
     {
         smem[tid] = product = product + smem[tid + 32];
-#if defined(WAVE_SIZE_16) || defined(WAVE_SIZE_1)
     }
-    barrier(CLK_LOCAL_MEM_FENCE);
     if (tid < 16)
     {
-#endif
         smem[tid] = product = product + smem[tid + 16];
-#ifdef WAVE_SIZE_1
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-    if (tid < 8)
-    {
-#endif
         smem[tid] = product = product + smem[tid + 8];
-#ifdef WAVE_SIZE_1
+        smem[tid] = product = product + smem[tid + 4];
+        smem[tid] = product = product + smem[tid + 2];
     }
-    barrier(CLK_LOCAL_MEM_FENCE);
-    if (tid < 4)
-    {
 #endif
-        smem[tid] = product = product + smem[tid + 4];
-#ifdef WAVE_SIZE_1
+
+    if (tid == 0){
+		product = product + smem[tid + 1];
+        labels[gidY * img_win_width + gidX] = (product + free_coef >= threshold);
+	}
+}
+
+//---------------------------------------------------------------------
+//  Linear SVM based classification
+//  64x128 window, 9 bins and default parameters
+//  256 threads, 252 of them are used
+__kernel void classify_hists_252_kernel(
+    const int cdescr_width, const int cdescr_height, const int cblock_hist_size,
+    const int img_win_width, const int img_block_width,
+    const int win_block_stride_x, const int win_block_stride_y,
+    __global const float * block_hists, __global const float* coefs,
+    float free_coef, float threshold, __global uchar* labels)
+{
+    const int tid = get_local_id(0);
+    const int gidX = get_group_id(0);
+    const int gidY = get_group_id(1);
+
+    __global const float* hist = block_hists + (gidY * win_block_stride_y * 
+        img_block_width + gidX * win_block_stride_x) * cblock_hist_size;
+
+    float product = 0.f;
+    if (tid < cdescr_width)
+    {
+        for (int i = 0; i < cdescr_height; i++)
+            product += coefs[i * cdescr_width + tid] * 
+                hist[i * img_block_width * cblock_hist_size + tid];
     }
+
+    __local float products[NTHREADS];
+
+    products[tid] = product;
+
     barrier(CLK_LOCAL_MEM_FENCE);
-    if (tid < 2)
-    {
-#endif
+
+    if (tid < 128) products[tid] = product = product + products[tid + 128];
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid < 64) products[tid] = product = product + products[tid + 64];
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+	volatile __local float* smem = products;
+#ifdef CPU
+	if(tid<32) smem[tid] = product = product + smem[tid + 32];
+	barrier(CLK_LOCAL_MEM_FENCE);
+	if(tid<16) smem[tid] = product = product + smem[tid + 16];
+	barrier(CLK_LOCAL_MEM_FENCE);
+	if(tid<8) smem[tid] = product = product + smem[tid + 8];
+	barrier(CLK_LOCAL_MEM_FENCE);
+	if(tid<4) smem[tid] = product = product + smem[tid + 4];
+	barrier(CLK_LOCAL_MEM_FENCE);
+	if(tid<2) smem[tid] = product = product + smem[tid + 2];
+	barrier(CLK_LOCAL_MEM_FENCE);
+#else
+    if (tid < 32)
+    {      
+        smem[tid] = product = product + smem[tid + 32];
+        smem[tid] = product = product + smem[tid + 16];
+        smem[tid] = product = product + smem[tid + 8];
+        smem[tid] = product = product + smem[tid + 4];
         smem[tid] = product = product + smem[tid + 2];
-#ifdef WAVE_SIZE_1
     }
-    barrier(CLK_LOCAL_MEM_FENCE);
-    if (tid < 1)
-    {
 #endif
-        smem[tid] = product = product + smem[tid + 1];
+    if (tid == 0){
+		product = product + smem[tid + 1];
+        labels[gidY * img_win_width + gidX] = (product + free_coef >= threshold);
+	}
+}
+
+//---------------------------------------------------------------------
+//  Linear SVM based classification
+//  256 threads
+__kernel void classify_hists_kernel(
+    const int cdescr_size, const int cdescr_width, const int cblock_hist_size,
+    const int img_win_width, const int img_block_width,
+    const int win_block_stride_x, const int win_block_stride_y,
+    __global const float * block_hists, __global const float* coefs,
+    float free_coef, float threshold, __global uchar* labels)
+{
+    const int tid = get_local_id(0);
+    const int gidX = get_group_id(0);
+    const int gidY = get_group_id(1);
+
+    __global const float* hist = block_hists + (gidY * win_block_stride_y * 
+        img_block_width + gidX * win_block_stride_x) * cblock_hist_size;
+
+    float product = 0.f;
+    for (int i = tid; i < cdescr_size; i += NTHREADS)
+    {
+        int offset_y = i / cdescr_width;
+        int offset_x = i - offset_y * cdescr_width;
+        product += coefs[i] * 
+            hist[offset_y * img_block_width * cblock_hist_size + offset_x];
     }
 
-    if (tid == 0)
+    __local float products[NTHREADS];
+
+    products[tid] = product;
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid < 128) products[tid] = product = product + products[tid + 128];
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid < 64) products[tid] = product = product + products[tid + 64];
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+	volatile __local float* smem = products;
+#ifdef CPU
+	if(tid<32) smem[tid] = product = product + smem[tid + 32];
+	barrier(CLK_LOCAL_MEM_FENCE);
+	if(tid<16) smem[tid] = product = product + smem[tid + 16];
+	barrier(CLK_LOCAL_MEM_FENCE);
+	if(tid<8) smem[tid] = product = product + smem[tid + 8];
+	barrier(CLK_LOCAL_MEM_FENCE);
+	if(tid<4) smem[tid] = product = product + smem[tid + 4];
+	barrier(CLK_LOCAL_MEM_FENCE);
+	if(tid<2) smem[tid] = product = product + smem[tid + 2];
+	barrier(CLK_LOCAL_MEM_FENCE);
+#else
+    if (tid < 32)
+    {       
+        smem[tid] = product = product + smem[tid + 32];
+        smem[tid] = product = product + smem[tid + 16];
+        smem[tid] = product = product + smem[tid + 8];
+        smem[tid] = product = product + smem[tid + 4];
+        smem[tid] = product = product + smem[tid + 2];
+    }
+#endif
+    if (tid == 0){
+		smem[tid] = product = product + smem[tid + 1];
         labels[gidY * img_win_width + gidX] = (product + free_coef >= threshold);
+	}
 }
 
 //----------------------------------------------------------------------------
 // Extract descriptors
 
-__kernel void extract_descrs_by_rows_kernel(const int cblock_hist_size, const int descriptors_quadstep, const int cdescr_size, const int cdescr_width,
-        const int img_block_width, const int win_block_stride_x, const int win_block_stride_y,
-        __global const float* block_hists, __global float* descriptors)
+__kernel void extract_descrs_by_rows_kernel(
+    const int cblock_hist_size, const int descriptors_quadstep, 
+    const int cdescr_size, const int cdescr_width, const int img_block_width, 
+    const int win_block_stride_x, const int win_block_stride_y,
+    __global const float* block_hists, __global float* descriptors)
 {
     int tid = get_local_id(0);
     int gidX = get_group_id(0);
     int gidY = get_group_id(1);
 
     // Get left top corner of the window in src
-    __global const float* hist = block_hists + (gidY * win_block_stride_y * img_block_width + gidX * win_block_stride_x) * cblock_hist_size;
+    __global const float* hist = block_hists + (gidY * win_block_stride_y * 
+        img_block_width + gidX * win_block_stride_x) * cblock_hist_size;
 
     // Get left top corner of the window in dst
-    __global float* descriptor = descriptors + (gidY * get_num_groups(0) + gidX) * descriptors_quadstep;
+    __global float* descriptor = descriptors + 
+        (gidY * get_num_groups(0) + gidX) * descriptors_quadstep;
 
     // Copy elements from src to dst
     for (int i = tid; i < cdescr_size; i += NTHREADS)
@@ -353,19 +595,23 @@ __kernel void extract_descrs_by_rows_kernel(const int cblock_hist_size, const in
     }
 }
 
-__kernel void extract_descrs_by_cols_kernel(const int cblock_hist_size, const int descriptors_quadstep, const int cdescr_size,
-        const int cnblocks_win_x, const int cnblocks_win_y, const int img_block_width, const int win_block_stride_x,
-        const int win_block_stride_y, __global const float* block_hists, __global float* descriptors)
+__kernel void extract_descrs_by_cols_kernel(
+    const int cblock_hist_size, const int descriptors_quadstep, const int cdescr_size,
+    const int cnblocks_win_x, const int cnblocks_win_y, const int img_block_width, 
+    const int win_block_stride_x, const int win_block_stride_y, 
+    __global const float* block_hists, __global float* descriptors)
 {
     int tid = get_local_id(0);
     int gidX = get_group_id(0);
     int gidY = get_group_id(1);
 
     // Get left top corner of the window in src
-    __global const float* hist = block_hists + (gidY * win_block_stride_y * img_block_width + gidX * win_block_stride_x) * cblock_hist_size;
+    __global const float* hist = block_hists +  (gidY * win_block_stride_y * 
+        img_block_width + gidX * win_block_stride_x) * cblock_hist_size;
 
     // Get left top corner of the window in dst
-    __global float* descriptor = descriptors + (gidY * get_num_groups(0) + gidX) * descriptors_quadstep;
+    __global float* descriptor = descriptors + 
+        (gidY * get_num_groups(0) + gidX) * descriptors_quadstep;
 
     // Copy elements from src to dst
     for (int i = tid; i < cdescr_size; i += NTHREADS)
@@ -376,16 +622,19 @@ __kernel void extract_descrs_by_cols_kernel(const int cblock_hist_size, const in
         int y = block_idx / cnblocks_win_x;
         int x = block_idx - y * cnblocks_win_x;
 
-        descriptor[(x * cnblocks_win_y + y) * cblock_hist_size + idx_in_block] = hist[(y * img_block_width  + x) * cblock_hist_size + idx_in_block];
+        descriptor[(x * cnblocks_win_y + y) * cblock_hist_size + idx_in_block] = 
+            hist[(y * img_block_width  + x) * cblock_hist_size + idx_in_block];
     }
 }
 
 //----------------------------------------------------------------------------
 // Gradients computation
 
-__kernel void compute_gradients_8UC4_kernel(const int height, const int width, const int img_step, const int grad_quadstep, const int qangle_step,
-        const __global uchar4 * img, __global float * grad, __global uchar * qangle,
-        const float angle_scale, const char correct_gamma, const int cnbins)
+__kernel void compute_gradients_8UC4_kernel(
+    const int height, const int width, 
+    const int img_step, const int grad_quadstep, const int qangle_step,
+    const __global uchar4 * img, __global float * grad, __global uchar * qangle,
+    const float angle_scale, const char correct_gamma, const int cnbins)
 {
     const int x = get_global_id(0);
     const int tid = get_local_id(0);
@@ -426,8 +675,10 @@ __kernel void compute_gradients_8UC4_kernel(const int height, const int width, c
     barrier(CLK_LOCAL_MEM_FENCE);
     if (x < width)
     {
-        float3 a = (float3) (sh_row[tid], sh_row[tid + (NTHREADS + 2)], sh_row[tid + 2 * (NTHREADS + 2)]);
-        float3 b = (float3) (sh_row[tid + 2], sh_row[tid + 2 + (NTHREADS + 2)], sh_row[tid + 2 + 2 * (NTHREADS + 2)]);
+        float3 a = (float3) (sh_row[tid], sh_row[tid + (NTHREADS + 2)], 
+            sh_row[tid + 2 * (NTHREADS + 2)]);
+        float3 b = (float3) (sh_row[tid + 2], sh_row[tid + 2 + (NTHREADS + 2)], 
+            sh_row[tid + 2 + 2 * (NTHREADS + 2)]);
 
         float3 dx;
         if (correct_gamma == 1)
@@ -482,9 +733,11 @@ __kernel void compute_gradients_8UC4_kernel(const int height, const int width, c
     }
 }
 
-__kernel void compute_gradients_8UC1_kernel(const int height, const int width, const int img_step, const int grad_quadstep, const int qangle_step,
-        __global const uchar * img, __global float * grad, __global uchar * qangle,
-        const float angle_scale, const char correct_gamma, const int cnbins)
+__kernel void compute_gradients_8UC1_kernel(
+    const int height, const int width, 
+    const int img_step, const int grad_quadstep, const int qangle_step,
+    __global const uchar * img, __global float * grad, __global uchar * qangle,
+    const float angle_scale, const char correct_gamma, const int cnbins)
 {
     const int x = get_global_id(0);
     const int tid = get_local_id(0);
@@ -539,43 +792,4 @@ __kernel void compute_gradients_8UC1_kernel(const int height, const int width, c
         grad[ (gidY * grad_quadstep + x) << 1 ]       = mag * (1.f - ang);
         grad[ ((gidY * grad_quadstep + x) << 1) + 1 ]   = mag * ang;
     }
-}
-
-//----------------------------------------------------------------------------
-// Resize
-
-__kernel void resize_8UC4_kernel(__global uchar4 * dst, __global const uchar4 * src,
-                                 int dst_offset, int src_offset, int dst_step, int src_step,
-                                 int src_cols, int src_rows, int dst_cols, int dst_rows, float ifx, float ify )
-{
-    int dx = get_global_id(0);
-    int dy = get_global_id(1);
-
-    int sx = (int)floor(dx*ifx+0.5f);
-    int sy = (int)floor(dy*ify+0.5f);
-    sx = min(sx, src_cols-1);
-    sy = min(sy, src_rows-1);
-    int dpos = (dst_offset>>2) + dy * (dst_step>>2) + dx;
-    int spos = (src_offset>>2) + sy * (src_step>>2) + sx;
-
-    if(dx<dst_cols && dy<dst_rows)
-        dst[dpos] = src[spos];
-}
-
-__kernel void resize_8UC1_kernel(__global uchar * dst, __global const uchar * src,
-                                 int dst_offset, int src_offset, int dst_step, int src_step,
-                                 int src_cols, int src_rows, int dst_cols, int dst_rows, float ifx, float ify )
-{
-    int dx = get_global_id(0);
-    int dy = get_global_id(1);
-
-    int sx = (int)floor(dx*ifx+0.5f);
-    int sy = (int)floor(dy*ify+0.5f);
-    sx = min(sx, src_cols-1);
-    sy = min(sy, src_rows-1);
-    int dpos = dst_offset + dy * dst_step + dx;
-    int spos = src_offset + sy * src_step + sx;
-
-    if(dx<dst_cols && dy<dst_rows)
-        dst[dpos] = src[spos];
 }
\ No newline at end of file
-- 
2.7.4