// Third party copyrights are property of their respective owners.
//
// @Authors
-// Wenju He, wenju@multicorewareinc.com
+// Wenju He, wenju@multicorewareinc.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
using namespace cv::ocl;
using namespace std;
-
#define CELL_WIDTH 8
#define CELL_HEIGHT 8
#define CELLS_PER_BLOCK_X 2
#define CELLS_PER_BLOCK_Y 2
#define NTHREADS 256
+static oclMat gauss_w_lut;
+static bool hog_device_cpu;
+/* pre-compute gaussian and interp_weight lookup tables if sigma is 4.0f */
+static const float gaussian_interp_lut[] =
+{
+ /* gaussian lut */
+ 0.01831564f, 0.02926831f, 0.04393693f, 0.06196101f, 0.08208500f, 0.10215643f,
+ 0.11943297f, 0.13117145f, 0.13533528f, 0.13117145f, 0.11943297f, 0.10215643f,
+ 0.08208500f, 0.06196101f, 0.04393693f, 0.02926831f, 0.02926831f, 0.04677062f,
+ 0.07021102f, 0.09901341f, 0.13117145f, 0.16324551f, 0.19085334f, 0.20961139f,
+ 0.21626517f, 0.20961139f, 0.19085334f, 0.16324551f, 0.13117145f, 0.09901341f,
+ 0.07021102f, 0.04677062f, 0.04393693f, 0.07021102f, 0.10539922f, 0.14863673f,
+ 0.19691168f, 0.24506053f, 0.28650481f, 0.31466395f, 0.32465246f, 0.31466395f,
+ 0.28650481f, 0.24506053f, 0.19691168f, 0.14863673f, 0.10539922f, 0.07021102f,
+ 0.06196101f, 0.09901341f, 0.14863673f, 0.20961139f, 0.27768996f, 0.34559074f,
+ 0.40403652f, 0.44374731f, 0.45783335f, 0.44374731f, 0.40403652f, 0.34559074f,
+ 0.27768996f, 0.20961139f, 0.14863673f, 0.09901341f, 0.08208500f, 0.13117145f,
+ 0.19691168f, 0.27768996f, 0.36787945f, 0.45783335f, 0.53526145f, 0.58786964f,
+ 0.60653067f, 0.58786964f, 0.53526145f, 0.45783335f, 0.36787945f, 0.27768996f,
+ 0.19691168f, 0.13117145f, 0.10215643f, 0.16324551f, 0.24506053f, 0.34559074f,
+ 0.45783335f, 0.56978285f, 0.66614360f, 0.73161560f, 0.75483960f, 0.73161560f,
+ 0.66614360f, 0.56978285f, 0.45783335f, 0.34559074f, 0.24506053f, 0.16324551f,
+ 0.11943297f, 0.19085334f, 0.28650481f, 0.40403652f, 0.53526145f, 0.66614360f,
+ 0.77880079f, 0.85534531f, 0.88249689f, 0.85534531f, 0.77880079f, 0.66614360f,
+ 0.53526145f, 0.40403652f, 0.28650481f, 0.19085334f, 0.13117145f, 0.20961139f,
+ 0.31466395f, 0.44374731f, 0.58786964f, 0.73161560f, 0.85534531f, 0.93941307f,
+ 0.96923321f, 0.93941307f, 0.85534531f, 0.73161560f, 0.58786964f, 0.44374731f,
+ 0.31466395f, 0.20961139f, 0.13533528f, 0.21626517f, 0.32465246f, 0.45783335f,
+ 0.60653067f, 0.75483960f, 0.88249689f, 0.96923321f, 1.00000000f, 0.96923321f,
+ 0.88249689f, 0.75483960f, 0.60653067f, 0.45783335f, 0.32465246f, 0.21626517f,
+ 0.13117145f, 0.20961139f, 0.31466395f, 0.44374731f, 0.58786964f, 0.73161560f,
+ 0.85534531f, 0.93941307f, 0.96923321f, 0.93941307f, 0.85534531f, 0.73161560f,
+ 0.58786964f, 0.44374731f, 0.31466395f, 0.20961139f, 0.11943297f, 0.19085334f,
+ 0.28650481f, 0.40403652f, 0.53526145f, 0.66614360f, 0.77880079f, 0.85534531f,
+ 0.88249689f, 0.85534531f, 0.77880079f, 0.66614360f, 0.53526145f, 0.40403652f,
+ 0.28650481f, 0.19085334f, 0.10215643f, 0.16324551f, 0.24506053f, 0.34559074f,
+ 0.45783335f, 0.56978285f, 0.66614360f, 0.73161560f, 0.75483960f, 0.73161560f,
+ 0.66614360f, 0.56978285f, 0.45783335f, 0.34559074f, 0.24506053f, 0.16324551f,
+ 0.08208500f, 0.13117145f, 0.19691168f, 0.27768996f, 0.36787945f, 0.45783335f,
+ 0.53526145f, 0.58786964f, 0.60653067f, 0.58786964f, 0.53526145f, 0.45783335f,
+ 0.36787945f, 0.27768996f, 0.19691168f, 0.13117145f, 0.06196101f, 0.09901341f,
+ 0.14863673f, 0.20961139f, 0.27768996f, 0.34559074f, 0.40403652f, 0.44374731f,
+ 0.45783335f, 0.44374731f, 0.40403652f, 0.34559074f, 0.27768996f, 0.20961139f,
+ 0.14863673f, 0.09901341f, 0.04393693f, 0.07021102f, 0.10539922f, 0.14863673f,
+ 0.19691168f, 0.24506053f, 0.28650481f, 0.31466395f, 0.32465246f, 0.31466395f,
+ 0.28650481f, 0.24506053f, 0.19691168f, 0.14863673f, 0.10539922f, 0.07021102f,
+ 0.02926831f, 0.04677062f, 0.07021102f, 0.09901341f, 0.13117145f, 0.16324551f,
+ 0.19085334f, 0.20961139f, 0.21626517f, 0.20961139f, 0.19085334f, 0.16324551f,
+ 0.13117145f, 0.09901341f, 0.07021102f, 0.04677062f,
+ /* interp_weight lut */
+ 0.00390625f, 0.01171875f, 0.01953125f, 0.02734375f, 0.03515625f, 0.04296875f,
+ 0.05078125f, 0.05859375f, 0.05859375f, 0.05078125f, 0.04296875f, 0.03515625f,
+ 0.02734375f, 0.01953125f, 0.01171875f, 0.00390625f, 0.01171875f, 0.03515625f,
+ 0.05859375f, 0.08203125f, 0.10546875f, 0.12890625f, 0.15234375f, 0.17578125f,
+ 0.17578125f, 0.15234375f, 0.12890625f, 0.10546875f, 0.08203125f, 0.05859375f,
+ 0.03515625f, 0.01171875f, 0.01953125f, 0.05859375f, 0.09765625f, 0.13671875f,
+ 0.17578125f, 0.21484375f, 0.25390625f, 0.29296875f, 0.29296875f, 0.25390625f,
+ 0.21484375f, 0.17578125f, 0.13671875f, 0.09765625f, 0.05859375f, 0.01953125f,
+ 0.02734375f, 0.08203125f, 0.13671875f, 0.19140625f, 0.24609375f, 0.30078125f,
+ 0.35546875f, 0.41015625f, 0.41015625f, 0.35546875f, 0.30078125f, 0.24609375f,
+ 0.19140625f, 0.13671875f, 0.08203125f, 0.02734375f, 0.03515625f, 0.10546875f,
+ 0.17578125f, 0.24609375f, 0.31640625f, 0.38671875f, 0.45703125f, 0.52734375f,
+ 0.52734375f, 0.45703125f, 0.38671875f, 0.31640625f, 0.24609375f, 0.17578125f,
+ 0.10546875f, 0.03515625f, 0.04296875f, 0.12890625f, 0.21484375f, 0.30078125f,
+ 0.38671875f, 0.47265625f, 0.55859375f, 0.64453125f, 0.64453125f, 0.55859375f,
+ 0.47265625f, 0.38671875f, 0.30078125f, 0.21484375f, 0.12890625f, 0.04296875f,
+ 0.05078125f, 0.15234375f, 0.25390625f, 0.35546875f, 0.45703125f, 0.55859375f,
+ 0.66015625f, 0.76171875f, 0.76171875f, 0.66015625f, 0.55859375f, 0.45703125f,
+ 0.35546875f, 0.25390625f, 0.15234375f, 0.05078125f, 0.05859375f, 0.17578125f,
+ 0.29296875f, 0.41015625f, 0.52734375f, 0.64453125f, 0.76171875f, 0.87890625f,
+ 0.87890625f, 0.76171875f, 0.64453125f, 0.52734375f, 0.41015625f, 0.29296875f,
+ 0.17578125f, 0.05859375f, 0.05859375f, 0.17578125f, 0.29296875f, 0.41015625f,
+ 0.52734375f, 0.64453125f, 0.76171875f, 0.87890625f, 0.87890625f, 0.76171875f,
+ 0.64453125f, 0.52734375f, 0.41015625f, 0.29296875f, 0.17578125f, 0.05859375f,
+ 0.05078125f, 0.15234375f, 0.25390625f, 0.35546875f, 0.45703125f, 0.55859375f,
+ 0.66015625f, 0.76171875f, 0.76171875f, 0.66015625f, 0.55859375f, 0.45703125f,
+ 0.35546875f, 0.25390625f, 0.15234375f, 0.05078125f, 0.04296875f, 0.12890625f,
+ 0.21484375f, 0.30078125f, 0.38671875f, 0.47265625f, 0.55859375f, 0.64453125f,
+ 0.64453125f, 0.55859375f, 0.47265625f, 0.38671875f, 0.30078125f, 0.21484375f,
+ 0.12890625f, 0.04296875f, 0.03515625f, 0.10546875f, 0.17578125f, 0.24609375f,
+ 0.31640625f, 0.38671875f, 0.45703125f, 0.52734375f, 0.52734375f, 0.45703125f,
+ 0.38671875f, 0.31640625f, 0.24609375f, 0.17578125f, 0.10546875f, 0.03515625f,
+ 0.02734375f, 0.08203125f, 0.13671875f, 0.19140625f, 0.24609375f, 0.30078125f,
+ 0.35546875f, 0.41015625f, 0.41015625f, 0.35546875f, 0.30078125f, 0.24609375f,
+ 0.19140625f, 0.13671875f, 0.08203125f, 0.02734375f, 0.01953125f, 0.05859375f,
+ 0.09765625f, 0.13671875f, 0.17578125f, 0.21484375f, 0.25390625f, 0.29296875f,
+ 0.29296875f, 0.25390625f, 0.21484375f, 0.17578125f, 0.13671875f, 0.09765625f,
+ 0.05859375f, 0.01953125f, 0.01171875f, 0.03515625f, 0.05859375f, 0.08203125f,
+ 0.10546875f, 0.12890625f, 0.15234375f, 0.17578125f, 0.17578125f, 0.15234375f,
+ 0.12890625f, 0.10546875f, 0.08203125f, 0.05859375f, 0.03515625f, 0.01171875f,
+ 0.00390625f, 0.01171875f, 0.01953125f, 0.02734375f, 0.03515625f, 0.04296875f,
+ 0.05078125f, 0.05859375f, 0.05859375f, 0.05078125f, 0.04296875f, 0.03515625f,
+ 0.02734375f, 0.01953125f, 0.01171875f, 0.00390625f
+};
+
namespace cv
{
namespace ocl
int cnblocks_win_x;
int cnblocks_win_y;
int cblock_hist_size;
- int cblock_hist_size_2up;
int cdescr_size;
int cdescr_width;
+ int cdescr_height;
void set_up_constants(int nbins, int block_stride_x, int block_stride_y,
int nblocks_win_x, int nblocks_win_y);
void compute_hists(int nbins, int block_stride_x, int blovck_stride_y,
- int height, int width, const cv::ocl::oclMat &grad,
- const cv::ocl::oclMat &qangle, float sigma, cv::ocl::oclMat &block_hists);
+ int height, int width, float sigma, const cv::ocl::oclMat &grad,
+ const cv::ocl::oclMat &qangle,
+ const cv::ocl::oclMat &gauss_w_lut, cv::ocl::oclMat &block_hists);
void normalize_hists(int nbins, int block_stride_x, int block_stride_y,
- int height, int width, cv::ocl::oclMat &block_hists, float threshold);
+ int height, int width, cv::ocl::oclMat &block_hists,
+ float threshold);
void classify_hists(int win_height, int win_width, int block_stride_y,
- int block_stride_x, int win_stride_y, int win_stride_x, int height,
- int width, const cv::ocl::oclMat &block_hists, const cv::ocl::oclMat &coefs, float free_coef,
+ int block_stride_x, int win_stride_y, int win_stride_x,
+ int height, int width, const cv::ocl::oclMat &block_hists,
+ const cv::ocl::oclMat &coefs, float free_coef,
float threshold, cv::ocl::oclMat &labels);
- void extract_descrs_by_rows(int win_height, int win_width, int block_stride_y, int block_stride_x,
- int win_stride_y, int win_stride_x, int height, int width, const cv::ocl::oclMat &block_hists,
+ void extract_descrs_by_rows(int win_height, int win_width, int block_stride_y,
+ int block_stride_x, int win_stride_y, int win_stride_x,
+ int height, int width, const cv::ocl::oclMat &block_hists,
cv::ocl::oclMat &descriptors);
- void extract_descrs_by_cols(int win_height, int win_width, int block_stride_y, int block_stride_x,
- int win_stride_y, int win_stride_x, int height, int width, const cv::ocl::oclMat &block_hists,
+ void extract_descrs_by_cols(int win_height, int win_width, int block_stride_y,
+ int block_stride_x, int win_stride_y, int win_stride_x,
+ int height, int width, const cv::ocl::oclMat &block_hists,
cv::ocl::oclMat &descriptors);
void compute_gradients_8UC1(int height, int width, const cv::ocl::oclMat &img,
- float angle_scale, cv::ocl::oclMat &grad, cv::ocl::oclMat &qangle, bool correct_gamma);
+ float angle_scale, cv::ocl::oclMat &grad,
+ cv::ocl::oclMat &qangle, bool correct_gamma);
void compute_gradients_8UC4(int height, int width, const cv::ocl::oclMat &img,
- float angle_scale, cv::ocl::oclMat &grad, cv::ocl::oclMat &qangle, bool correct_gamma);
-
- void resize( const oclMat &src, oclMat &dst, const Size sz);
+ float angle_scale, cv::ocl::oclMat &grad,
+ cv::ocl::oclMat &qangle, bool correct_gamma);
}
}
}
using namespace ::cv::ocl::device;
-cv::ocl::HOGDescriptor::HOGDescriptor(Size win_size_, Size block_size_, Size block_stride_, Size cell_size_,
- int nbins_, double win_sigma_, double threshold_L2hys_, bool gamma_correction_, int nlevels_)
+static inline int divUp(int total, int grain)
+{
+ return (total + grain - 1) / grain;
+}
+
+cv::ocl::HOGDescriptor::HOGDescriptor(Size win_size_, Size block_size_, Size block_stride_,
+ Size cell_size_, int nbins_, double win_sigma_,
+ double threshold_L2hys_, bool gamma_correction_, int nlevels_)
: win_size(win_size_),
block_size(block_size_),
block_stride(block_stride_),
CV_Assert((win_size.width - block_size.width ) % block_stride.width == 0 &&
(win_size.height - block_size.height) % block_stride.height == 0);
- CV_Assert(block_size.width % cell_size.width == 0 && block_size.height % cell_size.height == 0);
+ CV_Assert(block_size.width % cell_size.width == 0 &&
+ block_size.height % cell_size.height == 0);
CV_Assert(block_stride == cell_size);
CV_Assert(cell_size == Size(8, 8));
- Size cells_per_block = Size(block_size.width / cell_size.width, block_size.height / cell_size.height);
+ Size cells_per_block(block_size.width / cell_size.width,
+ block_size.height / cell_size.height);
CV_Assert(cells_per_block == Size(2, 2));
cv::Size blocks_per_win = numPartsWithin(win_size, block_size, block_stride);
- hog::set_up_constants(nbins, block_stride.width, block_stride.height, blocks_per_win.width, blocks_per_win.height);
+ hog::set_up_constants(nbins, block_stride.width, block_stride.height,
+ blocks_per_win.width, blocks_per_win.height);
effect_size = Size(0, 0);
+
+ if (queryDeviceInfo<IS_CPU_DEVICE, bool>())
+ hog_device_cpu = true;
+ else
+ hog_device_cpu = false;
}
size_t cv::ocl::HOGDescriptor::getDescriptorSize() const
size_t cv::ocl::HOGDescriptor::getBlockHistogramSize() const
{
- Size cells_per_block = Size(block_size.width / cell_size.width, block_size.height / cell_size.height);
+ Size cells_per_block = Size(block_size.width / cell_size.width,
+ block_size.height / cell_size.height);
return (size_t)(nbins * cells_per_block.area());
}
{
size_t detector_size = detector.rows * detector.cols;
size_t descriptor_size = getDescriptorSize();
- return detector_size == 0 || detector_size == descriptor_size || detector_size == descriptor_size + 1;
+ return detector_size == 0 || detector_size == descriptor_size ||
+ detector_size == descriptor_size + 1;
}
void cv::ocl::HOGDescriptor::setSVMDetector(const vector<float> &_detector)
const size_t block_hist_size = getBlockHistogramSize();
const Size blocks_per_img = numPartsWithin(img.size(), block_size, block_stride);
- block_hists.create(1, static_cast<int>(block_hist_size * blocks_per_img.area()), CV_32F);
+ block_hists.create(1,
+ static_cast<int>(block_hist_size * blocks_per_img.area()) + 256, CV_32F);
Size wins_per_img = numPartsWithin(img.size(), win_size, win_stride);
labels.create(1, wins_per_img.area(), CV_8U);
+
+ vector<float> v_lut = vector<float>(gaussian_interp_lut, gaussian_interp_lut +
+ sizeof(gaussian_interp_lut) / sizeof(gaussian_interp_lut[0]));
+ Mat m_lut(v_lut);
+ gauss_w_lut.upload(m_lut.reshape(1,1));
}
void cv::ocl::HOGDescriptor::computeGradient(const oclMat &img, oclMat &grad, oclMat &qangle)
switch (img.type())
{
case CV_8UC1:
- hog::compute_gradients_8UC1(effect_size.height, effect_size.width, img, angleScale, grad, qangle, gamma_correction);
+ hog::compute_gradients_8UC1(effect_size.height, effect_size.width, img,
+ angleScale, grad, qangle, gamma_correction);
break;
case CV_8UC4:
- hog::compute_gradients_8UC4(effect_size.height, effect_size.width, img, angleScale, grad, qangle, gamma_correction);
+ hog::compute_gradients_8UC4(effect_size.height, effect_size.width, img,
+ angleScale, grad, qangle, gamma_correction);
break;
}
}
+
void cv::ocl::HOGDescriptor::computeBlockHistograms(const oclMat &img)
{
- computeGradient(img, grad, qangle);
+ computeGradient(img, this->grad, this->qangle);
- hog::compute_hists(nbins, block_stride.width, block_stride.height, effect_size.height, effect_size.width,
- grad, qangle, (float)getWinSigma(), block_hists);
+ hog::compute_hists(nbins, block_stride.width, block_stride.height, effect_size.height,
+ effect_size.width, (float)getWinSigma(), grad, qangle, gauss_w_lut, block_hists);
- hog::normalize_hists(nbins, block_stride.width, block_stride.height, effect_size.height, effect_size.width,
- block_hists, (float)threshold_L2hys);
+ hog::normalize_hists(nbins, block_stride.width, block_stride.height, effect_size.height,
+ effect_size.width, block_hists, (float)threshold_L2hys);
}
-void cv::ocl::HOGDescriptor::getDescriptors(const oclMat &img, Size win_stride, oclMat &descriptors, int descr_format)
+void cv::ocl::HOGDescriptor::getDescriptors(const oclMat &img, Size win_stride,
+ oclMat &descriptors, int descr_format)
{
- CV_Assert(win_stride.width % block_stride.width == 0 && win_stride.height % block_stride.height == 0);
+ CV_Assert(win_stride.width % block_stride.width == 0 &&
+ win_stride.height % block_stride.height == 0);
init_buffer(img, win_stride);
Size blocks_per_win = numPartsWithin(win_size, block_size, block_stride);
Size wins_per_img = numPartsWithin(effect_size, win_size, win_stride);
- descriptors.create(wins_per_img.area(), static_cast<int>(blocks_per_win.area() * block_hist_size), CV_32F);
+ descriptors.create(wins_per_img.area(),
+ static_cast<int>(blocks_per_win.area() * block_hist_size), CV_32F);
switch (descr_format)
{
case DESCR_FORMAT_ROW_BY_ROW:
- hog::extract_descrs_by_rows(win_size.height, win_size.width, block_stride.height, block_stride.width,
- win_stride.height, win_stride.width, effect_size.height, effect_size.width, block_hists, descriptors);
+ hog::extract_descrs_by_rows(win_size.height, win_size.width,
+ block_stride.height, block_stride.width, win_stride.height, win_stride.width,
+ effect_size.height, effect_size.width, block_hists, descriptors);
break;
case DESCR_FORMAT_COL_BY_COL:
- hog::extract_descrs_by_cols(win_size.height, win_size.width, block_stride.height, block_stride.width,
- win_stride.height, win_stride.width, effect_size.height, effect_size.width, block_hists, descriptors);
+ hog::extract_descrs_by_cols(win_size.height, win_size.width,
+ block_stride.height, block_stride.width, win_stride.height, win_stride.width,
+ effect_size.height, effect_size.width, block_hists, descriptors);
break;
default:
CV_Error(CV_StsBadArg, "Unknown descriptor format");
}
-void cv::ocl::HOGDescriptor::detect(const oclMat &img, vector<Point> &hits, double hit_threshold, Size win_stride, Size padding)
+void cv::ocl::HOGDescriptor::detect(const oclMat &img, vector<Point> &hits,
+ double hit_threshold, Size win_stride, Size padding)
{
CV_Assert(img.type() == CV_8UC1 || img.type() == CV_8UC4);
CV_Assert(padding == Size(0, 0));
if (win_stride == Size())
win_stride = block_stride;
else
- CV_Assert(win_stride.width % block_stride.width == 0 && win_stride.height % block_stride.height == 0);
+ CV_Assert(win_stride.width % block_stride.width == 0 &&
+ win_stride.height % block_stride.height == 0);
init_buffer(img, win_stride);
computeBlockHistograms(img);
- hog::classify_hists(win_size.height, win_size.width, block_stride.height, block_stride.width,
- win_stride.height, win_stride.width, effect_size.height, effect_size.width, block_hists,
- detector, (float)free_coef, (float)hit_threshold, labels);
+ hog::classify_hists(win_size.height, win_size.width, block_stride.height,
+ block_stride.width, win_stride.height, win_stride.width,
+ effect_size.height, effect_size.width, block_hists, detector,
+ (float)free_coef, (float)hit_threshold, labels);
labels.download(labels_host);
unsigned char *vec = labels_host.ptr();
-void cv::ocl::HOGDescriptor::detectMultiScale(const oclMat &img, vector<Rect> &found_locations, double hit_threshold,
- Size win_stride, Size padding, double scale0, int group_threshold)
+void cv::ocl::HOGDescriptor::detectMultiScale(const oclMat &img, vector<Rect> &found_locations,
+ double hit_threshold, Size win_stride, Size padding,
+ double scale0, int group_threshold)
{
CV_Assert(img.type() == CV_8UC1 || img.type() == CV_8UC4);
CV_Assert(scale0 > 1);
if (win_stride == Size())
win_stride = block_stride;
else
- CV_Assert(win_stride.width % block_stride.width == 0 && win_stride.height % block_stride.height == 0);
+ CV_Assert(win_stride.width % block_stride.width == 0 &&
+ win_stride.height % block_stride.height == 0);
init_buffer(img, win_stride);
image_scale.create(img.size(), img.type());
}
else
{
- hog::resize( img, image_scale, effect_size);
+ resize(img, image_scale, effect_size);
detect(image_scale, locations, hit_threshold, win_stride, padding);
}
- Size scaled_win_size(cvRound(win_size.width * scale), cvRound(win_size.height * scale));
+ Size scaled_win_size(cvRound(win_size.width * scale),
+ cvRound(win_size.height * scale));
for (size_t j = 0; j < locations.size(); j++)
- all_candidates.push_back(Rect(Point2d((CvPoint)locations[j]) * scale, scaled_win_size));
+ all_candidates.push_back(Rect(Point2d((CvPoint)locations[j]) * scale,
+ scaled_win_size));
}
found_locations.assign(all_candidates.begin(), all_candidates.end());
- groupRectangles(found_locations, group_threshold, 0.2/*magic number copied from CPU version*/);
+ groupRectangles(found_locations, group_threshold, 0.2);
}
int cv::ocl::HOGDescriptor::numPartsWithin(int size, int part_size, int stride)
return (size - part_size + stride) / stride;
}
-cv::Size cv::ocl::HOGDescriptor::numPartsWithin(cv::Size size, cv::Size part_size, cv::Size stride)
+cv::Size cv::ocl::HOGDescriptor::numPartsWithin(cv::Size size, cv::Size part_size,
+ cv::Size stride)
{
- return Size(numPartsWithin(size.width, part_size.width, stride.width), numPartsWithin(size.height, part_size.height, stride.height));
+ return Size(numPartsWithin(size.width, part_size.width, stride.width),
+ numPartsWithin(size.height, part_size.height, stride.height));
}
std::vector<float> cv::ocl::HOGDescriptor::getDefaultPeopleDetector()
return -1; // Input is too big
}
-void cv::ocl::device::hog::set_up_constants(int nbins, int block_stride_x, int block_stride_y,
- int nblocks_win_x, int nblocks_win_y)
+void cv::ocl::device::hog::set_up_constants(int nbins,
+ int block_stride_x, int block_stride_y,
+ int nblocks_win_x, int nblocks_win_y)
{
cnbins = nbins;
cblock_stride_x = block_stride_x;
int block_hist_size = nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y;
cblock_hist_size = block_hist_size;
- int block_hist_size_2up = power_2up(block_hist_size);
- cblock_hist_size_2up = block_hist_size_2up;
-
int descr_width = nblocks_win_x * block_hist_size;
cdescr_width = descr_width;
+ cdescr_height = nblocks_win_y;
int descr_size = descr_width * nblocks_win_y;
cdescr_size = descr_size;
}
-static inline int divUp(int total, int grain)
-{
- return (total + grain - 1) / grain;
-}
-
-static void openCLExecuteKernel_hog(Context *clCxt , const char **source, string kernelName,
- size_t globalThreads[3], size_t localThreads[3],
- vector< pair<size_t, const void *> > &args)
-{
- cl_kernel kernel = openCLGetKernelFromSource(clCxt, source, kernelName);
- size_t wave_size = queryDeviceInfo<WAVEFRONT_SIZE, size_t>(kernel);
- openCLSafeCall(clReleaseKernel(kernel));
- if (wave_size <= 16)
- {
- char build_options[64];
- sprintf(build_options, (wave_size == 16) ? "-D WAVE_SIZE_16" : "-D WAVE_SIZE_1");
- openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads, args, -1, -1, build_options);
- }
- else
- openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads, args, -1, -1);
-}
-
-void cv::ocl::device::hog::compute_hists(int nbins, int block_stride_x, int block_stride_y,
- int height, int width, const cv::ocl::oclMat &grad,
- const cv::ocl::oclMat &qangle, float sigma, cv::ocl::oclMat &block_hists)
+void cv::ocl::device::hog::compute_hists(int nbins,
+ int block_stride_x, int block_stride_y,
+ int height, int width, float sigma,
+ const cv::ocl::oclMat &grad,
+ const cv::ocl::oclMat &qangle,
+ const cv::ocl::oclMat &gauss_w_lut,
+ cv::ocl::oclMat &block_hists)
{
Context *clCxt = Context::getContext();
- string kernelName = "compute_hists_kernel";
vector< pair<size_t, const void *> > args;
+ string kernelName = (sigma == 4.0f) ? "compute_hists_lut_kernel" :
+ "compute_hists_kernel";
- int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / block_stride_x;
- int img_block_height = (height - CELLS_PER_BLOCK_Y * CELL_HEIGHT + block_stride_y) / block_stride_y;
-
+ int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x)
+ / block_stride_x;
+ int img_block_height = (height - CELLS_PER_BLOCK_Y * CELL_HEIGHT + block_stride_y)
+ / block_stride_y;
int blocks_total = img_block_width * img_block_height;
- int blocks_in_group = 4;
- size_t localThreads[3] = { blocks_in_group * 24, 2, 1 };
- size_t globalThreads[3] = { divUp(blocks_total, blocks_in_group) * localThreads[0], 2, 1 };
int grad_quadstep = grad.step >> 2;
int qangle_step = qangle.step;
// Precompute gaussian spatial window parameter
float scale = 1.f / (2.f * sigma * sigma);
+ int blocks_in_group = 4;
+ size_t localThreads[3] = { blocks_in_group * 24, 2, 1 };
+ size_t globalThreads[3] = {
+ divUp(img_block_width * img_block_height, blocks_in_group) * localThreads[0], 2, 1 };
+
int hists_size = (nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y * 12) * sizeof(float);
int final_hists_size = (nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y) * sizeof(float);
int smem = (hists_size + final_hists_size) * blocks_in_group;
args.push_back( make_pair( sizeof(cl_int), (void *)&qangle_step));
args.push_back( make_pair( sizeof(cl_mem), (void *)&grad.data));
args.push_back( make_pair( sizeof(cl_mem), (void *)&qangle.data));
- args.push_back( make_pair( sizeof(cl_float), (void *)&scale));
+ if (kernelName.compare("compute_hists_lut_kernel") == 0)
+ args.push_back( make_pair( sizeof(cl_mem), (void *)&gauss_w_lut.data));
+ else
+ args.push_back( make_pair( sizeof(cl_float), (void *)&scale));
args.push_back( make_pair( sizeof(cl_mem), (void *)&block_hists.data));
args.push_back( make_pair( smem, (void *)NULL));
- openCLExecuteKernel_hog(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args);
+ openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads,
+ localThreads, args, -1, -1);
}
-void cv::ocl::device::hog::normalize_hists(int nbins, int block_stride_x, int block_stride_y,
- int height, int width, cv::ocl::oclMat &block_hists, float threshold)
+void cv::ocl::device::hog::normalize_hists(int nbins,
+ int block_stride_x, int block_stride_y,
+ int height, int width,
+ cv::ocl::oclMat &block_hists,
+ float threshold)
{
Context *clCxt = Context::getContext();
- string kernelName = "normalize_hists_kernel";
vector< pair<size_t, const void *> > args;
+ string kernelName;
int block_hist_size = nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y;
- int nthreads = power_2up(block_hist_size);
-
- int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / block_stride_x;
- int img_block_height = (height - CELLS_PER_BLOCK_Y * CELL_HEIGHT + block_stride_y) / block_stride_y;
- size_t globalThreads[3] = { img_block_width * nthreads, img_block_height, 1 };
- size_t localThreads[3] = { nthreads, 1, 1 };
-
- if ((nthreads < 32) || (nthreads > 512) )
- cv::ocl::error("normalize_hists: histogram's size is too small or too big", __FILE__, __LINE__, "normalize_hists");
+ int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x)
+ / block_stride_x;
+ int img_block_height = (height - CELLS_PER_BLOCK_Y * CELL_HEIGHT + block_stride_y)
+ / block_stride_y;
+ int nthreads;
+ size_t globalThreads[3] = { 1, 1, 1 };
+ size_t localThreads[3] = { 1, 1, 1 };
+
+ if ( nbins == 9 )
+ {
+ /* optimized for the case of 9 bins */
+ kernelName = "normalize_hists_36_kernel";
+ int blocks_in_group = NTHREADS / block_hist_size;
+ nthreads = blocks_in_group * block_hist_size;
+ int num_groups = divUp( img_block_width * img_block_height, blocks_in_group);
+ globalThreads[0] = nthreads * num_groups;
+ localThreads[0] = nthreads;
+ }
+ else
+ {
+ kernelName = "normalize_hists_kernel";
+ nthreads = power_2up(block_hist_size);
+ globalThreads[0] = img_block_width * nthreads;
+ globalThreads[1] = img_block_height;
+ localThreads[0] = nthreads;
+
+ if ((nthreads < 32) || (nthreads > 512) )
+ cv::ocl::error("normalize_hists: histogram's size is too small or too big",
+ __FILE__, __LINE__, "normalize_hists");
+
+ args.push_back( make_pair( sizeof(cl_int), (void *)&nthreads));
+ args.push_back( make_pair( sizeof(cl_int), (void *)&block_hist_size));
+ args.push_back( make_pair( sizeof(cl_int), (void *)&img_block_width));
+ }
- args.push_back( make_pair( sizeof(cl_int), (void *)&nthreads));
- args.push_back( make_pair( sizeof(cl_int), (void *)&block_hist_size));
- args.push_back( make_pair( sizeof(cl_int), (void *)&img_block_width));
args.push_back( make_pair( sizeof(cl_mem), (void *)&block_hists.data));
args.push_back( make_pair( sizeof(cl_float), (void *)&threshold));
args.push_back( make_pair( nthreads * sizeof(float), (void *)NULL));
- openCLExecuteKernel_hog(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args);
+ if(hog_device_cpu)
+ openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads,
+ localThreads, args, -1, -1, "-D CPU");
+ else
+ openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads,
+ localThreads, args, -1, -1);
}
-void cv::ocl::device::hog::classify_hists(int win_height, int win_width, int block_stride_y,
- int block_stride_x, int win_stride_y, int win_stride_x, int height,
- int width, const cv::ocl::oclMat &block_hists, const cv::ocl::oclMat &coefs, float free_coef,
- float threshold, cv::ocl::oclMat &labels)
+void cv::ocl::device::hog::classify_hists(int win_height, int win_width,
+ int block_stride_y, int block_stride_x,
+ int win_stride_y, int win_stride_x,
+ int height, int width,
+ const cv::ocl::oclMat &block_hists,
+ const cv::ocl::oclMat &coefs,
+ float free_coef, float threshold,
+ cv::ocl::oclMat &labels)
{
Context *clCxt = Context::getContext();
- string kernelName = "classify_hists_kernel";
vector< pair<size_t, const void *> > args;
+ int nthreads;
+ string kernelName;
+ switch (cdescr_width)
+ {
+ case 180:
+ nthreads = 180;
+ kernelName = "classify_hists_180_kernel";
+ args.push_back( make_pair( sizeof(cl_int), (void *)&cdescr_width));
+ args.push_back( make_pair( sizeof(cl_int), (void *)&cdescr_height));
+ break;
+ case 252:
+ nthreads = 256;
+ kernelName = "classify_hists_252_kernel";
+ args.push_back( make_pair( sizeof(cl_int), (void *)&cdescr_width));
+ args.push_back( make_pair( sizeof(cl_int), (void *)&cdescr_height));
+ break;
+ default:
+ nthreads = 256;
+ kernelName = "classify_hists_kernel";
+ args.push_back( make_pair( sizeof(cl_int), (void *)&cdescr_size));
+ args.push_back( make_pair( sizeof(cl_int), (void *)&cdescr_width));
+ }
+
int win_block_stride_x = win_stride_x / block_stride_x;
int win_block_stride_y = win_stride_y / block_stride_y;
int img_win_width = (width - win_width + win_stride_x) / win_stride_x;
int img_win_height = (height - win_height + win_stride_y) / win_stride_y;
- int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / block_stride_x;
-
- size_t globalThreads[3] = { img_win_width * NTHREADS, img_win_height, 1 };
- size_t localThreads[3] = { NTHREADS, 1, 1 };
+ int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) /
+ block_stride_x;
+ size_t globalThreads[3] = { img_win_width * nthreads, img_win_height, 1 };
+ size_t localThreads[3] = { nthreads, 1, 1 };
args.push_back( make_pair( sizeof(cl_int), (void *)&cblock_hist_size));
- args.push_back( make_pair( sizeof(cl_int), (void *)&cdescr_size));
- args.push_back( make_pair( sizeof(cl_int), (void *)&cdescr_width));
args.push_back( make_pair( sizeof(cl_int), (void *)&img_win_width));
args.push_back( make_pair( sizeof(cl_int), (void *)&img_block_width));
args.push_back( make_pair( sizeof(cl_int), (void *)&win_block_stride_x));
args.push_back( make_pair( sizeof(cl_float), (void *)&threshold));
args.push_back( make_pair( sizeof(cl_mem), (void *)&labels.data));
- openCLExecuteKernel_hog(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args);
+ if(hog_device_cpu)
+ openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads,
+ localThreads, args, -1, -1, "-D CPU");
+ else
+ openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads,
+ localThreads, args, -1, -1);
}
-void cv::ocl::device::hog::extract_descrs_by_rows(int win_height, int win_width, int block_stride_y, int block_stride_x,
- int win_stride_y, int win_stride_x, int height, int width,
- const cv::ocl::oclMat &block_hists, cv::ocl::oclMat &descriptors)
+void cv::ocl::device::hog::extract_descrs_by_rows(int win_height, int win_width,
+ int block_stride_y, int block_stride_x,
+ int win_stride_y, int win_stride_x,
+ int height, int width,
+ const cv::ocl::oclMat &block_hists,
+ cv::ocl::oclMat &descriptors)
{
Context *clCxt = Context::getContext();
string kernelName = "extract_descrs_by_rows_kernel";
int win_block_stride_y = win_stride_y / block_stride_y;
int img_win_width = (width - win_width + win_stride_x) / win_stride_x;
int img_win_height = (height - win_height + win_stride_y) / win_stride_y;
- int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / block_stride_x;
+ int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) /
+ block_stride_x;
int descriptors_quadstep = descriptors.step >> 2;
size_t globalThreads[3] = { img_win_width * NTHREADS, img_win_height, 1 };
args.push_back( make_pair( sizeof(cl_mem), (void *)&block_hists.data));
args.push_back( make_pair( sizeof(cl_mem), (void *)&descriptors.data));
- openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1);
+ openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads,
+ localThreads, args, -1, -1);
}
-void cv::ocl::device::hog::extract_descrs_by_cols(int win_height, int win_width, int block_stride_y, int block_stride_x,
- int win_stride_y, int win_stride_x, int height, int width,
- const cv::ocl::oclMat &block_hists, cv::ocl::oclMat &descriptors)
+void cv::ocl::device::hog::extract_descrs_by_cols(int win_height, int win_width,
+ int block_stride_y, int block_stride_x,
+ int win_stride_y, int win_stride_x,
+ int height, int width,
+ const cv::ocl::oclMat &block_hists,
+ cv::ocl::oclMat &descriptors)
{
Context *clCxt = Context::getContext();
string kernelName = "extract_descrs_by_cols_kernel";
int win_block_stride_y = win_stride_y / block_stride_y;
int img_win_width = (width - win_width + win_stride_x) / win_stride_x;
int img_win_height = (height - win_height + win_stride_y) / win_stride_y;
- int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / block_stride_x;
+ int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) /
+ block_stride_x;
int descriptors_quadstep = descriptors.step >> 2;
size_t globalThreads[3] = { img_win_width * NTHREADS, img_win_height, 1 };
args.push_back( make_pair( sizeof(cl_mem), (void *)&block_hists.data));
args.push_back( make_pair( sizeof(cl_mem), (void *)&descriptors.data));
- openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1);
+ openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads,
+ localThreads, args, -1, -1);
}
-void cv::ocl::device::hog::compute_gradients_8UC1(int height, int width, const cv::ocl::oclMat &img,
- float angle_scale, cv::ocl::oclMat &grad, cv::ocl::oclMat &qangle, bool correct_gamma)
+void cv::ocl::device::hog::compute_gradients_8UC1(int height, int width,
+ const cv::ocl::oclMat &img,
+ float angle_scale,
+ cv::ocl::oclMat &grad,
+ cv::ocl::oclMat &qangle,
+ bool correct_gamma)
{
Context *clCxt = Context::getContext();
string kernelName = "compute_gradients_8UC1_kernel";
args.push_back( make_pair( sizeof(cl_char), (void *)&correctGamma));
args.push_back( make_pair( sizeof(cl_int), (void *)&cnbins));
- openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1);
+ openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads,
+ localThreads, args, -1, -1);
}
-void cv::ocl::device::hog::compute_gradients_8UC4(int height, int width, const cv::ocl::oclMat &img,
- float angle_scale, cv::ocl::oclMat &grad, cv::ocl::oclMat &qangle, bool correct_gamma)
+void cv::ocl::device::hog::compute_gradients_8UC4(int height, int width,
+ const cv::ocl::oclMat &img,
+ float angle_scale,
+ cv::ocl::oclMat &grad,
+ cv::ocl::oclMat &qangle,
+ bool correct_gamma)
{
Context *clCxt = Context::getContext();
string kernelName = "compute_gradients_8UC4_kernel";
args.push_back( make_pair( sizeof(cl_char), (void *)&correctGamma));
args.push_back( make_pair( sizeof(cl_int), (void *)&cnbins));
- openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1);
-}
-
-void cv::ocl::device::hog::resize( const oclMat &src, oclMat &dst, const Size sz)
-{
- CV_Assert( (src.channels() == dst.channels()) );
- Context *clCxt = Context::getContext();
-
- string kernelName = (src.type() == CV_8UC1) ? "resize_8UC1_kernel" : "resize_8UC4_kernel";
- size_t blkSizeX = 16, blkSizeY = 16;
- size_t glbSizeX = sz.width % blkSizeX == 0 ? sz.width : (sz.width / blkSizeX + 1) * blkSizeX;
- size_t glbSizeY = sz.height % blkSizeY == 0 ? sz.height : (sz.height / blkSizeY + 1) * blkSizeY;
- size_t globalThreads[3] = {glbSizeX, glbSizeY, 1};
- size_t localThreads[3] = {blkSizeX, blkSizeY, 1};
-
- float ifx = (float)src.cols / sz.width;
- float ify = (float)src.rows / sz.height;
- int src_step = static_cast<int>(src.step);
- int dst_step = static_cast<int>(dst.step);
-
- vector< pair<size_t, const void *> > args;
- args.push_back( make_pair(sizeof(cl_mem), (void *)&dst.data));
- args.push_back( make_pair(sizeof(cl_mem), (void *)&src.data));
- args.push_back( make_pair(sizeof(cl_int), (void *)&dst.offset));
- args.push_back( make_pair(sizeof(cl_int), (void *)&src.offset));
- args.push_back( make_pair(sizeof(cl_int), (void *)&dst_step));
- args.push_back( make_pair(sizeof(cl_int), (void *)&src_step));
- args.push_back( make_pair(sizeof(cl_int), (void *)&src.cols));
- args.push_back( make_pair(sizeof(cl_int), (void *)&src.rows));
- args.push_back( make_pair(sizeof(cl_int), (void *)&sz.width));
- args.push_back( make_pair(sizeof(cl_int), (void *)&sz.height));
- args.push_back( make_pair(sizeof(cl_float), (void *)&ifx));
- args.push_back( make_pair(sizeof(cl_float), (void *)&ify));
-
- openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1);
-}
+ openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads,
+ localThreads, args, -1, -1);
+}
\ No newline at end of file
//
//M*/
-
#define CELL_WIDTH 8
#define CELL_HEIGHT 8
#define CELLS_PER_BLOCK_X 2
//----------------------------------------------------------------------------
// Histogram computation
// 12 threads for a cell, 12x4 threads per block
+// Use pre-computed gaussian and interp_weight lookup tables if sigma is 4.0f
+__kernel void compute_hists_lut_kernel(
+ const int cblock_stride_x, const int cblock_stride_y,
+ const int cnbins, const int cblock_hist_size, const int img_block_width,
+ const int blocks_in_group, const int blocks_total,
+ const int grad_quadstep, const int qangle_step,
+ __global const float* grad, __global const uchar* qangle,
+ __global const float* gauss_w_lut,
+ __global float* block_hists, __local float* smem)
+{
+ const int lx = get_local_id(0);
+ const int lp = lx / 24; /* local group id */
+ const int gid = get_group_id(0) * blocks_in_group + lp;/* global group id */
+ const int gidY = gid / img_block_width;
+ const int gidX = gid - gidY * img_block_width;
+
+ const int lidX = lx - lp * 24;
+ const int lidY = get_local_id(1);
+
+ const int cell_x = lidX / 12;
+ const int cell_y = lidY;
+ const int cell_thread_x = lidX - cell_x * 12;
+
+ __local float* hists = smem + lp * cnbins * (CELLS_PER_BLOCK_X *
+ CELLS_PER_BLOCK_Y * 12 + CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y);
+ __local float* final_hist = hists + cnbins *
+ (CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y * 12);
+
+ const int offset_x = gidX * cblock_stride_x + (cell_x << 2) + cell_thread_x;
+ const int offset_y = gidY * cblock_stride_y + (cell_y << 2);
+
+ __global const float* grad_ptr = (gid < blocks_total) ?
+ grad + offset_y * grad_quadstep + (offset_x << 1) : grad;
+ __global const uchar* qangle_ptr = (gid < blocks_total) ?
+ qangle + offset_y * qangle_step + (offset_x << 1) : qangle;
+
+ __local float* hist = hists + 12 * (cell_y * CELLS_PER_BLOCK_Y + cell_x) +
+ cell_thread_x;
+ for (int bin_id = 0; bin_id < cnbins; ++bin_id)
+ hist[bin_id * 48] = 0.f;
+
+ const int dist_x = -4 + cell_thread_x - 4 * cell_x;
+ const int dist_center_x = dist_x - 4 * (1 - 2 * cell_x);
+
+ const int dist_y_begin = -4 - 4 * lidY;
+ for (int dist_y = dist_y_begin; dist_y < dist_y_begin + 12; ++dist_y)
+ {
+ float2 vote = (float2) (grad_ptr[0], grad_ptr[1]);
+ uchar2 bin = (uchar2) (qangle_ptr[0], qangle_ptr[1]);
+
+ grad_ptr += grad_quadstep;
+ qangle_ptr += qangle_step;
+
+ int dist_center_y = dist_y - 4 * (1 - 2 * cell_y);
+
+ int idx = (dist_center_y + 8) * 16 + (dist_center_x + 8);
+ float gaussian = gauss_w_lut[idx];
+ idx = (dist_y + 8) * 16 + (dist_x + 8);
+ float interp_weight = gauss_w_lut[256+idx];
+
+ hist[bin.x * 48] += gaussian * interp_weight * vote.x;
+ hist[bin.y * 48] += gaussian * interp_weight * vote.y;
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ volatile __local float* hist_ = hist;
+ for (int bin_id = 0; bin_id < cnbins; ++bin_id, hist_ += 48)
+ {
+ if (cell_thread_x < 6)
+ hist_[0] += hist_[6];
+ barrier(CLK_LOCAL_MEM_FENCE);
+ if (cell_thread_x < 3)
+ hist_[0] += hist_[3];
+#ifdef CPU
+ barrier(CLK_LOCAL_MEM_FENCE);
+#endif
+ if (cell_thread_x == 0)
+ final_hist[(cell_x * 2 + cell_y) * cnbins + bin_id] =
+ hist_[0] + hist_[1] + hist_[2];
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ int tid = (cell_y * CELLS_PER_BLOCK_Y + cell_x) * 12 + cell_thread_x;
+ if ((tid < cblock_hist_size) && (gid < blocks_total))
+ {
+ __global float* block_hist = block_hists +
+ (gidY * img_block_width + gidX) * cblock_hist_size;
+ block_hist[tid] = final_hist[tid];
+ }
+}
+
+//----------------------------------------------------------------------------
+// Histogram computation
+// 12 threads for a cell, 12x4 threads per block
__kernel void compute_hists_kernel(
const int cblock_stride_x, const int cblock_stride_y,
const int cnbins, const int cblock_hist_size, const int img_block_width,
barrier(CLK_LOCAL_MEM_FENCE);
if (cell_thread_x < 3)
hist_[0] += hist_[3];
-#ifdef WAVE_SIZE_1
+#ifdef CPU
barrier(CLK_LOCAL_MEM_FENCE);
#endif
if (cell_thread_x == 0)
final_hist[(cell_x * 2 + cell_y) * cnbins + bin_id] =
hist_[0] + hist_[1] + hist_[2];
}
-#ifdef WAVE_SIZE_1
barrier(CLK_LOCAL_MEM_FENCE);
-#endif
int tid = (cell_y * CELLS_PER_BLOCK_Y + cell_x) * 12 + cell_thread_x;
if ((tid < cblock_hist_size) && (gid < blocks_total))
//-------------------------------------------------------------
// Normalization of histograms via L2Hys_norm
+// optimized for the case of 9 bins
+__kernel void normalize_hists_36_kernel(__global float* block_hists,
+ const float threshold, __local float *squares)
+{
+ const int tid = get_local_id(0);
+ const int gid = get_global_id(0);
+ const int bid = tid / 36; /* block-hist id, (0 - 6) */
+ const int boffset = bid * 36; /* block-hist offset in the work-group */
+ const int hid = tid - boffset; /* histogram bin id, (0 - 35) */
+
+ float elem = block_hists[gid];
+ squares[tid] = elem * elem;
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ __local float* smem = squares + boffset;
+ float sum = smem[hid];
+ if (hid < 18)
+ smem[hid] = sum = sum + smem[hid + 18];
+ barrier(CLK_LOCAL_MEM_FENCE);
+ if (hid < 9)
+ smem[hid] = sum = sum + smem[hid + 9];
+ barrier(CLK_LOCAL_MEM_FENCE);
+ if (hid < 4)
+ smem[hid] = sum + smem[hid + 4];
+ barrier(CLK_LOCAL_MEM_FENCE);
+ sum = smem[0] + smem[1] + smem[2] + smem[3] + smem[8];
+
+ elem = elem / (sqrt(sum) + 3.6f);
+ elem = min(elem, threshold);
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+ squares[tid] = elem * elem;
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ sum = smem[hid];
+ if (hid < 18)
+ smem[hid] = sum = sum + smem[hid + 18];
+ barrier(CLK_LOCAL_MEM_FENCE);
+ if (hid < 9)
+ smem[hid] = sum = sum + smem[hid + 9];
+ barrier(CLK_LOCAL_MEM_FENCE);
+ if (hid < 4)
+ smem[hid] = sum + smem[hid + 4];
+ barrier(CLK_LOCAL_MEM_FENCE);
+ sum = smem[0] + smem[1] + smem[2] + smem[3] + smem[8];
+
+ block_hists[gid] = elem / (sqrt(sum) + 1e-3f);
+}
+
+//-------------------------------------------------------------
+// Normalization of histograms via L2Hys_norm
//
float reduce_smem(volatile __local float* smem, int size)
{
unsigned int tid = get_local_id(0);
float sum = smem[tid];
- if (size >= 512)
- {
- if (tid < 256) smem[tid] = sum = sum + smem[tid + 256];
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if (size >= 256)
- {
- if (tid < 128) smem[tid] = sum = sum + smem[tid + 128];
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if (size >= 128)
- {
- if (tid < 64) smem[tid] = sum = sum + smem[tid + 64];
- barrier(CLK_LOCAL_MEM_FENCE);
- }
-
+ if (size >= 512) { if (tid < 256) smem[tid] = sum = sum + smem[tid + 256];
+ barrier(CLK_LOCAL_MEM_FENCE); }
+ if (size >= 256) { if (tid < 128) smem[tid] = sum = sum + smem[tid + 128];
+ barrier(CLK_LOCAL_MEM_FENCE); }
+ if (size >= 128) { if (tid < 64) smem[tid] = sum = sum + smem[tid + 64];
+ barrier(CLK_LOCAL_MEM_FENCE); }
+#ifdef CPU
+ if (size >= 64) { if (tid < 32) smem[tid] = sum = sum + smem[tid + 32];
+ barrier(CLK_LOCAL_MEM_FENCE); }
+ if (size >= 32) { if (tid < 16) smem[tid] = sum = sum + smem[tid + 16];
+ barrier(CLK_LOCAL_MEM_FENCE); }
+ if (size >= 16) { if (tid < 8) smem[tid] = sum = sum + smem[tid + 8];
+ barrier(CLK_LOCAL_MEM_FENCE); }
+ if (size >= 8) { if (tid < 4) smem[tid] = sum = sum + smem[tid + 4];
+ barrier(CLK_LOCAL_MEM_FENCE); }
+ if (size >= 4) { if (tid < 2) smem[tid] = sum = sum + smem[tid + 2];
+ barrier(CLK_LOCAL_MEM_FENCE); }
+ if (size >= 2) { if (tid < 1) smem[tid] = sum = sum + smem[tid + 1];
+ barrier(CLK_LOCAL_MEM_FENCE); }
+#else
if (tid < 32)
{
if (size >= 64) smem[tid] = sum = sum + smem[tid + 32];
-#if defined(WAVE_SIZE_16) || defined(WAVE_SIZE_1)
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- if (tid < 16)
- {
-#endif
if (size >= 32) smem[tid] = sum = sum + smem[tid + 16];
-#ifdef WAVE_SIZE_1
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- if (tid < 8)
- {
-#endif
if (size >= 16) smem[tid] = sum = sum + smem[tid + 8];
-#ifdef WAVE_SIZE_1
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- if (tid < 4)
- {
-#endif
if (size >= 8) smem[tid] = sum = sum + smem[tid + 4];
-#ifdef WAVE_SIZE_1
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- if (tid < 2)
- {
-#endif
if (size >= 4) smem[tid] = sum = sum + smem[tid + 2];
-#ifdef WAVE_SIZE_1
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- if (tid < 1)
- {
-#endif
if (size >= 2) smem[tid] = sum = sum + smem[tid + 1];
}
-
- barrier(CLK_LOCAL_MEM_FENCE);
- sum = smem[0];
+#endif
return sum;
}
-__kernel void normalize_hists_kernel(const int nthreads, const int block_hist_size, const int img_block_width,
- __global float* block_hists, const float threshold, __local float *squares)
+__kernel void normalize_hists_kernel(
+ const int nthreads, const int block_hist_size, const int img_block_width,
+ __global float* block_hists, const float threshold, __local float *squares)
{
const int tid = get_local_id(0);
const int gidX = get_group_id(0);
const int gidY = get_group_id(1);
- __global float* hist = block_hists + (gidY * img_block_width + gidX) * block_hist_size + tid;
+ __global float* hist = block_hists + (gidY * img_block_width + gidX) *
+ block_hist_size + tid;
float elem = 0.f;
if (tid < block_hist_size)
//---------------------------------------------------------------------
// Linear SVM based classification
-//
-__kernel void classify_hists_kernel(const int cblock_hist_size, const int cdescr_size, const int cdescr_width,
- const int img_win_width, const int img_block_width,
- const int win_block_stride_x, const int win_block_stride_y,
- __global const float * block_hists, __global const float* coefs,
- float free_coef, float threshold, __global uchar* labels)
+// 48x96 window, 9 bins and default parameters
+// 180 threads, each thread corresponds to a bin in a row
+__kernel void classify_hists_180_kernel(
+ const int cdescr_width, const int cdescr_height, const int cblock_hist_size,
+ const int img_win_width, const int img_block_width,
+ const int win_block_stride_x, const int win_block_stride_y,
+ __global const float * block_hists, __global const float* coefs,
+ float free_coef, float threshold, __global uchar* labels)
{
const int tid = get_local_id(0);
const int gidX = get_group_id(0);
const int gidY = get_group_id(1);
- __global const float* hist = block_hists + (gidY * win_block_stride_y * img_block_width + gidX * win_block_stride_x) * cblock_hist_size;
+ __global const float* hist = block_hists + (gidY * win_block_stride_y *
+ img_block_width + gidX * win_block_stride_x) * cblock_hist_size;
float product = 0.f;
- for (int i = tid; i < cdescr_size; i += NTHREADS)
+
+ for (int i = 0; i < cdescr_height; i++)
{
- int offset_y = i / cdescr_width;
- int offset_x = i - offset_y * cdescr_width;
- product += coefs[i] * hist[offset_y * img_block_width * cblock_hist_size + offset_x];
+ product += coefs[i * cdescr_width + tid] *
+ hist[i * img_block_width * cblock_hist_size + tid];
}
- __local float products[NTHREADS];
+ __local float products[180];
products[tid] = product;
barrier(CLK_LOCAL_MEM_FENCE);
- if (tid < 128) products[tid] = product = product + products[tid + 128];
+ if (tid < 90) products[tid] = product = product + products[tid + 90];
barrier(CLK_LOCAL_MEM_FENCE);
- if (tid < 64) products[tid] = product = product + products[tid + 64];
+ if (tid < 45) products[tid] = product = product + products[tid + 45];
barrier(CLK_LOCAL_MEM_FENCE);
volatile __local float* smem = products;
- if (tid < 32)
+#ifdef CPU
+ if (tid < 13) smem[tid] = product = product + smem[tid + 32];
+ barrier(CLK_LOCAL_MEM_FENCE);
+ if (tid < 16) smem[tid] = product = product + smem[tid + 16];
+ barrier(CLK_LOCAL_MEM_FENCE);
+ if(tid<8) smem[tid] = product = product + smem[tid + 8];
+ barrier(CLK_LOCAL_MEM_FENCE);
+ if(tid<4) smem[tid] = product = product + smem[tid + 4];
+ barrier(CLK_LOCAL_MEM_FENCE);
+ if(tid<2) smem[tid] = product = product + smem[tid + 2];
+ barrier(CLK_LOCAL_MEM_FENCE);
+#else
+ if (tid < 13)
{
smem[tid] = product = product + smem[tid + 32];
-#if defined(WAVE_SIZE_16) || defined(WAVE_SIZE_1)
}
- barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 16)
{
-#endif
smem[tid] = product = product + smem[tid + 16];
-#ifdef WAVE_SIZE_1
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- if (tid < 8)
- {
-#endif
smem[tid] = product = product + smem[tid + 8];
-#ifdef WAVE_SIZE_1
+ smem[tid] = product = product + smem[tid + 4];
+ smem[tid] = product = product + smem[tid + 2];
}
- barrier(CLK_LOCAL_MEM_FENCE);
- if (tid < 4)
- {
#endif
- smem[tid] = product = product + smem[tid + 4];
-#ifdef WAVE_SIZE_1
+
+ if (tid == 0){
+ product = product + smem[tid + 1];
+ labels[gidY * img_win_width + gidX] = (product + free_coef >= threshold);
+ }
+}
+
+//---------------------------------------------------------------------
+// Linear SVM based classification
+// 64x128 window, 9 bins and default parameters
+// 256 threads, 252 of them are used
+__kernel void classify_hists_252_kernel(
+ const int cdescr_width, const int cdescr_height, const int cblock_hist_size,
+ const int img_win_width, const int img_block_width,
+ const int win_block_stride_x, const int win_block_stride_y,
+ __global const float * block_hists, __global const float* coefs,
+ float free_coef, float threshold, __global uchar* labels)
+{
+ const int tid = get_local_id(0);
+ const int gidX = get_group_id(0);
+ const int gidY = get_group_id(1);
+
+ __global const float* hist = block_hists + (gidY * win_block_stride_y *
+ img_block_width + gidX * win_block_stride_x) * cblock_hist_size;
+
+ float product = 0.f;
+ if (tid < cdescr_width)
+ {
+ for (int i = 0; i < cdescr_height; i++)
+ product += coefs[i * cdescr_width + tid] *
+ hist[i * img_block_width * cblock_hist_size + tid];
}
+
+ __local float products[NTHREADS];
+
+ products[tid] = product;
+
barrier(CLK_LOCAL_MEM_FENCE);
- if (tid < 2)
- {
-#endif
+
+ if (tid < 128) products[tid] = product = product + products[tid + 128];
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ if (tid < 64) products[tid] = product = product + products[tid + 64];
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ volatile __local float* smem = products;
+#ifdef CPU
+ if(tid<32) smem[tid] = product = product + smem[tid + 32];
+ barrier(CLK_LOCAL_MEM_FENCE);
+ if(tid<16) smem[tid] = product = product + smem[tid + 16];
+ barrier(CLK_LOCAL_MEM_FENCE);
+ if(tid<8) smem[tid] = product = product + smem[tid + 8];
+ barrier(CLK_LOCAL_MEM_FENCE);
+ if(tid<4) smem[tid] = product = product + smem[tid + 4];
+ barrier(CLK_LOCAL_MEM_FENCE);
+ if(tid<2) smem[tid] = product = product + smem[tid + 2];
+ barrier(CLK_LOCAL_MEM_FENCE);
+#else
+ if (tid < 32)
+ {
+ smem[tid] = product = product + smem[tid + 32];
+ smem[tid] = product = product + smem[tid + 16];
+ smem[tid] = product = product + smem[tid + 8];
+ smem[tid] = product = product + smem[tid + 4];
smem[tid] = product = product + smem[tid + 2];
-#ifdef WAVE_SIZE_1
}
- barrier(CLK_LOCAL_MEM_FENCE);
- if (tid < 1)
- {
#endif
- smem[tid] = product = product + smem[tid + 1];
+ if (tid == 0){
+ product = product + smem[tid + 1];
+ labels[gidY * img_win_width + gidX] = (product + free_coef >= threshold);
+ }
+}
+
+//---------------------------------------------------------------------
+// Linear SVM based classification
+// 256 threads
+__kernel void classify_hists_kernel(
+ const int cdescr_size, const int cdescr_width, const int cblock_hist_size,
+ const int img_win_width, const int img_block_width,
+ const int win_block_stride_x, const int win_block_stride_y,
+ __global const float * block_hists, __global const float* coefs,
+ float free_coef, float threshold, __global uchar* labels)
+{
+ const int tid = get_local_id(0);
+ const int gidX = get_group_id(0);
+ const int gidY = get_group_id(1);
+
+ __global const float* hist = block_hists + (gidY * win_block_stride_y *
+ img_block_width + gidX * win_block_stride_x) * cblock_hist_size;
+
+ float product = 0.f;
+ for (int i = tid; i < cdescr_size; i += NTHREADS)
+ {
+ int offset_y = i / cdescr_width;
+ int offset_x = i - offset_y * cdescr_width;
+ product += coefs[i] *
+ hist[offset_y * img_block_width * cblock_hist_size + offset_x];
}
- if (tid == 0)
+ __local float products[NTHREADS];
+
+ products[tid] = product;
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ if (tid < 128) products[tid] = product = product + products[tid + 128];
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ if (tid < 64) products[tid] = product = product + products[tid + 64];
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ volatile __local float* smem = products;
+#ifdef CPU
+ if(tid<32) smem[tid] = product = product + smem[tid + 32];
+ barrier(CLK_LOCAL_MEM_FENCE);
+ if(tid<16) smem[tid] = product = product + smem[tid + 16];
+ barrier(CLK_LOCAL_MEM_FENCE);
+ if(tid<8) smem[tid] = product = product + smem[tid + 8];
+ barrier(CLK_LOCAL_MEM_FENCE);
+ if(tid<4) smem[tid] = product = product + smem[tid + 4];
+ barrier(CLK_LOCAL_MEM_FENCE);
+ if(tid<2) smem[tid] = product = product + smem[tid + 2];
+ barrier(CLK_LOCAL_MEM_FENCE);
+#else
+ if (tid < 32)
+ {
+ smem[tid] = product = product + smem[tid + 32];
+ smem[tid] = product = product + smem[tid + 16];
+ smem[tid] = product = product + smem[tid + 8];
+ smem[tid] = product = product + smem[tid + 4];
+ smem[tid] = product = product + smem[tid + 2];
+ }
+#endif
+ if (tid == 0){
+ smem[tid] = product = product + smem[tid + 1];
labels[gidY * img_win_width + gidX] = (product + free_coef >= threshold);
+ }
}
//----------------------------------------------------------------------------
// Extract descriptors
-__kernel void extract_descrs_by_rows_kernel(const int cblock_hist_size, const int descriptors_quadstep, const int cdescr_size, const int cdescr_width,
- const int img_block_width, const int win_block_stride_x, const int win_block_stride_y,
- __global const float* block_hists, __global float* descriptors)
+__kernel void extract_descrs_by_rows_kernel(
+ const int cblock_hist_size, const int descriptors_quadstep,
+ const int cdescr_size, const int cdescr_width, const int img_block_width,
+ const int win_block_stride_x, const int win_block_stride_y,
+ __global const float* block_hists, __global float* descriptors)
{
int tid = get_local_id(0);
int gidX = get_group_id(0);
int gidY = get_group_id(1);
// Get left top corner of the window in src
- __global const float* hist = block_hists + (gidY * win_block_stride_y * img_block_width + gidX * win_block_stride_x) * cblock_hist_size;
+ __global const float* hist = block_hists + (gidY * win_block_stride_y *
+ img_block_width + gidX * win_block_stride_x) * cblock_hist_size;
// Get left top corner of the window in dst
- __global float* descriptor = descriptors + (gidY * get_num_groups(0) + gidX) * descriptors_quadstep;
+ __global float* descriptor = descriptors +
+ (gidY * get_num_groups(0) + gidX) * descriptors_quadstep;
// Copy elements from src to dst
for (int i = tid; i < cdescr_size; i += NTHREADS)
}
}
-__kernel void extract_descrs_by_cols_kernel(const int cblock_hist_size, const int descriptors_quadstep, const int cdescr_size,
- const int cnblocks_win_x, const int cnblocks_win_y, const int img_block_width, const int win_block_stride_x,
- const int win_block_stride_y, __global const float* block_hists, __global float* descriptors)
+__kernel void extract_descrs_by_cols_kernel(
+ const int cblock_hist_size, const int descriptors_quadstep, const int cdescr_size,
+ const int cnblocks_win_x, const int cnblocks_win_y, const int img_block_width,
+ const int win_block_stride_x, const int win_block_stride_y,
+ __global const float* block_hists, __global float* descriptors)
{
int tid = get_local_id(0);
int gidX = get_group_id(0);
int gidY = get_group_id(1);
// Get left top corner of the window in src
- __global const float* hist = block_hists + (gidY * win_block_stride_y * img_block_width + gidX * win_block_stride_x) * cblock_hist_size;
+ __global const float* hist = block_hists + (gidY * win_block_stride_y *
+ img_block_width + gidX * win_block_stride_x) * cblock_hist_size;
// Get left top corner of the window in dst
- __global float* descriptor = descriptors + (gidY * get_num_groups(0) + gidX) * descriptors_quadstep;
+ __global float* descriptor = descriptors +
+ (gidY * get_num_groups(0) + gidX) * descriptors_quadstep;
// Copy elements from src to dst
for (int i = tid; i < cdescr_size; i += NTHREADS)
int y = block_idx / cnblocks_win_x;
int x = block_idx - y * cnblocks_win_x;
- descriptor[(x * cnblocks_win_y + y) * cblock_hist_size + idx_in_block] = hist[(y * img_block_width + x) * cblock_hist_size + idx_in_block];
+ descriptor[(x * cnblocks_win_y + y) * cblock_hist_size + idx_in_block] =
+ hist[(y * img_block_width + x) * cblock_hist_size + idx_in_block];
}
}
//----------------------------------------------------------------------------
// Gradients computation
-__kernel void compute_gradients_8UC4_kernel(const int height, const int width, const int img_step, const int grad_quadstep, const int qangle_step,
- const __global uchar4 * img, __global float * grad, __global uchar * qangle,
- const float angle_scale, const char correct_gamma, const int cnbins)
+__kernel void compute_gradients_8UC4_kernel(
+ const int height, const int width,
+ const int img_step, const int grad_quadstep, const int qangle_step,
+ const __global uchar4 * img, __global float * grad, __global uchar * qangle,
+ const float angle_scale, const char correct_gamma, const int cnbins)
{
const int x = get_global_id(0);
const int tid = get_local_id(0);
barrier(CLK_LOCAL_MEM_FENCE);
if (x < width)
{
- float3 a = (float3) (sh_row[tid], sh_row[tid + (NTHREADS + 2)], sh_row[tid + 2 * (NTHREADS + 2)]);
- float3 b = (float3) (sh_row[tid + 2], sh_row[tid + 2 + (NTHREADS + 2)], sh_row[tid + 2 + 2 * (NTHREADS + 2)]);
+ float3 a = (float3) (sh_row[tid], sh_row[tid + (NTHREADS + 2)],
+ sh_row[tid + 2 * (NTHREADS + 2)]);
+ float3 b = (float3) (sh_row[tid + 2], sh_row[tid + 2 + (NTHREADS + 2)],
+ sh_row[tid + 2 + 2 * (NTHREADS + 2)]);
float3 dx;
if (correct_gamma == 1)
}
}
-__kernel void compute_gradients_8UC1_kernel(const int height, const int width, const int img_step, const int grad_quadstep, const int qangle_step,
- __global const uchar * img, __global float * grad, __global uchar * qangle,
- const float angle_scale, const char correct_gamma, const int cnbins)
+__kernel void compute_gradients_8UC1_kernel(
+ const int height, const int width,
+ const int img_step, const int grad_quadstep, const int qangle_step,
+ __global const uchar * img, __global float * grad, __global uchar * qangle,
+ const float angle_scale, const char correct_gamma, const int cnbins)
{
const int x = get_global_id(0);
const int tid = get_local_id(0);
grad[ (gidY * grad_quadstep + x) << 1 ] = mag * (1.f - ang);
grad[ ((gidY * grad_quadstep + x) << 1) + 1 ] = mag * ang;
}
-}
-
-//----------------------------------------------------------------------------
-// Resize
-
-__kernel void resize_8UC4_kernel(__global uchar4 * dst, __global const uchar4 * src,
- int dst_offset, int src_offset, int dst_step, int src_step,
- int src_cols, int src_rows, int dst_cols, int dst_rows, float ifx, float ify )
-{
- int dx = get_global_id(0);
- int dy = get_global_id(1);
-
- int sx = (int)floor(dx*ifx+0.5f);
- int sy = (int)floor(dy*ify+0.5f);
- sx = min(sx, src_cols-1);
- sy = min(sy, src_rows-1);
- int dpos = (dst_offset>>2) + dy * (dst_step>>2) + dx;
- int spos = (src_offset>>2) + sy * (src_step>>2) + sx;
-
- if(dx<dst_cols && dy<dst_rows)
- dst[dpos] = src[spos];
-}
-
-__kernel void resize_8UC1_kernel(__global uchar * dst, __global const uchar * src,
- int dst_offset, int src_offset, int dst_step, int src_step,
- int src_cols, int src_rows, int dst_cols, int dst_rows, float ifx, float ify )
-{
- int dx = get_global_id(0);
- int dy = get_global_id(1);
-
- int sx = (int)floor(dx*ifx+0.5f);
- int sy = (int)floor(dy*ify+0.5f);
- sx = min(sx, src_cols-1);
- sy = min(sy, src_rows-1);
- int dpos = dst_offset + dy * dst_step + dx;
- int spos = src_offset + sy * src_step + sx;
-
- if(dx<dst_cols && dy<dst_rows)
- dst[dpos] = src[spos];
}
\ No newline at end of file