size_t globalThreads[3] = { width, height, 1 };
char correctGamma = (correct_gamma) ? 1 : 0;
int grad_quadstep = (int)grad.step >> 3;
- int qangle_step_shift = 0;
- int qangle_step = (int)qangle.step >> (1 + qangle_step_shift);
+ int qangle_elem_size = CV_ELEM_SIZE1(qangle.type());
+ int qangle_step = (int)qangle.step / (2 * qangle_elem_size);
int idx = 0;
idx = k.set(idx, height);
int img_block_height = (height - CELLS_PER_BLOCK_Y * CELL_HEIGHT + block_stride_y)/block_stride_y;
int blocks_total = img_block_width * img_block_height;
- int qangle_step_shift = 0;
+ int qangle_elem_size = CV_ELEM_SIZE1(qangle.type());
int grad_quadstep = (int)grad.step >> 2;
- int qangle_step = (int)qangle.step >> qangle_step_shift;
+ int qangle_step = (int)qangle.step / qangle_elem_size;
int blocks_in_group = 4;
size_t localThreads[3] = { blocks_in_group * 24, 2, 1 };
static bool ocl_compute(InputArray _img, Size win_stride, std::vector<float>& _descriptors, int descr_format, Size blockSize,
Size cellSize, int nbins, Size blockStride, Size winSize, float sigma, bool gammaCorrection, double L2HysThreshold)
{
- Size imgSize = _img.size();
+ Size imgSize = _img.size();
Size effect_size = imgSize;
UMat grad(imgSize, CV_32FC2);
- UMat qangle(imgSize, CV_8UC2);
+ int qangle_type = ocl::Device::getDefault().isIntel() ? CV_32SC2 : CV_8UC2;
+ UMat qangle(imgSize, qangle_type);
const size_t block_hist_size = getBlockHistogramSize(blockSize, cellSize, nbins);
const Size blocks_per_img = numPartsWithin(imgSize, blockSize, blockStride);
Size imgSize = img.size();
Size effect_size = imgSize;
UMat grad(imgSize, CV_32FC2);
- UMat qangle(imgSize, CV_8UC2);
+ int qangle_type = ocl::Device::getDefault().isIntel() ? CV_32SC2 : CV_8UC2;
+ UMat qangle(imgSize, qangle_type);
const size_t block_hist_size = getBlockHistogramSize(blockSize, cellSize, nbins);
const Size blocks_per_img = numPartsWithin(imgSize, blockSize, blockStride);
#define NTHREADS 256
#define CV_PI_F 3.1415926535897932384626433832795f // pi written as a single-precision (f-suffixed) literal
+#ifdef INTEL_DEVICE // chooses the element types the kernels in this file use for their qangle buffers
+#define QANGLE_TYPE int // NOTE(review): INTEL_DEVICE is presumably supplied as a build option by the host code; confirm
+#define QANGLE_TYPE2 int2 // two-component counterpart of QANGLE_TYPE
+#else
+#define QANGLE_TYPE uchar // fallback when INTEL_DEVICE is not defined
+#define QANGLE_TYPE2 uchar2 // two-component counterpart of QANGLE_TYPE
+#endif
+
//----------------------------------------------------------------------------
// Histogram computation
// 12 threads for a cell, 12x4 threads per block
const int cnbins, const int cblock_hist_size, const int img_block_width,
const int blocks_in_group, const int blocks_total,
const int grad_quadstep, const int qangle_step,
- __global const float* grad, __global const uchar* qangle,
+ __global const float* grad, __global const QANGLE_TYPE* qangle,
__global const float* gauss_w_lut,
__global float* block_hists, __local float* smem)
{
__global const float* grad_ptr = (gid < blocks_total) ?
grad + offset_y * grad_quadstep + (offset_x << 1) : grad;
- __global const uchar* qangle_ptr = (gid < blocks_total) ?
+ __global const QANGLE_TYPE* qangle_ptr = (gid < blocks_total) ?
qangle + offset_y * qangle_step + (offset_x << 1) : qangle;
__local float* hist = hists + 12 * (cell_y * CELLS_PER_BLOCK_Y + cell_x) +
for (int dist_y = dist_y_begin; dist_y < dist_y_begin + 12; ++dist_y)
{
float2 vote = (float2) (grad_ptr[0], grad_ptr[1]);
- uchar2 bin = (uchar2) (qangle_ptr[0], qangle_ptr[1]);
+ QANGLE_TYPE2 bin = (QANGLE_TYPE2) (qangle_ptr[0], qangle_ptr[1]);
grad_ptr += grad_quadstep;
qangle_ptr += qangle_step;
__kernel void compute_gradients_8UC4_kernel(
const int height, const int width,
const int img_step, const int grad_quadstep, const int qangle_step,
- const __global uchar4 * img, __global float * grad, __global uchar * qangle,
+ const __global uchar4 * img, __global float * grad, __global QANGLE_TYPE * qangle,
const float angle_scale, const char correct_gamma, const int cnbins)
{
const int x = get_global_id(0);
__kernel void compute_gradients_8UC1_kernel(
const int height, const int width,
const int img_step, const int grad_quadstep, const int qangle_step,
- __global const uchar * img, __global float * grad, __global uchar * qangle,
+ __global const uchar * img, __global float * grad, __global QANGLE_TYPE * qangle,
const float angle_scale, const char correct_gamma, const int cnbins)
{
const int x = get_global_id(0);