modules/ocl/src/opencl/objdetect_hog.cl

   1 /*M///////////////////////////////////////////////////////////////////////////////////////
   2 //
   3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
   4 //
   5 //  By downloading, copying, installing or using the software you agree to this license.
   6 //  If you do not agree to this license, do not download, install,
   7 //  copy or use the software.
   8 //
   9 //
  10 //                           License Agreement
  11 //                For Open Source Computer Vision Library
  12 //
  13 // Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
  14 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
  15 // Third party copyrights are property of their respective owners.
  16 //
  17 // @Authors
  18 //    Wenju He, wenju@multicorewareinc.com
  19 //
  20 // Redistribution and use in source and binary forms, with or without modification,
  21 // are permitted provided that the following conditions are met:
  22 //
  23 //   * Redistribution's of source code must retain the above copyright notice,
  24 //     this list of conditions and the following disclaimer.
  25 //
  26 //   * Redistribution's in binary form must reproduce the above copyright notice,
  27 //     this list of conditions and the following disclaimer in the documentation
  28 //     and/or other materials provided with the distribution.
  29 //
  30 //   * The name of the copyright holders may not be used to endorse or promote products
  31 //     derived from this software without specific prior written permission.
  32 //
  33 // This software is provided by the copyright holders and contributors as is and
  34 // any express or implied warranties, including, but not limited to, the implied
  35 // warranties of merchantability and fitness for a particular purpose are disclaimed.
  36 // In no event shall the Intel Corporation or contributors be liable for any direct,
  37 // indirect, incidental, special, exemplary, or consequential damages
  38 // (including, but not limited to, procurement of substitute goods or services;
  39 // loss of use, data, or profits; or business interruption) however caused
  40 // and on any theory of liability, whether in contract, strict liability,
  41 // or tort (including negligence or otherwise) arising in any way out of
  42 // the use of this software, even if advised of the possibility of such damage.
  43 //
  44 //M*/
  45
  46 #define CELL_WIDTH 8
  47 #define CELL_HEIGHT 8
  48 #define CELLS_PER_BLOCK_X 2
  49 #define CELLS_PER_BLOCK_Y 2
  50 #define NTHREADS 256
  51 #define CV_PI_F 3.1415926535897932384626433832795f
  52
  53 //----------------------------------------------------------------------------
  54 // Histogram computation
  55 // 12 threads for a cell, 12x4 threads per block
  56 // Use pre-computed gaussian and interp_weight lookup tables
  57 __kernel void compute_hists_lut_kernel(
  58     const int cblock_stride_x, const int cblock_stride_y,
  59     const int cnbins, const int cblock_hist_size, const int img_block_width,
  60     const int blocks_in_group, const int blocks_total,
  61     const int grad_quadstep, const int qangle_step,
  62     __global const float* grad, __global const uchar* qangle,
  63     __global const float* gauss_w_lut,
  64     __global float* block_hists, __local float* smem)
  65 {
  66     const int lx = get_local_id(0);
  67     const int lp = lx / 24; /* local group id */
  68     const int gid = get_group_id(0) * blocks_in_group + lp;/* global group id */
  69     const int gidY = gid / img_block_width;
  70     const int gidX = gid - gidY * img_block_width;
  71
  72     const int lidX = lx - lp * 24;
  73     const int lidY = get_local_id(1);
  74
  75     const int cell_x = lidX / 12;
  76     const int cell_y = lidY;
  77     const int cell_thread_x = lidX - cell_x * 12;
  78
  79     __local float* hists = smem + lp * cnbins * (CELLS_PER_BLOCK_X *
  80         CELLS_PER_BLOCK_Y * 12 + CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y);
  81     __local float* final_hist = hists + cnbins *
  82         (CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y * 12);
  83
  84     const int offset_x = gidX * cblock_stride_x + (cell_x << 2) + cell_thread_x;
  85     const int offset_y = gidY * cblock_stride_y + (cell_y << 2);
  86
  87     __global const float* grad_ptr = (gid < blocks_total) ?
  88         grad + offset_y * grad_quadstep + (offset_x << 1) : grad;
  89     __global const uchar* qangle_ptr = (gid < blocks_total) ?
  90         qangle + offset_y * qangle_step + (offset_x << 1) : qangle;
  91
  92     __local float* hist = hists + 12 * (cell_y * CELLS_PER_BLOCK_Y + cell_x) +
  93         cell_thread_x;
  94     for (int bin_id = 0; bin_id < cnbins; ++bin_id)
  95         hist[bin_id * 48] = 0.f;
  96
  97     const int dist_x = -4 + cell_thread_x - 4 * cell_x;
  98     const int dist_center_x = dist_x - 4 * (1 - 2 * cell_x);
  99
 100     const int dist_y_begin = -4 - 4 * lidY;
 101     for (int dist_y = dist_y_begin; dist_y < dist_y_begin + 12; ++dist_y)
 102     {
 103         float2 vote = (float2) (grad_ptr[0], grad_ptr[1]);
 104         uchar2 bin = (uchar2) (qangle_ptr[0], qangle_ptr[1]);
 105
 106         grad_ptr += grad_quadstep;
 107         qangle_ptr += qangle_step;
 108
 109         int dist_center_y = dist_y - 4 * (1 - 2 * cell_y);
 110
 111         int idx = (dist_center_y + 8) * 16 + (dist_center_x + 8);
 112         float gaussian = gauss_w_lut[idx];
 113         idx = (dist_y + 8) * 16 + (dist_x + 8);
 114         float interp_weight = gauss_w_lut[256+idx];
 115
 116         hist[bin.x * 48] += gaussian * interp_weight * vote.x;
 117         hist[bin.y * 48] += gaussian * interp_weight * vote.y;
 118     }
 119     barrier(CLK_LOCAL_MEM_FENCE);
 120
 121     volatile __local float* hist_ = hist;
 122     for (int bin_id = 0; bin_id < cnbins; ++bin_id, hist_ += 48)
 123     {
 124         if (cell_thread_x < 6)
 125             hist_[0] += hist_[6];
 126         barrier(CLK_LOCAL_MEM_FENCE);
 127         if (cell_thread_x < 3)
 128             hist_[0] += hist_[3];
 129 #ifdef CPU
 130         barrier(CLK_LOCAL_MEM_FENCE);
 131 #endif
 132         if (cell_thread_x == 0)
 133             final_hist[(cell_x * 2 + cell_y) * cnbins + bin_id] =
 134                 hist_[0] + hist_[1] + hist_[2];
 135     }
 136 #ifdef CPU
 137     barrier(CLK_LOCAL_MEM_FENCE);
 138 #endif
 139
 140     int tid = (cell_y * CELLS_PER_BLOCK_Y + cell_x) * 12 + cell_thread_x;
 141     if ((tid < cblock_hist_size) && (gid < blocks_total))
 142     {
 143         __global float* block_hist = block_hists +
 144             (gidY * img_block_width + gidX) * cblock_hist_size;
 145         block_hist[tid] = final_hist[tid];
 146     }
 147 }
 148
 149 //-------------------------------------------------------------
 150 //  Normalization of histograms via L2Hys_norm
 151 //  optimized for the case of 9 bins
 152 __kernel void normalize_hists_36_kernel(__global float* block_hists,
 153                                         const float threshold, __local float *squares)
 154 {
 155     const int tid = get_local_id(0);
 156     const int gid = get_global_id(0);
 157     const int bid = tid / 36;      /* block-hist id, (0 - 6) */
 158     const int boffset = bid * 36;  /* block-hist offset in the work-group */
 159     const int hid = tid - boffset; /* histogram bin id, (0 - 35) */
 160
 161     float elem = block_hists[gid];
 162     squares[tid] = elem * elem;
 163     barrier(CLK_LOCAL_MEM_FENCE);
 164
 165     __local float* smem = squares + boffset;
 166     float sum = smem[hid];
 167     if (hid < 18)
 168         smem[hid] = sum = sum + smem[hid + 18];
 169     barrier(CLK_LOCAL_MEM_FENCE);
 170     if (hid < 9)
 171         smem[hid] = sum = sum + smem[hid + 9];
 172     barrier(CLK_LOCAL_MEM_FENCE);
 173     if (hid < 4)
 174         smem[hid] = sum + smem[hid + 4];
 175     barrier(CLK_LOCAL_MEM_FENCE);
 176     sum = smem[0] + smem[1] + smem[2] + smem[3] + smem[8];
 177
 178     elem = elem / (sqrt(sum) + 3.6f);
 179     elem = min(elem, threshold);
 180
 181     barrier(CLK_LOCAL_MEM_FENCE);
 182     squares[tid] = elem * elem;
 183     barrier(CLK_LOCAL_MEM_FENCE);
 184
 185     sum = smem[hid];
 186     if (hid < 18)
 187       smem[hid] = sum = sum + smem[hid + 18];
 188     barrier(CLK_LOCAL_MEM_FENCE);
 189     if (hid < 9)
 190         smem[hid] = sum = sum + smem[hid + 9];
 191     barrier(CLK_LOCAL_MEM_FENCE);
 192     if (hid < 4)
 193         smem[hid] = sum + smem[hid + 4];
 194     barrier(CLK_LOCAL_MEM_FENCE);
 195     sum = smem[0] + smem[1] + smem[2] + smem[3] + smem[8];
 196
 197     block_hists[gid] = elem / (sqrt(sum) + 1e-3f);
 198 }
 199
 200 //-------------------------------------------------------------
 201 //  Normalization of histograms via L2Hys_norm
 202 //
 203 float reduce_smem(volatile __local float* smem, int size)
 204 {
 205     unsigned int tid = get_local_id(0);
 206     float sum = smem[tid];
 207
 208     if (size >= 512) { if (tid < 256) smem[tid] = sum = sum + smem[tid + 256];
 209         barrier(CLK_LOCAL_MEM_FENCE); }
 210     if (size >= 256) { if (tid < 128) smem[tid] = sum = sum + smem[tid + 128];
 211         barrier(CLK_LOCAL_MEM_FENCE); }
 212     if (size >= 128) { if (tid < 64) smem[tid] = sum = sum + smem[tid + 64];
 213         barrier(CLK_LOCAL_MEM_FENCE); }
 214 #ifdef CPU
 215     if (size >= 64) { if (tid < 32) smem[tid] = sum = sum + smem[tid + 32];
 216         barrier(CLK_LOCAL_MEM_FENCE); }
 217     if (size >= 32) { if (tid < 16) smem[tid] = sum = sum + smem[tid + 16];
 218         barrier(CLK_LOCAL_MEM_FENCE); }
 219     if (size >= 16) { if (tid < 8) smem[tid] = sum = sum + smem[tid + 8];
 220         barrier(CLK_LOCAL_MEM_FENCE); }
 221     if (size >= 8) { if (tid < 4) smem[tid] = sum = sum + smem[tid + 4];
 222         barrier(CLK_LOCAL_MEM_FENCE); }
 223     if (size >= 4) { if (tid < 2) smem[tid] = sum = sum + smem[tid + 2];
 224         barrier(CLK_LOCAL_MEM_FENCE); }
 225     if (size >= 2) { if (tid < 1) smem[tid] = sum = sum + smem[tid + 1];
 226         barrier(CLK_LOCAL_MEM_FENCE); }
 227 #else
 228     if (tid < 32)
 229     {
 230         if (size >= 64) smem[tid] = sum = sum + smem[tid + 32];
 231 #if WAVE_SIZE < 32
 232     } barrier(CLK_LOCAL_MEM_FENCE);
 233     if (tid < 16) {
 234 #endif
 235         if (size >= 32) smem[tid] = sum = sum + smem[tid + 16];
 236         if (size >= 16) smem[tid] = sum = sum + smem[tid + 8];
 237         if (size >= 8) smem[tid] = sum = sum + smem[tid + 4];
 238         if (size >= 4) smem[tid] = sum = sum + smem[tid + 2];
 239         if (size >= 2) smem[tid] = sum = sum + smem[tid + 1];
 240     }
 241 #endif
 242
 243     return sum;
 244 }
 245
 246 __kernel void normalize_hists_kernel(
 247     const int nthreads, const int block_hist_size, const int img_block_width,
 248     __global float* block_hists, const float threshold, __local float *squares)
 249 {
 250     const int tid = get_local_id(0);
 251     const int gidX = get_group_id(0);
 252     const int gidY = get_group_id(1);
 253
 254     __global float* hist = block_hists + (gidY * img_block_width + gidX) *
 255         block_hist_size + tid;
 256
 257     float elem = 0.f;
 258     if (tid < block_hist_size)
 259         elem = hist[0];
 260
 261     squares[tid] = elem * elem;
 262
 263     barrier(CLK_LOCAL_MEM_FENCE);
 264     float sum = reduce_smem(squares, nthreads);
 265
 266     float scale = 1.0f / (sqrt(sum) + 0.1f * block_hist_size);
 267     elem = min(elem * scale, threshold);
 268
 269     barrier(CLK_LOCAL_MEM_FENCE);
 270     squares[tid] = elem * elem;
 271
 272     barrier(CLK_LOCAL_MEM_FENCE);
 273     sum = reduce_smem(squares, nthreads);
 274     scale = 1.0f / (sqrt(sum) + 1e-3f);
 275
 276     if (tid < block_hist_size)
 277         hist[0] = elem * scale;
 278 }
 279
 280 //---------------------------------------------------------------------
 281 //  Linear SVM based classification
 282 //  48x96 window, 9 bins and default parameters
 283 //  180 threads, each thread corresponds to a bin in a row
 284 __kernel void classify_hists_180_kernel(
 285     const int cdescr_width, const int cdescr_height, const int cblock_hist_size,
 286     const int img_win_width, const int img_block_width,
 287     const int win_block_stride_x, const int win_block_stride_y,
 288     __global const float * block_hists, __global const float* coefs,
 289     float free_coef, float threshold, __global uchar* labels)
 290 {
 291     const int tid = get_local_id(0);
 292     const int gidX = get_group_id(0);
 293     const int gidY = get_group_id(1);
 294
 295     __global const float* hist = block_hists + (gidY * win_block_stride_y *
 296         img_block_width + gidX * win_block_stride_x) * cblock_hist_size;
 297
 298     float product = 0.f;
 299
 300     for (int i = 0; i < cdescr_height; i++)
 301     {
 302         product += coefs[i * cdescr_width + tid] *
 303             hist[i * img_block_width * cblock_hist_size + tid];
 304     }
 305
 306     __local float products[180];
 307
 308     products[tid] = product;
 309
 310     barrier(CLK_LOCAL_MEM_FENCE);
 311
 312     if (tid < 90) products[tid] = product = product + products[tid + 90];
 313     barrier(CLK_LOCAL_MEM_FENCE);
 314
 315     if (tid < 45) products[tid] = product = product + products[tid + 45];
 316     barrier(CLK_LOCAL_MEM_FENCE);
 317
 318     volatile __local float* smem = products;
 319 #ifdef CPU
 320     if (tid < 13) smem[tid] = product = product + smem[tid + 32];
 321         barrier(CLK_LOCAL_MEM_FENCE);
 322     if (tid < 16) smem[tid] = product = product + smem[tid + 16];
 323         barrier(CLK_LOCAL_MEM_FENCE);
 324         if(tid<8) smem[tid] = product = product + smem[tid + 8];
 325         barrier(CLK_LOCAL_MEM_FENCE);
 326         if(tid<4) smem[tid] = product = product + smem[tid + 4];
 327         barrier(CLK_LOCAL_MEM_FENCE);
 328         if(tid<2) smem[tid] = product = product + smem[tid + 2];
 329         barrier(CLK_LOCAL_MEM_FENCE);
 330 #else
 331     if (tid < 13)
 332     {
 333         smem[tid] = product = product + smem[tid + 32];
 334     }
 335 #if WAVE_SIZE < 32
 336     barrier(CLK_LOCAL_MEM_FENCE);
 337 #endif
 338     if (tid < 16)
 339     {
 340         smem[tid] = product = product + smem[tid + 16];
 341         smem[tid] = product = product + smem[tid + 8];
 342         smem[tid] = product = product + smem[tid + 4];
 343         smem[tid] = product = product + smem[tid + 2];
 344     }
 345 #endif
 346
 347     if (tid == 0){
 348                 product = product + smem[tid + 1];
 349         labels[gidY * img_win_width + gidX] = (product + free_coef >= threshold);
 350         }
 351 }
 352
 353 //---------------------------------------------------------------------
 354 //  Linear SVM based classification
 355 //  64x128 window, 9 bins and default parameters
 356 //  256 threads, 252 of them are used
 357 __kernel void classify_hists_252_kernel(
 358     const int cdescr_width, const int cdescr_height, const int cblock_hist_size,
 359     const int img_win_width, const int img_block_width,
 360     const int win_block_stride_x, const int win_block_stride_y,
 361     __global const float * block_hists, __global const float* coefs,
 362     float free_coef, float threshold, __global uchar* labels)
 363 {
 364     const int tid = get_local_id(0);
 365     const int gidX = get_group_id(0);
 366     const int gidY = get_group_id(1);
 367
 368     __global const float* hist = block_hists + (gidY * win_block_stride_y *
 369         img_block_width + gidX * win_block_stride_x) * cblock_hist_size;
 370
 371     float product = 0.f;
 372     if (tid < cdescr_width)
 373     {
 374         for (int i = 0; i < cdescr_height; i++)
 375             product += coefs[i * cdescr_width + tid] *
 376                 hist[i * img_block_width * cblock_hist_size + tid];
 377     }
 378
 379     __local float products[NTHREADS];
 380
 381     products[tid] = product;
 382
 383     barrier(CLK_LOCAL_MEM_FENCE);
 384
 385     if (tid < 128) products[tid] = product = product + products[tid + 128];
 386     barrier(CLK_LOCAL_MEM_FENCE);
 387
 388     if (tid < 64) products[tid] = product = product + products[tid + 64];
 389     barrier(CLK_LOCAL_MEM_FENCE);
 390
 391         volatile __local float* smem = products;
 392 #ifdef CPU
 393         if(tid<32) smem[tid] = product = product + smem[tid + 32];
 394         barrier(CLK_LOCAL_MEM_FENCE);
 395         if(tid<16) smem[tid] = product = product + smem[tid + 16];
 396         barrier(CLK_LOCAL_MEM_FENCE);
 397         if(tid<8) smem[tid] = product = product + smem[tid + 8];
 398         barrier(CLK_LOCAL_MEM_FENCE);
 399         if(tid<4) smem[tid] = product = product + smem[tid + 4];
 400         barrier(CLK_LOCAL_MEM_FENCE);
 401         if(tid<2) smem[tid] = product = product + smem[tid + 2];
 402         barrier(CLK_LOCAL_MEM_FENCE);
 403 #else
 404     if (tid < 32)
 405     {
 406         smem[tid] = product = product + smem[tid + 32];
 407 #if WAVE_SIZE < 32
 408     } barrier(CLK_LOCAL_MEM_FENCE);
 409     if (tid < 16) {
 410 #endif
 411         smem[tid] = product = product + smem[tid + 16];
 412         smem[tid] = product = product + smem[tid + 8];
 413         smem[tid] = product = product + smem[tid + 4];
 414         smem[tid] = product = product + smem[tid + 2];
 415     }
 416 #endif
 417     if (tid == 0){
 418                 product = product + smem[tid + 1];
 419         labels[gidY * img_win_width + gidX] = (product + free_coef >= threshold);
 420         }
 421 }
 422
 423 //---------------------------------------------------------------------
 424 //  Linear SVM based classification
 425 //  256 threads
 426 __kernel void classify_hists_kernel(
 427     const int cdescr_size, const int cdescr_width, const int cblock_hist_size,
 428     const int img_win_width, const int img_block_width,
 429     const int win_block_stride_x, const int win_block_stride_y,
 430     __global const float * block_hists, __global const float* coefs,
 431     float free_coef, float threshold, __global uchar* labels)
 432 {
 433     const int tid = get_local_id(0);
 434     const int gidX = get_group_id(0);
 435     const int gidY = get_group_id(1);
 436
 437     __global const float* hist = block_hists + (gidY * win_block_stride_y *
 438         img_block_width + gidX * win_block_stride_x) * cblock_hist_size;
 439
 440     float product = 0.f;
 441     for (int i = tid; i < cdescr_size; i += NTHREADS)
 442     {
 443         int offset_y = i / cdescr_width;
 444         int offset_x = i - offset_y * cdescr_width;
 445         product += coefs[i] *
 446             hist[offset_y * img_block_width * cblock_hist_size + offset_x];
 447     }
 448
 449     __local float products[NTHREADS];
 450
 451     products[tid] = product;
 452
 453     barrier(CLK_LOCAL_MEM_FENCE);
 454
 455     if (tid < 128) products[tid] = product = product + products[tid + 128];
 456     barrier(CLK_LOCAL_MEM_FENCE);
 457
 458     if (tid < 64) products[tid] = product = product + products[tid + 64];
 459     barrier(CLK_LOCAL_MEM_FENCE);
 460
 461         volatile __local float* smem = products;
 462 #ifdef CPU
 463         if(tid<32) smem[tid] = product = product + smem[tid + 32];
 464         barrier(CLK_LOCAL_MEM_FENCE);
 465         if(tid<16) smem[tid] = product = product + smem[tid + 16];
 466         barrier(CLK_LOCAL_MEM_FENCE);
 467         if(tid<8) smem[tid] = product = product + smem[tid + 8];
 468         barrier(CLK_LOCAL_MEM_FENCE);
 469         if(tid<4) smem[tid] = product = product + smem[tid + 4];
 470         barrier(CLK_LOCAL_MEM_FENCE);
 471         if(tid<2) smem[tid] = product = product + smem[tid + 2];
 472         barrier(CLK_LOCAL_MEM_FENCE);
 473 #else
 474     if (tid < 32)
 475     {
 476         smem[tid] = product = product + smem[tid + 32];
 477 #if WAVE_SIZE < 32
 478     } barrier(CLK_LOCAL_MEM_FENCE);
 479     if (tid < 16) {
 480 #endif
 481         smem[tid] = product = product + smem[tid + 16];
 482         smem[tid] = product = product + smem[tid + 8];
 483         smem[tid] = product = product + smem[tid + 4];
 484         smem[tid] = product = product + smem[tid + 2];
 485     }
 486 #endif
 487     if (tid == 0){
 488                 smem[tid] = product = product + smem[tid + 1];
 489         labels[gidY * img_win_width + gidX] = (product + free_coef >= threshold);
 490         }
 491 }
 492
 493 //----------------------------------------------------------------------------
 494 // Extract descriptors
 495
 496 __kernel void extract_descrs_by_rows_kernel(
 497     const int cblock_hist_size, const int descriptors_quadstep,
 498     const int cdescr_size, const int cdescr_width, const int img_block_width,
 499     const int win_block_stride_x, const int win_block_stride_y,
 500     __global const float* block_hists, __global float* descriptors)
 501 {
 502     int tid = get_local_id(0);
 503     int gidX = get_group_id(0);
 504     int gidY = get_group_id(1);
 505
 506     // Get left top corner of the window in src
 507     __global const float* hist = block_hists + (gidY * win_block_stride_y *
 508         img_block_width + gidX * win_block_stride_x) * cblock_hist_size;
 509
 510     // Get left top corner of the window in dst
 511     __global float* descriptor = descriptors +
 512         (gidY * get_num_groups(0) + gidX) * descriptors_quadstep;
 513
 514     // Copy elements from src to dst
 515     for (int i = tid; i < cdescr_size; i += NTHREADS)
 516     {
 517         int offset_y = i / cdescr_width;
 518         int offset_x = i - offset_y * cdescr_width;
 519         descriptor[i] = hist[offset_y * img_block_width * cblock_hist_size + offset_x];
 520     }
 521 }
 522
 523 __kernel void extract_descrs_by_cols_kernel(
 524     const int cblock_hist_size, const int descriptors_quadstep, const int cdescr_size,
 525     const int cnblocks_win_x, const int cnblocks_win_y, const int img_block_width,
 526     const int win_block_stride_x, const int win_block_stride_y,
 527     __global const float* block_hists, __global float* descriptors)
 528 {
 529     int tid = get_local_id(0);
 530     int gidX = get_group_id(0);
 531     int gidY = get_group_id(1);
 532
 533     // Get left top corner of the window in src
 534     __global const float* hist = block_hists +  (gidY * win_block_stride_y *
 535         img_block_width + gidX * win_block_stride_x) * cblock_hist_size;
 536
 537     // Get left top corner of the window in dst
 538     __global float* descriptor = descriptors +
 539         (gidY * get_num_groups(0) + gidX) * descriptors_quadstep;
 540
 541     // Copy elements from src to dst
 542     for (int i = tid; i < cdescr_size; i += NTHREADS)
 543     {
 544         int block_idx = i / cblock_hist_size;
 545         int idx_in_block = i - block_idx * cblock_hist_size;
 546
 547         int y = block_idx / cnblocks_win_x;
 548         int x = block_idx - y * cnblocks_win_x;
 549
 550         descriptor[(x * cnblocks_win_y + y) * cblock_hist_size + idx_in_block] =
 551             hist[(y * img_block_width  + x) * cblock_hist_size + idx_in_block];
 552     }
 553 }
 554
 555 //----------------------------------------------------------------------------
 556 // Gradients computation
 557
 558 __kernel void compute_gradients_8UC4_kernel(
 559     const int height, const int width,
 560     const int img_step, const int grad_quadstep, const int qangle_step,
 561     const __global uchar4 * img, __global float * grad, __global uchar * qangle,
 562     const float angle_scale, const char correct_gamma, const int cnbins)
 563 {
 564     const int x = get_global_id(0);
 565     const int tid = get_local_id(0);
 566     const int gSizeX = get_local_size(0);
 567     const int gidX = get_group_id(0);
 568     const int gidY = get_group_id(1);
 569
 570     __global const uchar4* row = img + gidY * img_step;
 571
 572     __local float sh_row[(NTHREADS + 2) * 3];
 573
 574     uchar4 val;
 575     if (x < width)
 576         val = row[x];
 577     else
 578         val = row[width - 2];
 579
 580     sh_row[tid + 1] = val.x;
 581     sh_row[tid + 1 + (NTHREADS + 2)] = val.y;
 582     sh_row[tid + 1 + 2 * (NTHREADS + 2)] = val.z;
 583
 584     if (tid == 0)
 585     {
 586         val = row[max(x - 1, 1)];
 587         sh_row[0] = val.x;
 588         sh_row[(NTHREADS + 2)] = val.y;
 589         sh_row[2 * (NTHREADS + 2)] = val.z;
 590     }
 591
 592     if (tid == gSizeX - 1)
 593     {
 594         val = row[min(x + 1, width - 2)];
 595         sh_row[gSizeX + 1] = val.x;
 596         sh_row[gSizeX + 1 + (NTHREADS + 2)] = val.y;
 597         sh_row[gSizeX + 1 + 2 * (NTHREADS + 2)] = val.z;
 598     }
 599
 600     barrier(CLK_LOCAL_MEM_FENCE);
 601     if (x < width)
 602     {
 603         float3 a = (float3) (sh_row[tid], sh_row[tid + (NTHREADS + 2)],
 604             sh_row[tid + 2 * (NTHREADS + 2)]);
 605         float3 b = (float3) (sh_row[tid + 2], sh_row[tid + 2 + (NTHREADS + 2)],
 606             sh_row[tid + 2 + 2 * (NTHREADS + 2)]);
 607
 608         float3 dx;
 609         if (correct_gamma == 1)
 610             dx = sqrt(b) - sqrt(a);
 611         else
 612             dx = b - a;
 613
 614         float3 dy = (float3) 0.f;
 615
 616         if (gidY > 0 && gidY < height - 1)
 617         {
 618             a = convert_float3(img[(gidY - 1) * img_step + x].xyz);
 619             b = convert_float3(img[(gidY + 1) * img_step + x].xyz);
 620
 621             if (correct_gamma == 1)
 622                 dy = sqrt(b) - sqrt(a);
 623             else
 624                 dy = b - a;
 625         }
 626
 627         float best_dx = dx.x;
 628         float best_dy = dy.x;
 629
 630         float mag0 = dx.x * dx.x + dy.x * dy.x;
 631         float mag1 = dx.y * dx.y + dy.y * dy.y;
 632         if (mag0 < mag1)
 633         {
 634             best_dx = dx.y;
 635             best_dy = dy.y;
 636             mag0 = mag1;
 637         }
 638
 639         mag1 = dx.z * dx.z + dy.z * dy.z;
 640         if (mag0 < mag1)
 641         {
 642             best_dx = dx.z;
 643             best_dy = dy.z;
 644             mag0 = mag1;
 645         }
 646
 647         mag0 = sqrt(mag0);
 648
 649         float ang = (atan2(best_dy, best_dx) + CV_PI_F) * angle_scale - 0.5f;
 650         int hidx = (int)floor(ang);
 651         ang -= hidx;
 652         hidx = (hidx + cnbins) % cnbins;
 653
 654         qangle[(gidY * qangle_step + x) << 1] = hidx;
 655         qangle[((gidY * qangle_step + x) << 1) + 1] = (hidx + 1) % cnbins;
 656         grad[(gidY * grad_quadstep + x) << 1] = mag0 * (1.f - ang);
 657         grad[((gidY * grad_quadstep + x) << 1) + 1] = mag0 * ang;
 658     }
 659 }
 660
 661 __kernel void compute_gradients_8UC1_kernel(
 662     const int height, const int width,
 663     const int img_step, const int grad_quadstep, const int qangle_step,
 664     __global const uchar * img, __global float * grad, __global uchar * qangle,
 665     const float angle_scale, const char correct_gamma, const int cnbins)
 666 {
 667     const int x = get_global_id(0);
 668     const int tid = get_local_id(0);
 669     const int gSizeX = get_local_size(0);
 670     const int gidX = get_group_id(0);
 671     const int gidY = get_group_id(1);
 672
 673     __global const uchar* row = img + gidY * img_step;
 674
 675     __local float sh_row[NTHREADS + 2];
 676
 677     if (x < width)
 678         sh_row[tid + 1] = row[x];
 679     else
 680         sh_row[tid + 1] = row[width - 2];
 681
 682     if (tid == 0)
 683         sh_row[0] = row[max(x - 1, 1)];
 684
 685     if (tid == gSizeX - 1)
 686         sh_row[gSizeX + 1] = row[min(x + 1, width - 2)];
 687
 688     barrier(CLK_LOCAL_MEM_FENCE);
 689     if (x < width)
 690     {
 691         float dx;
 692
 693         if (correct_gamma == 1)
 694             dx = sqrt(sh_row[tid + 2]) - sqrt(sh_row[tid]);
 695         else
 696             dx = sh_row[tid + 2] - sh_row[tid];
 697
 698         float dy = 0.f;
 699         if (gidY > 0 && gidY < height - 1)
 700         {
 701             float a = (float) img[ (gidY + 1) * img_step + x ];
 702             float b = (float) img[ (gidY - 1) * img_step + x ];
 703             if (correct_gamma == 1)
 704                 dy = sqrt(a) - sqrt(b);
 705             else
 706                 dy = a - b;
 707         }
 708         float mag = sqrt(dx * dx + dy * dy);
 709
 710         float ang = (atan2(dy, dx) + CV_PI_F) * angle_scale - 0.5f;
 711         int hidx = (int)floor(ang);
 712         ang -= hidx;
 713         hidx = (hidx + cnbins) % cnbins;
 714
 715         qangle[ (gidY * qangle_step + x) << 1 ]     = hidx;
 716         qangle[ ((gidY * qangle_step + x) << 1) + 1 ] = (hidx + 1) % cnbins;
 717         grad[ (gidY * grad_quadstep + x) << 1 ]       = mag * (1.f - ang);
 718         grad[ ((gidY * grad_quadstep + x) << 1) + 1 ]   = mag * ang;
 719     }
 720 }