1 /*M///////////////////////////////////////////////////////////////////////////////////////
3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
5 // By downloading, copying, installing or using the software you agree to this license.
6 // If you do not agree to this license, do not download, install,
7 // copy or use the software.
11 // For Open Source Computer Vision Library
13 // Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
14 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
15 // Third party copyrights are property of their respective owners.
18 // Wenju He, wenju@multicorewareinc.com
20 // Redistribution and use in source and binary forms, with or without modification,
21 // are permitted provided that the following conditions are met:
23 // * Redistribution's of source code must retain the above copyright notice,
24 // this list of conditions and the following disclaimer.
26 // * Redistribution's in binary form must reproduce the above copyright notice,
27 // this list of conditions and the following disclaimer in the documentation
28 // and/or other materials provided with the distribution.
30 // * The name of the copyright holders may not be used to endorse or promote products
31 // derived from this software without specific prior written permission.
33 // This software is provided by the copyright holders and contributors as is and
34 // any express or implied warranties, including, but not limited to, the implied
35 // warranties of merchantability and fitness for a particular purpose are disclaimed.
36 // In no event shall the Intel Corporation or contributors be liable for any direct,
37 // indirect, incidental, special, exemplary, or consequential damages
38 // (including, but not limited to, procurement of substitute goods or services;
39 // loss of use, data, or profits; or business interruption) however caused
40 // and on any theory of liability, whether in contract, strict liability,
41 // or tort (including negligence or otherwise) arising in any way out of
42 // the use of this software, even if advised of the possibility of such damage.
49 #define CELLS_PER_BLOCK_X 2
50 #define CELLS_PER_BLOCK_Y 2
52 #define CV_PI_F 3.1415926535897932384626433832795f
54 //----------------------------------------------------------------------------
55 // Histogram computation
56 // 12 threads for a cell, 12x4 threads per block
57 __kernel void compute_hists_kernel(
58 const int cblock_stride_x, const int cblock_stride_y,
59 const int cnbins, const int cblock_hist_size, const int img_block_width,
60 const int blocks_in_group, const int blocks_total,
61 const int grad_quadstep, const int qangle_step,
62 __global const float* grad, __global const uchar* qangle,
63 const float scale, __global float* block_hists, __local float* smem)
65 const int lx = get_local_id(0);
66 const int lp = lx / 24; /* local group id */
67 const int gid = get_group_id(0) * blocks_in_group + lp;/* global group id */
68 const int gidY = gid / img_block_width;
69 const int gidX = gid - gidY * img_block_width;
71 const int lidX = lx - lp * 24;
72 const int lidY = get_local_id(1);
74 const int cell_x = lidX / 12;
75 const int cell_y = lidY;
76 const int cell_thread_x = lidX - cell_x * 12;
78 __local float* hists = smem + lp * cnbins * (CELLS_PER_BLOCK_X *
79 CELLS_PER_BLOCK_Y * 12 + CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y);
80 __local float* final_hist = hists + cnbins *
81 (CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y * 12);
83 const int offset_x = gidX * cblock_stride_x + (cell_x << 2) + cell_thread_x;
84 const int offset_y = gidY * cblock_stride_y + (cell_y << 2);
86 __global const float* grad_ptr = (gid < blocks_total) ?
87 grad + offset_y * grad_quadstep + (offset_x << 1) : grad;
88 __global const uchar* qangle_ptr = (gid < blocks_total) ?
89 qangle + offset_y * qangle_step + (offset_x << 1) : qangle;
91 __local float* hist = hists + 12 * (cell_y * CELLS_PER_BLOCK_Y + cell_x) +
93 for (int bin_id = 0; bin_id < cnbins; ++bin_id)
94 hist[bin_id * 48] = 0.f;
96 const int dist_x = -4 + cell_thread_x - 4 * cell_x;
97 const int dist_center_x = dist_x - 4 * (1 - 2 * cell_x);
99 const int dist_y_begin = -4 - 4 * lidY;
100 for (int dist_y = dist_y_begin; dist_y < dist_y_begin + 12; ++dist_y)
102 float2 vote = (float2) (grad_ptr[0], grad_ptr[1]);
103 uchar2 bin = (uchar2) (qangle_ptr[0], qangle_ptr[1]);
105 grad_ptr += grad_quadstep;
106 qangle_ptr += qangle_step;
108 int dist_center_y = dist_y - 4 * (1 - 2 * cell_y);
110 float gaussian = exp(-(dist_center_y * dist_center_y + dist_center_x *
111 dist_center_x) * scale);
112 float interp_weight = (8.f - fabs(dist_y + 0.5f)) *
113 (8.f - fabs(dist_x + 0.5f)) / 64.f;
115 hist[bin.x * 48] += gaussian * interp_weight * vote.x;
116 hist[bin.y * 48] += gaussian * interp_weight * vote.y;
118 barrier(CLK_LOCAL_MEM_FENCE);
120 volatile __local float* hist_ = hist;
121 for (int bin_id = 0; bin_id < cnbins; ++bin_id, hist_ += 48)
123 if (cell_thread_x < 6)
124 hist_[0] += hist_[6];
125 barrier(CLK_LOCAL_MEM_FENCE);
126 if (cell_thread_x < 3)
127 hist_[0] += hist_[3];
129 barrier(CLK_LOCAL_MEM_FENCE);
131 if (cell_thread_x == 0)
132 final_hist[(cell_x * 2 + cell_y) * cnbins + bin_id] =
133 hist_[0] + hist_[1] + hist_[2];
136 barrier(CLK_LOCAL_MEM_FENCE);
139 int tid = (cell_y * CELLS_PER_BLOCK_Y + cell_x) * 12 + cell_thread_x;
140 if ((tid < cblock_hist_size) && (gid < blocks_total))
142 __global float* block_hist = block_hists +
143 (gidY * img_block_width + gidX) * cblock_hist_size;
144 block_hist[tid] = final_hist[tid];
148 //-------------------------------------------------------------
149 // Normalization of histograms via L2Hys_norm
151 float reduce_smem(volatile __local float* smem, int size)
153 unsigned int tid = get_local_id(0);
154 float sum = smem[tid];
158 if (tid < 256) smem[tid] = sum = sum + smem[tid + 256];
159 barrier(CLK_LOCAL_MEM_FENCE);
163 if (tid < 128) smem[tid] = sum = sum + smem[tid + 128];
164 barrier(CLK_LOCAL_MEM_FENCE);
168 if (tid < 64) smem[tid] = sum = sum + smem[tid + 64];
169 barrier(CLK_LOCAL_MEM_FENCE);
174 if (size >= 64) smem[tid] = sum = sum + smem[tid + 32];
175 #if defined(WAVE_SIZE_16) || defined(WAVE_SIZE_1)
177 barrier(CLK_LOCAL_MEM_FENCE);
181 if (size >= 32) smem[tid] = sum = sum + smem[tid + 16];
184 barrier(CLK_LOCAL_MEM_FENCE);
188 if (size >= 16) smem[tid] = sum = sum + smem[tid + 8];
191 barrier(CLK_LOCAL_MEM_FENCE);
195 if (size >= 8) smem[tid] = sum = sum + smem[tid + 4];
198 barrier(CLK_LOCAL_MEM_FENCE);
202 if (size >= 4) smem[tid] = sum = sum + smem[tid + 2];
205 barrier(CLK_LOCAL_MEM_FENCE);
209 if (size >= 2) smem[tid] = sum = sum + smem[tid + 1];
212 barrier(CLK_LOCAL_MEM_FENCE);
218 __kernel void normalize_hists_kernel(const int nthreads, const int block_hist_size, const int img_block_width,
219 __global float* block_hists, const float threshold, __local float *squares)
221 const int tid = get_local_id(0);
222 const int gidX = get_group_id(0);
223 const int gidY = get_group_id(1);
225 __global float* hist = block_hists + (gidY * img_block_width + gidX) * block_hist_size + tid;
228 if (tid < block_hist_size)
231 squares[tid] = elem * elem;
233 barrier(CLK_LOCAL_MEM_FENCE);
234 float sum = reduce_smem(squares, nthreads);
236 float scale = 1.0f / (sqrt(sum) + 0.1f * block_hist_size);
237 elem = min(elem * scale, threshold);
239 barrier(CLK_LOCAL_MEM_FENCE);
240 squares[tid] = elem * elem;
242 barrier(CLK_LOCAL_MEM_FENCE);
243 sum = reduce_smem(squares, nthreads);
244 scale = 1.0f / (sqrt(sum) + 1e-3f);
246 if (tid < block_hist_size)
247 hist[0] = elem * scale;
250 //---------------------------------------------------------------------
251 // Linear SVM based classification
253 __kernel void classify_hists_kernel(const int cblock_hist_size, const int cdescr_size, const int cdescr_width,
254 const int img_win_width, const int img_block_width,
255 const int win_block_stride_x, const int win_block_stride_y,
256 __global const float * block_hists, __global const float* coefs,
257 float free_coef, float threshold, __global uchar* labels)
259 const int tid = get_local_id(0);
260 const int gidX = get_group_id(0);
261 const int gidY = get_group_id(1);
263 __global const float* hist = block_hists + (gidY * win_block_stride_y * img_block_width + gidX * win_block_stride_x) * cblock_hist_size;
266 for (int i = tid; i < cdescr_size; i += NTHREADS)
268 int offset_y = i / cdescr_width;
269 int offset_x = i - offset_y * cdescr_width;
270 product += coefs[i] * hist[offset_y * img_block_width * cblock_hist_size + offset_x];
273 __local float products[NTHREADS];
275 products[tid] = product;
277 barrier(CLK_LOCAL_MEM_FENCE);
279 if (tid < 128) products[tid] = product = product + products[tid + 128];
280 barrier(CLK_LOCAL_MEM_FENCE);
282 if (tid < 64) products[tid] = product = product + products[tid + 64];
283 barrier(CLK_LOCAL_MEM_FENCE);
285 volatile __local float* smem = products;
288 smem[tid] = product = product + smem[tid + 32];
289 #if defined(WAVE_SIZE_16) || defined(WAVE_SIZE_1)
291 barrier(CLK_LOCAL_MEM_FENCE);
295 smem[tid] = product = product + smem[tid + 16];
298 barrier(CLK_LOCAL_MEM_FENCE);
302 smem[tid] = product = product + smem[tid + 8];
305 barrier(CLK_LOCAL_MEM_FENCE);
309 smem[tid] = product = product + smem[tid + 4];
312 barrier(CLK_LOCAL_MEM_FENCE);
316 smem[tid] = product = product + smem[tid + 2];
319 barrier(CLK_LOCAL_MEM_FENCE);
323 smem[tid] = product = product + smem[tid + 1];
327 labels[gidY * img_win_width + gidX] = (product + free_coef >= threshold);
330 //----------------------------------------------------------------------------
331 // Extract descriptors
333 __kernel void extract_descrs_by_rows_kernel(const int cblock_hist_size, const int descriptors_quadstep, const int cdescr_size, const int cdescr_width,
334 const int img_block_width, const int win_block_stride_x, const int win_block_stride_y,
335 __global const float* block_hists, __global float* descriptors)
337 int tid = get_local_id(0);
338 int gidX = get_group_id(0);
339 int gidY = get_group_id(1);
341 // Get left top corner of the window in src
342 __global const float* hist = block_hists + (gidY * win_block_stride_y * img_block_width + gidX * win_block_stride_x) * cblock_hist_size;
344 // Get left top corner of the window in dst
345 __global float* descriptor = descriptors + (gidY * get_num_groups(0) + gidX) * descriptors_quadstep;
347 // Copy elements from src to dst
348 for (int i = tid; i < cdescr_size; i += NTHREADS)
350 int offset_y = i / cdescr_width;
351 int offset_x = i - offset_y * cdescr_width;
352 descriptor[i] = hist[offset_y * img_block_width * cblock_hist_size + offset_x];
356 __kernel void extract_descrs_by_cols_kernel(const int cblock_hist_size, const int descriptors_quadstep, const int cdescr_size,
357 const int cnblocks_win_x, const int cnblocks_win_y, const int img_block_width, const int win_block_stride_x,
358 const int win_block_stride_y, __global const float* block_hists, __global float* descriptors)
360 int tid = get_local_id(0);
361 int gidX = get_group_id(0);
362 int gidY = get_group_id(1);
364 // Get left top corner of the window in src
365 __global const float* hist = block_hists + (gidY * win_block_stride_y * img_block_width + gidX * win_block_stride_x) * cblock_hist_size;
367 // Get left top corner of the window in dst
368 __global float* descriptor = descriptors + (gidY * get_num_groups(0) + gidX) * descriptors_quadstep;
370 // Copy elements from src to dst
371 for (int i = tid; i < cdescr_size; i += NTHREADS)
373 int block_idx = i / cblock_hist_size;
374 int idx_in_block = i - block_idx * cblock_hist_size;
376 int y = block_idx / cnblocks_win_x;
377 int x = block_idx - y * cnblocks_win_x;
379 descriptor[(x * cnblocks_win_y + y) * cblock_hist_size + idx_in_block] = hist[(y * img_block_width + x) * cblock_hist_size + idx_in_block];
383 //----------------------------------------------------------------------------
384 // Gradients computation
386 __kernel void compute_gradients_8UC4_kernel(const int height, const int width, const int img_step, const int grad_quadstep, const int qangle_step,
387 const __global uchar4 * img, __global float * grad, __global uchar * qangle,
388 const float angle_scale, const char correct_gamma, const int cnbins)
390 const int x = get_global_id(0);
391 const int tid = get_local_id(0);
392 const int gSizeX = get_local_size(0);
393 const int gidX = get_group_id(0);
394 const int gidY = get_group_id(1);
396 __global const uchar4* row = img + gidY * img_step;
398 __local float sh_row[(NTHREADS + 2) * 3];
404 val = row[width - 2];
406 sh_row[tid + 1] = val.x;
407 sh_row[tid + 1 + (NTHREADS + 2)] = val.y;
408 sh_row[tid + 1 + 2 * (NTHREADS + 2)] = val.z;
412 val = row[max(x - 1, 1)];
414 sh_row[(NTHREADS + 2)] = val.y;
415 sh_row[2 * (NTHREADS + 2)] = val.z;
418 if (tid == gSizeX - 1)
420 val = row[min(x + 1, width - 2)];
421 sh_row[gSizeX + 1] = val.x;
422 sh_row[gSizeX + 1 + (NTHREADS + 2)] = val.y;
423 sh_row[gSizeX + 1 + 2 * (NTHREADS + 2)] = val.z;
426 barrier(CLK_LOCAL_MEM_FENCE);
429 float3 a = (float3) (sh_row[tid], sh_row[tid + (NTHREADS + 2)], sh_row[tid + 2 * (NTHREADS + 2)]);
430 float3 b = (float3) (sh_row[tid + 2], sh_row[tid + 2 + (NTHREADS + 2)], sh_row[tid + 2 + 2 * (NTHREADS + 2)]);
433 if (correct_gamma == 1)
434 dx = sqrt(b) - sqrt(a);
438 float3 dy = (float3) 0.f;
440 if (gidY > 0 && gidY < height - 1)
442 a = convert_float3(img[(gidY - 1) * img_step + x].xyz);
443 b = convert_float3(img[(gidY + 1) * img_step + x].xyz);
445 if (correct_gamma == 1)
446 dy = sqrt(b) - sqrt(a);
451 float best_dx = dx.x;
452 float best_dy = dy.x;
454 float mag0 = dx.x * dx.x + dy.x * dy.x;
455 float mag1 = dx.y * dx.y + dy.y * dy.y;
463 mag1 = dx.z * dx.z + dy.z * dy.z;
473 float ang = (atan2(best_dy, best_dx) + CV_PI_F) * angle_scale - 0.5f;
474 int hidx = (int)floor(ang);
476 hidx = (hidx + cnbins) % cnbins;
478 qangle[(gidY * qangle_step + x) << 1] = hidx;
479 qangle[((gidY * qangle_step + x) << 1) + 1] = (hidx + 1) % cnbins;
480 grad[(gidY * grad_quadstep + x) << 1] = mag0 * (1.f - ang);
481 grad[((gidY * grad_quadstep + x) << 1) + 1] = mag0 * ang;
485 __kernel void compute_gradients_8UC1_kernel(const int height, const int width, const int img_step, const int grad_quadstep, const int qangle_step,
486 __global const uchar * img, __global float * grad, __global uchar * qangle,
487 const float angle_scale, const char correct_gamma, const int cnbins)
489 const int x = get_global_id(0);
490 const int tid = get_local_id(0);
491 const int gSizeX = get_local_size(0);
492 const int gidX = get_group_id(0);
493 const int gidY = get_group_id(1);
495 __global const uchar* row = img + gidY * img_step;
497 __local float sh_row[NTHREADS + 2];
500 sh_row[tid + 1] = row[x];
502 sh_row[tid + 1] = row[width - 2];
505 sh_row[0] = row[max(x - 1, 1)];
507 if (tid == gSizeX - 1)
508 sh_row[gSizeX + 1] = row[min(x + 1, width - 2)];
510 barrier(CLK_LOCAL_MEM_FENCE);
515 if (correct_gamma == 1)
516 dx = sqrt(sh_row[tid + 2]) - sqrt(sh_row[tid]);
518 dx = sh_row[tid + 2] - sh_row[tid];
521 if (gidY > 0 && gidY < height - 1)
523 float a = (float) img[ (gidY + 1) * img_step + x ];
524 float b = (float) img[ (gidY - 1) * img_step + x ];
525 if (correct_gamma == 1)
526 dy = sqrt(a) - sqrt(b);
530 float mag = sqrt(dx * dx + dy * dy);
532 float ang = (atan2(dy, dx) + CV_PI_F) * angle_scale - 0.5f;
533 int hidx = (int)floor(ang);
535 hidx = (hidx + cnbins) % cnbins;
537 qangle[ (gidY * qangle_step + x) << 1 ] = hidx;
538 qangle[ ((gidY * qangle_step + x) << 1) + 1 ] = (hidx + 1) % cnbins;
539 grad[ (gidY * grad_quadstep + x) << 1 ] = mag * (1.f - ang);
540 grad[ ((gidY * grad_quadstep + x) << 1) + 1 ] = mag * ang;
544 //----------------------------------------------------------------------------
547 __kernel void resize_8UC4_kernel(__global uchar4 * dst, __global const uchar4 * src,
548 int dst_offset, int src_offset, int dst_step, int src_step,
549 int src_cols, int src_rows, int dst_cols, int dst_rows, float ifx, float ify )
551 int dx = get_global_id(0);
552 int dy = get_global_id(1);
554 int sx = (int)floor(dx*ifx+0.5f);
555 int sy = (int)floor(dy*ify+0.5f);
556 sx = min(sx, src_cols-1);
557 sy = min(sy, src_rows-1);
558 int dpos = (dst_offset>>2) + dy * (dst_step>>2) + dx;
559 int spos = (src_offset>>2) + sy * (src_step>>2) + sx;
561 if(dx<dst_cols && dy<dst_rows)
562 dst[dpos] = src[spos];
565 __kernel void resize_8UC1_kernel(__global uchar * dst, __global const uchar * src,
566 int dst_offset, int src_offset, int dst_step, int src_step,
567 int src_cols, int src_rows, int dst_cols, int dst_rows, float ifx, float ify )
569 int dx = get_global_id(0);
570 int dy = get_global_id(1);
572 int sx = (int)floor(dx*ifx+0.5f);
573 int sy = (int)floor(dy*ify+0.5f);
574 sx = min(sx, src_cols-1);
575 sy = min(sy, src_rows-1);
576 int dpos = dst_offset + dy * dst_step + dx;
577 int spos = src_offset + sy * src_step + sx;
579 if(dx<dst_cols && dy<dst_rows)
580 dst[dpos] = src[spos];