X-Git-Url: http://review.tizen.org/git/?a=blobdiff_plain;f=documentation%2Fhog_8cl_source.xhtml;h=0de0a8412679b810e57ead157df8d9a8f0b577b6;hb=HEAD;hp=3428a0e9e3b27aa68e4e1e98a32ce505fa215e7f;hpb=8938bd3f40ea62ff56d6ed4e2db0a8aee34dd64a;p=platform%2Fupstream%2Farmcl.git diff --git a/documentation/hog_8cl_source.xhtml b/documentation/hog_8cl_source.xhtml index 3428a0e..0de0a84 100644 --- a/documentation/hog_8cl_source.xhtml +++ b/documentation/hog_8cl_source.xhtml @@ -4,7 +4,7 @@ - + Compute Library: src/core/CL/cl_kernels/hog.cl Source File @@ -12,22 +12,24 @@ + + + @@ -38,7 +40,7 @@
Compute Library -  17.09 +  18.05
@@ -46,7 +48,7 @@ - + @@ -101,7 +103,7 @@ $(document).ready(function(){initNavTree('hog_8cl_source.xhtml','');}); onmouseover="return searchBox.OnSearchSelectShow()" onmouseout="return searchBox.OnSearchSelectHide()" onkeydown="return searchBox.OnSearchSelectKey(event)"> - All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages +
@@ -115,397 +117,33 @@ $(document).ready(function(){initNavTree('hog_8cl_source.xhtml','');});
hog.cl
-Go to the documentation of this file.
1 /*
-
2  * Copyright (c) 2017 ARM Limited.
-
3  *
-
4  * SPDX-License-Identifier: MIT
-
5  *
-
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
-
7  * of this software and associated documentation files (the "Software"), to
-
8  * deal in the Software without restriction, including without limitation the
-
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
-
10  * sell copies of the Software, and to permit persons to whom the Software is
-
11  * furnished to do so, subject to the following conditions:
-
12  *
-
13  * The above copyright notice and this permission notice shall be included in all
-
14  * copies or substantial portions of the Software.
-
15  *
-
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-
22  * SOFTWARE.
-
23  */
-
24 #include "helpers.h"
-
25 #include "types.h"
-
26 
-
27 #if defined(CELL_WIDTH) && defined(CELL_HEIGHT) && defined(NUM_BINS) && defined(PHASE_SCALE)
-
28 
-
59 __kernel void hog_orientation_binning(IMAGE_DECLARATION(mag),
-
60  IMAGE_DECLARATION(phase),
-
61  IMAGE_DECLARATION(dst))
-
62 {
-
63  float bins[NUM_BINS] = { 0 };
-
64 
-
65  // Compute address for the magnitude and phase images
-
66  Image mag = CONVERT_TO_IMAGE_STRUCT(mag);
-
67  Image phase = CONVERT_TO_IMAGE_STRUCT(phase);
-
68 
-
69  __global uchar *mag_row_ptr = mag.ptr;
-
70  __global uchar *phase_row_ptr = phase.ptr;
-
71 
-
72  for(int yc = 0; yc < CELL_HEIGHT; ++yc)
-
73  {
-
74  int xc = 0;
-
75  for(; xc <= (CELL_WIDTH - 4); xc += 4)
-
76  {
-
77  // Load magnitude and phase values
-
78  const float4 mag_f32 = convert_float4(vload4(0, (__global short *)mag_row_ptr + xc));
-
79  float4 phase_f32 = convert_float4(vload4(0, phase_row_ptr + xc));
-
80 
-
81  // Scale phase: phase * scale + 0.5f
-
82  phase_f32 = (float4)0.5f + phase_f32 * (float4)PHASE_SCALE;
-
83 
-
84  // Compute histogram index.
-
85  int4 hidx_s32 = convert_int4(phase_f32);
-
86 
-
87  // Compute magnitude weights (w0 and w1)
-
88  const float4 hidx_f32 = convert_float4(hidx_s32);
-
89 
-
90  // w1 = phase_f32 - hidx_s32
-
91  const float4 w1_f32 = phase_f32 - hidx_f32;
-
92 
-
93  // w0 = 1.0 - w1
-
94  const float4 w0_f32 = (float4)1.0f - w1_f32;
-
95 
-
96  // Calculate the weights for splitting vote
-
97  const float4 mag_w0_f32 = mag_f32 * w0_f32;
-
98  const float4 mag_w1_f32 = mag_f32 * w1_f32;
-
99 
-
100  // Weighted vote between 2 bins
-
101 
-
102  // Check if the histogram index is equal to NUM_BINS. If so, replace the index with 0
-
103  hidx_s32 = select(hidx_s32, (int4)0, hidx_s32 == (int4)(NUM_BINS));
-
104 
-
105  // Bin 0
-
106  bins[hidx_s32.s0] += mag_w0_f32.s0;
-
107  bins[hidx_s32.s1] += mag_w0_f32.s1;
-
108  bins[hidx_s32.s2] += mag_w0_f32.s2;
-
109  bins[hidx_s32.s3] += mag_w0_f32.s3;
-
110 
-
111  hidx_s32 += (int4)1;
-
112 
-
113  // Check if the histogram index is equal to NUM_BINS. If so, replace the index with 0
-
114  hidx_s32 = select(hidx_s32, (int4)0, hidx_s32 == (int4)(NUM_BINS));
-
115 
-
116  // Bin1
-
117  bins[hidx_s32.s0] += mag_w1_f32.s0;
-
118  bins[hidx_s32.s1] += mag_w1_f32.s1;
-
119  bins[hidx_s32.s2] += mag_w1_f32.s2;
-
120  bins[hidx_s32.s3] += mag_w1_f32.s3;
-
121  }
-
122 
-
123  // Left over computation
-
124  for(; xc < CELL_WIDTH; xc++)
-
125  {
-
126  const float mag_value = *((__global short *)mag_row_ptr + xc);
-
127  const float phase_value = *(mag_row_ptr + xc) * (float)PHASE_SCALE + 0.5f;
-
128  const float w1 = phase_value - floor(phase_value);
-
129 
-
130  // The quantised phase is the histogram index [0, NUM_BINS - 1]
-
131  // Check limit of histogram index. If hidx == NUM_BINS, hidx = 0
-
132  const uint hidx = (uint)(phase_value) % NUM_BINS;
-
133 
-
134  // Weighted vote between 2 bins
-
135  bins[hidx] += mag_value * (1.0f - w1);
-
136  bins[(hidx + 1) % NUM_BINS] += mag_value * w1;
-
137  }
-
138 
-
139  // Point to the next row of magnitude and phase images
-
140  mag_row_ptr += mag_stride_y;
-
141  phase_row_ptr += phase_stride_y;
-
142  }
-
143 
-
144  // Compute address for the destination image
-
145  Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
146 
-
147  // Store the local HOG in the global memory
-
148  int xc = 0;
-
149  for(; xc <= (NUM_BINS - 4); xc += 4)
-
150  {
-
151  float4 values = vload4(0, bins + xc);
-
152 
-
153  vstore4(values, 0, ((__global float *)dst.ptr) + xc);
-
154  }
-
155 
-
156  // Left over stores
-
157  for(; xc < NUM_BINS; ++xc)
-
158  {
-
159  ((__global float *)dst.ptr)[xc] = bins[xc];
-
160  }
-
161 }
-
162 #endif /* CELL_WIDTH and CELL_HEIGHT and NUM_BINS and PHASE_SCALE */
-
163 
-
164 #if defined(NUM_CELLS_PER_BLOCK_HEIGHT) && defined(NUM_BINS_PER_BLOCK_X) && defined(NUM_BINS_PER_BLOCK) && defined(HOG_NORM_TYPE) && defined(L2_HYST_THRESHOLD)
-
165 
-
166 #ifndef L2_NORM
-
167 #error The value of enum class HOGNormType::L2_NORM has not be passed to the OpenCL kernel
-
168 #endif /* not L2_NORM */
-
169 
-
170 #ifndef L2HYS_NORM
-
171 #error The value of enum class HOGNormType::L2HYS_NORM has not be passed to the OpenCL kernel
-
172 #endif /* not L2HYS_NORM */
-
173 
-
174 #ifndef L1_NORM
-
175 #error The value of enum class HOGNormType::L1_NORM has not be passed to the OpenCL kernel
-
176 #endif /* not L1_NORM */
-
177 
-
206 __kernel void hog_block_normalization(IMAGE_DECLARATION(src),
-
207  IMAGE_DECLARATION(dst))
-
208 {
-
209  float sum = 0.0f;
-
210  float4 sum_f32 = (float4)(0.0f);
-
211 
-
212  // Compute address for the source and destination tensor
-
213  Image src = CONVERT_TO_IMAGE_STRUCT(src);
-
214  Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
215 
-
216  for(size_t yc = 0; yc < NUM_CELLS_PER_BLOCK_HEIGHT; ++yc)
-
217  {
-
218  const __global float *hist_ptr = (__global float *)(src.ptr + yc * src_stride_y);
-
219 
-
220  int xc = 0;
-
221  for(; xc <= (NUM_BINS_PER_BLOCK_X - 16); xc += 16)
-
222  {
-
223  const float4 val0 = vload4(0, hist_ptr + xc + 0);
-
224  const float4 val1 = vload4(0, hist_ptr + xc + 4);
-
225  const float4 val2 = vload4(0, hist_ptr + xc + 8);
-
226  const float4 val3 = vload4(0, hist_ptr + xc + 12);
-
227 
-
228 #if(HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM)
-
229  // Compute val^2 for L2_NORM or L2HYS_NORM
-
230  sum_f32 += val0 * val0;
-
231  sum_f32 += val1 * val1;
-
232  sum_f32 += val2 * val2;
-
233  sum_f32 += val3 * val3;
-
234 #else /* (HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM) */
-
235  // Compute |val| for L1_NORM
-
236  sum_f32 += fabs(val0);
-
237  sum_f32 += fabs(val1);
-
238  sum_f32 += fabs(val2);
-
239  sum_f32 += fabs(val3);
-
240 #endif /* (HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM) */
-
241 
-
242  // Store linearly the input values un-normalized in the output image. These values will be reused for the normalization.
-
243  // This approach will help us to be cache friendly in the next for loop where the normalization will be done because all the values
-
244  // will be accessed consecutively
-
245  vstore4(val0, 0, ((__global float *)dst.ptr) + xc + 0 + yc * NUM_BINS_PER_BLOCK_X);
-
246  vstore4(val1, 0, ((__global float *)dst.ptr) + xc + 4 + yc * NUM_BINS_PER_BLOCK_X);
-
247  vstore4(val2, 0, ((__global float *)dst.ptr) + xc + 8 + yc * NUM_BINS_PER_BLOCK_X);
-
248  vstore4(val3, 0, ((__global float *)dst.ptr) + xc + 12 + yc * NUM_BINS_PER_BLOCK_X);
-
249  }
-
250 
-
251  // Compute left over
-
252  for(; xc < NUM_BINS_PER_BLOCK_X; ++xc)
-
253  {
-
254  const float val = hist_ptr[xc];
-
255 
-
256 #if(HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM)
-
257  sum += val * val;
-
258 #else /* (HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM) */
-
259  sum += fabs(val);
-
260 #endif /* (HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM) */
-
261 
-
262  ((__global float *)dst.ptr)[xc + 0 + yc * NUM_BINS_PER_BLOCK_X] = val;
-
263  }
-
264  }
-
265 
-
266  sum += dot(sum_f32, (float4)1.0f);
-
267 
-
268  float scale = 1.0f / (sqrt(sum) + NUM_BINS_PER_BLOCK * 0.1f);
-
269 
-
270 #if(HOG_NORM_TYPE == L2HYS_NORM)
-
271  // Reset sum
-
272  sum_f32 = (float4)0.0f;
-
273  sum = 0.0f;
-
274 
-
275  int k = 0;
-
276  for(; k <= NUM_BINS_PER_BLOCK - 16; k += 16)
-
277  {
-
278  float4 val0 = vload4(0, ((__global float *)dst.ptr) + k + 0);
-
279  float4 val1 = vload4(0, ((__global float *)dst.ptr) + k + 4);
-
280  float4 val2 = vload4(0, ((__global float *)dst.ptr) + k + 8);
-
281  float4 val3 = vload4(0, ((__global float *)dst.ptr) + k + 12);
-
282 
-
283  // Scale val
-
284  val0 = val0 * (float4)scale;
-
285  val1 = val1 * (float4)scale;
-
286  val2 = val2 * (float4)scale;
-
287  val3 = val3 * (float4)scale;
-
288 
-
289  // Clip val if over _threshold_l2hys
-
290  val0 = fmin(val0, (float4)L2_HYST_THRESHOLD);
-
291  val1 = fmin(val1, (float4)L2_HYST_THRESHOLD);
-
292  val2 = fmin(val2, (float4)L2_HYST_THRESHOLD);
-
293  val3 = fmin(val3, (float4)L2_HYST_THRESHOLD);
-
294 
-
295  // Compute val^2
-
296  sum_f32 += val0 * val0;
-
297  sum_f32 += val1 * val1;
-
298  sum_f32 += val2 * val2;
-
299  sum_f32 += val3 * val3;
-
300 
-
301  vstore4(val0, 0, ((__global float *)dst.ptr) + k + 0);
-
302  vstore4(val1, 0, ((__global float *)dst.ptr) + k + 4);
-
303  vstore4(val2, 0, ((__global float *)dst.ptr) + k + 8);
-
304  vstore4(val3, 0, ((__global float *)dst.ptr) + k + 12);
-
305  }
-
306 
-
307  // Compute left over
-
308  for(; k < NUM_BINS_PER_BLOCK; ++k)
-
309  {
-
310  float val = ((__global float *)dst.ptr)[k] * scale;
-
311 
-
312  // Clip scaled input_value if over L2_HYST_THRESHOLD
-
313  val = fmin(val, (float)L2_HYST_THRESHOLD);
-
314 
-
315  sum += val * val;
-
316 
-
317  ((__global float *)dst.ptr)[k] = val;
-
318  }
-
319 
-
320  sum += dot(sum_f32, (float4)1.0f);
-
321 
-
322  // We use the same constants of OpenCV
-
323  scale = 1.0f / (sqrt(sum) + 1e-3f);
-
324 
-
325 #endif /* (HOG_NORM_TYPE == L2HYS_NORM) */
-
326 
-
327  int i = 0;
-
328  for(; i <= (NUM_BINS_PER_BLOCK - 16); i += 16)
-
329  {
-
330  float4 val0 = vload4(0, ((__global float *)dst.ptr) + i + 0);
-
331  float4 val1 = vload4(0, ((__global float *)dst.ptr) + i + 4);
-
332  float4 val2 = vload4(0, ((__global float *)dst.ptr) + i + 8);
-
333  float4 val3 = vload4(0, ((__global float *)dst.ptr) + i + 12);
-
334 
-
335  // Multiply val by the normalization scale factor
-
336  val0 = val0 * (float4)scale;
-
337  val1 = val1 * (float4)scale;
-
338  val2 = val2 * (float4)scale;
-
339  val3 = val3 * (float4)scale;
-
340 
-
341  vstore4(val0, 0, ((__global float *)dst.ptr) + i + 0);
-
342  vstore4(val1, 0, ((__global float *)dst.ptr) + i + 4);
-
343  vstore4(val2, 0, ((__global float *)dst.ptr) + i + 8);
-
344  vstore4(val3, 0, ((__global float *)dst.ptr) + i + 12);
-
345  }
-
346 
-
347  for(; i < NUM_BINS_PER_BLOCK; ++i)
-
348  {
-
349  ((__global float *)dst.ptr)[i] *= scale;
-
350  }
-
351 }
-
352 #endif /* NUM_CELLS_PER_BLOCK_HEIGHT and NUM_BINS_PER_BLOCK_X and NUM_BINS_PER_BLOCK and HOG_NORM_TYPE and L2_HYST_THRESHOLD */
-
353 
-
354 #if defined(NUM_BLOCKS_PER_DESCRIPTOR_Y) && defined(NUM_BINS_PER_DESCRIPTOR_X) && defined(THRESHOLD) && defined(MAX_NUM_DETECTION_WINDOWS) && defined(IDX_CLASS) && defined(BLOCK_STRIDE_WIDTH) && defined(BLOCK_STRIDE_HEIGHT) && defined(DETECTION_WINDOW_WIDTH) && defined(DETECTION_WINDOW_HEIGHT)
-
355 
-
382 __kernel void hog_detector(IMAGE_DECLARATION(src),
-
383  __global float *hog_descriptor,
-
384  __global DetectionWindow *dst,
-
385  __global uint *num_detection_windows)
-
386 {
-
387  // Check if the DetectionWindow array is full
-
388  if(*num_detection_windows >= MAX_NUM_DETECTION_WINDOWS)
-
389  {
-
390  return;
-
391  }
-
392 
-
393  Image src = CONVERT_TO_IMAGE_STRUCT(src);
-
394 
-
395  const int src_step_y_f32 = src_stride_y / sizeof(float);
-
396 
-
397  // Init score_f32 with 0
-
398  float4 score_f32 = (float4)0.0f;
-
399 
-
400  // Init score with 0
-
401  float score = 0.0f;
-
402 
-
403  __global float *src_row_ptr = (__global float *)src.ptr;
-
404 
-
405  // Compute Linear SVM
-
406  for(int yb = 0; yb < NUM_BLOCKS_PER_DESCRIPTOR_Y; ++yb, src_row_ptr += src_step_y_f32)
-
407  {
-
408  int xb = 0;
-
409 
-
410  const int offset_y = yb * NUM_BINS_PER_DESCRIPTOR_X;
-
411 
-
412  for(; xb < (int)NUM_BINS_PER_DESCRIPTOR_X - 8; xb += 8)
-
413  {
-
414  // Load descriptor values
-
415  float4 a0_f32 = vload4(0, src_row_ptr + xb + 0);
-
416  float4 a1_f32 = vload4(0, src_row_ptr + xb + 4);
-
417 
-
418  float4 b0_f32 = vload4(0, hog_descriptor + xb + 0 + offset_y);
-
419  float4 b1_f32 = vload4(0, hog_descriptor + xb + 4 + offset_y);
-
420 
-
421  // Multiply accumulate
-
422  score_f32 += a0_f32 * b0_f32;
-
423  score_f32 += a1_f32 * b1_f32;
-
424  }
-
425 
-
426  for(; xb < NUM_BINS_PER_DESCRIPTOR_X; ++xb)
-
427  {
-
428  const float a = src_row_ptr[xb];
-
429  const float b = hog_descriptor[xb + offset_y];
-
430 
-
431  score += a * b;
-
432  }
-
433  }
-
434 
-
435  score += dot(score_f32, (float4)1.0f);
-
436 
-
437  // Add the bias. The bias is located at the position (descriptor_size() - 1)
-
438  // (descriptor_size - 1) = NUM_BINS_PER_DESCRIPTOR_X * NUM_BLOCKS_PER_DESCRIPTOR_Y
-
439  score += hog_descriptor[NUM_BINS_PER_DESCRIPTOR_X * NUM_BLOCKS_PER_DESCRIPTOR_Y];
-
440 
-
441  if(score > (float)THRESHOLD)
-
442  {
-
443  int id = atomic_inc(num_detection_windows);
-
444  if(id < MAX_NUM_DETECTION_WINDOWS)
-
445  {
-
446  dst[id].x = get_global_id(0) * BLOCK_STRIDE_WIDTH;
-
447  dst[id].y = get_global_id(1) * BLOCK_STRIDE_HEIGHT;
-
448  dst[id].width = DETECTION_WINDOW_WIDTH;
-
449  dst[id].height = DETECTION_WINDOW_HEIGHT;
-
450  dst[id].idx_class = IDX_CLASS;
-
451  dst[id].score = score;
-
452  }
-
453  }
-
454 }
-
455 #endif /* NUM_BLOCKS_PER_DESCRIPTOR_Y && NUM_BINS_PER_DESCRIPTOR_X && THRESHOLD && MAX_NUM_DETECTION_WINDOWS && IDX_CLASS &&
-
456  * BLOCK_STRIDE_WIDTH && BLOCK_STRIDE_HEIGHT && DETECTION_WINDOW_WIDTH && DETECTION_WINDOW_HEIGHT */
-
uint32_t id
Definition: hwc.hpp:252
+Go to the documentation of this file.
1 /*
2  * Copyright (c) 2017-2018 ARM Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 #include "helpers.h"
25 #include "types.h"
26 
27 #if defined(CELL_WIDTH) && defined(CELL_HEIGHT) && defined(NUM_BINS) && defined(PHASE_SCALE)
28 
62 {
63  float bins[NUM_BINS] = { 0 };
64 
65  // Compute address for the magnitude and phase images
66  Image mag = CONVERT_TO_IMAGE_STRUCT(mag);
68 
69  __global uchar *mag_row_ptr = mag.ptr;
70  __global uchar *phase_row_ptr = phase.ptr;
71 
72  for(int yc = 0; yc < CELL_HEIGHT; ++yc)
73  {
74  int xc = 0;
75  for(; xc <= (CELL_WIDTH - 4); xc += 4)
76  {
77  // Load magnitude and phase values
78  const float4 mag_f32 = convert_float4(vload4(0, (__global short *)mag_row_ptr + xc));
79  float4 phase_f32 = convert_float4(vload4(0, phase_row_ptr + xc));
80 
81  // Scale phase: phase * scale + 0.5f
82  phase_f32 = (float4)0.5f + phase_f32 * (float4)PHASE_SCALE;
83 
84  // Compute histogram index.
85  int4 hidx_s32 = convert_int4(phase_f32);
86 
87  // Compute magnitude weights (w0 and w1)
88  const float4 hidx_f32 = convert_float4(hidx_s32);
89 
90  // w1 = phase_f32 - hidx_s32
91  const float4 w1_f32 = phase_f32 - hidx_f32;
92 
93  // w0 = 1.0 - w1
94  const float4 w0_f32 = (float4)1.0f - w1_f32;
95 
96  // Calculate the weights for splitting vote
97  const float4 mag_w0_f32 = mag_f32 * w0_f32;
98  const float4 mag_w1_f32 = mag_f32 * w1_f32;
99 
100  // Weighted vote between 2 bins
101 
102  // Check if the histogram index is equal to NUM_BINS. If so, replace the index with 0
103  hidx_s32 = select(hidx_s32, (int4)0, hidx_s32 == (int4)(NUM_BINS));
104 
105  // Bin 0
106  bins[hidx_s32.s0] += mag_w0_f32.s0;
107  bins[hidx_s32.s1] += mag_w0_f32.s1;
108  bins[hidx_s32.s2] += mag_w0_f32.s2;
109  bins[hidx_s32.s3] += mag_w0_f32.s3;
110 
111  hidx_s32 += (int4)1;
112 
113  // Check if the histogram index is equal to NUM_BINS. If so, replace the index with 0
114  hidx_s32 = select(hidx_s32, (int4)0, hidx_s32 == (int4)(NUM_BINS));
115 
116  // Bin1
117  bins[hidx_s32.s0] += mag_w1_f32.s0;
118  bins[hidx_s32.s1] += mag_w1_f32.s1;
119  bins[hidx_s32.s2] += mag_w1_f32.s2;
120  bins[hidx_s32.s3] += mag_w1_f32.s3;
121  }
122 
123  // Left over computation
124  for(; xc < CELL_WIDTH; xc++)
125  {
126  const float mag_value = *((__global short *)mag_row_ptr + xc);
127  const float phase_value = *(phase_row_ptr + xc) * (float)PHASE_SCALE + 0.5f;
128  const float w1 = phase_value - floor(phase_value);
129 
130  // The quantised phase is the histogram index [0, NUM_BINS - 1]
131  // Check limit of histogram index. If hidx == NUM_BINS, hidx = 0
132  const uint hidx = (uint)(phase_value) % NUM_BINS;
133 
134  // Weighted vote between 2 bins
135  bins[hidx] += mag_value * (1.0f - w1);
136  bins[(hidx + 1) % NUM_BINS] += mag_value * w1;
137  }
138 
139  // Point to the next row of magnitude and phase images
140  mag_row_ptr += mag_stride_y;
141  phase_row_ptr += phase_stride_y;
142  }
143 
144  // Compute address for the destination image
146 
147  // Store the local HOG in the global memory
148  int xc = 0;
149  for(; xc <= (NUM_BINS - 4); xc += 4)
150  {
151  float4 values = vload4(0, bins + xc);
152 
153  vstore4(values, 0, ((__global float *)dst.ptr) + xc);
154  }
155 
156  // Left over stores
157  for(; xc < NUM_BINS; ++xc)
158  {
159  ((__global float *)dst.ptr)[xc] = bins[xc];
160  }
161 }
162 #endif /* CELL_WIDTH and CELL_HEIGHT and NUM_BINS and PHASE_SCALE */
163 
164 #if defined(NUM_CELLS_PER_BLOCK_HEIGHT) && defined(NUM_BINS_PER_BLOCK_X) && defined(NUM_BINS_PER_BLOCK) && defined(HOG_NORM_TYPE) && defined(L2_HYST_THRESHOLD)
165 
166 #ifndef L2_NORM
167 #error The value of enum class HOGNormType::L2_NORM has not be passed to the OpenCL kernel
168 #endif /* not L2_NORM */
169 
170 #ifndef L2HYS_NORM
171 #error The value of enum class HOGNormType::L2HYS_NORM has not be passed to the OpenCL kernel
172 #endif /* not L2HYS_NORM */
173 
174 #ifndef L1_NORM
175 #error The value of enum class HOGNormType::L1_NORM has not be passed to the OpenCL kernel
176 #endif /* not L1_NORM */
177 
207  IMAGE_DECLARATION(dst))
208 {
209  float sum = 0.0f;
210  float4 sum_f32 = (float4)(0.0f);
211 
212  // Compute address for the source and destination tensor
214  Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
215 
216  for(size_t yc = 0; yc < NUM_CELLS_PER_BLOCK_HEIGHT; ++yc)
217  {
218  const __global float *hist_ptr = (__global float *)(src.ptr + yc * src_stride_y);
219 
220  int xc = 0;
221  for(; xc <= (NUM_BINS_PER_BLOCK_X - 16); xc += 16)
222  {
223  const float4 val0 = vload4(0, hist_ptr + xc + 0);
224  const float4 val1 = vload4(0, hist_ptr + xc + 4);
225  const float4 val2 = vload4(0, hist_ptr + xc + 8);
226  const float4 val3 = vload4(0, hist_ptr + xc + 12);
227 
228 #if(HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM)
229  // Compute val^2 for L2_NORM or L2HYS_NORM
230  sum_f32 += val0 * val0;
231  sum_f32 += val1 * val1;
232  sum_f32 += val2 * val2;
233  sum_f32 += val3 * val3;
234 #else /* (HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM) */
235  // Compute |val| for L1_NORM
236  sum_f32 += fabs(val0);
237  sum_f32 += fabs(val1);
238  sum_f32 += fabs(val2);
239  sum_f32 += fabs(val3);
240 #endif /* (HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM) */
241 
242  // Store linearly the input values un-normalized in the output image. These values will be reused for the normalization.
243  // This approach will help us to be cache friendly in the next for loop where the normalization will be done because all the values
244  // will be accessed consecutively
245  vstore4(val0, 0, ((__global float *)dst.ptr) + xc + 0 + yc * NUM_BINS_PER_BLOCK_X);
246  vstore4(val1, 0, ((__global float *)dst.ptr) + xc + 4 + yc * NUM_BINS_PER_BLOCK_X);
247  vstore4(val2, 0, ((__global float *)dst.ptr) + xc + 8 + yc * NUM_BINS_PER_BLOCK_X);
248  vstore4(val3, 0, ((__global float *)dst.ptr) + xc + 12 + yc * NUM_BINS_PER_BLOCK_X);
249  }
250 
251  // Compute left over
252  for(; xc < NUM_BINS_PER_BLOCK_X; ++xc)
253  {
254  const float val = hist_ptr[xc];
255 
256 #if(HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM)
257  sum += val * val;
258 #else /* (HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM) */
259  sum += fabs(val);
260 #endif /* (HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM) */
261 
262  ((__global float *)dst.ptr)[xc + 0 + yc * NUM_BINS_PER_BLOCK_X] = val;
263  }
264  }
265 
266  sum += dot(sum_f32, (float4)1.0f);
267 
268  float scale = 1.0f / (sqrt(sum) + NUM_BINS_PER_BLOCK * 0.1f);
269 
270 #if(HOG_NORM_TYPE == L2HYS_NORM)
271  // Reset sum
272  sum_f32 = (float4)0.0f;
273  sum = 0.0f;
274 
275  int k = 0;
276  for(; k <= NUM_BINS_PER_BLOCK - 16; k += 16)
277  {
278  float4 val0 = vload4(0, ((__global float *)dst.ptr) + k + 0);
279  float4 val1 = vload4(0, ((__global float *)dst.ptr) + k + 4);
280  float4 val2 = vload4(0, ((__global float *)dst.ptr) + k + 8);
281  float4 val3 = vload4(0, ((__global float *)dst.ptr) + k + 12);
282 
283  // Scale val
284  val0 = val0 * (float4)scale;
285  val1 = val1 * (float4)scale;
286  val2 = val2 * (float4)scale;
287  val3 = val3 * (float4)scale;
288 
289  // Clip val if over _threshold_l2hys
290  val0 = fmin(val0, (float4)L2_HYST_THRESHOLD);
291  val1 = fmin(val1, (float4)L2_HYST_THRESHOLD);
292  val2 = fmin(val2, (float4)L2_HYST_THRESHOLD);
293  val3 = fmin(val3, (float4)L2_HYST_THRESHOLD);
294 
295  // Compute val^2
296  sum_f32 += val0 * val0;
297  sum_f32 += val1 * val1;
298  sum_f32 += val2 * val2;
299  sum_f32 += val3 * val3;
300 
301  vstore4(val0, 0, ((__global float *)dst.ptr) + k + 0);
302  vstore4(val1, 0, ((__global float *)dst.ptr) + k + 4);
303  vstore4(val2, 0, ((__global float *)dst.ptr) + k + 8);
304  vstore4(val3, 0, ((__global float *)dst.ptr) + k + 12);
305  }
306 
307  // Compute left over
308  for(; k < NUM_BINS_PER_BLOCK; ++k)
309  {
310  float val = ((__global float *)dst.ptr)[k] * scale;
311 
312  // Clip scaled input_value if over L2_HYST_THRESHOLD
313  val = fmin(val, (float)L2_HYST_THRESHOLD);
314 
315  sum += val * val;
316 
317  ((__global float *)dst.ptr)[k] = val;
318  }
319 
320  sum += dot(sum_f32, (float4)1.0f);
321 
322  // We use the same constants of OpenCV
323  scale = 1.0f / (sqrt(sum) + 1e-3f);
324 
325 #endif /* (HOG_NORM_TYPE == L2HYS_NORM) */
326 
327  int i = 0;
328  for(; i <= (NUM_BINS_PER_BLOCK - 16); i += 16)
329  {
330  float4 val0 = vload4(0, ((__global float *)dst.ptr) + i + 0);
331  float4 val1 = vload4(0, ((__global float *)dst.ptr) + i + 4);
332  float4 val2 = vload4(0, ((__global float *)dst.ptr) + i + 8);
333  float4 val3 = vload4(0, ((__global float *)dst.ptr) + i + 12);
334 
335  // Multiply val by the normalization scale factor
336  val0 = val0 * (float4)scale;
337  val1 = val1 * (float4)scale;
338  val2 = val2 * (float4)scale;
339  val3 = val3 * (float4)scale;
340 
341  vstore4(val0, 0, ((__global float *)dst.ptr) + i + 0);
342  vstore4(val1, 0, ((__global float *)dst.ptr) + i + 4);
343  vstore4(val2, 0, ((__global float *)dst.ptr) + i + 8);
344  vstore4(val3, 0, ((__global float *)dst.ptr) + i + 12);
345  }
346 
347  for(; i < NUM_BINS_PER_BLOCK; ++i)
348  {
349  ((__global float *)dst.ptr)[i] *= scale;
350  }
351 }
352 #endif /* NUM_CELLS_PER_BLOCK_HEIGHT and NUM_BINS_PER_BLOCK_X and NUM_BINS_PER_BLOCK and HOG_NORM_TYPE and L2_HYST_THRESHOLD */
353 
354 #if defined(NUM_BLOCKS_PER_DESCRIPTOR_Y) && defined(NUM_BINS_PER_DESCRIPTOR_X) && defined(THRESHOLD) && defined(MAX_NUM_DETECTION_WINDOWS) && defined(IDX_CLASS) && defined(DETECTION_WINDOW_STRIDE_WIDTH) && defined(DETECTION_WINDOW_STRIDE_HEIGHT) && defined(DETECTION_WINDOW_WIDTH) && defined(DETECTION_WINDOW_HEIGHT)
355 
382 __kernel void hog_detector(IMAGE_DECLARATION(src),
383  __global float *hog_descriptor,
384  __global DetectionWindow *dst,
385  __global uint *num_detection_windows)
386 {
387  // Check if the DetectionWindow array is full
388  if(*num_detection_windows >= MAX_NUM_DETECTION_WINDOWS)
389  {
390  return;
391  }
392 
393  Image src = CONVERT_TO_IMAGE_STRUCT(src);
394 
395  const int src_step_y_f32 = src_stride_y / sizeof(float);
396 
397  // Init score_f32 with 0
398  float4 score_f32 = (float4)0.0f;
399 
400  // Init score with 0
401  float score = 0.0f;
402 
403  __global float *src_row_ptr = (__global float *)src.ptr;
404 
405  // Compute Linear SVM
406  for(int yb = 0; yb < NUM_BLOCKS_PER_DESCRIPTOR_Y; ++yb, src_row_ptr += src_step_y_f32)
407  {
408  int xb = 0;
409 
410  const int offset_y = yb * NUM_BINS_PER_DESCRIPTOR_X;
411 
412  for(; xb < (int)NUM_BINS_PER_DESCRIPTOR_X - 8; xb += 8)
413  {
414  // Load descriptor values
415  float4 a0_f32 = vload4(0, src_row_ptr + xb + 0);
416  float4 a1_f32 = vload4(0, src_row_ptr + xb + 4);
417 
418  float4 b0_f32 = vload4(0, hog_descriptor + xb + 0 + offset_y);
419  float4 b1_f32 = vload4(0, hog_descriptor + xb + 4 + offset_y);
420 
421  // Multiply accumulate
422  score_f32 += a0_f32 * b0_f32;
423  score_f32 += a1_f32 * b1_f32;
424  }
425 
426  for(; xb < NUM_BINS_PER_DESCRIPTOR_X; ++xb)
427  {
428  const float a = src_row_ptr[xb];
429  const float b = hog_descriptor[xb + offset_y];
430 
431  score += a * b;
432  }
433  }
434 
435  score += dot(score_f32, (float4)1.0f);
436 
437  // Add the bias. The bias is located at the position (descriptor_size() - 1)
438  // (descriptor_size - 1) = NUM_BINS_PER_DESCRIPTOR_X * NUM_BLOCKS_PER_DESCRIPTOR_Y
439  score += hog_descriptor[NUM_BINS_PER_DESCRIPTOR_X * NUM_BLOCKS_PER_DESCRIPTOR_Y];
440 
441  if(score > (float)THRESHOLD)
442  {
443  int id = atomic_inc(num_detection_windows);
444  if(id < MAX_NUM_DETECTION_WINDOWS)
445  {
446  dst[id].x = get_global_id(0) * DETECTION_WINDOW_STRIDE_WIDTH;
447  dst[id].y = get_global_id(1) * DETECTION_WINDOW_STRIDE_HEIGHT;
448  dst[id].width = DETECTION_WINDOW_WIDTH;
449  dst[id].height = DETECTION_WINDOW_HEIGHT;
450  dst[id].idx_class = IDX_CLASS;
451  dst[id].score = score;
452  }
453  }
454 }
455 #endif /* NUM_BLOCKS_PER_DESCRIPTOR_Y && NUM_BINS_PER_DESCRIPTOR_X && THRESHOLD && MAX_NUM_DETECTION_WINDOWS && IDX_CLASS &&
456  * DETECTION_WINDOW_STRIDE_WIDTH && DETECTION_WINDOW_STRIDE_HEIGHT && DETECTION_WINDOW_WIDTH && DETECTION_WINDOW_HEIGHT */
Detection window struct.
Definition: types.h:47
+
void hog_orientation_binning(const SimpleTensor< T > &mag, const SimpleTensor< U > &phase, SimpleTensor< V > &hog_space, const HOGInfo &hog_info)
DATA_TYPE sum(__global const DATA_TYPE *input)
Calculate sum of a vector.
-
#define IMAGE_DECLARATION(name)
Definition: helpers.h:57
+
SimpleTensor< uint8_t > phase(const SimpleTensor< T > &gx, const SimpleTensor< T > &gy, PhaseType phase_type)
Definition: Phase.cpp:35
+
#define IMAGE_DECLARATION(name)
Definition: helpers.h:68
+
SimpleTensor< T > hog_descriptor(const SimpleTensor< U > &src, BorderMode border_mode, U constant_border_value, const HOGInfo &hog_info)
-
#define CONVERT_TO_IMAGE_STRUCT(name)
Definition: helpers.h:93
-
Structure to hold Image information.
Definition: helpers.h:131
+
#define CONVERT_TO_IMAGE_STRUCT(name)
Definition: helpers.h:104
+ +
Structure to hold Image information.
Definition: helpers.h:142
-
__global uchar * ptr
Pointer to the starting postion of the buffer.
Definition: helpers.h:133
-
SimpleTensor< T > scale(const SimpleTensor< T > &in, float scale_x, float scale_y, InterpolationPolicy policy, BorderMode border_mode, T constant_border_value)
Definition: Scale.cpp:39
+
__global uchar * ptr
Pointer to the starting postion of the buffer.
Definition: helpers.h:144
+
std::vector< DetectionWindow > hog_detector(const SimpleTensor< T > &src, const std::vector< T > &descriptor, unsigned int max_num_detection_windows, const HOGInfo &hog_info, const Size2D &detection_window_stride, float threshold, uint16_t idx_class)
Definition: HOGDetector.cpp:48
+ + +
convolution configure & src
+
void hog_block_normalization(SimpleTensor< T > &desc, const SimpleTensor< T > &hog_space, const HOGInfo &hog_info)