Merge remote-tracking branch 'origin/2.4' into merge-2.4
[profile/ivi/opencv.git] / modules / ocl / src / opencl / haarobjectdetect_scaled2.cl
1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 //  By downloading, copying, installing or using the software you agree to this license.
6 //  If you do not agree to this license, do not download, install,
7 //  copy or use the software.
8 //
9 //
10 //                           License Agreement
11 //                For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
14 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
15 // Third party copyrights are property of their respective owners.
16 //
17 // @Authors
18 //    Wu Xinglong, wxl370@126.com
19 //    Sen Liu, swjtuls1987@126.com
20 //    Peng Xiao, pengxiao@outlook.com
21 //    Erping Pang, erping@multicorewareinc.com
22 // Redistribution and use in source and binary forms, with or without modification,
23 // are permitted provided that the following conditions are met:
24 //
25 //   * Redistribution's of source code must retain the above copyright notice,
26 //     this list of conditions and the following disclaimer.
27 //
28 //   * Redistribution's in binary form must reproduce the above copyright notice,
29 //     this list of conditions and the following disclaimer in the documentation
30 //     and/or other materials provided with the distribution.
31 //
32 //   * The name of the copyright holders may not be used to endorse or promote products
33 //     derived from this software without specific prior written permission.
34 //
35 // This software is provided by the copyright holders and contributors as is and
36 // any express or implied warranties, including, but not limited to, the implied
37 // warranties of merchantability and fitness for a particular purpose are disclaimed.
38 // In no event shall the Intel Corporation or contributors be liable for any direct,
39 // indirect, incidental, special, exemplary, or consequential damages
40 // (including, but not limited to, procurement of substitute goods or services;
41 // loss of use, data, or profits; or business interruption) however caused
42 // and on any theory of liability, whether in contract, strict liability,
43 // or tort (including negligence or otherwise) arising in any way out of
44 // the use of this software, even if advised of the possibility of such damage.
45 //
46 //M*/
47
48 #define CV_HAAR_FEATURE_MAX           3
49 typedef int   sumtype;
50 typedef float sqsumtype;
51
52 typedef struct __attribute__((aligned(128))) GpuHidHaarTreeNode
53 {
54     int p[CV_HAAR_FEATURE_MAX][4] __attribute__((aligned(64)));
55     float weight[CV_HAAR_FEATURE_MAX] /*__attribute__((aligned (16)))*/;
56     float threshold /*__attribute__((aligned (4)))*/;
57     float alpha[3] __attribute__((aligned(16)));
58     int left __attribute__((aligned(4)));
59     int right __attribute__((aligned(4)));
60 }
61 GpuHidHaarTreeNode;
62 typedef struct __attribute__((aligned(32))) GpuHidHaarClassifier
63 {
64     int count __attribute__((aligned(4)));
65     GpuHidHaarTreeNode *node __attribute__((aligned(8)));
66     float *alpha __attribute__((aligned(8)));
67 }
68 GpuHidHaarClassifier;
69 typedef struct __attribute__((aligned(64))) GpuHidHaarStageClassifier
70 {
71     int  count __attribute__((aligned(4)));
72     float threshold __attribute__((aligned(4)));
73     int two_rects __attribute__((aligned(4)));
74     int reserved0 __attribute__((aligned(8)));
75     int reserved1 __attribute__((aligned(8)));
76     int reserved2 __attribute__((aligned(8)));
77     int reserved3 __attribute__((aligned(8)));
78 }
79 GpuHidHaarStageClassifier;
80 typedef struct __attribute__((aligned(64))) GpuHidHaarClassifierCascade
81 {
82     int  count __attribute__((aligned(4)));
83     int  is_stump_based __attribute__((aligned(4)));
84     int  has_tilted_features __attribute__((aligned(4)));
85     int  is_tree __attribute__((aligned(4)));
86     int pq0 __attribute__((aligned(4)));
87     int pq1 __attribute__((aligned(4)));
88     int pq2 __attribute__((aligned(4)));
89     int pq3 __attribute__((aligned(4)));
90     int p0 __attribute__((aligned(4)));
91     int p1 __attribute__((aligned(4)));
92     int p2 __attribute__((aligned(4)));
93     int p3 __attribute__((aligned(4)));
94     float inv_window_area __attribute__((aligned(4)));
95 } GpuHidHaarClassifierCascade;
96
97 __kernel void gpuRunHaarClassifierCascade_scaled2(
98     global GpuHidHaarStageClassifier *stagecascadeptr,
99     global int4 *info,
100     global GpuHidHaarTreeNode *nodeptr,
101     global const int *restrict sum,
102     global const float   *restrict sqsum,
103     global int4 *candidate,
104     const int rows,
105     const int cols,
106     const int step,
107     const int loopcount,
108     const int start_stage,
109     const int split_stage,
110     const int end_stage,
111     const int startnode,
112     global int4 *p,
113     global float *correction,
114     const int nodecount)
115 {
116     int grpszx = get_local_size(0);
117     int grpszy = get_local_size(1);
118     int grpnumx = get_num_groups(0);
119     int grpidx = get_group_id(0);
120     int lclidx = get_local_id(0);
121     int lclidy = get_local_id(1);
122     int lcl_id = mad24(lclidy, grpszx, lclidx);
123     __local int glboutindex[1];
124     __local int lclcount[1];
125     __local int lcloutindex[64];
126     glboutindex[0] = 0;
127     int outputoff = mul24(grpidx, 256);
128     candidate[outputoff + (lcl_id << 2)] = (int4)0;
129     candidate[outputoff + (lcl_id << 2) + 1] = (int4)0;
130     candidate[outputoff + (lcl_id << 2) + 2] = (int4)0;
131     candidate[outputoff + (lcl_id << 2) + 3] = (int4)0;
132     int max_idx = rows * cols - 1;
133     for (int scalei = 0; scalei < loopcount; scalei++)
134     {
135         int4 scaleinfo1;
136         scaleinfo1 = info[scalei];
137         int grpnumperline = (scaleinfo1.y & 0xffff0000) >> 16;
138         int totalgrp = scaleinfo1.y & 0xffff;
139         float factor = as_float(scaleinfo1.w);
140         float correction_t = correction[scalei];
141         float ystep = max(2.0f, factor);
142
143         for (int grploop = get_group_id(0); grploop < totalgrp; grploop += grpnumx)
144         {
145             int4 cascadeinfo = p[scalei];
146             int grpidy = grploop / grpnumperline;
147             int grpidx = grploop - mul24(grpidy, grpnumperline);
148             int ix = mad24(grpidx, grpszx, lclidx);
149             int iy = mad24(grpidy, grpszy, lclidy);
150             int x = round(ix * ystep);
151             int y = round(iy * ystep);
152             lcloutindex[lcl_id] = 0;
153             lclcount[0] = 0;
154             int nodecounter;
155             float mean, variance_norm_factor;
156             //if((ix < width) && (iy < height))
157             {
158                 const int p_offset = mad24(y, step, x);
159                 cascadeinfo.x += p_offset;
160                 cascadeinfo.z += p_offset;
161                 mean = (sum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.x), 0, max_idx)]
162                 - sum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.z), 0, max_idx)] -
163                         sum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.x), 0, max_idx)]
164                 + sum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.z), 0, max_idx)])
165                        * correction_t;
166                 variance_norm_factor = sqsum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.x), 0, max_idx)]
167                 - sqsum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.z), 0, max_idx)] -
168                                        sqsum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.x), 0, max_idx)]
169                 + sqsum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.z), 0, max_idx)];
170                 variance_norm_factor = variance_norm_factor * correction_t - mean * mean;
171                 variance_norm_factor = variance_norm_factor >= 0.f ? sqrt(variance_norm_factor) : 1.f;
172                 bool result = true;
173                 nodecounter = startnode + nodecount * scalei;
174                 for (int stageloop = start_stage; (stageloop < end_stage) && result; stageloop++)
175                 {
176                     float stage_sum = 0.f;
177                     int   stagecount = stagecascadeptr[stageloop].count;
178                     for (int nodeloop = 0; nodeloop < stagecount;)
179                     {
180                         __global GpuHidHaarTreeNode *currentnodeptr = (nodeptr + nodecounter);
181                         int4 info1 = *(__global int4 *)(&(currentnodeptr->p[0][0]));
182                         int4 info2 = *(__global int4 *)(&(currentnodeptr->p[1][0]));
183                         int4 info3 = *(__global int4 *)(&(currentnodeptr->p[2][0]));
184                         float4 w = *(__global float4 *)(&(currentnodeptr->weight[0]));
185                         float3 alpha3 = *(__global float3 *)(&(currentnodeptr->alpha[0]));
186                         float nodethreshold  = w.w * variance_norm_factor;
187
188                         info1.x += p_offset;
189                         info1.z += p_offset;
190                         info2.x += p_offset;
191                         info2.z += p_offset;
192                         info3.x += p_offset;
193                         info3.z += p_offset;
194                         float classsum = (sum[clamp(mad24(info1.y, step, info1.x), 0, max_idx)]
195                         - sum[clamp(mad24(info1.y, step, info1.z), 0, max_idx)] -
196                                           sum[clamp(mad24(info1.w, step, info1.x), 0, max_idx)]
197                         + sum[clamp(mad24(info1.w, step, info1.z), 0, max_idx)]) * w.x;
198                         classsum += (sum[clamp(mad24(info2.y, step, info2.x), 0, max_idx)]
199                         - sum[clamp(mad24(info2.y, step, info2.z), 0, max_idx)] -
200                                      sum[clamp(mad24(info2.w, step, info2.x), 0, max_idx)]
201                         + sum[clamp(mad24(info2.w, step, info2.z), 0, max_idx)]) * w.y;
202                         classsum += (sum[clamp(mad24(info3.y, step, info3.x), 0, max_idx)]
203                         - sum[clamp(mad24(info3.y, step, info3.z), 0, max_idx)] -
204                                      sum[clamp(mad24(info3.w, step, info3.x), 0, max_idx)]
205                         + sum[clamp(mad24(info3.w, step, info3.z), 0, max_idx)]) * w.z;
206
207                         bool passThres = classsum >= nodethreshold;
208
209 #if STUMP_BASED
210                         stage_sum += passThres ? alpha3.y : alpha3.x;
211                         nodecounter++;
212                         nodeloop++;
213 #else
214                         bool isRootNode = (nodecounter & 1) == 0;
215                         if(isRootNode)
216                         {
217                             if( (passThres && currentnodeptr->right) ||
218                                 (!passThres && currentnodeptr->left))
219                             {
220                                 nodecounter ++;
221                             }
222                             else
223                             {
224                                 stage_sum += alpha3.x;
225                                 nodecounter += 2;
226                                 nodeloop ++;
227                             }
228                         }
229                         else
230                         {
231                             stage_sum += (passThres ? alpha3.z : alpha3.y);
232                             nodecounter ++;
233                             nodeloop ++;
234                         }
235 #endif
236                     }
237                     result = (int)(stage_sum >= stagecascadeptr[stageloop].threshold);
238                 }
239
240                 barrier(CLK_LOCAL_MEM_FENCE);
241
242                 if (result)
243                 {
244                     int queueindex = atomic_inc(lclcount);
245                     lcloutindex[queueindex] = (y << 16) | x;
246                 }
247                 barrier(CLK_LOCAL_MEM_FENCE);
248                 int queuecount = lclcount[0];
249
250                 if (lcl_id < queuecount)
251                 {
252                     int temp = lcloutindex[lcl_id];
253                     int x = temp & 0xffff;
254                     int y = (temp & (int)0xffff0000) >> 16;
255                     temp = atomic_inc(glboutindex);
256                     int4 candidate_result;
257                     candidate_result.zw = (int2)convert_int_rte(factor * 20.f);
258                     candidate_result.x = x;
259                     candidate_result.y = y;
260
261                     int i = outputoff+temp+lcl_id;
262                     if(candidate[i].z == 0)
263                     {
264                         candidate[i] = candidate_result;
265                     }
266                     else
267                     {
268                         for(i=i+1;;i++)
269                         {
270                             if(candidate[i].z == 0)
271                             {
272                                 candidate[i] = candidate_result;
273                                 break;
274                             }
275                         }
276                     }
277                 }
278
279                 barrier(CLK_LOCAL_MEM_FENCE);
280             }
281         }
282     }
283 }
284 __kernel void gpuscaleclassifier(global GpuHidHaarTreeNode *orinode, global GpuHidHaarTreeNode *newnode, float scale, float weight_scale, int nodenum)
285 {
286     int counter = get_global_id(0);
287     int tr_x[3], tr_y[3], tr_h[3], tr_w[3], i = 0;
288     GpuHidHaarTreeNode t1 = *(orinode + counter);
289
290     #pragma unroll
291     for (i = 0; i < 3; i++)
292     {
293         tr_x[i] = (int)(t1.p[i][0] * scale + 0.5f);
294         tr_y[i] = (int)(t1.p[i][1] * scale + 0.5f);
295         tr_w[i] = (int)(t1.p[i][2] * scale + 0.5f);
296         tr_h[i] = (int)(t1.p[i][3] * scale + 0.5f);
297     }
298
299     t1.weight[0] = -(t1.weight[1] * tr_h[1] * tr_w[1] + t1.weight[2] * tr_h[2] * tr_w[2]) / (tr_h[0] * tr_w[0]);
300     counter += nodenum;
301
302     #pragma unroll
303     for (i = 0; i < 3; i++)
304     {
305         newnode[counter].p[i][0] = tr_x[i];
306         newnode[counter].p[i][1] = tr_y[i];
307         newnode[counter].p[i][2] = tr_x[i] + tr_w[i];
308         newnode[counter].p[i][3] = tr_y[i] + tr_h[i];
309         newnode[counter].weight[i] = t1.weight[i] * weight_scale;
310     }
311
312     newnode[counter].left = t1.left;
313     newnode[counter].right = t1.right;
314     newnode[counter].threshold = t1.threshold;
315     newnode[counter].alpha[0] = t1.alpha[0];
316     newnode[counter].alpha[1] = t1.alpha[1];
317     newnode[counter].alpha[2] = t1.alpha[2];
318 }