Merge pull request #1663 from vpisarev:ocl_experiments3
[profile/ivi/opencv.git] / modules / ocl / src / opencl / haarobjectdetect_scaled2.cl
1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 //  By downloading, copying, installing or using the software you agree to this license.
6 //  If you do not agree to this license, do not download, install,
7 //  copy or use the software.
8 //
9 //
10 //                           License Agreement
11 //                For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
14 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
15 // Third party copyrights are property of their respective owners.
16 //
17 // @Authors
18 //    Wu Xinglong, wxl370@126.com
19 //    Sen Liu, swjtuls1987@126.com
20 //    Peng Xiao, pengxiao@outlook.com
21 // Redistribution and use in source and binary forms, with or without modification,
22 // are permitted provided that the following conditions are met:
23 //
24 //   * Redistribution's of source code must retain the above copyright notice,
25 //     this list of conditions and the following disclaimer.
26 //
27 //   * Redistribution's in binary form must reproduce the above copyright notice,
28 //     this list of conditions and the following disclaimer in the documentation
29 //     and/or other oclMaterials provided with the distribution.
30 //
31 //   * The name of the copyright holders may not be used to endorse or promote products
32 //     derived from this software without specific prior written permission.
33 //
34 // This software is provided by the copyright holders and contributors as is and
35 // any express or implied warranties, including, but not limited to, the implied
36 // warranties of merchantability and fitness for a particular purpose are disclaimed.
37 // In no event shall the Intel Corporation or contributors be liable for any direct,
38 // indirect, incidental, special, exemplary, or consequential damages
39 // (including, but not limited to, procurement of substitute goods or services;
40 // loss of use, data, or profits; or business interruption) however caused
41 // and on any theory of liability, whether in contract, strict liability,
42 // or tort (including negligence or otherwise) arising in any way out of
43 // the use of this software, even if advised of the possibility of such damage.
44 //
45 //M*/
46
47 // Enter your kernel in this window
48 //#pragma OPENCL EXTENSION cl_amd_printf:enable
49 #define CV_HAAR_FEATURE_MAX           3
50 typedef int   sumtype;
51 typedef float sqsumtype;
52
53 typedef struct __attribute__((aligned(128))) GpuHidHaarTreeNode
54 {
55     int p[CV_HAAR_FEATURE_MAX][4] __attribute__((aligned(64)));
56     float weight[CV_HAAR_FEATURE_MAX] /*__attribute__((aligned (16)))*/;
57     float threshold /*__attribute__((aligned (4)))*/;
58     float alpha[3] __attribute__((aligned(16)));
59     int left __attribute__((aligned(4)));
60     int right __attribute__((aligned(4)));
61 }
62 GpuHidHaarTreeNode;
63 typedef struct __attribute__((aligned(32))) GpuHidHaarClassifier
64 {
65     int count __attribute__((aligned(4)));
66     GpuHidHaarTreeNode *node __attribute__((aligned(8)));
67     float *alpha __attribute__((aligned(8)));
68 }
69 GpuHidHaarClassifier;
70 typedef struct __attribute__((aligned(64))) GpuHidHaarStageClassifier
71 {
72     int  count __attribute__((aligned(4)));
73     float threshold __attribute__((aligned(4)));
74     int two_rects __attribute__((aligned(4)));
75     int reserved0 __attribute__((aligned(8)));
76     int reserved1 __attribute__((aligned(8)));
77     int reserved2 __attribute__((aligned(8)));
78     int reserved3 __attribute__((aligned(8)));
79 }
80 GpuHidHaarStageClassifier;
81 typedef struct __attribute__((aligned(64))) GpuHidHaarClassifierCascade
82 {
83     int  count __attribute__((aligned(4)));
84     int  is_stump_based __attribute__((aligned(4)));
85     int  has_tilted_features __attribute__((aligned(4)));
86     int  is_tree __attribute__((aligned(4)));
87     int pq0 __attribute__((aligned(4)));
88     int pq1 __attribute__((aligned(4)));
89     int pq2 __attribute__((aligned(4)));
90     int pq3 __attribute__((aligned(4)));
91     int p0 __attribute__((aligned(4)));
92     int p1 __attribute__((aligned(4)));
93     int p2 __attribute__((aligned(4)));
94     int p3 __attribute__((aligned(4)));
95     float inv_window_area __attribute__((aligned(4)));
96 } GpuHidHaarClassifierCascade;
97
98 __kernel void gpuRunHaarClassifierCascade_scaled2(
99     global GpuHidHaarStageClassifier *stagecascadeptr,
100     global int4 *info,
101     global GpuHidHaarTreeNode *nodeptr,
102     global const int *restrict sum,
103     global const float   *restrict sqsum,
104     global int4 *candidate,
105     const int rows,
106     const int cols,
107     const int step,
108     const int loopcount,
109     const int start_stage,
110     const int split_stage,
111     const int end_stage,
112     const int startnode,
113     global int4 *p,
114     global float *correction,
115     const int nodecount)
116 {
117     int grpszx = get_local_size(0);
118     int grpszy = get_local_size(1);
119     int grpnumx = get_num_groups(0);
120     int grpidx = get_group_id(0);
121     int lclidx = get_local_id(0);
122     int lclidy = get_local_id(1);
123     int lcl_sz = mul24(grpszx, grpszy);
124     int lcl_id = mad24(lclidy, grpszx, lclidx);
125     __local int glboutindex[1];
126     __local int lclcount[1];
127     __local int lcloutindex[64];
128     glboutindex[0] = 0;
129     int outputoff = mul24(grpidx, 256);
130     candidate[outputoff + (lcl_id << 2)] = (int4)0;
131     candidate[outputoff + (lcl_id << 2) + 1] = (int4)0;
132     candidate[outputoff + (lcl_id << 2) + 2] = (int4)0;
133     candidate[outputoff + (lcl_id << 2) + 3] = (int4)0;
134     int max_idx = rows * cols - 1;
135     for (int scalei = 0; scalei < loopcount; scalei++)
136     {
137         int4 scaleinfo1;
138         scaleinfo1 = info[scalei];
139         int width = (scaleinfo1.x & 0xffff0000) >> 16;
140         int height = scaleinfo1.x & 0xffff;
141         int grpnumperline = (scaleinfo1.y & 0xffff0000) >> 16;
142         int totalgrp = scaleinfo1.y & 0xffff;
143         float factor = as_float(scaleinfo1.w);
144         float correction_t = correction[scalei];
145         int ystep = (int)(max(2.0f, factor) + 0.5f);
146
147         for (int grploop = get_group_id(0); grploop < totalgrp; grploop += grpnumx)
148         {
149             int4 cascadeinfo = p[scalei];
150             int grpidy = grploop / grpnumperline;
151             int grpidx = grploop - mul24(grpidy, grpnumperline);
152             int ix = mad24(grpidx, grpszx, lclidx);
153             int iy = mad24(grpidy, grpszy, lclidy);
154             int x = ix * ystep;
155             int y = iy * ystep;
156             lcloutindex[lcl_id] = 0;
157             lclcount[0] = 0;
158             int nodecounter;
159             float mean, variance_norm_factor;
160             //if((ix < width) && (iy < height))
161             {
162                 const int p_offset = mad24(y, step, x);
163                 cascadeinfo.x += p_offset;
164                 cascadeinfo.z += p_offset;
165                 mean = (sum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.x), 0, max_idx)]
166                 - sum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.z), 0, max_idx)] -
167                         sum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.x), 0, max_idx)]
168                 + sum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.z), 0, max_idx)])
169                        * correction_t;
170                 variance_norm_factor = sqsum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.x), 0, max_idx)]
171                 - sqsum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.z), 0, max_idx)] -
172                                        sqsum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.x), 0, max_idx)]
173                 + sqsum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.z), 0, max_idx)];
174                 variance_norm_factor = variance_norm_factor * correction_t - mean * mean;
175                 variance_norm_factor = variance_norm_factor >= 0.f ? sqrt(variance_norm_factor) : 1.f;
176                 bool result = true;
177                 nodecounter = startnode + nodecount * scalei;
178                 for (int stageloop = start_stage; (stageloop < end_stage) && result; stageloop++)
179                 {
180                     float stage_sum = 0.f;
181                     int   stagecount = stagecascadeptr[stageloop].count;
182                     for (int nodeloop = 0; nodeloop < stagecount;)
183                     {
184                         __global GpuHidHaarTreeNode *currentnodeptr = (nodeptr + nodecounter);
185                         int4 info1 = *(__global int4 *)(&(currentnodeptr->p[0][0]));
186                         int4 info2 = *(__global int4 *)(&(currentnodeptr->p[1][0]));
187                         int4 info3 = *(__global int4 *)(&(currentnodeptr->p[2][0]));
188                         float4 w = *(__global float4 *)(&(currentnodeptr->weight[0]));
189                         float3 alpha3 = *(__global float3 *)(&(currentnodeptr->alpha[0]));
190                         float nodethreshold  = w.w * variance_norm_factor;
191
192                         info1.x += p_offset;
193                         info1.z += p_offset;
194                         info2.x += p_offset;
195                         info2.z += p_offset;
196                         info3.x += p_offset;
197                         info3.z += p_offset;
198                         float classsum = (sum[clamp(mad24(info1.y, step, info1.x), 0, max_idx)]
199                         - sum[clamp(mad24(info1.y, step, info1.z), 0, max_idx)] -
200                                           sum[clamp(mad24(info1.w, step, info1.x), 0, max_idx)]
201                         + sum[clamp(mad24(info1.w, step, info1.z), 0, max_idx)]) * w.x;
202                         classsum += (sum[clamp(mad24(info2.y, step, info2.x), 0, max_idx)]
203                         - sum[clamp(mad24(info2.y, step, info2.z), 0, max_idx)] -
204                                      sum[clamp(mad24(info2.w, step, info2.x), 0, max_idx)]
205                         + sum[clamp(mad24(info2.w, step, info2.z), 0, max_idx)]) * w.y;
206                         classsum += (sum[clamp(mad24(info3.y, step, info3.x), 0, max_idx)]
207                         - sum[clamp(mad24(info3.y, step, info3.z), 0, max_idx)] -
208                                      sum[clamp(mad24(info3.w, step, info3.x), 0, max_idx)]
209                         + sum[clamp(mad24(info3.w, step, info3.z), 0, max_idx)]) * w.z;
210
211                         bool passThres = classsum >= nodethreshold;
212
213 #if STUMP_BASED
214                         stage_sum += passThres ? alpha3.y : alpha3.x;
215                         nodecounter++;
216                         nodeloop++;
217 #else
218                         bool isRootNode = (nodecounter & 1) == 0;
219                         if(isRootNode)
220                         {
221                             if( (passThres && currentnodeptr->right) ||
222                                 (!passThres && currentnodeptr->left))
223                             {
224                                 nodecounter ++;
225                             }
226                             else
227                             {
228                                 stage_sum += alpha3.x;
229                                 nodecounter += 2;
230                                 nodeloop ++;
231                             }
232                         }
233                         else
234                         {
235                             stage_sum += (passThres ? alpha3.z : alpha3.y);
236                             nodecounter ++;
237                             nodeloop ++;
238                         }
239 #endif
240                     }
241                     result = (int)(stage_sum >= stagecascadeptr[stageloop].threshold);
242                 }
243
244                 barrier(CLK_LOCAL_MEM_FENCE);
245
246                 if (result && (ix < width) && (iy < height))
247                 {
248                     int queueindex = atomic_inc(lclcount);
249                     lcloutindex[queueindex] = (y << 16) | x;
250                 }
251                 barrier(CLK_LOCAL_MEM_FENCE);
252                 int queuecount = lclcount[0];
253
254                 if (lcl_id < queuecount)
255                 {
256                     int temp = lcloutindex[lcl_id];
257                     int x = temp & 0xffff;
258                     int y = (temp & (int)0xffff0000) >> 16;
259                     temp = atomic_inc(glboutindex);
260                     int4 candidate_result;
261                     candidate_result.zw = (int2)convert_int_rtn(factor * 20.f);
262                     candidate_result.x = x;
263                     candidate_result.y = y;
264                     candidate[outputoff + temp + lcl_id] = candidate_result;
265                 }
266
267                 barrier(CLK_LOCAL_MEM_FENCE);
268             }
269         }
270     }
271 }
272 __kernel void gpuscaleclassifier(global GpuHidHaarTreeNode *orinode, global GpuHidHaarTreeNode *newnode, float scale, float weight_scale, int nodenum)
273 {
274     int counter = get_global_id(0);
275     int tr_x[3], tr_y[3], tr_h[3], tr_w[3], i = 0;
276     GpuHidHaarTreeNode t1 = *(orinode + counter);
277 #pragma unroll
278
279     for (i = 0; i < 3; i++)
280     {
281         tr_x[i] = (int)(t1.p[i][0] * scale + 0.5f);
282         tr_y[i] = (int)(t1.p[i][1] * scale + 0.5f);
283         tr_w[i] = (int)(t1.p[i][2] * scale + 0.5f);
284         tr_h[i] = (int)(t1.p[i][3] * scale + 0.5f);
285     }
286
287     t1.weight[0] = t1.p[2][0] ? -(t1.weight[1] * tr_h[1] * tr_w[1] + t1.weight[2] * tr_h[2] * tr_w[2]) / (tr_h[0] * tr_w[0]) : -t1.weight[1] * tr_h[1] * tr_w[1] / (tr_h[0] * tr_w[0]);
288     counter += nodenum;
289 #pragma unroll
290
291     for (i = 0; i < 3; i++)
292     {
293         newnode[counter].p[i][0] = tr_x[i];
294         newnode[counter].p[i][1] = tr_y[i];
295         newnode[counter].p[i][2] = tr_x[i] + tr_w[i];
296         newnode[counter].p[i][3] = tr_y[i] + tr_h[i];
297         newnode[counter].weight[i] = t1.weight[i] * weight_scale;
298     }
299
300     newnode[counter].left = t1.left;
301     newnode[counter].right = t1.right;
302     newnode[counter].threshold = t1.threshold;
303     newnode[counter].alpha[0] = t1.alpha[0];
304     newnode[counter].alpha[1] = t1.alpha[1];
305     newnode[counter].alpha[2] = t1.alpha[2];
306 }