1 /*M///////////////////////////////////////////////////////////////////////////////////////
3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
5 // By downloading, copying, installing or using the software you agree to this license.
6 // If you do not agree to this license, do not download, install,
7 // copy or use the software.
11 // For Open Source Computer Vision Library
13 // Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
14 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
15 // Third party copyrights are property of their respective owners.
18 // Wu Xinglong, wxl370@126.com
19 // Sen Liu, swjtuls1987@126.com
20 // Peng Xiao, pengxiao@outlook.com
21 // Erping Pang, erping@multicorewareinc.com
22 // Redistribution and use in source and binary forms, with or without modification,
23 // are permitted provided that the following conditions are met:
25 // * Redistribution's of source code must retain the above copyright notice,
26 // this list of conditions and the following disclaimer.
28 // * Redistribution's in binary form must reproduce the above copyright notice,
29 // this list of conditions and the following disclaimer in the documentation
30 // and/or other materials provided with the distribution.
32 // * The name of the copyright holders may not be used to endorse or promote products
33 // derived from this software without specific prior written permission.
35 // This software is provided by the copyright holders and contributors as is and
36 // any express or implied warranties, including, but not limited to, the implied
37 // warranties of merchantability and fitness for a particular purpose are disclaimed.
38 // In no event shall the Intel Corporation or contributors be liable for any direct,
39 // indirect, incidental, special, exemplary, or consequential damages
40 // (including, but not limited to, procurement of substitute goods or services;
41 // loss of use, data, or profits; or business interruption) however caused
42 // and on any theory of liability, whether in contract, strict liability,
43 // or tort (including negligence or otherwise) arising in any way out of
44 // the use of this software, even if advised of the possibility of such damage.
// Maximum number of rectangle features evaluated per Haar tree node.
48 #define CV_HAAR_FEATURE_MAX 3
// Element type of the squared-sum (integral of squared pixels) image.
50 typedef float sqsumtype;
// One node of a Haar decision tree, padded/aligned to 128 bytes so the
// detection kernel can load its fields with vector reads (int4/float4/float3).
// NOTE(review): the struct's opening '{' and closing '} GpuHidHaarTreeNode;'
// are not visible in this extraction -- lines appear to be missing here.
52 typedef struct __attribute__((aligned(128))) GpuHidHaarTreeNode
// Up to 3 feature rectangles; each row holds 4 packed coordinates
// (read as an int4 by the detection kernel).
54 int p[CV_HAAR_FEATURE_MAX][4] __attribute__((aligned(64)));
// Per-rectangle weights (loaded together with 'threshold' as one float4).
55 float weight[CV_HAAR_FEATURE_MAX] /*__attribute__((aligned (16)))*/;
// Node threshold, scaled by the window's variance normalization at runtime.
56 float threshold /*__attribute__((aligned (4)))*/;
// Stage-sum contributions (loaded as a float3 in the detection kernel).
57 float alpha[3] __attribute__((aligned(16)));
// Child indices; the detection kernel tests these to decide whether the
// tree continues past the root node.
58 int left __attribute__((aligned(4)));
59 int right __attribute__((aligned(4)));
// A weak classifier: a small tree of GpuHidHaarTreeNode plus its alphas.
// NOTE(review): opening '{' and closing '} GpuHidHaarClassifier;' are not
// visible in this extraction -- lines appear to be missing here.
62 typedef struct __attribute__((aligned(32))) GpuHidHaarClassifier
// Number of nodes in this classifier's tree.
64 int count __attribute__((aligned(4)));
// Host-side pointers; 8-byte aligned. Presumably unused on the device
// (the kernels index a flat node array instead) -- TODO confirm.
65 GpuHidHaarTreeNode *node __attribute__((aligned(8)));
66 float *alpha __attribute__((aligned(8)));
// One boosting stage: 'count' weak classifiers whose summed response is
// compared against 'threshold' to accept/reject a window early.
// NOTE(review): the opening '{' and the '}' before the typedef name are
// not visible in this extraction -- lines appear to be missing here.
69 typedef struct __attribute__((aligned(64))) GpuHidHaarStageClassifier
// Number of weak classifiers (tree nodes iterated) in this stage.
71 int count __attribute__((aligned(4)));
// Stage rejection threshold compared against the accumulated stage_sum.
72 float threshold __attribute__((aligned(4)));
// Nonzero when every node in the stage uses only two rectangles.
73 int two_rects __attribute__((aligned(4)));
// Padding to keep the struct layout stable between host and device.
74 int reserved0 __attribute__((aligned(8)));
75 int reserved1 __attribute__((aligned(8)));
76 int reserved2 __attribute__((aligned(8)));
77 int reserved3 __attribute__((aligned(8)));
79 GpuHidHaarStageClassifier;
// Top-level cascade descriptor mirrored between host and device.
// NOTE(review): the opening '{' is not visible in this extraction --
// a line appears to be missing here.
80 typedef struct __attribute__((aligned(64))) GpuHidHaarClassifierCascade
// Number of stages in the cascade.
82 int count __attribute__((aligned(4)));
// Nonzero when every stage is a depth-1 tree (stump), enabling the
// simpler evaluation path.
83 int is_stump_based __attribute__((aligned(4)));
84 int has_tilted_features __attribute__((aligned(4)));
85 int is_tree __attribute__((aligned(4)));
// pq0..pq3 / p0..p3: precomputed corner offsets into the squared-sum and
// sum integral images for the detection window -- TODO confirm exact
// correspondence against the host-side setup code.
86 int pq0 __attribute__((aligned(4)));
87 int pq1 __attribute__((aligned(4)));
88 int pq2 __attribute__((aligned(4)));
89 int pq3 __attribute__((aligned(4)));
90 int p0 __attribute__((aligned(4)));
91 int p1 __attribute__((aligned(4)));
92 int p2 __attribute__((aligned(4)));
93 int p3 __attribute__((aligned(4)));
// Reciprocal of the detection-window area, used to normalize sums.
94 float inv_window_area __attribute__((aligned(4)));
95 } GpuHidHaarClassifierCascade;
// Runs the Haar cascade over candidate windows for each precomputed scale
// (the "scaled2" variant: the image stays fixed and window coordinates are
// scaled). Surviving window positions are appended to 'candidate'.
//
// NOTE(review): this extraction is missing many lines -- the parameter list
// is truncated (names 'rows', 'cols', 'step', 'loopcount', 'end_stage',
// 'startnode', 'nodecount', 'p', 'info' are used below but their
// declarations are not visible), as are the kernel's opening '{', several
// local declarations ('result', 'nodecounter', 'scaleinfo1'), some control
// braces, and the closing '}'. Comments below describe only what the
// visible lines establish.
97 __kernel void gpuRunHaarClassifierCascade_scaled2(
98 global GpuHidHaarStageClassifier *stagecascadeptr,
100 global GpuHidHaarTreeNode *nodeptr,
// Integral image and squared integral image (read-only, non-aliasing).
101 global const int *restrict sum,
102 global const float *restrict sqsum,
// Output buffer of detected windows: (x, y, w, h) per int4 slot.
103 global int4 *candidate,
108 const int start_stage,
109 const int split_stage,
// Per-scale variance-correction factors.
113 global float *correction,
// Work-item / work-group geometry.
116 int grpszx = get_local_size(0);
117 int grpszy = get_local_size(1);
118 int grpnumx = get_num_groups(0);
119 int grpidx = get_group_id(0);
120 int lclidx = get_local_id(0);
121 int lclidy = get_local_id(1);
122 int lcl_id = mad24(lclidy, grpszx, lclidx);
// Shared state for collecting hits within one work-group.
123 __local int glboutindex[1];
124 __local int lclcount[1];
125 __local int lcloutindex[64];
// Each group owns a 256-slot region of 'candidate'; zero our 4 slots.
127 int outputoff = mul24(grpidx, 256);
128 candidate[outputoff + (lcl_id << 2)] = (int4)0;
129 candidate[outputoff + (lcl_id << 2) + 1] = (int4)0;
130 candidate[outputoff + (lcl_id << 2) + 2] = (int4)0;
131 candidate[outputoff + (lcl_id << 2) + 3] = (int4)0;
// Upper bound used to clamp every integral-image index below.
132 int max_idx = rows * cols - 1;
// One iteration per precomputed scale.
133 for (int scalei = 0; scalei < loopcount; scalei++)
// Unpack per-scale metadata: groups per row, total groups, scale factor.
136 scaleinfo1 = info[scalei];
137 int grpnumperline = (scaleinfo1.y & 0xffff0000) >> 16;
138 int totalgrp = scaleinfo1.y & 0xffff;
139 float factor = as_float(scaleinfo1.w);
140 float correction_t = correction[scalei];
// Window stride in pixels; never below 2.
141 float ystep = max(2.0f, factor);
// Groups are virtualized over all tiles of this scale.
143 for (int grploop = get_group_id(0); grploop < totalgrp; grploop += grpnumx)
145 int4 cascadeinfo = p[scalei];
146 int grpidy = grploop / grpnumperline;
147 int grpidx = grploop - mul24(grpidy, grpnumperline);
148 int ix = mad24(grpidx, grpszx, lclidx);
149 int iy = mad24(grpidy, grpszy, lclidy);
// Top-left corner of this work-item's detection window.
150 int x = round(ix * ystep);
151 int y = round(iy * ystep);
152 lcloutindex[lcl_id] = 0;
155 float mean, variance_norm_factor;
156 //if((ix < width) && (iy < height))
// Shift the precomputed window-corner offsets to this (x, y).
158 const int p_offset = mad24(y, step, x);
159 cascadeinfo.x += p_offset;
160 cascadeinfo.z += p_offset;
// Window mean via 4-corner integral-image lookup (indices clamped
// to stay in-bounds).
161 mean = (sum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.x), 0, max_idx)]
162 - sum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.z), 0, max_idx)] -
163 sum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.x), 0, max_idx)]
164 + sum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.z), 0, max_idx)])
// Same 4-corner lookup on the squared-sum image.
166 variance_norm_factor = sqsum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.x), 0, max_idx)]
167 - sqsum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.z), 0, max_idx)] -
168 sqsum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.x), 0, max_idx)]
169 + sqsum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.z), 0, max_idx)];
// var = E[x^2]*corr - mean^2; fall back to 1 when numerically negative.
170 variance_norm_factor = variance_norm_factor * correction_t - mean * mean;
171 variance_norm_factor = variance_norm_factor >= 0.f ? sqrt(variance_norm_factor) : 1.f;
// Nodes for this scale start at a per-scale offset in the flat array.
173 nodecounter = startnode + nodecount * scalei;
// Walk the stages; stop as soon as one rejects the window.
174 for (int stageloop = start_stage; (stageloop < end_stage) && result; stageloop++)
176 float stage_sum = 0.f;
177 int stagecount = stagecascadeptr[stageloop].count;
// NOTE(review): the loop increment of 'nodeloop'/'nodecounter' is
// not visible in this extraction.
178 for (int nodeloop = 0; nodeloop < stagecount;)
180 __global GpuHidHaarTreeNode *currentnodeptr = (nodeptr + nodecounter);
// Vector-load the node's 3 rectangles, weights and alphas.
181 int4 info1 = *(__global int4 *)(&(currentnodeptr->p[0][0]));
182 int4 info2 = *(__global int4 *)(&(currentnodeptr->p[1][0]));
183 int4 info3 = *(__global int4 *)(&(currentnodeptr->p[2][0]));
184 float4 w = *(__global float4 *)(&(currentnodeptr->weight[0]));
185 float3 alpha3 = *(__global float3 *)(&(currentnodeptr->alpha[0]));
// w.w is the node threshold (float4 load spans weight[0..2] +
// threshold); scale it by the window's variance normalization.
186 float nodethreshold = w.w * variance_norm_factor;
// Weighted sum of the (up to) 3 feature rectangles.
194 float classsum = (sum[clamp(mad24(info1.y, step, info1.x), 0, max_idx)]
195 - sum[clamp(mad24(info1.y, step, info1.z), 0, max_idx)] -
196 sum[clamp(mad24(info1.w, step, info1.x), 0, max_idx)]
197 + sum[clamp(mad24(info1.w, step, info1.z), 0, max_idx)]) * w.x;
198 classsum += (sum[clamp(mad24(info2.y, step, info2.x), 0, max_idx)]
199 - sum[clamp(mad24(info2.y, step, info2.z), 0, max_idx)] -
200 sum[clamp(mad24(info2.w, step, info2.x), 0, max_idx)]
201 + sum[clamp(mad24(info2.w, step, info2.z), 0, max_idx)]) * w.y;
202 classsum += (sum[clamp(mad24(info3.y, step, info3.x), 0, max_idx)]
203 - sum[clamp(mad24(info3.y, step, info3.z), 0, max_idx)] -
204 sum[clamp(mad24(info3.w, step, info3.x), 0, max_idx)]
205 + sum[clamp(mad24(info3.w, step, info3.z), 0, max_idx)]) * w.z;
207 bool passThres = classsum >= nodethreshold;
// Stump path: add left/right alpha depending on the threshold test.
210 stage_sum += passThres ? alpha3.y : alpha3.x;
// Tree path below; surrounding control lines are missing from this
// extraction, so the exact branching cannot be fully read here.
214 bool isRootNode = (nodecounter & 1) == 0;
217 if( (passThres && currentnodeptr->right) ||
218 (!passThres && currentnodeptr->left))
224 stage_sum += alpha3.x;
231 stage_sum += (passThres ? alpha3.z : alpha3.y);
// Window survives the stage only if the stage sum clears threshold.
237 result = (int)(stage_sum >= stagecascadeptr[stageloop].threshold);
240 barrier(CLK_LOCAL_MEM_FENCE);
// Surviving work-items enqueue their window into local memory.
244 int queueindex = atomic_inc(lclcount);
245 lcloutindex[queueindex] = (y << 16) | x;
247 barrier(CLK_LOCAL_MEM_FENCE);
248 int queuecount = lclcount[0];
// Drain the local queue into the group's region of 'candidate'.
250 if (lcl_id < queuecount)
252 int temp = lcloutindex[lcl_id];
253 int x = temp & 0xffff;
254 int y = (temp & (int)0xffff0000) >> 16;
255 temp = atomic_inc(glboutindex);
256 int4 candidate_result;
// Detected size = round(20 * factor) -- base window presumed 20x20
// (standard OpenCV Haar models) -- TODO confirm against host code.
257 candidate_result.zw = (int2)convert_int_rte(factor * 20.f);
258 candidate_result.x = x;
259 candidate_result.y = y;
261 int i = outputoff+temp+lcl_id;
// Only write into an empty slot (z==0 marks unused).
262 if(candidate[i].z == 0)
264 candidate[i] = candidate_result;
// NOTE(review): fallback path; the index adjustment between these two
// identical-looking checks is missing from this extraction.
270 if(candidate[i].z == 0)
272 candidate[i] = candidate_result;
279 barrier(CLK_LOCAL_MEM_FENCE);
// Rescales one Haar tree node per work-item: multiplies each feature
// rectangle of orinode[gid] by 'scale', rebalances weight[0] so the
// weighted rectangle areas still cancel, and writes the result to
// newnode[gid] with weights multiplied by 'weight_scale'.
// NOTE(review): the kernel's '{'/'}' and inner-loop braces are not visible
// in this extraction; 'nodenum' is unused in the visible lines.
284 __kernel void gpuscaleclassifier(global GpuHidHaarTreeNode *orinode, global GpuHidHaarTreeNode *newnode, float scale, float weight_scale, int nodenum)
286 int counter = get_global_id(0);
287 int tr_x[3], tr_y[3], tr_h[3], tr_w[3], i = 0;
// Copy the source node to private memory before transforming it.
288 GpuHidHaarTreeNode t1 = *(orinode + counter);
// Scale each rectangle's x, y, width, height with round-half-up.
291 for (i = 0; i < 3; i++)
293 tr_x[i] = (int)(t1.p[i][0] * scale + 0.5f);
294 tr_y[i] = (int)(t1.p[i][1] * scale + 0.5f);
295 tr_w[i] = (int)(t1.p[i][2] * scale + 0.5f);
296 tr_h[i] = (int)(t1.p[i][3] * scale + 0.5f);
// Recompute weight[0] so sum(weight[i] * area[i]) == 0 after the integer
// rounding of the rectangle sizes above.
299 t1.weight[0] = -(t1.weight[1] * tr_h[1] * tr_w[1] + t1.weight[2] * tr_h[2] * tr_w[2]) / (tr_h[0] * tr_w[0]);
// Store rectangles as corner coordinates (x1, y1, x2, y2) in the output.
303 for (i = 0; i < 3; i++)
305 newnode[counter].p[i][0] = tr_x[i];
306 newnode[counter].p[i][1] = tr_y[i];
307 newnode[counter].p[i][2] = tr_x[i] + tr_w[i];
308 newnode[counter].p[i][3] = tr_y[i] + tr_h[i];
309 newnode[counter].weight[i] = t1.weight[i] * weight_scale;
// Tree topology, threshold and alphas are copied through unchanged.
312 newnode[counter].left = t1.left;
313 newnode[counter].right = t1.right;
314 newnode[counter].threshold = t1.threshold;
315 newnode[counter].alpha[0] = t1.alpha[0];
316 newnode[counter].alpha[1] = t1.alpha[1];
317 newnode[counter].alpha[2] = t1.alpha[2];