modules/ocl/src/opencl/haarobjectdetect_scaled2.cl

   1 /*M///////////////////////////////////////////////////////////////////////////////////////
   2 //
   3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
   4 //
   5 //  By downloading, copying, installing or using the software you agree to this license.
   6 //  If you do not agree to this license, do not download, install,
   7 //  copy or use the software.
   8 //
   9 //
  10 //                           License Agreement
  11 //                For Open Source Computer Vision Library
  12 //
  13 // Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
  14 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
  15 // Third party copyrights are property of their respective owners.
  16 //
  17 // @Authors
  18 //    Wu Xinglong, wxl370@126.com
  19 //    Sen Liu, swjtuls1987@126.com
  20 //    Peng Xiao, pengxiao@outlook.com
  21 //    Erping Pang, erping@multicorewareinc.com
  22 // Redistribution and use in source and binary forms, with or without modification,
  23 // are permitted provided that the following conditions are met:
  24 //
  25 //   * Redistribution's of source code must retain the above copyright notice,
  26 //     this list of conditions and the following disclaimer.
  27 //
  28 //   * Redistribution's in binary form must reproduce the above copyright notice,
  29 //     this list of conditions and the following disclaimer in the documentation
  30 //     and/or other materials provided with the distribution.
  31 //
  32 //   * The name of the copyright holders may not be used to endorse or promote products
  33 //     derived from this software without specific prior written permission.
  34 //
  35 // This software is provided by the copyright holders and contributors as is and
  36 // any express or implied warranties, including, but not limited to, the implied
  37 // warranties of merchantability and fitness for a particular purpose are disclaimed.
  38 // In no event shall the Intel Corporation or contributors be liable for any direct,
  39 // indirect, incidental, special, exemplary, or consequential damages
  40 // (including, but not limited to, procurement of substitute goods or services;
  41 // loss of use, data, or profits; or business interruption) however caused
  42 // and on any theory of liability, whether in contract, strict liability,
  43 // or tort (including negligence or otherwise) arising in any way out of
  44 // the use of this software, even if advised of the possibility of such damage.
  45 //
  46 //M*/
  47
  48 #define CV_HAAR_FEATURE_MAX           3
  49 typedef int   sumtype;
  50 typedef float sqsumtype;
  51
  52 typedef struct __attribute__((aligned(128))) GpuHidHaarTreeNode
  53 {
  54     int p[CV_HAAR_FEATURE_MAX][4] __attribute__((aligned(64)));
  55     float weight[CV_HAAR_FEATURE_MAX] /*__attribute__((aligned (16)))*/;
  56     float threshold /*__attribute__((aligned (4)))*/;
  57     float alpha[3] __attribute__((aligned(16)));
  58     int left __attribute__((aligned(4)));
  59     int right __attribute__((aligned(4)));
  60 }
  61 GpuHidHaarTreeNode;
  62 typedef struct __attribute__((aligned(32))) GpuHidHaarClassifier
  63 {
  64     int count __attribute__((aligned(4)));
  65     GpuHidHaarTreeNode *node __attribute__((aligned(8)));
  66     float *alpha __attribute__((aligned(8)));
  67 }
  68 GpuHidHaarClassifier;
  69 typedef struct __attribute__((aligned(64))) GpuHidHaarStageClassifier
  70 {
  71     int  count __attribute__((aligned(4)));
  72     float threshold __attribute__((aligned(4)));
  73     int two_rects __attribute__((aligned(4)));
  74     int reserved0 __attribute__((aligned(8)));
  75     int reserved1 __attribute__((aligned(8)));
  76     int reserved2 __attribute__((aligned(8)));
  77     int reserved3 __attribute__((aligned(8)));
  78 }
  79 GpuHidHaarStageClassifier;
  80 typedef struct __attribute__((aligned(64))) GpuHidHaarClassifierCascade
  81 {
  82     int  count __attribute__((aligned(4)));
  83     int  is_stump_based __attribute__((aligned(4)));
  84     int  has_tilted_features __attribute__((aligned(4)));
  85     int  is_tree __attribute__((aligned(4)));
  86     int pq0 __attribute__((aligned(4)));
  87     int pq1 __attribute__((aligned(4)));
  88     int pq2 __attribute__((aligned(4)));
  89     int pq3 __attribute__((aligned(4)));
  90     int p0 __attribute__((aligned(4)));
  91     int p1 __attribute__((aligned(4)));
  92     int p2 __attribute__((aligned(4)));
  93     int p3 __attribute__((aligned(4)));
  94     float inv_window_area __attribute__((aligned(4)));
  95 } GpuHidHaarClassifierCascade;
  96
  97 __kernel void gpuRunHaarClassifierCascade_scaled2(
  98     global GpuHidHaarStageClassifier *stagecascadeptr,
  99     global int4 *info,
 100     global GpuHidHaarTreeNode *nodeptr,
 101     global const int *restrict sum,
 102     global const float   *restrict sqsum,
 103     global int4 *candidate,
 104     const int rows,
 105     const int cols,
 106     const int step,
 107     const int loopcount,
 108     const int start_stage,
 109     const int split_stage,
 110     const int end_stage,
 111     const int startnode,
 112     global int4 *p,
 113     global float *correction,
 114     const int nodecount)
 115 {
 116     int grpszx = get_local_size(0);
 117     int grpszy = get_local_size(1);
 118     int grpnumx = get_num_groups(0);
 119     int grpidx = get_group_id(0);
 120     int lclidx = get_local_id(0);
 121     int lclidy = get_local_id(1);
 122     int lcl_id = mad24(lclidy, grpszx, lclidx);
 123     __local int glboutindex[1];
 124     __local int lclcount[1];
 125     __local int lcloutindex[64];
 126     glboutindex[0] = 0;
 127     int outputoff = mul24(grpidx, 256);
 128     candidate[outputoff + (lcl_id << 2)] = (int4)0;
 129     candidate[outputoff + (lcl_id << 2) + 1] = (int4)0;
 130     candidate[outputoff + (lcl_id << 2) + 2] = (int4)0;
 131     candidate[outputoff + (lcl_id << 2) + 3] = (int4)0;
 132     int max_idx = rows * cols - 1;
 133     for (int scalei = 0; scalei < loopcount; scalei++)
 134     {
 135         int4 scaleinfo1;
 136         scaleinfo1 = info[scalei];
 137         int grpnumperline = (scaleinfo1.y & 0xffff0000) >> 16;
 138         int totalgrp = scaleinfo1.y & 0xffff;
 139         float factor = as_float(scaleinfo1.w);
 140         float correction_t = correction[scalei];
 141         float ystep = max(2.0f, factor);
 142
 143         for (int grploop = get_group_id(0); grploop < totalgrp; grploop += grpnumx)
 144         {
 145             int4 cascadeinfo = p[scalei];
 146             int grpidy = grploop / grpnumperline;
 147             int grpidx = grploop - mul24(grpidy, grpnumperline);
 148             int ix = mad24(grpidx, grpszx, lclidx);
 149             int iy = mad24(grpidy, grpszy, lclidy);
 150             int x = round(ix * ystep);
 151             int y = round(iy * ystep);
 152             lcloutindex[lcl_id] = 0;
 153             lclcount[0] = 0;
 154             int nodecounter;
 155             float mean, variance_norm_factor;
 156             //if((ix < width) && (iy < height))
 157             {
 158                 const int p_offset = mad24(y, step, x);
 159                 cascadeinfo.x += p_offset;
 160                 cascadeinfo.z += p_offset;
 161                 mean = (sum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.x), 0, max_idx)]
 162                 - sum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.z), 0, max_idx)] -
 163                         sum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.x), 0, max_idx)]
 164                 + sum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.z), 0, max_idx)])
 165                        * correction_t;
 166                 variance_norm_factor = sqsum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.x), 0, max_idx)]
 167                 - sqsum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.z), 0, max_idx)] -
 168                                        sqsum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.x), 0, max_idx)]
 169                 + sqsum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.z), 0, max_idx)];
 170                 variance_norm_factor = variance_norm_factor * correction_t - mean * mean;
 171                 variance_norm_factor = variance_norm_factor >= 0.f ? sqrt(variance_norm_factor) : 1.f;
 172                 bool result = true;
 173                 nodecounter = startnode + nodecount * scalei;
 174                 for (int stageloop = start_stage; (stageloop < end_stage) && result; stageloop++)
 175                 {
 176                     float stage_sum = 0.f;
 177                     int   stagecount = stagecascadeptr[stageloop].count;
 178                     for (int nodeloop = 0; nodeloop < stagecount;)
 179                     {
 180                         __global GpuHidHaarTreeNode *currentnodeptr = (nodeptr + nodecounter);
 181                         int4 info1 = *(__global int4 *)(&(currentnodeptr->p[0][0]));
 182                         int4 info2 = *(__global int4 *)(&(currentnodeptr->p[1][0]));
 183                         int4 info3 = *(__global int4 *)(&(currentnodeptr->p[2][0]));
 184                         float4 w = *(__global float4 *)(&(currentnodeptr->weight[0]));
 185                         float3 alpha3 = *(__global float3 *)(&(currentnodeptr->alpha[0]));
 186                         float nodethreshold  = w.w * variance_norm_factor;
 187
 188                         info1.x += p_offset;
 189                         info1.z += p_offset;
 190                         info2.x += p_offset;
 191                         info2.z += p_offset;
 192                         info3.x += p_offset;
 193                         info3.z += p_offset;
 194                         float classsum = (sum[clamp(mad24(info1.y, step, info1.x), 0, max_idx)]
 195                         - sum[clamp(mad24(info1.y, step, info1.z), 0, max_idx)] -
 196                                           sum[clamp(mad24(info1.w, step, info1.x), 0, max_idx)]
 197                         + sum[clamp(mad24(info1.w, step, info1.z), 0, max_idx)]) * w.x;
 198                         classsum += (sum[clamp(mad24(info2.y, step, info2.x), 0, max_idx)]
 199                         - sum[clamp(mad24(info2.y, step, info2.z), 0, max_idx)] -
 200                                      sum[clamp(mad24(info2.w, step, info2.x), 0, max_idx)]
 201                         + sum[clamp(mad24(info2.w, step, info2.z), 0, max_idx)]) * w.y;
 202                         classsum += (sum[clamp(mad24(info3.y, step, info3.x), 0, max_idx)]
 203                         - sum[clamp(mad24(info3.y, step, info3.z), 0, max_idx)] -
 204                                      sum[clamp(mad24(info3.w, step, info3.x), 0, max_idx)]
 205                         + sum[clamp(mad24(info3.w, step, info3.z), 0, max_idx)]) * w.z;
 206
 207                         bool passThres = classsum >= nodethreshold;
 208
 209 #if STUMP_BASED
 210                         stage_sum += passThres ? alpha3.y : alpha3.x;
 211                         nodecounter++;
 212                         nodeloop++;
 213 #else
 214                         bool isRootNode = (nodecounter & 1) == 0;
 215                         if(isRootNode)
 216                         {
 217                             if( (passThres && currentnodeptr->right) ||
 218                                 (!passThres && currentnodeptr->left))
 219                             {
 220                                 nodecounter ++;
 221                             }
 222                             else
 223                             {
 224                                 stage_sum += alpha3.x;
 225                                 nodecounter += 2;
 226                                 nodeloop ++;
 227                             }
 228                         }
 229                         else
 230                         {
 231                             stage_sum += (passThres ? alpha3.z : alpha3.y);
 232                             nodecounter ++;
 233                             nodeloop ++;
 234                         }
 235 #endif
 236                     }
 237                     result = (int)(stage_sum >= stagecascadeptr[stageloop].threshold);
 238                 }
 239
 240                 barrier(CLK_LOCAL_MEM_FENCE);
 241
 242                 if (result)
 243                 {
 244                     int queueindex = atomic_inc(lclcount);
 245                     lcloutindex[queueindex] = (y << 16) | x;
 246                 }
 247                 barrier(CLK_LOCAL_MEM_FENCE);
 248                 int queuecount = lclcount[0];
 249
 250                 if (lcl_id < queuecount)
 251                 {
 252                     int temp = lcloutindex[lcl_id];
 253                     int x = temp & 0xffff;
 254                     int y = (temp & (int)0xffff0000) >> 16;
 255                     temp = atomic_inc(glboutindex);
 256                     int4 candidate_result;
 257                     candidate_result.zw = (int2)convert_int_rte(factor * 20.f);
 258                     candidate_result.x = x;
 259                     candidate_result.y = y;
 260
 261                     int i = outputoff+temp+lcl_id;
 262                     if(candidate[i].z == 0)
 263                     {
 264                         candidate[i] = candidate_result;
 265                     }
 266                     else
 267                     {
 268                         for(i=i+1;;i++)
 269                         {
 270                             if(candidate[i].z == 0)
 271                             {
 272                                 candidate[i] = candidate_result;
 273                                 break;
 274                             }
 275                         }
 276                     }
 277                 }
 278
 279                 barrier(CLK_LOCAL_MEM_FENCE);
 280             }
 281         }
 282     }
 283 }
 284 __kernel void gpuscaleclassifier(global GpuHidHaarTreeNode *orinode, global GpuHidHaarTreeNode *newnode, float scale, float weight_scale, int nodenum)
 285 {
 286     int counter = get_global_id(0);
 287     int tr_x[3], tr_y[3], tr_h[3], tr_w[3], i = 0;
 288     GpuHidHaarTreeNode t1 = *(orinode + counter);
 289
 290     #pragma unroll
 291     for (i = 0; i < 3; i++)
 292     {
 293         tr_x[i] = (int)(t1.p[i][0] * scale + 0.5f);
 294         tr_y[i] = (int)(t1.p[i][1] * scale + 0.5f);
 295         tr_w[i] = (int)(t1.p[i][2] * scale + 0.5f);
 296         tr_h[i] = (int)(t1.p[i][3] * scale + 0.5f);
 297     }
 298
 299     t1.weight[0] = -(t1.weight[1] * tr_h[1] * tr_w[1] + t1.weight[2] * tr_h[2] * tr_w[2]) / (tr_h[0] * tr_w[0]);
 300     counter += nodenum;
 301
 302     #pragma unroll
 303     for (i = 0; i < 3; i++)
 304     {
 305         newnode[counter].p[i][0] = tr_x[i];
 306         newnode[counter].p[i][1] = tr_y[i];
 307         newnode[counter].p[i][2] = tr_x[i] + tr_w[i];
 308         newnode[counter].p[i][3] = tr_y[i] + tr_h[i];
 309         newnode[counter].weight[i] = t1.weight[i] * weight_scale;
 310     }
 311
 312     newnode[counter].left = t1.left;
 313     newnode[counter].right = t1.right;
 314     newnode[counter].threshold = t1.threshold;
 315     newnode[counter].alpha[0] = t1.alpha[0];
 316     newnode[counter].alpha[1] = t1.alpha[1];
 317     newnode[counter].alpha[2] = t1.alpha[2];
 318 }