modules/ocl/src/opencl/haarobjectdetect_scaled2.cl

   1 /*M///////////////////////////////////////////////////////////////////////////////////////
   2 //
   3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
   4 //
   5 //  By downloading, copying, installing or using the software you agree to this license.
   6 //  If you do not agree to this license, do not download, install,
   7 //  copy or use the software.
   8 //
   9 //
  10 //                           License Agreement
  11 //                For Open Source Computer Vision Library
  12 //
  13 // Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
  14 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
  15 // Third party copyrights are property of their respective owners.
  16 //
  17 // @Authors
  18 //    Wu Xinglong, wxl370@126.com
  19 //    Sen Liu, swjtuls1987@126.com
  20 //    Peng Xiao, pengxiao@outlook.com
  21 // Redistribution and use in source and binary forms, with or without modification,
  22 // are permitted provided that the following conditions are met:
  23 //
  24 //   * Redistribution's of source code must retain the above copyright notice,
  25 //     this list of conditions and the following disclaimer.
  26 //
  27 //   * Redistribution's in binary form must reproduce the above copyright notice,
  28 //     this list of conditions and the following disclaimer in the documentation
  29 //     and/or other oclMaterials provided with the distribution.
  30 //
  31 //   * The name of the copyright holders may not be used to endorse or promote products
  32 //     derived from this software without specific prior written permission.
  33 //
  34 // This software is provided by the copyright holders and contributors as is and
  35 // any express or implied warranties, including, but not limited to, the implied
  36 // warranties of merchantability and fitness for a particular purpose are disclaimed.
  37 // In no event shall the Intel Corporation or contributors be liable for any direct,
  38 // indirect, incidental, special, exemplary, or consequential damages
  39 // (including, but not limited to, procurement of substitute goods or services;
  40 // loss of use, data, or profits; or business interruption) however caused
  41 // and on any theory of liability, whether in contract, strict liability,
  42 // or tort (including negligence or otherwise) arising in any way out of
  43 // the use of this software, even if advised of the possibility of such damage.
  44 //
  45 //M*/
  46
  47 // Enter your kernel in this window
  48 //#pragma OPENCL EXTENSION cl_amd_printf:enable
  49 #define CV_HAAR_FEATURE_MAX           3
  50 typedef int   sumtype;
  51 typedef float sqsumtype;
  52
  53 typedef struct __attribute__((aligned(128))) GpuHidHaarTreeNode
  54 {
  55     int p[CV_HAAR_FEATURE_MAX][4] __attribute__((aligned(64)));
  56     float weight[CV_HAAR_FEATURE_MAX] /*__attribute__((aligned (16)))*/;
  57     float threshold /*__attribute__((aligned (4)))*/;
  58     float alpha[3] __attribute__((aligned(16)));
  59     int left __attribute__((aligned(4)));
  60     int right __attribute__((aligned(4)));
  61 }
  62 GpuHidHaarTreeNode;
  63 typedef struct __attribute__((aligned(32))) GpuHidHaarClassifier
  64 {
  65     int count __attribute__((aligned(4)));
  66     GpuHidHaarTreeNode *node __attribute__((aligned(8)));
  67     float *alpha __attribute__((aligned(8)));
  68 }
  69 GpuHidHaarClassifier;
  70 typedef struct __attribute__((aligned(64))) GpuHidHaarStageClassifier
  71 {
  72     int  count __attribute__((aligned(4)));
  73     float threshold __attribute__((aligned(4)));
  74     int two_rects __attribute__((aligned(4)));
  75     int reserved0 __attribute__((aligned(8)));
  76     int reserved1 __attribute__((aligned(8)));
  77     int reserved2 __attribute__((aligned(8)));
  78     int reserved3 __attribute__((aligned(8)));
  79 }
  80 GpuHidHaarStageClassifier;
  81 typedef struct __attribute__((aligned(64))) GpuHidHaarClassifierCascade
  82 {
  83     int  count __attribute__((aligned(4)));
  84     int  is_stump_based __attribute__((aligned(4)));
  85     int  has_tilted_features __attribute__((aligned(4)));
  86     int  is_tree __attribute__((aligned(4)));
  87     int pq0 __attribute__((aligned(4)));
  88     int pq1 __attribute__((aligned(4)));
  89     int pq2 __attribute__((aligned(4)));
  90     int pq3 __attribute__((aligned(4)));
  91     int p0 __attribute__((aligned(4)));
  92     int p1 __attribute__((aligned(4)));
  93     int p2 __attribute__((aligned(4)));
  94     int p3 __attribute__((aligned(4)));
  95     float inv_window_area __attribute__((aligned(4)));
  96 } GpuHidHaarClassifierCascade;
  97
  98 __kernel void gpuRunHaarClassifierCascade_scaled2(
  99     global GpuHidHaarStageClassifier *stagecascadeptr,
 100     global int4 *info,
 101     global GpuHidHaarTreeNode *nodeptr,
 102     global const int *restrict sum,
 103     global const float   *restrict sqsum,
 104     global int4 *candidate,
 105     const int rows,
 106     const int cols,
 107     const int step,
 108     const int loopcount,
 109     const int start_stage,
 110     const int split_stage,
 111     const int end_stage,
 112     const int startnode,
 113     global int4 *p,
 114     global float *correction,
 115     const int nodecount)
 116 {
 117     int grpszx = get_local_size(0);
 118     int grpszy = get_local_size(1);
 119     int grpnumx = get_num_groups(0);
 120     int grpidx = get_group_id(0);
 121     int lclidx = get_local_id(0);
 122     int lclidy = get_local_id(1);
 123     int lcl_sz = mul24(grpszx, grpszy);
 124     int lcl_id = mad24(lclidy, grpszx, lclidx);
 125     __local int glboutindex[1];
 126     __local int lclcount[1];
 127     __local int lcloutindex[64];
 128     glboutindex[0] = 0;
 129     int outputoff = mul24(grpidx, 256);
 130     candidate[outputoff + (lcl_id << 2)] = (int4)0;
 131     candidate[outputoff + (lcl_id << 2) + 1] = (int4)0;
 132     candidate[outputoff + (lcl_id << 2) + 2] = (int4)0;
 133     candidate[outputoff + (lcl_id << 2) + 3] = (int4)0;
 134     int max_idx = rows * cols - 1;
 135     for (int scalei = 0; scalei < loopcount; scalei++)
 136     {
 137         int4 scaleinfo1;
 138         scaleinfo1 = info[scalei];
 139         int width = (scaleinfo1.x & 0xffff0000) >> 16;
 140         int height = scaleinfo1.x & 0xffff;
 141         int grpnumperline = (scaleinfo1.y & 0xffff0000) >> 16;
 142         int totalgrp = scaleinfo1.y & 0xffff;
 143         float factor = as_float(scaleinfo1.w);
 144         float correction_t = correction[scalei];
 145         int ystep = (int)(max(2.0f, factor) + 0.5f);
 146
 147         for (int grploop = get_group_id(0); grploop < totalgrp; grploop += grpnumx)
 148         {
 149             int4 cascadeinfo = p[scalei];
 150             int grpidy = grploop / grpnumperline;
 151             int grpidx = grploop - mul24(grpidy, grpnumperline);
 152             int ix = mad24(grpidx, grpszx, lclidx);
 153             int iy = mad24(grpidy, grpszy, lclidy);
 154             int x = ix * ystep;
 155             int y = iy * ystep;
 156             lcloutindex[lcl_id] = 0;
 157             lclcount[0] = 0;
 158             int nodecounter;
 159             float mean, variance_norm_factor;
 160             //if((ix < width) && (iy < height))
 161             {
 162                 const int p_offset = mad24(y, step, x);
 163                 cascadeinfo.x += p_offset;
 164                 cascadeinfo.z += p_offset;
 165                 mean = (sum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.x), 0, max_idx)]
 166                 - sum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.z), 0, max_idx)] -
 167                         sum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.x), 0, max_idx)]
 168                 + sum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.z), 0, max_idx)])
 169                        * correction_t;
 170                 variance_norm_factor = sqsum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.x), 0, max_idx)]
 171                 - sqsum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.z), 0, max_idx)] -
 172                                        sqsum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.x), 0, max_idx)]
 173                 + sqsum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.z), 0, max_idx)];
 174                 variance_norm_factor = variance_norm_factor * correction_t - mean * mean;
 175                 variance_norm_factor = variance_norm_factor >= 0.f ? sqrt(variance_norm_factor) : 1.f;
 176                 bool result = true;
 177                 nodecounter = startnode + nodecount * scalei;
 178                 for (int stageloop = start_stage; (stageloop < end_stage) && result; stageloop++)
 179                 {
 180                     float stage_sum = 0.f;
 181                     int   stagecount = stagecascadeptr[stageloop].count;
 182                     for (int nodeloop = 0; nodeloop < stagecount;)
 183                     {
 184                         __global GpuHidHaarTreeNode *currentnodeptr = (nodeptr + nodecounter);
 185                         int4 info1 = *(__global int4 *)(&(currentnodeptr->p[0][0]));
 186                         int4 info2 = *(__global int4 *)(&(currentnodeptr->p[1][0]));
 187                         int4 info3 = *(__global int4 *)(&(currentnodeptr->p[2][0]));
 188                         float4 w = *(__global float4 *)(&(currentnodeptr->weight[0]));
 189                         float3 alpha3 = *(__global float3 *)(&(currentnodeptr->alpha[0]));
 190                         float nodethreshold  = w.w * variance_norm_factor;
 191
 192                         info1.x += p_offset;
 193                         info1.z += p_offset;
 194                         info2.x += p_offset;
 195                         info2.z += p_offset;
 196                         info3.x += p_offset;
 197                         info3.z += p_offset;
 198                         float classsum = (sum[clamp(mad24(info1.y, step, info1.x), 0, max_idx)]
 199                         - sum[clamp(mad24(info1.y, step, info1.z), 0, max_idx)] -
 200                                           sum[clamp(mad24(info1.w, step, info1.x), 0, max_idx)]
 201                         + sum[clamp(mad24(info1.w, step, info1.z), 0, max_idx)]) * w.x;
 202                         classsum += (sum[clamp(mad24(info2.y, step, info2.x), 0, max_idx)]
 203                         - sum[clamp(mad24(info2.y, step, info2.z), 0, max_idx)] -
 204                                      sum[clamp(mad24(info2.w, step, info2.x), 0, max_idx)]
 205                         + sum[clamp(mad24(info2.w, step, info2.z), 0, max_idx)]) * w.y;
 206                         classsum += (sum[clamp(mad24(info3.y, step, info3.x), 0, max_idx)]
 207                         - sum[clamp(mad24(info3.y, step, info3.z), 0, max_idx)] -
 208                                      sum[clamp(mad24(info3.w, step, info3.x), 0, max_idx)]
 209                         + sum[clamp(mad24(info3.w, step, info3.z), 0, max_idx)]) * w.z;
 210
 211                         bool passThres = classsum >= nodethreshold;
 212
 213 #if STUMP_BASED
 214                         stage_sum += passThres ? alpha3.y : alpha3.x;
 215                         nodecounter++;
 216                         nodeloop++;
 217 #else
 218                         bool isRootNode = (nodecounter & 1) == 0;
 219                         if(isRootNode)
 220                         {
 221                             if( (passThres && currentnodeptr->right) ||
 222                                 (!passThres && currentnodeptr->left))
 223                             {
 224                                 nodecounter ++;
 225                             }
 226                             else
 227                             {
 228                                 stage_sum += alpha3.x;
 229                                 nodecounter += 2;
 230                                 nodeloop ++;
 231                             }
 232                         }
 233                         else
 234                         {
 235                             stage_sum += (passThres ? alpha3.z : alpha3.y);
 236                             nodecounter ++;
 237                             nodeloop ++;
 238                         }
 239 #endif
 240                     }
 241                     result = (int)(stage_sum >= stagecascadeptr[stageloop].threshold);
 242                 }
 243
 244                 barrier(CLK_LOCAL_MEM_FENCE);
 245
 246                 if (result && (ix < width) && (iy < height))
 247                 {
 248                     int queueindex = atomic_inc(lclcount);
 249                     lcloutindex[queueindex] = (y << 16) | x;
 250                 }
 251                 barrier(CLK_LOCAL_MEM_FENCE);
 252                 int queuecount = lclcount[0];
 253
 254                 if (lcl_id < queuecount)
 255                 {
 256                     int temp = lcloutindex[lcl_id];
 257                     int x = temp & 0xffff;
 258                     int y = (temp & (int)0xffff0000) >> 16;
 259                     temp = atomic_inc(glboutindex);
 260                     int4 candidate_result;
 261                     candidate_result.zw = (int2)convert_int_rtn(factor * 20.f);
 262                     candidate_result.x = x;
 263                     candidate_result.y = y;
 264                     candidate[outputoff + temp + lcl_id] = candidate_result;
 265                 }
 266
 267                 barrier(CLK_LOCAL_MEM_FENCE);
 268             }
 269         }
 270     }
 271 }
 272 __kernel void gpuscaleclassifier(global GpuHidHaarTreeNode *orinode, global GpuHidHaarTreeNode *newnode, float scale, float weight_scale, int nodenum)
 273 {
 274     int counter = get_global_id(0);
 275     int tr_x[3], tr_y[3], tr_h[3], tr_w[3], i = 0;
 276     GpuHidHaarTreeNode t1 = *(orinode + counter);
 277 #pragma unroll
 278
 279     for (i = 0; i < 3; i++)
 280     {
 281         tr_x[i] = (int)(t1.p[i][0] * scale + 0.5f);
 282         tr_y[i] = (int)(t1.p[i][1] * scale + 0.5f);
 283         tr_w[i] = (int)(t1.p[i][2] * scale + 0.5f);
 284         tr_h[i] = (int)(t1.p[i][3] * scale + 0.5f);
 285     }
 286
 287     t1.weight[0] = t1.p[2][0] ? -(t1.weight[1] * tr_h[1] * tr_w[1] + t1.weight[2] * tr_h[2] * tr_w[2]) / (tr_h[0] * tr_w[0]) : -t1.weight[1] * tr_h[1] * tr_w[1] / (tr_h[0] * tr_w[0]);
 288     counter += nodenum;
 289 #pragma unroll
 290
 291     for (i = 0; i < 3; i++)
 292     {
 293         newnode[counter].p[i][0] = tr_x[i];
 294         newnode[counter].p[i][1] = tr_y[i];
 295         newnode[counter].p[i][2] = tr_x[i] + tr_w[i];
 296         newnode[counter].p[i][3] = tr_y[i] + tr_h[i];
 297         newnode[counter].weight[i] = t1.weight[i] * weight_scale;
 298     }
 299
 300     newnode[counter].left = t1.left;
 301     newnode[counter].right = t1.right;
 302     newnode[counter].threshold = t1.threshold;
 303     newnode[counter].alpha[0] = t1.alpha[0];
 304     newnode[counter].alpha[1] = t1.alpha[1];
 305     newnode[counter].alpha[2] = t1.alpha[2];
 306 }