From 3747d2643f3fdd3e196fecb0785723569369947d Mon Sep 17 00:00:00 2001 From: Andrey Pavlenko Date: Fri, 28 Mar 2014 16:08:11 +0400 Subject: [PATCH] Revert pull request #1929 from @alalek "ocl: added workaround into Haar kernels" This reverts commit 3dcddad88aa13b729313939648c29f420a9f8054. Conflicts: modules/ocl/src/opencl/haarobjectdetect.cl --- modules/ocl/src/opencl/haarobjectdetect.cl | 88 ++++++++---------- modules/ocl/src/opencl/haarobjectdetect_scaled2.cl | 101 ++++++++++----------- 2 files changed, 88 insertions(+), 101 deletions(-) diff --git a/modules/ocl/src/opencl/haarobjectdetect.cl b/modules/ocl/src/opencl/haarobjectdetect.cl index 39d11b0..2b834c2 100644 --- a/modules/ocl/src/opencl/haarobjectdetect.cl +++ b/modules/ocl/src/opencl/haarobjectdetect.cl @@ -62,13 +62,13 @@ typedef struct __attribute__((aligned (128) )) GpuHidHaarTreeNode GpuHidHaarTreeNode; -//typedef struct __attribute__((aligned (32))) GpuHidHaarClassifier -//{ -// int count __attribute__((aligned (4))); -// GpuHidHaarTreeNode* node __attribute__((aligned (8))); -// float* alpha __attribute__((aligned (8))); -//} -//GpuHidHaarClassifier; +typedef struct __attribute__((aligned (32))) GpuHidHaarClassifier +{ + int count __attribute__((aligned (4))); + GpuHidHaarTreeNode* node __attribute__((aligned (8))); + float* alpha __attribute__((aligned (8))); +} +GpuHidHaarClassifier; typedef struct __attribute__((aligned (64))) GpuHidHaarStageClassifier @@ -84,22 +84,22 @@ typedef struct __attribute__((aligned (64))) GpuHidHaarStageClassifier GpuHidHaarStageClassifier; -//typedef struct __attribute__((aligned (64))) GpuHidHaarClassifierCascade -//{ -// int count __attribute__((aligned (4))); -// int is_stump_based __attribute__((aligned (4))); -// int has_tilted_features __attribute__((aligned (4))); -// int is_tree __attribute__((aligned (4))); -// int pq0 __attribute__((aligned (4))); -// int pq1 __attribute__((aligned (4))); -// int pq2 __attribute__((aligned (4))); -// int pq3 __attribute__((aligned (4))); -// int p0 __attribute__((aligned (4))); -// int p1 __attribute__((aligned (4))); -// int p2 __attribute__((aligned (4))); -// int p3 __attribute__((aligned (4))); -// float inv_window_area __attribute__((aligned (4))); -//} GpuHidHaarClassifierCascade; +typedef struct __attribute__((aligned (64))) GpuHidHaarClassifierCascade +{ + int count __attribute__((aligned (4))); + int is_stump_based __attribute__((aligned (4))); + int has_tilted_features __attribute__((aligned (4))); + int is_tree __attribute__((aligned (4))); + int pq0 __attribute__((aligned (4))); + int pq1 __attribute__((aligned (4))); + int pq2 __attribute__((aligned (4))); + int pq3 __attribute__((aligned (4))); + int p0 __attribute__((aligned (4))); + int p1 __attribute__((aligned (4))); + int p2 __attribute__((aligned (4))); + int p3 __attribute__((aligned (4))); + float inv_window_area __attribute__((aligned (4))); +} GpuHidHaarClassifierCascade; #ifdef PACKED_CLASSIFIER @@ -196,12 +196,10 @@ __kernel void gpuRunHaarClassifierCascadePacked( for(int stageloop = start_stage; (stageloop < end_stage) && result; stageloop++ ) {// iterate until candidate is valid float stage_sum = 0.0f; - __global GpuHidHaarStageClassifier* stageinfo = (__global GpuHidHaarStageClassifier*) - ((__global uchar*)stagecascadeptr+stageloop*sizeof(GpuHidHaarStageClassifier)); - int lcl_off = (yl*DATA_SIZE_X)+(xl); - int stagecount = stageinfo->count; - float stagethreshold = stageinfo->threshold; - for(int nodeloop = 0; nodeloop < stagecount; nodecounter++,nodeloop++ ) + int2 stageinfo = *(global int2*)(stagecascadeptr+stageloop); + float stagethreshold = as_float(stageinfo.y); + int lcl_off = (lid_y*DATA_SIZE_X)+(lid_x); + for(int nodeloop = 0; nodeloop < stageinfo.x; nodecounter++,nodeloop++ ) { // simple macro to extract shorts from int #define M0(_t) ((_t)&0xFFFF) @@ -357,17 +355,14 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa variance_norm_factor = variance_norm_factor * correction - mean * mean; variance_norm_factor = variance_norm_factor >=0.f ? sqrt(variance_norm_factor) : 1.f; - for(int stageloop = start_stage; (stageloop < split_stage) && result; stageloop++ ) + for(int stageloop = start_stage; (stageloop < split_stage) && result; stageloop++ ) { float stage_sum = 0.f; - __global GpuHidHaarStageClassifier* stageinfo = (__global GpuHidHaarStageClassifier*) - ((__global uchar*)stagecascadeptr+stageloop*sizeof(GpuHidHaarStageClassifier)); - int stagecount = stageinfo->count; - float stagethreshold = stageinfo->threshold; - for(int nodeloop = 0; nodeloop < stagecount; ) + int2 stageinfo = *(global int2*)(stagecascadeptr+stageloop); + float stagethreshold = as_float(stageinfo.y); + for(int nodeloop = 0; nodeloop < stageinfo.x; ) { - __global GpuHidHaarTreeNode* currentnodeptr = (__global GpuHidHaarTreeNode*) - (((__global uchar*)nodeptr) + nodecounter * sizeof(GpuHidHaarTreeNode)); + __global GpuHidHaarTreeNode* currentnodeptr = (nodeptr + nodecounter); int4 info1 = *(__global int4*)(&(currentnodeptr->p[0][0])); int4 info2 = *(__global int4*)(&(currentnodeptr->p[1][0])); @@ -423,7 +418,7 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa #endif } - result = (stage_sum >= stagethreshold) ? 1 : 0; + result = (stage_sum >= stagethreshold); } if(factor < 2) { @@ -452,17 +447,14 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa lclcount[0]=0; barrier(CLK_LOCAL_MEM_FENCE); - //int2 stageinfo = *(global int2*)(stagecascadeptr+stageloop); - __global GpuHidHaarStageClassifier* stageinfo = (__global GpuHidHaarStageClassifier*) - ((__global uchar*)stagecascadeptr+stageloop*sizeof(GpuHidHaarStageClassifier)); - int stagecount = stageinfo->count; - float stagethreshold = stageinfo->threshold; + int2 stageinfo = *(global int2*)(stagecascadeptr+stageloop); + float stagethreshold = as_float(stageinfo.y); int perfscale = queuecount > 4 ? 3 : 2; int queuecount_loop = (queuecount + (1<> perfscale; int lcl_compute_win = lcl_sz >> perfscale; int lcl_compute_win_id = (lcl_id >>(6-perfscale)); - int lcl_loops = (stagecount + lcl_compute_win -1) >> (6-perfscale); + int lcl_loops = (stageinfo.x + lcl_compute_win -1) >> (6-perfscale); int lcl_compute_id = lcl_id - (lcl_compute_win_id << (6-perfscale)); for(int queueloop=0; queueloopp[0][0])); int4 info2 = *(__global int4*)(&(currentnodeptr->p[1][0])); @@ -557,7 +549,7 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa queuecount = lclcount[0]; barrier(CLK_LOCAL_MEM_FENCE); - nodecounter += stagecount; + nodecounter += stageinfo.x; }//end for(int stageloop = splitstage; stageloop< endstage && queuecount>0;stageloop++) if(lcl_id> 16; int totalgrp = scaleinfo1.y & 0xffff; float factor = as_float(scaleinfo1.w); @@ -173,18 +174,15 @@ __kernel void gpuRunHaarClassifierCascade_scaled2( for (int stageloop = start_stage; (stageloop < end_stage) && result; stageloop++) { float stage_sum = 0.f; - __global GpuHidHaarStageClassifier* stageinfo = (__global GpuHidHaarStageClassifier*) - (((__global uchar*)stagecascadeptr_)+stageloop*sizeof(GpuHidHaarStageClassifier)); - int stagecount = stageinfo->count; + int stagecount = stagecascadeptr[stageloop].count; for (int nodeloop = 0; nodeloop < stagecount;) { - __global GpuHidHaarTreeNode* currentnodeptr = (__global GpuHidHaarTreeNode*) - (((__global uchar*)nodeptr_) + nodecounter * sizeof(GpuHidHaarTreeNode)); + __global GpuHidHaarTreeNode *currentnodeptr = (nodeptr + nodecounter); int4 info1 = *(__global int4 *)(&(currentnodeptr->p[0][0])); int4 info2 = *(__global int4 *)(&(currentnodeptr->p[1][0])); int4 info3 = *(__global int4 *)(&(currentnodeptr->p[2][0])); float4 w = *(__global float4 *)(&(currentnodeptr->weight[0])); - float3 alpha3 = *(__global float3*)(&(currentnodeptr->alpha[0])); + float3 alpha3 = *(__global float3 *)(&(currentnodeptr->alpha[0])); float nodethreshold = w.w * variance_norm_factor; info1.x += p_offset; @@ -206,7 +204,7 @@ __kernel void gpuRunHaarClassifierCascade_scaled2( sum[clamp(mad24(info3.w, step, info3.x), 0, max_idx)] + sum[clamp(mad24(info3.w, step, info3.z), 0, max_idx)]) * w.z; - bool passThres = (classsum >= nodethreshold) ? 1 : 0; + bool passThres = classsum >= nodethreshold; #if STUMP_BASED stage_sum += passThres ? alpha3.y : alpha3.x; @@ -236,8 +234,7 @@ __kernel void gpuRunHaarClassifierCascade_scaled2( } #endif } - - result = (stage_sum >= stageinfo->threshold) ? 1 : 0; + result = (int)(stage_sum >= stagecascadeptr[stageloop].threshold); } barrier(CLK_LOCAL_MEM_FENCE); @@ -284,14 +281,11 @@ __kernel void gpuRunHaarClassifierCascade_scaled2( } } } -__kernel void gpuscaleclassifier(global GpuHidHaarTreeNode *orinode, global GpuHidHaarTreeNode *newnode, float scale, float weight_scale, const int nodenum) +__kernel void gpuscaleclassifier(global GpuHidHaarTreeNode *orinode, global GpuHidHaarTreeNode *newnode, float scale, float weight_scale, int nodenum) { - const int counter = get_global_id(0); + int counter = get_global_id(0); int tr_x[3], tr_y[3], tr_h[3], tr_w[3], i = 0; - GpuHidHaarTreeNode t1 = *(__global GpuHidHaarTreeNode*) - (((__global uchar*)orinode) + counter * sizeof(GpuHidHaarTreeNode)); - __global GpuHidHaarTreeNode* pNew = (__global GpuHidHaarTreeNode*) - (((__global uchar*)newnode) + (counter + nodenum) * sizeof(GpuHidHaarTreeNode)); + GpuHidHaarTreeNode t1 = *(orinode + counter); #pragma unroll for (i = 0; i < 3; i++) @@ -303,21 +297,22 @@ __kernel void gpuscaleclassifier(global GpuHidHaarTreeNode *orinode, global GpuH } t1.weight[0] = -(t1.weight[1] * tr_h[1] * tr_w[1] + t1.weight[2] * tr_h[2] * tr_w[2]) / (tr_h[0] * tr_w[0]); + counter += nodenum; #pragma unroll for (i = 0; i < 3; i++) { - pNew->p[i][0] = tr_x[i]; - pNew->p[i][1] = tr_y[i]; - pNew->p[i][2] = tr_x[i] + tr_w[i]; - pNew->p[i][3] = tr_y[i] + tr_h[i]; - pNew->weight[i] = t1.weight[i] * weight_scale; + newnode[counter].p[i][0] = tr_x[i]; + newnode[counter].p[i][1] = tr_y[i]; + newnode[counter].p[i][2] = tr_x[i] + tr_w[i]; + newnode[counter].p[i][3] = tr_y[i] + tr_h[i]; + newnode[counter].weight[i] = t1.weight[i] * weight_scale; } - pNew->left = t1.left; - pNew->right = t1.right; - pNew->threshold = t1.threshold; - pNew->alpha[0] = t1.alpha[0]; - pNew->alpha[1] = t1.alpha[1]; - pNew->alpha[2] = t1.alpha[2]; + newnode[counter].left = t1.left; + newnode[counter].right = t1.right; + newnode[counter].threshold = t1.threshold; + newnode[counter].alpha[0] = t1.alpha[0]; + newnode[counter].alpha[1] = t1.alpha[1]; + newnode[counter].alpha[2] = t1.alpha[2]; } -- 2.7.4