From: yao Date: Sat, 23 Feb 2013 07:19:46 +0000 (+0800) Subject: fix the haar kernel problems on Nvidia and Intel OCL X-Git-Tag: accepted/tizen/6.0/unified/20201030.111113~1314^2~1479^2~1 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=720eaf1e1aabb4301444c0b0185bb2006a0ae6b8;p=platform%2Fupstream%2Fopencv.git fix the haar kernel problems on Nvidia and Intel OCL --- diff --git a/modules/ocl/src/haar.cpp b/modules/ocl/src/haar.cpp index 5c9b75b..26e6a40 100644 --- a/modules/ocl/src/haar.cpp +++ b/modules/ocl/src/haar.cpp @@ -926,7 +926,7 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS if( gimg.cols < minSize.width || gimg.rows < minSize.height ) CV_Error(CV_StsError, "Image too small"); - if( (flags & CV_HAAR_SCALE_IMAGE) && gimg.clCxt->impl->devName.find("Intel(R) HD Graphics") == string::npos ) + if( (flags & CV_HAAR_SCALE_IMAGE) ) { CvSize winSize0 = cascade->orig_window_size; //float scalefactor = 1.1f; diff --git a/modules/ocl/src/kernels/haarobjectdetect.cl b/modules/ocl/src/kernels/haarobjectdetect.cl index 95cfa63..7835b4b 100644 --- a/modules/ocl/src/kernels/haarobjectdetect.cl +++ b/modules/ocl/src/kernels/haarobjectdetect.cl @@ -9,6 +9,7 @@ // Niko Li, newlife20080214@gmail.com // Wang Weiyan, wangweiyanster@gmail.com // Jia Haipeng, jiahaipeng95@gmail.com +// Nathan, liujun@multicorewareinc.com // Redistribution and use in source and binary forms, with or without modification, // are permitted provided that the following conditions are met: // @@ -47,14 +48,14 @@ typedef float sqsumtype; typedef struct __attribute__((aligned (128))) GpuHidHaarFeature { struct __attribute__((aligned (32))) - { - int p0 __attribute__((aligned (4))); - int p1 __attribute__((aligned (4))); - int p2 __attribute__((aligned (4))); - int p3 __attribute__((aligned (4))); - float weight __attribute__((aligned (4))); - } - rect[CV_HAAR_FEATURE_MAX] __attribute__((aligned (32))); +{ + int p0 __attribute__((aligned (4))); + int p1 __attribute__((aligned (4))); + int p2 __attribute__((aligned (4))); + int p3 __attribute__((aligned (4))); + float weight __attribute__((aligned (4))); +} +rect[CV_HAAR_FEATURE_MAX] __attribute__((aligned (32))); } GpuHidHaarFeature; @@ -108,31 +109,31 @@ typedef struct __attribute__((aligned (64))) GpuHidHaarClassifierCascade int p2 __attribute__((aligned (4))); int p3 __attribute__((aligned (4))); float inv_window_area __attribute__((aligned (4))); -}GpuHidHaarClassifierCascade; +} GpuHidHaarClassifierCascade; __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCascade(//constant GpuHidHaarClassifierCascade * cascade, - global GpuHidHaarStageClassifier * stagecascadeptr, - global int4 * info, - global GpuHidHaarTreeNode * nodeptr, - global const int * restrict sum1, - global const float * restrict sqsum1, - global int4 * candidate, - const int pixelstep, - const int loopcount, - const int start_stage, - const int split_stage, - const int end_stage, - const int startnode, - const int splitnode, - const int4 p, - const int4 pq, - const float correction - //const int width, - //const int height, - //const int grpnumperline, - //const int totalgrp - ) + global GpuHidHaarStageClassifier * stagecascadeptr, + global int4 * info, + global GpuHidHaarTreeNode * nodeptr, + global const int * restrict sum1, + global const float * restrict sqsum1, + global int4 * candidate, + const int pixelstep, + const int loopcount, + const int start_stage, + const int split_stage, + const int end_stage, + const int startnode, + const int splitnode, + const int4 p, + const int4 pq, + const float correction + //const int width, + //const int height, + //const int grpnumperline, + //const int totalgrp +) { int grpszx = get_local_size(0); int grpszy = get_local_size(1); @@ -184,7 +185,7 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa __global const int * sum = sum1 + imgoff; __global const float * sqsum = sqsum1 + imgoff; - for(int grploop=grpidx;grploop=0.f ? sqrt(variance_norm_factor) : 1.f; @@ -270,19 +271,19 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa info2.z +=lcl_off; float classsum = (lcldata[mad24(info1.y,readwidth,info1.x)] - lcldata[mad24(info1.y,readwidth,info1.z)] - - lcldata[mad24(info1.w,readwidth,info1.x)] + lcldata[mad24(info1.w,readwidth,info1.z)]) * w.x; + lcldata[mad24(info1.w,readwidth,info1.x)] + lcldata[mad24(info1.w,readwidth,info1.z)]) * w.x; classsum += (lcldata[mad24(info2.y,readwidth,info2.x)] - lcldata[mad24(info2.y,readwidth,info2.z)] - - lcldata[mad24(info2.w,readwidth,info2.x)] + lcldata[mad24(info2.w,readwidth,info2.z)]) * w.y; + lcldata[mad24(info2.w,readwidth,info2.x)] + lcldata[mad24(info2.w,readwidth,info2.z)]) * w.y; //if((info3.z - info3.x) && (!stageinfo.z)) //{ - info3.x +=lcl_off; - info3.z +=lcl_off; - classsum += (lcldata[mad24(info3.y,readwidth,info3.x)] - lcldata[mad24(info3.y,readwidth,info3.z)] - - lcldata[mad24(info3.w,readwidth,info3.x)] + lcldata[mad24(info3.w,readwidth,info3.z)]) * w.z; + info3.x +=lcl_off; + info3.z +=lcl_off; + classsum += (lcldata[mad24(info3.y,readwidth,info3.x)] - lcldata[mad24(info3.y,readwidth,info3.z)] - + lcldata[mad24(info3.w,readwidth,info3.x)] + lcldata[mad24(info3.w,readwidth,info3.z)]) * w.z; //} stage_sum += classsum >= nodethreshold ? alpha2.y : alpha2.x; nodecounter++; @@ -299,12 +300,13 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa } barrier(CLK_LOCAL_MEM_FENCE); int queuecount = lclcount[0]; + barrier(CLK_LOCAL_MEM_FENCE); nodecounter = splitnode; - for(int stageloop = split_stage; stageloop< end_stage && queuecount>0;stageloop++) + for(int stageloop = split_stage; stageloop< end_stage && queuecount>0; stageloop++) { - //barrier(CLK_LOCAL_MEM_FENCE); + //barrier(CLK_LOCAL_MEM_FENCE); //if(lcl_id == 0) - lclcount[0]=0; + lclcount[0]=0; barrier(CLK_LOCAL_MEM_FENCE); int2 stageinfo = *(global int2*)(stagecascadeptr+stageloop); @@ -316,70 +318,73 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa int lcl_compute_win_id = (lcl_id >>(6-perfscale)); int lcl_loops = (stageinfo.x + lcl_compute_win -1) >> (6-perfscale); int lcl_compute_id = lcl_id - (lcl_compute_win_id << (6-perfscale)); - for(int queueloop=0;queueloop>16),readwidth,temp_coord & 0xffff); - //barrier(CLK_LOCAL_MEM_FENCE); - if(lcl_compute_win_id < queuecount) { - - int tempnodecounter = lcl_compute_id; - float part_sum = 0.f; - for(int lcl_loop=0;lcl_loopp[0][0])); - int4 info2 = *(__global int4*)(&(currentnodeptr->p[1][0])); - int4 info3 = *(__global int4*)(&(currentnodeptr->p[2][0])); - float4 w = *(__global float4*)(&(currentnodeptr->weight[0])); - float2 alpha2 = *(__global float2*)(&(currentnodeptr->alpha[0])); - float nodethreshold = w.w * variance_norm_factor; + int tempnodecounter = lcl_compute_id; + float part_sum = 0.f; + for(int lcl_loop=0; lcl_loopp[0][0])); + int4 info2 = *(__global int4*)(&(currentnodeptr->p[1][0])); + int4 info3 = *(__global int4*)(&(currentnodeptr->p[2][0])); + float4 w = *(__global float4*)(&(currentnodeptr->weight[0])); + float2 alpha2 = *(__global float2*)(&(currentnodeptr->alpha[0])); + float nodethreshold = w.w * variance_norm_factor; - float classsum = (lcldata[mad24(info1.y,readwidth,info1.x)] - lcldata[mad24(info1.y,readwidth,info1.z)] - - lcldata[mad24(info1.w,readwidth,info1.x)] + lcldata[mad24(info1.w,readwidth,info1.z)]) * w.x; + info1.x +=queue_pixel; + info1.z +=queue_pixel; + info2.x +=queue_pixel; + info2.z +=queue_pixel; + float classsum = (lcldata[mad24(info1.y,readwidth,info1.x)] - lcldata[mad24(info1.y,readwidth,info1.z)] - + lcldata[mad24(info1.w,readwidth,info1.x)] + lcldata[mad24(info1.w,readwidth,info1.z)]) * w.x; - classsum += (lcldata[mad24(info2.y,readwidth,info2.x)] - lcldata[mad24(info2.y,readwidth,info2.z)] - - lcldata[mad24(info2.w,readwidth,info2.x)] + lcldata[mad24(info2.w,readwidth,info2.z)]) * w.y; - //if((info3.z - info3.x) && (!stageinfo.z)) - //{ + + classsum += (lcldata[mad24(info2.y,readwidth,info2.x)] - lcldata[mad24(info2.y,readwidth,info2.z)] - + lcldata[mad24(info2.w,readwidth,info2.x)] + lcldata[mad24(info2.w,readwidth,info2.z)]) * w.y; + //if((info3.z - info3.x) && (!stageinfo.z)) + //{ info3.x +=queue_pixel; info3.z +=queue_pixel; classsum += (lcldata[mad24(info3.y,readwidth,info3.x)] - lcldata[mad24(info3.y,readwidth,info3.z)] - - lcldata[mad24(info3.w,readwidth,info3.x)] + lcldata[mad24(info3.w,readwidth,info3.z)]) * w.z; - //} - part_sum += classsum >= nodethreshold ? alpha2.y : alpha2.x; - tempnodecounter +=lcl_compute_win; - }//end for(int lcl_loop=0;lcl_loop= nodethreshold ? alpha2.y : alpha2.x; + tempnodecounter +=lcl_compute_win; + }//end for(int lcl_loop=0;lcl_loop= stagethreshold && (lcl_compute_id==0)) + barrier(CLK_LOCAL_MEM_FENCE); + if(lcl_compute_win_id < queuecount) { - int queueindex = atomic_inc(lclcount); - lcloutindex[queueindex<<1] = temp_coord; - lcloutindex[(queueindex<<1)+1] = as_int(variance_norm_factor); + for(int i=0; i= stagethreshold && (lcl_compute_id==0)) + { + int queueindex = atomic_inc(lclcount); + lcloutindex[queueindex<<1] = temp_coord; + lcloutindex[(queueindex<<1)+1] = as_int(variance_norm_factor); + } + lcl_compute_win_id +=(1<0;stageloop++) //barrier(CLK_LOCAL_MEM_FENCE); @@ -420,138 +425,138 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa - /* - if(stagecascade->two_rects) - { - #pragma unroll - for( n = 0; n < stagecascade->count; n++ ) - { - t1 = *(node + counter); - t = t1.threshold * variance_norm_factor; - classsum = calc_sum1(t1,p_offset,0) * t1.weight[0]; +/* +if(stagecascade->two_rects) +{ + #pragma unroll + for( n = 0; n < stagecascade->count; n++ ) + { + t1 = *(node + counter); + t = t1.threshold * variance_norm_factor; + classsum = calc_sum1(t1,p_offset,0) * t1.weight[0]; - classsum += calc_sum1(t1, p_offset,1) * t1.weight[1]; - stage_sum += classsum >= t ? t1.alpha[1]:t1.alpha[0]; + classsum += calc_sum1(t1, p_offset,1) * t1.weight[1]; + stage_sum += classsum >= t ? t1.alpha[1]:t1.alpha[0]; - counter++; - } - } - else - { - #pragma unroll - for( n = 0; n < stagecascade->count; n++ ) - { - t = node[counter].threshold*variance_norm_factor; - classsum = calc_sum1(node[counter],p_offset,0) * node[counter].weight[0]; - classsum += calc_sum1(node[counter],p_offset,1) * node[counter].weight[1]; + counter++; + } +} +else +{ + #pragma unroll + for( n = 0; n < stagecascade->count; n++ ) + { + t = node[counter].threshold*variance_norm_factor; + classsum = calc_sum1(node[counter],p_offset,0) * node[counter].weight[0]; + classsum += calc_sum1(node[counter],p_offset,1) * node[counter].weight[1]; - if( node[counter].p0[2] ) - classsum += calc_sum1(node[counter],p_offset,2) * node[counter].weight[2]; + if( node[counter].p0[2] ) + classsum += calc_sum1(node[counter],p_offset,2) * node[counter].weight[2]; - stage_sum += classsum >= t ? node[counter].alpha[1]:node[counter].alpha[0];// modify + stage_sum += classsum >= t ? node[counter].alpha[1]:node[counter].alpha[0];// modify - counter++; - } - } - */ - /* + counter++; + } +} +*/ +/* __kernel void gpuRunHaarClassifierCascade_ScaleWindow( - constant GpuHidHaarClassifierCascade * _cascade, - global GpuHidHaarStageClassifier * stagecascadeptr, - //global GpuHidHaarClassifier * classifierptr, - global GpuHidHaarTreeNode * nodeptr, - global int * sum, - global float * sqsum, - global int * _candidate, - int pixel_step, - int cols, - int rows, - int start_stage, - int end_stage, - //int counts, - int nodenum, - int ystep, - int detect_width, - //int detect_height, - int loopcount, - int outputstep) - //float scalefactor) + constant GpuHidHaarClassifierCascade * _cascade, + global GpuHidHaarStageClassifier * stagecascadeptr, + //global GpuHidHaarClassifier * classifierptr, + global GpuHidHaarTreeNode * nodeptr, + global int * sum, + global float * sqsum, + global int * _candidate, + int pixel_step, + int cols, + int rows, + int start_stage, + int end_stage, + //int counts, + int nodenum, + int ystep, + int detect_width, + //int detect_height, + int loopcount, + int outputstep) + //float scalefactor) { - unsigned int x1 = get_global_id(0); - unsigned int y1 = get_global_id(1); - int p_offset; - int m, n; - int result; - int counter; - float mean, variance_norm_factor; - for(int i=0;ip1 - cascade->p0; - int window_height = window_width; - result = 1; - counter = 0; - unsigned int x = mul24(x1,ystep); - unsigned int y = mul24(y1,ystep); - if((x < cols - window_width - 1) && (y < rows - window_height -1)) - { - global GpuHidHaarStageClassifier *stagecascade = stagecascadeptr +cascade->count*i+ start_stage; - //global GpuHidHaarClassifier *classifier = classifierptr; - global GpuHidHaarTreeNode *node = nodeptr + nodenum*i; +unsigned int x1 = get_global_id(0); +unsigned int y1 = get_global_id(1); +int p_offset; +int m, n; +int result; +int counter; +float mean, variance_norm_factor; +for(int i=0;ip1 - cascade->p0; +int window_height = window_width; +result = 1; +counter = 0; +unsigned int x = mul24(x1,ystep); +unsigned int y = mul24(y1,ystep); +if((x < cols - window_width - 1) && (y < rows - window_height -1)) +{ +global GpuHidHaarStageClassifier *stagecascade = stagecascadeptr +cascade->count*i+ start_stage; +//global GpuHidHaarClassifier *classifier = classifierptr; +global GpuHidHaarTreeNode *node = nodeptr + nodenum*i; - p_offset = mad24(y, pixel_step, x);// modify +p_offset = mad24(y, pixel_step, x);// modify - mean = (*(sum + p_offset + (int)cascade->p0) - *(sum + p_offset + (int)cascade->p1) - - *(sum + p_offset + (int)cascade->p2) + *(sum + p_offset + (int)cascade->p3)) - *cascade->inv_window_area; +mean = (*(sum + p_offset + (int)cascade->p0) - *(sum + p_offset + (int)cascade->p1) - + *(sum + p_offset + (int)cascade->p2) + *(sum + p_offset + (int)cascade->p3)) + *cascade->inv_window_area; - variance_norm_factor = *(sqsum + p_offset + cascade->p0) - *(sqsum + cascade->p1 + p_offset) - - *(sqsum + p_offset + cascade->p2) + *(sqsum + cascade->p3 + p_offset); - variance_norm_factor = variance_norm_factor * cascade->inv_window_area - mean * mean; - variance_norm_factor = variance_norm_factor >=0.f ? sqrt(variance_norm_factor) : 1;//modify +variance_norm_factor = *(sqsum + p_offset + cascade->p0) - *(sqsum + cascade->p1 + p_offset) - + *(sqsum + p_offset + cascade->p2) + *(sqsum + cascade->p3 + p_offset); +variance_norm_factor = variance_norm_factor * cascade->inv_window_area - mean * mean; +variance_norm_factor = variance_norm_factor >=0.f ? sqrt(variance_norm_factor) : 1;//modify - // if( cascade->is_stump_based ) - //{ - for( m = start_stage; m < end_stage; m++ ) - { - float stage_sum = 0.f; - float t, classsum; - GpuHidHaarTreeNode t1; +// if( cascade->is_stump_based ) +//{ +for( m = start_stage; m < end_stage; m++ ) +{ +float stage_sum = 0.f; +float t, classsum; +GpuHidHaarTreeNode t1; - //#pragma unroll - for( n = 0; n < stagecascade->count; n++ ) - { - t1 = *(node + counter); - t = t1.threshold * variance_norm_factor; - classsum = calc_sum1(t1, p_offset ,0) * t1.weight[0] + calc_sum1(t1, p_offset ,1) * t1.weight[1]; +//#pragma unroll +for( n = 0; n < stagecascade->count; n++ ) +{ + t1 = *(node + counter); + t = t1.threshold * variance_norm_factor; + classsum = calc_sum1(t1, p_offset ,0) * t1.weight[0] + calc_sum1(t1, p_offset ,1) * t1.weight[1]; - if((t1.p0[2]) && (!stagecascade->two_rects)) - classsum += calc_sum1(t1, p_offset, 2) * t1.weight[2]; + if((t1.p0[2]) && (!stagecascade->two_rects)) + classsum += calc_sum1(t1, p_offset, 2) * t1.weight[2]; - stage_sum += classsum >= t ? t1.alpha[1] : t1.alpha[0];// modify - counter++; - } + stage_sum += classsum >= t ? t1.alpha[1] : t1.alpha[0];// modify + counter++; +} - if (stage_sum < stagecascade->threshold) - { - result = 0; - break; - } +if (stage_sum < stagecascade->threshold) +{ + result = 0; + break; +} - stagecascade++; +stagecascade++; - } - if(result) - { - candidate[4 * (y1 * detect_width + x1)] = x; - candidate[4 * (y1 * detect_width + x1) + 1] = y; - candidate[4 * (y1 * detect_width + x1)+2] = window_width; - candidate[4 * (y1 * detect_width + x1) + 3] = window_height; - } - //} - } - } +} +if(result) +{ + candidate[4 * (y1 * detect_width + x1)] = x; + candidate[4 * (y1 * detect_width + x1) + 1] = y; + candidate[4 * (y1 * detect_width + x1)+2] = window_width; + candidate[4 * (y1 * detect_width + x1) + 3] = window_height; +} +//} +} +} } */