From 2c06e59a69aeaa1a76d0ebd419ec88c77e484feb Mon Sep 17 00:00:00 2001 From: yao Date: Tue, 26 Mar 2013 13:05:01 +0800 Subject: [PATCH] fix some mismatch --- modules/ocl/src/moments.cpp | 54 ++--- modules/ocl/src/opencl/moments.cl | 473 +++++++++++++++++--------------------- modules/ocl/src/opencl/pyr_up.cl | 452 +++++++++++++++++------------------- 3 files changed, 438 insertions(+), 541 deletions(-) diff --git a/modules/ocl/src/moments.cpp b/modules/ocl/src/moments.cpp index 285041d..9679a7b 100644 --- a/modules/ocl/src/moments.cpp +++ b/modules/ocl/src/moments.cpp @@ -106,7 +106,7 @@ static void icvContourMoments( CvSeq* contour, CvMoments* mom ) bool is_float = CV_SEQ_ELTYPE(contour) == CV_32FC2; - if (!cv::ocl::Context::getContext()->supportsFeature(Context::CL_DOUBLE) && is_float) + if (!cv::ocl::Context::getContext()->impl->double_support && is_float) { CV_Error(CV_StsUnsupportedFormat, "Moments - double is not supported by your GPU!"); } @@ -143,10 +143,10 @@ static void icvContourMoments( CvSeq* contour, CvMoments* mom ) args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_step )); openCLExecuteKernel(dst_a.clCxt, &moments, "icvContourMoments", globalThreads, localThreads, args, -1, -1); - + cv::Mat dst(dst_a); a00 = a10 = a01 = a20 = a11 = a02 = a30 = a21 = a12 = a03 = 0.0; - if (!cv::ocl::Context::getContext()->supportsFeature(Context::CL_DOUBLE)) + if (!cv::ocl::Context::getContext()->impl->double_support) { for (int i = 0; i < contour->total; ++i) { @@ -161,7 +161,7 @@ static void icvContourMoments( CvSeq* contour, CvMoments* mom ) a12 += dst.at(8, i); a03 += dst.at(9, i); } - } + } else { a00 = cv::sum(dst.row(0))[0]; @@ -277,16 +277,7 @@ static void ocl_cvMoments( const void* array, CvMoments* mom, int binary ) blocky = size.height/TILE_SIZE; else blocky = size.height/TILE_SIZE + 1; - cv::ocl::oclMat dst_m00(blocky, blockx, CV_64FC1); - cv::ocl::oclMat dst_m10(blocky, blockx, CV_64FC1); - cv::ocl::oclMat dst_m01(blocky, blockx, CV_64FC1); - cv::ocl::oclMat dst_m20(blocky, blockx, CV_64FC1); - cv::ocl::oclMat dst_m11(blocky, blockx, CV_64FC1); - cv::ocl::oclMat dst_m02(blocky, blockx, CV_64FC1); - cv::ocl::oclMat dst_m30(blocky, blockx, CV_64FC1); - cv::ocl::oclMat dst_m21(blocky, blockx, CV_64FC1); - cv::ocl::oclMat dst_m12(blocky, blockx, CV_64FC1); - cv::ocl::oclMat dst_m03(blocky, blockx, CV_64FC1); + cv::ocl::oclMat dst_m(blocky * 10, blockx, CV_64FC1); cl_mem sum = openCLCreateBuffer(src.clCxt,CL_MEM_READ_WRITE,10*sizeof(double)); int tile_width = std::min(size.width,TILE_SIZE); int tile_height = std::min(size.height,TILE_SIZE); @@ -299,25 +290,17 @@ static void ocl_cvMoments( const void* array, CvMoments* mom, int binary ) args.push_back( make_pair( sizeof(cl_int) , (void *)&src.step )); args.push_back( make_pair( sizeof(cl_int) , (void *)&tileSize.width )); args.push_back( make_pair( sizeof(cl_int) , (void *)&tileSize.height )); - args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m00.data )); - args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m10.data )); - args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m01.data )); - args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m20.data )); - args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m11.data )); - args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m02.data )); - args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m30.data )); - args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m21.data )); - args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m12.data )); - args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m03.data )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_m00.cols )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_m00.step )); + args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m.data )); + args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_m.cols )); + args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_m.step )); + args.push_back( make_pair( sizeof(cl_int) , (void *)&blocky )); args.push_back( make_pair( sizeof(cl_int) , (void *)&type )); args.push_back( make_pair( sizeof(cl_int) , (void *)&depth )); args.push_back( make_pair( sizeof(cl_int) , (void *)&cn )); args.push_back( make_pair( sizeof(cl_int) , (void *)&coi )); args.push_back( make_pair( sizeof(cl_int) , (void *)&binary )); args.push_back( make_pair( sizeof(cl_int) , (void *)&TILE_SIZE )); - openCLExecuteKernel(dst_m00.clCxt, &moments, "CvMoments", globalThreads, localThreads, args, -1, depth); + openCLExecuteKernel(dst_m.clCxt, &moments, "CvMoments", globalThreads, localThreads, args, -1, depth); size_t localThreadss[3] = { 128, 1, 1}; size_t globalThreadss[3] = { 128, 1, 1}; @@ -327,20 +310,12 @@ static void ocl_cvMoments( const void* array, CvMoments* mom, int binary ) args_sum.push_back( make_pair( sizeof(cl_int) , (void *)&tile_width )); args_sum.push_back( make_pair( sizeof(cl_int) , (void *)&TILE_SIZE )); args_sum.push_back( make_pair( sizeof(cl_mem) , (void *)&sum )); - args_sum.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m00.data )); - args_sum.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m10.data )); - args_sum.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m01.data )); - args_sum.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m20.data )); - args_sum.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m11.data )); - args_sum.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m02.data )); - args_sum.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m30.data )); - args_sum.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m21.data )); - args_sum.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m12.data )); - args_sum.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m03.data )); - openCLExecuteKernel(dst_m00.clCxt, &moments, "dst_sum", globalThreadss, localThreadss, args_sum, -1, -1); + args_sum.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m.data )); + args_sum.push_back( make_pair( sizeof(cl_int) , (void *)&dst_m.step )); + openCLExecuteKernel(dst_m.clCxt, &moments, "dst_sum", globalThreadss, localThreadss, args_sum, -1, -1); double* dstsum = new double[10]; memset(dstsum,0,10*sizeof(double)); - openCLReadBuffer(dst_m00.clCxt,sum,(void *)dstsum,10*sizeof(double)); + openCLReadBuffer(dst_m.clCxt,sum,(void *)dstsum,10*sizeof(double)); mom->m00 = dstsum[0]; mom->m10 = dstsum[1]; mom->m01 = dstsum[2]; @@ -351,6 +326,7 @@ static void ocl_cvMoments( const void* array, CvMoments* mom, int binary ) mom->m21 = dstsum[7]; mom->m12 = dstsum[8]; mom->m03 = dstsum[9]; + delete [] dstsum; icvCompleteMomentState( mom ); } diff --git a/modules/ocl/src/opencl/moments.cl b/modules/ocl/src/opencl/moments.cl index 399ff32..f8d6024 100644 --- a/modules/ocl/src/opencl/moments.cl +++ b/modules/ocl/src/opencl/moments.cl @@ -6,25 +6,27 @@ #pragma OPENCL EXTENSION cl_amd_fp64:enable #endif typedef double T; +typedef double F; +typedef double4 F4; +#define convert_F4 convert_double4 #else -typedef float double; -typedef float4 double4; +typedef float F; +typedef float4 F4; typedef long T; -#define convert_double4 convert_float4 +#define convert_F4 convert_float4 #endif -//#pragma OPENCL EXTENSION cl_amd_printf:enable -//#if defined (DOUBLE_SUPPORT) -#define DST_ROW_A00 0 -#define DST_ROW_A10 1 -#define DST_ROW_A01 2 -#define DST_ROW_A20 3 -#define DST_ROW_A11 4 -#define DST_ROW_A02 5 -#define DST_ROW_A30 6 -#define DST_ROW_A21 7 -#define DST_ROW_A12 8 -#define DST_ROW_A03 9 + +#define DST_ROW_00 0 +#define DST_ROW_10 1 +#define DST_ROW_01 2 +#define DST_ROW_20 3 +#define DST_ROW_11 4 +#define DST_ROW_02 5 +#define DST_ROW_30 6 +#define DST_ROW_21 7 +#define DST_ROW_12 8 +#define DST_ROW_03 9 __kernel void icvContourMoments(int contour_total, __global float* reader_oclmat_data, @@ -60,36 +62,76 @@ __kernel void icvContourMoments(int contour_total, yii_1 = yi_1 + yi; dst_step /= sizeof(T); - *( dst_a + DST_ROW_A00 * dst_step + idx) = dxy; - *( dst_a + DST_ROW_A10 * dst_step + idx) = dxy * xii_1; - *( dst_a + DST_ROW_A01 * dst_step + idx) = dxy * yii_1; - *( dst_a + DST_ROW_A20 * dst_step + idx) = dxy * (xi_1 * xii_1 + xi2); - *( dst_a + DST_ROW_A11 * dst_step + idx) = dxy * (xi_1 * (yii_1 + yi_1) + xi * (yii_1 + yi)); - *( dst_a + DST_ROW_A02 * dst_step + idx) = dxy * (yi_1 * yii_1 + yi2); - *( dst_a + DST_ROW_A30 * dst_step + idx) = dxy * xii_1 * (xi_12 + xi2); - *( dst_a + DST_ROW_A03 * dst_step + idx) = dxy * yii_1 * (yi_12 + yi2); - *( dst_a + DST_ROW_A21 * dst_step + idx) = + *( dst_a + DST_ROW_00 * dst_step + idx) = dxy; + *( dst_a + DST_ROW_10 * dst_step + idx) = dxy * xii_1; + *( dst_a + DST_ROW_01 * dst_step + idx) = dxy * yii_1; + *( dst_a + DST_ROW_20 * dst_step + idx) = dxy * (xi_1 * xii_1 + xi2); + *( dst_a + DST_ROW_11 * dst_step + idx) = dxy * (xi_1 * (yii_1 + yi_1) + xi * (yii_1 + yi)); + *( dst_a + DST_ROW_02 * dst_step + idx) = dxy * (yi_1 * yii_1 + yi2); + *( dst_a + DST_ROW_30 * dst_step + idx) = dxy * xii_1 * (xi_12 + xi2); + *( dst_a + DST_ROW_03 * dst_step + idx) = dxy * yii_1 * (yi_12 + yi2); + *( dst_a + DST_ROW_21 * dst_step + idx) = dxy * (xi_12 * (3 * yi_1 + yi) + 2 * xi * xi_1 * yii_1 + xi2 * (yi_1 + 3 * yi)); - *( dst_a + DST_ROW_A12 * dst_step + idx) = + *( dst_a + DST_ROW_12 * dst_step + idx) = dxy * (yi_12 * (3 * xi_1 + xi) + 2 * yi * yi_1 * xii_1 + yi2 * (xi_1 + 3 * xi)); } -//#endif -//#if defined (DOUBLE_SUPPORT) +__kernel void dst_sum(int src_rows, int src_cols, int tile_height, int tile_width, int TILE_SIZE, + __global F* sum, __global F* dst_m, int dst_step) +{ + int gidy = get_global_id(0); + int gidx = get_global_id(1); + int block_y = src_rows/tile_height; + int block_x = src_cols/tile_width; + int block_num; + + if(src_rows > TILE_SIZE && src_rows % TILE_SIZE != 0) + block_y ++; + if(src_cols > TILE_SIZE && src_cols % TILE_SIZE != 0) + block_x ++; + block_num = block_y * block_x; + __local F dst_sum[10][128]; + if(gidy<128-block_num) + for(int i=0; i<10; i++) + dst_sum[i][gidy+block_num]=0; + barrier(CLK_LOCAL_MEM_FENCE); + + dst_step /= sizeof(F); + if(gidy0; lsize>>=1) + { + if(gidy TILE_SIZE && src_rows % TILE_SIZE != 0) - block_y ++; - if(src_cols > TILE_SIZE && src_cols % TILE_SIZE != 0) - block_x ++; - block_num = block_y * block_x; - __local double dst_sum[10][128]; - if(gidy<128-block_num) - for(int i=0; i<10; i++) - dst_sum[i][gidy+block_num]=0; - barrier(CLK_LOCAL_MEM_FENCE); - if(gidy0; lsize>>=1) - { - if(gidy= TILE_SIZE/2&&lidy > bheight-1&&lidy < tileSize_height) { - m[9][lidy-bheight] = ((double)py) * sy; // m03 - m[8][lidy-bheight] = ((double)x1.s0) * sy; // m12 - m[7][lidy-bheight] = ((double)x2.s0) * lidy; // m21 + m[9][lidy-bheight] = ((F)py) * sy; // m03 + m[8][lidy-bheight] = ((F)x1.s0) * sy; // m12 + m[7][lidy-bheight] = ((F)x2.s0) * lidy; // m21 m[6][lidy-bheight] = x3.s0; // m30 m[5][lidy-bheight] = x0.s0 * sy; // m02 m[4][lidy-bheight] = x1.s0 * lidy; // m11 @@ -714,11 +672,12 @@ __kernel void CvMoments_D5( __global float* src_data, int src_rows, int src_cols m[1][lidy-bheight] = x1.s0; // m10 m[0][lidy-bheight] = x0.s0; // m00 } + else if(lidy < bheight) { - lm[9] = ((double)py) * sy; // m03 - lm[8] = ((double)x1.s0) * sy; // m12 - lm[7] = ((double)x2.s0) * lidy; // m21 + lm[9] = ((F)py) * sy; // m03 + lm[8] = ((F)x1.s0) * sy; // m12 + lm[7] = ((F)x2.s0) * lidy; // m21 lm[6] = x3.s0; // m30 lm[5] = x0.s0 * sy; // m02 lm[4] = x1.s0 * lidy; // m11 @@ -741,69 +700,59 @@ __kernel void CvMoments_D5( __global float* src_data, int src_rows, int src_cols } if(lidy == 0&&lidx == 0) { - for(int mt = 0; mt < 10; mt++ ) - mom[mt] = (double)lm[mt]; - + for( int mt = 0; mt < 10; mt++ ) + mom[mt] = (F)lm[mt]; if(binary) { - double s = 1./255; + F s = 1./255; for( int mt = 0; mt < 10; mt++ ) mom[mt] *= s; } - double xm = x * mom[0], ym = y * mom[0]; + F xm = x * mom[0], ym = y * mom[0]; // accumulate moments computed in each tile + dst_step /= sizeof(F); // + m00 ( = m00' ) - dst_m00[wgidy*dst_cols+wgidx]= mom[0]; + *(dst_m + mad24(DST_ROW_00 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[0]; // + m10 ( = m10' + x*m00' ) - dst_m10[wgidy*dst_cols+wgidx] = mom[1] + xm; + *(dst_m + mad24(DST_ROW_10 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[1] + xm; // + m01 ( = m01' + y*m00' ) - dst_m01[wgidy*dst_cols+wgidx] = mom[2] + ym; + *(dst_m + mad24(DST_ROW_01 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[2] + ym; // + m20 ( = m20' + 2*x*m10' + x*x*m00' ) - dst_m20[wgidy*dst_cols+wgidx] = mom[3] + x * (mom[1] * 2 + xm); + *(dst_m + mad24(DST_ROW_20 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[3] + x * (mom[1] * 2 + xm); // + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' ) - dst_m11[wgidy*dst_cols+wgidx] = mom[4] + x * (mom[2] + ym) + y * mom[1]; + *(dst_m + mad24(DST_ROW_11 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[4] + x * (mom[2] + ym) + y * mom[1]; // + m02 ( = m02' + 2*y*m01' + y*y*m00' ) - dst_m02[wgidy*dst_cols+wgidx]= mom[5] + y * (mom[2] * 2 + ym); + *(dst_m + mad24(DST_ROW_02 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[5] + y * (mom[2] * 2 + ym); // + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' ) - dst_m30[wgidy*dst_cols+wgidx]= mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm)); + *(dst_m + mad24(DST_ROW_30 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm)); // + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20') - dst_m21[wgidy*dst_cols+wgidx] = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3]; + *(dst_m + mad24(DST_ROW_21 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3]; // + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02') - dst_m12[wgidy*dst_cols+wgidx] = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5]; + *(dst_m + mad24(DST_ROW_12 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5]; // + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' ) - dst_m03[wgidy*dst_cols+wgidx]= mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym)); - }*/ + *(dst_m + mad24(DST_ROW_03 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym)); + } } -//#endif -//#if defined (DOUBLE_SUPPORT) -__kernel void CvMoments_D6(__global double* src_data, int src_rows, int src_cols, int src_step, int tileSize_width, int tileSize_height, - __global double* dst_m00, - __global double* dst_m10, - __global double* dst_m01, - __global double* dst_m20, - __global double* dst_m11, - __global double* dst_m02, - __global double* dst_m30, - __global double* dst_m21, - __global double* dst_m12, - __global double* dst_m03, - int dst_cols, int dst_step, + +__kernel void CvMoments_D6(__global F* src_data, int src_rows, int src_cols, int src_step, int tileSize_width, int tileSize_height, + __global F* dst_m, + int dst_cols, int dst_step, int blocky, int type, int depth, int cn, int coi, int binary, const int TILE_SIZE) { - double tmp_coi[4]; // get the coi data - double4 tmp[64]; + F tmp_coi[4]; // get the coi data + F4 tmp[64]; int VLEN_D = 4; // length of vetor int gidy = get_global_id(0); int gidx = get_global_id(1); @@ -820,39 +769,39 @@ __kernel void CvMoments_D6(__global double* src_data, int src_rows, int src_col if(tileSize_width < TILE_SIZE) for(int i = tileSize_width; i < rstep; i++ ) - *((__global double*)src_data+(y+lidy)*src_step/8+x+i) = 0; + *((__global F*)src_data+(y+lidy)*src_step/8+x+i) = 0; if( coi > 0 ) for(int i=0; i < tileSize_width; i+=VLEN_D) { for(int j=0; j<4; j++) tmp_coi[j] = *(src_data+(y+lidy)*src_step/8+(x+i+j)*kcn+coi-1); - tmp[i/VLEN_D] = (double4)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3]); + tmp[i/VLEN_D] = (F4)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3]); } else for(int i=0; i < tileSize_width; i+=VLEN_D) - tmp[i/VLEN_D] = (double4)(*(src_data+(y+lidy)*src_step/8+x+i),*(src_data+(y+lidy)*src_step/8+x+i+1),*(src_data+(y+lidy)*src_step/8+x+i+2),*(src_data+(y+lidy)*src_step/8+x+i+3)); - double4 zero = (double4)(0); - double4 full = (double4)(255); + tmp[i/VLEN_D] = (F4)(*(src_data+(y+lidy)*src_step/8+x+i),*(src_data+(y+lidy)*src_step/8+x+i+1),*(src_data+(y+lidy)*src_step/8+x+i+2),*(src_data+(y+lidy)*src_step/8+x+i+3)); + F4 zero = (F4)(0); + F4 full = (F4)(255); if( binary ) for(int i=0; i < tileSize_width; i+=VLEN_D) tmp[i/VLEN_D] = (tmp[i/VLEN_D]!=zero)?full:zero; - double mom[10]; - __local double m[10][128]; + F mom[10]; + __local F m[10][128]; if(lidy == 0) for(int i=0; i<10; i++) for(int j=0; j<128; j++) m[i][j]=0; barrier(CLK_LOCAL_MEM_FENCE); - double lm[10] = {0}; - double4 x0 = (double4)(0); - double4 x1 = (double4)(0); - double4 x2 = (double4)(0); - double4 x3 = (double4)(0); + F lm[10] = {0}; + F4 x0 = (F4)(0); + F4 x1 = (F4)(0); + F4 x2 = (F4)(0); + F4 x3 = (F4)(0); for( int xt = 0 ; xt < tileSize_width; xt+=VLEN_D ) { - double4 v_xt = (double4)(xt, xt+1, xt+2, xt+3); - double4 p = tmp[xt/VLEN_D]; - double4 xp = v_xt * p, xxp = xp * v_xt; + F4 v_xt = (F4)(xt, xt+1, xt+2, xt+3); + F4 p = tmp[xt/VLEN_D]; + F4 xp = v_xt * p, xxp = xp * v_xt; x0 += p; x1 += xp; x2 += xxp; @@ -863,13 +812,13 @@ __kernel void CvMoments_D6(__global double* src_data, int src_rows, int src_col x2.s0 += x2.s1 + x2.s2 + x2.s3; x3.s0 += x3.s1 + x3.s2 + x3.s3; - double py = lidy * x0.s0, sy = lidy*lidy; + F py = lidy * x0.s0, sy = lidy*lidy; int bheight = min(tileSize_height, TILE_SIZE/2); if(bheight >= TILE_SIZE/2&&lidy > bheight-1&&lidy < tileSize_height) { - m[9][lidy-bheight] = ((double)py) * sy; // m03 - m[8][lidy-bheight] = ((double)x1.s0) * sy; // m12 - m[7][lidy-bheight] = ((double)x2.s0) * lidy; // m21 + m[9][lidy-bheight] = ((F)py) * sy; // m03 + m[8][lidy-bheight] = ((F)x1.s0) * sy; // m12 + m[7][lidy-bheight] = ((F)x2.s0) * lidy; // m21 m[6][lidy-bheight] = x3.s0; // m30 m[5][lidy-bheight] = x0.s0 * sy; // m02 m[4][lidy-bheight] = x1.s0 * lidy; // m11 @@ -881,9 +830,9 @@ __kernel void CvMoments_D6(__global double* src_data, int src_rows, int src_col else if(lidy < bheight) { - lm[9] = ((double)py) * sy; // m03 - lm[8] = ((double)x1.s0) * sy; // m12 - lm[7] = ((double)x2.s0) * lidy; // m21 + lm[9] = ((F)py) * sy; // m03 + lm[8] = ((F)x1.s0) * sy; // m12 + lm[7] = ((F)x2.s0) * lidy; // m21 lm[6] = x3.s0; // m30 lm[5] = x0.s0 * sy; // m02 lm[4] = x1.s0 * lidy; // m11 @@ -907,47 +856,47 @@ __kernel void CvMoments_D6(__global double* src_data, int src_rows, int src_col if(lidy == 0&&lidx == 0) { for( int mt = 0; mt < 10; mt++ ) - mom[mt] = (double)lm[mt]; + mom[mt] = (F)lm[mt]; if(binary) { - double s = 1./255; + F s = 1./255; for( int mt = 0; mt < 10; mt++ ) mom[mt] *= s; } - double xm = x * mom[0], ym = y * mom[0]; + F xm = x * mom[0], ym = y * mom[0]; // accumulate moments computed in each tile + dst_step /= sizeof(F); // + m00 ( = m00' ) - dst_m00[wgidy*dst_cols+wgidx] = mom[0]; + *(dst_m + mad24(DST_ROW_00 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[0]; // + m10 ( = m10' + x*m00' ) - dst_m10[wgidy*dst_cols+wgidx] = mom[1] + xm; + *(dst_m + mad24(DST_ROW_10 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[1] + xm; // + m01 ( = m01' + y*m00' ) - dst_m01[wgidy*dst_cols+wgidx] = mom[2] + ym; + *(dst_m + mad24(DST_ROW_01 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[2] + ym; // + m20 ( = m20' + 2*x*m10' + x*x*m00' ) - dst_m20[wgidy*dst_cols+wgidx] = mom[3] + x * (mom[1] * 2 + xm); + *(dst_m + mad24(DST_ROW_20 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[3] + x * (mom[1] * 2 + xm); // + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' ) - dst_m11[wgidy*dst_cols+wgidx] = mom[4] + x * (mom[2] + ym) + y * mom[1]; + *(dst_m + mad24(DST_ROW_11 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[4] + x * (mom[2] + ym) + y * mom[1]; // + m02 ( = m02' + 2*y*m01' + y*y*m00' ) - dst_m02[wgidy*dst_cols+wgidx] = mom[5] + y * (mom[2] * 2 + ym); + *(dst_m + mad24(DST_ROW_02 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[5] + y * (mom[2] * 2 + ym); // + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' ) - dst_m30[wgidy*dst_cols+wgidx] = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm)); + *(dst_m + mad24(DST_ROW_30 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm)); // + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20') - dst_m21[wgidy*dst_cols+wgidx] = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3]; + *(dst_m + mad24(DST_ROW_21 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3]; // + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02') - dst_m12[wgidy*dst_cols+wgidx] = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5]; + *(dst_m + mad24(DST_ROW_12 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5]; // + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' ) - dst_m03[wgidy*dst_cols+wgidx] = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym)); + *(dst_m + mad24(DST_ROW_03 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym)); } -} -//#endif +} \ No newline at end of file diff --git a/modules/ocl/src/opencl/pyr_up.cl b/modules/ocl/src/opencl/pyr_up.cl index d603ad6..0b7f0c9 100644 --- a/modules/ocl/src/opencl/pyr_up.cl +++ b/modules/ocl/src/opencl/pyr_up.cl @@ -16,6 +16,8 @@ // // @Authors // Zhang Chunpeng chunpeng@multicorewareinc.com +// Dachuan Zhao, dachuan@multicorewareinc.com +// Yao Wang, yao@multicorewareinc.com // // Redistribution and use in source and binary forms, with or without modification, // are permitted provided that the following conditions are met: @@ -53,20 +55,22 @@ uchar get_valid_uchar(uchar data) ////////////////////////// CV_8UC1 ////////////////////////////////// /////////////////////////////////////////////////////////////////////// __kernel void pyrUp_C1_D0(__global uchar* src,__global uchar* dst, - int srcRows,int dstRows,int srcCols,int dstCols, - int srcOffset,int dstOffset,int srcStep,int dstStep) + int srcRows,int dstRows,int srcCols,int dstCols, + int srcOffset,int dstOffset,int srcStep,int dstStep) { const int x = get_global_id(0); const int y = get_global_id(1); - __local float s_srcPatch[10][10]; __local float s_dstPatch[20][16]; + const int tidx = get_local_id(0); + const int tidy = get_local_id(1); + const int lsizex = get_local_size(0); + const int lsizey = get_local_size(1); - - if( get_local_id(0) < 10 && get_local_id(1) < 10 ) + if( tidx < 10 && tidy < 10 ) { - int srcx = (int)(get_group_id(0) * get_local_size(0) / 2 + get_local_id(0)) - 1; - int srcy = (int)(get_group_id(1) * get_local_size(1) / 2 + get_local_id(1)) - 1; + int srcx = mad24((int)get_group_id(0), (lsizex>>1), tidx) - 1; + int srcy = mad24((int)get_group_id(1), (lsizey>>1), tidy) - 1; srcx = abs(srcx); srcx = min(srcCols - 1,srcx); @@ -74,25 +78,24 @@ __kernel void pyrUp_C1_D0(__global uchar* src,__global uchar* dst, srcy = abs(srcy); srcy = min(srcRows -1 ,srcy); - s_srcPatch[get_local_id(1)][get_local_id(0)] = (float)(src[srcx + srcy * srcStep]); + s_srcPatch[tidy][tidx] = (float)(src[srcx + srcy * srcStep]); } barrier(CLK_LOCAL_MEM_FENCE); float sum = 0; - const int evenFlag = (int)((get_local_id(0) & 1) == 0); - const int oddFlag = (int)((get_local_id(0) & 1) != 0); - const bool eveny = ((get_local_id(1) & 1) == 0); - const int tidx = get_local_id(0); + const int evenFlag = (int)((tidx & 1) == 0); + const int oddFlag = (int)((tidx & 1) != 0); + const bool eveny = ((tidy & 1) == 0); if(eveny) { - sum = sum + (evenFlag * 0.0625f) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 2) >> 1)]; - sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 1) >> 1)]; - sum = sum + (evenFlag * 0.375f ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx ) >> 1)]; - sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 1) >> 1)]; - sum = sum + (evenFlag * 0.0625f) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 2) >> 1)]; + sum = (evenFlag * 0.0625f) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx - 2) >> 1)]; + sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx - 1) >> 1)]; + sum = sum + (evenFlag * 0.375f ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx ) >> 1)]; + sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx + 1) >> 1)]; + sum = sum + (evenFlag * 0.0625f) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx + 2) >> 1)]; } s_dstPatch[2 + get_local_id(1)][get_local_id(0)] = sum; @@ -103,42 +106,40 @@ __kernel void pyrUp_C1_D0(__global uchar* src,__global uchar* dst, if (eveny) { - sum = sum + (evenFlag * 0.0625f) * s_srcPatch[0][1 + ((tidx - 2) >> 1)]; - sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[0][1 + ((tidx - 1) >> 1)]; - sum = sum + (evenFlag * 0.375f ) * s_srcPatch[0][1 + ((tidx ) >> 1)]; - sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[0][1 + ((tidx + 1) >> 1)]; - sum = sum + (evenFlag * 0.0625f) * s_srcPatch[0][1 + ((tidx + 2) >> 1)]; - } + sum = (evenFlag * 0.0625f) * s_srcPatch[lsizey - 16][1 + ((tidx - 2) >> 1)]; + sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[lsizey - 16][1 + ((tidx - 1) >> 1)]; + sum = sum + (evenFlag * 0.375f ) * s_srcPatch[lsizey - 16][1 + ((tidx ) >> 1)]; + sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[lsizey - 16][1 + ((tidx + 1) >> 1)]; + sum = sum + (evenFlag * 0.0625f) * s_srcPatch[lsizey - 16][1 + ((tidx + 2) >> 1)]; + } - s_dstPatch[get_local_id(1)][get_local_id(0)] = sum; - } + s_dstPatch[get_local_id(1)][get_local_id(0)] = sum; + } - if (get_local_id(1) > 13) - { + if (get_local_id(1) > 13) + { sum = 0; if (eveny) { - sum = sum + (evenFlag * 0.0625f) * s_srcPatch[9][1 + ((tidx - 2) >> 1)]; - sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[9][1 + ((tidx - 1) >> 1)]; - sum = sum + (evenFlag * 0.375f ) * s_srcPatch[9][1 + ((tidx ) >> 1)]; - sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[9][1 + ((tidx + 1) >> 1)]; + sum = (evenFlag * 0.0625f) * s_srcPatch[lsizey - 7][1 + ((tidx - 2) >> 1)]; + sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[lsizey - 7][1 + ((tidx - 1) >> 1)]; + sum = sum + (evenFlag * 0.375f ) * s_srcPatch[lsizey - 7][1 + ((tidx ) >> 1)]; + sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[lsizey - 7][1 + ((tidx + 1) >> 1)]; sum = sum + (evenFlag * 0.0625f) * s_srcPatch[9][1 + ((tidx + 2) >> 1)]; } - s_dstPatch[4 + get_local_id(1)][get_local_id(0)] = sum; - } + s_dstPatch[4 + tidy][tidx] = sum; + } barrier(CLK_LOCAL_MEM_FENCE); sum = 0; - const int tidy = get_local_id(1); - - sum = sum + 0.0625f * s_dstPatch[2 + tidy - 2][get_local_id(0)]; - sum = sum + 0.25f * s_dstPatch[2 + tidy - 1][get_local_id(0)]; - sum = sum + 0.375f * s_dstPatch[2 + tidy ][get_local_id(0)]; - sum = sum + 0.25f * s_dstPatch[2 + tidy + 1][get_local_id(0)]; - sum = sum + 0.0625f * s_dstPatch[2 + tidy + 2][get_local_id(0)]; + sum = 0.0625f * s_dstPatch[2 + tidy - 2][tidx]; + sum = sum + 0.25f * s_dstPatch[2 + tidy - 1][tidx]; + sum = sum + 0.375f * s_dstPatch[2 + tidy ][tidx]; + sum = sum + 0.25f * s_dstPatch[2 + tidy + 1][tidx]; + sum = sum + 0.0625f * s_dstPatch[2 + tidy + 2][tidx]; if ((x < dstCols) && (y < dstRows)) dst[x + y * dstStep] = (float)(4.0f * sum); @@ -149,8 +150,8 @@ __kernel void pyrUp_C1_D0(__global uchar* src,__global uchar* dst, ////////////////////////// CV_16UC1 ///////////////////////////////// /////////////////////////////////////////////////////////////////////// __kernel void pyrUp_C1_D2(__global ushort* src,__global ushort* dst, - int srcRows,int dstRows,int srcCols,int dstCols, - int srcOffset,int dstOffset,int srcStep,int dstStep) + int srcRows,int dstRows,int srcCols,int dstCols, + int srcOffset,int dstOffset,int srcStep,int dstStep) { const int x = get_global_id(0); const int y = get_global_id(1); @@ -210,13 +211,13 @@ __kernel void pyrUp_C1_D2(__global ushort* src,__global ushort* dst, sum = sum + (evenFlag * 0.375f ) * s_srcPatch[0][1 + ((tidx ) >> 1)]; sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[0][1 + ((tidx + 1) >> 1)]; sum = sum + (evenFlag * 0.0625f) * s_srcPatch[0][1 + ((tidx + 2) >> 1)]; - } + } - s_dstPatch[get_local_id(1)][get_local_id(0)] = sum; - } + s_dstPatch[get_local_id(1)][get_local_id(0)] = sum; + } - if (get_local_id(1) > 13) - { + if (get_local_id(1) > 13) + { sum = 0; if (eveny) @@ -228,7 +229,7 @@ __kernel void pyrUp_C1_D2(__global ushort* src,__global ushort* dst, sum = sum + (evenFlag * 0.0625f) * s_srcPatch[9][1 + ((tidx + 2) >> 1)]; } s_dstPatch[4 + get_local_id(1)][get_local_id(0)] = sum; - } + } barrier(CLK_LOCAL_MEM_FENCE); @@ -251,12 +252,15 @@ __kernel void pyrUp_C1_D2(__global ushort* src,__global ushort* dst, ////////////////////////// CV_32FC1 ///////////////////////////////// /////////////////////////////////////////////////////////////////////// __kernel void pyrUp_C1_D5(__global float* src,__global float* dst, - int srcRows,int dstRows,int srcCols,int dstCols, - int srcOffset,int dstOffset,int srcStep,int dstStep) + int srcRows,int dstRows,int srcCols,int dstCols, + int srcOffset,int dstOffset,int srcStep,int dstStep) { const int x = get_global_id(0); const int y = get_global_id(1); - + const int tidx = get_local_id(0); + const int tidy = get_local_id(1); + const int lsizex = get_local_size(0); + const int lsizey = get_local_size(1); __local float s_srcPatch[10][10]; __local float s_dstPatch[20][16]; @@ -266,10 +270,10 @@ __kernel void pyrUp_C1_D5(__global float* src,__global float* dst, dstStep = dstStep >> 2; - if( get_local_id(0) < 10 && get_local_id(1) < 10 ) + if( tidx < 10 && tidy < 10 ) { - int srcx = (int)(get_group_id(0) * get_local_size(0) / 2 + get_local_id(0)) - 1; - int srcy = (int)(get_group_id(1) * get_local_size(1) / 2 + get_local_id(1)) - 1; + int srcx = mad24((int)get_group_id(0), lsizex>>1, tidx) - 1; + int srcy = mad24((int)get_group_id(1), lsizey>>1, tidy) - 1; srcx = abs(srcx); srcx = min(srcCols - 1,srcx); @@ -277,71 +281,67 @@ __kernel void pyrUp_C1_D5(__global float* src,__global float* dst, srcy = abs(srcy); srcy = min(srcRows -1 ,srcy); - s_srcPatch[get_local_id(1)][get_local_id(0)] = (float)(src[srcx + srcy * srcStep]); + s_srcPatch[tidy][tidx] = (float)(src[srcx + srcy * srcStep]); } barrier(CLK_LOCAL_MEM_FENCE); float sum = 0; - const int evenFlag = (int)((get_local_id(0) & 1) == 0); - const int oddFlag = (int)((get_local_id(0) & 1) != 0); - const bool eveny = ((get_local_id(1) & 1) == 0); - const int tidx = get_local_id(0); + const int evenFlag = (int)((tidx & 1) == 0); + const int oddFlag = (int)((tidx & 1) != 0); + const bool eveny = ((tidy & 1) == 0); + if(eveny) { - sum = sum + (evenFlag * 0.0625f) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 2) >> 1)]; - sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 1) >> 1)]; - sum = sum + (evenFlag * 0.375f ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx ) >> 1)]; - sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 1) >> 1)]; - sum = sum + (evenFlag * 0.0625f) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 2) >> 1)]; + sum = sum + (evenFlag * 0.0625f) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx - 2) >> 1)]; + sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx - 1) >> 1)]; + sum = sum + (evenFlag * 0.375f ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx ) >> 1)]; + sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx + 1) >> 1)]; + sum = sum + (evenFlag * 0.0625f) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx + 2) >> 1)]; } - s_dstPatch[2 + get_local_id(1)][get_local_id(0)] = sum; + s_dstPatch[2 + tidy][tidx] = sum; - if (get_local_id(1) < 2) + if (tidy < 2) { sum = 0; if (eveny) { - sum = sum + (evenFlag * 0.0625f) * s_srcPatch[0][1 + ((tidx - 2) >> 1)]; - sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[0][1 + ((tidx - 1) >> 1)]; - sum = sum + (evenFlag * 0.375f ) * s_srcPatch[0][1 + ((tidx ) >> 1)]; - sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[0][1 + ((tidx + 1) >> 1)]; - sum = sum + (evenFlag * 0.0625f) * s_srcPatch[0][1 + ((tidx + 2) >> 1)]; - } + sum = sum + (evenFlag * 0.0625f) * s_srcPatch[lsizey - 16][1 + ((tidx - 2) >> 1)]; + sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[lsizey - 16][1 + ((tidx - 1) >> 1)]; + sum = sum + (evenFlag * 0.375f ) * s_srcPatch[lsizey - 16][1 + ((tidx ) >> 1)]; + sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[lsizey - 16][1 + ((tidx + 1) >> 1)]; + sum = sum + (evenFlag * 0.0625f) * s_srcPatch[lsizey - 16][1 + ((tidx + 2) >> 1)]; + } - s_dstPatch[get_local_id(1)][get_local_id(0)] = sum; - } + s_dstPatch[tidy][tidx] = sum; + } - if (get_local_id(1) > 13) - { + if (tidy > 13) + { sum = 0; if (eveny) { - sum = sum + (evenFlag * 0.0625f) * s_srcPatch[9][1 + ((tidx - 2) >> 1)]; - sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[9][1 + ((tidx - 1) >> 1)]; - sum = sum + (evenFlag * 0.375f ) * s_srcPatch[9][1 + ((tidx ) >> 1)]; - sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[9][1 + ((tidx + 1) >> 1)]; - sum = sum + (evenFlag * 0.0625f) * s_srcPatch[9][1 + ((tidx + 2) >> 1)]; + sum = sum + (evenFlag * 0.0625f) * s_srcPatch[lsizey - 7][1 + ((tidx - 2) >> 1)]; + sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[lsizey - 7][1 + ((tidx - 1) >> 1)]; + sum = sum + (evenFlag * 0.375f ) * s_srcPatch[lsizey - 7][1 + ((tidx ) >> 1)]; + sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[lsizey - 7][1 + ((tidx + 1) >> 1)]; + sum = sum + (evenFlag * 0.0625f) * s_srcPatch[lsizey - 7][1 + ((tidx + 2) >> 1)]; } - s_dstPatch[4 + get_local_id(1)][get_local_id(0)] = sum; - } + s_dstPatch[4 + tidy][tidx] = sum; + } barrier(CLK_LOCAL_MEM_FENCE); - sum = 0; - - const int tidy = get_local_id(1); - - sum = sum + 0.0625f * s_dstPatch[2 + tidy - 2][get_local_id(0)]; - sum = sum + 0.25f * s_dstPatch[2 + tidy - 1][get_local_id(0)]; - sum = sum + 0.375f * s_dstPatch[2 + tidy ][get_local_id(0)]; - sum = sum + 0.25f * s_dstPatch[2 + tidy + 1][get_local_id(0)]; - sum = sum + 0.0625f * s_dstPatch[2 + tidy + 2][get_local_id(0)]; + sum = 0.0625f * s_dstPatch[2 + tidy - 2][tidx]; + sum = sum + 0.25f * s_dstPatch[2 + tidy - 1][tidx]; + sum = sum + 0.375f * s_dstPatch[2 + tidy ][tidx]; + sum = sum + 0.25f * s_dstPatch[2 + tidy + 1][tidx]; + sum = sum + 0.0625f * s_dstPatch[2 + tidy + 2][tidx]; if ((x < dstCols) && (y < dstRows)) dst[x + y * dstStep] = (float)(4.0f * sum); @@ -376,37 +376,16 @@ uchar4 convert_float4_to_uchar4(float4 data) return u4Data; } -float4 int_x_float4(int leftOpr,float4 rightOpr) -{ - float4 result = {0,0,0,0}; - - result.x = rightOpr.x * leftOpr; - result.y = rightOpr.y * leftOpr; - result.z = rightOpr.z * leftOpr; - result.w = rightOpr.w * leftOpr; - - return result; -} - -float4 float4_x_float4(float4 leftOpr,float4 rightOpr) -{ - float4 result; - - result.x = leftOpr.x * rightOpr.x; - result.y = leftOpr.y * rightOpr.y; - result.z = leftOpr.z * rightOpr.z; - result.w = leftOpr.w * rightOpr.w; - - return result; -} - __kernel void pyrUp_C4_D0(__global uchar4* src,__global uchar4* dst, - int srcRows,int dstRows,int srcCols,int dstCols, - int srcOffset,int dstOffset,int srcStep,int dstStep) + int srcRows,int dstRows,int srcCols,int dstCols, + int srcOffset,int dstOffset,int srcStep,int dstStep) { const int x = get_global_id(0); const int y = get_global_id(1); - + const int tidx = get_local_id(0); + const int tidy = get_local_id(1); + const int lsizex = get_local_size(0); + const int lsizey = get_local_size(1); __local float4 s_srcPatch[10][10]; __local float4 s_dstPatch[20][16]; @@ -416,10 +395,10 @@ __kernel void pyrUp_C4_D0(__global uchar4* src,__global uchar4* dst, dstStep >>= 2; - if( get_local_id(0) < 10 && get_local_id(1) < 10 ) + if( tidx < 10 && tidy < 10 ) { - int srcx = (int)(get_group_id(0) * get_local_size(0) / 2 + get_local_id(0)) - 1; - int srcy = (int)(get_group_id(1) * get_local_size(1) / 2 + get_local_id(1)) - 1; + int srcx = mad24((int)get_group_id(0), lsizex>>1, tidx) - 1; + int srcy = mad24((int)get_group_id(1), lsizey>>1, tidy) - 1; srcx = abs(srcx); srcx = min(srcCols - 1,srcx); @@ -427,17 +406,16 @@ __kernel void pyrUp_C4_D0(__global uchar4* src,__global uchar4* dst, srcy = abs(srcy); srcy = min(srcRows -1 ,srcy); - s_srcPatch[get_local_id(1)][get_local_id(0)] = covert_uchar4_to_float4(src[srcx + srcy * srcStep]); + s_srcPatch[tidy][tidx] = covert_uchar4_to_float4(src[srcx + srcy * srcStep]); } barrier(CLK_LOCAL_MEM_FENCE); float4 sum = (float4)(0,0,0,0); - const int evenFlag = (int)((get_local_id(0) & 1) == 0); - const int oddFlag = (int)((get_local_id(0) & 1) != 0); - const bool eveny = ((get_local_id(1) & 1) == 0); - const int tidx = get_local_id(0); + const int evenFlag = (int)((tidx & 1) == 0); + const int oddFlag = (int)((tidx & 1) != 0); + const bool eveny = ((tidy & 1) == 0); float4 co1 = (float4)(0.375f, 0.375f, 0.375f, 0.375f); float4 co2 = (float4)(0.25f, 0.25f, 0.25f, 0.25f); @@ -446,63 +424,59 @@ __kernel void pyrUp_C4_D0(__global uchar4* src,__global uchar4* dst, if(eveny) { - sum = sum + float4_x_float4(int_x_float4( evenFlag, co3 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 2) >> 1)]); - sum = sum + float4_x_float4(int_x_float4( oddFlag , co2 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 1) >> 1)]); - sum = sum + float4_x_float4(int_x_float4( evenFlag, co1 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx ) >> 1)]); - sum = sum + float4_x_float4(int_x_float4( oddFlag , co2 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 1) >> 1)]); - sum = sum + float4_x_float4(int_x_float4( evenFlag, co3 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 2) >> 1)]); + sum = sum + ( evenFlag * co3) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx - 2) >> 1)]; + sum = sum + ( oddFlag * co2 ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx - 1) >> 1)]; + sum = sum + ( evenFlag * co1) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx ) >> 1)]; + sum = sum + ( oddFlag * co2 ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx + 1) >> 1)]; + sum = sum + ( evenFlag * co3) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx + 2) >> 1)]; } - s_dstPatch[2 + get_local_id(1)][get_local_id(0)] = sum; + s_dstPatch[2 + tidy][tidx] = sum; - if (get_local_id(1) < 2) + if (tidy < 2) { sum = 0; if (eveny) { - sum = sum + float4_x_float4(int_x_float4(evenFlag , co3) , s_srcPatch[0][1 + ((tidx - 2) >> 1)]); - sum = sum + float4_x_float4(int_x_float4( oddFlag , co2 ) , s_srcPatch[0][1 + ((tidx - 1) >> 1)]); - sum = sum + float4_x_float4(int_x_float4(evenFlag , co1 ) , s_srcPatch[0][1 + ((tidx ) >> 1)]); - sum = sum + float4_x_float4(int_x_float4( oddFlag , co2 ) , s_srcPatch[0][1 + ((tidx + 1) >> 1)]); - sum = sum + float4_x_float4(int_x_float4(evenFlag , co3) , s_srcPatch[0][1 + ((tidx + 2) >> 1)]); - } - - s_dstPatch[get_local_id(1)][get_local_id(0)] = sum; - } - - if (get_local_id(1) > 13) - { + sum = sum + (evenFlag * co3) * s_srcPatch[lsizey-16][1 + ((tidx - 2) >> 1)]; + sum = sum + ( oddFlag * co2) * s_srcPatch[lsizey-16][1 + ((tidx - 1) >> 1)]; + sum = sum + (evenFlag * co1) * s_srcPatch[lsizey-16][1 + ((tidx ) >> 1)]; + sum = sum + ( oddFlag * co2) * s_srcPatch[lsizey-16][1 + ((tidx + 1) >> 1)]; + sum = sum + (evenFlag * co3) * s_srcPatch[lsizey-16][1 + ((tidx + 2) >> 1)]; + } + + s_dstPatch[tidy][tidx] = sum; + } + + if (tidy > 13) + { sum = 0; if (eveny) { - sum = sum + float4_x_float4(int_x_float4(evenFlag , co3) , s_srcPatch[9][1 + ((tidx - 2) >> 1)]); - sum = sum + float4_x_float4(int_x_float4( oddFlag , co2) , s_srcPatch[9][1 + ((tidx - 1) >> 1)]); - sum = sum + float4_x_float4(int_x_float4(evenFlag , co1) , s_srcPatch[9][1 + ((tidx ) >> 1)]); - sum = sum + float4_x_float4(int_x_float4( oddFlag , co2) , s_srcPatch[9][1 + ((tidx + 1) >> 1)]); - sum = sum + float4_x_float4(int_x_float4(evenFlag , co3) , s_srcPatch[9][1 + ((tidx + 2) >> 1)]); + sum = sum + (evenFlag * co3) * s_srcPatch[lsizey-7][1 + ((tidx - 2) >> 1)]; + sum = sum + ( oddFlag * co2) * s_srcPatch[lsizey-7][1 + ((tidx - 1) >> 1)]; + sum = sum + (evenFlag * co1) * s_srcPatch[lsizey-7][1 + ((tidx ) >> 1)]; + sum = sum + ( oddFlag * co2) * s_srcPatch[lsizey-7][1 + ((tidx + 1) >> 1)]; + sum = sum + (evenFlag * co3) * s_srcPatch[lsizey-7][1 + ((tidx + 2) >> 1)]; } - s_dstPatch[4 + get_local_id(1)][get_local_id(0)] = sum; - } + s_dstPatch[4 + tidy][tidx] = sum; + } barrier(CLK_LOCAL_MEM_FENCE); - sum = 0; - - const int tidy = get_local_id(1); - - sum = sum + float4_x_float4(co3 , s_dstPatch[2 + tidy - 2][get_local_id(0)]); - sum = sum + float4_x_float4(co2 , s_dstPatch[2 + tidy - 1][get_local_id(0)]); - sum = sum + float4_x_float4(co1 , s_dstPatch[2 + tidy ][get_local_id(0)]); - sum = sum + float4_x_float4(co2 , s_dstPatch[2 + tidy + 1][get_local_id(0)]); - sum = sum + float4_x_float4(co3 , s_dstPatch[2 + tidy + 2][get_local_id(0)]); + sum = co3 * s_dstPatch[2 + tidy - 2][tidx]; + sum = sum + co2 * s_dstPatch[2 + tidy - 1][tidx]; + sum = sum + co1 * s_dstPatch[2 + tidy ][tidx]; + sum = sum + co2 * s_dstPatch[2 + tidy + 1][tidx]; + sum = sum + co3 * s_dstPatch[2 + tidy + 2][tidx]; if ((x < dstCols) && (y < dstRows)) { - dst[x + y * dstStep] = convert_float4_to_uchar4(int_x_float4(4.0f,sum)); + dst[x + y * dstStep] = convert_float4_to_uchar4(4.0f * sum); } } /////////////////////////////////////////////////////////////////////// @@ -535,8 +509,8 @@ ushort4 convert_float4_to_ushort4(float4 data) __kernel void pyrUp_C4_D2(__global ushort4* src,__global ushort4* dst, - int srcRows,int dstRows,int srcCols,int dstCols, - int srcOffset,int dstOffset,int srcStep,int dstStep) + int srcRows,int dstRows,int srcCols,int dstCols, + int srcOffset,int dstOffset,int srcStep,int dstStep) { const int x = get_global_id(0); const int y = get_global_id(1); @@ -580,11 +554,11 @@ __kernel void pyrUp_C4_D2(__global ushort4* src,__global ushort4* dst, if(eveny) { - sum = sum + float4_x_float4(int_x_float4( evenFlag, co3 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 2) >> 1)]); - sum = sum + float4_x_float4(int_x_float4( oddFlag , co2 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 1) >> 1)]); - sum = sum + float4_x_float4(int_x_float4( evenFlag, co1 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx ) >> 1)]); - sum = sum + float4_x_float4(int_x_float4( oddFlag , co2 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 1) >> 1)]); - sum = sum + float4_x_float4(int_x_float4( evenFlag, co3 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 2) >> 1)]); + sum = sum + ( evenFlag* co3 ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 2) >> 1)]; + sum = sum + ( oddFlag * co2 ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 1) >> 1)]; + sum = sum + ( evenFlag* co1 ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx ) >> 1)]; + sum = sum + ( oddFlag * co2 ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 1) >> 1)]; + sum = sum + ( evenFlag* co3 ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 2) >> 1)]; } @@ -596,31 +570,31 @@ __kernel void pyrUp_C4_D2(__global ushort4* src,__global ushort4* dst, if (eveny) { - sum = sum + float4_x_float4(int_x_float4(evenFlag , co3) , s_srcPatch[0][1 + ((tidx - 2) >> 1)]); - sum = sum + float4_x_float4(int_x_float4( oddFlag , co2 ) , s_srcPatch[0][1 + ((tidx - 1) >> 1)]); - sum = sum + float4_x_float4(int_x_float4(evenFlag , co1 ) , s_srcPatch[0][1 + ((tidx ) >> 1)]); - sum = sum + float4_x_float4(int_x_float4( oddFlag , co2 ) , s_srcPatch[0][1 + ((tidx + 1) >> 1)]); - sum = sum + float4_x_float4(int_x_float4(evenFlag , co3) , s_srcPatch[0][1 + ((tidx + 2) >> 1)]); - } - - s_dstPatch[get_local_id(1)][get_local_id(0)] = sum; - } - - if (get_local_id(1) > 13) - { + sum = sum + (evenFlag * co3) * s_srcPatch[0][1 + ((tidx - 2) >> 1)]; + sum = sum + ( oddFlag * co2 ) * s_srcPatch[0][1 + ((tidx - 1) >> 1)]; + sum = sum + (evenFlag * co1 ) * s_srcPatch[0][1 + ((tidx ) >> 1)]; + sum = sum + ( oddFlag * co2 ) * s_srcPatch[0][1 + ((tidx + 1) >> 1)]; + sum = sum + (evenFlag * co3) * s_srcPatch[0][1 + ((tidx + 2) >> 1)]; + } + + s_dstPatch[get_local_id(1)][get_local_id(0)] = sum; + } + + if (get_local_id(1) > 13) + { sum = 0; if (eveny) { - sum = sum + float4_x_float4(int_x_float4(evenFlag , co3) , s_srcPatch[9][1 + ((tidx - 2) >> 1)]); - sum = sum + float4_x_float4(int_x_float4( oddFlag , co2) , s_srcPatch[9][1 + ((tidx - 1) >> 1)]); - sum = sum + float4_x_float4(int_x_float4(evenFlag , co1) , s_srcPatch[9][1 + ((tidx ) >> 1)]); - sum = sum + float4_x_float4(int_x_float4( oddFlag , co2) , s_srcPatch[9][1 + ((tidx + 1) >> 1)]); - sum = sum + float4_x_float4(int_x_float4(evenFlag , co3) , s_srcPatch[9][1 + ((tidx + 2) >> 1)]); + sum = sum + (evenFlag * co3) * s_srcPatch[9][1 + ((tidx - 2) >> 1)]; + sum = sum + ( oddFlag * co2) * s_srcPatch[9][1 + ((tidx - 1) >> 1)]; + sum = sum + (evenFlag * co1) * s_srcPatch[9][1 + ((tidx ) >> 1)]; + sum = sum + ( oddFlag * co2) * s_srcPatch[9][1 + ((tidx + 1) >> 1)]; + sum = sum + (evenFlag * co3) * s_srcPatch[9][1 + ((tidx + 2) >> 1)]; } s_dstPatch[4 + get_local_id(1)][get_local_id(0)] = sum; - } + } barrier(CLK_LOCAL_MEM_FENCE); @@ -628,15 +602,15 @@ __kernel void pyrUp_C4_D2(__global ushort4* src,__global ushort4* dst, const int tidy = get_local_id(1); - sum = sum + float4_x_float4(co3 , s_dstPatch[2 + tidy - 2][get_local_id(0)]); - sum = sum + float4_x_float4(co2 , s_dstPatch[2 + tidy - 1][get_local_id(0)]); - sum = sum + float4_x_float4(co1 , s_dstPatch[2 + tidy ][get_local_id(0)]); - sum = sum + float4_x_float4(co2 , s_dstPatch[2 + tidy + 1][get_local_id(0)]); - sum = sum + float4_x_float4(co3 , s_dstPatch[2 + tidy + 2][get_local_id(0)]); + sum = sum + co3 * s_dstPatch[2 + tidy - 2][get_local_id(0)]; + sum = sum + co2 * s_dstPatch[2 + tidy - 1][get_local_id(0)]; + sum = sum + co1 * s_dstPatch[2 + tidy ][get_local_id(0)]; + sum = sum + co2 * s_dstPatch[2 + tidy + 1][get_local_id(0)]; + sum = sum + co3 * s_dstPatch[2 + tidy + 2][get_local_id(0)]; if ((x < dstCols) && (y < dstRows)) { - dst[x + y * dstStep] = convert_float4_to_ushort4(int_x_float4(4.0f,sum)); + dst[x + y * dstStep] = convert_float4_to_ushort4(4.0f * sum); } } @@ -644,12 +618,15 @@ __kernel void pyrUp_C4_D2(__global ushort4* src,__global ushort4* dst, ////////////////////////// CV_32FC4 ////////////////////////////////// /////////////////////////////////////////////////////////////////////// __kernel void pyrUp_C4_D5(__global float4* src,__global float4* dst, - int srcRows,int dstRows,int srcCols,int dstCols, - int srcOffset,int dstOffset,int srcStep,int dstStep) + int srcRows,int dstRows,int srcCols,int dstCols, + int srcOffset,int dstOffset,int srcStep,int dstStep) { const int x = get_global_id(0); const int y = get_global_id(1); - + const int tidx = get_local_id(0); + const int tidy = get_local_id(1); + const int lsizex = get_local_size(0); + const int lsizey = get_local_size(1); __local float4 s_srcPatch[10][10]; __local float4 s_dstPatch[20][16]; @@ -659,10 +636,10 @@ __kernel void pyrUp_C4_D5(__global float4* src,__global float4* dst, dstStep >>= 4; - if( get_local_id(0) < 10 && get_local_id(1) < 10 ) + if( tidx < 10 && tidy < 10 ) { - int srcx = (int)(get_group_id(0) * get_local_size(0) / 2 + get_local_id(0)) - 1; - int srcy = (int)(get_group_id(1) * get_local_size(1) / 2 + get_local_id(1)) - 1; + int srcx = (int)(get_group_id(0) * get_local_size(0) / 2 + tidx) - 1; + int srcy = (int)(get_group_id(1) * get_local_size(1) / 2 + tidy) - 1; srcx = abs(srcx); srcx = min(srcCols - 1,srcx); @@ -670,17 +647,16 @@ __kernel void pyrUp_C4_D5(__global float4* src,__global float4* dst, srcy = abs(srcy); srcy = min(srcRows -1 ,srcy); - s_srcPatch[get_local_id(1)][get_local_id(0)] = (float4)(src[srcx + srcy * srcStep]); + s_srcPatch[tidy][tidx] = (float4)(src[srcx + srcy * srcStep]); } barrier(CLK_LOCAL_MEM_FENCE); float4 sum = (float4)(0,0,0,0); - const int evenFlag = (int)((get_local_id(0) & 1) == 0); - const int oddFlag = (int)((get_local_id(0) & 1) != 0); - const bool eveny = ((get_local_id(1) & 1) == 0); - const int tidx = get_local_id(0); + const int evenFlag = (int)((tidx & 1) == 0); + const int oddFlag = (int)((tidx & 1) != 0); + const bool eveny = ((tidy & 1) == 0); float4 co1 = (float4)(0.375f, 0.375f, 0.375f, 0.375f); float4 co2 = (float4)(0.25f, 0.25f, 0.25f, 0.25f); @@ -689,59 +665,55 @@ __kernel void pyrUp_C4_D5(__global float4* src,__global float4* dst, if(eveny) { - sum = sum + float4_x_float4(int_x_float4( evenFlag, co3 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 2) >> 1)]); - sum = sum + float4_x_float4(int_x_float4( oddFlag , co2 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 1) >> 1)]); - sum = sum + float4_x_float4(int_x_float4( evenFlag, co1 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx ) >> 1)]); - sum = sum + float4_x_float4(int_x_float4( oddFlag , co2 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 1) >> 1)]); - sum = sum + float4_x_float4(int_x_float4( evenFlag, co3 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 2) >> 1)]); + sum = sum + ( evenFlag* co3 ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx - 2) >> 1)]; + sum = sum + ( oddFlag * co2 ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx - 1) >> 1)]; + sum = sum + ( evenFlag* co1 ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx ) >> 1)]; + sum = sum + ( oddFlag * co2 ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx + 1) >> 1)]; + sum = sum + ( evenFlag* co3 ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx + 2) >> 1)]; } - s_dstPatch[2 + get_local_id(1)][get_local_id(0)] = sum; + s_dstPatch[2 + tidy][tidx] = sum; - if (get_local_id(1) < 2) + if (tidy < 2) { sum = 0; if (eveny) { - sum = sum + float4_x_float4(int_x_float4(evenFlag , co3) , s_srcPatch[0][1 + ((tidx - 2) >> 1)]); - sum = sum + float4_x_float4(int_x_float4( oddFlag , co2 ) , s_srcPatch[0][1 + ((tidx - 1) >> 1)]); - sum = sum + float4_x_float4(int_x_float4(evenFlag , co1 ) , s_srcPatch[0][1 + ((tidx ) >> 1)]); - sum = sum + float4_x_float4(int_x_float4( oddFlag , co2 ) , s_srcPatch[0][1 + ((tidx + 1) >> 1)]); - sum = sum + float4_x_float4(int_x_float4(evenFlag , co3) , s_srcPatch[0][1 + ((tidx + 2) >> 1)]); - } - - s_dstPatch[get_local_id(1)][get_local_id(0)] = sum; - } - - if (get_local_id(1) > 13) - { + sum = sum + (evenFlag * co3) * s_srcPatch[lsizey-16][1 + ((tidx - 2) >> 1)]; + sum = sum + ( oddFlag * co2 ) * s_srcPatch[lsizey-16][1 + ((tidx - 1) >> 1)]; + sum = sum + (evenFlag * co1 ) * s_srcPatch[lsizey-16][1 + ((tidx ) >> 1)]; + sum = sum + ( oddFlag * co2 ) * s_srcPatch[lsizey-16][1 + ((tidx + 1) >> 1)]; + sum = sum + (evenFlag * co3) * s_srcPatch[lsizey-16][1 + ((tidx + 2) >> 1)]; + } + + s_dstPatch[tidy][tidx] = sum; + } + + if (tidy > 13) + { sum = 0; if (eveny) { - sum = sum + float4_x_float4(int_x_float4(evenFlag , co3) , s_srcPatch[9][1 + ((tidx - 2) >> 1)]); - sum = sum + float4_x_float4(int_x_float4( oddFlag , co2) , s_srcPatch[9][1 + ((tidx - 1) >> 1)]); - sum = sum + float4_x_float4(int_x_float4(evenFlag , co1) , s_srcPatch[9][1 + ((tidx ) >> 1)]); - sum = sum + float4_x_float4(int_x_float4( oddFlag , co2) , s_srcPatch[9][1 + ((tidx + 1) >> 1)]); - sum = sum + float4_x_float4(int_x_float4(evenFlag , co3) , s_srcPatch[9][1 + ((tidx + 2) >> 1)]); + sum = sum + (evenFlag * co3) * s_srcPatch[lsizey-7][1 + ((tidx - 2) >> 1)]; + sum = sum + ( oddFlag * co2) * s_srcPatch[lsizey-7][1 + ((tidx - 1) >> 1)]; + sum = sum + (evenFlag * co1) * s_srcPatch[lsizey-7][1 + ((tidx ) >> 1)]; + sum = sum + ( oddFlag * co2) * s_srcPatch[lsizey-7][1 + ((tidx + 1) >> 1)]; + sum = sum + (evenFlag * co3) * s_srcPatch[lsizey-7][1 + ((tidx + 2) >> 1)]; } - s_dstPatch[4 + get_local_id(1)][get_local_id(0)] = sum; - } + s_dstPatch[4 + tidy][tidx] = sum; + } barrier(CLK_LOCAL_MEM_FENCE); - sum = 0; - - const int tidy = get_local_id(1); - - sum = sum + float4_x_float4(co3 , s_dstPatch[2 + tidy - 2][get_local_id(0)]); - sum = sum + float4_x_float4(co2 , s_dstPatch[2 + tidy - 1][get_local_id(0)]); - sum = sum + float4_x_float4(co1 , s_dstPatch[2 + tidy ][get_local_id(0)]); - sum = sum + float4_x_float4(co2 , s_dstPatch[2 + tidy + 1][get_local_id(0)]); - sum = sum + float4_x_float4(co3 , s_dstPatch[2 + tidy + 2][get_local_id(0)]); + sum = co3 * s_dstPatch[2 + tidy - 2][tidx]; + sum = sum + co2 * s_dstPatch[2 + tidy - 1][tidx]; + sum = sum + co1 * s_dstPatch[2 + tidy ][tidx]; + sum = sum + co2 * s_dstPatch[2 + tidy + 1][tidx]; + sum = sum + co3 * s_dstPatch[2 + tidy + 2][tidx]; if ((x < dstCols) && (y < dstRows)) { -- 2.7.4