bool is_float = CV_SEQ_ELTYPE(contour) == CV_32FC2;
- if (!cv::ocl::Context::getContext()->supportsFeature(Context::CL_DOUBLE) && is_float)
+ if (!cv::ocl::Context::getContext()->impl->double_support && is_float)
{
CV_Error(CV_StsUnsupportedFormat, "Moments - double is not supported by your GPU!");
}
args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_step ));
openCLExecuteKernel(dst_a.clCxt, &moments, "icvContourMoments", globalThreads, localThreads, args, -1, -1);
-
+
cv::Mat dst(dst_a);
a00 = a10 = a01 = a20 = a11 = a02 = a30 = a21 = a12 = a03 = 0.0;
- if (!cv::ocl::Context::getContext()->supportsFeature(Context::CL_DOUBLE))
+ if (!cv::ocl::Context::getContext()->impl->double_support)
{
for (int i = 0; i < contour->total; ++i)
{
a12 += dst.at<cl_long>(8, i);
a03 += dst.at<cl_long>(9, i);
}
- }
+ }
else
{
a00 = cv::sum(dst.row(0))[0];
blocky = size.height/TILE_SIZE;
else
blocky = size.height/TILE_SIZE + 1;
- cv::ocl::oclMat dst_m00(blocky, blockx, CV_64FC1);
- cv::ocl::oclMat dst_m10(blocky, blockx, CV_64FC1);
- cv::ocl::oclMat dst_m01(blocky, blockx, CV_64FC1);
- cv::ocl::oclMat dst_m20(blocky, blockx, CV_64FC1);
- cv::ocl::oclMat dst_m11(blocky, blockx, CV_64FC1);
- cv::ocl::oclMat dst_m02(blocky, blockx, CV_64FC1);
- cv::ocl::oclMat dst_m30(blocky, blockx, CV_64FC1);
- cv::ocl::oclMat dst_m21(blocky, blockx, CV_64FC1);
- cv::ocl::oclMat dst_m12(blocky, blockx, CV_64FC1);
- cv::ocl::oclMat dst_m03(blocky, blockx, CV_64FC1);
+ cv::ocl::oclMat dst_m(blocky * 10, blockx, CV_64FC1);
cl_mem sum = openCLCreateBuffer(src.clCxt,CL_MEM_READ_WRITE,10*sizeof(double));
int tile_width = std::min(size.width,TILE_SIZE);
int tile_height = std::min(size.height,TILE_SIZE);
args.push_back( make_pair( sizeof(cl_int) , (void *)&src.step ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&tileSize.width ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&tileSize.height ));
- args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m00.data ));
- args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m10.data ));
- args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m01.data ));
- args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m20.data ));
- args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m11.data ));
- args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m02.data ));
- args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m30.data ));
- args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m21.data ));
- args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m12.data ));
- args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m03.data ));
- args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_m00.cols ));
- args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_m00.step ));
+ args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m.data ));
+ args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_m.cols ));
+ args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_m.step ));
+ args.push_back( make_pair( sizeof(cl_int) , (void *)&blocky ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&type ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&depth ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&cn ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&coi ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&binary ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&TILE_SIZE ));
- openCLExecuteKernel(dst_m00.clCxt, &moments, "CvMoments", globalThreads, localThreads, args, -1, depth);
+ openCLExecuteKernel(dst_m.clCxt, &moments, "CvMoments", globalThreads, localThreads, args, -1, depth);
size_t localThreadss[3] = { 128, 1, 1};
size_t globalThreadss[3] = { 128, 1, 1};
args_sum.push_back( make_pair( sizeof(cl_int) , (void *)&tile_width ));
args_sum.push_back( make_pair( sizeof(cl_int) , (void *)&TILE_SIZE ));
args_sum.push_back( make_pair( sizeof(cl_mem) , (void *)&sum ));
- args_sum.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m00.data ));
- args_sum.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m10.data ));
- args_sum.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m01.data ));
- args_sum.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m20.data ));
- args_sum.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m11.data ));
- args_sum.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m02.data ));
- args_sum.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m30.data ));
- args_sum.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m21.data ));
- args_sum.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m12.data ));
- args_sum.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m03.data ));
- openCLExecuteKernel(dst_m00.clCxt, &moments, "dst_sum", globalThreadss, localThreadss, args_sum, -1, -1);
+ args_sum.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m.data ));
+ args_sum.push_back( make_pair( sizeof(cl_int) , (void *)&dst_m.step ));
+ openCLExecuteKernel(dst_m.clCxt, &moments, "dst_sum", globalThreadss, localThreadss, args_sum, -1, -1);
double* dstsum = new double[10];
memset(dstsum,0,10*sizeof(double));
- openCLReadBuffer(dst_m00.clCxt,sum,(void *)dstsum,10*sizeof(double));
+ openCLReadBuffer(dst_m.clCxt,sum,(void *)dstsum,10*sizeof(double));
mom->m00 = dstsum[0];
mom->m10 = dstsum[1];
mom->m01 = dstsum[2];
mom->m21 = dstsum[7];
mom->m12 = dstsum[8];
mom->m03 = dstsum[9];
+ delete [] dstsum;
icvCompleteMomentState( mom );
}
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#endif
typedef double T;
+typedef double F;
+typedef double4 F4;
+#define convert_F4 convert_double4
#else
-typedef float double;
-typedef float4 double4;
+typedef float F;
+typedef float4 F4;
typedef long T;
-#define convert_double4 convert_float4
+#define convert_F4 convert_float4
#endif
-//#pragma OPENCL EXTENSION cl_amd_printf:enable
-//#if defined (DOUBLE_SUPPORT)
-#define DST_ROW_A00 0
-#define DST_ROW_A10 1
-#define DST_ROW_A01 2
-#define DST_ROW_A20 3
-#define DST_ROW_A11 4
-#define DST_ROW_A02 5
-#define DST_ROW_A30 6
-#define DST_ROW_A21 7
-#define DST_ROW_A12 8
-#define DST_ROW_A03 9
+
+#define DST_ROW_00 0
+#define DST_ROW_10 1
+#define DST_ROW_01 2
+#define DST_ROW_20 3
+#define DST_ROW_11 4
+#define DST_ROW_02 5
+#define DST_ROW_30 6
+#define DST_ROW_21 7
+#define DST_ROW_12 8
+#define DST_ROW_03 9
__kernel void icvContourMoments(int contour_total,
__global float* reader_oclmat_data,
yii_1 = yi_1 + yi;
dst_step /= sizeof(T);
- *( dst_a + DST_ROW_A00 * dst_step + idx) = dxy;
- *( dst_a + DST_ROW_A10 * dst_step + idx) = dxy * xii_1;
- *( dst_a + DST_ROW_A01 * dst_step + idx) = dxy * yii_1;
- *( dst_a + DST_ROW_A20 * dst_step + idx) = dxy * (xi_1 * xii_1 + xi2);
- *( dst_a + DST_ROW_A11 * dst_step + idx) = dxy * (xi_1 * (yii_1 + yi_1) + xi * (yii_1 + yi));
- *( dst_a + DST_ROW_A02 * dst_step + idx) = dxy * (yi_1 * yii_1 + yi2);
- *( dst_a + DST_ROW_A30 * dst_step + idx) = dxy * xii_1 * (xi_12 + xi2);
- *( dst_a + DST_ROW_A03 * dst_step + idx) = dxy * yii_1 * (yi_12 + yi2);
- *( dst_a + DST_ROW_A21 * dst_step + idx) =
+ *( dst_a + DST_ROW_00 * dst_step + idx) = dxy;
+ *( dst_a + DST_ROW_10 * dst_step + idx) = dxy * xii_1;
+ *( dst_a + DST_ROW_01 * dst_step + idx) = dxy * yii_1;
+ *( dst_a + DST_ROW_20 * dst_step + idx) = dxy * (xi_1 * xii_1 + xi2);
+ *( dst_a + DST_ROW_11 * dst_step + idx) = dxy * (xi_1 * (yii_1 + yi_1) + xi * (yii_1 + yi));
+ *( dst_a + DST_ROW_02 * dst_step + idx) = dxy * (yi_1 * yii_1 + yi2);
+ *( dst_a + DST_ROW_30 * dst_step + idx) = dxy * xii_1 * (xi_12 + xi2);
+ *( dst_a + DST_ROW_03 * dst_step + idx) = dxy * yii_1 * (yi_12 + yi2);
+ *( dst_a + DST_ROW_21 * dst_step + idx) =
dxy * (xi_12 * (3 * yi_1 + yi) + 2 * xi * xi_1 * yii_1 +
xi2 * (yi_1 + 3 * yi));
- *( dst_a + DST_ROW_A12 * dst_step + idx) =
+ *( dst_a + DST_ROW_12 * dst_step + idx) =
dxy * (yi_12 * (3 * xi_1 + xi) + 2 * yi * yi_1 * xii_1 +
yi2 * (xi_1 + 3 * xi));
}
-//#endif
-//#if defined (DOUBLE_SUPPORT)
+__kernel void dst_sum(int src_rows, int src_cols, int tile_height, int tile_width, int TILE_SIZE,
+ __global F* sum, __global F* dst_m, int dst_step)
+{
+ int gidy = get_global_id(0);
+ int gidx = get_global_id(1);
+ int block_y = src_rows/tile_height;
+ int block_x = src_cols/tile_width;
+ int block_num;
+
+ if(src_rows > TILE_SIZE && src_rows % TILE_SIZE != 0)
+ block_y ++;
+ if(src_cols > TILE_SIZE && src_cols % TILE_SIZE != 0)
+ block_x ++;
+ block_num = block_y * block_x;
+ __local F dst_sum[10][128];
+ if(gidy<128-block_num)
+ for(int i=0; i<10; i++)
+ dst_sum[i][gidy+block_num]=0;
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ dst_step /= sizeof(F);
+ if(gidy<block_num)
+ {
+ dst_sum[0][gidy] = *(dst_m + mad24(DST_ROW_00 * block_y, dst_step, gidy));
+ dst_sum[1][gidy] = *(dst_m + mad24(DST_ROW_10 * block_y, dst_step, gidy));
+ dst_sum[2][gidy] = *(dst_m + mad24(DST_ROW_01 * block_y, dst_step, gidy));
+ dst_sum[3][gidy] = *(dst_m + mad24(DST_ROW_20 * block_y, dst_step, gidy));
+ dst_sum[4][gidy] = *(dst_m + mad24(DST_ROW_11 * block_y, dst_step, gidy));
+ dst_sum[5][gidy] = *(dst_m + mad24(DST_ROW_02 * block_y, dst_step, gidy));
+ dst_sum[6][gidy] = *(dst_m + mad24(DST_ROW_30 * block_y, dst_step, gidy));
+ dst_sum[7][gidy] = *(dst_m + mad24(DST_ROW_21 * block_y, dst_step, gidy));
+ dst_sum[8][gidy] = *(dst_m + mad24(DST_ROW_12 * block_y, dst_step, gidy));
+ dst_sum[9][gidy] = *(dst_m + mad24(DST_ROW_03 * block_y, dst_step, gidy));
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ for(int lsize=64; lsize>0; lsize>>=1)
+ {
+ if(gidy<lsize)
+ {
+ int lsize2 = gidy + lsize;
+ for(int i=0; i<10; i++)
+ dst_sum[i][gidy] += dst_sum[i][lsize2];
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ }
+ if(gidy==0)
+ for(int i=0; i<10; i++)
+ sum[i] = dst_sum[i][0];
+}
+
__kernel void CvMoments_D0(__global uchar16* src_data, int src_rows, int src_cols, int src_step, int tileSize_width, int tileSize_height,
- __global double* dst_m00,
- __global double* dst_m10,
- __global double* dst_m01,
- __global double* dst_m20,
- __global double* dst_m11,
- __global double* dst_m02,
- __global double* dst_m30,
- __global double* dst_m21,
- __global double* dst_m12,
- __global double* dst_m03,
- int dst_cols, int dst_step, int type, int depth, int cn, int coi, int binary, int TILE_SIZE)
+ __global F* dst_m,
+ int dst_cols, int dst_step, int blocky,
+ int type, int depth, int cn, int coi, int binary, int TILE_SIZE)
{
uchar tmp_coi[16]; // get the coi data
uchar16 tmp[16];
if( binary )
for(int i=0; i < tileSize_width; i+=VLEN_C)
tmp[i/VLEN_C] = (tmp[i/VLEN_C]!=zero)?full:zero;
- double mom[10];
+ F mom[10];
__local int m[10][128];
if(lidy == 0)
for(int i=0; i<10; i++)
if(lidy == 0&&lidx == 0)
{
for( int mt = 0; mt < 10; mt++ )
- mom[mt] = (double)lm[mt];
+ mom[mt] = (F)lm[mt];
if(binary)
{
- double s = 1./255;
+ F s = 1./255;
for( int mt = 0; mt < 10; mt++ )
mom[mt] *= s;
}
- double xm = x * mom[0], ym = y * mom[0];
+ F xm = x * mom[0], ym = y * mom[0];
// accumulate moments computed in each tile
+ dst_step /= sizeof(F);
// + m00 ( = m00' )
- dst_m00[wgidy*dst_cols+wgidx] = mom[0];
+ *(dst_m + mad24(DST_ROW_00 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[0];
// + m10 ( = m10' + x*m00' )
- dst_m10[wgidy*dst_cols+wgidx] = mom[1] + xm;
+ *(dst_m + mad24(DST_ROW_10 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[1] + xm;
// + m01 ( = m01' + y*m00' )
- dst_m01[wgidy*dst_cols+wgidx] = mom[2] + ym;
+ *(dst_m + mad24(DST_ROW_01 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[2] + ym;
// + m20 ( = m20' + 2*x*m10' + x*x*m00' )
- dst_m20[wgidy*dst_cols+wgidx] = mom[3] + x * (mom[1] * 2 + xm);
+ *(dst_m + mad24(DST_ROW_20 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[3] + x * (mom[1] * 2 + xm);
// + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' )
- dst_m11[wgidy*dst_cols+wgidx] = mom[4] + x * (mom[2] + ym) + y * mom[1];
+ *(dst_m + mad24(DST_ROW_11 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[4] + x * (mom[2] + ym) + y * mom[1];
// + m02 ( = m02' + 2*y*m01' + y*y*m00' )
- dst_m02[wgidy*dst_cols+wgidx] = mom[5] + y * (mom[2] * 2 + ym);
+ *(dst_m + mad24(DST_ROW_02 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[5] + y * (mom[2] * 2 + ym);
// + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' )
- dst_m30[wgidy*dst_cols+wgidx] = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm));
+ *(dst_m + mad24(DST_ROW_30 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm));
// + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20')
- dst_m21[wgidy*dst_cols+wgidx] = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3];
+ *(dst_m + mad24(DST_ROW_21 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3];
// + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02')
- dst_m12[wgidy*dst_cols+wgidx] = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5];
+ *(dst_m + mad24(DST_ROW_12 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5];
// + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' )
- dst_m03[wgidy*dst_cols+wgidx] = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym));
+ *(dst_m + mad24(DST_ROW_03 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym));
}
}
-//#endif
-//#if defined (DOUBLE_SUPPORT)
-__kernel void dst_sum(int src_rows, int src_cols, int tile_height, int tile_width, int TILE_SIZE, __global double* sum, __global double* dst_m00,
- __global double* dst_m10,
- __global double* dst_m01,
- __global double* dst_m20,
- __global double* dst_m11,
- __global double* dst_m02,
- __global double* dst_m30,
- __global double* dst_m21,
- __global double* dst_m12,
- __global double* dst_m03)
-{
- int gidy = get_global_id(0);
- int gidx = get_global_id(1);
- int block_y = src_rows/tile_height;
- int block_x = src_cols/tile_width;
- int block_num;
- if(src_rows > TILE_SIZE && src_rows % TILE_SIZE != 0)
- block_y ++;
- if(src_cols > TILE_SIZE && src_cols % TILE_SIZE != 0)
- block_x ++;
- block_num = block_y * block_x;
- __local double dst_sum[10][128];
- if(gidy<128-block_num)
- for(int i=0; i<10; i++)
- dst_sum[i][gidy+block_num]=0;
- barrier(CLK_LOCAL_MEM_FENCE);
- if(gidy<block_num)
- {
- dst_sum[0][gidy] = dst_m00[gidy];
- dst_sum[1][gidy] = dst_m10[gidy];
- dst_sum[2][gidy] = dst_m01[gidy];
- dst_sum[3][gidy] = dst_m20[gidy];
- dst_sum[4][gidy] = dst_m11[gidy];
- dst_sum[5][gidy] = dst_m02[gidy];
- dst_sum[6][gidy] = dst_m30[gidy];
- dst_sum[7][gidy] = dst_m21[gidy];
- dst_sum[8][gidy] = dst_m12[gidy];
- dst_sum[9][gidy] = dst_m03[gidy];
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- for(int lsize=64; lsize>0; lsize>>=1)
- {
- if(gidy<lsize)
- {
- int lsize2 = gidy + lsize;
- for(int i=0; i<10; i++)
- dst_sum[i][gidy] += dst_sum[i][lsize2];
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if(gidy==0)
- for(int i=0; i<10; i++)
- sum[i] = dst_sum[i][0];
-}
-//#endif
-//#if defined (DOUBLE_SUPPORT)
__kernel void CvMoments_D2(__global ushort8* src_data, int src_rows, int src_cols, int src_step, int tileSize_width, int tileSize_height,
- __global double* dst_m00,
- __global double* dst_m10,
- __global double* dst_m01,
- __global double* dst_m20,
- __global double* dst_m11,
- __global double* dst_m02,
- __global double* dst_m30,
- __global double* dst_m21,
- __global double* dst_m12,
- __global double* dst_m03,
- int dst_cols, int dst_step,
+ __global F* dst_m,
+ int dst_cols, int dst_step, int blocky,
int type, int depth, int cn, int coi, int binary, const int TILE_SIZE)
{
ushort tmp_coi[8]; // get the coi data
if( binary )
for(int i=0; i < tileSize_width; i+=VLEN_US)
tmp[i/VLEN_US] = (tmp[i/VLEN_US]!=zero)?full:zero;
- double mom[10];
+ F mom[10];
__local long m[10][128];
if(lidy == 0)
for(int i=0; i<10; i++)
if(lidy == 0&&lidx == 0)
{
for(int mt = 0; mt < 10; mt++ )
- mom[mt] = (double)lm[mt];
+ mom[mt] = (F)lm[mt];
if(binary)
{
- double s = 1./255;
+ F s = 1./255;
for( int mt = 0; mt < 10; mt++ )
mom[mt] *= s;
}
- double xm = x *mom[0], ym = y * mom[0];
+ F xm = x *mom[0], ym = y * mom[0];
// accumulate moments computed in each tile
+ dst_step /= sizeof(F);
// + m00 ( = m00' )
- dst_m00[wgidy*dst_cols+wgidx] = mom[0];
+ *(dst_m + mad24(DST_ROW_00 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[0];
// + m10 ( = m10' + x*m00' )
- dst_m10[wgidy*dst_cols+wgidx] = mom[1] + xm;
+ *(dst_m + mad24(DST_ROW_10 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[1] + xm;
// + m01 ( = m01' + y*m00' )
- dst_m01[wgidy*dst_cols+wgidx] = mom[2] + ym;
+ *(dst_m + mad24(DST_ROW_01 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[2] + ym;
// + m20 ( = m20' + 2*x*m10' + x*x*m00' )
- dst_m20[wgidy*dst_cols+wgidx] = mom[3] + x * (mom[1] * 2 + xm);
+ *(dst_m + mad24(DST_ROW_20 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[3] + x * (mom[1] * 2 + xm);
// + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' )
- dst_m11[wgidy*dst_cols+wgidx] = mom[4] + x * (mom[2] + ym) + y * mom[1];
+ *(dst_m + mad24(DST_ROW_11 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[4] + x * (mom[2] + ym) + y * mom[1];
// + m02 ( = m02' + 2*y*m01' + y*y*m00' )
- dst_m02[wgidy*dst_cols+wgidx] = mom[5] + y * (mom[2] * 2 + ym);
+ *(dst_m + mad24(DST_ROW_02 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[5] + y * (mom[2] * 2 + ym);
// + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' )
- dst_m30[wgidy*dst_cols+wgidx] = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm));
+ *(dst_m + mad24(DST_ROW_30 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm));
// + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20')
- dst_m21[wgidy*dst_cols+wgidx] = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3];
+ *(dst_m + mad24(DST_ROW_21 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3];
// + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02')
- dst_m12[wgidy*dst_cols+wgidx] = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5];
+ *(dst_m + mad24(DST_ROW_12 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5];
// + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' )
- dst_m03[wgidy*dst_cols+wgidx] = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym));
+ *(dst_m + mad24(DST_ROW_03 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym));
}
}
-//#endif
-//#if defined (DOUBLE_SUPPORT)
+
__kernel void CvMoments_D3(__global short8* src_data, int src_rows, int src_cols, int src_step, int tileSize_width, int tileSize_height,
- __global double* dst_m00,
- __global double* dst_m10,
- __global double* dst_m01,
- __global double* dst_m20,
- __global double* dst_m11,
- __global double* dst_m02,
- __global double* dst_m30,
- __global double* dst_m21,
- __global double* dst_m12,
- __global double* dst_m03,
- int dst_cols, int dst_step,
+ __global F* dst_m,
+ int dst_cols, int dst_step, int blocky,
int type, int depth, int cn, int coi, int binary, const int TILE_SIZE)
{
short tmp_coi[8]; // get the coi data
for(int i=0; i < tileSize_width; i+=(VLEN_S))
tmp[i/VLEN_S] = (tmp[i/VLEN_S]!=zero)?full:zero;
- double mom[10];
+ F mom[10];
__local long m[10][128];
if(lidy == 0)
for(int i=0; i<10; i++)
if(lidy ==0 &&lidx ==0)
{
for(int mt = 0; mt < 10; mt++ )
- mom[mt] = (double)lm[mt];
+ mom[mt] = (F)lm[mt];
if(binary)
{
- double s = 1./255;
+ F s = 1./255;
for( int mt = 0; mt < 10; mt++ )
mom[mt] *= s;
}
- double xm = x * mom[0], ym = y*mom[0];
+ F xm = x * mom[0], ym = y*mom[0];
// accumulate moments computed in each tile
+ dst_step /= sizeof(F);
// + m00 ( = m00' )
- dst_m00[wgidy*dst_cols+wgidx] = mom[0];
+ *(dst_m + mad24(DST_ROW_00 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[0];
// + m10 ( = m10' + x*m00' )
- dst_m10[wgidy*dst_cols+wgidx] = mom[1] + xm;
+ *(dst_m + mad24(DST_ROW_10 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[1] + xm;
// + m01 ( = m01' + y*m00' )
- dst_m01[wgidy*dst_cols+wgidx] = mom[2] + ym;
+ *(dst_m + mad24(DST_ROW_01 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[2] + ym;
// + m20 ( = m20' + 2*x*m10' + x*x*m00' )
- dst_m20[wgidy*dst_cols+wgidx] = mom[3] + x * (mom[1] * 2 + xm);
+ *(dst_m + mad24(DST_ROW_20 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[3] + x * (mom[1] * 2 + xm);
// + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' )
- dst_m11[wgidy*dst_cols+wgidx] = mom[4] + x * (mom[2] + ym) + y * mom[1];
+ *(dst_m + mad24(DST_ROW_11 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[4] + x * (mom[2] + ym) + y * mom[1];
// + m02 ( = m02' + 2*y*m01' + y*y*m00' )
- dst_m02[wgidy*dst_cols+wgidx] = mom[5] + y * (mom[2] * 2 + ym);
+ *(dst_m + mad24(DST_ROW_02 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[5] + y * (mom[2] * 2 + ym);
// + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' )
- dst_m30[wgidy*dst_cols+wgidx] = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm));
+ *(dst_m + mad24(DST_ROW_30 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm));
// + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20')
- dst_m21[wgidy*dst_cols+wgidx] = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3];
+ *(dst_m + mad24(DST_ROW_21 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3];
// + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02')
- dst_m12[wgidy*dst_cols+wgidx] = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5];
+ *(dst_m + mad24(DST_ROW_12 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5];
// + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' )
- dst_m03[wgidy*dst_cols+wgidx] = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym));
+ *(dst_m + mad24(DST_ROW_03 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym));
}
}
-//#endif
-//#if defined (DOUBLE_SUPPORT)
+
__kernel void CvMoments_D5( __global float* src_data, int src_rows, int src_cols, int src_step, int tileSize_width, int tileSize_height,
- __global double* dst_m00,
- __global double* dst_m10,
- __global double* dst_m01,
- __global double* dst_m20,
- __global double* dst_m11,
- __global double* dst_m02,
- __global double* dst_m30,
- __global double* dst_m21,
- __global double* dst_m12,
- __global double* dst_m03,
- int dst_cols, int dst_step,
+ __global F* dst_m,
+ int dst_cols, int dst_step, int blocky,
int type, int depth, int cn, int coi, int binary, const int TILE_SIZE)
{
float tmp_coi[4]; // get the coi data
if( binary )
for(int i=0; i < tileSize_width; i+=4)
tmp[i/VLEN_F] = (tmp[i/VLEN_F]!=zero)?full:zero;
- double mom[10];
- __local double m[10][128];
+ F mom[10];
+ __local F m[10][128];
if(lidy == 0)
for(int i = 0; i < 10; i ++)
for(int j = 0; j < 128; j ++)
m[i][j] = 0;
barrier(CLK_LOCAL_MEM_FENCE);
- double lm[10] = {0};
- double4 x0 = (double4)(0);
- double4 x1 = (double4)(0);
- double4 x2 = (double4)(0);
- double4 x3 = (double4)(0);
+ F lm[10] = {0};
+ F4 x0 = (F4)(0);
+ F4 x1 = (F4)(0);
+ F4 x2 = (F4)(0);
+ F4 x3 = (F4)(0);
for( int xt = 0 ; xt < tileSize_width; xt+=VLEN_F )
{
- double4 v_xt = (double4)(xt, xt+1, xt+2, xt+3);
- double4 p = convert_double4(tmp[xt/VLEN_F]);
- double4 xp = v_xt * p, xxp = xp * v_xt;
+ F4 v_xt = (F4)(xt, xt+1, xt+2, xt+3);
+ F4 p = convert_F4(tmp[xt/VLEN_F]);
+ F4 xp = v_xt * p, xxp = xp * v_xt;
x0 += p;
x1 += xp;
x2 += xxp;
x1.s0 += x1.s1 + x1.s2 + x1.s3;
x2.s0 += x2.s1 + x2.s2 + x2.s3;
x3.s0 += x3.s1 + x3.s2 + x3.s3;
-/*
- double py = lidy * x0.s0, sy = lidy*lidy;
+
+ F py = lidy * x0.s0, sy = lidy*lidy;
int bheight = min(tileSize_height, TILE_SIZE/2);
if(bheight >= TILE_SIZE/2&&lidy > bheight-1&&lidy < tileSize_height)
{
- m[9][lidy-bheight] = ((double)py) * sy; // m03
- m[8][lidy-bheight] = ((double)x1.s0) * sy; // m12
- m[7][lidy-bheight] = ((double)x2.s0) * lidy; // m21
+ m[9][lidy-bheight] = ((F)py) * sy; // m03
+ m[8][lidy-bheight] = ((F)x1.s0) * sy; // m12
+ m[7][lidy-bheight] = ((F)x2.s0) * lidy; // m21
m[6][lidy-bheight] = x3.s0; // m30
m[5][lidy-bheight] = x0.s0 * sy; // m02
m[4][lidy-bheight] = x1.s0 * lidy; // m11
m[1][lidy-bheight] = x1.s0; // m10
m[0][lidy-bheight] = x0.s0; // m00
}
+
else if(lidy < bheight)
{
- lm[9] = ((double)py) * sy; // m03
- lm[8] = ((double)x1.s0) * sy; // m12
- lm[7] = ((double)x2.s0) * lidy; // m21
+ lm[9] = ((F)py) * sy; // m03
+ lm[8] = ((F)x1.s0) * sy; // m12
+ lm[7] = ((F)x2.s0) * lidy; // m21
lm[6] = x3.s0; // m30
lm[5] = x0.s0 * sy; // m02
lm[4] = x1.s0 * lidy; // m11
}
if(lidy == 0&&lidx == 0)
{
- for(int mt = 0; mt < 10; mt++ )
- mom[mt] = (double)lm[mt];
-
+ for( int mt = 0; mt < 10; mt++ )
+ mom[mt] = (F)lm[mt];
if(binary)
{
- double s = 1./255;
+ F s = 1./255;
for( int mt = 0; mt < 10; mt++ )
mom[mt] *= s;
}
- double xm = x * mom[0], ym = y * mom[0];
+ F xm = x * mom[0], ym = y * mom[0];
// accumulate moments computed in each tile
+ dst_step /= sizeof(F);
// + m00 ( = m00' )
- dst_m00[wgidy*dst_cols+wgidx]= mom[0];
+ *(dst_m + mad24(DST_ROW_00 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[0];
// + m10 ( = m10' + x*m00' )
- dst_m10[wgidy*dst_cols+wgidx] = mom[1] + xm;
+ *(dst_m + mad24(DST_ROW_10 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[1] + xm;
// + m01 ( = m01' + y*m00' )
- dst_m01[wgidy*dst_cols+wgidx] = mom[2] + ym;
+ *(dst_m + mad24(DST_ROW_01 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[2] + ym;
// + m20 ( = m20' + 2*x*m10' + x*x*m00' )
- dst_m20[wgidy*dst_cols+wgidx] = mom[3] + x * (mom[1] * 2 + xm);
+ *(dst_m + mad24(DST_ROW_20 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[3] + x * (mom[1] * 2 + xm);
// + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' )
- dst_m11[wgidy*dst_cols+wgidx] = mom[4] + x * (mom[2] + ym) + y * mom[1];
+ *(dst_m + mad24(DST_ROW_11 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[4] + x * (mom[2] + ym) + y * mom[1];
// + m02 ( = m02' + 2*y*m01' + y*y*m00' )
- dst_m02[wgidy*dst_cols+wgidx]= mom[5] + y * (mom[2] * 2 + ym);
+ *(dst_m + mad24(DST_ROW_02 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[5] + y * (mom[2] * 2 + ym);
// + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' )
- dst_m30[wgidy*dst_cols+wgidx]= mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm));
+ *(dst_m + mad24(DST_ROW_30 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm));
// + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20')
- dst_m21[wgidy*dst_cols+wgidx] = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3];
+ *(dst_m + mad24(DST_ROW_21 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3];
// + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02')
- dst_m12[wgidy*dst_cols+wgidx] = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5];
+ *(dst_m + mad24(DST_ROW_12 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5];
// + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' )
- dst_m03[wgidy*dst_cols+wgidx]= mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym));
- }*/
+ *(dst_m + mad24(DST_ROW_03 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym));
+ }
}
-//#endif
-//#if defined (DOUBLE_SUPPORT)
-__kernel void CvMoments_D6(__global double* src_data, int src_rows, int src_cols, int src_step, int tileSize_width, int tileSize_height,
- __global double* dst_m00,
- __global double* dst_m10,
- __global double* dst_m01,
- __global double* dst_m20,
- __global double* dst_m11,
- __global double* dst_m02,
- __global double* dst_m30,
- __global double* dst_m21,
- __global double* dst_m12,
- __global double* dst_m03,
- int dst_cols, int dst_step,
+
+__kernel void CvMoments_D6(__global F* src_data, int src_rows, int src_cols, int src_step, int tileSize_width, int tileSize_height,
+ __global F* dst_m,
+ int dst_cols, int dst_step, int blocky,
int type, int depth, int cn, int coi, int binary, const int TILE_SIZE)
{
- double tmp_coi[4]; // get the coi data
- double4 tmp[64];
+ F tmp_coi[4]; // get the coi data
+ F4 tmp[64];
int VLEN_D = 4; // length of vetor
int gidy = get_global_id(0);
int gidx = get_global_id(1);
if(tileSize_width < TILE_SIZE)
for(int i = tileSize_width; i < rstep; i++ )
- *((__global double*)src_data+(y+lidy)*src_step/8+x+i) = 0;
+ *((__global F*)src_data+(y+lidy)*src_step/8+x+i) = 0;
if( coi > 0 )
for(int i=0; i < tileSize_width; i+=VLEN_D)
{
for(int j=0; j<4; j++)
tmp_coi[j] = *(src_data+(y+lidy)*src_step/8+(x+i+j)*kcn+coi-1);
- tmp[i/VLEN_D] = (double4)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3]);
+ tmp[i/VLEN_D] = (F4)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3]);
}
else
for(int i=0; i < tileSize_width; i+=VLEN_D)
- tmp[i/VLEN_D] = (double4)(*(src_data+(y+lidy)*src_step/8+x+i),*(src_data+(y+lidy)*src_step/8+x+i+1),*(src_data+(y+lidy)*src_step/8+x+i+2),*(src_data+(y+lidy)*src_step/8+x+i+3));
- double4 zero = (double4)(0);
- double4 full = (double4)(255);
+ tmp[i/VLEN_D] = (F4)(*(src_data+(y+lidy)*src_step/8+x+i),*(src_data+(y+lidy)*src_step/8+x+i+1),*(src_data+(y+lidy)*src_step/8+x+i+2),*(src_data+(y+lidy)*src_step/8+x+i+3));
+ F4 zero = (F4)(0);
+ F4 full = (F4)(255);
if( binary )
for(int i=0; i < tileSize_width; i+=VLEN_D)
tmp[i/VLEN_D] = (tmp[i/VLEN_D]!=zero)?full:zero;
- double mom[10];
- __local double m[10][128];
+ F mom[10];
+ __local F m[10][128];
if(lidy == 0)
for(int i=0; i<10; i++)
for(int j=0; j<128; j++)
m[i][j]=0;
barrier(CLK_LOCAL_MEM_FENCE);
- double lm[10] = {0};
- double4 x0 = (double4)(0);
- double4 x1 = (double4)(0);
- double4 x2 = (double4)(0);
- double4 x3 = (double4)(0);
+ F lm[10] = {0};
+ F4 x0 = (F4)(0);
+ F4 x1 = (F4)(0);
+ F4 x2 = (F4)(0);
+ F4 x3 = (F4)(0);
for( int xt = 0 ; xt < tileSize_width; xt+=VLEN_D )
{
- double4 v_xt = (double4)(xt, xt+1, xt+2, xt+3);
- double4 p = tmp[xt/VLEN_D];
- double4 xp = v_xt * p, xxp = xp * v_xt;
+ F4 v_xt = (F4)(xt, xt+1, xt+2, xt+3);
+ F4 p = tmp[xt/VLEN_D];
+ F4 xp = v_xt * p, xxp = xp * v_xt;
x0 += p;
x1 += xp;
x2 += xxp;
x2.s0 += x2.s1 + x2.s2 + x2.s3;
x3.s0 += x3.s1 + x3.s2 + x3.s3;
- double py = lidy * x0.s0, sy = lidy*lidy;
+ F py = lidy * x0.s0, sy = lidy*lidy;
int bheight = min(tileSize_height, TILE_SIZE/2);
if(bheight >= TILE_SIZE/2&&lidy > bheight-1&&lidy < tileSize_height)
{
- m[9][lidy-bheight] = ((double)py) * sy; // m03
- m[8][lidy-bheight] = ((double)x1.s0) * sy; // m12
- m[7][lidy-bheight] = ((double)x2.s0) * lidy; // m21
+ m[9][lidy-bheight] = ((F)py) * sy; // m03
+ m[8][lidy-bheight] = ((F)x1.s0) * sy; // m12
+ m[7][lidy-bheight] = ((F)x2.s0) * lidy; // m21
m[6][lidy-bheight] = x3.s0; // m30
m[5][lidy-bheight] = x0.s0 * sy; // m02
m[4][lidy-bheight] = x1.s0 * lidy; // m11
else if(lidy < bheight)
{
- lm[9] = ((double)py) * sy; // m03
- lm[8] = ((double)x1.s0) * sy; // m12
- lm[7] = ((double)x2.s0) * lidy; // m21
+ lm[9] = ((F)py) * sy; // m03
+ lm[8] = ((F)x1.s0) * sy; // m12
+ lm[7] = ((F)x2.s0) * lidy; // m21
lm[6] = x3.s0; // m30
lm[5] = x0.s0 * sy; // m02
lm[4] = x1.s0 * lidy; // m11
if(lidy == 0&&lidx == 0)
{
for( int mt = 0; mt < 10; mt++ )
- mom[mt] = (double)lm[mt];
+ mom[mt] = (F)lm[mt];
if(binary)
{
- double s = 1./255;
+ F s = 1./255;
for( int mt = 0; mt < 10; mt++ )
mom[mt] *= s;
}
- double xm = x * mom[0], ym = y * mom[0];
+ F xm = x * mom[0], ym = y * mom[0];
// accumulate moments computed in each tile
+ dst_step /= sizeof(F);
// + m00 ( = m00' )
- dst_m00[wgidy*dst_cols+wgidx] = mom[0];
+ *(dst_m + mad24(DST_ROW_00 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[0];
// + m10 ( = m10' + x*m00' )
- dst_m10[wgidy*dst_cols+wgidx] = mom[1] + xm;
+ *(dst_m + mad24(DST_ROW_10 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[1] + xm;
// + m01 ( = m01' + y*m00' )
- dst_m01[wgidy*dst_cols+wgidx] = mom[2] + ym;
+ *(dst_m + mad24(DST_ROW_01 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[2] + ym;
// + m20 ( = m20' + 2*x*m10' + x*x*m00' )
- dst_m20[wgidy*dst_cols+wgidx] = mom[3] + x * (mom[1] * 2 + xm);
+ *(dst_m + mad24(DST_ROW_20 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[3] + x * (mom[1] * 2 + xm);
// + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' )
- dst_m11[wgidy*dst_cols+wgidx] = mom[4] + x * (mom[2] + ym) + y * mom[1];
+ *(dst_m + mad24(DST_ROW_11 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[4] + x * (mom[2] + ym) + y * mom[1];
// + m02 ( = m02' + 2*y*m01' + y*y*m00' )
- dst_m02[wgidy*dst_cols+wgidx] = mom[5] + y * (mom[2] * 2 + ym);
+ *(dst_m + mad24(DST_ROW_02 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[5] + y * (mom[2] * 2 + ym);
// + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' )
- dst_m30[wgidy*dst_cols+wgidx] = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm));
+ *(dst_m + mad24(DST_ROW_30 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm));
// + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20')
- dst_m21[wgidy*dst_cols+wgidx] = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3];
+ *(dst_m + mad24(DST_ROW_21 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3];
// + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02')
- dst_m12[wgidy*dst_cols+wgidx] = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5];
+ *(dst_m + mad24(DST_ROW_12 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5];
// + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' )
- dst_m03[wgidy*dst_cols+wgidx] = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym));
+ *(dst_m + mad24(DST_ROW_03 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym));
}
-}
-//#endif
+}
\ No newline at end of file
//
// @Authors
// Zhang Chunpeng chunpeng@multicorewareinc.com
+// Dachuan Zhao, dachuan@multicorewareinc.com
+// Yao Wang, yao@multicorewareinc.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
////////////////////////// CV_8UC1 //////////////////////////////////
///////////////////////////////////////////////////////////////////////
__kernel void pyrUp_C1_D0(__global uchar* src,__global uchar* dst,
- int srcRows,int dstRows,int srcCols,int dstCols,
- int srcOffset,int dstOffset,int srcStep,int dstStep)
+ int srcRows,int dstRows,int srcCols,int dstCols,
+ int srcOffset,int dstOffset,int srcStep,int dstStep)
{
const int x = get_global_id(0);
const int y = get_global_id(1);
-
__local float s_srcPatch[10][10];
__local float s_dstPatch[20][16];
+ const int tidx = get_local_id(0);
+ const int tidy = get_local_id(1);
+ const int lsizex = get_local_size(0);
+ const int lsizey = get_local_size(1);
-
- if( get_local_id(0) < 10 && get_local_id(1) < 10 )
+ if( tidx < 10 && tidy < 10 )
{
- int srcx = (int)(get_group_id(0) * get_local_size(0) / 2 + get_local_id(0)) - 1;
- int srcy = (int)(get_group_id(1) * get_local_size(1) / 2 + get_local_id(1)) - 1;
+ int srcx = mad24((int)get_group_id(0), (lsizex>>1), tidx) - 1;
+ int srcy = mad24((int)get_group_id(1), (lsizey>>1), tidy) - 1;
srcx = abs(srcx);
srcx = min(srcCols - 1,srcx);
srcy = abs(srcy);
srcy = min(srcRows -1 ,srcy);
- s_srcPatch[get_local_id(1)][get_local_id(0)] = (float)(src[srcx + srcy * srcStep]);
+ s_srcPatch[tidy][tidx] = (float)(src[srcx + srcy * srcStep]);
}
barrier(CLK_LOCAL_MEM_FENCE);
float sum = 0;
- const int evenFlag = (int)((get_local_id(0) & 1) == 0);
- const int oddFlag = (int)((get_local_id(0) & 1) != 0);
- const bool eveny = ((get_local_id(1) & 1) == 0);
- const int tidx = get_local_id(0);
+ const int evenFlag = (int)((tidx & 1) == 0);
+ const int oddFlag = (int)((tidx & 1) != 0);
+ const bool eveny = ((tidy & 1) == 0);
if(eveny)
{
- sum = sum + (evenFlag * 0.0625f) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 2) >> 1)];
- sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 1) >> 1)];
- sum = sum + (evenFlag * 0.375f ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx ) >> 1)];
- sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 1) >> 1)];
- sum = sum + (evenFlag * 0.0625f) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 2) >> 1)];
+ sum = (evenFlag * 0.0625f) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx - 2) >> 1)];
+ sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx - 1) >> 1)];
+ sum = sum + (evenFlag * 0.375f ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx ) >> 1)];
+ sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx + 1) >> 1)];
+ sum = sum + (evenFlag * 0.0625f) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx + 2) >> 1)];
}
s_dstPatch[2 + get_local_id(1)][get_local_id(0)] = sum;
if (eveny)
{
- sum = sum + (evenFlag * 0.0625f) * s_srcPatch[0][1 + ((tidx - 2) >> 1)];
- sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[0][1 + ((tidx - 1) >> 1)];
- sum = sum + (evenFlag * 0.375f ) * s_srcPatch[0][1 + ((tidx ) >> 1)];
- sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[0][1 + ((tidx + 1) >> 1)];
- sum = sum + (evenFlag * 0.0625f) * s_srcPatch[0][1 + ((tidx + 2) >> 1)];
- }
+ sum = (evenFlag * 0.0625f) * s_srcPatch[lsizey - 16][1 + ((tidx - 2) >> 1)];
+ sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[lsizey - 16][1 + ((tidx - 1) >> 1)];
+ sum = sum + (evenFlag * 0.375f ) * s_srcPatch[lsizey - 16][1 + ((tidx ) >> 1)];
+ sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[lsizey - 16][1 + ((tidx + 1) >> 1)];
+ sum = sum + (evenFlag * 0.0625f) * s_srcPatch[lsizey - 16][1 + ((tidx + 2) >> 1)];
+ }
- s_dstPatch[get_local_id(1)][get_local_id(0)] = sum;
- }
+ s_dstPatch[get_local_id(1)][get_local_id(0)] = sum;
+ }
- if (get_local_id(1) > 13)
- {
+ if (get_local_id(1) > 13)
+ {
sum = 0;
if (eveny)
{
- sum = sum + (evenFlag * 0.0625f) * s_srcPatch[9][1 + ((tidx - 2) >> 1)];
- sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[9][1 + ((tidx - 1) >> 1)];
- sum = sum + (evenFlag * 0.375f ) * s_srcPatch[9][1 + ((tidx ) >> 1)];
- sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[9][1 + ((tidx + 1) >> 1)];
+ sum = (evenFlag * 0.0625f) * s_srcPatch[lsizey - 7][1 + ((tidx - 2) >> 1)];
+ sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[lsizey - 7][1 + ((tidx - 1) >> 1)];
+ sum = sum + (evenFlag * 0.375f ) * s_srcPatch[lsizey - 7][1 + ((tidx ) >> 1)];
+ sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[lsizey - 7][1 + ((tidx + 1) >> 1)];
sum = sum + (evenFlag * 0.0625f) * s_srcPatch[9][1 + ((tidx + 2) >> 1)];
}
- s_dstPatch[4 + get_local_id(1)][get_local_id(0)] = sum;
- }
+ s_dstPatch[4 + tidy][tidx] = sum;
+ }
barrier(CLK_LOCAL_MEM_FENCE);
sum = 0;
- const int tidy = get_local_id(1);
-
- sum = sum + 0.0625f * s_dstPatch[2 + tidy - 2][get_local_id(0)];
- sum = sum + 0.25f * s_dstPatch[2 + tidy - 1][get_local_id(0)];
- sum = sum + 0.375f * s_dstPatch[2 + tidy ][get_local_id(0)];
- sum = sum + 0.25f * s_dstPatch[2 + tidy + 1][get_local_id(0)];
- sum = sum + 0.0625f * s_dstPatch[2 + tidy + 2][get_local_id(0)];
+ sum = 0.0625f * s_dstPatch[2 + tidy - 2][tidx];
+ sum = sum + 0.25f * s_dstPatch[2 + tidy - 1][tidx];
+ sum = sum + 0.375f * s_dstPatch[2 + tidy ][tidx];
+ sum = sum + 0.25f * s_dstPatch[2 + tidy + 1][tidx];
+ sum = sum + 0.0625f * s_dstPatch[2 + tidy + 2][tidx];
if ((x < dstCols) && (y < dstRows))
dst[x + y * dstStep] = (float)(4.0f * sum);
////////////////////////// CV_16UC1 /////////////////////////////////
///////////////////////////////////////////////////////////////////////
__kernel void pyrUp_C1_D2(__global ushort* src,__global ushort* dst,
- int srcRows,int dstRows,int srcCols,int dstCols,
- int srcOffset,int dstOffset,int srcStep,int dstStep)
+ int srcRows,int dstRows,int srcCols,int dstCols,
+ int srcOffset,int dstOffset,int srcStep,int dstStep)
{
const int x = get_global_id(0);
const int y = get_global_id(1);
sum = sum + (evenFlag * 0.375f ) * s_srcPatch[0][1 + ((tidx ) >> 1)];
sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[0][1 + ((tidx + 1) >> 1)];
sum = sum + (evenFlag * 0.0625f) * s_srcPatch[0][1 + ((tidx + 2) >> 1)];
- }
+ }
- s_dstPatch[get_local_id(1)][get_local_id(0)] = sum;
- }
+ s_dstPatch[get_local_id(1)][get_local_id(0)] = sum;
+ }
- if (get_local_id(1) > 13)
- {
+ if (get_local_id(1) > 13)
+ {
sum = 0;
if (eveny)
sum = sum + (evenFlag * 0.0625f) * s_srcPatch[9][1 + ((tidx + 2) >> 1)];
}
s_dstPatch[4 + get_local_id(1)][get_local_id(0)] = sum;
- }
+ }
barrier(CLK_LOCAL_MEM_FENCE);
////////////////////////// CV_32FC1 /////////////////////////////////
///////////////////////////////////////////////////////////////////////
__kernel void pyrUp_C1_D5(__global float* src,__global float* dst,
- int srcRows,int dstRows,int srcCols,int dstCols,
- int srcOffset,int dstOffset,int srcStep,int dstStep)
+ int srcRows,int dstRows,int srcCols,int dstCols,
+ int srcOffset,int dstOffset,int srcStep,int dstStep)
{
const int x = get_global_id(0);
const int y = get_global_id(1);
-
+ const int tidx = get_local_id(0);
+ const int tidy = get_local_id(1);
+ const int lsizex = get_local_size(0);
+ const int lsizey = get_local_size(1);
__local float s_srcPatch[10][10];
__local float s_dstPatch[20][16];
dstStep = dstStep >> 2;
- if( get_local_id(0) < 10 && get_local_id(1) < 10 )
+ if( tidx < 10 && tidy < 10 )
{
- int srcx = (int)(get_group_id(0) * get_local_size(0) / 2 + get_local_id(0)) - 1;
- int srcy = (int)(get_group_id(1) * get_local_size(1) / 2 + get_local_id(1)) - 1;
+ int srcx = mad24((int)get_group_id(0), lsizex>>1, tidx) - 1;
+ int srcy = mad24((int)get_group_id(1), lsizey>>1, tidy) - 1;
srcx = abs(srcx);
srcx = min(srcCols - 1,srcx);
srcy = abs(srcy);
srcy = min(srcRows -1 ,srcy);
- s_srcPatch[get_local_id(1)][get_local_id(0)] = (float)(src[srcx + srcy * srcStep]);
+ s_srcPatch[tidy][tidx] = (float)(src[srcx + srcy * srcStep]);
}
barrier(CLK_LOCAL_MEM_FENCE);
float sum = 0;
- const int evenFlag = (int)((get_local_id(0) & 1) == 0);
- const int oddFlag = (int)((get_local_id(0) & 1) != 0);
- const bool eveny = ((get_local_id(1) & 1) == 0);
- const int tidx = get_local_id(0);
+ const int evenFlag = (int)((tidx & 1) == 0);
+ const int oddFlag = (int)((tidx & 1) != 0);
+ const bool eveny = ((tidy & 1) == 0);
+
if(eveny)
{
- sum = sum + (evenFlag * 0.0625f) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 2) >> 1)];
- sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 1) >> 1)];
- sum = sum + (evenFlag * 0.375f ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx ) >> 1)];
- sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 1) >> 1)];
- sum = sum + (evenFlag * 0.0625f) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 2) >> 1)];
+ sum = sum + (evenFlag * 0.0625f) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx - 2) >> 1)];
+ sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx - 1) >> 1)];
+ sum = sum + (evenFlag * 0.375f ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx ) >> 1)];
+ sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx + 1) >> 1)];
+ sum = sum + (evenFlag * 0.0625f) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx + 2) >> 1)];
}
- s_dstPatch[2 + get_local_id(1)][get_local_id(0)] = sum;
+ s_dstPatch[2 + tidy][tidx] = sum;
- if (get_local_id(1) < 2)
+ if (tidy < 2)
{
sum = 0;
if (eveny)
{
- sum = sum + (evenFlag * 0.0625f) * s_srcPatch[0][1 + ((tidx - 2) >> 1)];
- sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[0][1 + ((tidx - 1) >> 1)];
- sum = sum + (evenFlag * 0.375f ) * s_srcPatch[0][1 + ((tidx ) >> 1)];
- sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[0][1 + ((tidx + 1) >> 1)];
- sum = sum + (evenFlag * 0.0625f) * s_srcPatch[0][1 + ((tidx + 2) >> 1)];
- }
+ sum = sum + (evenFlag * 0.0625f) * s_srcPatch[lsizey - 16][1 + ((tidx - 2) >> 1)];
+ sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[lsizey - 16][1 + ((tidx - 1) >> 1)];
+ sum = sum + (evenFlag * 0.375f ) * s_srcPatch[lsizey - 16][1 + ((tidx ) >> 1)];
+ sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[lsizey - 16][1 + ((tidx + 1) >> 1)];
+ sum = sum + (evenFlag * 0.0625f) * s_srcPatch[lsizey - 16][1 + ((tidx + 2) >> 1)];
+ }
- s_dstPatch[get_local_id(1)][get_local_id(0)] = sum;
- }
+ s_dstPatch[tidy][tidx] = sum;
+ }
- if (get_local_id(1) > 13)
- {
+ if (tidy > 13)
+ {
sum = 0;
if (eveny)
{
- sum = sum + (evenFlag * 0.0625f) * s_srcPatch[9][1 + ((tidx - 2) >> 1)];
- sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[9][1 + ((tidx - 1) >> 1)];
- sum = sum + (evenFlag * 0.375f ) * s_srcPatch[9][1 + ((tidx ) >> 1)];
- sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[9][1 + ((tidx + 1) >> 1)];
- sum = sum + (evenFlag * 0.0625f) * s_srcPatch[9][1 + ((tidx + 2) >> 1)];
+ sum = sum + (evenFlag * 0.0625f) * s_srcPatch[lsizey - 7][1 + ((tidx - 2) >> 1)];
+ sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[lsizey - 7][1 + ((tidx - 1) >> 1)];
+ sum = sum + (evenFlag * 0.375f ) * s_srcPatch[lsizey - 7][1 + ((tidx ) >> 1)];
+ sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[lsizey - 7][1 + ((tidx + 1) >> 1)];
+ sum = sum + (evenFlag * 0.0625f) * s_srcPatch[lsizey - 7][1 + ((tidx + 2) >> 1)];
}
- s_dstPatch[4 + get_local_id(1)][get_local_id(0)] = sum;
- }
+ s_dstPatch[4 + tidy][tidx] = sum;
+ }
barrier(CLK_LOCAL_MEM_FENCE);
- sum = 0;
-
- const int tidy = get_local_id(1);
-
- sum = sum + 0.0625f * s_dstPatch[2 + tidy - 2][get_local_id(0)];
- sum = sum + 0.25f * s_dstPatch[2 + tidy - 1][get_local_id(0)];
- sum = sum + 0.375f * s_dstPatch[2 + tidy ][get_local_id(0)];
- sum = sum + 0.25f * s_dstPatch[2 + tidy + 1][get_local_id(0)];
- sum = sum + 0.0625f * s_dstPatch[2 + tidy + 2][get_local_id(0)];
+ sum = 0.0625f * s_dstPatch[2 + tidy - 2][tidx];
+ sum = sum + 0.25f * s_dstPatch[2 + tidy - 1][tidx];
+ sum = sum + 0.375f * s_dstPatch[2 + tidy ][tidx];
+ sum = sum + 0.25f * s_dstPatch[2 + tidy + 1][tidx];
+ sum = sum + 0.0625f * s_dstPatch[2 + tidy + 2][tidx];
if ((x < dstCols) && (y < dstRows))
dst[x + y * dstStep] = (float)(4.0f * sum);
return u4Data;
}
-float4 int_x_float4(int leftOpr,float4 rightOpr)
-{
- float4 result = {0,0,0,0};
-
- result.x = rightOpr.x * leftOpr;
- result.y = rightOpr.y * leftOpr;
- result.z = rightOpr.z * leftOpr;
- result.w = rightOpr.w * leftOpr;
-
- return result;
-}
-
-float4 float4_x_float4(float4 leftOpr,float4 rightOpr)
-{
- float4 result;
-
- result.x = leftOpr.x * rightOpr.x;
- result.y = leftOpr.y * rightOpr.y;
- result.z = leftOpr.z * rightOpr.z;
- result.w = leftOpr.w * rightOpr.w;
-
- return result;
-}
-
__kernel void pyrUp_C4_D0(__global uchar4* src,__global uchar4* dst,
- int srcRows,int dstRows,int srcCols,int dstCols,
- int srcOffset,int dstOffset,int srcStep,int dstStep)
+ int srcRows,int dstRows,int srcCols,int dstCols,
+ int srcOffset,int dstOffset,int srcStep,int dstStep)
{
const int x = get_global_id(0);
const int y = get_global_id(1);
-
+ const int tidx = get_local_id(0);
+ const int tidy = get_local_id(1);
+ const int lsizex = get_local_size(0);
+ const int lsizey = get_local_size(1);
__local float4 s_srcPatch[10][10];
__local float4 s_dstPatch[20][16];
dstStep >>= 2;
- if( get_local_id(0) < 10 && get_local_id(1) < 10 )
+ if( tidx < 10 && tidy < 10 )
{
- int srcx = (int)(get_group_id(0) * get_local_size(0) / 2 + get_local_id(0)) - 1;
- int srcy = (int)(get_group_id(1) * get_local_size(1) / 2 + get_local_id(1)) - 1;
+ int srcx = mad24((int)get_group_id(0), lsizex>>1, tidx) - 1;
+ int srcy = mad24((int)get_group_id(1), lsizey>>1, tidy) - 1;
srcx = abs(srcx);
srcx = min(srcCols - 1,srcx);
srcy = abs(srcy);
srcy = min(srcRows -1 ,srcy);
- s_srcPatch[get_local_id(1)][get_local_id(0)] = covert_uchar4_to_float4(src[srcx + srcy * srcStep]);
+ s_srcPatch[tidy][tidx] = covert_uchar4_to_float4(src[srcx + srcy * srcStep]);
}
barrier(CLK_LOCAL_MEM_FENCE);
float4 sum = (float4)(0,0,0,0);
- const int evenFlag = (int)((get_local_id(0) & 1) == 0);
- const int oddFlag = (int)((get_local_id(0) & 1) != 0);
- const bool eveny = ((get_local_id(1) & 1) == 0);
- const int tidx = get_local_id(0);
+ const int evenFlag = (int)((tidx & 1) == 0);
+ const int oddFlag = (int)((tidx & 1) != 0);
+ const bool eveny = ((tidy & 1) == 0);
float4 co1 = (float4)(0.375f, 0.375f, 0.375f, 0.375f);
float4 co2 = (float4)(0.25f, 0.25f, 0.25f, 0.25f);
if(eveny)
{
- sum = sum + float4_x_float4(int_x_float4( evenFlag, co3 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 2) >> 1)]);
- sum = sum + float4_x_float4(int_x_float4( oddFlag , co2 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 1) >> 1)]);
- sum = sum + float4_x_float4(int_x_float4( evenFlag, co1 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx ) >> 1)]);
- sum = sum + float4_x_float4(int_x_float4( oddFlag , co2 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 1) >> 1)]);
- sum = sum + float4_x_float4(int_x_float4( evenFlag, co3 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 2) >> 1)]);
+ sum = sum + ( evenFlag * co3) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx - 2) >> 1)];
+ sum = sum + ( oddFlag * co2 ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx - 1) >> 1)];
+ sum = sum + ( evenFlag * co1) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx ) >> 1)];
+ sum = sum + ( oddFlag * co2 ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx + 1) >> 1)];
+ sum = sum + ( evenFlag * co3) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx + 2) >> 1)];
}
- s_dstPatch[2 + get_local_id(1)][get_local_id(0)] = sum;
+ s_dstPatch[2 + tidy][tidx] = sum;
- if (get_local_id(1) < 2)
+ if (tidy < 2)
{
sum = 0;
if (eveny)
{
- sum = sum + float4_x_float4(int_x_float4(evenFlag , co3) , s_srcPatch[0][1 + ((tidx - 2) >> 1)]);
- sum = sum + float4_x_float4(int_x_float4( oddFlag , co2 ) , s_srcPatch[0][1 + ((tidx - 1) >> 1)]);
- sum = sum + float4_x_float4(int_x_float4(evenFlag , co1 ) , s_srcPatch[0][1 + ((tidx ) >> 1)]);
- sum = sum + float4_x_float4(int_x_float4( oddFlag , co2 ) , s_srcPatch[0][1 + ((tidx + 1) >> 1)]);
- sum = sum + float4_x_float4(int_x_float4(evenFlag , co3) , s_srcPatch[0][1 + ((tidx + 2) >> 1)]);
- }
-
- s_dstPatch[get_local_id(1)][get_local_id(0)] = sum;
- }
-
- if (get_local_id(1) > 13)
- {
+ sum = sum + (evenFlag * co3) * s_srcPatch[lsizey-16][1 + ((tidx - 2) >> 1)];
+ sum = sum + ( oddFlag * co2) * s_srcPatch[lsizey-16][1 + ((tidx - 1) >> 1)];
+ sum = sum + (evenFlag * co1) * s_srcPatch[lsizey-16][1 + ((tidx ) >> 1)];
+ sum = sum + ( oddFlag * co2) * s_srcPatch[lsizey-16][1 + ((tidx + 1) >> 1)];
+ sum = sum + (evenFlag * co3) * s_srcPatch[lsizey-16][1 + ((tidx + 2) >> 1)];
+ }
+
+ s_dstPatch[tidy][tidx] = sum;
+ }
+
+ if (tidy > 13)
+ {
sum = 0;
if (eveny)
{
- sum = sum + float4_x_float4(int_x_float4(evenFlag , co3) , s_srcPatch[9][1 + ((tidx - 2) >> 1)]);
- sum = sum + float4_x_float4(int_x_float4( oddFlag , co2) , s_srcPatch[9][1 + ((tidx - 1) >> 1)]);
- sum = sum + float4_x_float4(int_x_float4(evenFlag , co1) , s_srcPatch[9][1 + ((tidx ) >> 1)]);
- sum = sum + float4_x_float4(int_x_float4( oddFlag , co2) , s_srcPatch[9][1 + ((tidx + 1) >> 1)]);
- sum = sum + float4_x_float4(int_x_float4(evenFlag , co3) , s_srcPatch[9][1 + ((tidx + 2) >> 1)]);
+ sum = sum + (evenFlag * co3) * s_srcPatch[lsizey-7][1 + ((tidx - 2) >> 1)];
+ sum = sum + ( oddFlag * co2) * s_srcPatch[lsizey-7][1 + ((tidx - 1) >> 1)];
+ sum = sum + (evenFlag * co1) * s_srcPatch[lsizey-7][1 + ((tidx ) >> 1)];
+ sum = sum + ( oddFlag * co2) * s_srcPatch[lsizey-7][1 + ((tidx + 1) >> 1)];
+ sum = sum + (evenFlag * co3) * s_srcPatch[lsizey-7][1 + ((tidx + 2) >> 1)];
}
- s_dstPatch[4 + get_local_id(1)][get_local_id(0)] = sum;
- }
+ s_dstPatch[4 + tidy][tidx] = sum;
+ }
barrier(CLK_LOCAL_MEM_FENCE);
- sum = 0;
-
- const int tidy = get_local_id(1);
-
- sum = sum + float4_x_float4(co3 , s_dstPatch[2 + tidy - 2][get_local_id(0)]);
- sum = sum + float4_x_float4(co2 , s_dstPatch[2 + tidy - 1][get_local_id(0)]);
- sum = sum + float4_x_float4(co1 , s_dstPatch[2 + tidy ][get_local_id(0)]);
- sum = sum + float4_x_float4(co2 , s_dstPatch[2 + tidy + 1][get_local_id(0)]);
- sum = sum + float4_x_float4(co3 , s_dstPatch[2 + tidy + 2][get_local_id(0)]);
+ sum = co3 * s_dstPatch[2 + tidy - 2][tidx];
+ sum = sum + co2 * s_dstPatch[2 + tidy - 1][tidx];
+ sum = sum + co1 * s_dstPatch[2 + tidy ][tidx];
+ sum = sum + co2 * s_dstPatch[2 + tidy + 1][tidx];
+ sum = sum + co3 * s_dstPatch[2 + tidy + 2][tidx];
if ((x < dstCols) && (y < dstRows))
{
- dst[x + y * dstStep] = convert_float4_to_uchar4(int_x_float4(4.0f,sum));
+ dst[x + y * dstStep] = convert_float4_to_uchar4(4.0f * sum);
}
}
///////////////////////////////////////////////////////////////////////
__kernel void pyrUp_C4_D2(__global ushort4* src,__global ushort4* dst,
- int srcRows,int dstRows,int srcCols,int dstCols,
- int srcOffset,int dstOffset,int srcStep,int dstStep)
+ int srcRows,int dstRows,int srcCols,int dstCols,
+ int srcOffset,int dstOffset,int srcStep,int dstStep)
{
const int x = get_global_id(0);
const int y = get_global_id(1);
if(eveny)
{
- sum = sum + float4_x_float4(int_x_float4( evenFlag, co3 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 2) >> 1)]);
- sum = sum + float4_x_float4(int_x_float4( oddFlag , co2 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 1) >> 1)]);
- sum = sum + float4_x_float4(int_x_float4( evenFlag, co1 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx ) >> 1)]);
- sum = sum + float4_x_float4(int_x_float4( oddFlag , co2 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 1) >> 1)]);
- sum = sum + float4_x_float4(int_x_float4( evenFlag, co3 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 2) >> 1)]);
+ sum = sum + ( evenFlag* co3 ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 2) >> 1)];
+ sum = sum + ( oddFlag * co2 ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 1) >> 1)];
+ sum = sum + ( evenFlag* co1 ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx ) >> 1)];
+ sum = sum + ( oddFlag * co2 ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 1) >> 1)];
+ sum = sum + ( evenFlag* co3 ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 2) >> 1)];
}
if (eveny)
{
- sum = sum + float4_x_float4(int_x_float4(evenFlag , co3) , s_srcPatch[0][1 + ((tidx - 2) >> 1)]);
- sum = sum + float4_x_float4(int_x_float4( oddFlag , co2 ) , s_srcPatch[0][1 + ((tidx - 1) >> 1)]);
- sum = sum + float4_x_float4(int_x_float4(evenFlag , co1 ) , s_srcPatch[0][1 + ((tidx ) >> 1)]);
- sum = sum + float4_x_float4(int_x_float4( oddFlag , co2 ) , s_srcPatch[0][1 + ((tidx + 1) >> 1)]);
- sum = sum + float4_x_float4(int_x_float4(evenFlag , co3) , s_srcPatch[0][1 + ((tidx + 2) >> 1)]);
- }
-
- s_dstPatch[get_local_id(1)][get_local_id(0)] = sum;
- }
-
- if (get_local_id(1) > 13)
- {
+ sum = sum + (evenFlag * co3) * s_srcPatch[0][1 + ((tidx - 2) >> 1)];
+ sum = sum + ( oddFlag * co2 ) * s_srcPatch[0][1 + ((tidx - 1) >> 1)];
+ sum = sum + (evenFlag * co1 ) * s_srcPatch[0][1 + ((tidx ) >> 1)];
+ sum = sum + ( oddFlag * co2 ) * s_srcPatch[0][1 + ((tidx + 1) >> 1)];
+ sum = sum + (evenFlag * co3) * s_srcPatch[0][1 + ((tidx + 2) >> 1)];
+ }
+
+ s_dstPatch[get_local_id(1)][get_local_id(0)] = sum;
+ }
+
+ if (get_local_id(1) > 13)
+ {
sum = 0;
if (eveny)
{
- sum = sum + float4_x_float4(int_x_float4(evenFlag , co3) , s_srcPatch[9][1 + ((tidx - 2) >> 1)]);
- sum = sum + float4_x_float4(int_x_float4( oddFlag , co2) , s_srcPatch[9][1 + ((tidx - 1) >> 1)]);
- sum = sum + float4_x_float4(int_x_float4(evenFlag , co1) , s_srcPatch[9][1 + ((tidx ) >> 1)]);
- sum = sum + float4_x_float4(int_x_float4( oddFlag , co2) , s_srcPatch[9][1 + ((tidx + 1) >> 1)]);
- sum = sum + float4_x_float4(int_x_float4(evenFlag , co3) , s_srcPatch[9][1 + ((tidx + 2) >> 1)]);
+ sum = sum + (evenFlag * co3) * s_srcPatch[9][1 + ((tidx - 2) >> 1)];
+ sum = sum + ( oddFlag * co2) * s_srcPatch[9][1 + ((tidx - 1) >> 1)];
+ sum = sum + (evenFlag * co1) * s_srcPatch[9][1 + ((tidx ) >> 1)];
+ sum = sum + ( oddFlag * co2) * s_srcPatch[9][1 + ((tidx + 1) >> 1)];
+ sum = sum + (evenFlag * co3) * s_srcPatch[9][1 + ((tidx + 2) >> 1)];
}
s_dstPatch[4 + get_local_id(1)][get_local_id(0)] = sum;
- }
+ }
barrier(CLK_LOCAL_MEM_FENCE);
const int tidy = get_local_id(1);
- sum = sum + float4_x_float4(co3 , s_dstPatch[2 + tidy - 2][get_local_id(0)]);
- sum = sum + float4_x_float4(co2 , s_dstPatch[2 + tidy - 1][get_local_id(0)]);
- sum = sum + float4_x_float4(co1 , s_dstPatch[2 + tidy ][get_local_id(0)]);
- sum = sum + float4_x_float4(co2 , s_dstPatch[2 + tidy + 1][get_local_id(0)]);
- sum = sum + float4_x_float4(co3 , s_dstPatch[2 + tidy + 2][get_local_id(0)]);
+ sum = sum + co3 * s_dstPatch[2 + tidy - 2][get_local_id(0)];
+ sum = sum + co2 * s_dstPatch[2 + tidy - 1][get_local_id(0)];
+ sum = sum + co1 * s_dstPatch[2 + tidy ][get_local_id(0)];
+ sum = sum + co2 * s_dstPatch[2 + tidy + 1][get_local_id(0)];
+ sum = sum + co3 * s_dstPatch[2 + tidy + 2][get_local_id(0)];
if ((x < dstCols) && (y < dstRows))
{
- dst[x + y * dstStep] = convert_float4_to_ushort4(int_x_float4(4.0f,sum));
+ dst[x + y * dstStep] = convert_float4_to_ushort4(4.0f * sum);
}
}
////////////////////////// CV_32FC4 //////////////////////////////////
///////////////////////////////////////////////////////////////////////
__kernel void pyrUp_C4_D5(__global float4* src,__global float4* dst,
- int srcRows,int dstRows,int srcCols,int dstCols,
- int srcOffset,int dstOffset,int srcStep,int dstStep)
+ int srcRows,int dstRows,int srcCols,int dstCols,
+ int srcOffset,int dstOffset,int srcStep,int dstStep)
{
const int x = get_global_id(0);
const int y = get_global_id(1);
-
+ const int tidx = get_local_id(0);
+ const int tidy = get_local_id(1);
+ const int lsizex = get_local_size(0);
+ const int lsizey = get_local_size(1);
__local float4 s_srcPatch[10][10];
__local float4 s_dstPatch[20][16];
dstStep >>= 4;
- if( get_local_id(0) < 10 && get_local_id(1) < 10 )
+ if( tidx < 10 && tidy < 10 )
{
- int srcx = (int)(get_group_id(0) * get_local_size(0) / 2 + get_local_id(0)) - 1;
- int srcy = (int)(get_group_id(1) * get_local_size(1) / 2 + get_local_id(1)) - 1;
+ int srcx = (int)(get_group_id(0) * get_local_size(0) / 2 + tidx) - 1;
+ int srcy = (int)(get_group_id(1) * get_local_size(1) / 2 + tidy) - 1;
srcx = abs(srcx);
srcx = min(srcCols - 1,srcx);
srcy = abs(srcy);
srcy = min(srcRows -1 ,srcy);
- s_srcPatch[get_local_id(1)][get_local_id(0)] = (float4)(src[srcx + srcy * srcStep]);
+ s_srcPatch[tidy][tidx] = (float4)(src[srcx + srcy * srcStep]);
}
barrier(CLK_LOCAL_MEM_FENCE);
float4 sum = (float4)(0,0,0,0);
- const int evenFlag = (int)((get_local_id(0) & 1) == 0);
- const int oddFlag = (int)((get_local_id(0) & 1) != 0);
- const bool eveny = ((get_local_id(1) & 1) == 0);
- const int tidx = get_local_id(0);
+ const int evenFlag = (int)((tidx & 1) == 0);
+ const int oddFlag = (int)((tidx & 1) != 0);
+ const bool eveny = ((tidy & 1) == 0);
float4 co1 = (float4)(0.375f, 0.375f, 0.375f, 0.375f);
float4 co2 = (float4)(0.25f, 0.25f, 0.25f, 0.25f);
if(eveny)
{
- sum = sum + float4_x_float4(int_x_float4( evenFlag, co3 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 2) >> 1)]);
- sum = sum + float4_x_float4(int_x_float4( oddFlag , co2 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 1) >> 1)]);
- sum = sum + float4_x_float4(int_x_float4( evenFlag, co1 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx ) >> 1)]);
- sum = sum + float4_x_float4(int_x_float4( oddFlag , co2 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 1) >> 1)]);
- sum = sum + float4_x_float4(int_x_float4( evenFlag, co3 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 2) >> 1)]);
+ sum = sum + ( evenFlag* co3 ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx - 2) >> 1)];
+ sum = sum + ( oddFlag * co2 ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx - 1) >> 1)];
+ sum = sum + ( evenFlag* co1 ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx ) >> 1)];
+ sum = sum + ( oddFlag * co2 ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx + 1) >> 1)];
+ sum = sum + ( evenFlag* co3 ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx + 2) >> 1)];
}
- s_dstPatch[2 + get_local_id(1)][get_local_id(0)] = sum;
+ s_dstPatch[2 + tidy][tidx] = sum;
- if (get_local_id(1) < 2)
+ if (tidy < 2)
{
sum = 0;
if (eveny)
{
- sum = sum + float4_x_float4(int_x_float4(evenFlag , co3) , s_srcPatch[0][1 + ((tidx - 2) >> 1)]);
- sum = sum + float4_x_float4(int_x_float4( oddFlag , co2 ) , s_srcPatch[0][1 + ((tidx - 1) >> 1)]);
- sum = sum + float4_x_float4(int_x_float4(evenFlag , co1 ) , s_srcPatch[0][1 + ((tidx ) >> 1)]);
- sum = sum + float4_x_float4(int_x_float4( oddFlag , co2 ) , s_srcPatch[0][1 + ((tidx + 1) >> 1)]);
- sum = sum + float4_x_float4(int_x_float4(evenFlag , co3) , s_srcPatch[0][1 + ((tidx + 2) >> 1)]);
- }
-
- s_dstPatch[get_local_id(1)][get_local_id(0)] = sum;
- }
-
- if (get_local_id(1) > 13)
- {
+ sum = sum + (evenFlag * co3) * s_srcPatch[lsizey-16][1 + ((tidx - 2) >> 1)];
+ sum = sum + ( oddFlag * co2 ) * s_srcPatch[lsizey-16][1 + ((tidx - 1) >> 1)];
+ sum = sum + (evenFlag * co1 ) * s_srcPatch[lsizey-16][1 + ((tidx ) >> 1)];
+ sum = sum + ( oddFlag * co2 ) * s_srcPatch[lsizey-16][1 + ((tidx + 1) >> 1)];
+ sum = sum + (evenFlag * co3) * s_srcPatch[lsizey-16][1 + ((tidx + 2) >> 1)];
+ }
+
+ s_dstPatch[tidy][tidx] = sum;
+ }
+
+ if (tidy > 13)
+ {
sum = 0;
if (eveny)
{
- sum = sum + float4_x_float4(int_x_float4(evenFlag , co3) , s_srcPatch[9][1 + ((tidx - 2) >> 1)]);
- sum = sum + float4_x_float4(int_x_float4( oddFlag , co2) , s_srcPatch[9][1 + ((tidx - 1) >> 1)]);
- sum = sum + float4_x_float4(int_x_float4(evenFlag , co1) , s_srcPatch[9][1 + ((tidx ) >> 1)]);
- sum = sum + float4_x_float4(int_x_float4( oddFlag , co2) , s_srcPatch[9][1 + ((tidx + 1) >> 1)]);
- sum = sum + float4_x_float4(int_x_float4(evenFlag , co3) , s_srcPatch[9][1 + ((tidx + 2) >> 1)]);
+ sum = sum + (evenFlag * co3) * s_srcPatch[lsizey-7][1 + ((tidx - 2) >> 1)];
+ sum = sum + ( oddFlag * co2) * s_srcPatch[lsizey-7][1 + ((tidx - 1) >> 1)];
+ sum = sum + (evenFlag * co1) * s_srcPatch[lsizey-7][1 + ((tidx ) >> 1)];
+ sum = sum + ( oddFlag * co2) * s_srcPatch[lsizey-7][1 + ((tidx + 1) >> 1)];
+ sum = sum + (evenFlag * co3) * s_srcPatch[lsizey-7][1 + ((tidx + 2) >> 1)];
}
- s_dstPatch[4 + get_local_id(1)][get_local_id(0)] = sum;
- }
+ s_dstPatch[4 + tidy][tidx] = sum;
+ }
barrier(CLK_LOCAL_MEM_FENCE);
- sum = 0;
-
- const int tidy = get_local_id(1);
-
- sum = sum + float4_x_float4(co3 , s_dstPatch[2 + tidy - 2][get_local_id(0)]);
- sum = sum + float4_x_float4(co2 , s_dstPatch[2 + tidy - 1][get_local_id(0)]);
- sum = sum + float4_x_float4(co1 , s_dstPatch[2 + tidy ][get_local_id(0)]);
- sum = sum + float4_x_float4(co2 , s_dstPatch[2 + tidy + 1][get_local_id(0)]);
- sum = sum + float4_x_float4(co3 , s_dstPatch[2 + tidy + 2][get_local_id(0)]);
+ sum = co3 * s_dstPatch[2 + tidy - 2][tidx];
+ sum = sum + co2 * s_dstPatch[2 + tidy - 1][tidx];
+ sum = sum + co1 * s_dstPatch[2 + tidy ][tidx];
+ sum = sum + co2 * s_dstPatch[2 + tidy + 1][tidx];
+ sum = sum + co3 * s_dstPatch[2 + tidy + 2][tidx];
if ((x < dstCols) && (y < dstRows))
{