1 /*M///////////////////////////////////////////////////////////////////////////////////////
3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
5 // By downloading, copying, installing or using the software you agree to this license.
6 // If you do not agree to this license, do not download, install,
7 // copy or use the software.
11 // For Open Source Computer Vision Library
13 // Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
14 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
15 // Third party copyrights are property of their respective owners.
18 // Sen Liu, swjtuls1987@126.com
20 // Redistribution and use in source and binary forms, with or without modification,
21 // are permitted provided that the following conditions are met:
23 // * Redistribution's of source code must retain the above copyright notice,
24 // this list of conditions and the following disclaimer.
26 // * Redistribution's in binary form must reproduce the above copyright notice,
27 // this list of conditions and the following disclaimer in the documentation
28 // and/or other oclMaterials provided with the distribution.
30 // * The name of the copyright holders may not be used to endorse or promote products
31 // derived from this software without specific prior written permission.
33 // This software is provided by the copyright holders and contributors as is and
34 // any express or implied warranties, including, but not limited to, the implied
35 // warranties of merchantability and fitness for a particular purpose are disclaimed.
36 // In no event shall the Intel Corporation or contributors be liable for any direct,
37 // indirect, incidental, special, exemplary, or consequential damages
38 // (including, but not limited to, procurement of substitute goods or services;
39 // loss of use, data, or profits; or business interruption) however caused
40 // and on any theory of liability, whether in contract, strict liability,
41 // or tort (including negligence or otherwise) arising in any way out of
42 // the use of this software, even if advised of the possibility of such damage.
46 #if defined (DOUBLE_SUPPORT)
49 #pragma OPENCL EXTENSION cl_khr_fp64:enable
50 #elif defined (cl_amd_fp64)
51 #pragma OPENCL EXTENSION cl_amd_fp64:enable
56 #define convert_F4 convert_double4
62 #define convert_F4 convert_float4
76 __kernel void icvContourMoments(int contour_total,
77 __global float* reader_oclmat_data,
81 T xi_1, yi_1, xi_12, yi_12, xi, yi, xi2, yi2, dxy, xii_1, yii_1;
82 int idx = get_global_id(0);
84 if (idx < 0 || idx >= contour_total)
87 xi_1 = (T)(*(reader_oclmat_data + (get_global_id(0) << 1)));
88 yi_1 = (T)(*(reader_oclmat_data + (get_global_id(0) << 1) + 1));
92 if(idx == contour_total - 1)
94 xi = (T)(*(reader_oclmat_data));
95 yi = (T)(*(reader_oclmat_data + 1));
99 xi = (T)(*(reader_oclmat_data + (idx + 1) * 2));
100 yi = (T)(*(reader_oclmat_data + (idx + 1) * 2 + 1));
105 dxy = xi_1 * yi - xi * yi_1;
109 dst_step /= sizeof(T);
110 *( dst_a + DST_ROW_00 * dst_step + idx) = dxy;
111 *( dst_a + DST_ROW_10 * dst_step + idx) = dxy * xii_1;
112 *( dst_a + DST_ROW_01 * dst_step + idx) = dxy * yii_1;
113 *( dst_a + DST_ROW_20 * dst_step + idx) = dxy * (xi_1 * xii_1 + xi2);
114 *( dst_a + DST_ROW_11 * dst_step + idx) = dxy * (xi_1 * (yii_1 + yi_1) + xi * (yii_1 + yi));
115 *( dst_a + DST_ROW_02 * dst_step + idx) = dxy * (yi_1 * yii_1 + yi2);
116 *( dst_a + DST_ROW_30 * dst_step + idx) = dxy * xii_1 * (xi_12 + xi2);
117 *( dst_a + DST_ROW_03 * dst_step + idx) = dxy * yii_1 * (yi_12 + yi2);
118 *( dst_a + DST_ROW_21 * dst_step + idx) =
119 dxy * (xi_12 * (3 * yi_1 + yi) + 2 * xi * xi_1 * yii_1 +
120 xi2 * (yi_1 + 3 * yi));
121 *( dst_a + DST_ROW_12 * dst_step + idx) =
122 dxy * (yi_12 * (3 * xi_1 + xi) + 2 * yi * yi_1 * xii_1 +
123 yi2 * (xi_1 + 3 * xi));
126 __kernel void dst_sum(int src_rows, int src_cols, int tile_height, int tile_width, int TILE_SIZE,
127 __global F* sum, __global F* dst_m, int dst_step)
129 int gidy = get_global_id(0);
130 int gidx = get_global_id(1);
131 int block_y = src_rows/tile_height;
132 int block_x = src_cols/tile_width;
135 if(src_rows > TILE_SIZE && src_rows % TILE_SIZE != 0)
137 if(src_cols > TILE_SIZE && src_cols % TILE_SIZE != 0)
139 block_num = block_y * block_x;
140 __local F dst_sum[10][128];
141 if(gidy<128-block_num)
142 for(int i=0; i<10; i++)
143 dst_sum[i][gidy+block_num]=0;
144 barrier(CLK_LOCAL_MEM_FENCE);
146 dst_step /= sizeof(F);
149 dst_sum[0][gidy] = *(dst_m + mad24(DST_ROW_00 * block_y, dst_step, gidy));
150 dst_sum[1][gidy] = *(dst_m + mad24(DST_ROW_10 * block_y, dst_step, gidy));
151 dst_sum[2][gidy] = *(dst_m + mad24(DST_ROW_01 * block_y, dst_step, gidy));
152 dst_sum[3][gidy] = *(dst_m + mad24(DST_ROW_20 * block_y, dst_step, gidy));
153 dst_sum[4][gidy] = *(dst_m + mad24(DST_ROW_11 * block_y, dst_step, gidy));
154 dst_sum[5][gidy] = *(dst_m + mad24(DST_ROW_02 * block_y, dst_step, gidy));
155 dst_sum[6][gidy] = *(dst_m + mad24(DST_ROW_30 * block_y, dst_step, gidy));
156 dst_sum[7][gidy] = *(dst_m + mad24(DST_ROW_21 * block_y, dst_step, gidy));
157 dst_sum[8][gidy] = *(dst_m + mad24(DST_ROW_12 * block_y, dst_step, gidy));
158 dst_sum[9][gidy] = *(dst_m + mad24(DST_ROW_03 * block_y, dst_step, gidy));
160 barrier(CLK_LOCAL_MEM_FENCE);
161 for(int lsize=64; lsize>0; lsize>>=1)
165 int lsize2 = gidy + lsize;
166 for(int i=0; i<10; i++)
167 dst_sum[i][gidy] += dst_sum[i][lsize2];
169 barrier(CLK_LOCAL_MEM_FENCE);
172 for(int i=0; i<10; i++)
173 sum[i] = dst_sum[i][0];
176 __kernel void CvMoments_D0(__global uchar16* src_data, int src_rows, int src_cols, int src_step, int tileSize_width, int tileSize_height,
178 int dst_cols, int dst_step, int blocky,
179 int type, int depth, int cn, int coi, int binary, int TILE_SIZE)
181 uchar tmp_coi[16]; // get the coi data
183 int VLEN_C = 16; // vector length of uchar
185 int gidy = get_global_id(0);
186 int gidx = get_global_id(1);
187 int wgidy = get_group_id(0);
188 int wgidx = get_group_id(1);
189 int lidy = get_local_id(0);
190 int lidx = get_local_id(1);
191 int y = wgidy*TILE_SIZE; // vector length of uchar
192 int x = wgidx*TILE_SIZE; // vector length of uchar
193 int kcn = (cn==2)?2:4;
194 int rstep = min(src_step, TILE_SIZE);
195 tileSize_height = min(TILE_SIZE, src_rows - y);
196 tileSize_width = min(TILE_SIZE, src_cols - x);
198 if( tileSize_width < TILE_SIZE )
199 for(int i = tileSize_width; i < rstep; i++ )
200 *((__global uchar*)src_data+(y+lidy)*src_step+x+i) = 0;
201 if( coi > 0 ) //channel of interest
202 for(int i = 0; i < tileSize_width; i += VLEN_C)
204 for(int j=0; j<VLEN_C; j++)
205 tmp_coi[j] = *((__global uchar*)src_data+(y+lidy)*src_step+(x+i+j)*kcn+coi-1);
206 tmp[i/VLEN_C] = (uchar16)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3],tmp_coi[4],tmp_coi[5],tmp_coi[6],tmp_coi[7],
207 tmp_coi[8],tmp_coi[9],tmp_coi[10],tmp_coi[11],tmp_coi[12],tmp_coi[13],tmp_coi[14],tmp_coi[15]);
210 for(int i=0; i < tileSize_width; i+=VLEN_C)
211 tmp[i/VLEN_C] = *(src_data+(y+lidy)*src_step/VLEN_C+(x+i)/VLEN_C);
212 uchar16 zero = (uchar16)(0);
213 uchar16 full = (uchar16)(255);
215 for(int i=0; i < tileSize_width; i+=VLEN_C)
216 tmp[i/VLEN_C] = (tmp[i/VLEN_C]!=zero)?full:zero;
218 __local int m[10][128];
220 for(int i=0; i<10; i++)
221 for(int j=0; j<128; j++)
223 barrier(CLK_LOCAL_MEM_FENCE);
225 int16 x0 = (int16)(0);
226 int16 x1 = (int16)(0);
227 int16 x2 = (int16)(0);
228 int16 x3 = (int16)(0);
229 for( int xt = 0 ; xt < tileSize_width; xt+=(VLEN_C) )
231 int16 v_xt = (int16)(xt, xt+1, xt+2, xt+3, xt+4, xt+5, xt+6, xt+7, xt+8, xt+9, xt+10, xt+11, xt+12, xt+13, xt+14, xt+15);
232 int16 p = convert_int16(tmp[xt/VLEN_C]);
233 int16 xp = v_xt * p, xxp = xp *v_xt;
239 x0.s0 += x0.s1 + x0.s2 + x0.s3 + x0.s4 + x0.s5 + x0.s6 + x0.s7 + x0.s8 + x0.s9 + x0.sa + x0.sb + x0.sc + x0.sd + x0.se + x0.sf;
240 x1.s0 += x1.s1 + x1.s2 + x1.s3 + x1.s4 + x1.s5 + x1.s6 + x1.s7 + x1.s8 + x1.s9 + x1.sa + x1.sb + x1.sc + x1.sd + x1.se + x1.sf;
241 x2.s0 += x2.s1 + x2.s2 + x2.s3 + x2.s4 + x2.s5 + x2.s6 + x2.s7 + x2.s8 + x2.s9 + x2.sa + x2.sb + x2.sc + x2.sd + x2.se + x2.sf;
242 x3.s0 += x3.s1 + x3.s2 + x3.s3 + x3.s4 + x3.s5 + x3.s6 + x3.s7 + x3.s8 + x3.s9 + x3.sa + x3.sb + x3.sc + x3.sd + x3.se + x3.sf;
243 int py = lidy * ((int)x0.s0);
245 int bheight = min(tileSize_height, TILE_SIZE/2);
246 if(bheight >= TILE_SIZE/2&&lidy > bheight-1&&lidy < tileSize_height)
248 m[9][lidy-bheight] = ((int)py) * sy; // m03
249 m[8][lidy-bheight] = ((int)x1.s0) * sy; // m12
250 m[7][lidy-bheight] = ((int)x2.s0) * lidy; // m21
251 m[6][lidy-bheight] = x3.s0; // m30
252 m[5][lidy-bheight] = x0.s0 * sy; // m02
253 m[4][lidy-bheight] = x1.s0 * lidy; // m11
254 m[3][lidy-bheight] = x2.s0; // m20
255 m[2][lidy-bheight] = py; // m01
256 m[1][lidy-bheight] = x1.s0; // m10
257 m[0][lidy-bheight] = x0.s0; // m00
259 else if(lidy < bheight)
261 lm[9] = ((int)py) * sy; // m03
262 lm[8] = ((int)x1.s0) * sy; // m12
263 lm[7] = ((int)x2.s0) * lidy; // m21
264 lm[6] = x3.s0; // m30
265 lm[5] = x0.s0 * sy; // m02
266 lm[4] = x1.s0 * lidy; // m11
267 lm[3] = x2.s0; // m20
269 lm[1] = x1.s0; // m10
270 lm[0] = x0.s0; // m00
272 barrier(CLK_LOCAL_MEM_FENCE);
273 for( int j = bheight; j >= 1; j = j/2 )
276 for( int i = 0; i < 10; i++ )
277 lm[i] = lm[i] + m[i][lidy];
278 barrier(CLK_LOCAL_MEM_FENCE);
279 if(lidy >= j/2&&lidy < j)
280 for( int i = 0; i < 10; i++ )
281 m[i][lidy-j/2] = lm[i];
282 barrier(CLK_LOCAL_MEM_FENCE);
284 if(lidy == 0&&lidx == 0)
286 for( int mt = 0; mt < 10; mt++ )
291 for( int mt = 0; mt < 10; mt++ )
294 F xm = x * mom[0], ym = y * mom[0];
296 // accumulate moments computed in each tile
297 dst_step /= sizeof(F);
300 *(dst_m + mad24(DST_ROW_00 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[0];
302 // + m10 ( = m10' + x*m00' )
303 *(dst_m + mad24(DST_ROW_10 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[1] + xm;
305 // + m01 ( = m01' + y*m00' )
306 *(dst_m + mad24(DST_ROW_01 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[2] + ym;
308 // + m20 ( = m20' + 2*x*m10' + x*x*m00' )
309 *(dst_m + mad24(DST_ROW_20 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[3] + x * (mom[1] * 2 + xm);
311 // + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' )
312 *(dst_m + mad24(DST_ROW_11 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[4] + x * (mom[2] + ym) + y * mom[1];
314 // + m02 ( = m02' + 2*y*m01' + y*y*m00' )
315 *(dst_m + mad24(DST_ROW_02 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[5] + y * (mom[2] * 2 + ym);
317 // + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' )
318 *(dst_m + mad24(DST_ROW_30 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm));
320 // + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20')
321 *(dst_m + mad24(DST_ROW_21 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3];
323 // + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02')
324 *(dst_m + mad24(DST_ROW_12 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5];
326 // + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' )
327 *(dst_m + mad24(DST_ROW_03 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym));
331 __kernel void CvMoments_D2(__global ushort8* src_data, int src_rows, int src_cols, int src_step, int tileSize_width, int tileSize_height,
333 int dst_cols, int dst_step, int blocky,
334 int type, int depth, int cn, int coi, int binary, const int TILE_SIZE)
336 ushort tmp_coi[8]; // get the coi data
338 int VLEN_US = 8; // vector length of ushort
339 int gidy = get_global_id(0);
340 int gidx = get_global_id(1);
341 int wgidy = get_group_id(0);
342 int wgidx = get_group_id(1);
343 int lidy = get_local_id(0);
344 int lidx = get_local_id(1);
345 int y = wgidy*TILE_SIZE; // real Y index of pixel
346 int x = wgidx*TILE_SIZE; // real X index of pixel
347 int kcn = (cn==2)?2:4;
348 int rstep = min(src_step/2, TILE_SIZE);
349 tileSize_height = min(TILE_SIZE, src_rows - y);
350 tileSize_width = min(TILE_SIZE, src_cols -x);
351 if(src_cols > TILE_SIZE && tileSize_width < TILE_SIZE)
352 for(int i=tileSize_width; i < rstep; i++ )
353 *((__global ushort*)src_data+(y+lidy)*src_step/2+x+i) = 0;
355 for(int i=0; i < tileSize_width; i+=VLEN_US)
357 for(int j=0; j<VLEN_US; j++)
358 tmp_coi[j] = *((__global ushort*)src_data+(y+lidy)*(int)src_step/2+(x+i+j)*kcn+coi-1);
359 tmp[i/VLEN_US] = (ushort8)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3],tmp_coi[4],tmp_coi[5],tmp_coi[6],tmp_coi[7]);
362 for(int i=0; i < tileSize_width; i+=VLEN_US)
363 tmp[i/VLEN_US] = *(src_data+(y+lidy)*src_step/(2*VLEN_US)+(x+i)/VLEN_US);
364 ushort8 zero = (ushort8)(0);
365 ushort8 full = (ushort8)(255);
367 for(int i=0; i < tileSize_width; i+=VLEN_US)
368 tmp[i/VLEN_US] = (tmp[i/VLEN_US]!=zero)?full:zero;
370 __local long m[10][128];
372 for(int i=0; i<10; i++)
373 for(int j=0; j<128; j++)
375 barrier(CLK_LOCAL_MEM_FENCE);
380 long8 x3 = (long8)(0);
381 for( int xt = 0 ; xt < tileSize_width; xt+=(VLEN_US) )
383 int8 v_xt = (int8)(xt, xt+1, xt+2, xt+3, xt+4, xt+5, xt+6, xt+7);
384 int8 p = convert_int8(tmp[xt/VLEN_US]);
385 int8 xp = v_xt * p, xxp = xp * v_xt;
389 x3 += convert_long8(xxp) *convert_long8(v_xt);
391 x0.s0 += x0.s1 + x0.s2 + x0.s3 + x0.s4 + x0.s5 + x0.s6 + x0.s7;
392 x1.s0 += x1.s1 + x1.s2 + x1.s3 + x1.s4 + x1.s5 + x1.s6 + x1.s7;
393 x2.s0 += x2.s1 + x2.s2 + x2.s3 + x2.s4 + x2.s5 + x2.s6 + x2.s7;
394 x3.s0 += x3.s1 + x3.s2 + x3.s3 + x3.s4 + x3.s5 + x3.s6 + x3.s7;
396 int py = lidy * x0.s0, sy = lidy*lidy;
397 int bheight = min(tileSize_height, TILE_SIZE/2);
398 if(bheight >= TILE_SIZE/2&&lidy > bheight-1&&lidy < tileSize_height)
400 m[9][lidy-bheight] = ((long)py) * sy; // m03
401 m[8][lidy-bheight] = ((long)x1.s0) * sy; // m12
402 m[7][lidy-bheight] = ((long)x2.s0) * lidy; // m21
403 m[6][lidy-bheight] = x3.s0; // m30
404 m[5][lidy-bheight] = x0.s0 * sy; // m02
405 m[4][lidy-bheight] = x1.s0 * lidy; // m11
406 m[3][lidy-bheight] = x2.s0; // m20
407 m[2][lidy-bheight] = py; // m01
408 m[1][lidy-bheight] = x1.s0; // m10
409 m[0][lidy-bheight] = x0.s0; // m00
411 else if(lidy < bheight)
413 lm[9] = ((long)py) * sy; // m03
414 lm[8] = ((long)x1.s0) * sy; // m12
415 lm[7] = ((long)x2.s0) * lidy; // m21
416 lm[6] = x3.s0; // m30
417 lm[5] = x0.s0 * sy; // m02
418 lm[4] = x1.s0 * lidy; // m11
419 lm[3] = x2.s0; // m20
421 lm[1] = x1.s0; // m10
422 lm[0] = x0.s0; // m00
424 barrier(CLK_LOCAL_MEM_FENCE);
425 for( int j = TILE_SIZE/2; j >= 1; j = j/2 )
428 for( int i = 0; i < 10; i++ )
429 lm[i] = lm[i] + m[i][lidy];
430 barrier(CLK_LOCAL_MEM_FENCE);
431 if(lidy >= j/2&&lidy < j)
432 for( int i = 0; i < 10; i++ )
433 m[i][lidy-j/2] = lm[i];
434 barrier(CLK_LOCAL_MEM_FENCE);
436 if(lidy == 0&&lidx == 0)
438 for(int mt = 0; mt < 10; mt++ )
444 for( int mt = 0; mt < 10; mt++ )
448 F xm = x *mom[0], ym = y * mom[0];
450 // accumulate moments computed in each tile
451 dst_step /= sizeof(F);
454 *(dst_m + mad24(DST_ROW_00 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[0];
456 // + m10 ( = m10' + x*m00' )
457 *(dst_m + mad24(DST_ROW_10 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[1] + xm;
459 // + m01 ( = m01' + y*m00' )
460 *(dst_m + mad24(DST_ROW_01 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[2] + ym;
462 // + m20 ( = m20' + 2*x*m10' + x*x*m00' )
463 *(dst_m + mad24(DST_ROW_20 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[3] + x * (mom[1] * 2 + xm);
465 // + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' )
466 *(dst_m + mad24(DST_ROW_11 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[4] + x * (mom[2] + ym) + y * mom[1];
468 // + m02 ( = m02' + 2*y*m01' + y*y*m00' )
469 *(dst_m + mad24(DST_ROW_02 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[5] + y * (mom[2] * 2 + ym);
471 // + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' )
472 *(dst_m + mad24(DST_ROW_30 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm));
474 // + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20')
475 *(dst_m + mad24(DST_ROW_21 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3];
477 // + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02')
478 *(dst_m + mad24(DST_ROW_12 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5];
480 // + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' )
481 *(dst_m + mad24(DST_ROW_03 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym));
485 __kernel void CvMoments_D3(__global short8* src_data, int src_rows, int src_cols, int src_step, int tileSize_width, int tileSize_height,
487 int dst_cols, int dst_step, int blocky,
488 int type, int depth, int cn, int coi, int binary, const int TILE_SIZE)
490 short tmp_coi[8]; // get the coi data
492 int VLEN_S =8; // vector length of short
493 int gidy = get_global_id(0);
494 int gidx = get_global_id(1);
495 int wgidy = get_group_id(0);
496 int wgidx = get_group_id(1);
497 int lidy = get_local_id(0);
498 int lidx = get_local_id(1);
499 int y = wgidy*TILE_SIZE; // real Y index of pixel
500 int x = wgidx*TILE_SIZE; // real X index of pixel
501 int kcn = (cn==2)?2:4;
502 int rstep = min(src_step/2, TILE_SIZE);
503 tileSize_height = min(TILE_SIZE, src_rows - y);
504 tileSize_width = min(TILE_SIZE, src_cols -x);
505 if(tileSize_width < TILE_SIZE)
506 for(int i = tileSize_width; i < rstep; i++ )
507 *((__global short*)src_data+(y+lidy)*src_step/2+x+i) = 0;
509 for(int i=0; i < tileSize_width; i+=VLEN_S)
511 for(int j=0; j<VLEN_S; j++)
512 tmp_coi[j] = *((__global short*)src_data+(y+lidy)*src_step/2+(x+i+j)*kcn+coi-1);
513 tmp[i/VLEN_S] = (short8)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3],tmp_coi[4],tmp_coi[5],tmp_coi[6],tmp_coi[7]);
516 for(int i=0; i < tileSize_width; i+=VLEN_S)
517 tmp[i/VLEN_S] = *(src_data+(y+lidy)*src_step/(2*VLEN_S)+(x+i)/VLEN_S);
518 short8 zero = (short8)(0);
519 short8 full = (short8)(255);
521 for(int i=0; i < tileSize_width; i+=(VLEN_S))
522 tmp[i/VLEN_S] = (tmp[i/VLEN_S]!=zero)?full:zero;
525 __local long m[10][128];
527 for(int i=0; i<10; i++)
528 for(int j=0; j<128; j++)
530 barrier(CLK_LOCAL_MEM_FENCE);
535 long8 x3 = (long8)(0);
536 for( int xt = 0 ; xt < tileSize_width; xt+= (VLEN_S))
538 int8 v_xt = (int8)(xt, xt+1, xt+2, xt+3, xt+4, xt+5, xt+6, xt+7);
539 int8 p = convert_int8(tmp[xt/VLEN_S]);
540 int8 xp = v_xt * p, xxp = xp * v_xt;
544 x3 += convert_long8(xxp) * convert_long8(v_xt);
546 x0.s0 += x0.s1 + x0.s2 + x0.s3 + x0.s4 + x0.s5 + x0.s6 + x0.s7;
547 x1.s0 += x1.s1 + x1.s2 + x1.s3 + x1.s4 + x1.s5 + x1.s6 + x1.s7;
548 x2.s0 += x2.s1 + x2.s2 + x2.s3 + x2.s4 + x2.s5 + x2.s6 + x2.s7;
549 x3.s0 += x3.s1 + x3.s2 + x3.s3 + x3.s4 + x3.s5 + x3.s6 + x3.s7;
551 int py = lidy * x0.s0, sy = lidy*lidy;
552 int bheight = min(tileSize_height, TILE_SIZE/2);
553 if(bheight >= TILE_SIZE/2&&lidy > bheight-1&&lidy < tileSize_height)
555 m[9][lidy-bheight] = ((long)py) * sy; // m03
556 m[8][lidy-bheight] = ((long)x1.s0) * sy; // m12
557 m[7][lidy-bheight] = ((long)x2.s0) * lidy; // m21
558 m[6][lidy-bheight] = x3.s0; // m30
559 m[5][lidy-bheight] = x0.s0 * sy; // m02
560 m[4][lidy-bheight] = x1.s0 * lidy; // m11
561 m[3][lidy-bheight] = x2.s0; // m20
562 m[2][lidy-bheight] = py; // m01
563 m[1][lidy-bheight] = x1.s0; // m10
564 m[0][lidy-bheight] = x0.s0; // m00
566 else if(lidy < bheight)
568 lm[9] = ((long)py) * sy; // m03
569 lm[8] = ((long)(x1.s0)) * sy; // m12
570 lm[7] = ((long)(x2.s0)) * lidy; // m21
571 lm[6] = x3.s0; // m30
572 lm[5] = x0.s0 * sy; // m02
573 lm[4] = x1.s0 * lidy; // m11
574 lm[3] = x2.s0; // m20
576 lm[1] = x1.s0; // m10
577 lm[0] = x0.s0; // m00
579 barrier(CLK_LOCAL_MEM_FENCE);
580 for( int j = TILE_SIZE/2; j >=1; j = j/2 )
583 for( int i = 0; i < 10; i++ )
584 lm[i] = lm[i] + m[i][lidy];
585 barrier(CLK_LOCAL_MEM_FENCE);
586 if(lidy >= j/2&&lidy < j)
587 for( int i = 0; i < 10; i++ )
588 m[i][lidy-j/2] = lm[i];
589 barrier(CLK_LOCAL_MEM_FENCE);
591 if(lidy ==0 &&lidx ==0)
593 for(int mt = 0; mt < 10; mt++ )
599 for( int mt = 0; mt < 10; mt++ )
603 F xm = x * mom[0], ym = y*mom[0];
605 // accumulate moments computed in each tile
606 dst_step /= sizeof(F);
609 *(dst_m + mad24(DST_ROW_00 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[0];
611 // + m10 ( = m10' + x*m00' )
612 *(dst_m + mad24(DST_ROW_10 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[1] + xm;
614 // + m01 ( = m01' + y*m00' )
615 *(dst_m + mad24(DST_ROW_01 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[2] + ym;
617 // + m20 ( = m20' + 2*x*m10' + x*x*m00' )
618 *(dst_m + mad24(DST_ROW_20 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[3] + x * (mom[1] * 2 + xm);
620 // + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' )
621 *(dst_m + mad24(DST_ROW_11 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[4] + x * (mom[2] + ym) + y * mom[1];
623 // + m02 ( = m02' + 2*y*m01' + y*y*m00' )
624 *(dst_m + mad24(DST_ROW_02 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[5] + y * (mom[2] * 2 + ym);
626 // + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' )
627 *(dst_m + mad24(DST_ROW_30 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm));
629 // + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20')
630 *(dst_m + mad24(DST_ROW_21 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3];
632 // + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02')
633 *(dst_m + mad24(DST_ROW_12 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5];
635 // + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' )
636 *(dst_m + mad24(DST_ROW_03 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym));
640 __kernel void CvMoments_D5( __global float* src_data, int src_rows, int src_cols, int src_step, int tileSize_width, int tileSize_height,
642 int dst_cols, int dst_step, int blocky,
643 int type, int depth, int cn, int coi, int binary, const int TILE_SIZE)
645 float tmp_coi[4]; // get the coi data
647 int VLEN_F = 4; // vector length of float
648 int gidy = get_global_id(0);
649 int gidx = get_global_id(1);
650 int wgidy = get_group_id(0);
651 int wgidx = get_group_id(1);
652 int lidy = get_local_id(0);
653 int lidx = get_local_id(1);
654 int y = wgidy*TILE_SIZE; // real Y index of pixel
655 int x = wgidx*TILE_SIZE; // real X index of pixel
656 int kcn = (cn==2)?2:4;
657 src_step /= sizeof(*src_data);
658 int rstep = min(src_step, TILE_SIZE);
659 tileSize_height = min(TILE_SIZE, src_rows - y);
660 tileSize_width = min(TILE_SIZE, src_cols -x);
661 int maxIdx = mul24(src_rows, src_cols);
662 int yOff = (y+lidy)*src_step;
664 if(tileSize_width < TILE_SIZE && yOff < src_rows)
665 for(int i = tileSize_width; i < rstep && (yOff+x+i) < maxIdx; i++ )
666 *(src_data+yOff+x+i) = 0;
668 for(int i=0; i < tileSize_width; i+=VLEN_F)
671 for(int j=0; j<4; j++)
673 index = yOff+(x+i+j)*kcn+coi-1;
675 tmp_coi[j] = *(src_data+index);
679 tmp[i/VLEN_F] = (float4)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3]);
682 for(int i=0; i < tileSize_width && (yOff+x+i) < maxIdx; i+=VLEN_F)
683 tmp[i/VLEN_F] = (*(__global float4 *)(src_data+yOff+x+i));
684 float4 zero = (float4)(0);
685 float4 full = (float4)(255);
687 for(int i=0; i < tileSize_width; i+=4)
688 tmp[i/VLEN_F] = (tmp[i/VLEN_F]!=zero)?full:zero;
690 __local F m[10][128];
692 for(int i = 0; i < 10; i ++)
693 for(int j = 0; j < 128; j ++)
695 barrier(CLK_LOCAL_MEM_FENCE);
701 for( int xt = 0 ; xt < tileSize_width; xt+=VLEN_F )
703 F4 v_xt = (F4)(xt, xt+1, xt+2, xt+3);
704 F4 p = convert_F4(tmp[xt/VLEN_F]);
705 F4 xp = v_xt * p, xxp = xp * v_xt;
711 x0.s0 += x0.s1 + x0.s2 + x0.s3;
712 x1.s0 += x1.s1 + x1.s2 + x1.s3;
713 x2.s0 += x2.s1 + x2.s2 + x2.s3;
714 x3.s0 += x3.s1 + x3.s2 + x3.s3;
716 F py = lidy * x0.s0, sy = lidy*lidy;
717 int bheight = min(tileSize_height, TILE_SIZE/2);
718 if(bheight >= TILE_SIZE/2&&lidy > bheight-1&&lidy < tileSize_height)
720 m[9][lidy-bheight] = ((F)py) * sy; // m03
721 m[8][lidy-bheight] = ((F)x1.s0) * sy; // m12
722 m[7][lidy-bheight] = ((F)x2.s0) * lidy; // m21
723 m[6][lidy-bheight] = x3.s0; // m30
724 m[5][lidy-bheight] = x0.s0 * sy; // m02
725 m[4][lidy-bheight] = x1.s0 * lidy; // m11
726 m[3][lidy-bheight] = x2.s0; // m20
727 m[2][lidy-bheight] = py; // m01
728 m[1][lidy-bheight] = x1.s0; // m10
729 m[0][lidy-bheight] = x0.s0; // m00
732 else if(lidy < bheight)
734 lm[9] = ((F)py) * sy; // m03
735 lm[8] = ((F)x1.s0) * sy; // m12
736 lm[7] = ((F)x2.s0) * lidy; // m21
737 lm[6] = x3.s0; // m30
738 lm[5] = x0.s0 * sy; // m02
739 lm[4] = x1.s0 * lidy; // m11
740 lm[3] = x2.s0; // m20
742 lm[1] = x1.s0; // m10
743 lm[0] = x0.s0; // m00
745 barrier(CLK_LOCAL_MEM_FENCE);
746 for( int j = TILE_SIZE/2; j >= 1; j = j/2 )
749 for( int i = 0; i < 10; i++ )
750 lm[i] = lm[i] + m[i][lidy];
751 barrier(CLK_LOCAL_MEM_FENCE);
752 if(lidy >= j/2&&lidy < j)
753 for( int i = 0; i < 10; i++ )
754 m[i][lidy-j/2] = lm[i];
755 barrier(CLK_LOCAL_MEM_FENCE);
757 if(lidy == 0&&lidx == 0)
759 for( int mt = 0; mt < 10; mt++ )
764 for( int mt = 0; mt < 10; mt++ )
768 F xm = x * mom[0], ym = y * mom[0];
770 // accumulate moments computed in each tile
771 dst_step /= sizeof(F);
773 int dst_x_off = mad24(wgidy, dst_cols, wgidx);
775 int max_dst_index = 10 * blocky * get_global_size(1);
778 dst_off = mad24(DST_ROW_00 * blocky, dst_step, dst_x_off);
779 if (dst_off < max_dst_index)
780 *(dst_m + dst_off) = mom[0];
782 // + m10 ( = m10' + x*m00' )
783 dst_off = mad24(DST_ROW_10 * blocky, dst_step, dst_x_off);
784 if (dst_off < max_dst_index)
785 *(dst_m + dst_off) = mom[1] + xm;
787 // + m01 ( = m01' + y*m00' )
788 dst_off = mad24(DST_ROW_01 * blocky, dst_step, dst_x_off);
789 if (dst_off < max_dst_index)
790 *(dst_m + dst_off) = mom[2] + ym;
792 // + m20 ( = m20' + 2*x*m10' + x*x*m00' )
793 dst_off = mad24(DST_ROW_20 * blocky, dst_step, dst_x_off);
794 if (dst_off < max_dst_index)
795 *(dst_m + dst_off) = mom[3] + x * (mom[1] * 2 + xm);
797 // + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' )
798 dst_off = mad24(DST_ROW_11 * blocky, dst_step, dst_x_off);
799 if (dst_off < max_dst_index)
800 *(dst_m + dst_off) = mom[4] + x * (mom[2] + ym) + y * mom[1];
802 // + m02 ( = m02' + 2*y*m01' + y*y*m00' )
803 dst_off = mad24(DST_ROW_02 * blocky, dst_step, dst_x_off);
804 if (dst_off < max_dst_index)
805 *(dst_m + dst_off) = mom[5] + y * (mom[2] * 2 + ym);
807 // + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' )
808 dst_off = mad24(DST_ROW_30 * blocky, dst_step, dst_x_off);
809 if (dst_off < max_dst_index)
810 *(dst_m + dst_off) = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm));
812 // + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20')
813 dst_off = mad24(DST_ROW_21 * blocky, dst_step, dst_x_off);
814 if (dst_off < max_dst_index)
815 *(dst_m + dst_off) = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3];
817 // + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02')
818 dst_off = mad24(DST_ROW_12 * blocky, dst_step, dst_x_off);
819 if (dst_off < max_dst_index)
820 *(dst_m + dst_off) = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5];
822 // + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' )
823 dst_off = mad24(DST_ROW_03 * blocky, dst_step, dst_x_off);
824 if (dst_off < max_dst_index)
825 *(dst_m + dst_off) = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym));
829 __kernel void CvMoments_D6(__global F* src_data, int src_rows, int src_cols, int src_step, int tileSize_width, int tileSize_height,
831 int dst_cols, int dst_step, int blocky,
832 int type, int depth, int cn, int coi, int binary, const int TILE_SIZE)
834 F tmp_coi[4]; // get the coi data
836 int VLEN_D = 4; // length of vetor
837 int gidy = get_global_id(0);
838 int gidx = get_global_id(1);
839 int wgidy = get_group_id(0);
840 int wgidx = get_group_id(1);
841 int lidy = get_local_id(0);
842 int lidx = get_local_id(1);
843 int y = wgidy*TILE_SIZE; // real Y index of pixel
844 int x = wgidx*TILE_SIZE; // real X index of pixel
845 int kcn = (cn==2)?2:4;
846 int rstep = min(src_step/8, TILE_SIZE);
847 tileSize_height = min(TILE_SIZE, src_rows - y);
848 tileSize_width = min(TILE_SIZE, src_cols - x);
850 if(tileSize_width < TILE_SIZE)
851 for(int i = tileSize_width; i < rstep; i++ )
852 *((__global F*)src_data+(y+lidy)*src_step/8+x+i) = 0;
854 for(int i=0; i < tileSize_width; i+=VLEN_D)
856 for(int j=0; j<4; j++)
857 tmp_coi[j] = *(src_data+(y+lidy)*src_step/8+(x+i+j)*kcn+coi-1);
858 tmp[i/VLEN_D] = (F4)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3]);
861 for(int i=0; i < tileSize_width; i+=VLEN_D)
862 tmp[i/VLEN_D] = (F4)(*(src_data+(y+lidy)*src_step/8+x+i),*(src_data+(y+lidy)*src_step/8+x+i+1),*(src_data+(y+lidy)*src_step/8+x+i+2),*(src_data+(y+lidy)*src_step/8+x+i+3));
866 for(int i=0; i < tileSize_width; i+=VLEN_D)
867 tmp[i/VLEN_D] = (tmp[i/VLEN_D]!=zero)?full:zero;
869 __local F m[10][128];
871 for(int i=0; i<10; i++)
872 for(int j=0; j<128; j++)
874 barrier(CLK_LOCAL_MEM_FENCE);
880 for( int xt = 0 ; xt < tileSize_width; xt+=VLEN_D )
882 F4 v_xt = (F4)(xt, xt+1, xt+2, xt+3);
883 F4 p = tmp[xt/VLEN_D];
884 F4 xp = v_xt * p, xxp = xp * v_xt;
890 x0.s0 += x0.s1 + x0.s2 + x0.s3;
891 x1.s0 += x1.s1 + x1.s2 + x1.s3;
892 x2.s0 += x2.s1 + x2.s2 + x2.s3;
893 x3.s0 += x3.s1 + x3.s2 + x3.s3;
895 F py = lidy * x0.s0, sy = lidy*lidy;
896 int bheight = min(tileSize_height, TILE_SIZE/2);
897 if(bheight >= TILE_SIZE/2&&lidy > bheight-1&&lidy < tileSize_height)
899 m[9][lidy-bheight] = ((F)py) * sy; // m03
900 m[8][lidy-bheight] = ((F)x1.s0) * sy; // m12
901 m[7][lidy-bheight] = ((F)x2.s0) * lidy; // m21
902 m[6][lidy-bheight] = x3.s0; // m30
903 m[5][lidy-bheight] = x0.s0 * sy; // m02
904 m[4][lidy-bheight] = x1.s0 * lidy; // m11
905 m[3][lidy-bheight] = x2.s0; // m20
906 m[2][lidy-bheight] = py; // m01
907 m[1][lidy-bheight] = x1.s0; // m10
908 m[0][lidy-bheight] = x0.s0; // m00
911 else if(lidy < bheight)
913 lm[9] = ((F)py) * sy; // m03
914 lm[8] = ((F)x1.s0) * sy; // m12
915 lm[7] = ((F)x2.s0) * lidy; // m21
916 lm[6] = x3.s0; // m30
917 lm[5] = x0.s0 * sy; // m02
918 lm[4] = x1.s0 * lidy; // m11
919 lm[3] = x2.s0; // m20
921 lm[1] = x1.s0; // m10
922 lm[0] = x0.s0; // m00
924 barrier(CLK_LOCAL_MEM_FENCE);
925 for( int j = TILE_SIZE/2; j >= 1; j = j/2 )
928 for( int i = 0; i < 10; i++ )
929 lm[i] = lm[i] + m[i][lidy];
930 barrier(CLK_LOCAL_MEM_FENCE);
931 if(lidy >= j/2&&lidy < j)
932 for( int i = 0; i < 10; i++ )
933 m[i][lidy-j/2] = lm[i];
934 barrier(CLK_LOCAL_MEM_FENCE);
936 if(lidy == 0&&lidx == 0)
938 for( int mt = 0; mt < 10; mt++ )
943 for( int mt = 0; mt < 10; mt++ )
947 F xm = x * mom[0], ym = y * mom[0];
949 // accumulate moments computed in each tile
950 dst_step /= sizeof(F);
953 *(dst_m + mad24(DST_ROW_00 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[0];
955 // + m10 ( = m10' + x*m00' )
956 *(dst_m + mad24(DST_ROW_10 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[1] + xm;
958 // + m01 ( = m01' + y*m00' )
959 *(dst_m + mad24(DST_ROW_01 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[2] + ym;
961 // + m20 ( = m20' + 2*x*m10' + x*x*m00' )
962 *(dst_m + mad24(DST_ROW_20 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[3] + x * (mom[1] * 2 + xm);
964 // + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' )
965 *(dst_m + mad24(DST_ROW_11 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[4] + x * (mom[2] + ym) + y * mom[1];
967 // + m02 ( = m02' + 2*y*m01' + y*y*m00' )
968 *(dst_m + mad24(DST_ROW_02 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[5] + y * (mom[2] * 2 + ym);
970 // + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' )
971 *(dst_m + mad24(DST_ROW_30 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm));
973 // + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20')
974 *(dst_m + mad24(DST_ROW_21 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3];
976 // + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02')
977 *(dst_m + mad24(DST_ROW_12 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5];
979 // + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' )
980 *(dst_m + mad24(DST_ROW_03 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym));