1 /*M///////////////////////////////////////////////////////////////////////////////////////
3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
5 // By downloading, copying, installing or using the software you agree to this license.
6 // If you do not agree to this license, do not download, install,
7 // copy or use the software.
11 // For Open Source Computer Vision Library
13 // Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
14 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
15 // Third party copyrights are property of their respective owners.
18 // Sen Liu, swjtuls1987@126.com
20 // Redistribution and use in source and binary forms, with or without modification,
21 // are permitted provided that the following conditions are met:
23 // * Redistribution's of source code must retain the above copyright notice,
24 // this list of conditions and the following disclaimer.
26 // * Redistribution's in binary form must reproduce the above copyright notice,
27 // this list of conditions and the following disclaimer in the documentation
28 // and/or other oclMaterials provided with the distribution.
30 // * The name of the copyright holders may not be used to endorse or promote products
31 // derived from this software without specific prior written permission.
33 // This software is provided by the copyright holders and contributors as is and
34 // any express or implied warranties, including, but not limited to, the implied
35 // warranties of merchantability and fitness for a particular purpose are disclaimed.
36 // In no event shall the Intel Corporation or contributors be liable for any direct,
37 // indirect, incidental, special, exemplary, or consequential damages
38 // (including, but not limited to, procurement of substitute goods or services;
39 // loss of use, data, or profits; or business interruption) however caused
40 // and on any theory of liability, whether in contract, strict liability,
41 // or tort (including negligence or otherwise) arising in any way out of
42 // the use of this software, even if advised of the possibility of such damage.
46 #if defined (DOUBLE_SUPPORT)
49 #pragma OPENCL EXTENSION cl_khr_fp64:enable
50 #elif defined (cl_amd_fp64)
51 #pragma OPENCL EXTENSION cl_amd_fp64:enable
56 #define convert_F4 convert_double4
62 #define convert_F4 convert_float4
76 __kernel void icvContourMoments(int contour_total,
77 __global float* reader_oclmat_data,
81 T xi_1, yi_1, xi_12, yi_12, xi, yi, xi2, yi2, dxy, xii_1, yii_1;
82 int idx = get_global_id(0);
84 if (idx < 0 || idx >= contour_total)
87 xi_1 = (T)(*(reader_oclmat_data + (get_global_id(0) << 1)));
88 yi_1 = (T)(*(reader_oclmat_data + (get_global_id(0) << 1) + 1));
92 if(idx == contour_total - 1)
94 xi = (T)(*(reader_oclmat_data));
95 yi = (T)(*(reader_oclmat_data + 1));
99 xi = (T)(*(reader_oclmat_data + (idx + 1) * 2));
100 yi = (T)(*(reader_oclmat_data + (idx + 1) * 2 + 1));
105 dxy = xi_1 * yi - xi * yi_1;
109 dst_step /= sizeof(T);
110 *( dst_a + DST_ROW_00 * dst_step + idx) = dxy;
111 *( dst_a + DST_ROW_10 * dst_step + idx) = dxy * xii_1;
112 *( dst_a + DST_ROW_01 * dst_step + idx) = dxy * yii_1;
113 *( dst_a + DST_ROW_20 * dst_step + idx) = dxy * (xi_1 * xii_1 + xi2);
114 *( dst_a + DST_ROW_11 * dst_step + idx) = dxy * (xi_1 * (yii_1 + yi_1) + xi * (yii_1 + yi));
115 *( dst_a + DST_ROW_02 * dst_step + idx) = dxy * (yi_1 * yii_1 + yi2);
116 *( dst_a + DST_ROW_30 * dst_step + idx) = dxy * xii_1 * (xi_12 + xi2);
117 *( dst_a + DST_ROW_03 * dst_step + idx) = dxy * yii_1 * (yi_12 + yi2);
118 *( dst_a + DST_ROW_21 * dst_step + idx) =
119 dxy * (xi_12 * (3 * yi_1 + yi) + 2 * xi * xi_1 * yii_1 +
120 xi2 * (yi_1 + 3 * yi));
121 *( dst_a + DST_ROW_12 * dst_step + idx) =
122 dxy * (yi_12 * (3 * xi_1 + xi) + 2 * yi * yi_1 * xii_1 +
123 yi2 * (xi_1 + 3 * xi));
126 __kernel void dst_sum(int src_rows, int src_cols, int tile_height, int tile_width, int TILE_SIZE,
127 __global F* sum, __global F* dst_m, int dst_step)
129 int gidy = get_global_id(0);
130 int gidx = get_global_id(1);
131 int block_y = src_rows/tile_height;
132 int block_x = src_cols/tile_width;
135 if(src_rows > TILE_SIZE && src_rows % TILE_SIZE != 0)
137 if(src_cols > TILE_SIZE && src_cols % TILE_SIZE != 0)
139 block_num = block_y * block_x;
140 __local F dst_sum[10][128];
141 if(gidy<128-block_num)
142 for(int i=0; i<10; i++)
143 dst_sum[i][gidy+block_num]=0;
144 barrier(CLK_LOCAL_MEM_FENCE);
146 dst_step /= sizeof(F);
149 dst_sum[0][gidy] = *(dst_m + mad24(DST_ROW_00 * block_y, dst_step, gidy));
150 dst_sum[1][gidy] = *(dst_m + mad24(DST_ROW_10 * block_y, dst_step, gidy));
151 dst_sum[2][gidy] = *(dst_m + mad24(DST_ROW_01 * block_y, dst_step, gidy));
152 dst_sum[3][gidy] = *(dst_m + mad24(DST_ROW_20 * block_y, dst_step, gidy));
153 dst_sum[4][gidy] = *(dst_m + mad24(DST_ROW_11 * block_y, dst_step, gidy));
154 dst_sum[5][gidy] = *(dst_m + mad24(DST_ROW_02 * block_y, dst_step, gidy));
155 dst_sum[6][gidy] = *(dst_m + mad24(DST_ROW_30 * block_y, dst_step, gidy));
156 dst_sum[7][gidy] = *(dst_m + mad24(DST_ROW_21 * block_y, dst_step, gidy));
157 dst_sum[8][gidy] = *(dst_m + mad24(DST_ROW_12 * block_y, dst_step, gidy));
158 dst_sum[9][gidy] = *(dst_m + mad24(DST_ROW_03 * block_y, dst_step, gidy));
160 barrier(CLK_LOCAL_MEM_FENCE);
161 for(int lsize=64; lsize>0; lsize>>=1)
165 int lsize2 = gidy + lsize;
166 for(int i=0; i<10; i++)
167 dst_sum[i][gidy] += dst_sum[i][lsize2];
169 barrier(CLK_LOCAL_MEM_FENCE);
172 for(int i=0; i<10; i++)
173 sum[i] = dst_sum[i][0];
176 __kernel void CvMoments_D0(__global uchar16* src_data, int src_rows, int src_cols, int src_step,
178 int dst_cols, int dst_step, int blocky,
179 int depth, int cn, int coi, int binary, int TILE_SIZE)
181 uchar tmp_coi[16]; // get the coi data
183 int VLEN_C = 16; // vector length of uchar
185 int gidy = get_global_id(0);
186 int gidx = get_global_id(1);
187 int wgidy = get_group_id(0);
188 int wgidx = get_group_id(1);
189 int lidy = get_local_id(0);
190 int lidx = get_local_id(1);
191 int y = wgidy*TILE_SIZE; // vector length of uchar
192 int x = wgidx*TILE_SIZE; // vector length of uchar
193 int kcn = (cn==2)?2:4;
194 int rstep = min(src_step, TILE_SIZE);
195 int tileSize_height = min(TILE_SIZE, src_rows - y);
196 int tileSize_width = min(TILE_SIZE, src_cols - x);
198 if ( y+lidy < src_rows )
200 if( tileSize_width < TILE_SIZE )
201 for(int i = tileSize_width; i < rstep && (x+i) < src_cols; i++ )
202 *((__global uchar*)src_data+(y+lidy)*src_step+x+i) = 0;
204 if( coi > 0 ) //channel of interest
205 for(int i = 0; i < tileSize_width; i += VLEN_C)
207 for(int j=0; j<VLEN_C; j++)
208 tmp_coi[j] = *((__global uchar*)src_data+(y+lidy)*src_step+(x+i+j)*kcn+coi-1);
209 tmp[i/VLEN_C] = (uchar16)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3],tmp_coi[4],tmp_coi[5],tmp_coi[6],tmp_coi[7],
210 tmp_coi[8],tmp_coi[9],tmp_coi[10],tmp_coi[11],tmp_coi[12],tmp_coi[13],tmp_coi[14],tmp_coi[15]);
213 for(int i=0; i < tileSize_width; i+=VLEN_C)
214 tmp[i/VLEN_C] = *(src_data+(y+lidy)*src_step/VLEN_C+(x+i)/VLEN_C);
217 uchar16 zero = (uchar16)(0);
218 uchar16 full = (uchar16)(255);
220 for(int i=0; i < tileSize_width; i+=VLEN_C)
221 tmp[i/VLEN_C] = (tmp[i/VLEN_C]!=zero)?full:zero;
224 __local int m[10][128];
227 for(int i=0; i<10; i++)
230 barrier(CLK_LOCAL_MEM_FENCE);
233 int16 x0 = (int16)(0);
234 int16 x1 = (int16)(0);
235 int16 x2 = (int16)(0);
236 int16 x3 = (int16)(0);
237 for( int xt = 0 ; xt < tileSize_width; xt+=(VLEN_C) )
239 int16 v_xt = (int16)(xt, xt+1, xt+2, xt+3, xt+4, xt+5, xt+6, xt+7, xt+8, xt+9, xt+10, xt+11, xt+12, xt+13, xt+14, xt+15);
240 int16 p = convert_int16(tmp[xt/VLEN_C]);
241 int16 xp = v_xt * p, xxp = xp *v_xt;
247 x0.s0 += x0.s1 + x0.s2 + x0.s3 + x0.s4 + x0.s5 + x0.s6 + x0.s7 + x0.s8 + x0.s9 + x0.sa + x0.sb + x0.sc + x0.sd + x0.se + x0.sf;
248 x1.s0 += x1.s1 + x1.s2 + x1.s3 + x1.s4 + x1.s5 + x1.s6 + x1.s7 + x1.s8 + x1.s9 + x1.sa + x1.sb + x1.sc + x1.sd + x1.se + x1.sf;
249 x2.s0 += x2.s1 + x2.s2 + x2.s3 + x2.s4 + x2.s5 + x2.s6 + x2.s7 + x2.s8 + x2.s9 + x2.sa + x2.sb + x2.sc + x2.sd + x2.se + x2.sf;
250 x3.s0 += x3.s1 + x3.s2 + x3.s3 + x3.s4 + x3.s5 + x3.s6 + x3.s7 + x3.s8 + x3.s9 + x3.sa + x3.sb + x3.sc + x3.sd + x3.se + x3.sf;
251 int py = lidy * ((int)x0.s0);
253 int bheight = min(tileSize_height, TILE_SIZE/2);
254 if(bheight >= TILE_SIZE/2&&lidy > bheight-1&&lidy < tileSize_height)
256 m[9][lidy-bheight] = ((int)py) * sy; // m03
257 m[8][lidy-bheight] = ((int)x1.s0) * sy; // m12
258 m[7][lidy-bheight] = ((int)x2.s0) * lidy; // m21
259 m[6][lidy-bheight] = x3.s0; // m30
260 m[5][lidy-bheight] = x0.s0 * sy; // m02
261 m[4][lidy-bheight] = x1.s0 * lidy; // m11
262 m[3][lidy-bheight] = x2.s0; // m20
263 m[2][lidy-bheight] = py; // m01
264 m[1][lidy-bheight] = x1.s0; // m10
265 m[0][lidy-bheight] = x0.s0; // m00
267 else if(lidy < bheight)
269 lm[9] = ((int)py) * sy; // m03
270 lm[8] = ((int)x1.s0) * sy; // m12
271 lm[7] = ((int)x2.s0) * lidy; // m21
272 lm[6] = x3.s0; // m30
273 lm[5] = x0.s0 * sy; // m02
274 lm[4] = x1.s0 * lidy; // m11
275 lm[3] = x2.s0; // m20
277 lm[1] = x1.s0; // m10
278 lm[0] = x0.s0; // m00
280 barrier(CLK_LOCAL_MEM_FENCE);
281 for( int j = bheight; j >= 1; j = j/2 )
284 for( int i = 0; i < 10; i++ )
285 lm[i] = lm[i] + m[i][lidy];
286 barrier(CLK_LOCAL_MEM_FENCE);
287 if(lidy >= j/2&&lidy < j)
288 for( int i = 0; i < 10; i++ )
289 m[i][lidy-j/2] = lm[i];
290 barrier(CLK_LOCAL_MEM_FENCE);
293 if(lidy == 0&&lidx == 0)
295 for( int mt = 0; mt < 10; mt++ )
300 for( int mt = 0; mt < 10; mt++ )
303 F xm = x * mom[0], ym = y * mom[0];
305 // accumulate moments computed in each tile
306 dst_step /= sizeof(F);
309 *(dst_m + mad24(DST_ROW_00 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[0];
311 // + m10 ( = m10' + x*m00' )
312 *(dst_m + mad24(DST_ROW_10 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[1] + xm;
314 // + m01 ( = m01' + y*m00' )
315 *(dst_m + mad24(DST_ROW_01 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[2] + ym;
317 // + m20 ( = m20' + 2*x*m10' + x*x*m00' )
318 *(dst_m + mad24(DST_ROW_20 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[3] + x * (mom[1] * 2 + xm);
320 // + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' )
321 *(dst_m + mad24(DST_ROW_11 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[4] + x * (mom[2] + ym) + y * mom[1];
323 // + m02 ( = m02' + 2*y*m01' + y*y*m00' )
324 *(dst_m + mad24(DST_ROW_02 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[5] + y * (mom[2] * 2 + ym);
326 // + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' )
327 *(dst_m + mad24(DST_ROW_30 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm));
329 // + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20')
330 *(dst_m + mad24(DST_ROW_21 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3];
332 // + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02')
333 *(dst_m + mad24(DST_ROW_12 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5];
335 // + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' )
336 *(dst_m + mad24(DST_ROW_03 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym));
340 __kernel void CvMoments_D2(__global ushort8* src_data, int src_rows, int src_cols, int src_step,
342 int dst_cols, int dst_step, int blocky,
343 int depth, int cn, int coi, int binary, const int TILE_SIZE)
345 ushort tmp_coi[8]; // get the coi data
347 int VLEN_US = 8; // vector length of ushort
348 int gidy = get_global_id(0);
349 int gidx = get_global_id(1);
350 int wgidy = get_group_id(0);
351 int wgidx = get_group_id(1);
352 int lidy = get_local_id(0);
353 int lidx = get_local_id(1);
354 int y = wgidy*TILE_SIZE; // real Y index of pixel
355 int x = wgidx*TILE_SIZE; // real X index of pixel
356 int kcn = (cn==2)?2:4;
357 int rstep = min(src_step/2, TILE_SIZE);
358 int tileSize_height = min(TILE_SIZE, src_rows - y);
359 int tileSize_width = min(TILE_SIZE, src_cols -x);
361 if ( y+lidy < src_rows )
363 if(src_cols > TILE_SIZE && tileSize_width < TILE_SIZE)
364 for(int i=tileSize_width; i < rstep && (x+i) < src_cols; i++ )
365 *((__global ushort*)src_data+(y+lidy)*src_step/2+x+i) = 0;
367 for(int i=0; i < tileSize_width; i+=VLEN_US)
369 for(int j=0; j<VLEN_US; j++)
370 tmp_coi[j] = *((__global ushort*)src_data+(y+lidy)*(int)src_step/2+(x+i+j)*kcn+coi-1);
371 tmp[i/VLEN_US] = (ushort8)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3],tmp_coi[4],tmp_coi[5],tmp_coi[6],tmp_coi[7]);
374 for(int i=0; i < tileSize_width; i+=VLEN_US)
375 tmp[i/VLEN_US] = *(src_data+(y+lidy)*src_step/(2*VLEN_US)+(x+i)/VLEN_US);
378 ushort8 zero = (ushort8)(0);
379 ushort8 full = (ushort8)(255);
381 for(int i=0; i < tileSize_width; i+=VLEN_US)
382 tmp[i/VLEN_US] = (tmp[i/VLEN_US]!=zero)?full:zero;
384 __local long m[10][128];
386 for(int i=0; i<10; i++)
388 barrier(CLK_LOCAL_MEM_FENCE);
394 long8 x3 = (long8)(0);
395 for( int xt = 0 ; xt < tileSize_width; xt+=(VLEN_US) )
397 int8 v_xt = (int8)(xt, xt+1, xt+2, xt+3, xt+4, xt+5, xt+6, xt+7);
398 int8 p = convert_int8(tmp[xt/VLEN_US]);
399 int8 xp = v_xt * p, xxp = xp * v_xt;
403 x3 += convert_long8(xxp) *convert_long8(v_xt);
405 x0.s0 += x0.s1 + x0.s2 + x0.s3 + x0.s4 + x0.s5 + x0.s6 + x0.s7;
406 x1.s0 += x1.s1 + x1.s2 + x1.s3 + x1.s4 + x1.s5 + x1.s6 + x1.s7;
407 x2.s0 += x2.s1 + x2.s2 + x2.s3 + x2.s4 + x2.s5 + x2.s6 + x2.s7;
408 x3.s0 += x3.s1 + x3.s2 + x3.s3 + x3.s4 + x3.s5 + x3.s6 + x3.s7;
410 int py = lidy * x0.s0, sy = lidy*lidy;
411 int bheight = min(tileSize_height, TILE_SIZE/2);
412 if(bheight >= TILE_SIZE/2&&lidy > bheight-1&&lidy < tileSize_height)
414 m[9][lidy-bheight] = ((long)py) * sy; // m03
415 m[8][lidy-bheight] = ((long)x1.s0) * sy; // m12
416 m[7][lidy-bheight] = ((long)x2.s0) * lidy; // m21
417 m[6][lidy-bheight] = x3.s0; // m30
418 m[5][lidy-bheight] = x0.s0 * sy; // m02
419 m[4][lidy-bheight] = x1.s0 * lidy; // m11
420 m[3][lidy-bheight] = x2.s0; // m20
421 m[2][lidy-bheight] = py; // m01
422 m[1][lidy-bheight] = x1.s0; // m10
423 m[0][lidy-bheight] = x0.s0; // m00
425 else if(lidy < bheight)
427 lm[9] = ((long)py) * sy; // m03
428 lm[8] = ((long)x1.s0) * sy; // m12
429 lm[7] = ((long)x2.s0) * lidy; // m21
430 lm[6] = x3.s0; // m30
431 lm[5] = x0.s0 * sy; // m02
432 lm[4] = x1.s0 * lidy; // m11
433 lm[3] = x2.s0; // m20
435 lm[1] = x1.s0; // m10
436 lm[0] = x0.s0; // m00
438 barrier(CLK_LOCAL_MEM_FENCE);
440 for( int j = TILE_SIZE/2; j >= 1; j = j/2 )
443 for( int i = 0; i < 10; i++ )
444 lm[i] = lm[i] + m[i][lidy];
446 barrier(CLK_LOCAL_MEM_FENCE);
447 for( int j = TILE_SIZE/2; j >= 1; j = j/2 )
449 if(lidy >= j/2&&lidy < j)
450 for( int i = 0; i < 10; i++ )
451 m[i][lidy-j/2] = lm[i];
453 barrier(CLK_LOCAL_MEM_FENCE);
455 if(lidy == 0&&lidx == 0)
457 for(int mt = 0; mt < 10; mt++ )
463 for( int mt = 0; mt < 10; mt++ )
467 F xm = x *mom[0], ym = y * mom[0];
469 // accumulate moments computed in each tile
470 dst_step /= sizeof(F);
473 *(dst_m + mad24(DST_ROW_00 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[0];
475 // + m10 ( = m10' + x*m00' )
476 *(dst_m + mad24(DST_ROW_10 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[1] + xm;
478 // + m01 ( = m01' + y*m00' )
479 *(dst_m + mad24(DST_ROW_01 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[2] + ym;
481 // + m20 ( = m20' + 2*x*m10' + x*x*m00' )
482 *(dst_m + mad24(DST_ROW_20 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[3] + x * (mom[1] * 2 + xm);
484 // + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' )
485 *(dst_m + mad24(DST_ROW_11 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[4] + x * (mom[2] + ym) + y * mom[1];
487 // + m02 ( = m02' + 2*y*m01' + y*y*m00' )
488 *(dst_m + mad24(DST_ROW_02 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[5] + y * (mom[2] * 2 + ym);
490 // + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' )
491 *(dst_m + mad24(DST_ROW_30 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm));
493 // + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20')
494 *(dst_m + mad24(DST_ROW_21 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3];
496 // + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02')
497 *(dst_m + mad24(DST_ROW_12 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5];
499 // + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' )
500 *(dst_m + mad24(DST_ROW_03 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym));
504 __kernel void CvMoments_D3(__global short8* src_data, int src_rows, int src_cols, int src_step,
506 int dst_cols, int dst_step, int blocky,
507 int depth, int cn, int coi, int binary, const int TILE_SIZE)
509 short tmp_coi[8]; // get the coi data
511 int VLEN_S =8; // vector length of short
512 int gidy = get_global_id(0);
513 int gidx = get_global_id(1);
514 int wgidy = get_group_id(0);
515 int wgidx = get_group_id(1);
516 int lidy = get_local_id(0);
517 int lidx = get_local_id(1);
518 int y = wgidy*TILE_SIZE; // real Y index of pixel
519 int x = wgidx*TILE_SIZE; // real X index of pixel
520 int kcn = (cn==2)?2:4;
521 int rstep = min(src_step/2, TILE_SIZE);
522 int tileSize_height = min(TILE_SIZE, src_rows - y);
523 int tileSize_width = min(TILE_SIZE, src_cols -x);
525 if ( y+lidy < src_rows )
527 if(tileSize_width < TILE_SIZE)
528 for(int i = tileSize_width; i < rstep && (x+i) < src_cols; i++ )
529 *((__global short*)src_data+(y+lidy)*src_step/2+x+i) = 0;
531 for(int i=0; i < tileSize_width; i+=VLEN_S)
533 for(int j=0; j<VLEN_S; j++)
534 tmp_coi[j] = *((__global short*)src_data+(y+lidy)*src_step/2+(x+i+j)*kcn+coi-1);
535 tmp[i/VLEN_S] = (short8)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3],tmp_coi[4],tmp_coi[5],tmp_coi[6],tmp_coi[7]);
538 for(int i=0; i < tileSize_width; i+=VLEN_S)
539 tmp[i/VLEN_S] = *(src_data+(y+lidy)*src_step/(2*VLEN_S)+(x+i)/VLEN_S);
542 short8 zero = (short8)(0);
543 short8 full = (short8)(255);
545 for(int i=0; i < tileSize_width; i+=(VLEN_S))
546 tmp[i/VLEN_S] = (tmp[i/VLEN_S]!=zero)?full:zero;
549 __local long m[10][128];
551 for(int i=0; i<10; i++)
553 barrier(CLK_LOCAL_MEM_FENCE);
558 long8 x3 = (long8)(0);
559 for( int xt = 0 ; xt < tileSize_width; xt+= (VLEN_S))
561 int8 v_xt = (int8)(xt, xt+1, xt+2, xt+3, xt+4, xt+5, xt+6, xt+7);
562 int8 p = convert_int8(tmp[xt/VLEN_S]);
563 int8 xp = v_xt * p, xxp = xp * v_xt;
567 x3 += convert_long8(xxp) * convert_long8(v_xt);
569 x0.s0 += x0.s1 + x0.s2 + x0.s3 + x0.s4 + x0.s5 + x0.s6 + x0.s7;
570 x1.s0 += x1.s1 + x1.s2 + x1.s3 + x1.s4 + x1.s5 + x1.s6 + x1.s7;
571 x2.s0 += x2.s1 + x2.s2 + x2.s3 + x2.s4 + x2.s5 + x2.s6 + x2.s7;
572 x3.s0 += x3.s1 + x3.s2 + x3.s3 + x3.s4 + x3.s5 + x3.s6 + x3.s7;
574 int py = lidy * x0.s0, sy = lidy*lidy;
575 int bheight = min(tileSize_height, TILE_SIZE/2);
576 if(bheight >= TILE_SIZE/2&&lidy > bheight-1&&lidy < tileSize_height)
578 m[9][lidy-bheight] = ((long)py) * sy; // m03
579 m[8][lidy-bheight] = ((long)x1.s0) * sy; // m12
580 m[7][lidy-bheight] = ((long)x2.s0) * lidy; // m21
581 m[6][lidy-bheight] = x3.s0; // m30
582 m[5][lidy-bheight] = x0.s0 * sy; // m02
583 m[4][lidy-bheight] = x1.s0 * lidy; // m11
584 m[3][lidy-bheight] = x2.s0; // m20
585 m[2][lidy-bheight] = py; // m01
586 m[1][lidy-bheight] = x1.s0; // m10
587 m[0][lidy-bheight] = x0.s0; // m00
589 else if(lidy < bheight)
591 lm[9] = ((long)py) * sy; // m03
592 lm[8] = ((long)(x1.s0)) * sy; // m12
593 lm[7] = ((long)(x2.s0)) * lidy; // m21
594 lm[6] = x3.s0; // m30
595 lm[5] = x0.s0 * sy; // m02
596 lm[4] = x1.s0 * lidy; // m11
597 lm[3] = x2.s0; // m20
599 lm[1] = x1.s0; // m10
600 lm[0] = x0.s0; // m00
602 barrier(CLK_LOCAL_MEM_FENCE);
603 for( int j = TILE_SIZE/2; j >=1; j = j/2 )
606 for( int i = 0; i < 10; i++ )
607 lm[i] = lm[i] + m[i][lidy];
608 barrier(CLK_LOCAL_MEM_FENCE);
609 if(lidy >= j/2&&lidy < j)
610 for( int i = 0; i < 10; i++ )
611 m[i][lidy-j/2] = lm[i];
612 barrier(CLK_LOCAL_MEM_FENCE);
614 if(lidy ==0 &&lidx ==0)
616 for(int mt = 0; mt < 10; mt++ )
622 for( int mt = 0; mt < 10; mt++ )
626 F xm = x * mom[0], ym = y*mom[0];
628 // accumulate moments computed in each tile
629 dst_step /= sizeof(F);
632 *(dst_m + mad24(DST_ROW_00 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[0];
634 // + m10 ( = m10' + x*m00' )
635 *(dst_m + mad24(DST_ROW_10 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[1] + xm;
637 // + m01 ( = m01' + y*m00' )
638 *(dst_m + mad24(DST_ROW_01 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[2] + ym;
640 // + m20 ( = m20' + 2*x*m10' + x*x*m00' )
641 *(dst_m + mad24(DST_ROW_20 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[3] + x * (mom[1] * 2 + xm);
643 // + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' )
644 *(dst_m + mad24(DST_ROW_11 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[4] + x * (mom[2] + ym) + y * mom[1];
646 // + m02 ( = m02' + 2*y*m01' + y*y*m00' )
647 *(dst_m + mad24(DST_ROW_02 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[5] + y * (mom[2] * 2 + ym);
649 // + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' )
650 *(dst_m + mad24(DST_ROW_30 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm));
652 // + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20')
653 *(dst_m + mad24(DST_ROW_21 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3];
655 // + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02')
656 *(dst_m + mad24(DST_ROW_12 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5];
658 // + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' )
659 *(dst_m + mad24(DST_ROW_03 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym));
663 __kernel void CvMoments_D5( __global float* src_data, int src_rows, int src_cols, int src_step,
665 int dst_cols, int dst_step, int blocky,
666 int depth, int cn, int coi, int binary, const int TILE_SIZE)
668 float tmp_coi[4]; // get the coi data
670 int VLEN_F = 4; // vector length of float
671 int gidy = get_global_id(0);
672 int gidx = get_global_id(1);
673 int wgidy = get_group_id(0);
674 int wgidx = get_group_id(1);
675 int lidy = get_local_id(0);
676 int lidx = get_local_id(1);
677 int y = wgidy*TILE_SIZE; // real Y index of pixel
678 int x = wgidx*TILE_SIZE; // real X index of pixel
679 int kcn = (cn==2)?2:4;
680 int rstep = min(src_step/4, TILE_SIZE);
681 int tileSize_height = min(TILE_SIZE, src_rows - y);
682 int tileSize_width = min(TILE_SIZE, src_cols -x);
683 int maxIdx = mul24(src_rows, src_cols);
684 int yOff = (y+lidy)*src_step;
687 if ( y+lidy < src_rows )
689 if(tileSize_width < TILE_SIZE)
690 for(int i = tileSize_width; i < rstep && (x+i) < src_cols; i++ )
691 *((__global float*)src_data+(y+lidy)*src_step/4+x+i) = 0;
693 for(int i=0; i < tileSize_width; i+=VLEN_F)
695 for(int j=0; j<4; j++)
696 tmp_coi[j] = *(src_data+(y+lidy)*src_step/4+(x+i+j)*kcn+coi-1);
697 tmp[i/VLEN_F] = (float4)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3]);
700 for(int i=0; i < tileSize_width; i+=VLEN_F)
701 tmp[i/VLEN_F] = (float4)(*(src_data+(y+lidy)*src_step/4+x+i),*(src_data+(y+lidy)*src_step/4+x+i+1),*(src_data+(y+lidy)*src_step/4+x+i+2),*(src_data+(y+lidy)*src_step/4+x+i+3));
704 float4 zero = (float4)(0);
705 float4 full = (float4)(255);
707 for(int i=0; i < tileSize_width; i+=4)
708 tmp[i/VLEN_F] = (tmp[i/VLEN_F]!=zero)?full:zero;
710 __local F m[10][128];
712 for(int i = 0; i < 10; i ++)
714 barrier(CLK_LOCAL_MEM_FENCE);
720 for( int xt = 0 ; xt < tileSize_width; xt+=VLEN_F )
722 F4 v_xt = (F4)(xt, xt+1, xt+2, xt+3);
723 F4 p = convert_F4(tmp[xt/VLEN_F]);
724 F4 xp = v_xt * p, xxp = xp * v_xt;
730 x0.s0 += x0.s1 + x0.s2 + x0.s3;
731 x1.s0 += x1.s1 + x1.s2 + x1.s3;
732 x2.s0 += x2.s1 + x2.s2 + x2.s3;
733 x3.s0 += x3.s1 + x3.s2 + x3.s3;
735 F py = lidy * x0.s0, sy = lidy*lidy;
736 int bheight = min(tileSize_height, TILE_SIZE/2);
737 if(bheight >= TILE_SIZE/2&&lidy > bheight-1&&lidy < tileSize_height)
739 m[9][lidy-bheight] = ((F)py) * sy; // m03
740 m[8][lidy-bheight] = ((F)x1.s0) * sy; // m12
741 m[7][lidy-bheight] = ((F)x2.s0) * lidy; // m21
742 m[6][lidy-bheight] = x3.s0; // m30
743 m[5][lidy-bheight] = x0.s0 * sy; // m02
744 m[4][lidy-bheight] = x1.s0 * lidy; // m11
745 m[3][lidy-bheight] = x2.s0; // m20
746 m[2][lidy-bheight] = py; // m01
747 m[1][lidy-bheight] = x1.s0; // m10
748 m[0][lidy-bheight] = x0.s0; // m00
751 else if(lidy < bheight)
753 lm[9] = ((F)py) * sy; // m03
754 lm[8] = ((F)x1.s0) * sy; // m12
755 lm[7] = ((F)x2.s0) * lidy; // m21
756 lm[6] = x3.s0; // m30
757 lm[5] = x0.s0 * sy; // m02
758 lm[4] = x1.s0 * lidy; // m11
759 lm[3] = x2.s0; // m20
761 lm[1] = x1.s0; // m10
762 lm[0] = x0.s0; // m00
764 barrier(CLK_LOCAL_MEM_FENCE);
765 for( int j = TILE_SIZE/2; j >= 1; j = j/2 )
768 for( int i = 0; i < 10; i++ )
769 lm[i] = lm[i] + m[i][lidy];
770 barrier(CLK_LOCAL_MEM_FENCE);
771 if(lidy >= j/2&&lidy < j)
772 for( int i = 0; i < 10; i++ )
773 m[i][lidy-j/2] = lm[i];
774 barrier(CLK_LOCAL_MEM_FENCE);
776 if(lidy == 0&&lidx == 0)
778 for( int mt = 0; mt < 10; mt++ )
783 for( int mt = 0; mt < 10; mt++ )
787 F xm = x * mom[0], ym = y * mom[0];
789 // accumulate moments computed in each tile
790 dst_step /= sizeof(F);
793 *(dst_m + mad24(DST_ROW_00 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[0];
795 // + m10 ( = m10' + x*m00' )
796 *(dst_m + mad24(DST_ROW_10 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[1] + xm;
798 // + m01 ( = m01' + y*m00' )
799 *(dst_m + mad24(DST_ROW_01 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[2] + ym;
801 // + m20 ( = m20' + 2*x*m10' + x*x*m00' )
802 *(dst_m + mad24(DST_ROW_20 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[3] + x * (mom[1] * 2 + xm);
804 // + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' )
805 *(dst_m + mad24(DST_ROW_11 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[4] + x * (mom[2] + ym) + y * mom[1];
807 // + m02 ( = m02' + 2*y*m01' + y*y*m00' )
808 *(dst_m + mad24(DST_ROW_02 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[5] + y * (mom[2] * 2 + ym);
810 // + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' )
811 *(dst_m + mad24(DST_ROW_30 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm));
813 // + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20')
814 *(dst_m + mad24(DST_ROW_21 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3];
816 // + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02')
817 *(dst_m + mad24(DST_ROW_12 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5];
819 // + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' )
820 *(dst_m + mad24(DST_ROW_03 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym));
824 __kernel void CvMoments_D6(__global F* src_data, int src_rows, int src_cols, int src_step,
826 int dst_cols, int dst_step, int blocky,
827 int depth, int cn, int coi, int binary, const int TILE_SIZE)
829 F tmp_coi[4]; // get the coi data
831 int VLEN_D = 4; // length of vetor
832 int gidy = get_global_id(0);
833 int gidx = get_global_id(1);
834 int wgidy = get_group_id(0);
835 int wgidx = get_group_id(1);
836 int lidy = get_local_id(0);
837 int lidx = get_local_id(1);
838 int y = wgidy*TILE_SIZE; // real Y index of pixel
839 int x = wgidx*TILE_SIZE; // real X index of pixel
840 int kcn = (cn==2)?2:4;
841 int rstep = min(src_step/8, TILE_SIZE);
842 int tileSize_height = min(TILE_SIZE, src_rows - y);
843 int tileSize_width = min(TILE_SIZE, src_cols - x);
845 if ( y+lidy < src_rows )
847 if(tileSize_width < TILE_SIZE)
848 for(int i = tileSize_width; i < rstep && (x+i) < src_cols; i++ )
849 *((__global F*)src_data+(y+lidy)*src_step/8+x+i) = 0;
851 for(int i=0; i < tileSize_width; i+=VLEN_D)
853 for(int j=0; j<4 && ((x+i+j)*kcn+coi-1)<src_cols; j++)
854 tmp_coi[j] = *(src_data+(y+lidy)*src_step/8+(x+i+j)*kcn+coi-1);
855 tmp[i/VLEN_D] = (F4)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3]);
858 for(int i=0; i < tileSize_width && (x+i+3) < src_cols; i+=VLEN_D)
859 tmp[i/VLEN_D] = (F4)(*(src_data+(y+lidy)*src_step/8+x+i),*(src_data+(y+lidy)*src_step/8+x+i+1),*(src_data+(y+lidy)*src_step/8+x+i+2),*(src_data+(y+lidy)*src_step/8+x+i+3));
865 for(int i=0; i < tileSize_width; i+=VLEN_D)
866 tmp[i/VLEN_D] = (tmp[i/VLEN_D]!=zero)?full:zero;
868 __local F m[10][128];
870 for(int i=0; i<10; i++)
872 barrier(CLK_LOCAL_MEM_FENCE);
878 for( int xt = 0 ; xt < tileSize_width; xt+=VLEN_D )
880 F4 v_xt = (F4)(xt, xt+1, xt+2, xt+3);
881 F4 p = tmp[xt/VLEN_D];
882 F4 xp = v_xt * p, xxp = xp * v_xt;
888 x0.s0 += x0.s1 + x0.s2 + x0.s3;
889 x1.s0 += x1.s1 + x1.s2 + x1.s3;
890 x2.s0 += x2.s1 + x2.s2 + x2.s3;
891 x3.s0 += x3.s1 + x3.s2 + x3.s3;
893 F py = lidy * x0.s0, sy = lidy*lidy;
894 int bheight = min(tileSize_height, TILE_SIZE/2);
895 if(bheight >= TILE_SIZE/2&&lidy > bheight-1&&lidy < tileSize_height)
897 m[9][lidy-bheight] = ((F)py) * sy; // m03
898 m[8][lidy-bheight] = ((F)x1.s0) * sy; // m12
899 m[7][lidy-bheight] = ((F)x2.s0) * lidy; // m21
900 m[6][lidy-bheight] = x3.s0; // m30
901 m[5][lidy-bheight] = x0.s0 * sy; // m02
902 m[4][lidy-bheight] = x1.s0 * lidy; // m11
903 m[3][lidy-bheight] = x2.s0; // m20
904 m[2][lidy-bheight] = py; // m01
905 m[1][lidy-bheight] = x1.s0; // m10
906 m[0][lidy-bheight] = x0.s0; // m00
908 else if(lidy < bheight)
910 lm[9] = ((F)py) * sy; // m03
911 lm[8] = ((F)x1.s0) * sy; // m12
912 lm[7] = ((F)x2.s0) * lidy; // m21
913 lm[6] = x3.s0; // m30
914 lm[5] = x0.s0 * sy; // m02
915 lm[4] = x1.s0 * lidy; // m11
916 lm[3] = x2.s0; // m20
918 lm[1] = x1.s0; // m10
919 lm[0] = x0.s0; // m00
921 barrier(CLK_LOCAL_MEM_FENCE);
923 for( int j = TILE_SIZE/2; j >= 1; j = j/2 )
926 for( int i = 0; i < 10; i++ )
927 lm[i] = lm[i] + m[i][lidy];
928 barrier(CLK_LOCAL_MEM_FENCE);
929 if(lidy >= j/2&&lidy < j)
930 for( int i = 0; i < 10; i++ )
931 m[i][lidy-j/2] = lm[i];
932 barrier(CLK_LOCAL_MEM_FENCE);
934 if(lidy == 0&&lidx == 0)
936 for( int mt = 0; mt < 10; mt++ )
941 for( int mt = 0; mt < 10; mt++ )
945 F xm = x * mom[0], ym = y * mom[0];
947 // accumulate moments computed in each tile
948 dst_step /= sizeof(F);
951 *(dst_m + mad24(DST_ROW_00 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[0];
953 // + m10 ( = m10' + x*m00' )
954 *(dst_m + mad24(DST_ROW_10 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[1] + xm;
956 // + m01 ( = m01' + y*m00' )
957 *(dst_m + mad24(DST_ROW_01 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[2] + ym;
959 // + m20 ( = m20' + 2*x*m10' + x*x*m00' )
960 *(dst_m + mad24(DST_ROW_20 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[3] + x * (mom[1] * 2 + xm);
962 // + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' )
963 *(dst_m + mad24(DST_ROW_11 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[4] + x * (mom[2] + ym) + y * mom[1];
965 // + m02 ( = m02' + 2*y*m01' + y*y*m00' )
966 *(dst_m + mad24(DST_ROW_02 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[5] + y * (mom[2] * 2 + ym);
968 // + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' )
969 *(dst_m + mad24(DST_ROW_30 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm));
971 // + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20')
972 *(dst_m + mad24(DST_ROW_21 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3];
974 // + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02')
975 *(dst_m + mad24(DST_ROW_12 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5];
977 // + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' )
978 *(dst_m + mad24(DST_ROW_03 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym));