1 /*M///////////////////////////////////////////////////////////////////////////////////////
3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
5 // By downloading, copying, installing or using the software you agree to this license.
6 // If you do not agree to this license, do not download, install,
7 // copy or use the software.
11 // For Open Source Computer Vision Library
13 // Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
14 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
15 // Third party copyrights are property of their respective owners.
18 // Dachuan Zhao, dachuan@multicorewareinc.com
20 // Redistribution and use in source and binary forms, with or without modification,
21 // are permitted provided that the following conditions are met:
23 // * Redistribution's of source code must retain the above copyright notice,
24 // this list of conditions and the following disclaimer.
26 // * Redistribution's in binary form must reproduce the above copyright notice,
27 // this list of conditions and the following disclaimer in the documentation
28 // and/or other materials provided with the distribution.
30 // * The name of the copyright holders may not be used to endorse or promote products
31 // derived from this software without specific prior written permission.
33 // This software is provided by the copyright holders and contributors as is and
34 // any express or implied warranties, including, but not limited to, the implied
35 // warranties of merchantability and fitness for a particular purpose are disclaimed.
36 // In no event shall the Intel Corporation or contributors be liable for any direct,
37 // indirect, incidental, special, exemplary, or consequential damages
38 // (including, but not limited to, procurement of substitute goods or services;
39 // loss of use, data, or profits; or business interruption) however caused
40 // and on any theory of liability, whether in contract, strict liability,
41 // or tort (including negligence or otherwise) arising in any way out of
42 // the use of this software, even if advised of the possibility of such damage.
46 int idx_row_low(int y, int last_row)
48 return abs(y) % (last_row + 1);
51 int idx_row_high(int y, int last_row)
53 return abs(last_row - (int)abs(last_row - y)) % (last_row + 1);
56 int idx_row(int y, int last_row)
58 return idx_row_low(idx_row_high(y, last_row), last_row);
61 int idx_col_low(int x, int last_col)
63 return abs(x) % (last_col + 1);
66 int idx_col_high(int x, int last_col)
68 return abs(last_col - (int)abs(last_col - x)) % (last_col + 1);
71 int idx_col(int x, int last_col)
73 return idx_col_low(idx_col_high(x, last_col), last_col);
76 ///////////////////////////////////////////////////////////////////////
77 ////////////////////////// CV_8UC1 ///////////////////////////////////
78 ///////////////////////////////////////////////////////////////////////
80 __kernel void pyrDown_C1_D0(__global uchar * srcData, int srcStep, int srcRows, int srcCols, __global uchar *dst, int dstStep, int dstCols)
82 const int x = get_global_id(0);
83 const int y = get_group_id(1);
85 __local float smem[256 + 4];
89 const int src_y = 2*y;
90 const int last_row = srcRows - 1;
91 const int last_col = srcCols - 1;
93 if (src_y >= 2 && src_y < srcRows - 2 && x >= 2 && x < srcCols - 2)
95 sum = 0.0625f * (((srcData + (src_y - 2) * srcStep))[x]);
96 sum = sum + 0.25f * (((srcData + (src_y - 1) * srcStep))[x]);
97 sum = sum + 0.375f * (((srcData + (src_y ) * srcStep))[x]);
98 sum = sum + 0.25f * (((srcData + (src_y + 1) * srcStep))[x]);
99 sum = sum + 0.0625f * (((srcData + (src_y + 2) * srcStep))[x]);
101 smem[2 + get_local_id(0)] = sum;
103 if (get_local_id(0) < 2)
105 const int left_x = x - 2;
107 sum = 0.0625f * (((srcData + (src_y - 2) * srcStep))[left_x]);
108 sum = sum + 0.25f * (((srcData + (src_y - 1) * srcStep))[left_x]);
109 sum = sum + 0.375f * (((srcData + (src_y ) * srcStep))[left_x]);
110 sum = sum + 0.25f * (((srcData + (src_y + 1) * srcStep))[left_x]);
111 sum = sum + 0.0625f * (((srcData + (src_y + 2) * srcStep))[left_x]);
113 smem[get_local_id(0)] = sum;
116 if (get_local_id(0) > 253)
118 const int right_x = x + 2;
120 sum = 0.0625f * (((srcData + (src_y - 2) * srcStep))[right_x]);
121 sum = sum + 0.25f * (((srcData + (src_y - 1) * srcStep))[right_x]);
122 sum = sum + 0.375f * (((srcData + (src_y ) * srcStep))[right_x]);
123 sum = sum + 0.25f * (((srcData + (src_y + 1) * srcStep))[right_x]);
124 sum = sum + 0.0625f * (((srcData + (src_y + 2) * srcStep))[right_x]);
126 smem[4 + get_local_id(0)] = sum;
131 int col = idx_col(x, last_col);
133 sum = 0.0625f * (((srcData + idx_row(src_y - 2, last_row) * srcStep))[col]);
134 sum = sum + 0.25f * (((srcData + idx_row(src_y - 1, last_row) * srcStep))[col]);
135 sum = sum + 0.375f * (((srcData + idx_row(src_y , last_row) * srcStep))[col]);
136 sum = sum + 0.25f * (((srcData + idx_row(src_y + 1, last_row) * srcStep))[col]);
137 sum = sum + 0.0625f * (((srcData + idx_row(src_y + 2, last_row) * srcStep))[col]);
139 smem[2 + get_local_id(0)] = sum;
141 if (get_local_id(0) < 2)
143 const int left_x = x - 2;
145 col = idx_col(left_x, last_col);
147 sum = 0.0625f * (((srcData + idx_row(src_y - 2, last_row) * srcStep))[col]);
148 sum = sum + 0.25f * (((srcData + idx_row(src_y - 1, last_row) * srcStep))[col]);
149 sum = sum + 0.375f * (((srcData + idx_row(src_y , last_row) * srcStep))[col]);
150 sum = sum + 0.25f * (((srcData + idx_row(src_y + 1, last_row) * srcStep))[col]);
151 sum = sum + 0.0625f * (((srcData + idx_row(src_y + 2, last_row) * srcStep))[col]);
153 smem[get_local_id(0)] = sum;
156 if (get_local_id(0) > 253)
158 const int right_x = x + 2;
160 col = idx_col(right_x, last_col);
162 sum = 0.0625f * (((srcData + idx_row(src_y - 2, last_row) * srcStep))[col]);
163 sum = sum + 0.25f * (((srcData + idx_row(src_y - 1, last_row) * srcStep))[col]);
164 sum = sum + 0.375f * (((srcData + idx_row(src_y , last_row) * srcStep))[col]);
165 sum = sum + 0.25f * (((srcData + idx_row(src_y + 1, last_row) * srcStep))[col]);
166 sum = sum + 0.0625f * (((srcData + idx_row(src_y + 2, last_row) * srcStep))[col]);
168 smem[4 + get_local_id(0)] = sum;
172 barrier(CLK_LOCAL_MEM_FENCE);
174 if (get_local_id(0) < 128)
176 const int tid2 = get_local_id(0) * 2;
178 sum = 0.0625f * smem[2 + tid2 - 2];
179 sum = sum + 0.25f * smem[2 + tid2 - 1];
180 sum = sum + 0.375f * smem[2 + tid2 ];
181 sum = sum + 0.25f * smem[2 + tid2 + 1];
182 sum = sum + 0.0625f * smem[2 + tid2 + 2];
184 const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2;
187 dst[y * dstStep + dst_x] = convert_uchar_sat_rte(sum);
191 ///////////////////////////////////////////////////////////////////////
192 ////////////////////////// CV_8UC4 ///////////////////////////////////
193 ///////////////////////////////////////////////////////////////////////
195 __kernel void pyrDown_C4_D0(__global uchar4 * srcData, int srcStep, int srcRows, int srcCols, __global uchar4 *dst, int dstStep, int dstCols)
197 const int x = get_global_id(0);
198 const int y = get_group_id(1);
200 __local float4 smem[256 + 4];
204 const int src_y = 2*y;
205 const int last_row = srcRows - 1;
206 const int last_col = srcCols - 1;
210 float4 co3 = 0.0625f;
212 if (src_y >= 2 && src_y < srcRows - 2 && x >= 2 && x < srcCols - 2)
214 sum = co3 * convert_float4((((srcData + (src_y - 2) * srcStep / 4))[x]));
215 sum = sum + co2 * convert_float4((((srcData + (src_y - 1) * srcStep / 4))[x]));
216 sum = sum + co1 * convert_float4((((srcData + (src_y ) * srcStep / 4))[x]));
217 sum = sum + co2 * convert_float4((((srcData + (src_y + 1) * srcStep / 4))[x]));
218 sum = sum + co3 * convert_float4((((srcData + (src_y + 2) * srcStep / 4))[x]));
220 smem[2 + get_local_id(0)] = sum;
222 if (get_local_id(0) < 2)
224 const int left_x = x - 2;
226 sum = co3 * convert_float4((((srcData + (src_y - 2) * srcStep / 4))[left_x]));
227 sum = sum + co2 * convert_float4((((srcData + (src_y - 1) * srcStep / 4))[left_x]));
228 sum = sum + co1 * convert_float4((((srcData + (src_y ) * srcStep / 4))[left_x]));
229 sum = sum + co2 * convert_float4((((srcData + (src_y + 1) * srcStep / 4))[left_x]));
230 sum = sum + co3 * convert_float4((((srcData + (src_y + 2) * srcStep / 4))[left_x]));
232 smem[get_local_id(0)] = sum;
235 if (get_local_id(0) > 253)
237 const int right_x = x + 2;
239 sum = co3 * convert_float4((((srcData + (src_y - 2) * srcStep / 4))[right_x]));
240 sum = sum + co2 * convert_float4((((srcData + (src_y - 1) * srcStep / 4))[right_x]));
241 sum = sum + co1 * convert_float4((((srcData + (src_y ) * srcStep / 4))[right_x]));
242 sum = sum + co2 * convert_float4((((srcData + (src_y + 1) * srcStep / 4))[right_x]));
243 sum = sum + co3 * convert_float4((((srcData + (src_y + 2) * srcStep / 4))[right_x]));
245 smem[4 + get_local_id(0)] = sum;
250 int col = idx_col(x, last_col);
252 sum = co3 * convert_float4((((srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[col]));
253 sum = sum + co2 * convert_float4((((srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col]));
254 sum = sum + co1 * convert_float4((((srcData + idx_row(src_y , last_row) * srcStep / 4))[col]));
255 sum = sum + co2 * convert_float4((((srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col]));
256 sum = sum + co3 * convert_float4((((srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[col]));
258 smem[2 + get_local_id(0)] = sum;
260 if (get_local_id(0) < 2)
262 const int left_x = x - 2;
264 col = idx_col(left_x, last_col);
266 sum = co3 * convert_float4((((srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[col]));
267 sum = sum + co2 * convert_float4((((srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col]));
268 sum = sum + co1 * convert_float4((((srcData + idx_row(src_y , last_row) * srcStep / 4))[col]));
269 sum = sum + co2 * convert_float4((((srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col]));
270 sum = sum + co3 * convert_float4((((srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[col]));
272 smem[get_local_id(0)] = sum;
275 if (get_local_id(0) > 253)
277 const int right_x = x + 2;
279 col = idx_col(right_x, last_col);
281 sum = co3 * convert_float4((((srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[col]));
282 sum = sum + co2 * convert_float4((((srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col]));
283 sum = sum + co1 * convert_float4((((srcData + idx_row(src_y , last_row) * srcStep / 4))[col]));
284 sum = sum + co2 * convert_float4((((srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col]));
285 sum = sum + co3 * convert_float4((((srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[col]));
287 smem[4 + get_local_id(0)] = sum;
291 barrier(CLK_LOCAL_MEM_FENCE);
293 if (get_local_id(0) < 128)
295 const int tid2 = get_local_id(0) * 2;
297 sum = co3 * smem[2 + tid2 - 2];
298 sum = sum + co2 * smem[2 + tid2 - 1];
299 sum = sum + co1 * smem[2 + tid2 ];
300 sum = sum + co2 * smem[2 + tid2 + 1];
301 sum = sum + co3 * smem[2 + tid2 + 2];
303 const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2;
306 dst[y * dstStep / 4 + dst_x] = convert_uchar4_sat_rte(sum);
310 ///////////////////////////////////////////////////////////////////////
311 ////////////////////////// CV_16UC1 //////////////////////////////////
312 ///////////////////////////////////////////////////////////////////////
314 __kernel void pyrDown_C1_D2(__global ushort * srcData, int srcStep, int srcRows, int srcCols, __global ushort *dst, int dstStep, int dstCols)
316 const int x = get_global_id(0);
317 const int y = get_group_id(1);
319 __local float smem[256 + 4];
323 const int src_y = 2*y;
324 const int last_row = srcRows - 1;
325 const int last_col = srcCols - 1;
327 if (src_y >= 2 && src_y < srcRows - 2 && x >= 2 && x < srcCols - 2)
329 sum = 0.0625f * ((__global ushort*)((__global char*)srcData + (src_y - 2) * srcStep))[x];
330 sum = sum + 0.25f * ((__global ushort*)((__global char*)srcData + (src_y - 1) * srcStep))[x];
331 sum = sum + 0.375f * ((__global ushort*)((__global char*)srcData + (src_y ) * srcStep))[x];
332 sum = sum + 0.25f * ((__global ushort*)((__global char*)srcData + (src_y + 1) * srcStep))[x];
333 sum = sum + 0.0625f * ((__global ushort*)((__global char*)srcData + (src_y + 2) * srcStep))[x];
335 smem[2 + get_local_id(0)] = sum;
337 if (get_local_id(0) < 2)
339 const int left_x = x - 2;
341 sum = 0.0625f * ((__global ushort*)((__global char*)srcData + (src_y - 2) * srcStep))[left_x];
342 sum = sum + 0.25f * ((__global ushort*)((__global char*)srcData + (src_y - 1) * srcStep))[left_x];
343 sum = sum + 0.375f * ((__global ushort*)((__global char*)srcData + (src_y ) * srcStep))[left_x];
344 sum = sum + 0.25f * ((__global ushort*)((__global char*)srcData + (src_y + 1) * srcStep))[left_x];
345 sum = sum + 0.0625f * ((__global ushort*)((__global char*)srcData + (src_y + 2) * srcStep))[left_x];
347 smem[get_local_id(0)] = sum;
350 if (get_local_id(0) > 253)
352 const int right_x = x + 2;
354 sum = 0.0625f * ((__global ushort*)((__global char*)srcData + (src_y - 2) * srcStep))[right_x];
355 sum = sum + 0.25f * ((__global ushort*)((__global char*)srcData + (src_y - 1) * srcStep))[right_x];
356 sum = sum + 0.375f * ((__global ushort*)((__global char*)srcData + (src_y ) * srcStep))[right_x];
357 sum = sum + 0.25f * ((__global ushort*)((__global char*)srcData + (src_y + 1) * srcStep))[right_x];
358 sum = sum + 0.0625f * ((__global ushort*)((__global char*)srcData + (src_y + 2) * srcStep))[right_x];
360 smem[4 + get_local_id(0)] = sum;
365 int col = idx_col(x, last_col);
367 sum = 0.0625f * ((__global ushort*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[col];
368 sum = sum + 0.25f * ((__global ushort*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[col];
369 sum = sum + 0.375f * ((__global ushort*)((__global char*)srcData + idx_row(src_y , last_row) * srcStep))[col];
370 sum = sum + 0.25f * ((__global ushort*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[col];
371 sum = sum + 0.0625f * ((__global ushort*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[col];
373 smem[2 + get_local_id(0)] = sum;
375 if (get_local_id(0) < 2)
377 const int left_x = x - 2;
379 col = idx_col(left_x, last_col);
381 sum = 0.0625f * ((__global ushort*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[col];
382 sum = sum + 0.25f * ((__global ushort*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[col];
383 sum = sum + 0.375f * ((__global ushort*)((__global char*)srcData + idx_row(src_y , last_row) * srcStep))[col];
384 sum = sum + 0.25f * ((__global ushort*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[col];
385 sum = sum + 0.0625f * ((__global ushort*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[col];
387 smem[get_local_id(0)] = sum;
390 if (get_local_id(0) > 253)
392 const int right_x = x + 2;
394 col = idx_col(right_x, last_col);
396 sum = 0.0625f * ((__global ushort*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[col];
397 sum = sum + 0.25f * ((__global ushort*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[col];
398 sum = sum + 0.375f * ((__global ushort*)((__global char*)srcData + idx_row(src_y , last_row) * srcStep))[col];
399 sum = sum + 0.25f * ((__global ushort*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[col];
400 sum = sum + 0.0625f * ((__global ushort*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[col];
402 smem[4 + get_local_id(0)] = sum;
406 barrier(CLK_LOCAL_MEM_FENCE);
408 if (get_local_id(0) < 128)
410 const int tid2 = get_local_id(0) * 2;
412 sum = 0.0625f * smem[2 + tid2 - 2];
413 sum = sum + 0.25f * smem[2 + tid2 - 1];
414 sum = sum + 0.375f * smem[2 + tid2 ];
415 sum = sum + 0.25f * smem[2 + tid2 + 1];
416 sum = sum + 0.0625f * smem[2 + tid2 + 2];
418 const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2;
421 dst[y * dstStep / 2 + dst_x] = convert_ushort_sat_rte(sum);
425 ///////////////////////////////////////////////////////////////////////
426 ////////////////////////// CV_16UC4 //////////////////////////////////
427 ///////////////////////////////////////////////////////////////////////
429 __kernel void pyrDown_C4_D2(__global ushort4 * srcData, int srcStep, int srcRows, int srcCols, __global ushort4 *dst, int dstStep, int dstCols)
431 const int x = get_global_id(0);
432 const int y = get_group_id(1);
434 __local float4 smem[256 + 4];
438 const int src_y = 2*y;
439 const int last_row = srcRows - 1;
440 const int last_col = srcCols - 1;
444 float4 co3 = 0.0625f;
446 if (src_y >= 2 && src_y < srcRows - 2 && x >= 2 && x < srcCols - 2)
448 sum = co3 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y - 2) * srcStep / 4))[x]);
449 sum = sum + co2 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y - 1) * srcStep / 4))[x]);
450 sum = sum + co1 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y ) * srcStep / 4))[x]);
451 sum = sum + co2 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y + 1) * srcStep / 4))[x]);
452 sum = sum + co3 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y + 2) * srcStep / 4))[x]);
454 smem[2 + get_local_id(0)] = sum;
456 if (get_local_id(0) < 2)
458 const int left_x = x - 2;
460 sum = co3 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y - 2) * srcStep / 4))[left_x]);
461 sum = sum + co2 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y - 1) * srcStep / 4))[left_x]);
462 sum = sum + co1 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y ) * srcStep / 4))[left_x]);
463 sum = sum + co2 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y + 1) * srcStep / 4))[left_x]);
464 sum = sum + co3 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y + 2) * srcStep / 4))[left_x]);
466 smem[get_local_id(0)] = sum;
469 if (get_local_id(0) > 253)
471 const int right_x = x + 2;
473 sum = co3 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y - 2) * srcStep / 4))[right_x]);
474 sum = sum + co2 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y - 1) * srcStep / 4))[right_x]);
475 sum = sum + co1 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y ) * srcStep / 4))[right_x]);
476 sum = sum + co2 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y + 1) * srcStep / 4))[right_x]);
477 sum = sum + co3 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y + 2) * srcStep / 4))[right_x]);
479 smem[4 + get_local_id(0)] = sum;
484 int col = idx_col(x, last_col);
486 sum = co3 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[col]);
487 sum = sum + co2 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col]);
488 sum = sum + co1 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y , last_row) * srcStep / 4))[col]);
489 sum = sum + co2 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col]);
490 sum = sum + co3 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[col]);
492 smem[2 + get_local_id(0)] = sum;
494 if (get_local_id(0) < 2)
496 const int left_x = x - 2;
498 col = idx_col(left_x, last_col);
500 sum = co3 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[col]);
501 sum = sum + co2 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col]);
502 sum = sum + co1 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y , last_row) * srcStep / 4))[col]);
503 sum = sum + co2 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col]);
504 sum = sum + co3 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[col]);
506 smem[get_local_id(0)] = sum;
509 if (get_local_id(0) > 253)
511 const int right_x = x + 2;
513 col = idx_col(right_x, last_col);
515 sum = co3 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[col]);
516 sum = sum + co2 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col]);
517 sum = sum + co1 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y , last_row) * srcStep / 4))[col]);
518 sum = sum + co2 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col]);
519 sum = sum + co3 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[col]);
521 smem[4 + get_local_id(0)] = sum;
525 barrier(CLK_LOCAL_MEM_FENCE);
527 if (get_local_id(0) < 128)
529 const int tid2 = get_local_id(0) * 2;
531 sum = co3 * smem[2 + tid2 - 2];
532 sum = sum + co2 * smem[2 + tid2 - 1];
533 sum = sum + co1 * smem[2 + tid2 ];
534 sum = sum + co2 * smem[2 + tid2 + 1];
535 sum = sum + co3 * smem[2 + tid2 + 2];
537 const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2;
540 dst[y * dstStep / 8 + dst_x] = convert_ushort4_sat_rte(sum);
544 ///////////////////////////////////////////////////////////////////////
545 ////////////////////////// CV_16SC1 //////////////////////////////////
546 ///////////////////////////////////////////////////////////////////////
548 __kernel void pyrDown_C1_D3(__global short * srcData, int srcStep, int srcRows, int srcCols, __global short *dst, int dstStep, int dstCols)
550 const int x = get_global_id(0);
551 const int y = get_group_id(1);
553 __local float smem[256 + 4];
557 const int src_y = 2*y;
558 const int last_row = srcRows - 1;
559 const int last_col = srcCols - 1;
561 if (src_y >= 2 && src_y < srcRows - 2 && x >= 2 && x < srcCols - 2)
563 sum = 0.0625f * ((__global short*)((__global char*)srcData + (src_y - 2) * srcStep))[x];
564 sum = sum + 0.25f * ((__global short*)((__global char*)srcData + (src_y - 1) * srcStep))[x];
565 sum = sum + 0.375f * ((__global short*)((__global char*)srcData + (src_y ) * srcStep))[x];
566 sum = sum + 0.25f * ((__global short*)((__global char*)srcData + (src_y + 1) * srcStep))[x];
567 sum = sum + 0.0625f * ((__global short*)((__global char*)srcData + (src_y + 2) * srcStep))[x];
569 smem[2 + get_local_id(0)] = sum;
571 if (get_local_id(0) < 2)
573 const int left_x = x - 2;
575 sum = 0.0625f * ((__global short*)((__global char*)srcData + (src_y - 2) * srcStep))[left_x];
576 sum = sum + 0.25f * ((__global short*)((__global char*)srcData + (src_y - 1) * srcStep))[left_x];
577 sum = sum + 0.375f * ((__global short*)((__global char*)srcData + (src_y ) * srcStep))[left_x];
578 sum = sum + 0.25f * ((__global short*)((__global char*)srcData + (src_y + 1) * srcStep))[left_x];
579 sum = sum + 0.0625f * ((__global short*)((__global char*)srcData + (src_y + 2) * srcStep))[left_x];
581 smem[get_local_id(0)] = sum;
584 if (get_local_id(0) > 253)
586 const int right_x = x + 2;
588 sum = 0.0625f * ((__global short*)((__global char*)srcData + (src_y - 2) * srcStep))[right_x];
589 sum = sum + 0.25f * ((__global short*)((__global char*)srcData + (src_y - 1) * srcStep))[right_x];
590 sum = sum + 0.375f * ((__global short*)((__global char*)srcData + (src_y ) * srcStep))[right_x];
591 sum = sum + 0.25f * ((__global short*)((__global char*)srcData + (src_y + 1) * srcStep))[right_x];
592 sum = sum + 0.0625f * ((__global short*)((__global char*)srcData + (src_y + 2) * srcStep))[right_x];
594 smem[4 + get_local_id(0)] = sum;
599 int col = idx_col(x, last_col);
601 sum = 0.0625f * ((__global short*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[col];
602 sum = sum + 0.25f * ((__global short*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[col];
603 sum = sum + 0.375f * ((__global short*)((__global char*)srcData + idx_row(src_y , last_row) * srcStep))[col];
604 sum = sum + 0.25f * ((__global short*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[col];
605 sum = sum + 0.0625f * ((__global short*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[col];
607 smem[2 + get_local_id(0)] = sum;
609 if (get_local_id(0) < 2)
611 const int left_x = x - 2;
613 col = idx_col(left_x, last_col);
615 sum = 0.0625f * ((__global short*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[col];
616 sum = sum + 0.25f * ((__global short*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[col];
617 sum = sum + 0.375f * ((__global short*)((__global char*)srcData + idx_row(src_y , last_row) * srcStep))[col];
618 sum = sum + 0.25f * ((__global short*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[col];
619 sum = sum + 0.0625f * ((__global short*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[col];
621 smem[get_local_id(0)] = sum;
624 if (get_local_id(0) > 253)
626 const int right_x = x + 2;
628 col = idx_col(right_x, last_col);
630 sum = 0.0625f * ((__global short*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[col];
631 sum = sum + 0.25f * ((__global short*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[col];
632 sum = sum + 0.375f * ((__global short*)((__global char*)srcData + idx_row(src_y , last_row) * srcStep))[col];
633 sum = sum + 0.25f * ((__global short*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[col];
634 sum = sum + 0.0625f * ((__global short*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[col];
636 smem[4 + get_local_id(0)] = sum;
640 barrier(CLK_LOCAL_MEM_FENCE);
642 if (get_local_id(0) < 128)
644 const int tid2 = get_local_id(0) * 2;
646 sum = 0.0625f * smem[2 + tid2 - 2];
647 sum = sum + 0.25f * smem[2 + tid2 - 1];
648 sum = sum + 0.375f * smem[2 + tid2 ];
649 sum = sum + 0.25f * smem[2 + tid2 + 1];
650 sum = sum + 0.0625f * smem[2 + tid2 + 2];
652 const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2;
655 dst[y * dstStep / 2 + dst_x] = convert_short_sat_rte(sum);
659 ///////////////////////////////////////////////////////////////////////
660 ////////////////////////// CV_16SC4 //////////////////////////////////
661 ///////////////////////////////////////////////////////////////////////
663 __kernel void pyrDown_C4_D3(__global short4 * srcData, int srcStep, int srcRows, int srcCols, __global short4 *dst, int dstStep, int dstCols)
665 const int x = get_global_id(0);
666 const int y = get_group_id(1);
668 __local float4 smem[256 + 4];
672 const int src_y = 2*y;
673 const int last_row = srcRows - 1;
674 const int last_col = srcCols - 1;
678 float4 co3 = 0.0625f;
680 if (src_y >= 2 && src_y < srcRows - 2 && x >= 2 && x < srcCols - 2)
682 sum = co3 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y - 2) * srcStep / 4))[x]);
683 sum = sum + co2 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y - 1) * srcStep / 4))[x]);
684 sum = sum + co1 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y ) * srcStep / 4))[x]);
685 sum = sum + co2 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y + 1) * srcStep / 4))[x]);
686 sum = sum + co3 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y + 2) * srcStep / 4))[x]);
688 smem[2 + get_local_id(0)] = sum;
690 if (get_local_id(0) < 2)
692 const int left_x = x - 2;
694 sum = co3 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y - 2) * srcStep / 4))[left_x]);
695 sum = sum + co2 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y - 1) * srcStep / 4))[left_x]);
696 sum = sum + co1 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y ) * srcStep / 4))[left_x]);
697 sum = sum + co2 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y + 1) * srcStep / 4))[left_x]);
698 sum = sum + co3 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y + 2) * srcStep / 4))[left_x]);
700 smem[get_local_id(0)] = sum;
703 if (get_local_id(0) > 253)
705 const int right_x = x + 2;
707 sum = co3 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y - 2) * srcStep / 4))[right_x]);
708 sum = sum + co2 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y - 1) * srcStep / 4))[right_x]);
709 sum = sum + co1 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y ) * srcStep / 4))[right_x]);
710 sum = sum + co2 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y + 1) * srcStep / 4))[right_x]);
711 sum = sum + co3 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y + 2) * srcStep / 4))[right_x]);
713 smem[4 + get_local_id(0)] = sum;
718 int col = idx_col(x, last_col);
720 sum = co3 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[col]);
721 sum = sum + co2 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col]);
722 sum = sum + co1 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y , last_row) * srcStep / 4))[col]);
723 sum = sum + co2 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col]);
724 sum = sum + co3 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[col]);
726 smem[2 + get_local_id(0)] = sum;
728 if (get_local_id(0) < 2)
730 const int left_x = x - 2;
732 col = idx_col(left_x, last_col);
734 sum = co3 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[col]);
735 sum = sum + co2 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col]);
736 sum = sum + co1 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y , last_row) * srcStep / 4))[col]);
737 sum = sum + co2 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col]);
738 sum = sum + co3 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[col]);
740 smem[get_local_id(0)] = sum;
743 if (get_local_id(0) > 253)
745 const int right_x = x + 2;
747 col = idx_col(right_x, last_col);
749 sum = co3 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[col]);
750 sum = sum + co2 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col]);
751 sum = sum + co1 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y , last_row) * srcStep / 4))[col]);
752 sum = sum + co2 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col]);
753 sum = sum + co3 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[col]);
755 smem[4 + get_local_id(0)] = sum;
759 barrier(CLK_LOCAL_MEM_FENCE);
761 if (get_local_id(0) < 128)
763 const int tid2 = get_local_id(0) * 2;
765 sum = co3 * smem[2 + tid2 - 2];
766 sum = sum + co2 * smem[2 + tid2 - 1];
767 sum = sum + co1 * smem[2 + tid2 ];
768 sum = sum + co2 * smem[2 + tid2 + 1];
769 sum = sum + co3 * smem[2 + tid2 + 2];
771 const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2;
774 dst[y * dstStep / 8 + dst_x] = convert_short4_sat_rte(sum);
778 ///////////////////////////////////////////////////////////////////////
779 ////////////////////////// CV_32FC1 //////////////////////////////////
780 ///////////////////////////////////////////////////////////////////////
782 __kernel void pyrDown_C1_D5(__global float * srcData, int srcStep, int srcRows, int srcCols, __global float *dst, int dstStep, int dstCols)
784 const int x = get_global_id(0);
785 const int y = get_group_id(1);
787 __local float smem[256 + 4];
791 const int src_y = 2*y;
792 const int last_row = srcRows - 1;
793 const int last_col = srcCols - 1;
795 if (src_y >= 2 && src_y < srcRows - 2 && x >= 2 && x < srcCols - 2)
797 sum = 0.0625f * ((__global float*)((__global char*)srcData + (src_y - 2) * srcStep))[x];
798 sum = sum + 0.25f * ((__global float*)((__global char*)srcData + (src_y - 1) * srcStep))[x];
799 sum = sum + 0.375f * ((__global float*)((__global char*)srcData + (src_y ) * srcStep))[x];
800 sum = sum + 0.25f * ((__global float*)((__global char*)srcData + (src_y + 1) * srcStep))[x];
801 sum = sum + 0.0625f * ((__global float*)((__global char*)srcData + (src_y + 2) * srcStep))[x];
803 smem[2 + get_local_id(0)] = sum;
805 if (get_local_id(0) < 2)
807 const int left_x = x - 2;
809 sum = 0.0625f * ((__global float*)((__global char*)srcData + (src_y - 2) * srcStep))[left_x];
810 sum = sum + 0.25f * ((__global float*)((__global char*)srcData + (src_y - 1) * srcStep))[left_x];
811 sum = sum + 0.375f * ((__global float*)((__global char*)srcData + (src_y ) * srcStep))[left_x];
812 sum = sum + 0.25f * ((__global float*)((__global char*)srcData + (src_y + 1) * srcStep))[left_x];
813 sum = sum + 0.0625f * ((__global float*)((__global char*)srcData + (src_y + 2) * srcStep))[left_x];
815 smem[get_local_id(0)] = sum;
818 if (get_local_id(0) > 253)
820 const int right_x = x + 2;
822 sum = 0.0625f * ((__global float*)((__global char*)srcData + (src_y - 2) * srcStep))[right_x];
823 sum = sum + 0.25f * ((__global float*)((__global char*)srcData + (src_y - 1) * srcStep))[right_x];
824 sum = sum + 0.375f * ((__global float*)((__global char*)srcData + (src_y ) * srcStep))[right_x];
825 sum = sum + 0.25f * ((__global float*)((__global char*)srcData + (src_y + 1) * srcStep))[right_x];
826 sum = sum + 0.0625f * ((__global float*)((__global char*)srcData + (src_y + 2) * srcStep))[right_x];
828 smem[4 + get_local_id(0)] = sum;
833 int col = idx_col(x, last_col);
835 sum = 0.0625f * ((__global float*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[col];
836 sum = sum + 0.25f * ((__global float*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[col];
837 sum = sum + 0.375f * ((__global float*)((__global char*)srcData + idx_row(src_y , last_row) * srcStep))[col];
838 sum = sum + 0.25f * ((__global float*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[col];
839 sum = sum + 0.0625f * ((__global float*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[col];
841 smem[2 + get_local_id(0)] = sum;
843 if (get_local_id(0) < 2)
845 const int left_x = x - 2;
847 col = idx_col(left_x, last_col);
849 sum = 0.0625f * ((__global float*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[col];
850 sum = sum + 0.25f * ((__global float*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[col];
851 sum = sum + 0.375f * ((__global float*)((__global char*)srcData + idx_row(src_y , last_row) * srcStep))[col];
852 sum = sum + 0.25f * ((__global float*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[col];
853 sum = sum + 0.0625f * ((__global float*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[col];
855 smem[get_local_id(0)] = sum;
858 if (get_local_id(0) > 253)
860 const int right_x = x + 2;
862 col = idx_col(right_x, last_col);
864 sum = 0.0625f * ((__global float*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[col];
865 sum = sum + 0.25f * ((__global float*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[col];
866 sum = sum + 0.375f * ((__global float*)((__global char*)srcData + idx_row(src_y , last_row) * srcStep))[col];
867 sum = sum + 0.25f * ((__global float*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[col];
868 sum = sum + 0.0625f * ((__global float*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[col];
870 smem[4 + get_local_id(0)] = sum;
874 barrier(CLK_LOCAL_MEM_FENCE);
876 if (get_local_id(0) < 128)
878 const int tid2 = get_local_id(0) * 2;
880 sum = 0.0625f * smem[2 + tid2 - 2];
881 sum = sum + 0.25f * smem[2 + tid2 - 1];
882 sum = sum + 0.375f * smem[2 + tid2 ];
883 sum = sum + 0.25f * smem[2 + tid2 + 1];
884 sum = sum + 0.0625f * smem[2 + tid2 + 2];
886 const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2;
889 dst[y * dstStep / 4 + dst_x] = sum;
893 ///////////////////////////////////////////////////////////////////////
894 ////////////////////////// CV_32FC4 //////////////////////////////////
895 ///////////////////////////////////////////////////////////////////////
897 __kernel void pyrDown_C4_D5(__global float4 * srcData, int srcStep, int srcRows, int srcCols, __global float4 *dst, int dstStep, int dstCols)
899 const int x = get_global_id(0);
900 const int y = get_group_id(1);
902 __local float4 smem[256 + 4];
906 const int src_y = 2*y;
907 const int last_row = srcRows - 1;
908 const int last_col = srcCols - 1;
912 float4 co3 = 0.0625f;
914 if (src_y >= 2 && src_y < srcRows - 2 && x >= 2 && x < srcCols - 2)
916 sum = co3 * ((__global float4*)((__global char4*)srcData + (src_y - 2) * srcStep / 4))[x];
917 sum = sum + co2 * ((__global float4*)((__global char4*)srcData + (src_y - 1) * srcStep / 4))[x];
918 sum = sum + co1 * ((__global float4*)((__global char4*)srcData + (src_y ) * srcStep / 4))[x];
919 sum = sum + co2 * ((__global float4*)((__global char4*)srcData + (src_y + 1) * srcStep / 4))[x];
920 sum = sum + co3 * ((__global float4*)((__global char4*)srcData + (src_y + 2) * srcStep / 4))[x];
922 smem[2 + get_local_id(0)] = sum;
924 if (get_local_id(0) < 2)
926 const int left_x = x - 2;
928 sum = co3 * ((__global float4*)((__global char4*)srcData + (src_y - 2) * srcStep / 4))[left_x];
929 sum = sum + co2 * ((__global float4*)((__global char4*)srcData + (src_y - 1) * srcStep / 4))[left_x];
930 sum = sum + co1 * ((__global float4*)((__global char4*)srcData + (src_y ) * srcStep / 4))[left_x];
931 sum = sum + co2 * ((__global float4*)((__global char4*)srcData + (src_y + 1) * srcStep / 4))[left_x];
932 sum = sum + co3 * ((__global float4*)((__global char4*)srcData + (src_y + 2) * srcStep / 4))[left_x];
934 smem[get_local_id(0)] = sum;
937 if (get_local_id(0) > 253)
939 const int right_x = x + 2;
941 sum = co3 * ((__global float4*)((__global char4*)srcData + (src_y - 2) * srcStep / 4))[right_x];
942 sum = sum + co2 * ((__global float4*)((__global char4*)srcData + (src_y - 1) * srcStep / 4))[right_x];
943 sum = sum + co1 * ((__global float4*)((__global char4*)srcData + (src_y ) * srcStep / 4))[right_x];
944 sum = sum + co2 * ((__global float4*)((__global char4*)srcData + (src_y + 1) * srcStep / 4))[right_x];
945 sum = sum + co3 * ((__global float4*)((__global char4*)srcData + (src_y + 2) * srcStep / 4))[right_x];
947 smem[4 + get_local_id(0)] = sum;
952 int col = idx_col(x, last_col);
954 sum = co3 * ((__global float4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[col];
955 sum = sum + co2 * ((__global float4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col];
956 sum = sum + co1 * ((__global float4*)((__global char4*)srcData + idx_row(src_y , last_row) * srcStep / 4))[col];
957 sum = sum + co2 * ((__global float4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col];
958 sum = sum + co3 * ((__global float4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[col];
960 smem[2 + get_local_id(0)] = sum;
962 if (get_local_id(0) < 2)
964 const int left_x = x - 2;
966 col = idx_col(left_x, last_col);
968 sum = co3 * ((__global float4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[col];
969 sum = sum + co2 * ((__global float4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col];
970 sum = sum + co1 * ((__global float4*)((__global char4*)srcData + idx_row(src_y , last_row) * srcStep / 4))[col];
971 sum = sum + co2 * ((__global float4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col];
972 sum = sum + co3 * ((__global float4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[col];
974 smem[get_local_id(0)] = sum;
977 if (get_local_id(0) > 253)
979 const int right_x = x + 2;
981 col = idx_col(right_x, last_col);
983 sum = co3 * ((__global float4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[col];
984 sum = sum + co2 * ((__global float4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col];
985 sum = sum + co1 * ((__global float4*)((__global char4*)srcData + idx_row(src_y , last_row) * srcStep / 4))[col];
986 sum = sum + co2 * ((__global float4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col];
987 sum = sum + co3 * ((__global float4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[col];
989 smem[4 + get_local_id(0)] = sum;
993 barrier(CLK_LOCAL_MEM_FENCE);
995 if (get_local_id(0) < 128)
997 const int tid2 = get_local_id(0) * 2;
999 sum = co3 * smem[2 + tid2 - 2];
1000 sum = sum + co2 * smem[2 + tid2 - 1];
1001 sum = sum + co1 * smem[2 + tid2 ];
1002 sum = sum + co2 * smem[2 + tid2 + 1];
1003 sum = sum + co3 * smem[2 + tid2 + 2];
1005 const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2;
1007 if (dst_x < dstCols)
1008 dst[y * dstStep / 16 + dst_x] = sum;