1 // This file is part of OpenCV project.
2 // It is subject to the license terms in the LICENSE file found in the top-level directory
3 // of this distribution and at http://opencv.org/license.html.
// Per-column (kx) and per-row (ky) weights used by OP(): each term of the
// final sum is pixel * ky[row] * kx[column].
// KERNEL_MATRIX_X / KERNEL_MATRIX_Y are not defined in the visible source;
// presumably they are injected as build options -- TODO confirm.
// The kernel indexes both tables with 0..4, so each initializer has to supply
// at least five values.
6 __constant float kx[] = { KERNEL_MATRIX_X };
7 __constant float ky[] = { KERNEL_MATRIX_Y };
// OP(y, x): one weighted term of the 5x5 sum. Takes window x of row y from the
// flat array `arr` (index y * 5 + x), widens it with convert_float4() and
// scales it by ky[y] * kx[x].
// `arr` must be in scope at the expansion site. The arguments are pasted
// without parentheses; every visible call site passes integer literals.
9 #define OP(y, x) (convert_float4(arr[y * 5 + x]) * ky[y] * kx[x])
// FILL_ARR(s1, s2, n, e1, e2): expands the 8-element vector line[n] into five
// overlapping 4-wide windows stored in arr[5 * n + 0 .. 5 * n + 4]:
//   .s0123, .s1234, .s2345, .s3456, .s4567
// When row_s is true, the leading elements of windows 0 and 1 are replaced:
//   window 0 = (s1, s2, line[n].s23), window 1 = (s2, line[n].s234).
// When row_e is true, the trailing elements of windows 3 and 4 are replaced:
//   window 3 = (line[n].s345, e1), window 4 = (line[n].s45, e1, e2).
// Window 2 is never patched.
// Relies on `arr`, `line`, `row_s` and `row_e` being in scope where it is
// expanded. The expansion is five separate statements (no do/while wrapper)
// and already ends with ';'.
11 #define FILL_ARR(s1, s2, n, e1, e2) \
12 arr[5 * n + 0] = row_s ? (uchar4)(s1, s2, line[n].s23) : (uchar4)(line[n].s0123); \
13 arr[5 * n + 1] = row_s ? (uchar4)(s2, line[n].s234) : (uchar4)(line[n].s1234); \
14 arr[5 * n + 2] = (uchar4)(line[n].s2345); \
15 arr[5 * n + 3] = row_e ? (uchar4)(line[n].s345, e1) : (uchar4)(line[n].s3456); \
16 arr[5 * n + 4] = row_e ? (uchar4)(line[n].s45, e1, e2) : (uchar4)(line[n].s4567);
// gaussianBlur5x5_8UC1_cols4: each work-item produces four horizontally
// adjacent output pixels as a 5x5 weighted sum (weights ky[row] * kx[column]).
//
//   src      - input pixels, read eight at a time with vload8().
//   src_step - distance between consecutive src rows, in uchar elements
//              (it is added directly to the uchar pointer).
//   dst      - output; one uint (four packed uchar results) per work-item.
//   dst_step - row stride of dst; divided by 4 before indexing the uint
//              pointer, so presumably expressed in bytes -- TODO confirm.
//   rows     - row count used for the bounds check and the bottom border.
//   cols     - column count used for the bounds check and the right border.
//
// NOTE(review): this listing omits several lines of the kernel (braces, the
// declarations of line/arr/tmp/t/s/e/s1/s2/e1/e2/sum, and some preprocessor
// lines), so the comments below describe only what is visible.
18 __kernel void gaussianBlur5x5_8UC1_cols4(__global const uchar* src, int src_step,
19 __global uint* dst, int dst_step, int rows, int cols)
// x is the first of the four columns handled by this work-item; y is its row.
21 int x = get_global_id(0) * 4;
22 int y = get_global_id(1);
// Work-items that start outside the image do nothing.
24 if (x >= cols || y >= rows) return;
27 int offset, src_index;
// src_index addresses the element two rows above and two columns left of (x, y).
29 src_index = x + (y - 2) * src_step - 2;
// line[2] is the current row (src_index + 2 * src_step). The offset is clamped
// to 0; when it ends up 0 the loaded vector is shifted right by two elements
// with zeros in front, so elements 0..5 of the load land in positions 2..7.
30 offset = max(0, src_index + 2 * src_step);
31 line[2] = vload8(0, src + offset);
32 if (offset == 0) line[2] = (uchar8)(0, 0, line[2].s0123, line[2].s45);
// ---- Rows above and below: constant / replicate border handling ----
34 #if defined BORDER_CONSTANT || defined BORDER_REPLICATE
// The statements that assign `tmp` in each sub-branch are not visible here.
36 #ifdef BORDER_CONSTANT
38 #elif defined BORDER_REPLICATE
// Default both upper rows to the border value `tmp`.
41 line[0] = line[1] = tmp;
// Reload of line[0] from row y - 2 (the guarding condition, if any, is not
// visible in this listing).
44 offset = max(0, src_index);
45 line[0] = vload8(0, src + offset);
46 if (offset == 0) line[0] = (uchar8)(0, 0, line[0].s0123, line[0].s45);
// Reload of line[1] from row y - 1 (guard likewise not visible).
51 offset = max(0, src_index + src_step);
52 line[1] = vload8(0, src + offset);
53 if (offset == 0) line[1] = (uchar8)(0, 0, line[1].s0123, line[1].s45);
// Rows y + 1 and y + 2 fall back to `tmp` when they lie past the last row.
56 line[3] = (y == (rows - 1)) ? tmp : vload8(0, src + src_index + 3 * src_step);
57 line[4] = (y >= (rows - 2)) ? tmp : vload8(0, src + src_index + 4 * src_step);
// NOTE(review): the preprocessor line that opens the next branch is not
// visible; the code below selects mirrored rows through the row offset `t`.
// For y <= 1, t evaluates to 3 (y == 0) or 1 (y == 1); otherwise 0.
60 t = (y <= 1) ? (abs(y - 1) - y + 2) : 0;
61 offset = max(0, src_index + t * src_step);
62 line[0] = vload8(0, src + offset);
63 if (offset == 0) line[0] = (uchar8)(0, 0, line[0].s0123, line[0].s45);
69 offset = max(0, src_index + 1 * src_step);
70 line[1] = vload8(0, src + offset);
// NOTE(review): the tail below is taken from line[0].s45, whereas every other
// shifted reload in this kernel uses the vector it is assigning (here that
// would be line[1].s45). Looks like a copy-paste slip -- confirm and fix.
71 if (offset == 0) line[1] = (uchar8)(0, 0, line[1].s0123, line[0].s45);
// On the last row, row y + 1 reuses the current row.
74 line[3] = (y == (rows - 1)) ? line[2] : vload8(0, src + src_index + 3 * src_step);
// For line[4], t evaluates to 1 when y == rows - 1 and 3 when y == rows - 2;
// otherwise 4 (row y + 2).
76 t = (y >= (rows - 2)) ? (abs(y - (rows - 1)) - (y - (rows - 2)) + 2) : 4;
77 line[4] = vload8(0, src + src_index + t * src_step);
// NOTE(review): unlike the branches above, this tests the macro's value rather
// than `defined BORDER_REFLECT_101` -- confirm that is intended.
78 #elif BORDER_REFLECT_101
// On the first row, line[0] is read from row y + 2 and line[1] from row y + 1.
83 offset = (y == 0) ? (src_index + 4 * src_step) : max(0, src_index);
84 line[0] = vload8(0, src + offset);
85 if (offset == 0) line[0] = (uchar8)(0, 0, line[0].s0123, line[0].s45);
88 offset = (y == 0) ? (src_index + 3 * src_step) : max(0, src_index + 1 * src_step);
89 line[1] = vload8(0, src + offset);
90 if (offset == 0) line[1] = (uchar8)(0, 0, line[1].s0123, line[1].s45);
// On the last row, line[3] is read from row y - 1.
92 line[3] = vload8(0, src + src_index + ((y == (rows - 1)) ? 1 : 3) * src_step);
// NOTE(review): as written, y == rows - 2 takes the `4` arm, i.e. row
// y + 2 == rows. Confirm that the lines missing from this listing cover that
// case.
97 line[4] = vload8(0, src + src_index + ((y == (rows - 1)) ? 1 : 4) * src_step);
// ---- Left / right border handling ----
// row_s: this work-item starts at column 0.
// row_e: its four pixels end exactly at the last column.
101 bool row_s = (x == 0);
102 bool row_e = ((x + 4) == cols);
// Expand each of the five rows into its five shifted windows. The border
// values passed to FILL_ARR (s, e, s1, s2, e1, e2) are assigned on lines that
// are not visible in this listing.
106 #ifdef BORDER_CONSTANT
109 FILL_ARR(s, s, 0, e, e);
110 FILL_ARR(s, s, 1, e, e);
111 FILL_ARR(s, s, 2, e, e);
112 FILL_ARR(s, s, 3, e, e);
113 FILL_ARR(s, s, 4, e, e);
114 #elif defined BORDER_REPLICATE
117 FILL_ARR(s, s, 0, e, e);
121 FILL_ARR(s, s, 1, e, e);
125 FILL_ARR(s, s, 2, e, e);
129 FILL_ARR(s, s, 3, e, e);
133 FILL_ARR(s, s, 4, e, e);
// NOTE(review): the preprocessor line separating the next group from the
// replicate branch is not visible. This group passes two distinct values per
// side (s1, s2 / e1, e2).
142 FILL_ARR(s1, s2, 0, e1, e2);
148 FILL_ARR(s1, s2, 1, e1, e2);
154 FILL_ARR(s1, s2, 2, e1, e2);
160 FILL_ARR(s1, s2, 3, e1, e2);
166 FILL_ARR(s1, s2, 4, e1, e2);
// NOTE(review): value test rather than `defined`, same as the earlier
// BORDER_REFLECT_101 branch.
167 #elif BORDER_REFLECT_101
// Here the left patch is (s, e) and the right patch is (s, e).
170 FILL_ARR(s, e, 0, s, e);
174 FILL_ARR(s, e, 1, s, e);
178 FILL_ARR(s, e, 2, s, e);
182 FILL_ARR(s, e, 3, s, e);
186 FILL_ARR(s, e, 4, s, e);
// Accumulate all 25 weighted windows; each OP() term carries four lanes, one
// per output pixel.
190 sum = OP(0, 0) + OP(0, 1) + OP(0, 2) + OP(0, 3) + OP(0, 4) +
191 OP(1, 0) + OP(1, 1) + OP(1, 2) + OP(1, 3) + OP(1, 4) +
192 OP(2, 0) + OP(2, 1) + OP(2, 2) + OP(2, 3) + OP(2, 4) +
193 OP(3, 0) + OP(3, 1) + OP(3, 2) + OP(3, 3) + OP(3, 4) +
194 OP(4, 0) + OP(4, 1) + OP(4, 2) + OP(4, 3) + OP(4, 4);
// One uint per work-item: the four results are converted to uchar with
// saturation and round-to-nearest-even, then reinterpreted as a single uint.
196 int dst_index = (x / 4) + y * (dst_step / 4);
197 dst[dst_index] = as_uint(convert_uchar4_sat_rte(sum));