d00bec84aa22886420a9a429ce7928ab25afe5e5
[profile/ivi/opencv.git] / modules / gpu / src / cuda / column_filter.cu
1 /*M///////////////////////////////////////////////////////////////////////////////////////\r
2 //\r
3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.\r
4 //\r
5 //  By downloading, copying, installing or using the software you agree to this license.\r
6 //  If you do not agree to this license, do not download, install,\r
7 //  copy or use the software.\r
8 //\r
9 //\r
10 //                           License Agreement\r
11 //                For Open Source Computer Vision Library\r
12 //\r
13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.\r
14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.\r
15 // Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved.\r
16 // Third party copyrights are property of their respective owners.\r
17 //\r
18 // Redistribution and use in source and binary forms, with or without modification,\r
19 // are permitted provided that the following conditions are met:\r
20 //\r
21 //   * Redistribution's of source code must retain the above copyright notice,\r
22 //     this list of conditions and the following disclaimer.\r
23 //\r
24 //   * Redistribution's in binary form must reproduce the above copyright notice,\r
25 //     this list of conditions and the following disclaimer in the documentation\r
26 //     and/or other materials provided with the distribution.\r
27 //\r
28 //   * The name of the copyright holders may not be used to endorse or promote products\r
29 //     derived from this software without specific prior written permission.\r
30 //\r
31 // This software is provided by the copyright holders and contributors "as is" and\r
32 // any express or implied warranties, including, but not limited to, the implied\r
33 // warranties of merchantability and fitness for a particular purpose are disclaimed.\r
34 // In no event shall the Intel Corporation or contributors be liable for any direct,\r
35 // indirect, incidental, special, exemplary, or consequential damages\r
36 // (including, but not limited to, procurement of substitute goods or services;\r
37 // loss of use, data, or profits; or business interruption) however caused\r
38 // and on any theory of liability, whether in contract, strict liability,\r
39 // or tort (including negligence or otherwise) arising in any way out of\r
40 // the use of this software, even if advised of the possibility of such damage.\r
41 //\r
42 //M*/\r
43 \r
44 #include "internal_shared.hpp"\r
45 #include "opencv2/gpu/device/saturate_cast.hpp"\r
46 #include "opencv2/gpu/device/vec_math.hpp"\r
47 #include "opencv2/gpu/device/limits.hpp"\r
48 #include "opencv2/gpu/device/border_interpolate.hpp"\r
49 #include "opencv2/gpu/device/static_check.hpp"\r
50 \r
51 namespace cv { namespace gpu { namespace device \r
52 {\r
53     namespace column_filter \r
54     {\r
55         #define MAX_KERNEL_SIZE 32\r
56 \r
57         __constant__ float c_kernel[MAX_KERNEL_SIZE];\r
58 \r
59         void loadKernel(const float kernel[], int ksize)\r
60         {\r
61             cudaSafeCall( cudaMemcpyToSymbol(c_kernel, kernel, ksize * sizeof(float)) );\r
62         }\r
63 \r
64         template <int KSIZE, typename T, typename D, typename B>\r
65         __global__ void linearColumnFilter(const DevMem2D_<T> src, PtrStep<D> dst, const int anchor, const B brd)\r
66         {\r
67             #if __CUDA_ARCH__ >= 200\r
68                 const int BLOCK_DIM_X = 16;\r
69                 const int BLOCK_DIM_Y = 16;\r
70                 const int PATCH_PER_BLOCK = 4;\r
71                 const int HALO_SIZE = KSIZE <= 16 ? 1 : 2;\r
72             #else\r
73                 const int BLOCK_DIM_X = 16;\r
74                 const int BLOCK_DIM_Y = 8;\r
75                 const int PATCH_PER_BLOCK = 2;\r
76                 const int HALO_SIZE = 2;\r
77             #endif\r
78 \r
79             typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t;\r
80 \r
81             __shared__ sum_t smem[(PATCH_PER_BLOCK + 2 * HALO_SIZE) * BLOCK_DIM_Y][BLOCK_DIM_X];\r
82 \r
83             const int x = blockIdx.x * BLOCK_DIM_X + threadIdx.x;\r
84 \r
85             if (x >= src.cols)\r
86                 return;\r
87 \r
88             const T* src_col = src.ptr() + x;\r
89 \r
90             const int yStart = blockIdx.y * (BLOCK_DIM_Y * PATCH_PER_BLOCK) + threadIdx.y;\r
91 \r
92             //Upper halo\r
93             #pragma unroll\r
94             for (int j = 0; j < HALO_SIZE; ++j)\r
95                 smem[threadIdx.y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(brd.at_low(yStart - (HALO_SIZE - j) * BLOCK_DIM_Y, src_col, src.step));\r
96 \r
97             //Main data\r
98             #pragma unroll\r
99             for (int j = 0; j < PATCH_PER_BLOCK; ++j)\r
100                 smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(brd.at_high(yStart + j * BLOCK_DIM_Y, src_col, src.step));\r
101 \r
102             //Lower halo\r
103             #pragma unroll\r
104             for (int j = 0; j < HALO_SIZE; ++j)\r
105                 smem[threadIdx.y + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(brd.at_high(yStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_Y, src_col, src.step));\r
106 \r
107             __syncthreads();\r
108 \r
109             #pragma unroll\r
110             for (int j = 0; j < PATCH_PER_BLOCK; ++j)\r
111             {\r
112                 const int y = yStart + j * BLOCK_DIM_Y;\r
113 \r
114                 if (y < src.rows)\r
115                 {\r
116                     sum_t sum = VecTraits<sum_t>::all(0);\r
117 \r
118                     #pragma unroll\r
119                     for (int k = 0; k < KSIZE; ++k)\r
120                         sum = sum + smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y - anchor + k][threadIdx.x] * c_kernel[k];\r
121 \r
122                     dst(y, x) = saturate_cast<D>(sum);\r
123                 }\r
124             }\r
125         }\r
126 \r
127         template <int KSIZE, typename T, typename D, template<typename> class B>\r
128         void linearColumnFilter_caller(DevMem2D_<T> src, DevMem2D_<D> dst, int anchor, int cc, cudaStream_t stream)\r
129         {\r
130             int BLOCK_DIM_X;\r
131             int BLOCK_DIM_Y;\r
132             int PATCH_PER_BLOCK;\r
133 \r
134             if (cc >= 20)\r
135             {\r
136                 BLOCK_DIM_X = 16;\r
137                 BLOCK_DIM_Y = 16;\r
138                 PATCH_PER_BLOCK = 4;\r
139             }\r
140             else\r
141             {\r
142                 BLOCK_DIM_X = 16;\r
143                 BLOCK_DIM_Y = 8;\r
144                 PATCH_PER_BLOCK = 2;\r
145             }\r
146 \r
147             const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y);\r
148             const dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y * PATCH_PER_BLOCK));\r
149             \r
150             B<T> brd(src.rows);\r
151 \r
152             linearColumnFilter<KSIZE, T, D><<<grid, block, 0, stream>>>(src, dst, anchor, brd);\r
153 \r
154             cudaSafeCall( cudaGetLastError() );\r
155 \r
156             if (stream == 0)\r
157                 cudaSafeCall( cudaDeviceSynchronize() );\r
158         }\r
159 \r
160         template <typename T, typename D>\r
161         void linearColumnFilter_gpu(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream)\r
162         {\r
163             typedef void (*caller_t)(DevMem2D_<T> src, DevMem2D_<D> dst, int anchor, int cc, cudaStream_t stream);\r
164 \r
165             static const caller_t callers[5][33] = \r
166             {\r
167                 {\r
168                     0,\r
169                     linearColumnFilter_caller< 1, T, D, BrdColReflect101>,\r
170                     linearColumnFilter_caller< 2, T, D, BrdColReflect101>,\r
171                     linearColumnFilter_caller< 3, T, D, BrdColReflect101>,\r
172                     linearColumnFilter_caller< 4, T, D, BrdColReflect101>,\r
173                     linearColumnFilter_caller< 5, T, D, BrdColReflect101>,\r
174                     linearColumnFilter_caller< 6, T, D, BrdColReflect101>,\r
175                     linearColumnFilter_caller< 7, T, D, BrdColReflect101>,\r
176                     linearColumnFilter_caller< 8, T, D, BrdColReflect101>,\r
177                     linearColumnFilter_caller< 9, T, D, BrdColReflect101>,\r
178                     linearColumnFilter_caller<10, T, D, BrdColReflect101>,\r
179                     linearColumnFilter_caller<11, T, D, BrdColReflect101>,\r
180                     linearColumnFilter_caller<12, T, D, BrdColReflect101>,\r
181                     linearColumnFilter_caller<13, T, D, BrdColReflect101>,\r
182                     linearColumnFilter_caller<14, T, D, BrdColReflect101>,\r
183                     linearColumnFilter_caller<15, T, D, BrdColReflect101>,\r
184                     linearColumnFilter_caller<16, T, D, BrdColReflect101>,\r
185                     linearColumnFilter_caller<17, T, D, BrdColReflect101>,\r
186                     linearColumnFilter_caller<18, T, D, BrdColReflect101>,\r
187                     linearColumnFilter_caller<19, T, D, BrdColReflect101>,\r
188                     linearColumnFilter_caller<20, T, D, BrdColReflect101>,\r
189                     linearColumnFilter_caller<21, T, D, BrdColReflect101>,\r
190                     linearColumnFilter_caller<22, T, D, BrdColReflect101>,\r
191                     linearColumnFilter_caller<23, T, D, BrdColReflect101>,\r
192                     linearColumnFilter_caller<24, T, D, BrdColReflect101>,\r
193                     linearColumnFilter_caller<25, T, D, BrdColReflect101>,\r
194                     linearColumnFilter_caller<26, T, D, BrdColReflect101>,\r
195                     linearColumnFilter_caller<27, T, D, BrdColReflect101>,\r
196                     linearColumnFilter_caller<28, T, D, BrdColReflect101>,\r
197                     linearColumnFilter_caller<29, T, D, BrdColReflect101>,\r
198                     linearColumnFilter_caller<30, T, D, BrdColReflect101>,\r
199                     linearColumnFilter_caller<31, T, D, BrdColReflect101>,\r
200                     linearColumnFilter_caller<32, T, D, BrdColReflect101>\r
201                 },\r
202                 {\r
203                     0,\r
204                     linearColumnFilter_caller< 1, T, D, BrdColReplicate>,\r
205                     linearColumnFilter_caller< 2, T, D, BrdColReplicate>,\r
206                     linearColumnFilter_caller< 3, T, D, BrdColReplicate>,\r
207                     linearColumnFilter_caller< 4, T, D, BrdColReplicate>,\r
208                     linearColumnFilter_caller< 5, T, D, BrdColReplicate>,\r
209                     linearColumnFilter_caller< 6, T, D, BrdColReplicate>,\r
210                     linearColumnFilter_caller< 7, T, D, BrdColReplicate>,\r
211                     linearColumnFilter_caller< 8, T, D, BrdColReplicate>,\r
212                     linearColumnFilter_caller< 9, T, D, BrdColReplicate>,\r
213                     linearColumnFilter_caller<10, T, D, BrdColReplicate>,\r
214                     linearColumnFilter_caller<11, T, D, BrdColReplicate>,\r
215                     linearColumnFilter_caller<12, T, D, BrdColReplicate>,\r
216                     linearColumnFilter_caller<13, T, D, BrdColReplicate>,\r
217                     linearColumnFilter_caller<14, T, D, BrdColReplicate>,\r
218                     linearColumnFilter_caller<15, T, D, BrdColReplicate>,\r
219                     linearColumnFilter_caller<16, T, D, BrdColReplicate>,\r
220                     linearColumnFilter_caller<17, T, D, BrdColReplicate>,\r
221                     linearColumnFilter_caller<18, T, D, BrdColReplicate>,\r
222                     linearColumnFilter_caller<19, T, D, BrdColReplicate>,\r
223                     linearColumnFilter_caller<20, T, D, BrdColReplicate>,\r
224                     linearColumnFilter_caller<21, T, D, BrdColReplicate>,\r
225                     linearColumnFilter_caller<22, T, D, BrdColReplicate>,\r
226                     linearColumnFilter_caller<23, T, D, BrdColReplicate>,\r
227                     linearColumnFilter_caller<24, T, D, BrdColReplicate>,\r
228                     linearColumnFilter_caller<25, T, D, BrdColReplicate>,\r
229                     linearColumnFilter_caller<26, T, D, BrdColReplicate>,\r
230                     linearColumnFilter_caller<27, T, D, BrdColReplicate>,\r
231                     linearColumnFilter_caller<28, T, D, BrdColReplicate>,\r
232                     linearColumnFilter_caller<29, T, D, BrdColReplicate>,\r
233                     linearColumnFilter_caller<30, T, D, BrdColReplicate>,\r
234                     linearColumnFilter_caller<31, T, D, BrdColReplicate>,\r
235                     linearColumnFilter_caller<32, T, D, BrdColReplicate>\r
236                 },\r
237                 {\r
238                     0,\r
239                     linearColumnFilter_caller< 1, T, D, BrdColConstant>,\r
240                     linearColumnFilter_caller< 2, T, D, BrdColConstant>,\r
241                     linearColumnFilter_caller< 3, T, D, BrdColConstant>,\r
242                     linearColumnFilter_caller< 4, T, D, BrdColConstant>,\r
243                     linearColumnFilter_caller< 5, T, D, BrdColConstant>,\r
244                     linearColumnFilter_caller< 6, T, D, BrdColConstant>,\r
245                     linearColumnFilter_caller< 7, T, D, BrdColConstant>,\r
246                     linearColumnFilter_caller< 8, T, D, BrdColConstant>,\r
247                     linearColumnFilter_caller< 9, T, D, BrdColConstant>,\r
248                     linearColumnFilter_caller<10, T, D, BrdColConstant>,\r
249                     linearColumnFilter_caller<11, T, D, BrdColConstant>,\r
250                     linearColumnFilter_caller<12, T, D, BrdColConstant>,\r
251                     linearColumnFilter_caller<13, T, D, BrdColConstant>,\r
252                     linearColumnFilter_caller<14, T, D, BrdColConstant>,\r
253                     linearColumnFilter_caller<15, T, D, BrdColConstant>,\r
254                     linearColumnFilter_caller<16, T, D, BrdColConstant>,\r
255                     linearColumnFilter_caller<17, T, D, BrdColConstant>,\r
256                     linearColumnFilter_caller<18, T, D, BrdColConstant>,\r
257                     linearColumnFilter_caller<19, T, D, BrdColConstant>,\r
258                     linearColumnFilter_caller<20, T, D, BrdColConstant>,\r
259                     linearColumnFilter_caller<21, T, D, BrdColConstant>,\r
260                     linearColumnFilter_caller<22, T, D, BrdColConstant>,\r
261                     linearColumnFilter_caller<23, T, D, BrdColConstant>,\r
262                     linearColumnFilter_caller<24, T, D, BrdColConstant>,\r
263                     linearColumnFilter_caller<25, T, D, BrdColConstant>,\r
264                     linearColumnFilter_caller<26, T, D, BrdColConstant>,\r
265                     linearColumnFilter_caller<27, T, D, BrdColConstant>,\r
266                     linearColumnFilter_caller<28, T, D, BrdColConstant>,\r
267                     linearColumnFilter_caller<29, T, D, BrdColConstant>,\r
268                     linearColumnFilter_caller<30, T, D, BrdColConstant>,\r
269                     linearColumnFilter_caller<31, T, D, BrdColConstant>,\r
270                     linearColumnFilter_caller<32, T, D, BrdColConstant>\r
271                 },\r
272                 {\r
273                     0,\r
274                     linearColumnFilter_caller< 1, T, D, BrdColReflect>,\r
275                     linearColumnFilter_caller< 2, T, D, BrdColReflect>,\r
276                     linearColumnFilter_caller< 3, T, D, BrdColReflect>,\r
277                     linearColumnFilter_caller< 4, T, D, BrdColReflect>,\r
278                     linearColumnFilter_caller< 5, T, D, BrdColReflect>,\r
279                     linearColumnFilter_caller< 6, T, D, BrdColReflect>,\r
280                     linearColumnFilter_caller< 7, T, D, BrdColReflect>,\r
281                     linearColumnFilter_caller< 8, T, D, BrdColReflect>,\r
282                     linearColumnFilter_caller< 9, T, D, BrdColReflect>,\r
283                     linearColumnFilter_caller<10, T, D, BrdColReflect>,\r
284                     linearColumnFilter_caller<11, T, D, BrdColReflect>,\r
285                     linearColumnFilter_caller<12, T, D, BrdColReflect>,\r
286                     linearColumnFilter_caller<13, T, D, BrdColReflect>,\r
287                     linearColumnFilter_caller<14, T, D, BrdColReflect>,\r
288                     linearColumnFilter_caller<15, T, D, BrdColReflect>,\r
289                     linearColumnFilter_caller<16, T, D, BrdColReflect>,\r
290                     linearColumnFilter_caller<17, T, D, BrdColReflect>,\r
291                     linearColumnFilter_caller<18, T, D, BrdColReflect>,\r
292                     linearColumnFilter_caller<19, T, D, BrdColReflect>,\r
293                     linearColumnFilter_caller<20, T, D, BrdColReflect>,\r
294                     linearColumnFilter_caller<21, T, D, BrdColReflect>,\r
295                     linearColumnFilter_caller<22, T, D, BrdColReflect>,\r
296                     linearColumnFilter_caller<23, T, D, BrdColReflect>,\r
297                     linearColumnFilter_caller<24, T, D, BrdColReflect>,\r
298                     linearColumnFilter_caller<25, T, D, BrdColReflect>,\r
299                     linearColumnFilter_caller<26, T, D, BrdColReflect>,\r
300                     linearColumnFilter_caller<27, T, D, BrdColReflect>,\r
301                     linearColumnFilter_caller<28, T, D, BrdColReflect>,\r
302                     linearColumnFilter_caller<29, T, D, BrdColReflect>,\r
303                     linearColumnFilter_caller<30, T, D, BrdColReflect>,\r
304                     linearColumnFilter_caller<31, T, D, BrdColReflect>,\r
305                     linearColumnFilter_caller<32, T, D, BrdColReflect>\r
306                 },\r
307                 {\r
308                     0,\r
309                     linearColumnFilter_caller< 1, T, D, BrdColWrap>,\r
310                     linearColumnFilter_caller< 2, T, D, BrdColWrap>,\r
311                     linearColumnFilter_caller< 3, T, D, BrdColWrap>,\r
312                     linearColumnFilter_caller< 4, T, D, BrdColWrap>,\r
313                     linearColumnFilter_caller< 5, T, D, BrdColWrap>,\r
314                     linearColumnFilter_caller< 6, T, D, BrdColWrap>,\r
315                     linearColumnFilter_caller< 7, T, D, BrdColWrap>,\r
316                     linearColumnFilter_caller< 8, T, D, BrdColWrap>,\r
317                     linearColumnFilter_caller< 9, T, D, BrdColWrap>,\r
318                     linearColumnFilter_caller<10, T, D, BrdColWrap>,\r
319                     linearColumnFilter_caller<11, T, D, BrdColWrap>,\r
320                     linearColumnFilter_caller<12, T, D, BrdColWrap>,\r
321                     linearColumnFilter_caller<13, T, D, BrdColWrap>,\r
322                     linearColumnFilter_caller<14, T, D, BrdColWrap>,\r
323                     linearColumnFilter_caller<15, T, D, BrdColWrap>,\r
324                     linearColumnFilter_caller<16, T, D, BrdColWrap>,\r
325                     linearColumnFilter_caller<17, T, D, BrdColWrap>,\r
326                     linearColumnFilter_caller<18, T, D, BrdColWrap>,\r
327                     linearColumnFilter_caller<19, T, D, BrdColWrap>,\r
328                     linearColumnFilter_caller<20, T, D, BrdColWrap>,\r
329                     linearColumnFilter_caller<21, T, D, BrdColWrap>,\r
330                     linearColumnFilter_caller<22, T, D, BrdColWrap>,\r
331                     linearColumnFilter_caller<23, T, D, BrdColWrap>,\r
332                     linearColumnFilter_caller<24, T, D, BrdColWrap>,\r
333                     linearColumnFilter_caller<25, T, D, BrdColWrap>,\r
334                     linearColumnFilter_caller<26, T, D, BrdColWrap>,\r
335                     linearColumnFilter_caller<27, T, D, BrdColWrap>,\r
336                     linearColumnFilter_caller<28, T, D, BrdColWrap>,\r
337                     linearColumnFilter_caller<29, T, D, BrdColWrap>,\r
338                     linearColumnFilter_caller<30, T, D, BrdColWrap>,\r
339                     linearColumnFilter_caller<31, T, D, BrdColWrap>,\r
340                     linearColumnFilter_caller<32, T, D, BrdColWrap>\r
341                 }               \r
342             };\r
343             \r
344             loadKernel(kernel, ksize);\r
345 \r
346             callers[brd_type][ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor, cc, stream);\r
347         }\r
348 \r
349         template void linearColumnFilter_gpu<float , uchar >(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);\r
350         template void linearColumnFilter_gpu<float4, uchar4>(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);\r
351         template void linearColumnFilter_gpu<float3, short3>(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);\r
352         template void linearColumnFilter_gpu<float , int   >(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);\r
353         template void linearColumnFilter_gpu<float , float >(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);\r
354     } // namespace column_filter\r
355 }}} // namespace cv { namespace gpu { namespace device\r