gpu separable filters for CV_8UC3, CV_32FC3 and CV_32FC4 types
[profile/ivi/opencv.git] / modules / gpu / src / cuda / column_filter.cu
1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 //  By downloading, copying, installing or using the software you agree to this license.
6 //  If you do not agree to this license, do not download, install,
7 //  copy or use the software.
8 //
9 //
10 //                           License Agreement
11 //                For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
15 // Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved.
16 // Third party copyrights are property of their respective owners.
17 //
18 // Redistribution and use in source and binary forms, with or without modification,
19 // are permitted provided that the following conditions are met:
20 //
21 //   * Redistribution's of source code must retain the above copyright notice,
22 //     this list of conditions and the following disclaimer.
23 //
24 //   * Redistribution's in binary form must reproduce the above copyright notice,
25 //     this list of conditions and the following disclaimer in the documentation
26 //     and/or other materials provided with the distribution.
27 //
28 //   * The name of the copyright holders may not be used to endorse or promote products
29 //     derived from this software without specific prior written permission.
30 //
31 // This software is provided by the copyright holders and contributors "as is" and
32 // any express or implied warranties, including, but not limited to, the implied
33 // warranties of merchantability and fitness for a particular purpose are disclaimed.
34 // In no event shall the Intel Corporation or contributors be liable for any direct,
35 // indirect, incidental, special, exemplary, or consequential damages
36 // (including, but not limited to, procurement of substitute goods or services;
37 // loss of use, data, or profits; or business interruption) however caused
38 // and on any theory of liability, whether in contract, strict liability,
39 // or tort (including negligence or otherwise) arising in any way out of
40 // the use of this software, even if advised of the possibility of such damage.
41 //
42 //M*/
43
44 #if !defined CUDA_DISABLER
45
46 #include "internal_shared.hpp"
47 #include "opencv2/gpu/device/saturate_cast.hpp"
48 #include "opencv2/gpu/device/vec_math.hpp"
49 #include "opencv2/gpu/device/limits.hpp"
50 #include "opencv2/gpu/device/border_interpolate.hpp"
51 #include "opencv2/gpu/device/static_check.hpp"
52
53 namespace cv { namespace gpu { namespace device
54 {
55     namespace column_filter
56     {
57         #define MAX_KERNEL_SIZE 32
58
59         __constant__ float c_kernel[MAX_KERNEL_SIZE];
60
61         void loadKernel(const float* kernel, int ksize, cudaStream_t stream)
62         {
63             if (stream == 0)
64                 cudaSafeCall( cudaMemcpyToSymbol(c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice) );
65             else
66                 cudaSafeCall( cudaMemcpyToSymbolAsync(c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice, stream) );
67         }
68
69         template <int KSIZE, typename T, typename D, typename B>
70         __global__ void linearColumnFilter(const PtrStepSz<T> src, PtrStep<D> dst, const int anchor, const B brd)
71         {
72             #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200)
73                 const int BLOCK_DIM_X = 16;
74                 const int BLOCK_DIM_Y = 16;
75                 const int PATCH_PER_BLOCK = 4;
76                 const int HALO_SIZE = KSIZE <= 16 ? 1 : 2;
77             #else
78                 const int BLOCK_DIM_X = 16;
79                 const int BLOCK_DIM_Y = 8;
80                 const int PATCH_PER_BLOCK = 2;
81                 const int HALO_SIZE = 2;
82             #endif
83
84             typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t;
85
86             __shared__ sum_t smem[(PATCH_PER_BLOCK + 2 * HALO_SIZE) * BLOCK_DIM_Y][BLOCK_DIM_X];
87
88             const int x = blockIdx.x * BLOCK_DIM_X + threadIdx.x;
89
90             if (x >= src.cols)
91                 return;
92
93             const T* src_col = src.ptr() + x;
94
95             const int yStart = blockIdx.y * (BLOCK_DIM_Y * PATCH_PER_BLOCK) + threadIdx.y;
96
97             if (blockIdx.y > 0)
98             {
99                 //Upper halo
100                 #pragma unroll
101                 for (int j = 0; j < HALO_SIZE; ++j)
102                     smem[threadIdx.y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(src(yStart - (HALO_SIZE - j) * BLOCK_DIM_Y, x));
103             }
104             else
105             {
106                 //Upper halo
107                 #pragma unroll
108                 for (int j = 0; j < HALO_SIZE; ++j)
109                     smem[threadIdx.y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(brd.at_low(yStart - (HALO_SIZE - j) * BLOCK_DIM_Y, src_col, src.step));
110             }
111
112             if (blockIdx.y + 2 < gridDim.y)
113             {
114                 //Main data
115                 #pragma unroll
116                 for (int j = 0; j < PATCH_PER_BLOCK; ++j)
117                     smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(src(yStart + j * BLOCK_DIM_Y, x));
118
119                 //Lower halo
120                 #pragma unroll
121                 for (int j = 0; j < HALO_SIZE; ++j)
122                     smem[threadIdx.y + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(src(yStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_Y, x));
123             }
124             else
125             {
126                 //Main data
127                 #pragma unroll
128                 for (int j = 0; j < PATCH_PER_BLOCK; ++j)
129                     smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(brd.at_high(yStart + j * BLOCK_DIM_Y, src_col, src.step));
130
131                 //Lower halo
132                 #pragma unroll
133                 for (int j = 0; j < HALO_SIZE; ++j)
134                     smem[threadIdx.y + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(brd.at_high(yStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_Y, src_col, src.step));
135             }
136
137             __syncthreads();
138
139             #pragma unroll
140             for (int j = 0; j < PATCH_PER_BLOCK; ++j)
141             {
142                 const int y = yStart + j * BLOCK_DIM_Y;
143
144                 if (y < src.rows)
145                 {
146                     sum_t sum = VecTraits<sum_t>::all(0);
147
148                     #pragma unroll
149                     for (int k = 0; k < KSIZE; ++k)
150                         sum = sum + smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y - anchor + k][threadIdx.x] * c_kernel[k];
151
152                     dst(y, x) = saturate_cast<D>(sum);
153                 }
154             }
155         }
156
157         template <int KSIZE, typename T, typename D, template<typename> class B>
158         void linearColumnFilter_caller(PtrStepSz<T> src, PtrStepSz<D> dst, int anchor, int cc, cudaStream_t stream)
159         {
160             int BLOCK_DIM_X;
161             int BLOCK_DIM_Y;
162             int PATCH_PER_BLOCK;
163
164             if (cc >= 20)
165             {
166                 BLOCK_DIM_X = 16;
167                 BLOCK_DIM_Y = 16;
168                 PATCH_PER_BLOCK = 4;
169             }
170             else
171             {
172                 BLOCK_DIM_X = 16;
173                 BLOCK_DIM_Y = 8;
174                 PATCH_PER_BLOCK = 2;
175             }
176
177             const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y);
178             const dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y * PATCH_PER_BLOCK));
179
180             B<T> brd(src.rows);
181
182             linearColumnFilter<KSIZE, T, D><<<grid, block, 0, stream>>>(src, dst, anchor, brd);
183
184             cudaSafeCall( cudaGetLastError() );
185
186             if (stream == 0)
187                 cudaSafeCall( cudaDeviceSynchronize() );
188         }
189
190         template <typename T, typename D>
191         void linearColumnFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream)
192         {
193             typedef void (*caller_t)(PtrStepSz<T> src, PtrStepSz<D> dst, int anchor, int cc, cudaStream_t stream);
194
195             static const caller_t callers[5][33] =
196             {
197                 {
198                     0,
199                     linearColumnFilter_caller< 1, T, D, BrdColReflect101>,
200                     linearColumnFilter_caller< 2, T, D, BrdColReflect101>,
201                     linearColumnFilter_caller< 3, T, D, BrdColReflect101>,
202                     linearColumnFilter_caller< 4, T, D, BrdColReflect101>,
203                     linearColumnFilter_caller< 5, T, D, BrdColReflect101>,
204                     linearColumnFilter_caller< 6, T, D, BrdColReflect101>,
205                     linearColumnFilter_caller< 7, T, D, BrdColReflect101>,
206                     linearColumnFilter_caller< 8, T, D, BrdColReflect101>,
207                     linearColumnFilter_caller< 9, T, D, BrdColReflect101>,
208                     linearColumnFilter_caller<10, T, D, BrdColReflect101>,
209                     linearColumnFilter_caller<11, T, D, BrdColReflect101>,
210                     linearColumnFilter_caller<12, T, D, BrdColReflect101>,
211                     linearColumnFilter_caller<13, T, D, BrdColReflect101>,
212                     linearColumnFilter_caller<14, T, D, BrdColReflect101>,
213                     linearColumnFilter_caller<15, T, D, BrdColReflect101>,
214                     linearColumnFilter_caller<16, T, D, BrdColReflect101>,
215                     linearColumnFilter_caller<17, T, D, BrdColReflect101>,
216                     linearColumnFilter_caller<18, T, D, BrdColReflect101>,
217                     linearColumnFilter_caller<19, T, D, BrdColReflect101>,
218                     linearColumnFilter_caller<20, T, D, BrdColReflect101>,
219                     linearColumnFilter_caller<21, T, D, BrdColReflect101>,
220                     linearColumnFilter_caller<22, T, D, BrdColReflect101>,
221                     linearColumnFilter_caller<23, T, D, BrdColReflect101>,
222                     linearColumnFilter_caller<24, T, D, BrdColReflect101>,
223                     linearColumnFilter_caller<25, T, D, BrdColReflect101>,
224                     linearColumnFilter_caller<26, T, D, BrdColReflect101>,
225                     linearColumnFilter_caller<27, T, D, BrdColReflect101>,
226                     linearColumnFilter_caller<28, T, D, BrdColReflect101>,
227                     linearColumnFilter_caller<29, T, D, BrdColReflect101>,
228                     linearColumnFilter_caller<30, T, D, BrdColReflect101>,
229                     linearColumnFilter_caller<31, T, D, BrdColReflect101>,
230                     linearColumnFilter_caller<32, T, D, BrdColReflect101>
231                 },
232                 {
233                     0,
234                     linearColumnFilter_caller< 1, T, D, BrdColReplicate>,
235                     linearColumnFilter_caller< 2, T, D, BrdColReplicate>,
236                     linearColumnFilter_caller< 3, T, D, BrdColReplicate>,
237                     linearColumnFilter_caller< 4, T, D, BrdColReplicate>,
238                     linearColumnFilter_caller< 5, T, D, BrdColReplicate>,
239                     linearColumnFilter_caller< 6, T, D, BrdColReplicate>,
240                     linearColumnFilter_caller< 7, T, D, BrdColReplicate>,
241                     linearColumnFilter_caller< 8, T, D, BrdColReplicate>,
242                     linearColumnFilter_caller< 9, T, D, BrdColReplicate>,
243                     linearColumnFilter_caller<10, T, D, BrdColReplicate>,
244                     linearColumnFilter_caller<11, T, D, BrdColReplicate>,
245                     linearColumnFilter_caller<12, T, D, BrdColReplicate>,
246                     linearColumnFilter_caller<13, T, D, BrdColReplicate>,
247                     linearColumnFilter_caller<14, T, D, BrdColReplicate>,
248                     linearColumnFilter_caller<15, T, D, BrdColReplicate>,
249                     linearColumnFilter_caller<16, T, D, BrdColReplicate>,
250                     linearColumnFilter_caller<17, T, D, BrdColReplicate>,
251                     linearColumnFilter_caller<18, T, D, BrdColReplicate>,
252                     linearColumnFilter_caller<19, T, D, BrdColReplicate>,
253                     linearColumnFilter_caller<20, T, D, BrdColReplicate>,
254                     linearColumnFilter_caller<21, T, D, BrdColReplicate>,
255                     linearColumnFilter_caller<22, T, D, BrdColReplicate>,
256                     linearColumnFilter_caller<23, T, D, BrdColReplicate>,
257                     linearColumnFilter_caller<24, T, D, BrdColReplicate>,
258                     linearColumnFilter_caller<25, T, D, BrdColReplicate>,
259                     linearColumnFilter_caller<26, T, D, BrdColReplicate>,
260                     linearColumnFilter_caller<27, T, D, BrdColReplicate>,
261                     linearColumnFilter_caller<28, T, D, BrdColReplicate>,
262                     linearColumnFilter_caller<29, T, D, BrdColReplicate>,
263                     linearColumnFilter_caller<30, T, D, BrdColReplicate>,
264                     linearColumnFilter_caller<31, T, D, BrdColReplicate>,
265                     linearColumnFilter_caller<32, T, D, BrdColReplicate>
266                 },
267                 {
268                     0,
269                     linearColumnFilter_caller< 1, T, D, BrdColConstant>,
270                     linearColumnFilter_caller< 2, T, D, BrdColConstant>,
271                     linearColumnFilter_caller< 3, T, D, BrdColConstant>,
272                     linearColumnFilter_caller< 4, T, D, BrdColConstant>,
273                     linearColumnFilter_caller< 5, T, D, BrdColConstant>,
274                     linearColumnFilter_caller< 6, T, D, BrdColConstant>,
275                     linearColumnFilter_caller< 7, T, D, BrdColConstant>,
276                     linearColumnFilter_caller< 8, T, D, BrdColConstant>,
277                     linearColumnFilter_caller< 9, T, D, BrdColConstant>,
278                     linearColumnFilter_caller<10, T, D, BrdColConstant>,
279                     linearColumnFilter_caller<11, T, D, BrdColConstant>,
280                     linearColumnFilter_caller<12, T, D, BrdColConstant>,
281                     linearColumnFilter_caller<13, T, D, BrdColConstant>,
282                     linearColumnFilter_caller<14, T, D, BrdColConstant>,
283                     linearColumnFilter_caller<15, T, D, BrdColConstant>,
284                     linearColumnFilter_caller<16, T, D, BrdColConstant>,
285                     linearColumnFilter_caller<17, T, D, BrdColConstant>,
286                     linearColumnFilter_caller<18, T, D, BrdColConstant>,
287                     linearColumnFilter_caller<19, T, D, BrdColConstant>,
288                     linearColumnFilter_caller<20, T, D, BrdColConstant>,
289                     linearColumnFilter_caller<21, T, D, BrdColConstant>,
290                     linearColumnFilter_caller<22, T, D, BrdColConstant>,
291                     linearColumnFilter_caller<23, T, D, BrdColConstant>,
292                     linearColumnFilter_caller<24, T, D, BrdColConstant>,
293                     linearColumnFilter_caller<25, T, D, BrdColConstant>,
294                     linearColumnFilter_caller<26, T, D, BrdColConstant>,
295                     linearColumnFilter_caller<27, T, D, BrdColConstant>,
296                     linearColumnFilter_caller<28, T, D, BrdColConstant>,
297                     linearColumnFilter_caller<29, T, D, BrdColConstant>,
298                     linearColumnFilter_caller<30, T, D, BrdColConstant>,
299                     linearColumnFilter_caller<31, T, D, BrdColConstant>,
300                     linearColumnFilter_caller<32, T, D, BrdColConstant>
301                 },
302                 {
303                     0,
304                     linearColumnFilter_caller< 1, T, D, BrdColReflect>,
305                     linearColumnFilter_caller< 2, T, D, BrdColReflect>,
306                     linearColumnFilter_caller< 3, T, D, BrdColReflect>,
307                     linearColumnFilter_caller< 4, T, D, BrdColReflect>,
308                     linearColumnFilter_caller< 5, T, D, BrdColReflect>,
309                     linearColumnFilter_caller< 6, T, D, BrdColReflect>,
310                     linearColumnFilter_caller< 7, T, D, BrdColReflect>,
311                     linearColumnFilter_caller< 8, T, D, BrdColReflect>,
312                     linearColumnFilter_caller< 9, T, D, BrdColReflect>,
313                     linearColumnFilter_caller<10, T, D, BrdColReflect>,
314                     linearColumnFilter_caller<11, T, D, BrdColReflect>,
315                     linearColumnFilter_caller<12, T, D, BrdColReflect>,
316                     linearColumnFilter_caller<13, T, D, BrdColReflect>,
317                     linearColumnFilter_caller<14, T, D, BrdColReflect>,
318                     linearColumnFilter_caller<15, T, D, BrdColReflect>,
319                     linearColumnFilter_caller<16, T, D, BrdColReflect>,
320                     linearColumnFilter_caller<17, T, D, BrdColReflect>,
321                     linearColumnFilter_caller<18, T, D, BrdColReflect>,
322                     linearColumnFilter_caller<19, T, D, BrdColReflect>,
323                     linearColumnFilter_caller<20, T, D, BrdColReflect>,
324                     linearColumnFilter_caller<21, T, D, BrdColReflect>,
325                     linearColumnFilter_caller<22, T, D, BrdColReflect>,
326                     linearColumnFilter_caller<23, T, D, BrdColReflect>,
327                     linearColumnFilter_caller<24, T, D, BrdColReflect>,
328                     linearColumnFilter_caller<25, T, D, BrdColReflect>,
329                     linearColumnFilter_caller<26, T, D, BrdColReflect>,
330                     linearColumnFilter_caller<27, T, D, BrdColReflect>,
331                     linearColumnFilter_caller<28, T, D, BrdColReflect>,
332                     linearColumnFilter_caller<29, T, D, BrdColReflect>,
333                     linearColumnFilter_caller<30, T, D, BrdColReflect>,
334                     linearColumnFilter_caller<31, T, D, BrdColReflect>,
335                     linearColumnFilter_caller<32, T, D, BrdColReflect>
336                 },
337                 {
338                     0,
339                     linearColumnFilter_caller< 1, T, D, BrdColWrap>,
340                     linearColumnFilter_caller< 2, T, D, BrdColWrap>,
341                     linearColumnFilter_caller< 3, T, D, BrdColWrap>,
342                     linearColumnFilter_caller< 4, T, D, BrdColWrap>,
343                     linearColumnFilter_caller< 5, T, D, BrdColWrap>,
344                     linearColumnFilter_caller< 6, T, D, BrdColWrap>,
345                     linearColumnFilter_caller< 7, T, D, BrdColWrap>,
346                     linearColumnFilter_caller< 8, T, D, BrdColWrap>,
347                     linearColumnFilter_caller< 9, T, D, BrdColWrap>,
348                     linearColumnFilter_caller<10, T, D, BrdColWrap>,
349                     linearColumnFilter_caller<11, T, D, BrdColWrap>,
350                     linearColumnFilter_caller<12, T, D, BrdColWrap>,
351                     linearColumnFilter_caller<13, T, D, BrdColWrap>,
352                     linearColumnFilter_caller<14, T, D, BrdColWrap>,
353                     linearColumnFilter_caller<15, T, D, BrdColWrap>,
354                     linearColumnFilter_caller<16, T, D, BrdColWrap>,
355                     linearColumnFilter_caller<17, T, D, BrdColWrap>,
356                     linearColumnFilter_caller<18, T, D, BrdColWrap>,
357                     linearColumnFilter_caller<19, T, D, BrdColWrap>,
358                     linearColumnFilter_caller<20, T, D, BrdColWrap>,
359                     linearColumnFilter_caller<21, T, D, BrdColWrap>,
360                     linearColumnFilter_caller<22, T, D, BrdColWrap>,
361                     linearColumnFilter_caller<23, T, D, BrdColWrap>,
362                     linearColumnFilter_caller<24, T, D, BrdColWrap>,
363                     linearColumnFilter_caller<25, T, D, BrdColWrap>,
364                     linearColumnFilter_caller<26, T, D, BrdColWrap>,
365                     linearColumnFilter_caller<27, T, D, BrdColWrap>,
366                     linearColumnFilter_caller<28, T, D, BrdColWrap>,
367                     linearColumnFilter_caller<29, T, D, BrdColWrap>,
368                     linearColumnFilter_caller<30, T, D, BrdColWrap>,
369                     linearColumnFilter_caller<31, T, D, BrdColWrap>,
370                     linearColumnFilter_caller<32, T, D, BrdColWrap>
371                 }
372             };
373
374             loadKernel(kernel, ksize, stream);
375
376             callers[brd_type][ksize]((PtrStepSz<T>)src, (PtrStepSz<D>)dst, anchor, cc, stream);
377         }
378
379         template void linearColumnFilter_gpu<float , uchar >(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
380         template void linearColumnFilter_gpu<float3, uchar3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
381         template void linearColumnFilter_gpu<float4, uchar4>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
382         template void linearColumnFilter_gpu<float3, short3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
383         template void linearColumnFilter_gpu<float , int   >(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
384         template void linearColumnFilter_gpu<float , float >(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
385         template void linearColumnFilter_gpu<float3, float3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
386         template void linearColumnFilter_gpu<float4, float4>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
387     } // namespace column_filter
388 }}} // namespace cv { namespace gpu { namespace device
389
390
391 #endif /* CUDA_DISABLER */