gpu separable filters for CV_8UC3, CV_32FC3 and CV_32FC4 types
[profile/ivi/opencv.git] / modules / gpu / src / cuda / row_filter.cu
1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 //  By downloading, copying, installing or using the software you agree to this license.
6 //  If you do not agree to this license, do not download, install,
7 //  copy or use the software.
8 //
9 //
10 //                           License Agreement
11 //                For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
15 // Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved.
16 // Third party copyrights are property of their respective owners.
17 //
18 // Redistribution and use in source and binary forms, with or without modification,
19 // are permitted provided that the following conditions are met:
20 //
21 //   * Redistribution's of source code must retain the above copyright notice,
22 //     this list of conditions and the following disclaimer.
23 //
24 //   * Redistribution's in binary form must reproduce the above copyright notice,
25 //     this list of conditions and the following disclaimer in the documentation
26 //     and/or other materials provided with the distribution.
27 //
28 //   * The name of the copyright holders may not be used to endorse or promote products
29 //     derived from this software without specific prior written permission.
30 //
31 // This software is provided by the copyright holders and contributors "as is" and
32 // any express or implied warranties, including, but not limited to, the implied
33 // warranties of merchantability and fitness for a particular purpose are disclaimed.
34 // In no event shall the Intel Corporation or contributors be liable for any direct,
35 // indirect, incidental, special, exemplary, or consequential damages
36 // (including, but not limited to, procurement of substitute goods or services;
37 // loss of use, data, or profits; or business interruption) however caused
38 // and on any theory of liability, whether in contract, strict liability,
39 // or tort (including negligence or otherwise) arising in any way out of
40 // the use of this software, even if advised of the possibility of such damage.
41 //
42 //M*/
43
44 #if !defined CUDA_DISABLER
45
46 #include "internal_shared.hpp"
47 #include "opencv2/gpu/device/saturate_cast.hpp"
48 #include "opencv2/gpu/device/vec_math.hpp"
49 #include "opencv2/gpu/device/limits.hpp"
50 #include "opencv2/gpu/device/border_interpolate.hpp"
51 #include "opencv2/gpu/device/static_check.hpp"
52
53 namespace cv { namespace gpu { namespace device
54 {
55     namespace row_filter
56     {
57         #define MAX_KERNEL_SIZE 32
58
59         __constant__ float c_kernel[MAX_KERNEL_SIZE];
60
61         void loadKernel(const float* kernel, int ksize, cudaStream_t stream)
62         {
63             if (stream == 0)
64                 cudaSafeCall( cudaMemcpyToSymbol(c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice) );
65             else
66                 cudaSafeCall( cudaMemcpyToSymbolAsync(c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice, stream) );
67         }
68
69         template <int KSIZE, typename T, typename D, typename B>
70         __global__ void linearRowFilter(const PtrStepSz<T> src, PtrStep<D> dst, const int anchor, const B brd)
71         {
72             #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200)
73                 const int BLOCK_DIM_X = 32;
74                 const int BLOCK_DIM_Y = 8;
75                 const int PATCH_PER_BLOCK = 4;
76                 const int HALO_SIZE = 1;
77             #else
78                 const int BLOCK_DIM_X = 32;
79                 const int BLOCK_DIM_Y = 4;
80                 const int PATCH_PER_BLOCK = 4;
81                 const int HALO_SIZE = 1;
82             #endif
83
84             typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t;
85
86             __shared__ sum_t smem[BLOCK_DIM_Y][(PATCH_PER_BLOCK + 2 * HALO_SIZE) * BLOCK_DIM_X];
87
88             const int y = blockIdx.y * BLOCK_DIM_Y + threadIdx.y;
89
90             if (y >= src.rows)
91                 return;
92
93             const T* src_row = src.ptr(y);
94
95             const int xStart = blockIdx.x * (PATCH_PER_BLOCK * BLOCK_DIM_X) + threadIdx.x;
96
97             if (blockIdx.x > 0)
98             {
99                 //Load left halo
100                 #pragma unroll
101                 for (int j = 0; j < HALO_SIZE; ++j)
102                     smem[threadIdx.y][threadIdx.x + j * BLOCK_DIM_X] = saturate_cast<sum_t>(src_row[xStart - (HALO_SIZE - j) * BLOCK_DIM_X]);
103             }
104             else
105             {
106                 //Load left halo
107                 #pragma unroll
108                 for (int j = 0; j < HALO_SIZE; ++j)
109                     smem[threadIdx.y][threadIdx.x + j * BLOCK_DIM_X] = saturate_cast<sum_t>(brd.at_low(xStart - (HALO_SIZE - j) * BLOCK_DIM_X, src_row));
110             }
111
112             if (blockIdx.x + 2 < gridDim.x)
113             {
114                 //Load main data
115                 #pragma unroll
116                 for (int j = 0; j < PATCH_PER_BLOCK; ++j)
117                     smem[threadIdx.y][threadIdx.x + HALO_SIZE * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast<sum_t>(src_row[xStart + j * BLOCK_DIM_X]);
118
119                 //Load right halo
120                 #pragma unroll
121                 for (int j = 0; j < HALO_SIZE; ++j)
122                     smem[threadIdx.y][threadIdx.x + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast<sum_t>(src_row[xStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_X]);
123             }
124             else
125             {
126                 //Load main data
127                 #pragma unroll
128                 for (int j = 0; j < PATCH_PER_BLOCK; ++j)
129                     smem[threadIdx.y][threadIdx.x + HALO_SIZE * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast<sum_t>(brd.at_high(xStart + j * BLOCK_DIM_X, src_row));
130
131                 //Load right halo
132                 #pragma unroll
133                 for (int j = 0; j < HALO_SIZE; ++j)
134                     smem[threadIdx.y][threadIdx.x + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast<sum_t>(brd.at_high(xStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_X, src_row));
135             }
136
137             __syncthreads();
138
139             #pragma unroll
140             for (int j = 0; j < PATCH_PER_BLOCK; ++j)
141             {
142                 const int x = xStart + j * BLOCK_DIM_X;
143
144                 if (x < src.cols)
145                 {
146                     sum_t sum = VecTraits<sum_t>::all(0);
147
148                     #pragma unroll
149                     for (int k = 0; k < KSIZE; ++k)
150                         sum = sum + smem[threadIdx.y][threadIdx.x + HALO_SIZE * BLOCK_DIM_X + j * BLOCK_DIM_X - anchor + k] * c_kernel[k];
151
152                     dst(y, x) = saturate_cast<D>(sum);
153                 }
154             }
155         }
156
157         template <int KSIZE, typename T, typename D, template<typename> class B>
158         void linearRowFilter_caller(PtrStepSz<T> src, PtrStepSz<D> dst, int anchor, int cc, cudaStream_t stream)
159         {
160             int BLOCK_DIM_X;
161             int BLOCK_DIM_Y;
162             int PATCH_PER_BLOCK;
163
164             if (cc >= 20)
165             {
166                 BLOCK_DIM_X = 32;
167                 BLOCK_DIM_Y = 8;
168                 PATCH_PER_BLOCK = 4;
169             }
170             else
171             {
172                 BLOCK_DIM_X = 32;
173                 BLOCK_DIM_Y = 4;
174                 PATCH_PER_BLOCK = 4;
175             }
176
177             const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y);
178             const dim3 grid(divUp(src.cols, BLOCK_DIM_X * PATCH_PER_BLOCK), divUp(src.rows, BLOCK_DIM_Y));
179
180             B<T> brd(src.cols);
181
182             linearRowFilter<KSIZE, T, D><<<grid, block, 0, stream>>>(src, dst, anchor, brd);
183             cudaSafeCall( cudaGetLastError() );
184
185             if (stream == 0)
186                 cudaSafeCall( cudaDeviceSynchronize() );
187         }
188
189         template <typename T, typename D>
190         void linearRowFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream)
191         {
192             typedef void (*caller_t)(PtrStepSz<T> src, PtrStepSz<D> dst, int anchor, int cc, cudaStream_t stream);
193
194             static const caller_t callers[5][33] =
195             {
196                 {
197                     0,
198                     linearRowFilter_caller< 1, T, D, BrdRowReflect101>,
199                     linearRowFilter_caller< 2, T, D, BrdRowReflect101>,
200                     linearRowFilter_caller< 3, T, D, BrdRowReflect101>,
201                     linearRowFilter_caller< 4, T, D, BrdRowReflect101>,
202                     linearRowFilter_caller< 5, T, D, BrdRowReflect101>,
203                     linearRowFilter_caller< 6, T, D, BrdRowReflect101>,
204                     linearRowFilter_caller< 7, T, D, BrdRowReflect101>,
205                     linearRowFilter_caller< 8, T, D, BrdRowReflect101>,
206                     linearRowFilter_caller< 9, T, D, BrdRowReflect101>,
207                     linearRowFilter_caller<10, T, D, BrdRowReflect101>,
208                     linearRowFilter_caller<11, T, D, BrdRowReflect101>,
209                     linearRowFilter_caller<12, T, D, BrdRowReflect101>,
210                     linearRowFilter_caller<13, T, D, BrdRowReflect101>,
211                     linearRowFilter_caller<14, T, D, BrdRowReflect101>,
212                     linearRowFilter_caller<15, T, D, BrdRowReflect101>,
213                     linearRowFilter_caller<16, T, D, BrdRowReflect101>,
214                     linearRowFilter_caller<17, T, D, BrdRowReflect101>,
215                     linearRowFilter_caller<18, T, D, BrdRowReflect101>,
216                     linearRowFilter_caller<19, T, D, BrdRowReflect101>,
217                     linearRowFilter_caller<20, T, D, BrdRowReflect101>,
218                     linearRowFilter_caller<21, T, D, BrdRowReflect101>,
219                     linearRowFilter_caller<22, T, D, BrdRowReflect101>,
220                     linearRowFilter_caller<23, T, D, BrdRowReflect101>,
221                     linearRowFilter_caller<24, T, D, BrdRowReflect101>,
222                     linearRowFilter_caller<25, T, D, BrdRowReflect101>,
223                     linearRowFilter_caller<26, T, D, BrdRowReflect101>,
224                     linearRowFilter_caller<27, T, D, BrdRowReflect101>,
225                     linearRowFilter_caller<28, T, D, BrdRowReflect101>,
226                     linearRowFilter_caller<29, T, D, BrdRowReflect101>,
227                     linearRowFilter_caller<30, T, D, BrdRowReflect101>,
228                     linearRowFilter_caller<31, T, D, BrdRowReflect101>,
229                     linearRowFilter_caller<32, T, D, BrdRowReflect101>
230                 },
231                 {
232                     0,
233                     linearRowFilter_caller< 1, T, D, BrdRowReplicate>,
234                     linearRowFilter_caller< 2, T, D, BrdRowReplicate>,
235                     linearRowFilter_caller< 3, T, D, BrdRowReplicate>,
236                     linearRowFilter_caller< 4, T, D, BrdRowReplicate>,
237                     linearRowFilter_caller< 5, T, D, BrdRowReplicate>,
238                     linearRowFilter_caller< 6, T, D, BrdRowReplicate>,
239                     linearRowFilter_caller< 7, T, D, BrdRowReplicate>,
240                     linearRowFilter_caller< 8, T, D, BrdRowReplicate>,
241                     linearRowFilter_caller< 9, T, D, BrdRowReplicate>,
242                     linearRowFilter_caller<10, T, D, BrdRowReplicate>,
243                     linearRowFilter_caller<11, T, D, BrdRowReplicate>,
244                     linearRowFilter_caller<12, T, D, BrdRowReplicate>,
245                     linearRowFilter_caller<13, T, D, BrdRowReplicate>,
246                     linearRowFilter_caller<14, T, D, BrdRowReplicate>,
247                     linearRowFilter_caller<15, T, D, BrdRowReplicate>,
248                     linearRowFilter_caller<16, T, D, BrdRowReplicate>,
249                     linearRowFilter_caller<17, T, D, BrdRowReplicate>,
250                     linearRowFilter_caller<18, T, D, BrdRowReplicate>,
251                     linearRowFilter_caller<19, T, D, BrdRowReplicate>,
252                     linearRowFilter_caller<20, T, D, BrdRowReplicate>,
253                     linearRowFilter_caller<21, T, D, BrdRowReplicate>,
254                     linearRowFilter_caller<22, T, D, BrdRowReplicate>,
255                     linearRowFilter_caller<23, T, D, BrdRowReplicate>,
256                     linearRowFilter_caller<24, T, D, BrdRowReplicate>,
257                     linearRowFilter_caller<25, T, D, BrdRowReplicate>,
258                     linearRowFilter_caller<26, T, D, BrdRowReplicate>,
259                     linearRowFilter_caller<27, T, D, BrdRowReplicate>,
260                     linearRowFilter_caller<28, T, D, BrdRowReplicate>,
261                     linearRowFilter_caller<29, T, D, BrdRowReplicate>,
262                     linearRowFilter_caller<30, T, D, BrdRowReplicate>,
263                     linearRowFilter_caller<31, T, D, BrdRowReplicate>,
264                     linearRowFilter_caller<32, T, D, BrdRowReplicate>
265                 },
266                 {
267                     0,
268                     linearRowFilter_caller< 1, T, D, BrdRowConstant>,
269                     linearRowFilter_caller< 2, T, D, BrdRowConstant>,
270                     linearRowFilter_caller< 3, T, D, BrdRowConstant>,
271                     linearRowFilter_caller< 4, T, D, BrdRowConstant>,
272                     linearRowFilter_caller< 5, T, D, BrdRowConstant>,
273                     linearRowFilter_caller< 6, T, D, BrdRowConstant>,
274                     linearRowFilter_caller< 7, T, D, BrdRowConstant>,
275                     linearRowFilter_caller< 8, T, D, BrdRowConstant>,
276                     linearRowFilter_caller< 9, T, D, BrdRowConstant>,
277                     linearRowFilter_caller<10, T, D, BrdRowConstant>,
278                     linearRowFilter_caller<11, T, D, BrdRowConstant>,
279                     linearRowFilter_caller<12, T, D, BrdRowConstant>,
280                     linearRowFilter_caller<13, T, D, BrdRowConstant>,
281                     linearRowFilter_caller<14, T, D, BrdRowConstant>,
282                     linearRowFilter_caller<15, T, D, BrdRowConstant>,
283                     linearRowFilter_caller<16, T, D, BrdRowConstant>,
284                     linearRowFilter_caller<17, T, D, BrdRowConstant>,
285                     linearRowFilter_caller<18, T, D, BrdRowConstant>,
286                     linearRowFilter_caller<19, T, D, BrdRowConstant>,
287                     linearRowFilter_caller<20, T, D, BrdRowConstant>,
288                     linearRowFilter_caller<21, T, D, BrdRowConstant>,
289                     linearRowFilter_caller<22, T, D, BrdRowConstant>,
290                     linearRowFilter_caller<23, T, D, BrdRowConstant>,
291                     linearRowFilter_caller<24, T, D, BrdRowConstant>,
292                     linearRowFilter_caller<25, T, D, BrdRowConstant>,
293                     linearRowFilter_caller<26, T, D, BrdRowConstant>,
294                     linearRowFilter_caller<27, T, D, BrdRowConstant>,
295                     linearRowFilter_caller<28, T, D, BrdRowConstant>,
296                     linearRowFilter_caller<29, T, D, BrdRowConstant>,
297                     linearRowFilter_caller<30, T, D, BrdRowConstant>,
298                     linearRowFilter_caller<31, T, D, BrdRowConstant>,
299                     linearRowFilter_caller<32, T, D, BrdRowConstant>
300                 },
301                 {
302                     0,
303                     linearRowFilter_caller< 1, T, D, BrdRowReflect>,
304                     linearRowFilter_caller< 2, T, D, BrdRowReflect>,
305                     linearRowFilter_caller< 3, T, D, BrdRowReflect>,
306                     linearRowFilter_caller< 4, T, D, BrdRowReflect>,
307                     linearRowFilter_caller< 5, T, D, BrdRowReflect>,
308                     linearRowFilter_caller< 6, T, D, BrdRowReflect>,
309                     linearRowFilter_caller< 7, T, D, BrdRowReflect>,
310                     linearRowFilter_caller< 8, T, D, BrdRowReflect>,
311                     linearRowFilter_caller< 9, T, D, BrdRowReflect>,
312                     linearRowFilter_caller<10, T, D, BrdRowReflect>,
313                     linearRowFilter_caller<11, T, D, BrdRowReflect>,
314                     linearRowFilter_caller<12, T, D, BrdRowReflect>,
315                     linearRowFilter_caller<13, T, D, BrdRowReflect>,
316                     linearRowFilter_caller<14, T, D, BrdRowReflect>,
317                     linearRowFilter_caller<15, T, D, BrdRowReflect>,
318                     linearRowFilter_caller<16, T, D, BrdRowReflect>,
319                     linearRowFilter_caller<17, T, D, BrdRowReflect>,
320                     linearRowFilter_caller<18, T, D, BrdRowReflect>,
321                     linearRowFilter_caller<19, T, D, BrdRowReflect>,
322                     linearRowFilter_caller<20, T, D, BrdRowReflect>,
323                     linearRowFilter_caller<21, T, D, BrdRowReflect>,
324                     linearRowFilter_caller<22, T, D, BrdRowReflect>,
325                     linearRowFilter_caller<23, T, D, BrdRowReflect>,
326                     linearRowFilter_caller<24, T, D, BrdRowReflect>,
327                     linearRowFilter_caller<25, T, D, BrdRowReflect>,
328                     linearRowFilter_caller<26, T, D, BrdRowReflect>,
329                     linearRowFilter_caller<27, T, D, BrdRowReflect>,
330                     linearRowFilter_caller<28, T, D, BrdRowReflect>,
331                     linearRowFilter_caller<29, T, D, BrdRowReflect>,
332                     linearRowFilter_caller<30, T, D, BrdRowReflect>,
333                     linearRowFilter_caller<31, T, D, BrdRowReflect>,
334                     linearRowFilter_caller<32, T, D, BrdRowReflect>
335                 },
336                 {
337                     0,
338                     linearRowFilter_caller< 1, T, D, BrdRowWrap>,
339                     linearRowFilter_caller< 2, T, D, BrdRowWrap>,
340                     linearRowFilter_caller< 3, T, D, BrdRowWrap>,
341                     linearRowFilter_caller< 4, T, D, BrdRowWrap>,
342                     linearRowFilter_caller< 5, T, D, BrdRowWrap>,
343                     linearRowFilter_caller< 6, T, D, BrdRowWrap>,
344                     linearRowFilter_caller< 7, T, D, BrdRowWrap>,
345                     linearRowFilter_caller< 8, T, D, BrdRowWrap>,
346                     linearRowFilter_caller< 9, T, D, BrdRowWrap>,
347                     linearRowFilter_caller<10, T, D, BrdRowWrap>,
348                     linearRowFilter_caller<11, T, D, BrdRowWrap>,
349                     linearRowFilter_caller<12, T, D, BrdRowWrap>,
350                     linearRowFilter_caller<13, T, D, BrdRowWrap>,
351                     linearRowFilter_caller<14, T, D, BrdRowWrap>,
352                     linearRowFilter_caller<15, T, D, BrdRowWrap>,
353                     linearRowFilter_caller<16, T, D, BrdRowWrap>,
354                     linearRowFilter_caller<17, T, D, BrdRowWrap>,
355                     linearRowFilter_caller<18, T, D, BrdRowWrap>,
356                     linearRowFilter_caller<19, T, D, BrdRowWrap>,
357                     linearRowFilter_caller<20, T, D, BrdRowWrap>,
358                     linearRowFilter_caller<21, T, D, BrdRowWrap>,
359                     linearRowFilter_caller<22, T, D, BrdRowWrap>,
360                     linearRowFilter_caller<23, T, D, BrdRowWrap>,
361                     linearRowFilter_caller<24, T, D, BrdRowWrap>,
362                     linearRowFilter_caller<25, T, D, BrdRowWrap>,
363                     linearRowFilter_caller<26, T, D, BrdRowWrap>,
364                     linearRowFilter_caller<27, T, D, BrdRowWrap>,
365                     linearRowFilter_caller<28, T, D, BrdRowWrap>,
366                     linearRowFilter_caller<29, T, D, BrdRowWrap>,
367                     linearRowFilter_caller<30, T, D, BrdRowWrap>,
368                     linearRowFilter_caller<31, T, D, BrdRowWrap>,
369                     linearRowFilter_caller<32, T, D, BrdRowWrap>
370                 }
371             };
372
373             loadKernel(kernel, ksize, stream);
374
375             callers[brd_type][ksize]((PtrStepSz<T>)src, (PtrStepSz<D>)dst, anchor, cc, stream);
376         }
377
378         template void linearRowFilter_gpu<uchar , float >(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
379         template void linearRowFilter_gpu<uchar3, float3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
380         template void linearRowFilter_gpu<uchar4, float4>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
381         template void linearRowFilter_gpu<short3, float3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
382         template void linearRowFilter_gpu<int   , float >(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
383         template void linearRowFilter_gpu<float , float >(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
384         template void linearRowFilter_gpu<float3, float3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
385         template void linearRowFilter_gpu<float4, float4>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
386     } // namespace row_filter
387 }}} // namespace cv { namespace gpu { namespace device
388
389
390 #endif /* CUDA_DISABLER */