/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#if !defined CUDA_DISABLER

#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/reduce.hpp"
#include "opencv2/gpu/device/functional.hpp"

namespace cv { namespace gpu { namespace device
{
    namespace stereocsbp
    {
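        // Constant-space belief propagation (CSBP) after Yang et al.,
        // "A Constant-Space Belief Propagation Algorithm for Stereo Matching".
        // Rather than keeping messages for all ndisp disparities, each pyramid
        // level keeps only the nr_plane most promising candidate disparities
        // per pixel. The host-side wrapper is expected to drive the entry
        // points below roughly in this order (a sketch, not verbatim host code):
        //
        //   load_constants(...);
        //   for each pyramid level, coarsest to finest:
        //       init_data_cost(...)                    // coarsest level only
        //         or compute_data_cost(...) + init_message(...)
        //       calc_all_iterations(...);              // BP message passing
        //   compute_disp(...);                         // winner-take-all readout
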
        ///////////////////////////////////////////////////////////////
        /////////////////////// load constants ////////////////////////
        ///////////////////////////////////////////////////////////////

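        // Algorithm parameters and buffer geometry are cached in __constant__
        // memory so the kernels below can read them without extra launch
        // arguments. load_constants() (plus the per-call cdisp_step*/cmsg_step
        // updates done by each host entry point) must run before any kernel
        // launch.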
        __constant__ int cndisp;

        __constant__ float cmax_data_term;
        __constant__ float cdata_weight;
        __constant__ float cmax_disc_term;
        __constant__ float cdisc_single_jump;

        __constant__ int cth;

        __constant__ size_t cimg_step;
        __constant__ size_t cmsg_step;
        __constant__ size_t cdisp_step1;
        __constant__ size_t cdisp_step2;

        __constant__ uchar* cleft;
        __constant__ uchar* cright;
        __constant__ uchar* ctemp;

        void load_constants(int ndisp, float max_data_term, float data_weight, float max_disc_term, float disc_single_jump, int min_disp_th,
                            const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& temp)
        {
            cudaSafeCall( cudaMemcpyToSymbol(cndisp, &ndisp, sizeof(int)) );

            cudaSafeCall( cudaMemcpyToSymbol(cmax_data_term,    &max_data_term,    sizeof(float)) );
            cudaSafeCall( cudaMemcpyToSymbol(cdata_weight,      &data_weight,      sizeof(float)) );
            cudaSafeCall( cudaMemcpyToSymbol(cmax_disc_term,    &max_disc_term,    sizeof(float)) );
            cudaSafeCall( cudaMemcpyToSymbol(cdisc_single_jump, &disc_single_jump, sizeof(float)) );

            cudaSafeCall( cudaMemcpyToSymbol(cth, &min_disp_th, sizeof(int)) );

            cudaSafeCall( cudaMemcpyToSymbol(cimg_step, &left.step, sizeof(size_t)) );

            cudaSafeCall( cudaMemcpyToSymbol(cleft,  &left.data,  sizeof(left.data)) );
            cudaSafeCall( cudaMemcpyToSymbol(cright, &right.data, sizeof(right.data)) );
            cudaSafeCall( cudaMemcpyToSymbol(ctemp,  &temp.data,  sizeof(temp.data)) );
        }

        ///////////////////////////////////////////////////////////////
        /////////////////////// init data cost ////////////////////////
        ///////////////////////////////////////////////////////////////

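        // Per-pixel matching cost: an absolute-difference term scaled by
        // cdata_weight and truncated at cdata_weight * cmax_data_term. The
        // 3- and 4-channel variants weight the channels with the BT.601 luma
        // coefficients (0.299 R, 0.587 G, 0.114 B) before truncation.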
        template <int channels> struct DataCostPerPixel;
        template <> struct DataCostPerPixel<1>
        {
            static __device__ __forceinline__ float compute(const uchar* left, const uchar* right)
            {
                return fmin(cdata_weight * ::abs((int)*left - *right), cdata_weight * cmax_data_term);
            }
        };
        template <> struct DataCostPerPixel<3>
        {
            static __device__ __forceinline__ float compute(const uchar* left, const uchar* right)
            {
                float tb = 0.114f * ::abs((int)left[0] - right[0]);
                float tg = 0.587f * ::abs((int)left[1] - right[1]);
                float tr = 0.299f * ::abs((int)left[2] - right[2]);

                return fmin(cdata_weight * (tr + tg + tb), cdata_weight * cmax_data_term);
            }
        };
        template <> struct DataCostPerPixel<4>
        {
            static __device__ __forceinline__ float compute(const uchar* left, const uchar* right)
            {
                uchar4 l = *((const uchar4*)left);
                uchar4 r = *((const uchar4*)right);

                float tb = 0.114f * ::abs((int)l.x - r.x);
                float tg = 0.587f * ::abs((int)l.y - r.y);
                float tr = 0.299f * ::abs((int)l.z - r.z);

                return fmin(cdata_weight * (tr + tg + tb), cdata_weight * cmax_data_term);
            }
        };

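        // Global candidate selection: repeatedly extract the disparity with
        // the smallest data cost, invalidating each taken slot by overwriting
        // it with the largest representable value, until nr_plane candidates
        // have been recorded per pixel.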
        template <typename T>
        __global__ void get_first_k_initial_global(T* data_cost_selected_, T* selected_disp_pyr, int h, int w, int nr_plane)
        {
            int x = blockIdx.x * blockDim.x + threadIdx.x;
            int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (y < h && x < w)
            {
                T* selected_disparity = selected_disp_pyr + y * cmsg_step + x;
                T* data_cost_selected = data_cost_selected_ + y * cmsg_step + x;
                T* data_cost = (T*)ctemp + y * cmsg_step + x;

                for(int i = 0; i < nr_plane; i++)
                {
                    T minimum = numeric_limits<T>::max();
                    int id = 0;
                    for(int d = 0; d < cndisp; d++)
                    {
                        T cur = data_cost[d * cdisp_step1];
                        if(cur < minimum)
                        {
                            minimum = cur;
                            id = d;
                        }
                    }

                    data_cost_selected[i  * cdisp_step1] = minimum;
                    selected_disparity[i  * cdisp_step1] = id;
                    data_cost         [id * cdisp_step1] = numeric_limits<T>::max();
                }
            }
        }

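        // Local candidate selection: first take disparities that are local
        // minima of the cost curve (cheaper than both neighbours); if fewer
        // than nr_plane local minima exist, fill the remaining slots with the
        // globally cheapest leftovers, as in get_first_k_initial_global.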
        template <typename T>
        __global__ void get_first_k_initial_local(T* data_cost_selected_, T* selected_disp_pyr, int h, int w, int nr_plane)
        {
            int x = blockIdx.x * blockDim.x + threadIdx.x;
            int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (y < h && x < w)
            {
                T* selected_disparity = selected_disp_pyr + y * cmsg_step + x;
                T* data_cost_selected = data_cost_selected_ + y * cmsg_step + x;
                T* data_cost = (T*)ctemp + y * cmsg_step + x;

                int nr_local_minimum = 0;

                T prev = data_cost[0 * cdisp_step1];
                T cur  = data_cost[1 * cdisp_step1];
                T next = data_cost[2 * cdisp_step1];

                for (int d = 1; d < cndisp - 1 && nr_local_minimum < nr_plane; d++)
                {
                    if (cur < prev && cur < next)
                    {
                        data_cost_selected[nr_local_minimum * cdisp_step1] = cur;
                        selected_disparity[nr_local_minimum * cdisp_step1] = d;

                        data_cost[d * cdisp_step1] = numeric_limits<T>::max();

                        nr_local_minimum++;
                    }
                    prev = cur;
                    cur = next;
                    next = data_cost[(d + 1) * cdisp_step1];
                }

                for (int i = nr_local_minimum; i < nr_plane; i++)
                {
                    T minimum = numeric_limits<T>::max();
                    int id = 0;

                    for (int d = 0; d < cndisp; d++)
                    {
                        cur = data_cost[d * cdisp_step1];
                        if (cur < minimum)
                        {
                            minimum = cur;
                            id = d;
                        }
                    }
                    data_cost_selected[i * cdisp_step1] = minimum;
                    selected_disparity[i * cdisp_step1] = id;

                    data_cost[id * cdisp_step1] = numeric_limits<T>::max();
                }
            }
        }

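        // Initial data cost at pyramid level 'level': each output cell sums
        // the per-pixel costs over its 2^level x 2^level block of
        // full-resolution pixels for every disparity in [0, cndisp).
        // Disparities below the minimum threshold cth, or whose match would
        // fall outside the right image, are charged the maximum (truncated)
        // cost.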
        template <typename T, int channels>
        __global__ void init_data_cost(int h, int w, int level)
        {
            int x = blockIdx.x * blockDim.x + threadIdx.x;
            int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (y < h && x < w)
            {
                int y0 = y << level;
                int yt = (y + 1) << level;

                int x0 = x << level;
                int xt = (x + 1) << level;

                T* data_cost = (T*)ctemp + y * cmsg_step + x;

                for(int d = 0; d < cndisp; ++d)
                {
                    float val = 0.0f;
                    for(int yi = y0; yi < yt; yi++)
                    {
                        for(int xi = x0; xi < xt; xi++)
                        {
                            int xr = xi - d;
                            if(d < cth || xr < 0)
                                val += cdata_weight * cmax_data_term;
                            else
                            {
                                const uchar* lle = cleft + yi * cimg_step + xi * channels;
                                const uchar* lri = cright + yi * cimg_step + xr * channels;

                                val += DataCostPerPixel<channels>::compute(lle, lri);
                            }
                        }
                    }
                    data_cost[cdisp_step1 * d] = saturate_cast<T>(val);
                }
            }
        }

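        // Reduction variant for wider aggregation windows (winsz = 2^level):
        // each thread accumulates one column of the window over 'len' rows,
        // then a shared-memory reduction adds the winsz partial sums.
        // blockIdx.y packs both the output row (blockIdx.y % h) and a slice
        // of blockDim.z disparities (blockIdx.y / h).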
        template <typename T, int winsz, int channels>
        __global__ void init_data_cost_reduce(int level, int rows, int cols, int h)
        {
            int x_out = blockIdx.x;
            int y_out = blockIdx.y % h;
            int d = (blockIdx.y / h) * blockDim.z + threadIdx.z;

            int tid = threadIdx.x;

            if (d < cndisp)
            {
                int x0 = x_out << level;
                int y0 = y_out << level;

                int len = ::min(y0 + winsz, rows) - y0;

                float val = 0.0f;
                if (x0 + tid < cols)
                {
                    if (x0 + tid - d < 0 || d < cth)
                        val = cdata_weight * cmax_data_term * len;
                    else
                    {
                        const uchar* lle =  cleft + y0 * cimg_step + channels * (x0 + tid    );
                        const uchar* lri = cright + y0 * cimg_step + channels * (x0 + tid - d);

                        for(int y = 0; y < len; ++y)
                        {
                            val += DataCostPerPixel<channels>::compute(lle, lri);

                            lle += cimg_step;
                            lri += cimg_step;
                        }
                    }
                }

                extern __shared__ float smem[];

                reduce<winsz>(smem + winsz * threadIdx.z, val, tid, plus<float>());

                T* data_cost = (T*)ctemp + y_out * cmsg_step + x_out;

                if (tid == 0)
                    data_cost[cdisp_step1 * d] = saturate_cast<T>(val);
            }
        }

        template <typename T>
        void init_data_cost_caller_(int /*rows*/, int /*cols*/, int h, int w, int level, int /*ndisp*/, int channels, cudaStream_t stream)
        {
            dim3 threads(32, 8, 1);
            dim3 grid(1, 1, 1);

            grid.x = divUp(w, threads.x);
            grid.y = divUp(h, threads.y);

            switch (channels)
            {
            case 1: init_data_cost<T, 1><<<grid, threads, 0, stream>>>(h, w, level); break;
            case 3: init_data_cost<T, 3><<<grid, threads, 0, stream>>>(h, w, level); break;
            case 4: init_data_cost<T, 4><<<grid, threads, 0, stream>>>(h, w, level); break;
            default: cv::gpu::error("Unsupported channels count", __FILE__, __LINE__, "init_data_cost_caller_");
            }
        }

        template <typename T, int winsz>
        void init_data_cost_reduce_caller_(int rows, int cols, int h, int w, int level, int ndisp, int channels, cudaStream_t stream)
        {
            const int threadsNum = 256;
            const size_t smem_size = threadsNum * sizeof(float);

            dim3 threads(winsz, 1, threadsNum / winsz);
            dim3 grid(w, h, 1);
            grid.y *= divUp(ndisp, threads.z);

            switch (channels)
            {
            case 1: init_data_cost_reduce<T, winsz, 1><<<grid, threads, smem_size, stream>>>(level, rows, cols, h); break;
            case 3: init_data_cost_reduce<T, winsz, 3><<<grid, threads, smem_size, stream>>>(level, rows, cols, h); break;
            case 4: init_data_cost_reduce<T, winsz, 4><<<grid, threads, smem_size, stream>>>(level, rows, cols, h); break;
            default: cv::gpu::error("Unsupported channels count", __FILE__, __LINE__, "init_data_cost_reduce_caller_");
            }
        }

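        // Host entry point for the initial data cost. Levels 0 and 1 launch
        // the one-thread-per-cell kernel; deeper levels use the reduction
        // kernel with winsz = 2^level (4 ... 256), selected via the caller
        // table indexed by 'level'.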
        template<class T>
        void init_data_cost(int rows, int cols, T* disp_selected_pyr, T* data_cost_selected, size_t msg_step,
                    int h, int w, int level, int nr_plane, int ndisp, int channels, bool use_local_init_data_cost, cudaStream_t stream)
        {
            typedef void (*InitDataCostCaller)(int rows, int cols, int h, int w, int level, int ndisp, int channels, cudaStream_t stream);

            static const InitDataCostCaller init_data_cost_callers[] =
            {
                init_data_cost_caller_<T>, init_data_cost_caller_<T>, init_data_cost_reduce_caller_<T, 4>,
                init_data_cost_reduce_caller_<T, 8>, init_data_cost_reduce_caller_<T, 16>, init_data_cost_reduce_caller_<T, 32>,
                init_data_cost_reduce_caller_<T, 64>, init_data_cost_reduce_caller_<T, 128>, init_data_cost_reduce_caller_<T, 256>
            };

            size_t disp_step = msg_step * h;
            cudaSafeCall( cudaMemcpyToSymbol(cdisp_step1, &disp_step, sizeof(size_t)) );
            cudaSafeCall( cudaMemcpyToSymbol(cmsg_step,   &msg_step,  sizeof(size_t)) );

            init_data_cost_callers[level](rows, cols, h, w, level, ndisp, channels, stream);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );

            dim3 threads(32, 8, 1);
            dim3 grid(1, 1, 1);

            grid.x = divUp(w, threads.x);
            grid.y = divUp(h, threads.y);
            if (use_local_init_data_cost)
                get_first_k_initial_local<<<grid, threads, 0, stream>>> (data_cost_selected, disp_selected_pyr, h, w, nr_plane);
            else
                get_first_k_initial_global<<<grid, threads, 0, stream>>>(data_cost_selected, disp_selected_pyr, h, w, nr_plane);

            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        template void init_data_cost(int rows, int cols, short* disp_selected_pyr, short* data_cost_selected, size_t msg_step,
                    int h, int w, int level, int nr_plane, int ndisp, int channels, bool use_local_init_data_cost, cudaStream_t stream);

        template void init_data_cost(int rows, int cols, float* disp_selected_pyr, float* data_cost_selected, size_t msg_step,
                    int h, int w, int level, int nr_plane, int ndisp, int channels, bool use_local_init_data_cost, cudaStream_t stream);

        ///////////////////////////////////////////////////////////////
        ////////////////////// compute data cost //////////////////////
        ///////////////////////////////////////////////////////////////

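        // Data cost at finer levels: the same block aggregation as
        // init_data_cost, but evaluated only at the nr_plane candidate
        // disparities selected for the parent pixel (y/2, x/2) at the coarser
        // level.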
        template <typename T, int channels>
        __global__ void compute_data_cost(const T* selected_disp_pyr, T* data_cost_, int h, int w, int level, int nr_plane)
        {
            int x = blockIdx.x * blockDim.x + threadIdx.x;
            int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (y < h && x < w)
            {
                int y0 = y << level;
                int yt = (y + 1) << level;

                int x0 = x << level;
                int xt = (x + 1) << level;

                const T* selected_disparity = selected_disp_pyr + y/2 * cmsg_step + x/2;
                T* data_cost = data_cost_ + y * cmsg_step + x;

                for(int d = 0; d < nr_plane; d++)
                {
                    float val = 0.0f;
                    for(int yi = y0; yi < yt; yi++)
                    {
                        for(int xi = x0; xi < xt; xi++)
                        {
                            int sel_disp = selected_disparity[d * cdisp_step2];
                            int xr = xi - sel_disp;

                            if (xr < 0 || sel_disp < cth)
                                val += cdata_weight * cmax_data_term;
                            else
                            {
                                const uchar* left_x = cleft + yi * cimg_step + xi * channels;
                                const uchar* right_x = cright + yi * cimg_step + xr * channels;

                                val += DataCostPerPixel<channels>::compute(left_x, right_x);
                            }
                        }
                    }
                    data_cost[cdisp_step1 * d] = saturate_cast<T>(val);
                }
            }
        }

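        // Reduction variant, analogous to init_data_cost_reduce but restricted
        // to the coarser level's selected candidates.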
        template <typename T, int winsz, int channels>
        __global__ void compute_data_cost_reduce(const T* selected_disp_pyr, T* data_cost_, int level, int rows, int cols, int h, int nr_plane)
        {
            int x_out = blockIdx.x;
            int y_out = blockIdx.y % h;
            int d = (blockIdx.y / h) * blockDim.z + threadIdx.z;

            int tid = threadIdx.x;

            const T* selected_disparity = selected_disp_pyr + y_out/2 * cmsg_step + x_out/2;
            T* data_cost = data_cost_ + y_out * cmsg_step + x_out;

            if (d < nr_plane)
            {
                int sel_disp = selected_disparity[d * cdisp_step2];

                int x0 = x_out << level;
                int y0 = y_out << level;

                int len = ::min(y0 + winsz, rows) - y0;

                float val = 0.0f;
                if (x0 + tid < cols)
                {
                    if (x0 + tid - sel_disp < 0 || sel_disp < cth)
                        val = cdata_weight * cmax_data_term * len;
                    else
                    {
                        const uchar* lle =  cleft + y0 * cimg_step + channels * (x0 + tid    );
                        const uchar* lri = cright + y0 * cimg_step + channels * (x0 + tid - sel_disp);

                        for(int y = 0; y < len; ++y)
                        {
                            val += DataCostPerPixel<channels>::compute(lle, lri);

                            lle += cimg_step;
                            lri += cimg_step;
                        }
                    }
                }

                extern __shared__ float smem[];

                reduce<winsz>(smem + winsz * threadIdx.z, val, tid, plus<float>());

                if (tid == 0)
                    data_cost[cdisp_step1 * d] = saturate_cast<T>(val);
            }
        }

        template <typename T>
        void compute_data_cost_caller_(const T* disp_selected_pyr, T* data_cost, int /*rows*/, int /*cols*/,
                                      int h, int w, int level, int nr_plane, int channels, cudaStream_t stream)
        {
            dim3 threads(32, 8, 1);
            dim3 grid(1, 1, 1);

            grid.x = divUp(w, threads.x);
            grid.y = divUp(h, threads.y);

            switch(channels)
            {
            case 1: compute_data_cost<T, 1><<<grid, threads, 0, stream>>>(disp_selected_pyr, data_cost, h, w, level, nr_plane); break;
            case 3: compute_data_cost<T, 3><<<grid, threads, 0, stream>>>(disp_selected_pyr, data_cost, h, w, level, nr_plane); break;
            case 4: compute_data_cost<T, 4><<<grid, threads, 0, stream>>>(disp_selected_pyr, data_cost, h, w, level, nr_plane); break;
            default: cv::gpu::error("Unsupported channels count", __FILE__, __LINE__, "compute_data_cost_caller_");
            }
        }

        template <typename T, int winsz>
        void compute_data_cost_reduce_caller_(const T* disp_selected_pyr, T* data_cost, int rows, int cols,
                                      int h, int w, int level, int nr_plane, int channels, cudaStream_t stream)
        {
            const int threadsNum = 256;
            const size_t smem_size = threadsNum * sizeof(float);

            dim3 threads(winsz, 1, threadsNum / winsz);
            dim3 grid(w, h, 1);
            grid.y *= divUp(nr_plane, threads.z);

            switch (channels)
            {
            case 1: compute_data_cost_reduce<T, winsz, 1><<<grid, threads, smem_size, stream>>>(disp_selected_pyr, data_cost, level, rows, cols, h, nr_plane); break;
            case 3: compute_data_cost_reduce<T, winsz, 3><<<grid, threads, smem_size, stream>>>(disp_selected_pyr, data_cost, level, rows, cols, h, nr_plane); break;
            case 4: compute_data_cost_reduce<T, winsz, 4><<<grid, threads, smem_size, stream>>>(disp_selected_pyr, data_cost, level, rows, cols, h, nr_plane); break;
            default: cv::gpu::error("Unsupported channels count", __FILE__, __LINE__, "compute_data_cost_reduce_caller_");
            }
        }

        template<class T>
        void compute_data_cost(const T* disp_selected_pyr, T* data_cost, size_t msg_step,
                               int rows, int cols, int h, int w, int h2, int level, int nr_plane, int channels, cudaStream_t stream)
        {
            typedef void (*ComputeDataCostCaller)(const T* disp_selected_pyr, T* data_cost, int rows, int cols,
                int h, int w, int level, int nr_plane, int channels, cudaStream_t stream);

            static const ComputeDataCostCaller callers[] =
            {
                compute_data_cost_caller_<T>, compute_data_cost_caller_<T>, compute_data_cost_reduce_caller_<T, 4>,
                compute_data_cost_reduce_caller_<T, 8>, compute_data_cost_reduce_caller_<T, 16>, compute_data_cost_reduce_caller_<T, 32>,
                compute_data_cost_reduce_caller_<T, 64>, compute_data_cost_reduce_caller_<T, 128>, compute_data_cost_reduce_caller_<T, 256>
            };

            size_t disp_step1 = msg_step * h;
            size_t disp_step2 = msg_step * h2;
            cudaSafeCall( cudaMemcpyToSymbol(cdisp_step1, &disp_step1, sizeof(size_t)) );
            cudaSafeCall( cudaMemcpyToSymbol(cdisp_step2, &disp_step2, sizeof(size_t)) );
            cudaSafeCall( cudaMemcpyToSymbol(cmsg_step,   &msg_step,   sizeof(size_t)) );

            callers[level](disp_selected_pyr, data_cost, rows, cols, h, w, level, nr_plane, channels, stream);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        template void compute_data_cost(const short* disp_selected_pyr, short* data_cost, size_t msg_step,
                               int rows, int cols, int h, int w, int h2, int level, int nr_plane, int channels, cudaStream_t stream);

        template void compute_data_cost(const float* disp_selected_pyr, float* data_cost, size_t msg_step,
                               int rows, int cols, int h, int w, int h2, int level, int nr_plane, int channels, cudaStream_t stream);

        ///////////////////////////////////////////////////////////////
        //////////////////////// init message /////////////////////////
        ///////////////////////////////////////////////////////////////

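        // Coarse-to-fine message initialisation: for every pixel of the finer
        // level, rank the parent pixel's nr_plane2 candidates by data cost
        // plus the messages arriving at the parent, keep the best nr_plane of
        // them, and seed the new messages from the parent's messages.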
        template <typename T>
        __device__ void get_first_k_element_increase(T* u_new, T* d_new, T* l_new, T* r_new,
                                                     const T* u_cur, const T* d_cur, const T* l_cur, const T* r_cur,
                                                     T* data_cost_selected, T* disparity_selected_new, T* data_cost_new,
                                                     const T* data_cost_cur, const T* disparity_selected_cur,
                                                     int nr_plane, int nr_plane2)
        {
            for(int i = 0; i < nr_plane; i++)
            {
                T minimum = numeric_limits<T>::max();
                int id = 0;
                for(int j = 0; j < nr_plane2; j++)
                {
                    T cur = data_cost_new[j * cdisp_step1];
                    if(cur < minimum)
                    {
                        minimum = cur;
                        id = j;
                    }
                }

                data_cost_selected[i * cdisp_step1] = data_cost_cur[id * cdisp_step1];
                disparity_selected_new[i * cdisp_step1] = disparity_selected_cur[id * cdisp_step2];

                u_new[i * cdisp_step1] = u_cur[id * cdisp_step2];
                d_new[i * cdisp_step1] = d_cur[id * cdisp_step2];
                l_new[i * cdisp_step1] = l_cur[id * cdisp_step2];
                r_new[i * cdisp_step1] = r_cur[id * cdisp_step2];

                data_cost_new[id * cdisp_step1] = numeric_limits<T>::max();
            }
        }

        template <typename T>
        __global__ void init_message(T* u_new_, T* d_new_, T* l_new_, T* r_new_,
                                     const T* u_cur_, const T* d_cur_, const T* l_cur_, const T* r_cur_,
                                     T* selected_disp_pyr_new, const T* selected_disp_pyr_cur,
                                     T* data_cost_selected_, const T* data_cost_,
                                     int h, int w, int nr_plane, int h2, int w2, int nr_plane2)
        {
            int x = blockIdx.x * blockDim.x + threadIdx.x;
            int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (y < h && x < w)
            {
                const T* u_cur = u_cur_ + ::min(h2-1, y/2 + 1) * cmsg_step + x/2;
                const T* d_cur = d_cur_ + ::max(0, y/2 - 1)    * cmsg_step + x/2;
                const T* l_cur = l_cur_ + (y/2)                * cmsg_step + ::min(w2-1, x/2 + 1);
                const T* r_cur = r_cur_ + (y/2)                * cmsg_step + ::max(0, x/2 - 1);

                T* data_cost_new = (T*)ctemp + y * cmsg_step + x;

                const T* disparity_selected_cur = selected_disp_pyr_cur + y/2 * cmsg_step + x/2;
                const T* data_cost = data_cost_ + y * cmsg_step + x;

                for(int d = 0; d < nr_plane2; d++)
                {
                    int idx2 = d * cdisp_step2;

                    T val  = data_cost[d * cdisp_step1] + u_cur[idx2] + d_cur[idx2] + l_cur[idx2] + r_cur[idx2];
                    data_cost_new[d * cdisp_step1] = val;
                }

                T* data_cost_selected = data_cost_selected_ + y * cmsg_step + x;
                T* disparity_selected_new = selected_disp_pyr_new + y * cmsg_step + x;

                T* u_new = u_new_ + y * cmsg_step + x;
                T* d_new = d_new_ + y * cmsg_step + x;
                T* l_new = l_new_ + y * cmsg_step + x;
                T* r_new = r_new_ + y * cmsg_step + x;

                u_cur = u_cur_ + y/2 * cmsg_step + x/2;
                d_cur = d_cur_ + y/2 * cmsg_step + x/2;
                l_cur = l_cur_ + y/2 * cmsg_step + x/2;
                r_cur = r_cur_ + y/2 * cmsg_step + x/2;

                get_first_k_element_increase(u_new, d_new, l_new, r_new, u_cur, d_cur, l_cur, r_cur,
                                             data_cost_selected, disparity_selected_new, data_cost_new,
                                             data_cost, disparity_selected_cur, nr_plane, nr_plane2);
            }
        }

        template<class T>
        void init_message(T* u_new, T* d_new, T* l_new, T* r_new,
                          const T* u_cur, const T* d_cur, const T* l_cur, const T* r_cur,
                          T* selected_disp_pyr_new, const T* selected_disp_pyr_cur,
                          T* data_cost_selected, const T* data_cost, size_t msg_step,
                          int h, int w, int nr_plane, int h2, int w2, int nr_plane2, cudaStream_t stream)
        {
            size_t disp_step1 = msg_step * h;
            size_t disp_step2 = msg_step * h2;
            cudaSafeCall( cudaMemcpyToSymbol(cdisp_step1, &disp_step1, sizeof(size_t)) );
            cudaSafeCall( cudaMemcpyToSymbol(cdisp_step2, &disp_step2, sizeof(size_t)) );
            cudaSafeCall( cudaMemcpyToSymbol(cmsg_step,   &msg_step,   sizeof(size_t)) );

            dim3 threads(32, 8, 1);
            dim3 grid(1, 1, 1);

            grid.x = divUp(w, threads.x);
            grid.y = divUp(h, threads.y);

            init_message<<<grid, threads, 0, stream>>>(u_new, d_new, l_new, r_new,
                                                       u_cur, d_cur, l_cur, r_cur,
                                                       selected_disp_pyr_new, selected_disp_pyr_cur,
                                                       data_cost_selected, data_cost,
                                                       h, w, nr_plane, h2, w2, nr_plane2);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        template void init_message(short* u_new, short* d_new, short* l_new, short* r_new,
                          const short* u_cur, const short* d_cur, const short* l_cur, const short* r_cur,
                          short* selected_disp_pyr_new, const short* selected_disp_pyr_cur,
                          short* data_cost_selected, const short* data_cost, size_t msg_step,
                          int h, int w, int nr_plane, int h2, int w2, int nr_plane2, cudaStream_t stream);

        template void init_message(float* u_new, float* d_new, float* l_new, float* r_new,
                          const float* u_cur, const float* d_cur, const float* l_cur, const float* r_cur,
                          float* selected_disp_pyr_new, const float* selected_disp_pyr_cur,
                          float* data_cost_selected, const float* data_cost, size_t msg_step,
                          int h, int w, int nr_plane, int h2, int w2, int nr_plane2, cudaStream_t stream);

        ///////////////////////////////////////////////////////////////
        ////////////////////  calc all iterations /////////////////////
        ///////////////////////////////////////////////////////////////

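        // One belief-propagation message update with a linear, truncated
        // smoothness model. h(d2) = data cost + the three other incoming
        // messages; the outgoing message for each of the receiver's candidate
        // disparities is the minimum over d2 of
        //     h(d2) + cdisc_single_jump * |disparity difference|,
        // truncated at min(h) + cmax_disc_term, then normalised to zero mean
        // (the per-pixel scratch buffer 'temp' holds the intermediate values).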
        template <typename T>
        __device__ void message_per_pixel(const T* data, T* msg_dst, const T* msg1, const T* msg2, const T* msg3,
                                          const T* dst_disp, const T* src_disp, int nr_plane, volatile T* temp)
        {
            T minimum = numeric_limits<T>::max();

            for(int d = 0; d < nr_plane; d++)
            {
                int idx = d * cdisp_step1;
                T val  = data[idx] + msg1[idx] + msg2[idx] + msg3[idx];

                if(val < minimum)
                    minimum = val;

                msg_dst[idx] = val;
            }

            float sum = 0;
            for(int d = 0; d < nr_plane; d++)
            {
                float cost_min = minimum + cmax_disc_term;
                T src_disp_reg = src_disp[d * cdisp_step1];

                for(int d2 = 0; d2 < nr_plane; d2++)
                    cost_min = fmin(cost_min, msg_dst[d2 * cdisp_step1] + cdisc_single_jump * ::abs(dst_disp[d2 * cdisp_step1] - src_disp_reg));

                temp[d * cdisp_step1] = saturate_cast<T>(cost_min);
                sum += cost_min;
            }
            sum /= nr_plane;

            for(int d = 0; d < nr_plane; d++)
                msg_dst[d * cdisp_step1] = saturate_cast<T>(temp[d * cdisp_step1] - sum);
        }

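        // Red-black (checkerboard) schedule: on iteration i only pixels with
        // (x + y + i) even are updated, so each half-sweep reads the messages
        // written by the previous one. An active pixel refreshes all four of
        // its outgoing messages in place.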
        template <typename T>
        __global__ void compute_message(T* u_, T* d_, T* l_, T* r_, const T* data_cost_selected, const T* selected_disp_pyr_cur, int h, int w, int nr_plane, int i)
        {
            int y = blockIdx.y * blockDim.y + threadIdx.y;
            int x = ((blockIdx.x * blockDim.x + threadIdx.x) << 1) + ((y + i) & 1);

            if (y > 0 && y < h - 1 && x > 0 && x < w - 1)
            {
                const T* data = data_cost_selected + y * cmsg_step + x;

                T* u = u_ + y * cmsg_step + x;
                T* d = d_ + y * cmsg_step + x;
                T* l = l_ + y * cmsg_step + x;
                T* r = r_ + y * cmsg_step + x;

                const T* disp = selected_disp_pyr_cur + y * cmsg_step + x;

                T* temp = (T*)ctemp + y * cmsg_step + x;

                message_per_pixel(data, u, r - 1, u + cmsg_step, l + 1, disp, disp - cmsg_step, nr_plane, temp);
                message_per_pixel(data, d, d - cmsg_step, r - 1, l + 1, disp, disp + cmsg_step, nr_plane, temp);
                message_per_pixel(data, l, u + cmsg_step, d - cmsg_step, l + 1, disp, disp - 1, nr_plane, temp);
                message_per_pixel(data, r, u + cmsg_step, d - cmsg_step, r - 1, disp, disp + 1, nr_plane, temp);
            }
        }

        template<class T>
        void calc_all_iterations(T* u, T* d, T* l, T* r, const T* data_cost_selected,
            const T* selected_disp_pyr_cur, size_t msg_step, int h, int w, int nr_plane, int iters, cudaStream_t stream)
        {
            size_t disp_step = msg_step * h;
            cudaSafeCall( cudaMemcpyToSymbol(cdisp_step1, &disp_step, sizeof(size_t)) );
            cudaSafeCall( cudaMemcpyToSymbol(cmsg_step,   &msg_step,  sizeof(size_t)) );

            dim3 threads(32, 8, 1);
            dim3 grid(1, 1, 1);

            grid.x = divUp(w, threads.x << 1);
            grid.y = divUp(h, threads.y);

            for(int t = 0; t < iters; ++t)
            {
                compute_message<<<grid, threads, 0, stream>>>(u, d, l, r, data_cost_selected, selected_disp_pyr_cur, h, w, nr_plane, t & 1);
                cudaSafeCall( cudaGetLastError() );
            }
            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        template void calc_all_iterations(short* u, short* d, short* l, short* r, const short* data_cost_selected, const short* selected_disp_pyr_cur, size_t msg_step,
            int h, int w, int nr_plane, int iters, cudaStream_t stream);

        template void calc_all_iterations(float* u, float* d, float* l, float* r, const float* data_cost_selected, const float* selected_disp_pyr_cur, size_t msg_step,
            int h, int w, int nr_plane, int iters, cudaStream_t stream);

        ///////////////////////////////////////////////////////////////
        /////////////////////////// output ////////////////////////////
        ///////////////////////////////////////////////////////////////

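        // Winner-take-all readout: for each interior pixel, pick the candidate
        // plane minimising data cost plus all four incoming messages and write
        // its disparity to the output map.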
        template <typename T>
        __global__ void compute_disp(const T* u_, const T* d_, const T* l_, const T* r_,
                                     const T* data_cost_selected, const T* disp_selected_pyr,
                                     PtrStepSz<short> disp, int nr_plane)
        {
            int x = blockIdx.x * blockDim.x + threadIdx.x;
            int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (y > 0 && y < disp.rows - 1 && x > 0 && x < disp.cols - 1)
            {
                const T* data = data_cost_selected + y * cmsg_step + x;
                const T* disp_selected = disp_selected_pyr + y * cmsg_step + x;

                const T* u = u_ + (y+1) * cmsg_step + (x+0);
                const T* d = d_ + (y-1) * cmsg_step + (x+0);
                const T* l = l_ + (y+0) * cmsg_step + (x+1);
                const T* r = r_ + (y+0) * cmsg_step + (x-1);

                int best = 0;
                T best_val = numeric_limits<T>::max();
                for (int i = 0; i < nr_plane; ++i)
                {
                    int idx = i * cdisp_step1;
                    T val = data[idx] + u[idx] + d[idx] + l[idx] + r[idx];

                    if (val < best_val)
                    {
                        best_val = val;
                        best = saturate_cast<short>(disp_selected[idx]);
                    }
                }
                disp(y, x) = best;
            }
        }

        template<class T>
        void compute_disp(const T* u, const T* d, const T* l, const T* r, const T* data_cost_selected, const T* disp_selected, size_t msg_step,
            const PtrStepSz<short>& disp, int nr_plane, cudaStream_t stream)
        {
            size_t disp_step = disp.rows * msg_step;
            cudaSafeCall( cudaMemcpyToSymbol(cdisp_step1, &disp_step, sizeof(size_t)) );
            cudaSafeCall( cudaMemcpyToSymbol(cmsg_step,   &msg_step,  sizeof(size_t)) );

            dim3 threads(32, 8, 1);
            dim3 grid(1, 1, 1);

            grid.x = divUp(disp.cols, threads.x);
            grid.y = divUp(disp.rows, threads.y);

            compute_disp<<<grid, threads, 0, stream>>>(u, d, l, r, data_cost_selected, disp_selected, disp, nr_plane);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        template void compute_disp(const short* u, const short* d, const short* l, const short* r, const short* data_cost_selected, const short* disp_selected, size_t msg_step,
            const PtrStepSz<short>& disp, int nr_plane, cudaStream_t stream);

        template void compute_disp(const float* u, const float* d, const float* l, const float* r, const float* data_cost_selected, const float* disp_selected, size_t msg_step,
            const PtrStepSz<short>& disp, int nr_plane, cudaStream_t stream);
    } // namespace stereocsbp
}}} // namespace cv { namespace gpu { namespace device

#endif /* CUDA_DISABLER */