/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#if !defined CUDA_DISABLER

#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/reduce.hpp"
#include "opencv2/gpu/device/functional.hpp"

namespace cv { namespace gpu { namespace device
{
    namespace stereocsbp
    {
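        // Constant-space belief propagation (CSBP) after Yang et al.,
        // "A Constant-Space Belief Propagation Algorithm for Stereo Matching".
        // Rather than keeping messages for all ndisp disparities, each pyramid
        // level keeps only the nr_plane most promising candidate disparities
        // per pixel. The host-side wrapper is expected to drive the entry
        // points below roughly in this order (a sketch, not verbatim host code):
        //
        //   load_constants(...);
        //   for each pyramid level, coarsest to finest:
        //       init_data_cost(...)                    // coarsest level only
        //         or compute_data_cost(...) + init_message(...)
        //       calc_all_iterations(...);              // BP message passing
        //   compute_disp(...);                         // winner-take-all readout
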
        ///////////////////////////////////////////////////////////////
        /////////////////////// load constants ////////////////////////
        ///////////////////////////////////////////////////////////////

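        // Algorithm parameters and buffer geometry are cached in __constant__
        // memory so the kernels below can read them without extra launch
        // arguments. load_constants() (plus the per-call cdisp_step*/cmsg_step
        // updates done by each host entry point) must run before any kernel
        // launch.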
        __constant__ int cndisp;

        __constant__ float cmax_data_term;
        __constant__ float cdata_weight;
        __constant__ float cmax_disc_term;
        __constant__ float cdisc_single_jump;

        __constant__ int cth;

        __constant__ size_t cimg_step;
        __constant__ size_t cmsg_step;
        __constant__ size_t cdisp_step1;
        __constant__ size_t cdisp_step2;

        __constant__ uchar* cleft;
        __constant__ uchar* cright;
        __constant__ uchar* ctemp;

        void load_constants(int ndisp, float max_data_term, float data_weight, float max_disc_term, float disc_single_jump, int min_disp_th,
                            const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& temp)
        {
            cudaSafeCall( cudaMemcpyToSymbol(cndisp, &ndisp, sizeof(int)) );

            cudaSafeCall( cudaMemcpyToSymbol(cmax_data_term,    &max_data_term,    sizeof(float)) );
            cudaSafeCall( cudaMemcpyToSymbol(cdata_weight,      &data_weight,      sizeof(float)) );
            cudaSafeCall( cudaMemcpyToSymbol(cmax_disc_term,    &max_disc_term,    sizeof(float)) );
            cudaSafeCall( cudaMemcpyToSymbol(cdisc_single_jump, &disc_single_jump, sizeof(float)) );

            cudaSafeCall( cudaMemcpyToSymbol(cth, &min_disp_th, sizeof(int)) );

            cudaSafeCall( cudaMemcpyToSymbol(cimg_step, &left.step, sizeof(size_t)) );

            cudaSafeCall( cudaMemcpyToSymbol(cleft,  &left.data,  sizeof(left.data)) );
            cudaSafeCall( cudaMemcpyToSymbol(cright, &right.data, sizeof(right.data)) );
            cudaSafeCall( cudaMemcpyToSymbol(ctemp,  &temp.data,  sizeof(temp.data)) );
        }

        ///////////////////////////////////////////////////////////////
        /////////////////////// init data cost ////////////////////////
        ///////////////////////////////////////////////////////////////

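        // Per-pixel matching cost: an absolute-difference term scaled by
        // cdata_weight and truncated at cdata_weight * cmax_data_term. The
        // 3- and 4-channel variants weight the channels with the BT.601 luma
        // coefficients (0.299 R, 0.587 G, 0.114 B) before truncation.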
        template <int channels> struct DataCostPerPixel;
        template <> struct DataCostPerPixel<1>
        {
            static __device__ __forceinline__ float compute(const uchar* left, const uchar* right)
            {
                return fmin(cdata_weight * ::abs((int)*left - *right), cdata_weight * cmax_data_term);
            }
        };
        template <> struct DataCostPerPixel<3>
        {
            static __device__ __forceinline__ float compute(const uchar* left, const uchar* right)
            {
                float tb = 0.114f * ::abs((int)left[0] - right[0]);
                float tg = 0.587f * ::abs((int)left[1] - right[1]);
                float tr = 0.299f * ::abs((int)left[2] - right[2]);

                return fmin(cdata_weight * (tr + tg + tb), cdata_weight * cmax_data_term);
            }
        };
        template <> struct DataCostPerPixel<4>
        {
            static __device__ __forceinline__ float compute(const uchar* left, const uchar* right)
            {
                uchar4 l = *((const uchar4*)left);
                uchar4 r = *((const uchar4*)right);

                float tb = 0.114f * ::abs((int)l.x - r.x);
                float tg = 0.587f * ::abs((int)l.y - r.y);
                float tr = 0.299f * ::abs((int)l.z - r.z);

                return fmin(cdata_weight * (tr + tg + tb), cdata_weight * cmax_data_term);
            }
        };

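        // Global candidate selection: repeatedly extract the disparity with
        // the smallest data cost, invalidating each taken slot by overwriting
        // it with the largest representable value, until nr_plane candidates
        // have been recorded per pixel.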
        template <typename T>
        __global__ void get_first_k_initial_global(T* data_cost_selected_, T* selected_disp_pyr, int h, int w, int nr_plane)
        {
            int x = blockIdx.x * blockDim.x + threadIdx.x;
            int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (y < h && x < w)
            {
                T* selected_disparity = selected_disp_pyr + y * cmsg_step + x;
                T* data_cost_selected = data_cost_selected_ + y * cmsg_step + x;
                T* data_cost = (T*)ctemp + y * cmsg_step + x;

                for(int i = 0; i < nr_plane; i++)
                {
                    T minimum = numeric_limits<T>::max();
                    int id = 0;
                    for(int d = 0; d < cndisp; d++)
                    {
                        T cur = data_cost[d * cdisp_step1];
                        if(cur < minimum)
                        {
                            minimum = cur;
                            id = d;
                        }
                    }

                    data_cost_selected[i  * cdisp_step1] = minimum;
                    selected_disparity[i  * cdisp_step1] = id;
                    data_cost         [id * cdisp_step1] = numeric_limits<T>::max();
                }
            }
        }

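        // Local candidate selection: first take disparities that are local
        // minima of the cost curve (cheaper than both neighbours); if fewer
        // than nr_plane local minima exist, fill the remaining slots with the
        // globally cheapest leftovers, as in get_first_k_initial_global.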
        template <typename T>
        __global__ void get_first_k_initial_local(T* data_cost_selected_, T* selected_disp_pyr, int h, int w, int nr_plane)
        {
            int x = blockIdx.x * blockDim.x + threadIdx.x;
            int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (y < h && x < w)
            {
                T* selected_disparity = selected_disp_pyr + y * cmsg_step + x;
                T* data_cost_selected = data_cost_selected_ + y * cmsg_step + x;
                T* data_cost = (T*)ctemp + y * cmsg_step + x;

                int nr_local_minimum = 0;

                T prev = data_cost[0 * cdisp_step1];
                T cur  = data_cost[1 * cdisp_step1];
                T next = data_cost[2 * cdisp_step1];

                for (int d = 1; d < cndisp - 1 && nr_local_minimum < nr_plane; d++)
                {
                    if (cur < prev && cur < next)
                    {
                        data_cost_selected[nr_local_minimum * cdisp_step1] = cur;
                        selected_disparity[nr_local_minimum * cdisp_step1] = d;

                        data_cost[d * cdisp_step1] = numeric_limits<T>::max();

                        nr_local_minimum++;
                    }
                    prev = cur;
                    cur = next;
                    next = data_cost[(d + 1) * cdisp_step1];
                }

                for (int i = nr_local_minimum; i < nr_plane; i++)
                {
                    T minimum = numeric_limits<T>::max();
                    int id = 0;

                    for (int d = 0; d < cndisp; d++)
                    {
                        cur = data_cost[d * cdisp_step1];
                        if (cur < minimum)
                        {
                            minimum = cur;
                            id = d;
                        }
                    }
                    data_cost_selected[i * cdisp_step1] = minimum;
                    selected_disparity[i * cdisp_step1] = id;

                    data_cost[id * cdisp_step1] = numeric_limits<T>::max();
                }
            }
        }

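        // Initial data cost at pyramid level 'level': each output cell sums
        // the per-pixel costs over its 2^level x 2^level block of
        // full-resolution pixels for every disparity in [0, cndisp).
        // Disparities below the minimum threshold cth, or whose match would
        // fall outside the right image, are charged the maximum (truncated)
        // cost.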
        template <typename T, int channels>
        __global__ void init_data_cost(int h, int w, int level)
        {
            int x = blockIdx.x * blockDim.x + threadIdx.x;
            int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (y < h && x < w)
            {
                int y0 = y << level;
                int yt = (y + 1) << level;

                int x0 = x << level;
                int xt = (x + 1) << level;

                T* data_cost = (T*)ctemp + y * cmsg_step + x;

                for(int d = 0; d < cndisp; ++d)
                {
                    float val = 0.0f;
                    for(int yi = y0; yi < yt; yi++)
                    {
                        for(int xi = x0; xi < xt; xi++)
                        {
                            int xr = xi - d;
                            if(d < cth || xr < 0)
                                val += cdata_weight * cmax_data_term;
                            else
                            {
                                const uchar* lle = cleft + yi * cimg_step + xi * channels;
                                const uchar* lri = cright + yi * cimg_step + xr * channels;

                                val += DataCostPerPixel<channels>::compute(lle, lri);
                            }
                        }
                    }
                    data_cost[cdisp_step1 * d] = saturate_cast<T>(val);
                }
            }
        }

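        // Reduction variant for wider aggregation windows (winsz = 2^level):
        // each thread accumulates one column of the window over 'len' rows,
        // then a shared-memory reduction adds the winsz partial sums.
        // blockIdx.y packs both the output row (blockIdx.y % h) and a slice
        // of blockDim.z disparities (blockIdx.y / h).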
        template <typename T, int winsz, int channels>
        __global__ void init_data_cost_reduce(int level, int rows, int cols, int h)
        {
            int x_out = blockIdx.x;
            int y_out = blockIdx.y % h;
            int d = (blockIdx.y / h) * blockDim.z + threadIdx.z;

            int tid = threadIdx.x;

            if (d < cndisp)
            {
                int x0 = x_out << level;
                int y0 = y_out << level;

                int len = ::min(y0 + winsz, rows) - y0;

                float val = 0.0f;
                if (x0 + tid < cols)
                {
                    if (x0 + tid - d < 0 || d < cth)
                        val = cdata_weight * cmax_data_term * len;
                    else
                    {
                        const uchar* lle =  cleft + y0 * cimg_step + channels * (x0 + tid    );
                        const uchar* lri = cright + y0 * cimg_step + channels * (x0 + tid - d);

                        for(int y = 0; y < len; ++y)
                        {
                            val += DataCostPerPixel<channels>::compute(lle, lri);

                            lle += cimg_step;
                            lri += cimg_step;
                        }
                    }
                }

                extern __shared__ float smem[];

                reduce<winsz>(smem + winsz * threadIdx.z, val, tid, plus<float>());

                T* data_cost = (T*)ctemp + y_out * cmsg_step + x_out;

                if (tid == 0)
                    data_cost[cdisp_step1 * d] = saturate_cast<T>(val);
            }
        }

        template <typename T>
        void init_data_cost_caller_(int /*rows*/, int /*cols*/, int h, int w, int level, int /*ndisp*/, int channels, cudaStream_t stream)
        {
            dim3 threads(32, 8, 1);
            dim3 grid(1, 1, 1);

            grid.x = divUp(w, threads.x);
            grid.y = divUp(h, threads.y);

            switch (channels)
            {
            case 1: init_data_cost<T, 1><<<grid, threads, 0, stream>>>(h, w, level); break;
            case 3: init_data_cost<T, 3><<<grid, threads, 0, stream>>>(h, w, level); break;
            case 4: init_data_cost<T, 4><<<grid, threads, 0, stream>>>(h, w, level); break;
            default: cv::gpu::error("Unsupported channels count", __FILE__, __LINE__, "init_data_cost_caller_");
            }
        }

        template <typename T, int winsz>
        void init_data_cost_reduce_caller_(int rows, int cols, int h, int w, int level, int ndisp, int channels, cudaStream_t stream)
        {
            const int threadsNum = 256;
            const size_t smem_size = threadsNum * sizeof(float);

            dim3 threads(winsz, 1, threadsNum / winsz);
            dim3 grid(w, h, 1);
            grid.y *= divUp(ndisp, threads.z);

            switch (channels)
            {
            case 1: init_data_cost_reduce<T, winsz, 1><<<grid, threads, smem_size, stream>>>(level, rows, cols, h); break;
            case 3: init_data_cost_reduce<T, winsz, 3><<<grid, threads, smem_size, stream>>>(level, rows, cols, h); break;
            case 4: init_data_cost_reduce<T, winsz, 4><<<grid, threads, smem_size, stream>>>(level, rows, cols, h); break;
            default: cv::gpu::error("Unsupported channels count", __FILE__, __LINE__, "init_data_cost_reduce_caller_");
            }
        }

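        // Host entry point for the initial data cost. Levels 0 and 1 launch
        // the one-thread-per-cell kernel; deeper levels use the reduction
        // kernel with winsz = 2^level (4 ... 256), selected via the caller
        // table indexed by 'level'.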
        template<class T>
        void init_data_cost(int rows, int cols, T* disp_selected_pyr, T* data_cost_selected, size_t msg_step,
                    int h, int w, int level, int nr_plane, int ndisp, int channels, bool use_local_init_data_cost, cudaStream_t stream)
        {
            typedef void (*InitDataCostCaller)(int rows, int cols, int h, int w, int level, int ndisp, int channels, cudaStream_t stream);

            static const InitDataCostCaller init_data_cost_callers[] =
            {
                init_data_cost_caller_<T>, init_data_cost_caller_<T>, init_data_cost_reduce_caller_<T, 4>,
                init_data_cost_reduce_caller_<T, 8>, init_data_cost_reduce_caller_<T, 16>, init_data_cost_reduce_caller_<T, 32>,
                init_data_cost_reduce_caller_<T, 64>, init_data_cost_reduce_caller_<T, 128>, init_data_cost_reduce_caller_<T, 256>
            };

            size_t disp_step = msg_step * h;
            cudaSafeCall( cudaMemcpyToSymbol(cdisp_step1, &disp_step, sizeof(size_t)) );
            cudaSafeCall( cudaMemcpyToSymbol(cmsg_step,   &msg_step,  sizeof(size_t)) );

            init_data_cost_callers[level](rows, cols, h, w, level, ndisp, channels, stream);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );

            dim3 threads(32, 8, 1);
            dim3 grid(1, 1, 1);

            grid.x = divUp(w, threads.x);
            grid.y = divUp(h, threads.y);
            if (use_local_init_data_cost)
                get_first_k_initial_local<<<grid, threads, 0, stream>>> (data_cost_selected, disp_selected_pyr, h, w, nr_plane);
            else
                get_first_k_initial_global<<<grid, threads, 0, stream>>>(data_cost_selected, disp_selected_pyr, h, w, nr_plane);

            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        template void init_data_cost(int rows, int cols, short* disp_selected_pyr, short* data_cost_selected, size_t msg_step,
                    int h, int w, int level, int nr_plane, int ndisp, int channels, bool use_local_init_data_cost, cudaStream_t stream);

        template void init_data_cost(int rows, int cols, float* disp_selected_pyr, float* data_cost_selected, size_t msg_step,
                    int h, int w, int level, int nr_plane, int ndisp, int channels, bool use_local_init_data_cost, cudaStream_t stream);

        ///////////////////////////////////////////////////////////////
        ////////////////////// compute data cost //////////////////////
        ///////////////////////////////////////////////////////////////

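        // Data cost at finer levels: the same block aggregation as
        // init_data_cost, but evaluated only at the nr_plane candidate
        // disparities selected for the parent pixel (y/2, x/2) at the coarser
        // level.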
        template <typename T, int channels>
        __global__ void compute_data_cost(const T* selected_disp_pyr, T* data_cost_, int h, int w, int level, int nr_plane)
        {
            int x = blockIdx.x * blockDim.x + threadIdx.x;
            int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (y < h && x < w)
            {
                int y0 = y << level;
                int yt = (y + 1) << level;

                int x0 = x << level;
                int xt = (x + 1) << level;

                const T* selected_disparity = selected_disp_pyr + y/2 * cmsg_step + x/2;
                T* data_cost = data_cost_ + y * cmsg_step + x;

                for(int d = 0; d < nr_plane; d++)
                {
                    float val = 0.0f;
                    for(int yi = y0; yi < yt; yi++)
                    {
                        for(int xi = x0; xi < xt; xi++)
                        {
                            int sel_disp = selected_disparity[d * cdisp_step2];
                            int xr = xi - sel_disp;

                            if (xr < 0 || sel_disp < cth)
                                val += cdata_weight * cmax_data_term;
                            else
                            {
                                const uchar* left_x = cleft + yi * cimg_step + xi * channels;
                                const uchar* right_x = cright + yi * cimg_step + xr * channels;

                                val += DataCostPerPixel<channels>::compute(left_x, right_x);
                            }
                        }
                    }
                    data_cost[cdisp_step1 * d] = saturate_cast<T>(val);
                }
            }
        }

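        // Reduction variant, analogous to init_data_cost_reduce but restricted
        // to the coarser level's selected candidates.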
        template <typename T, int winsz, int channels>
        __global__ void compute_data_cost_reduce(const T* selected_disp_pyr, T* data_cost_, int level, int rows, int cols, int h, int nr_plane)
        {
            int x_out = blockIdx.x;
            int y_out = blockIdx.y % h;
            int d = (blockIdx.y / h) * blockDim.z + threadIdx.z;

            int tid = threadIdx.x;

            const T* selected_disparity = selected_disp_pyr + y_out/2 * cmsg_step + x_out/2;
            T* data_cost = data_cost_ + y_out * cmsg_step + x_out;

            if (d < nr_plane)
            {
                int sel_disp = selected_disparity[d * cdisp_step2];

                int x0 = x_out << level;
                int y0 = y_out << level;

                int len = ::min(y0 + winsz, rows) - y0;

                float val = 0.0f;
                if (x0 + tid < cols)
                {
                    if (x0 + tid - sel_disp < 0 || sel_disp < cth)
                        val = cdata_weight * cmax_data_term * len;
                    else
                    {
                        const uchar* lle =  cleft + y0 * cimg_step + channels * (x0 + tid    );
                        const uchar* lri = cright + y0 * cimg_step + channels * (x0 + tid - sel_disp);

                        for(int y = 0; y < len; ++y)
                        {
                            val += DataCostPerPixel<channels>::compute(lle, lri);

                            lle += cimg_step;
                            lri += cimg_step;
                        }
                    }
                }

                extern __shared__ float smem[];

                reduce<winsz>(smem + winsz * threadIdx.z, val, tid, plus<float>());

                if (tid == 0)
                    data_cost[cdisp_step1 * d] = saturate_cast<T>(val);
            }
        }

        template <typename T>
        void compute_data_cost_caller_(const T* disp_selected_pyr, T* data_cost, int /*rows*/, int /*cols*/,
                                      int h, int w, int level, int nr_plane, int channels, cudaStream_t stream)
        {
            dim3 threads(32, 8, 1);
            dim3 grid(1, 1, 1);

            grid.x = divUp(w, threads.x);
            grid.y = divUp(h, threads.y);

            switch(channels)
            {
            case 1: compute_data_cost<T, 1><<<grid, threads, 0, stream>>>(disp_selected_pyr, data_cost, h, w, level, nr_plane); break;
            case 3: compute_data_cost<T, 3><<<grid, threads, 0, stream>>>(disp_selected_pyr, data_cost, h, w, level, nr_plane); break;
            case 4: compute_data_cost<T, 4><<<grid, threads, 0, stream>>>(disp_selected_pyr, data_cost, h, w, level, nr_plane); break;
            default: cv::gpu::error("Unsupported channels count", __FILE__, __LINE__, "compute_data_cost_caller_");
            }
        }

        template <typename T, int winsz>
        void compute_data_cost_reduce_caller_(const T* disp_selected_pyr, T* data_cost, int rows, int cols,
                                      int h, int w, int level, int nr_plane, int channels, cudaStream_t stream)
        {
            const int threadsNum = 256;
            const size_t smem_size = threadsNum * sizeof(float);

            dim3 threads(winsz, 1, threadsNum / winsz);
            dim3 grid(w, h, 1);
            grid.y *= divUp(nr_plane, threads.z);

            switch (channels)
            {
            case 1: compute_data_cost_reduce<T, winsz, 1><<<grid, threads, smem_size, stream>>>(disp_selected_pyr, data_cost, level, rows, cols, h, nr_plane); break;
            case 3: compute_data_cost_reduce<T, winsz, 3><<<grid, threads, smem_size, stream>>>(disp_selected_pyr, data_cost, level, rows, cols, h, nr_plane); break;
            case 4: compute_data_cost_reduce<T, winsz, 4><<<grid, threads, smem_size, stream>>>(disp_selected_pyr, data_cost, level, rows, cols, h, nr_plane); break;
            default: cv::gpu::error("Unsupported channels count", __FILE__, __LINE__, "compute_data_cost_reduce_caller_");
            }
        }

        template<class T>
        void compute_data_cost(const T* disp_selected_pyr, T* data_cost, size_t msg_step,
                               int rows, int cols, int h, int w, int h2, int level, int nr_plane, int channels, cudaStream_t stream)
        {
            typedef void (*ComputeDataCostCaller)(const T* disp_selected_pyr, T* data_cost, int rows, int cols,
                int h, int w, int level, int nr_plane, int channels, cudaStream_t stream);

            static const ComputeDataCostCaller callers[] =
            {
                compute_data_cost_caller_<T>, compute_data_cost_caller_<T>, compute_data_cost_reduce_caller_<T, 4>,
                compute_data_cost_reduce_caller_<T, 8>, compute_data_cost_reduce_caller_<T, 16>, compute_data_cost_reduce_caller_<T, 32>,
                compute_data_cost_reduce_caller_<T, 64>, compute_data_cost_reduce_caller_<T, 128>, compute_data_cost_reduce_caller_<T, 256>
            };

            size_t disp_step1 = msg_step * h;
            size_t disp_step2 = msg_step * h2;
            cudaSafeCall( cudaMemcpyToSymbol(cdisp_step1, &disp_step1, sizeof(size_t)) );
            cudaSafeCall( cudaMemcpyToSymbol(cdisp_step2, &disp_step2, sizeof(size_t)) );
            cudaSafeCall( cudaMemcpyToSymbol(cmsg_step,   &msg_step,   sizeof(size_t)) );

            callers[level](disp_selected_pyr, data_cost, rows, cols, h, w, level, nr_plane, channels, stream);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        template void compute_data_cost(const short* disp_selected_pyr, short* data_cost, size_t msg_step,
                               int rows, int cols, int h, int w, int h2, int level, int nr_plane, int channels, cudaStream_t stream);

        template void compute_data_cost(const float* disp_selected_pyr, float* data_cost, size_t msg_step,
                               int rows, int cols, int h, int w, int h2, int level, int nr_plane, int channels, cudaStream_t stream);

        ///////////////////////////////////////////////////////////////
        //////////////////////// init message /////////////////////////
        ///////////////////////////////////////////////////////////////

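        // Coarse-to-fine message initialisation: for every pixel of the finer
        // level, rank the parent pixel's nr_plane2 candidates by data cost
        // plus the messages arriving at the parent, keep the best nr_plane of
        // them, and seed the new messages from the parent's messages.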
        template <typename T>
        __device__ void get_first_k_element_increase(T* u_new, T* d_new, T* l_new, T* r_new,
                                                     const T* u_cur, const T* d_cur, const T* l_cur, const T* r_cur,
                                                     T* data_cost_selected, T* disparity_selected_new, T* data_cost_new,
                                                     const T* data_cost_cur, const T* disparity_selected_cur,
                                                     int nr_plane, int nr_plane2)
        {
            for(int i = 0; i < nr_plane; i++)
            {
                T minimum = numeric_limits<T>::max();
                int id = 0;
                for(int j = 0; j < nr_plane2; j++)
                {
                    T cur = data_cost_new[j * cdisp_step1];
                    if(cur < minimum)
                    {
                        minimum = cur;
                        id = j;
                    }
                }

                data_cost_selected[i * cdisp_step1] = data_cost_cur[id * cdisp_step1];
                disparity_selected_new[i * cdisp_step1] = disparity_selected_cur[id * cdisp_step2];

                u_new[i * cdisp_step1] = u_cur[id * cdisp_step2];
                d_new[i * cdisp_step1] = d_cur[id * cdisp_step2];
                l_new[i * cdisp_step1] = l_cur[id * cdisp_step2];
                r_new[i * cdisp_step1] = r_cur[id * cdisp_step2];

                data_cost_new[id * cdisp_step1] = numeric_limits<T>::max();
            }
        }

        template <typename T>
        __global__ void init_message(T* u_new_, T* d_new_, T* l_new_, T* r_new_,
                                     const T* u_cur_, const T* d_cur_, const T* l_cur_, const T* r_cur_,
                                     T* selected_disp_pyr_new, const T* selected_disp_pyr_cur,
                                     T* data_cost_selected_, const T* data_cost_,
                                     int h, int w, int nr_plane, int h2, int w2, int nr_plane2)
        {
            int x = blockIdx.x * blockDim.x + threadIdx.x;
            int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (y < h && x < w)
            {
                const T* u_cur = u_cur_ + ::min(h2-1, y/2 + 1) * cmsg_step + x/2;
                const T* d_cur = d_cur_ + ::max(0, y/2 - 1)    * cmsg_step + x/2;
                const T* l_cur = l_cur_ + (y/2)                * cmsg_step + ::min(w2-1, x/2 + 1);
                const T* r_cur = r_cur_ + (y/2)                * cmsg_step + ::max(0, x/2 - 1);

                T* data_cost_new = (T*)ctemp + y * cmsg_step + x;

                const T* disparity_selected_cur = selected_disp_pyr_cur + y/2 * cmsg_step + x/2;
                const T* data_cost = data_cost_ + y * cmsg_step + x;

                for(int d = 0; d < nr_plane2; d++)
                {
                    int idx2 = d * cdisp_step2;

                    T val  = data_cost[d * cdisp_step1] + u_cur[idx2] + d_cur[idx2] + l_cur[idx2] + r_cur[idx2];
                    data_cost_new[d * cdisp_step1] = val;
                }

                T* data_cost_selected = data_cost_selected_ + y * cmsg_step + x;
                T* disparity_selected_new = selected_disp_pyr_new + y * cmsg_step + x;

                T* u_new = u_new_ + y * cmsg_step + x;
                T* d_new = d_new_ + y * cmsg_step + x;
                T* l_new = l_new_ + y * cmsg_step + x;
                T* r_new = r_new_ + y * cmsg_step + x;

                u_cur = u_cur_ + y/2 * cmsg_step + x/2;
                d_cur = d_cur_ + y/2 * cmsg_step + x/2;
                l_cur = l_cur_ + y/2 * cmsg_step + x/2;
                r_cur = r_cur_ + y/2 * cmsg_step + x/2;

                get_first_k_element_increase(u_new, d_new, l_new, r_new, u_cur, d_cur, l_cur, r_cur,
                                             data_cost_selected, disparity_selected_new, data_cost_new,
                                             data_cost, disparity_selected_cur, nr_plane, nr_plane2);
            }
        }

        template<class T>
        void init_message(T* u_new, T* d_new, T* l_new, T* r_new,
                          const T* u_cur, const T* d_cur, const T* l_cur, const T* r_cur,
                          T* selected_disp_pyr_new, const T* selected_disp_pyr_cur,
                          T* data_cost_selected, const T* data_cost, size_t msg_step,
                          int h, int w, int nr_plane, int h2, int w2, int nr_plane2, cudaStream_t stream)
        {
            size_t disp_step1 = msg_step * h;
            size_t disp_step2 = msg_step * h2;
            cudaSafeCall( cudaMemcpyToSymbol(cdisp_step1, &disp_step1, sizeof(size_t)) );
            cudaSafeCall( cudaMemcpyToSymbol(cdisp_step2, &disp_step2, sizeof(size_t)) );
            cudaSafeCall( cudaMemcpyToSymbol(cmsg_step,   &msg_step,   sizeof(size_t)) );

            dim3 threads(32, 8, 1);
            dim3 grid(1, 1, 1);

            grid.x = divUp(w, threads.x);
            grid.y = divUp(h, threads.y);

            init_message<<<grid, threads, 0, stream>>>(u_new, d_new, l_new, r_new,
                                                       u_cur, d_cur, l_cur, r_cur,
                                                       selected_disp_pyr_new, selected_disp_pyr_cur,
                                                       data_cost_selected, data_cost,
                                                       h, w, nr_plane, h2, w2, nr_plane2);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        template void init_message(short* u_new, short* d_new, short* l_new, short* r_new,
                          const short* u_cur, const short* d_cur, const short* l_cur, const short* r_cur,
                          short* selected_disp_pyr_new, const short* selected_disp_pyr_cur,
                          short* data_cost_selected, const short* data_cost, size_t msg_step,
                          int h, int w, int nr_plane, int h2, int w2, int nr_plane2, cudaStream_t stream);

        template void init_message(float* u_new, float* d_new, float* l_new, float* r_new,
                          const float* u_cur, const float* d_cur, const float* l_cur, const float* r_cur,
                          float* selected_disp_pyr_new, const float* selected_disp_pyr_cur,
                          float* data_cost_selected, const float* data_cost, size_t msg_step,
                          int h, int w, int nr_plane, int h2, int w2, int nr_plane2, cudaStream_t stream);

        ///////////////////////////////////////////////////////////////
        ////////////////////  calc all iterations /////////////////////
        ///////////////////////////////////////////////////////////////

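        // One belief-propagation message update with a linear, truncated
        // smoothness model. h(d2) = data cost + the three other incoming
        // messages; the outgoing message for each of the receiver's candidate
        // disparities is the minimum over d2 of
        //     h(d2) + cdisc_single_jump * |disparity difference|,
        // truncated at min(h) + cmax_disc_term, then normalised to zero mean
        // (the per-pixel scratch buffer 'temp' holds the intermediate values).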
        template <typename T>
        __device__ void message_per_pixel(const T* data, T* msg_dst, const T* msg1, const T* msg2, const T* msg3,
                                          const T* dst_disp, const T* src_disp, int nr_plane, volatile T* temp)
        {
            T minimum = numeric_limits<T>::max();

            for(int d = 0; d < nr_plane; d++)
            {
                int idx = d * cdisp_step1;
                T val  = data[idx] + msg1[idx] + msg2[idx] + msg3[idx];

                if(val < minimum)
                    minimum = val;

                msg_dst[idx] = val;
            }

            float sum = 0;
            for(int d = 0; d < nr_plane; d++)
            {
                float cost_min = minimum + cmax_disc_term;
                T src_disp_reg = src_disp[d * cdisp_step1];

                for(int d2 = 0; d2 < nr_plane; d2++)
                    cost_min = fmin(cost_min, msg_dst[d2 * cdisp_step1] + cdisc_single_jump * ::abs(dst_disp[d2 * cdisp_step1] - src_disp_reg));

                temp[d * cdisp_step1] = saturate_cast<T>(cost_min);
                sum += cost_min;
            }
            sum /= nr_plane;

            for(int d = 0; d < nr_plane; d++)
                msg_dst[d * cdisp_step1] = saturate_cast<T>(temp[d * cdisp_step1] - sum);
        }

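        // Red-black (checkerboard) schedule: on iteration i only pixels with
        // (x + y + i) even are updated, so each half-sweep reads the messages
        // written by the previous one. An active pixel refreshes all four of
        // its outgoing messages in place.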
        template <typename T>
        __global__ void compute_message(T* u_, T* d_, T* l_, T* r_, const T* data_cost_selected, const T* selected_disp_pyr_cur, int h, int w, int nr_plane, int i)
        {
            int y = blockIdx.y * blockDim.y + threadIdx.y;
            int x = ((blockIdx.x * blockDim.x + threadIdx.x) << 1) + ((y + i) & 1);

            if (y > 0 && y < h - 1 && x > 0 && x < w - 1)
            {
                const T* data = data_cost_selected + y * cmsg_step + x;

                T* u = u_ + y * cmsg_step + x;
                T* d = d_ + y * cmsg_step + x;
                T* l = l_ + y * cmsg_step + x;
                T* r = r_ + y * cmsg_step + x;

                const T* disp = selected_disp_pyr_cur + y * cmsg_step + x;

                T* temp = (T*)ctemp + y * cmsg_step + x;

                message_per_pixel(data, u, r - 1, u + cmsg_step, l + 1, disp, disp - cmsg_step, nr_plane, temp);
                message_per_pixel(data, d, d - cmsg_step, r - 1, l + 1, disp, disp + cmsg_step, nr_plane, temp);
                message_per_pixel(data, l, u + cmsg_step, d - cmsg_step, l + 1, disp, disp - 1, nr_plane, temp);
                message_per_pixel(data, r, u + cmsg_step, d - cmsg_step, r - 1, disp, disp + 1, nr_plane, temp);
            }
        }

        template<class T>
        void calc_all_iterations(T* u, T* d, T* l, T* r, const T* data_cost_selected,
            const T* selected_disp_pyr_cur, size_t msg_step, int h, int w, int nr_plane, int iters, cudaStream_t stream)
        {
            size_t disp_step = msg_step * h;
            cudaSafeCall( cudaMemcpyToSymbol(cdisp_step1, &disp_step, sizeof(size_t)) );
            cudaSafeCall( cudaMemcpyToSymbol(cmsg_step,   &msg_step,  sizeof(size_t)) );

            dim3 threads(32, 8, 1);
            dim3 grid(1, 1, 1);

            grid.x = divUp(w, threads.x << 1);
            grid.y = divUp(h, threads.y);

            for(int t = 0; t < iters; ++t)
            {
                compute_message<<<grid, threads, 0, stream>>>(u, d, l, r, data_cost_selected, selected_disp_pyr_cur, h, w, nr_plane, t & 1);
                cudaSafeCall( cudaGetLastError() );
            }
            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        template void calc_all_iterations(short* u, short* d, short* l, short* r, const short* data_cost_selected, const short* selected_disp_pyr_cur, size_t msg_step,
            int h, int w, int nr_plane, int iters, cudaStream_t stream);

        template void calc_all_iterations(float* u, float* d, float* l, float* r, const float* data_cost_selected, const float* selected_disp_pyr_cur, size_t msg_step,
            int h, int w, int nr_plane, int iters, cudaStream_t stream);

        ///////////////////////////////////////////////////////////////
        /////////////////////////// output ////////////////////////////
        ///////////////////////////////////////////////////////////////

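        // Winner-take-all readout: for each interior pixel, pick the candidate
        // plane minimising data cost plus all four incoming messages and write
        // its disparity to the output map.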
        template <typename T>
        __global__ void compute_disp(const T* u_, const T* d_, const T* l_, const T* r_,
                                     const T* data_cost_selected, const T* disp_selected_pyr,
                                     PtrStepSz<short> disp, int nr_plane)
        {
            int x = blockIdx.x * blockDim.x + threadIdx.x;
            int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (y > 0 && y < disp.rows - 1 && x > 0 && x < disp.cols - 1)
            {
                const T* data = data_cost_selected + y * cmsg_step + x;
                const T* disp_selected = disp_selected_pyr + y * cmsg_step + x;

                const T* u = u_ + (y+1) * cmsg_step + (x+0);
                const T* d = d_ + (y-1) * cmsg_step + (x+0);
                const T* l = l_ + (y+0) * cmsg_step + (x+1);
                const T* r = r_ + (y+0) * cmsg_step + (x-1);

                int best = 0;
                T best_val = numeric_limits<T>::max();
                for (int i = 0; i < nr_plane; ++i)
                {
                    int idx = i * cdisp_step1;
                    T val = data[idx] + u[idx] + d[idx] + l[idx] + r[idx];

                    if (val < best_val)
                    {
                        best_val = val;
                        best = saturate_cast<short>(disp_selected[idx]);
                    }
                }
                disp(y, x) = best;
            }
        }

        template<class T>
        void compute_disp(const T* u, const T* d, const T* l, const T* r, const T* data_cost_selected, const T* disp_selected, size_t msg_step,
            const PtrStepSz<short>& disp, int nr_plane, cudaStream_t stream)
        {
            size_t disp_step = disp.rows * msg_step;
            cudaSafeCall( cudaMemcpyToSymbol(cdisp_step1, &disp_step, sizeof(size_t)) );
            cudaSafeCall( cudaMemcpyToSymbol(cmsg_step,   &msg_step,  sizeof(size_t)) );

            dim3 threads(32, 8, 1);
            dim3 grid(1, 1, 1);

            grid.x = divUp(disp.cols, threads.x);
            grid.y = divUp(disp.rows, threads.y);

            compute_disp<<<grid, threads, 0, stream>>>(u, d, l, r, data_cost_selected, disp_selected, disp, nr_plane);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        template void compute_disp(const short* u, const short* d, const short* l, const short* r, const short* data_cost_selected, const short* disp_selected, size_t msg_step,
            const PtrStepSz<short>& disp, int nr_plane, cudaStream_t stream);

        template void compute_disp(const float* u, const float* d, const float* l, const float* r, const float* data_cost_selected, const float* disp_selected, size_t msg_step,
            const PtrStepSz<short>& disp, int nr_plane, cudaStream_t stream);
    } // namespace stereocsbp
}}} // namespace cv { namespace gpu { namespace device

#endif /* CUDA_DISABLER */