1 /*M///////////////////////////////////////////////////////////////////////////////////////
3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
5 // By downloading, copying, installing or using the software you agree to this license.
6 // If you do not agree to this license, do not download, install,
7 // copy or use the software.
11 // For Open Source Computer Vision Library
13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
15 // Third party copyrights are property of their respective owners.
17 // Redistribution and use in source and binary forms, with or without modification,
18 // are permitted provided that the following conditions are met:
20 // * Redistribution's of source code must retain the above copyright notice,
21 // this list of conditions and the following disclaimer.
23 // * Redistribution's in binary form must reproduce the above copyright notice,
24 // this list of conditions and the following disclaimer in the documentation
25 // and/or other materials provided with the distribution.
27 // * The name of the copyright holders may not be used to endorse or promote products
28 // derived from this software without specific prior written permission.
30 // This software is provided by the copyright holders and contributors "as is" and
31 // any express or implied warranties, including, but not limited to, the implied
32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
33 // In no event shall the Intel Corporation or contributors be liable for any direct,
34 // indirect, incidental, special, exemplary, or consequential damages
35 // (including, but not limited to, procurement of substitute goods or services;
36 // loss of use, data, or profits; or business interruption) however caused
37 // and on any theory of liability, whether in contract, strict liability,
38 // or tort (including negligence or otherwise) arising in any way out of
39 // the use of this software, even if advised of the possibility of such damage.
43 #if !defined CUDA_DISABLER
45 #include "internal_shared.hpp"
46 #include "opencv2/gpu/device/common.hpp"
47 #include "opencv2/gpu/device/border_interpolate.hpp"
49 #define tx threadIdx.x
50 #define ty threadIdx.y
53 #define bdx blockDim.x
54 #define bdy blockDim.y
57 #define MAX_KSIZE_HALF 100
59 namespace cv { namespace gpu { namespace device { namespace optflow_farneback
// Constant-memory coefficients for the polynomial-expansion kernel.
// Filled from the host by setPolynomialExpansionConsts() below; indices
// 0..polyN are used, and the kernel is instantiated with polyN = 5 or 7
// (see polynomialExpansionGpu), hence the fixed size of 8.
// c_g   : 1-D Gaussian weights g[k]
// c_xg  : x-weighted Gaussian weights xg[k]
// c_xxg : x^2-weighted Gaussian weights xxg[k]
// c_ig11/c_ig03/c_ig33/c_ig55: elements of the inverse Gram matrix used to
// turn the filter responses b1..b6 into polynomial coefficients.
61 __constant__ float c_g[8];
62 __constant__ float c_xg[8];
63 __constant__ float c_xxg[8];
64 __constant__ float c_ig11, c_ig03, c_ig33, c_ig55;
// Per-pixel quadratic polynomial expansion (Farneback).  For each pixel it
// computes filter responses b1..b6 by a separable convolution (vertical pass
// into shared memory, horizontal pass over the shared rows), then projects
// them through the inverse Gram coefficients c_ig* and writes 5 coefficient
// planes stacked vertically in dst (rows [0,h), [h,2h), ... [4h,5h)).
// NOTE(review): this is a template over the compile-time half-size polyN
// (instantiated with 5 and 7 in polynomialExpansionGpu); the `template` line
// is not visible in this excerpt.
// Shared memory: 3 * bdx floats (one row each for the g-, xg- and
// xxg-filtered vertical results), sized by the host wrapper.
68 __global__ void polynomialExpansion(
69 const int height, const int width, const PtrStepf src, PtrStepf dst)
// Each block produces bdx - 2*polyN output columns; the polyN columns on
// each side of the block are halo threads that only help fill shared memory.
71 const int y = by * bdy + ty;
72 const int x = bx * (bdx - 2*polyN) + tx - polyN;
76 extern __shared__ float smem[];
77 volatile float *row = smem + tx;
// Replicate-border clamp for the horizontal coordinate.
78 int xWarped = ::min(::max(x, 0), width - 1);
// Vertical pass: accumulate the three 1-D responses for this column.
// NOTE(review): the zero-initialization of row[bdx] and row[2*bdx], and the
// __syncthreads() barriers around the shared-memory phases, are on lines not
// visible in this excerpt.
80 row[0] = src(y, xWarped) * c_g[0];
84 for (int k = 1; k <= polyN; ++k)
86 float t0 = src(::max(y - k, 0), xWarped);
87 float t1 = src(::min(y + k, height - 1), xWarped);
// Symmetric (g, xxg) and antisymmetric (xg) vertical accumulation with
// replicate clamping at the top/bottom borders.
89 row[0] += c_g[k] * (t0 + t1);
90 row[bdx] += c_xg[k] * (t1 - t0);
91 row[2*bdx] += c_xxg[k] * (t0 + t1);
// Horizontal pass: only interior (non-halo) threads with a valid x produce
// output; they read +-polyN neighbors from the three shared rows.
96 if (tx >= polyN && tx + polyN < bdx && x < width)
98 float b1 = c_g[0] * row[0];
99 float b3 = c_g[0] * row[bdx];
100 float b5 = c_g[0] * row[2*bdx];
101 float b2 = 0, b4 = 0, b6 = 0;
103 for (int k = 1; k <= polyN; ++k)
105 b1 += (row[k] + row[-k]) * c_g[k];
106 b4 += (row[k] + row[-k]) * c_xxg[k];
107 b2 += (row[k] - row[-k]) * c_xg[k];
108 b3 += (row[k + bdx] + row[-k + bdx]) * c_g[k];
109 b6 += (row[k + bdx] - row[-k + bdx]) * c_xg[k];
110 b5 += (row[k + 2*bdx] + row[-k + 2*bdx]) * c_g[k];
// Project responses through the inverse Gram matrix and write the five
// coefficient planes (stacked by multiples of `height`).
113 dst(y, xWarped) = b3*c_ig11;
114 dst(height + y, xWarped) = b2*c_ig11;
115 dst(2*height + y, xWarped) = b1*c_ig03 + b5*c_ig33;
116 dst(3*height + y, xWarped) = b1*c_ig03 + b4*c_ig33;
117 dst(4*height + y, xWarped) = b6*c_ig55;
// Host-side setup: uploads the polynomial-expansion filter weights and
// inverse-Gram coefficients into the __constant__ symbols used by the
// polynomialExpansion kernel.  Must be called before polynomialExpansionGpu.
// polyN: filter half-size (5 or 7); arrays g/xg/xxg each hold polyN+1 taps.
123 void setPolynomialExpansionConsts(
124 int polyN, const float *g, const float *xg, const float *xxg,
125 float ig11, float ig03, float ig33, float ig55)
// polyN+1 taps per filter (index 0 is the center tap).
127 cudaSafeCall(cudaMemcpyToSymbol(c_g, g, (polyN + 1) * sizeof(*g)));
128 cudaSafeCall(cudaMemcpyToSymbol(c_xg, xg, (polyN + 1) * sizeof(*xg)));
129 cudaSafeCall(cudaMemcpyToSymbol(c_xxg, xxg, (polyN + 1) * sizeof(*xxg)));
130 cudaSafeCall(cudaMemcpyToSymbol(c_ig11, &ig11, sizeof(ig11)));
131 cudaSafeCall(cudaMemcpyToSymbol(c_ig03, &ig03, sizeof(ig03)));
132 cudaSafeCall(cudaMemcpyToSymbol(c_ig33, &ig33, sizeof(ig33)));
133 cudaSafeCall(cudaMemcpyToSymbol(c_ig55, &ig55, sizeof(ig55)));
// Host wrapper: launches the templated polynomialExpansion kernel on `src`
// (single-plane float image), writing 5 stacked coefficient planes to `dst`.
// The grid x-dimension overlaps blocks by 2*polyN columns because each block
// only produces block.x - 2*polyN outputs (halo threads load-only).
// NOTE(review): the `dim3 block(...)` declaration and the polyN==5/==7
// dispatch `if/else` are on lines not visible in this excerpt.
137 void polynomialExpansionGpu(const PtrStepSzf &src, int polyN, PtrStepSzf dst, cudaStream_t stream)
140 dim3 grid(divUp(src.cols, block.x - 2*polyN), src.rows);
// Dynamic shared memory: 3 rows of block.x floats (see kernel).
141 int smem = 3 * block.x * sizeof(float);
144 polynomialExpansion<5><<<grid, block, smem, stream>>>(src.rows, src.cols, src, dst);
146 polynomialExpansion<7><<<grid, block, smem, stream>>>(src.rows, src.cols, src, dst);
// Catch launch-configuration errors immediately.
148 cudaSafeCall(cudaGetLastError());
// Blocking sync surfaces asynchronous kernel errors.
// NOTE(review): presumably guarded by `if (stream == 0)` on an elided line,
// matching the other wrappers in this file - confirm against full source.
151 cudaSafeCall(cudaDeviceSynchronize());
155 __constant__ float c_border[BORDER_SIZE + 1];
// Builds the 5-plane linear-system matrix M for the Farneback flow update.
// For each pixel: samples the second frame's polynomial coefficients R1 at
// the flow-displaced position (bilinear interpolation), averages them with
// the first frame's coefficients R0, attenuates near the image border via
// c_border, and writes the normal-equation entries
// (G = [g11 g12; g12 g22], h = [h1; h2]) into the stacked planes of M.
// R0/R1 layout: 5 planes of `height` rows each, as written by
// polynomialExpansion; M uses the same stacked layout.
157 __global__ void updateMatrices(
158 const int height, const int width, const PtrStepf flowx, const PtrStepf flowy,
159 const PtrStepf R0, const PtrStepf R1, PtrStepf M)
161 const int y = by * bdy + ty;
162 const int x = bx * bdx + tx;
// Tail guard: grid may overshoot the image.
164 if (y < height && x < width)
166 float dx = flowx(y, x);
167 float dy = flowy(y, x);
// NOTE(review): the computation of the displaced integer coords (x1, y1)
// and the bilinear fractions (fx, fy) from x+dx, y+dy is on elided lines.
175 float r2, r3, r4, r5, r6;
// Displaced sample fully inside the image: bilinear-interpolate all five
// R1 planes with weights a00..a11.
177 if (x1 >= 0 && y1 >= 0 && x1 < width - 1 && y1 < height - 1)
179 float a00 = (1.f - fx) * (1.f - fy);
180 float a01 = fx * (1.f - fy);
181 float a10 = (1.f - fx) * fy;
// NOTE(review): `float a11 = fx * fy;` is on an elided line.
184 r2 = a00 * R1(y1, x1) +
185 a01 * R1(y1, x1 + 1) +
186 a10 * R1(y1 + 1, x1) +
187 a11 * R1(y1 + 1, x1 + 1);
189 r3 = a00 * R1(height + y1, x1) +
190 a01 * R1(height + y1, x1 + 1) +
191 a10 * R1(height + y1 + 1, x1) +
192 a11 * R1(height + y1 + 1, x1 + 1);
194 r4 = a00 * R1(2*height + y1, x1) +
195 a01 * R1(2*height + y1, x1 + 1) +
196 a10 * R1(2*height + y1 + 1, x1) +
197 a11 * R1(2*height + y1 + 1, x1 + 1);
199 r5 = a00 * R1(3*height + y1, x1) +
200 a01 * R1(3*height + y1, x1 + 1) +
201 a10 * R1(3*height + y1 + 1, x1) +
202 a11 * R1(3*height + y1 + 1, x1 + 1);
204 r6 = a00 * R1(4*height + y1, x1) +
205 a01 * R1(4*height + y1, x1 + 1) +
206 a10 * R1(4*height + y1 + 1, x1) +
207 a11 * R1(4*height + y1 + 1, x1 + 1);
// Average the quadratic-term planes of both frames (r6 gets an extra 0.5
// factor folded in: 0.25 = 0.5 * 0.5).
209 r4 = (R0(2*height + y, x) + r4) * 0.5f;
210 r5 = (R0(3*height + y, x) + r5) * 0.5f;
211 r6 = (R0(4*height + y, x) + r6) * 0.25f;
// Else-branch (displaced sample outside the image): fall back to frame-0
// coefficients only.  NOTE(review): the `else {` line and any r2/r3
// handling in that branch are elided here.
216 r4 = R0(2*height + y, x);
217 r5 = R0(3*height + y, x);
218 r6 = R0(4*height + y, x) * 0.5f;
// Linear-term difference between the frames (the "delta-b" of the update).
221 r2 = (R0(y, x) - r2) * 0.5f;
222 r3 = (R0(height + y, x) - r3) * 0.5f;
// Border taper: product of the attenuation weights for the distance to
// each of the four image edges (clamped at BORDER_SIZE).
// NOTE(review): the `float scale =` start of this statement is elided.
228 c_border[::min(x, BORDER_SIZE)] *
229 c_border[::min(y, BORDER_SIZE)] *
230 c_border[::min(width - x - 1, BORDER_SIZE)] *
231 c_border[::min(height - y - 1, BORDER_SIZE)];
233 r2 *= scale; r3 *= scale; r4 *= scale;
234 r5 *= scale; r6 *= scale;
// Accumulate normal-equation terms: planes 0-2 are G (g11, g12, g22),
// planes 3-4 are the right-hand side h (h1, h2).
236 M(y, x) = r4*r4 + r6*r6;
237 M(height + y, x) = (r4 + r5)*r6;
238 M(2*height + y, x) = r5*r5 + r6*r6;
239 M(3*height + y, x) = r4*r2 + r6*r3;
240 M(4*height + y, x) = r6*r2 + r5*r3;
// Host-side setup: uploads the fixed border-taper table into c_border.
// Weights ramp from 0.14 at the very edge up to 1.0 in the interior,
// damping the flow update near image borders where data is unreliable.
245 void setUpdateMatricesConsts()
247 static const float border[BORDER_SIZE + 1] = {0.14f, 0.14f, 0.4472f, 0.4472f, 0.4472f, 1.f};
248 cudaSafeCall(cudaMemcpyToSymbol(c_border, border, (BORDER_SIZE + 1) * sizeof(*border)));
// Host wrapper for the updateMatrices kernel: one thread per pixel of the
// flow field.  flowx/flowy give the current flow estimate; R0/R1 are the
// 5-plane polynomial expansions of the two frames; M receives the 5-plane
// normal-equation data.
// NOTE(review): the `dim3 block(...)` declaration is on an elided line.
252 void updateMatricesGpu(
253 const PtrStepSzf flowx, const PtrStepSzf flowy, const PtrStepSzf R0, const PtrStepSzf R1,
254 PtrStepSzf M, cudaStream_t stream)
257 dim3 grid(divUp(flowx.cols, block.x), divUp(flowx.rows, block.y));
259 updateMatrices<<<grid, block, 0, stream>>>(flowx.rows, flowx.cols, flowx, flowy, R0, R1, M);
// Catch launch-configuration errors immediately.
261 cudaSafeCall(cudaGetLastError());
// Blocking sync surfaces asynchronous kernel errors.
// NOTE(review): presumably guarded by `if (stream == 0)` on an elided line.
264 cudaSafeCall(cudaDeviceSynchronize());
// Solves the per-pixel 2x2 linear system G * d = h assembled in M
// (after smoothing) and writes the resulting displacement d into
// flowx/flowy.  M layout: planes 0..4 = g11, g12, g22, h1, h2.
268 __global__ void updateFlow(
269 const int height, const int width, const PtrStepf M, PtrStepf flowx, PtrStepf flowy)
271 const int y = by * bdy + ty;
272 const int x = bx * bdx + tx;
// Tail guard: grid may overshoot the image.
274 if (y < height && x < width)
// NOTE(review): `float g11 = M(y, x);` is on an elided line (g11 is used
// below).
277 float g12 = M(height + y, x);
278 float g22 = M(2*height + y, x);
279 float h1 = M(3*height + y, x);
280 float h2 = M(4*height + y, x);
// Cramer's-rule inverse of G with a small regularizer (1e-3) to avoid
// division by a (near-)zero determinant in textureless regions.
282 float detInv = 1.f / (g11*g22 - g12*g12 + 1e-3f);
284 flowx(y, x) = (g11*h2 - g12*h1) * detInv;
285 flowy(y, x) = (g22*h1 - g12*h2) * detInv;
// Host wrapper for the updateFlow kernel: one thread per flow pixel.
// NOTE(review): the `dim3 block(...)` declaration is on an elided line.
290 void updateFlowGpu(const PtrStepSzf M, PtrStepSzf flowx, PtrStepSzf flowy, cudaStream_t stream)
293 dim3 grid(divUp(flowx.cols, block.x), divUp(flowx.rows, block.y));
295 updateFlow<<<grid, block, 0, stream>>>(flowx.rows, flowx.cols, M, flowx, flowy);
// Catch launch-configuration errors immediately.
297 cudaSafeCall(cudaGetLastError());
// Blocking sync surfaces asynchronous kernel errors.
// NOTE(review): presumably guarded by `if (stream == 0)` on an elided line.
300 cudaSafeCall(cudaDeviceSynchronize());
// Separable box filter over a single-plane float image: vertical sums are
// staged in shared memory (one row of bdx + 2*ksizeHalf floats per
// block-row), then each thread sums 2*ksizeHalf + 1 horizontal neighbors
// and scales by boxAreaInv (1 / window area).
// NOTE(review): this definition opens with `/*` and its closing `*/` is not
// visible in this excerpt - it may be commented out in the original file;
// however boxFilterGpu below launches a `boxFilter` kernel, so confirm
// against the full source.
304 /*__global__ void boxFilter(
305 const int height, const int width, const PtrStepf src,
306 const int ksizeHalf, const float boxAreaInv, PtrStepf dst)
308 const int y = by * bdy + ty;
309 const int x = bx * bdx + tx;
311 extern __shared__ float smem[];
// Each thread-row of the block owns one padded shared row.
312 volatile float *row = smem + ty * (bdx + 2*ksizeHalf);
// Cooperative fill: threads stride across the padded width, each computing
// the vertical (column) sum for its shared-memory slot, with replicate
// clamping at the image edges.
317 for (int i = tx; i < bdx + 2*ksizeHalf; i += bdx)
319 int xExt = int(bx * bdx) + i - ksizeHalf;
320 xExt = ::min(::max(xExt, 0), width - 1);
322 row[i] = src(y, xExt);
323 for (int j = 1; j <= ksizeHalf; ++j)
324 row[i] += src(::max(y - j, 0), xExt) + src(::min(y + j, height - 1), xExt);
// Horizontal pass.  NOTE(review): the __syncthreads() between the shared
// write and read phases, the bounds guard, and `float res = row[0];` are
// on lines not visible in this excerpt.
332 row += tx + ksizeHalf;
334 for (int i = 1; i <= ksizeHalf; ++i)
335 res += row[-i] + row[i];
336 dst(y, x) = res * boxAreaInv;
// Host wrapper for the single-plane box filter.  Shared memory holds one
// padded row of block.x + 2*ksizeHalf floats per block-row.
// NOTE(review): the `dim3 block(...)` declaration is on an elided line.
342 void boxFilterGpu(const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, cudaStream_t stream)
345 dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
346 int smem = (block.x + 2*ksizeHalf) * block.y * sizeof(float);
// Normalizer: 1 / (window width * window height), window = 2*ksizeHalf + 1.
348 float boxAreaInv = 1.f / ((1 + 2*ksizeHalf) * (1 + 2*ksizeHalf));
349 boxFilter<<<grid, block, smem, stream>>>(src.rows, src.cols, src, ksizeHalf, boxAreaInv, dst);
// Catch launch-configuration errors immediately.
351 cudaSafeCall(cudaGetLastError());
// NOTE(review): presumably guarded by `if (stream == 0)` on an elided line.
354 cudaSafeCall(cudaDeviceSynchronize());
// Box filter applied simultaneously to all 5 stacked planes of the
// normal-equation matrix M (same 5*height layout as updateMatrices output).
// Vertical sums for the 5 planes are staged in shared memory (5 padded rows
// of smw floats per block-row), then each thread does the horizontal sum
// per plane and scales by boxAreaInv.
358 __global__ void boxFilter5(
359 const int height, const int width, const PtrStepf src,
360 const int ksizeHalf, const float boxAreaInv, PtrStepf dst)
362 const int y = by * bdy + ty;
363 const int x = bx * bdx + tx;
365 extern __shared__ float smem[];
367 const int smw = bdx + 2*ksizeHalf; // shared memory "width"
// 5 consecutive shared rows (one per plane) per block-row.
368 volatile float *row = smem + 5 * ty * smw;
// Cooperative fill: column sums for all 5 planes, replicate-clamped rows.
373 for (int i = tx; i < bdx + 2*ksizeHalf; i += bdx)
375 int xExt = int(bx * bdx) + i - ksizeHalf;
376 xExt = ::min(::max(xExt, 0), width - 1);
379 for (int k = 0; k < 5; ++k)
380 row[k*smw + i] = src(k*height + y, xExt);
382 for (int j = 1; j <= ksizeHalf; ++j)
// NOTE(review): the `row[k*smw + i] +=` left-hand side of this
// accumulation is on an elided line.
384 for (int k = 0; k < 5; ++k)
386 src(k*height + ::max(y - j, 0), xExt) +
387 src(k*height + ::min(y + j, height - 1), xExt);
// Horizontal pass.  NOTE(review): the __syncthreads() barrier, bounds
// guard, and the `float res[5];` initialization loop body are on lines not
// visible in this excerpt.
396 row += tx + ksizeHalf;
400 for (int k = 0; k < 5; ++k)
403 for (int i = 1; i <= ksizeHalf; ++i)
405 for (int k = 0; k < 5; ++k)
406 res[k] += row[k*smw - i] + row[k*smw + i];
// Write the filtered value for each of the 5 planes.
409 for (int k = 0; k < 5; ++k)
410 dst(k*height + y, x) = res[k] * boxAreaInv;
// Host wrapper for boxFilter5: src holds 5 stacked planes, so the logical
// image height is src.rows / 5.  Shared memory: 5 padded rows per block-row.
// NOTE(review): the `dim3 block(...)` declaration is on an elided line.
416 void boxFilter5Gpu(const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, cudaStream_t stream)
418 int height = src.rows / 5;
419 int width = src.cols;
422 dim3 grid(divUp(width, block.x), divUp(height, block.y));
423 int smem = (block.x + 2*ksizeHalf) * 5 * block.y * sizeof(float);
// Normalizer: 1 / window area, window = 2*ksizeHalf + 1 per side.
425 float boxAreaInv = 1.f / ((1 + 2*ksizeHalf) * (1 + 2*ksizeHalf));
426 boxFilter5<<<grid, block, smem, stream>>>(height, width, src, ksizeHalf, boxAreaInv, dst);
// Catch launch-configuration errors immediately.
428 cudaSafeCall(cudaGetLastError());
// NOTE(review): presumably guarded by `if (stream == 0)` on an elided line.
431 cudaSafeCall(cudaDeviceSynchronize());
// Compute-capability-1.1 variant of boxFilter5Gpu.  The visible body is
// identical to boxFilter5Gpu; the difference is expected to be the block
// dimensions (smaller block for CC 1.1 shared-memory/register limits).
// NOTE(review): the `dim3 block(...)` declaration is on an elided line, so
// the actual difference cannot be confirmed from this excerpt.
435 void boxFilter5Gpu_CC11(const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, cudaStream_t stream)
437 int height = src.rows / 5;
438 int width = src.cols;
441 dim3 grid(divUp(width, block.x), divUp(height, block.y));
442 int smem = (block.x + 2*ksizeHalf) * 5 * block.y * sizeof(float);
444 float boxAreaInv = 1.f / ((1 + 2*ksizeHalf) * (1 + 2*ksizeHalf));
445 boxFilter5<<<grid, block, smem, stream>>>(height, width, src, ksizeHalf, boxAreaInv, dst);
447 cudaSafeCall(cudaGetLastError());
// NOTE(review): presumably guarded by `if (stream == 0)` on an elided line.
450 cudaSafeCall(cudaDeviceSynchronize());
454 __constant__ float c_gKer[MAX_KSIZE_HALF + 1];
// Separable Gaussian blur over a single-plane float image, templated on the
// border policy `Border` (idx_col / idx_row_low / idx_row_high map
// out-of-range coordinates).  Vertical convolution results are staged in a
// padded shared row per block-row; the horizontal convolution then reads
// +-ksizeHalf neighbors from shared memory.
456 template <typename Border>
457 __global__ void gaussianBlur(
458 const int height, const int width, const PtrStepf src, const int ksizeHalf,
459 const Border b, PtrStepf dst)
461 const int y = by * bdy + ty;
462 const int x = bx * bdx + tx;
464 extern __shared__ float smem[];
// One padded shared row (bdx + 2*ksizeHalf floats) per block-row.
465 volatile float *row = smem + ty * (bdx + 2*ksizeHalf);
// Cooperative fill: vertical Gaussian convolution for each padded column.
470 for (int i = tx; i < bdx + 2*ksizeHalf; i += bdx)
472 int xExt = int(bx * bdx) + i - ksizeHalf;
473 xExt = b.idx_col(xExt);
474 row[i] = src(y, xExt) * c_gKer[0];
475 for (int j = 1; j <= ksizeHalf; ++j)
// NOTE(review): the `row[i] +=` left-hand side of this accumulation is
// on an elided line.
477 (src(b.idx_row_low(y - j), xExt) +
478 src(b.idx_row_high(y + j), xExt)) * c_gKer[j];
// Horizontal pass.  NOTE(review): the __syncthreads() barrier, the bounds
// guard, and the final `dst(y, x) = res;` write are on lines not visible
// in this excerpt.
486 row += tx + ksizeHalf;
487 float res = row[0] * c_gKer[0];
488 for (int i = 1; i <= ksizeHalf; ++i)
489 res += (row[-i] + row[i]) * c_gKer[i];
// Host-side setup: uploads ksizeHalf+1 Gaussian taps into c_gKer.  Must be
// called before gaussianBlurGpu / gaussianBlur5Gpu.  gKer must hold at most
// MAX_KSIZE_HALF + 1 taps (no bounds check here).
496 void setGaussianBlurKernel(const float *gKer, int ksizeHalf)
498 cudaSafeCall(cudaMemcpyToSymbol(c_gKer, gKer, (ksizeHalf + 1) * sizeof(*gKer)));
// Host launcher for the single-plane gaussianBlur kernel with a specific
// border policy (selected at compile time via the Border template argument;
// see the dispatch table in gaussianBlurGpu).
// NOTE(review): the `dim3 block(...)` declaration is on an elided line.
502 template <typename Border>
503 void gaussianBlurCaller(const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, cudaStream_t stream)
505 int height = src.rows;
506 int width = src.cols;
509 dim3 grid(divUp(width, block.x), divUp(height, block.y));
// One padded shared row per block-row (see kernel).
510 int smem = (block.x + 2*ksizeHalf) * block.y * sizeof(float);
511 Border b(height, width);
513 gaussianBlur<<<grid, block, smem, stream>>>(height, width, src, ksizeHalf, b, dst);
// Catch launch-configuration errors immediately.
515 cudaSafeCall(cudaGetLastError());
// NOTE(review): presumably guarded by `if (stream == 0)` on an elided line.
518 cudaSafeCall(cudaDeviceSynchronize());
// Public entry point: dispatches to the templated caller by border mode.
// borderMode: 0 = reflect-101 border, 1 = replicate border (table order
// below).  No bounds check on borderMode - callers must pass 0 or 1.
522 void gaussianBlurGpu(
523 const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, int borderMode, cudaStream_t stream)
525 typedef void (*caller_t)(const PtrStepSzf, int, PtrStepSzf, cudaStream_t);
527 static const caller_t callers[] =
529 gaussianBlurCaller<BrdReflect101<float> >,
530 gaussianBlurCaller<BrdReplicate<float> >,
533 callers[borderMode](src, ksizeHalf, dst, stream);
// Gaussian blur applied simultaneously to all 5 stacked planes (5*height
// layout, e.g. the matrix M), templated on the border policy.  Mirrors
// boxFilter5's structure: 5 padded shared rows per block-row for the
// vertical pass, then a per-plane horizontal pass.
537 template <typename Border>
538 __global__ void gaussianBlur5(
539 const int height, const int width, const PtrStepf src, const int ksizeHalf,
540 const Border b, PtrStepf dst)
542 const int y = by * bdy + ty;
543 const int x = bx * bdx + tx;
545 extern __shared__ float smem[];
547 const int smw = bdx + 2*ksizeHalf; // shared memory "width"
// 5 consecutive shared rows (one per plane) per block-row.
548 volatile float *row = smem + 5 * ty * smw;
// Cooperative fill: vertical Gaussian convolution for all 5 planes.
553 for (int i = tx; i < bdx + 2*ksizeHalf; i += bdx)
555 int xExt = int(bx * bdx) + i - ksizeHalf;
556 xExt = b.idx_col(xExt);
559 for (int k = 0; k < 5; ++k)
560 row[k*smw + i] = src(k*height + y, xExt) * c_gKer[0];
562 for (int j = 1; j <= ksizeHalf; ++j)
// NOTE(review): the `row[k*smw + i] +=` left-hand side of this
// accumulation is on an elided line.
564 for (int k = 0; k < 5; ++k)
566 (src(k*height + b.idx_row_low(y - j), xExt) +
567 src(k*height + b.idx_row_high(y + j), xExt)) * c_gKer[j];
// Horizontal pass.  NOTE(review): the __syncthreads() barrier, the bounds
// guard, and the `float res[5];` declaration are on lines not visible in
// this excerpt.
576 row += tx + ksizeHalf;
580 for (int k = 0; k < 5; ++k)
581 res[k] = row[k*smw] * c_gKer[0];
583 for (int i = 1; i <= ksizeHalf; ++i)
585 for (int k = 0; k < 5; ++k)
586 res[k] += (row[k*smw - i] + row[k*smw + i]) * c_gKer[i];
// Write the blurred value for each of the 5 planes.
589 for (int k = 0; k < 5; ++k)
590 dst(k*height + y, x) = res[k];
// Host launcher for gaussianBlur5, parameterized on border policy AND block
// width (blockDimX): 256 for modern devices, 128 for the CC 1.1 path (see
// the two dispatch tables below).  src holds 5 stacked planes, hence
// height = src.rows / 5.
596 template <typename Border, int blockDimX>
597 void gaussianBlur5Caller(
598 const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, cudaStream_t stream)
600 int height = src.rows / 5;
601 int width = src.cols;
603 dim3 block(blockDimX);
604 dim3 grid(divUp(width, block.x), divUp(height, block.y));
// 5 padded shared rows per block-row (see kernel).
605 int smem = (block.x + 2*ksizeHalf) * 5 * block.y * sizeof(float);
606 Border b(height, width);
608 gaussianBlur5<<<grid, block, smem, stream>>>(height, width, src, ksizeHalf, b, dst);
// Catch launch-configuration errors immediately.
610 cudaSafeCall(cudaGetLastError());
// NOTE(review): presumably guarded by `if (stream == 0)` on an elided line.
613 cudaSafeCall(cudaDeviceSynchronize());
// Public entry point (default path): 256-wide blocks.
// borderMode: 0 = reflect-101, 1 = replicate (table order below).  No
// bounds check on borderMode - callers must pass 0 or 1.
617 void gaussianBlur5Gpu(
618 const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, int borderMode, cudaStream_t stream)
620 typedef void (*caller_t)(const PtrStepSzf, int, PtrStepSzf, cudaStream_t);
622 static const caller_t callers[] =
624 gaussianBlur5Caller<BrdReflect101<float>,256>,
625 gaussianBlur5Caller<BrdReplicate<float>,256>,
628 callers[borderMode](src, ksizeHalf, dst, stream);
// Public entry point for compute capability 1.1 devices: identical to
// gaussianBlur5Gpu except it instantiates the callers with 128-wide blocks
// to fit older hardware limits.  Same borderMode convention (0/1).
631 void gaussianBlur5Gpu_CC11(
632 const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, int borderMode, cudaStream_t stream)
634 typedef void (*caller_t)(const PtrStepSzf, int, PtrStepSzf, cudaStream_t);
636 static const caller_t callers[] =
638 gaussianBlur5Caller<BrdReflect101<float>,128>,
639 gaussianBlur5Caller<BrdReplicate<float>,128>,
642 callers[borderMode](src, ksizeHalf, dst, stream);
645 }}}} // namespace cv { namespace gpu { namespace device { namespace optflow_farneback
648 #endif /* CUDA_DISABLER */