1 /*M///////////////////////////////////////////////////////////////////////////////////////
\r
3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
\r
5 // By downloading, copying, installing or using the software you agree to this license.
\r
6 // If you do not agree to this license, do not download, install,
\r
7 // copy or use the software.
\r
10 // License Agreement
\r
11 // For Open Source Computer Vision Library
\r
13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
\r
14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
\r
15 // Third party copyrights are property of their respective owners.
\r
17 // Redistribution and use in source and binary forms, with or without modification,
\r
18 // are permitted provided that the following conditions are met:
\r
20 // * Redistribution's of source code must retain the above copyright notice,
\r
21 // this list of conditions and the following disclaimer.
\r
23 // * Redistribution's in binary form must reproduce the above copyright notice,
\r
24 // this list of conditions and the following disclaimer in the documentation
\r
25 // and/or other materials provided with the distribution.
\r
27 // * The name of the copyright holders may not be used to endorse or promote products
\r
28 // derived from this software without specific prior written permission.
\r
30 // This software is provided by the copyright holders and contributors "as is" and
\r
31 // any express or implied warranties, including, but not limited to, the implied
\r
32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
\r
33 // In no event shall the Intel Corporation or contributors be liable for any direct,
\r
34 // indirect, incidental, special, exemplary, or consequential damages
\r
35 // (including, but not limited to, procurement of substitute goods or services;
\r
36 // loss of use, data, or profits; or business interruption) however caused
\r
37 // and on any theory of liability, whether in contract, strict liability,
\r
38 // or tort (including negligence or otherwise) arising in any way out of
\r
39 // the use of this software, even if advised of the possibility of such damage.
\r
43 #include "internal_shared.hpp"
\r
44 #include "opencv2/gpu/device/vec_traits.hpp"
\r
45 #include "opencv2/gpu/device/vec_math.hpp"
\r
46 #include "opencv2/gpu/device/saturate_cast.hpp"
\r
47 #include "opencv2/gpu/device/border_interpolate.hpp"
\r
49 using namespace cv::gpu;
\r
50 using namespace cv::gpu::device;
\r
52 namespace cv { namespace gpu { namespace imgproc
\r
54 /////////////////////////////////// MeanShiftfiltering ///////////////////////////////////////////////
\r
56 texture<uchar4, 2> tex_meanshift;
\r
// Runs the mean-shift iteration for pixel (x0, y0) of the image bound to
// tex_meanshift, writes the converged color to 'out' and returns the
// converged spatial position.
//   sp      - spatial window radius
//   sr      - color window radius
//   maxIter - iteration cap
//   eps     - convergence threshold on combined spatial+color shift
// NOTE(review): the source listing was truncated; braces, accumulator
// declarations and the loop bookkeeping below were restored from the
// visible statements — confirm against the upstream OpenCV file.
__device__ short2 do_mean_shift(int x0, int y0, unsigned char* out,
                                size_t out_step, int cols, int rows,
                                int sp, int sr, int maxIter, float eps)
{
    int isr2 = sr * sr;                      // squared color radius
    uchar4 c = tex2D(tex_meanshift, x0, y0 );

    // iterate meanshift procedure
    for( int iter = 0; iter < maxIter; iter++ )
    {
        int count = 0;
        int s0 = 0, s1 = 0, s2 = 0, sx = 0, sy = 0;
        float icount;

        //mean shift: process pixels in window (p-sigmaSp)x(p+sigmaSp)
        int minx = x0 - sp;
        int miny = y0 - sp;
        int maxx = x0 + sp;
        int maxy = y0 + sp;

        for( int y = miny; y <= maxy; y++)
        {
            int rowCount = 0;
            for( int x = minx; x <= maxx; x++ )
            {
                uchar4 t = tex2D( tex_meanshift, x, y );

                int norm2 = (t.x - c.x) * (t.x - c.x) + (t.y - c.y) * (t.y - c.y) + (t.z - c.z) * (t.z - c.z);
                if( norm2 <= isr2 )     // inside the color window
                {
                    s0 += t.x; s1 += t.y; s2 += t.z;
                    sx += x; rowCount++;
                }
            }
            count += rowCount;
            sy += y * rowCount;
        }

        if( count == 0 )
            break;

        icount = 1.f / count;
        int x1 = __float2int_rz(sx*icount);
        int y1 = __float2int_rz(sy*icount);
        s0 = __float2int_rz(s0*icount);
        s1 = __float2int_rz(s1*icount);
        s2 = __float2int_rz(s2*icount);

        int norm2 = (s0 - c.x) * (s0 - c.x) + (s1 - c.y) * (s1 - c.y) + (s2 - c.z) * (s2 - c.z);

        bool stopFlag = (x0 == x1 && y0 == y1) || (abs(x1-x0) + abs(y1-y0) + norm2 <= eps);

        x0 = x1; y0 = y1;
        c.x = s0; c.y = s1; c.z = s2;

        if( stopFlag )
            break;
    }

    int base = (blockIdx.y * blockDim.y + threadIdx.y) * out_step + (blockIdx.x * blockDim.x + threadIdx.x) * 4 * sizeof(uchar);
    *(uchar4*)(out + base) = c;

    return make_short2((short)x0, (short)y0);
}
\r
// One thread per pixel: in-bounds threads run the full mean-shift iteration
// for their pixel and write the filtered color into 'out'.
extern "C" __global__ void meanshift_kernel( unsigned char* out, size_t out_step, int cols, int rows,
                                             int sp, int sr, int maxIter, float eps )
{
    int x0 = blockIdx.x * blockDim.x + threadIdx.x;
    int y0 = blockIdx.y * blockDim.y + threadIdx.y;

    if( x0 < cols && y0 < rows )
        do_mean_shift(x0, y0, out, out_step, cols, rows, sp, sr, maxIter, eps);
}
\r
// Like meanshift_kernel, but additionally records the converged spatial
// position of every pixel into 'outsp' (one short2 per pixel).
extern "C" __global__ void meanshiftproc_kernel( unsigned char* outr, size_t outrstep,
                                                 unsigned char* outsp, size_t outspstep,
                                                 int cols, int rows,
                                                 int sp, int sr, int maxIter, float eps )
{
    int x0 = blockIdx.x * blockDim.x + threadIdx.x;
    int y0 = blockIdx.y * blockDim.y + threadIdx.y;

    if( x0 < cols && y0 < rows )
    {
        int basesp = (blockIdx.y * blockDim.y + threadIdx.y) * outspstep + (blockIdx.x * blockDim.x + threadIdx.x) * 2 * sizeof(short);
        *(short2*)(outsp + basesp) = do_mean_shift(x0, y0, outr, outrstep, cols, rows, sp, sr, maxIter, eps);
    }
}
\r
// Host wrapper for mean-shift filtering: binds 'src' (uchar4) to the
// 2D texture, launches one thread per pixel, then synchronizes and unbinds.
extern "C" void meanShiftFiltering_gpu(const DevMem2D& src, DevMem2D dst, int sp, int sr, int maxIter, float eps)
{
    dim3 grid(1, 1, 1);
    dim3 threads(32, 8, 1);
    grid.x = divUp(src.cols, threads.x);
    grid.y = divUp(src.rows, threads.y);

    cudaChannelFormatDesc desc = cudaCreateChannelDesc<uchar4>();
    cudaSafeCall( cudaBindTexture2D( 0, tex_meanshift, src.data, desc, src.cols, src.rows, src.step ) );

    meanshift_kernel<<< grid, threads >>>( dst.data, dst.step, dst.cols, dst.rows, sp, sr, maxIter, eps );
    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall( cudaDeviceSynchronize() );

    cudaSafeCall( cudaUnbindTexture( tex_meanshift ) );
}
\r
// Host wrapper for mean-shift "proc" variant: produces both the filtered
// image (dstr) and the per-pixel converged coordinates (dstsp).
extern "C" void meanShiftProc_gpu(const DevMem2D& src, DevMem2D dstr, DevMem2D dstsp, int sp, int sr, int maxIter, float eps)
{
    dim3 grid(1, 1, 1);
    dim3 threads(32, 8, 1);
    grid.x = divUp(src.cols, threads.x);
    grid.y = divUp(src.rows, threads.y);

    cudaChannelFormatDesc desc = cudaCreateChannelDesc<uchar4>();
    cudaSafeCall( cudaBindTexture2D( 0, tex_meanshift, src.data, desc, src.cols, src.rows, src.step ) );

    meanshiftproc_kernel<<< grid, threads >>>( dstr.data, dstr.step, dstsp.data, dstsp.step, dstr.cols, dstr.rows, sp, sr, maxIter, eps );
    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall( cudaDeviceSynchronize() );

    cudaSafeCall( cudaUnbindTexture( tex_meanshift ) );
}
\r
181 /////////////////////////////////// drawColorDisp ///////////////////////////////////////////////
\r
// Maps a disparity value d in [0, ndisp) to a BGRA color via an HSV ramp
// (hue 240..0, i.e. blue for near-zero disparity, red for large).
// Returns the packed color as 0xAARRGGBB.
// NOTE(review): the six hue-sextant bodies were missing from the truncated
// listing and were restored from the visible R/G/B comments (res.x=B,
// res.y=G, res.z=R per the packing below) — confirm against upstream.
template <typename T>
__device__ unsigned int cvtPixel(T d, int ndisp, float S = 1, float V = 1)
{
    unsigned int H = ((ndisp-d) * 240)/ndisp;

    unsigned int hi = (H/60) % 6;
    float f = H/60.f - H/60;
    float p = V * (1 - S);
    float q = V * (1 - f * S);
    float t = V * (1 - (1 - f) * S);

    float3 res;

    if (hi == 0) //R = V, G = t, B = p
    {
        res.x = p;
        res.y = t;
        res.z = V;
    }

    if (hi == 1) // R = q, G = V, B = p
    {
        res.x = p;
        res.y = V;
        res.z = q;
    }

    if (hi == 2) // R = p, G = V, B = t
    {
        res.x = t;
        res.y = V;
        res.z = p;
    }

    if (hi == 3) // R = p, G = q, B = V
    {
        res.x = V;
        res.y = q;
        res.z = p;
    }

    if (hi == 4) // R = t, G = p, B = V
    {
        res.x = V;
        res.y = p;
        res.z = t;
    }

    if (hi == 5) // R = V, G = p, B = q
    {
        res.x = q;
        res.y = p;
        res.z = V;
    }

    const unsigned int b = (unsigned int)(max(0.f, min (res.x, 1.f)) * 255.f);
    const unsigned int g = (unsigned int)(max(0.f, min (res.y, 1.f)) * 255.f);
    const unsigned int r = (unsigned int)(max(0.f, min (res.z, 1.f)) * 255.f);
    const unsigned int a = 255U;

    return (a << 24) + (r << 16) + (g << 8) + b;
}
\r
// Colorizes an 8-bit disparity map. Each thread handles 4 horizontally
// adjacent pixels (x is the left pixel of the quad) and writes one uint4
// (4 packed BGRA colors) per iteration; requires 4-pixel-aligned widths.
__global__ void drawColorDisp(uchar* disp, size_t disp_step, uchar* out_image, size_t out_step, int width, int height, int ndisp)
{
    const int x = (blockIdx.x * blockDim.x + threadIdx.x) << 2;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    if(x < width && y < height)
    {
        uchar4 d4 = *(uchar4*)(disp + y * disp_step + x);

        uint4 res;
        res.x = cvtPixel(d4.x, ndisp);
        res.y = cvtPixel(d4.y, ndisp);
        res.z = cvtPixel(d4.z, ndisp);
        res.w = cvtPixel(d4.w, ndisp);

        uint4* line = (uint4*)(out_image + y * out_step);
        line[x >> 2] = res;
    }
}
\r
// Colorizes a 16-bit disparity map. Each thread handles 2 adjacent pixels
// and writes one uint2 (2 packed BGRA colors); requires 2-pixel alignment.
__global__ void drawColorDisp(short* disp, size_t disp_step, uchar* out_image, size_t out_step, int width, int height, int ndisp)
{
    const int x = (blockIdx.x * blockDim.x + threadIdx.x) << 1;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    if(x < width && y < height)
    {
        short2 d2 = *(short2*)(disp + y * disp_step + x);

        uint2 res;
        res.x = cvtPixel(d2.x, ndisp);
        res.y = cvtPixel(d2.y, ndisp);

        uint2* line = (uint2*)(out_image + y * out_step);
        line[x >> 1] = res;
    }
}
\r
// Host wrapper for the 8-bit colorizer. Each thread covers 4 pixels, hence
// the grid is sized on threads.x << 2.
// NOTE(review): the 'if (stream == 0)' guard before the sync was restored
// from the truncated listing (standard pattern for stream-aware wrappers
// in this module) — confirm against upstream.
void drawColorDisp_gpu(const DevMem2D& src, const DevMem2D& dst, int ndisp, const cudaStream_t& stream)
{
    dim3 threads(16, 16, 1);
    dim3 grid(1, 1, 1);
    grid.x = divUp(src.cols, threads.x << 2);
    grid.y = divUp(src.rows, threads.y);

    drawColorDisp<<<grid, threads, 0, stream>>>(src.data, src.step, dst.data, dst.step, src.cols, src.rows, ndisp);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
\r
// Host wrapper for the 16-bit colorizer. disp_step is passed in elements
// (src.step / sizeof(short)) because the kernel indexes a short*.
// NOTE(review): 'if (stream == 0)' guard restored from truncated listing.
void drawColorDisp_gpu(const DevMem2D_<short>& src, const DevMem2D& dst, int ndisp, const cudaStream_t& stream)
{
    dim3 threads(32, 8, 1);
    dim3 grid(1, 1, 1);
    grid.x = divUp(src.cols, threads.x << 1);
    grid.y = divUp(src.rows, threads.y);

    drawColorDisp<<<grid, threads, 0, stream>>>(src.data, src.step / sizeof(short), dst.data, dst.step, src.cols, src.rows, ndisp);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
\r
312 /////////////////////////////////// reprojectImageTo3D ///////////////////////////////////////////////
\r
314 __constant__ float cq[16];
\r
// Reprojects a disparity map to 3D points using the 4x4 reprojection
// matrix Q stored row-major in constant memory cq[16]:
//   [X Y Z W]^T = Q * [x y d 1]^T,  output point = (X/W, Y/W, Z/W).
// Writes one float4 per pixel into 'xyzw' (steps are in float elements).
// NOTE(review): the x-dependent terms and 'float4 v;' were missing from
// the truncated listing and restored from the visible y-dependent terms —
// confirm against upstream.
template <typename T>
__global__ void reprojectImageTo3D(const T* disp, size_t disp_step, float* xyzw, size_t xyzw_step, int rows, int cols)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (y < rows && x < cols)
    {
        // y-dependent part of Q * [x y d 1]^T ...
        float qx = cq[1] * y + cq[3], qy = cq[5] * y + cq[7];
        float qz = cq[9] * y + cq[11], qw = cq[13] * y + cq[15];

        // ... plus the x-dependent part.
        qx += x * cq[0];
        qy += x * cq[4];
        qz += x * cq[8];
        qw += x * cq[12];

        T d = *(disp + disp_step * y + x);

        float iW = 1.f / (qw + cq[14] * d);
        float4 v;
        v.x = (qx + cq[2] * d) * iW;
        v.y = (qy + cq[6] * d) * iW;
        v.z = (qz + cq[10] * d) * iW;
        v.w = 1.f;

        *(float4*)(xyzw + xyzw_step * y + (x * 4)) = v;
    }
}
\r
// Uploads Q to constant memory and launches the reprojection kernel.
// Steps are converted from bytes to elements for the typed pointers.
// NOTE(review): 'if (stream == 0)' guard restored from truncated listing.
template <typename T>
inline void reprojectImageTo3D_caller(const DevMem2D_<T>& disp, const DevMem2Df& xyzw, const float* q, const cudaStream_t& stream)
{
    dim3 threads(32, 8, 1);
    dim3 grid(1, 1, 1);
    grid.x = divUp(disp.cols, threads.x);
    grid.y = divUp(disp.rows, threads.y);

    cudaSafeCall( cudaMemcpyToSymbol(cq, q, 16 * sizeof(float)) );

    reprojectImageTo3D<<<grid, threads, 0, stream>>>(disp.data, disp.step / sizeof(T), xyzw.data, xyzw.step / sizeof(float), disp.rows, disp.cols);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
\r
// 8-bit disparity entry point; forwards to the templated caller.
void reprojectImageTo3D_gpu(const DevMem2D& disp, const DevMem2Df& xyzw, const float* q, const cudaStream_t& stream)
{
    reprojectImageTo3D_caller(disp, xyzw, q, stream);
}
\r
// 16-bit disparity entry point; forwards to the templated caller.
void reprojectImageTo3D_gpu(const DevMem2D_<short>& disp, const DevMem2Df& xyzw, const float* q, const cudaStream_t& stream)
{
    reprojectImageTo3D_caller(disp, xyzw, q, stream);
}
\r
373 //////////////////////////////////////// Extract Cov Data ////////////////////////////////////////////////
\r
// Builds the 3-plane covariance image from gradient images Dx, Dy:
// plane 0 = Dx^2, plane 1 = Dx*Dy, plane 2 = Dy^2, stacked vertically in
// 'dst' (dst must be 3*rows tall).
__global__ void extractCovData_kernel(const int cols, const int rows, const PtrStepf Dx,
                                      const PtrStepf Dy, PtrStepf dst)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x < cols && y < rows)
    {
        float dx = Dx.ptr(y)[x];
        float dy = Dy.ptr(y)[x];

        dst.ptr(y)[x] = dx * dx;
        dst.ptr(y + rows)[x] = dx * dy;
        dst.ptr(y + (rows << 1))[x] = dy * dy;
    }
}
// Host wrapper: launches extractCovData_kernel over the Dx/Dy extent and
// blocks until completion.
void extractCovData_caller(const DevMem2Df Dx, const DevMem2Df Dy, PtrStepf dst)
{
    dim3 threads(32, 8);
    dim3 grid(divUp(Dx.cols, threads.x), divUp(Dx.rows, threads.y));

    extractCovData_kernel<<<grid, threads>>>(Dx.cols, Dx.rows, Dx, Dy, dst);
    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall( cudaDeviceSynchronize() );
}
403 /////////////////////////////////////////// Corner Harris /////////////////////////////////////////////////
\r
405 texture<float, 2> harrisDxTex;
\r
406 texture<float, 2> harrisDyTex;
\r
// Harris corner response using texture addressing for the border (the
// texture's addressMode supplies out-of-range handling). Sums the
// structure tensor (a=sum dx^2, b=sum dx*dy, c=sum dy^2) over a
// block_size x block_size window and writes det(M) - k*trace(M)^2.
// NOTE(review): 'PtrStep dst' tail of the signature and the a/b/c
// accumulators were restored from the truncated listing.
__global__ void cornerHarris_kernel(const int cols, const int rows, const int block_size, const float k,
                                    PtrStep dst)
{
    const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x < cols && y < rows)
    {
        float a = 0.f;
        float b = 0.f;
        float c = 0.f;

        const int ibegin = y - (block_size / 2);
        const int jbegin = x - (block_size / 2);
        const int iend = ibegin + block_size;
        const int jend = jbegin + block_size;

        for (int i = ibegin; i < iend; ++i)
        {
            for (int j = jbegin; j < jend; ++j)
            {
                float dx = tex2D(harrisDxTex, j, i);
                float dy = tex2D(harrisDyTex, j, i);

                a += dx * dx;
                b += dx * dy;
                c += dy * dy;
            }
        }

        ((float*)dst.ptr(y))[x] = a * c - b * b - k * (a + c) * (a + c);
    }
}
// Harris corner response with explicit border interpolators: border_row /
// border_col remap out-of-range indices before the texture fetch. The
// inner 'x'/'y' deliberately shadow the pixel coordinates only inside the
// window loops; the final store uses the outer pixel coordinates.
// NOTE(review): accumulators restored from the truncated listing.
template <typename BR, typename BC>
__global__ void cornerHarris_kernel(const int cols, const int rows, const int block_size, const float k,
                                    PtrStep dst, BR border_row, BC border_col)
{
    const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x < cols && y < rows)
    {
        float a = 0.f;
        float b = 0.f;
        float c = 0.f;

        const int ibegin = y - (block_size / 2);
        const int jbegin = x - (block_size / 2);
        const int iend = ibegin + block_size;
        const int jend = jbegin + block_size;

        for (int i = ibegin; i < iend; ++i)
        {
            int y = border_col.idx_row(i);
            for (int j = jbegin; j < jend; ++j)
            {
                int x = border_row.idx_col(j);

                float dx = tex2D(harrisDxTex, x, y);
                float dy = tex2D(harrisDyTex, x, y);

                a += dx * dx;
                b += dx * dy;
                c += dy * dy;
            }
        }

        ((float*)dst.ptr(y))[x] = a * c - b * b - k * (a + c) * (a + c);
    }
}
// Host wrapper: binds Dx/Dy to textures, dispatches the Harris kernel for
// the requested border mode, synchronizes and unbinds.
// Fix: the cudaBindTexture2D results were unchecked, inconsistent with the
// cudaSafeCall convention used elsewhere in this file.
// NOTE(review): the trailing 'int border_type' parameter and the 'break;'
// statements were restored from the visible 'switch (border_type)' — the
// fallthrough would otherwise launch both kernels.
void cornerHarris_caller(const int block_size, const float k, const DevMem2D Dx, const DevMem2D Dy, DevMem2D dst,
                         int border_type)
{
    const int rows = Dx.rows;
    const int cols = Dx.cols;

    dim3 threads(32, 8);
    dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));

    cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();
    cudaSafeCall( cudaBindTexture2D(0, harrisDxTex, Dx.data, desc, Dx.cols, Dx.rows, Dx.step) );
    cudaSafeCall( cudaBindTexture2D(0, harrisDyTex, Dy.data, desc, Dy.cols, Dy.rows, Dy.step) );
    harrisDxTex.filterMode = cudaFilterModePoint;
    harrisDyTex.filterMode = cudaFilterModePoint;

    switch (border_type)
    {
    case BORDER_REFLECT101_GPU:
        cornerHarris_kernel<<<grid, threads>>>(
            cols, rows, block_size, k, dst, BrdRowReflect101<void>(cols), BrdColReflect101<void>(rows));
        break;
    case BORDER_REPLICATE_GPU:
        // Replicate border handled by the texture hardware via clamping.
        harrisDxTex.addressMode[0] = cudaAddressModeClamp;
        harrisDxTex.addressMode[1] = cudaAddressModeClamp;
        harrisDyTex.addressMode[0] = cudaAddressModeClamp;
        harrisDyTex.addressMode[1] = cudaAddressModeClamp;
        cornerHarris_kernel<<<grid, threads>>>(cols, rows, block_size, k, dst);
        break;
    }

    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall( cudaDeviceSynchronize() );

    cudaSafeCall(cudaUnbindTexture(harrisDxTex));
    cudaSafeCall(cudaUnbindTexture(harrisDyTex));
}
515 /////////////////////////////////////////// Corner Min Eigen Val /////////////////////////////////////////////////
\r
517 texture<float, 2> minEigenValDxTex;
\r
518 texture<float, 2> minEigenValDyTex;
\r
// Minimum eigenvalue of the structure tensor over a block_size window,
// using texture addressing for the border. With a = 0.5*sum dx^2,
// c = 0.5*sum dy^2, b = sum dx*dy, the result is
// (a + c) - sqrt((a - c)^2 + b^2).
// NOTE(review): accumulators and the 0.5f scaling were restored from the
// truncated listing — confirm against upstream.
__global__ void cornerMinEigenVal_kernel(const int cols, const int rows, const int block_size,
                                         PtrStep dst)
{
    const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x < cols && y < rows)
    {
        float a = 0.f;
        float b = 0.f;
        float c = 0.f;

        const int ibegin = y - (block_size / 2);
        const int jbegin = x - (block_size / 2);
        const int iend = ibegin + block_size;
        const int jend = jbegin + block_size;

        for (int i = ibegin; i < iend; ++i)
        {
            for (int j = jbegin; j < jend; ++j)
            {
                float dx = tex2D(minEigenValDxTex, j, i);
                float dy = tex2D(minEigenValDyTex, j, i);

                a += dx * dx;
                b += dx * dy;
                c += dy * dy;
            }
        }

        a *= 0.5f;
        c *= 0.5f;
        ((float*)dst.ptr(y))[x] = (a + c) - sqrtf((a - c) * (a - c) + b * b);
    }
}
// Minimum-eigenvalue kernel with explicit border interpolators (same
// remapping scheme as the bordered Harris kernel above).
// NOTE(review): accumulators and 0.5f scaling restored from the truncated
// listing — confirm against upstream.
template <typename BR, typename BC>
__global__ void cornerMinEigenVal_kernel(const int cols, const int rows, const int block_size,
                                         PtrStep dst, BR border_row, BC border_col)
{
    const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x < cols && y < rows)
    {
        float a = 0.f;
        float b = 0.f;
        float c = 0.f;

        const int ibegin = y - (block_size / 2);
        const int jbegin = x - (block_size / 2);
        const int iend = ibegin + block_size;
        const int jend = jbegin + block_size;

        for (int i = ibegin; i < iend; ++i)
        {
            int y = border_col.idx_row(i);
            for (int j = jbegin; j < jend; ++j)
            {
                int x = border_row.idx_col(j);

                float dx = tex2D(minEigenValDxTex, x, y);
                float dy = tex2D(minEigenValDyTex, x, y);

                a += dx * dx;
                b += dx * dy;
                c += dy * dy;
            }
        }

        a *= 0.5f;
        c *= 0.5f;
        ((float*)dst.ptr(y))[x] = (a + c) - sqrtf((a - c) * (a - c) + b * b);
    }
}
// Host wrapper for the minimum-eigenvalue kernels; mirrors
// cornerHarris_caller (texture binding, border dispatch, cleanup).
// Fix: unchecked cudaBindTexture2D results now use cudaSafeCall, matching
// the file's convention.
// NOTE(review): 'int border_type' parameter and 'break;' statements
// restored from the visible 'switch (border_type)'.
void cornerMinEigenVal_caller(const int block_size, const DevMem2D Dx, const DevMem2D Dy, DevMem2D dst,
                              int border_type)
{
    const int rows = Dx.rows;
    const int cols = Dx.cols;

    dim3 threads(32, 8);
    dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));

    cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();
    cudaSafeCall( cudaBindTexture2D(0, minEigenValDxTex, Dx.data, desc, Dx.cols, Dx.rows, Dx.step) );
    cudaSafeCall( cudaBindTexture2D(0, minEigenValDyTex, Dy.data, desc, Dy.cols, Dy.rows, Dy.step) );
    minEigenValDxTex.filterMode = cudaFilterModePoint;
    minEigenValDyTex.filterMode = cudaFilterModePoint;

    switch (border_type)
    {
    case BORDER_REFLECT101_GPU:
        cornerMinEigenVal_kernel<<<grid, threads>>>(
            cols, rows, block_size, dst, BrdRowReflect101<void>(cols), BrdColReflect101<void>(rows));
        break;
    case BORDER_REPLICATE_GPU:
        // Replicate border handled by the texture hardware via clamping.
        minEigenValDxTex.addressMode[0] = cudaAddressModeClamp;
        minEigenValDxTex.addressMode[1] = cudaAddressModeClamp;
        minEigenValDyTex.addressMode[0] = cudaAddressModeClamp;
        minEigenValDyTex.addressMode[1] = cudaAddressModeClamp;
        cornerMinEigenVal_kernel<<<grid, threads>>>(cols, rows, block_size, dst);
        break;
    }

    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall(cudaDeviceSynchronize());

    cudaSafeCall(cudaUnbindTexture(minEigenValDxTex));
    cudaSafeCall(cudaUnbindTexture(minEigenValDyTex));
}
632 ////////////////////////////// Column Sum //////////////////////////////////////
\r
// Running column sum for a 32F image: one thread per column walks down the
// rows, writing the prefix sum of src into dst. Note the sequential
// per-thread walk down a column is coalesced across threads (adjacent
// threads touch adjacent floats of each row).
__global__ void column_sumKernel_32F(int cols, int rows, const PtrStep src, const PtrStep dst)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;

    if (x < cols)
    {
        const unsigned char* src_data = src.data + x * sizeof(float);
        unsigned char* dst_data = dst.data + x * sizeof(float);

        float sum = 0.f;

        for (int y = 0; y < rows; ++y)
        {
            sum += *(const float*)src_data;
            *(float*)dst_data = sum;
            src_data += src.step;
            dst_data += dst.step;
        }
    }
}
// Host wrapper: one thread per column.
// NOTE(review): 'dim3 threads(256);' restored from the truncated listing
// (forced by the threads.x use below).
void columnSum_32F(const DevMem2D src, const DevMem2D dst)
{
    dim3 threads(256);
    dim3 grid(divUp(src.cols, threads.x));

    column_sumKernel_32F<<<grid, threads>>>(src.cols, src.rows, src, dst);
    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall( cudaDeviceSynchronize() );
}
667 //////////////////////////////////////////////////////////////////////////
\r
// Element-wise complex multiply: c = a * b.
__global__ void mulSpectrumsKernel(const PtrStep_<cufftComplex> a, const PtrStep_<cufftComplex> b,
                                   DevMem2D_<cufftComplex> c)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x < c.cols && y < c.rows)
    {
        c.ptr(y)[x] = cuCmulf(a.ptr(y)[x], b.ptr(y)[x]);
    }
}
// Host wrapper for element-wise spectrum multiply.
// NOTE(review): 'dim3 threads(256);' restored from the truncated listing
// (threads.y defaults to 1, consistent with the 2D grid below).
void mulSpectrums(const PtrStep_<cufftComplex> a, const PtrStep_<cufftComplex> b,
                  DevMem2D_<cufftComplex> c)
{
    dim3 threads(256);
    dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));

    mulSpectrumsKernel<<<grid, threads>>>(a, b, c);
    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall( cudaDeviceSynchronize() );
}
696 //////////////////////////////////////////////////////////////////////////
\r
697 // mulSpectrums_CONJ
\r
// Element-wise complex multiply with conjugate: c = a * conj(b).
__global__ void mulSpectrumsKernel_CONJ(
    const PtrStep_<cufftComplex> a, const PtrStep_<cufftComplex> b,
    DevMem2D_<cufftComplex> c)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x < c.cols && y < c.rows)
    {
        c.ptr(y)[x] = cuCmulf(a.ptr(y)[x], cuConjf(b.ptr(y)[x]));
    }
}
// Host wrapper for c = a * conj(b).
// NOTE(review): 'dim3 threads(256);' restored from the truncated listing.
void mulSpectrums_CONJ(const PtrStep_<cufftComplex> a, const PtrStep_<cufftComplex> b,
                       DevMem2D_<cufftComplex> c)
{
    dim3 threads(256);
    dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));

    mulSpectrumsKernel_CONJ<<<grid, threads>>>(a, b, c);
    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall( cudaDeviceSynchronize() );
}
726 //////////////////////////////////////////////////////////////////////////
\r
727 // mulAndScaleSpectrums
\r
// Element-wise complex multiply with scaling: c = scale * (a * b).
__global__ void mulAndScaleSpectrumsKernel(
    const PtrStep_<cufftComplex> a, const PtrStep_<cufftComplex> b,
    float scale, DevMem2D_<cufftComplex> c)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x < c.cols && y < c.rows)
    {
        cufftComplex v = cuCmulf(a.ptr(y)[x], b.ptr(y)[x]);
        c.ptr(y)[x] = make_cuFloatComplex(cuCrealf(v) * scale, cuCimagf(v) * scale);
    }
}
// Host wrapper for c = scale * (a * b).
// NOTE(review): 'dim3 threads(256);' restored from the truncated listing.
void mulAndScaleSpectrums(const PtrStep_<cufftComplex> a, const PtrStep_<cufftComplex> b,
                          float scale, DevMem2D_<cufftComplex> c)
{
    dim3 threads(256);
    dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));

    mulAndScaleSpectrumsKernel<<<grid, threads>>>(a, b, scale, c);
    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall( cudaDeviceSynchronize() );
}
757 //////////////////////////////////////////////////////////////////////////
\r
758 // mulAndScaleSpectrums_CONJ
\r
// Element-wise conjugate multiply with scaling: c = scale * (a * conj(b)).
__global__ void mulAndScaleSpectrumsKernel_CONJ(
    const PtrStep_<cufftComplex> a, const PtrStep_<cufftComplex> b,
    float scale, DevMem2D_<cufftComplex> c)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x < c.cols && y < c.rows)
    {
        cufftComplex v = cuCmulf(a.ptr(y)[x], cuConjf(b.ptr(y)[x]));
        c.ptr(y)[x] = make_cuFloatComplex(cuCrealf(v) * scale, cuCimagf(v) * scale);
    }
}
// Host wrapper for c = scale * (a * conj(b)).
// NOTE(review): 'dim3 threads(256);' restored from the truncated listing.
void mulAndScaleSpectrums_CONJ(const PtrStep_<cufftComplex> a, const PtrStep_<cufftComplex> b,
                               float scale, DevMem2D_<cufftComplex> c)
{
    dim3 threads(256);
    dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));

    mulAndScaleSpectrumsKernel_CONJ<<<grid, threads>>>(a, b, scale, c);
    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall( cudaDeviceSynchronize() );
}
787 //////////////////////////////////////////////////////////////////////////
\r
790 // TODO use intrinsics like __sinf and so on
\r
// Constant-memory parameters shared by the warp-map builders:
//   ck_rinv - K * R^-1 (camera intrinsics times inverse rotation), row-major
//   cr_kinv - R * K^-1, row-major
//   ct      - translation vector
//   cscale  - projection scale
namespace build_warp_maps
{
    __constant__ float ck_rinv[9];
    __constant__ float cr_kinv[9];
    __constant__ float ct[3];
    __constant__ float cscale;
}
// Back-maps a planar-projection point (u, v) to source image coordinates
// (x, y) using ck_rinv / ct / cscale from build_warp_maps.
// NOTE(review): the class wrapper, the local 'float z;' and the final
// perspective divide were missing from the truncated listing and were
// restored from the visible x/y/z assignments — confirm against upstream.
class PlaneMapper
{
public:
    static __device__ __forceinline__ void mapBackward(float u, float v, float &x, float &y)
    {
        using namespace build_warp_maps;

        float x_ = u / cscale - ct[0];
        float y_ = v / cscale - ct[1];

        float z;
        x = ck_rinv[0] * x_ + ck_rinv[1] * y_ + ck_rinv[2] * (1 - ct[2]);
        y = ck_rinv[3] * x_ + ck_rinv[4] * y_ + ck_rinv[5] * (1 - ct[2]);
        z = ck_rinv[6] * x_ + ck_rinv[7] * y_ + ck_rinv[8] * (1 - ct[2]);

        x /= z;
        y /= z;
    }
};
// Back-maps a cylindrical-projection point (u, v) to source image
// coordinates: the 3D ray (sin u, v/scale, cos u) is rotated by ck_rinv
// and projected by the perspective divide.
// NOTE(review): 'u /= cscale;', the local 'float z;' and the final divide
// were restored from the truncated listing — confirm against upstream.
class CylindricalMapper
{
public:
    static __device__ __forceinline__ void mapBackward(float u, float v, float &x, float &y)
    {
        using namespace build_warp_maps;

        u /= cscale;
        float x_ = sinf(u);
        float y_ = v / cscale;
        float z_ = cosf(u);

        float z;
        x = ck_rinv[0] * x_ + ck_rinv[1] * y_ + ck_rinv[2] * z_;
        y = ck_rinv[3] * x_ + ck_rinv[4] * y_ + ck_rinv[5] * z_;
        z = ck_rinv[6] * x_ + ck_rinv[7] * y_ + ck_rinv[8] * z_;

        x /= z;
        y /= z;
    }
};
// Back-maps a spherical-projection point (u, v) to source image
// coordinates: (u, v) are scaled angles, converted to a unit 3D ray,
// rotated by ck_rinv and projected.
// NOTE(review): the 'u /= cscale; v /= cscale;' lines, local 'float z;'
// and final divide were restored from the truncated listing — confirm
// against upstream.
class SphericalMapper
{
public:
    static __device__ __forceinline__ void mapBackward(float u, float v, float &x, float &y)
    {
        using namespace build_warp_maps;

        v /= cscale;
        u /= cscale;

        float sinv = sinf(v);
        float x_ = sinv * sinf(u);
        float y_ = -cosf(v);
        float z_ = sinv * cosf(u);

        float z;
        x = ck_rinv[0] * x_ + ck_rinv[1] * y_ + ck_rinv[2] * z_;
        y = ck_rinv[3] * x_ + ck_rinv[4] * y_ + ck_rinv[5] * z_;
        z = ck_rinv[6] * x_ + ck_rinv[7] * y_ + ck_rinv[8] * z_;

        x /= z;
        y /= z;
    }
};
// Fills the remap tables map_x / map_y for the tile whose top-left corner
// is (tl_u, tl_v); Mapper supplies the projection-specific back-mapping.
template <typename Mapper>
__global__ void buildWarpMapsKernel(int tl_u, int tl_v, int cols, int rows,
                                    PtrStepf map_x, PtrStepf map_y)
{
    int du = blockIdx.x * blockDim.x + threadIdx.x;
    int dv = blockIdx.y * blockDim.y + threadIdx.y;
    if (du < cols && dv < rows)
    {
        float u = tl_u + du;
        float v = tl_v + dv;
        float x, y;
        Mapper::mapBackward(u, v, x, y);
        map_x.ptr(dv)[du] = x;
        map_y.ptr(dv)[du] = y;
    }
}
// Uploads the plane-warp parameters to constant memory and builds the
// remap tables for a planar projection.
// NOTE(review): the visible launch does not use 'stream' and the sync is
// unconditional — the parameter appears accepted but unused; confirm
// against upstream before relying on stream semantics.
void buildWarpPlaneMaps(int tl_u, int tl_v, DevMem2Df map_x, DevMem2Df map_y,
                        const float k_rinv[9], const float r_kinv[9], const float t[3],
                        float scale, cudaStream_t stream)
{
    cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::ck_rinv, k_rinv, 9*sizeof(float)));
    cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::cr_kinv, r_kinv, 9*sizeof(float)));
    cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::ct, t, 3*sizeof(float)));
    cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::cscale, &scale, sizeof(float)));

    int cols = map_x.cols;
    int rows = map_x.rows;

    dim3 threads(32, 8);
    dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));

    buildWarpMapsKernel<PlaneMapper><<<grid,threads>>>(tl_u, tl_v, cols, rows, map_x, map_y);
    cudaSafeCall(cudaGetLastError());

    cudaSafeCall(cudaDeviceSynchronize());
}
// Uploads the cylindrical-warp parameters and builds the remap tables.
// NOTE(review): as with buildWarpPlaneMaps, 'stream' is accepted but the
// visible launch/sync do not use it — confirm against upstream.
void buildWarpCylindricalMaps(int tl_u, int tl_v, DevMem2Df map_x, DevMem2Df map_y,
                              const float k_rinv[9], const float r_kinv[9], float scale,
                              cudaStream_t stream)
{
    cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::ck_rinv, k_rinv, 9*sizeof(float)));
    cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::cr_kinv, r_kinv, 9*sizeof(float)));
    cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::cscale, &scale, sizeof(float)));

    int cols = map_x.cols;
    int rows = map_x.rows;

    dim3 threads(32, 8);
    dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));

    buildWarpMapsKernel<CylindricalMapper><<<grid,threads>>>(tl_u, tl_v, cols, rows, map_x, map_y);
    cudaSafeCall(cudaGetLastError());

    cudaSafeCall(cudaDeviceSynchronize());
}
// Uploads the spherical-warp parameters and builds the remap tables.
// NOTE(review): 'stream' is accepted but unused by the visible launch/sync
// — confirm against upstream.
void buildWarpSphericalMaps(int tl_u, int tl_v, DevMem2Df map_x, DevMem2Df map_y,
                            const float k_rinv[9], const float r_kinv[9], float scale,
                            cudaStream_t stream)
{
    cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::ck_rinv, k_rinv, 9*sizeof(float)));
    cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::cr_kinv, r_kinv, 9*sizeof(float)));
    cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::cscale, &scale, sizeof(float)));

    int cols = map_x.cols;
    int rows = map_x.rows;

    dim3 threads(32, 8);
    dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));

    buildWarpMapsKernel<SphericalMapper><<<grid,threads>>>(tl_u, tl_v, cols, rows, map_x, map_y);
    cudaSafeCall(cudaGetLastError());

    cudaSafeCall(cudaDeviceSynchronize());
}
954 //////////////////////////////////////////////////////////////////////////
\r
957 #define CONVOLVE_MAX_KERNEL_SIZE 17
\r
959 __constant__ float c_convolveKernel[CONVOLVE_MAX_KERNEL_SIZE * CONVOLVE_MAX_KERNEL_SIZE];
\r
// 2D convolution of a 32F image with a kernel held in constant memory
// (c_convolveKernel, up to CONVOLVE_MAX_KERNEL_SIZE^2). Requires a 16x16
// block; each block stages a 32x32 tile (16x16 pixels + 8-pixel apron on
// every side) into shared memory with clamp-to-edge addressing.
// NOTE(review): the '__syncthreads()' between the staged loads and the
// accumulation, and 'float res = 0.f;', were missing from the truncated
// listing and restored — the barrier is mandatory before reading smem.
__global__ void convolve(const DevMem2Df src, PtrStepf dst, int kWidth, int kHeight)
{
    __shared__ float smem[16 + 2 * 8][16 + 2 * 8];

    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    // Stage the four 16x16 quadrants of the 32x32 tile, clamping reads
    // to the image borders.
    smem[threadIdx.y][threadIdx.x] = src.ptr(min(max(y - 8, 0), src.rows - 1))[min(max(x - 8, 0), src.cols - 1)];
    smem[threadIdx.y][threadIdx.x + 16] = src.ptr(min(max(y - 8, 0), src.rows - 1))[min(x + 8, src.cols - 1)];
    smem[threadIdx.y + 16][threadIdx.x] = src.ptr(min(y + 8, src.rows - 1))[min(max(x - 8, 0), src.cols - 1)];
    smem[threadIdx.y + 16][threadIdx.x + 16] = src.ptr(min(y + 8, src.rows - 1))[min(x + 8, src.cols - 1)];

    __syncthreads();

    if (x < src.cols && y < src.rows)
    {
        float res = 0.f;

        for (int i = 0; i < kHeight; ++i)
        {
            for (int j = 0; j < kWidth; ++j)
            {
                res += smem[threadIdx.y + 8 - kHeight / 2 + i][threadIdx.x + 8 - kWidth / 2 + j] * c_convolveKernel[i * kWidth + j];
            }
        }

        dst.ptr(y)[x] = res;
    }
}
// Host wrapper: copies the (device-resident) kernel into constant memory
// and launches the tiled convolution with the required 16x16 block.
void convolve_gpu(const DevMem2Df& src, const PtrStepf& dst, int kWidth, int kHeight, float* kernel)
{
    cudaSafeCall(cudaMemcpyToSymbol(c_convolveKernel, kernel, kWidth * kHeight * sizeof(float), 0, cudaMemcpyDeviceToDevice) );

    const dim3 block(16, 16);
    const dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));

    convolve<<<grid, block>>>(src, dst, kWidth, kHeight);
    cudaSafeCall(cudaGetLastError());

    cudaSafeCall(cudaDeviceSynchronize());
}