1 /*M///////////////////////////////////////////////////////////////////////////////////////
\r
3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
\r
5 // By downloading, copying, installing or using the software you agree to this license.
\r
6 // If you do not agree to this license, do not download, install,
\r
7 // copy or use the software.
\r
10 // License Agreement
\r
11 // For Open Source Computer Vision Library
\r
13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
\r
14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
\r
15 // Third party copyrights are property of their respective owners.
\r
17 // Redistribution and use in source and binary forms, with or without modification,
\r
18 // are permitted provided that the following conditions are met:
\r
20 // * Redistribution's of source code must retain the above copyright notice,
\r
21 // this list of conditions and the following disclaimer.
\r
23 // * Redistribution's in binary form must reproduce the above copyright notice,
\r
24 // this list of conditions and the following disclaimer in the documentation
\r
25 // and/or other materials provided with the distribution.
\r
27 // * The name of the copyright holders may not be used to endorse or promote products
\r
28 // derived from this software without specific prior written permission.
\r
30 // This software is provided by the copyright holders and contributors "as is" and
\r
31 // any express or implied warranties, including, but not limited to, the implied
\r
32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
\r
33 // In no event shall the Intel Corporation or contributors be liable for any direct,
\r
34 // indirect, incidental, special, exemplary, or consequential damages
\r
35 // (including, but not limited to, procurement of substitute goods or services;
\r
36 // loss of use, data, or profits; or business interruption) however caused
\r
37 // and on any theory of liability, whether in contract, strict liability,
\r
38 // or tort (including negligence or otherwise) arising in any way out of
\r
39 // the use of this software, even if advised of the possibility of such damage.
\r
43 #include "internal_shared.hpp"
\r
44 #include "opencv2/gpu/device/vec_traits.hpp"
\r
45 #include "opencv2/gpu/device/vec_math.hpp"
\r
46 #include "opencv2/gpu/device/saturate_cast.hpp"
\r
47 #include "opencv2/gpu/device/border_interpolate.hpp"
\r
49 using namespace cv::gpu;
\r
50 using namespace cv::gpu::device;
\r
52 namespace cv { namespace gpu { namespace imgproc
\r
54 /////////////////////////////////// MeanShiftfiltering ///////////////////////////////////////////////
\r
56 texture<uchar4, 2> tex_meanshift;
\r
// Runs the mean-shift iteration for pixel (x0, y0) of the image bound to
// tex_meanshift, writes the converged color to 'out' and returns the
// converged spatial position.
//   sp      - spatial window radius
//   sr      - color window radius
//   maxIter - iteration cap
//   eps     - convergence threshold on combined spatial+color shift
// NOTE(review): the source listing was truncated; braces, accumulator
// declarations and the loop bookkeeping below were restored from the
// visible statements — confirm against the upstream OpenCV file.
__device__ short2 do_mean_shift(int x0, int y0, unsigned char* out,
                                size_t out_step, int cols, int rows,
                                int sp, int sr, int maxIter, float eps)
{
    int isr2 = sr * sr;                      // squared color radius
    uchar4 c = tex2D(tex_meanshift, x0, y0 );

    // iterate meanshift procedure
    for( int iter = 0; iter < maxIter; iter++ )
    {
        int count = 0;
        int s0 = 0, s1 = 0, s2 = 0, sx = 0, sy = 0;
        float icount;

        //mean shift: process pixels in window (p-sigmaSp)x(p+sigmaSp)
        int minx = x0 - sp;
        int miny = y0 - sp;
        int maxx = x0 + sp;
        int maxy = y0 + sp;

        for( int y = miny; y <= maxy; y++)
        {
            int rowCount = 0;
            for( int x = minx; x <= maxx; x++ )
            {
                uchar4 t = tex2D( tex_meanshift, x, y );

                int norm2 = (t.x - c.x) * (t.x - c.x) + (t.y - c.y) * (t.y - c.y) + (t.z - c.z) * (t.z - c.z);
                if( norm2 <= isr2 )     // inside the color window
                {
                    s0 += t.x; s1 += t.y; s2 += t.z;
                    sx += x; rowCount++;
                }
            }
            count += rowCount;
            sy += y * rowCount;
        }

        if( count == 0 )
            break;

        icount = 1.f / count;
        int x1 = __float2int_rz(sx*icount);
        int y1 = __float2int_rz(sy*icount);
        s0 = __float2int_rz(s0*icount);
        s1 = __float2int_rz(s1*icount);
        s2 = __float2int_rz(s2*icount);

        int norm2 = (s0 - c.x) * (s0 - c.x) + (s1 - c.y) * (s1 - c.y) + (s2 - c.z) * (s2 - c.z);

        bool stopFlag = (x0 == x1 && y0 == y1) || (abs(x1-x0) + abs(y1-y0) + norm2 <= eps);

        x0 = x1; y0 = y1;
        c.x = s0; c.y = s1; c.z = s2;

        if( stopFlag )
            break;
    }

    int base = (blockIdx.y * blockDim.y + threadIdx.y) * out_step + (blockIdx.x * blockDim.x + threadIdx.x) * 4 * sizeof(uchar);
    *(uchar4*)(out + base) = c;

    return make_short2((short)x0, (short)y0);
}
\r
// One thread per pixel: in-bounds threads run the full mean-shift iteration
// for their pixel and write the filtered color into 'out'.
extern "C" __global__ void meanshift_kernel( unsigned char* out, size_t out_step, int cols, int rows,
                                             int sp, int sr, int maxIter, float eps )
{
    int x0 = blockIdx.x * blockDim.x + threadIdx.x;
    int y0 = blockIdx.y * blockDim.y + threadIdx.y;

    if( x0 < cols && y0 < rows )
        do_mean_shift(x0, y0, out, out_step, cols, rows, sp, sr, maxIter, eps);
}
\r
// Like meanshift_kernel, but additionally records the converged spatial
// position of every pixel into 'outsp' (one short2 per pixel).
extern "C" __global__ void meanshiftproc_kernel( unsigned char* outr, size_t outrstep,
                                                 unsigned char* outsp, size_t outspstep,
                                                 int cols, int rows,
                                                 int sp, int sr, int maxIter, float eps )
{
    int x0 = blockIdx.x * blockDim.x + threadIdx.x;
    int y0 = blockIdx.y * blockDim.y + threadIdx.y;

    if( x0 < cols && y0 < rows )
    {
        int basesp = (blockIdx.y * blockDim.y + threadIdx.y) * outspstep + (blockIdx.x * blockDim.x + threadIdx.x) * 2 * sizeof(short);
        *(short2*)(outsp + basesp) = do_mean_shift(x0, y0, outr, outrstep, cols, rows, sp, sr, maxIter, eps);
    }
}
\r
// Host wrapper for mean-shift filtering: binds 'src' (uchar4) to the
// 2D texture, launches one thread per pixel, then synchronizes and unbinds.
extern "C" void meanShiftFiltering_gpu(const DevMem2D& src, DevMem2D dst, int sp, int sr, int maxIter, float eps)
{
    dim3 grid(1, 1, 1);
    dim3 threads(32, 8, 1);
    grid.x = divUp(src.cols, threads.x);
    grid.y = divUp(src.rows, threads.y);

    cudaChannelFormatDesc desc = cudaCreateChannelDesc<uchar4>();
    cudaSafeCall( cudaBindTexture2D( 0, tex_meanshift, src.data, desc, src.cols, src.rows, src.step ) );

    meanshift_kernel<<< grid, threads >>>( dst.data, dst.step, dst.cols, dst.rows, sp, sr, maxIter, eps );
    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall( cudaDeviceSynchronize() );

    cudaSafeCall( cudaUnbindTexture( tex_meanshift ) );
}
\r
// Host wrapper for mean-shift "proc" variant: produces both the filtered
// image (dstr) and the per-pixel converged coordinates (dstsp).
extern "C" void meanShiftProc_gpu(const DevMem2D& src, DevMem2D dstr, DevMem2D dstsp, int sp, int sr, int maxIter, float eps)
{
    dim3 grid(1, 1, 1);
    dim3 threads(32, 8, 1);
    grid.x = divUp(src.cols, threads.x);
    grid.y = divUp(src.rows, threads.y);

    cudaChannelFormatDesc desc = cudaCreateChannelDesc<uchar4>();
    cudaSafeCall( cudaBindTexture2D( 0, tex_meanshift, src.data, desc, src.cols, src.rows, src.step ) );

    meanshiftproc_kernel<<< grid, threads >>>( dstr.data, dstr.step, dstsp.data, dstsp.step, dstr.cols, dstr.rows, sp, sr, maxIter, eps );
    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall( cudaDeviceSynchronize() );

    cudaSafeCall( cudaUnbindTexture( tex_meanshift ) );
}
\r
181 /////////////////////////////////// drawColorDisp ///////////////////////////////////////////////
\r
// Maps a disparity value d in [0, ndisp) to a BGRA color via an HSV ramp
// (hue 240..0, i.e. blue for near-zero disparity, red for large).
// Returns the packed color as 0xAARRGGBB.
// NOTE(review): the six hue-sextant bodies were missing from the truncated
// listing and were restored from the visible R/G/B comments (res.x=B,
// res.y=G, res.z=R per the packing below) — confirm against upstream.
template <typename T>
__device__ unsigned int cvtPixel(T d, int ndisp, float S = 1, float V = 1)
{
    unsigned int H = ((ndisp-d) * 240)/ndisp;

    unsigned int hi = (H/60) % 6;
    float f = H/60.f - H/60;
    float p = V * (1 - S);
    float q = V * (1 - f * S);
    float t = V * (1 - (1 - f) * S);

    float3 res;

    if (hi == 0) //R = V, G = t, B = p
    {
        res.x = p;
        res.y = t;
        res.z = V;
    }

    if (hi == 1) // R = q, G = V, B = p
    {
        res.x = p;
        res.y = V;
        res.z = q;
    }

    if (hi == 2) // R = p, G = V, B = t
    {
        res.x = t;
        res.y = V;
        res.z = p;
    }

    if (hi == 3) // R = p, G = q, B = V
    {
        res.x = V;
        res.y = q;
        res.z = p;
    }

    if (hi == 4) // R = t, G = p, B = V
    {
        res.x = V;
        res.y = p;
        res.z = t;
    }

    if (hi == 5) // R = V, G = p, B = q
    {
        res.x = q;
        res.y = p;
        res.z = V;
    }

    const unsigned int b = (unsigned int)(max(0.f, min (res.x, 1.f)) * 255.f);
    const unsigned int g = (unsigned int)(max(0.f, min (res.y, 1.f)) * 255.f);
    const unsigned int r = (unsigned int)(max(0.f, min (res.z, 1.f)) * 255.f);
    const unsigned int a = 255U;

    return (a << 24) + (r << 16) + (g << 8) + b;
}
\r
// Colorizes an 8-bit disparity map. Each thread handles 4 horizontally
// adjacent pixels (x is the left pixel of the quad) and writes one uint4
// (4 packed BGRA colors) per iteration; requires 4-pixel-aligned widths.
__global__ void drawColorDisp(uchar* disp, size_t disp_step, uchar* out_image, size_t out_step, int width, int height, int ndisp)
{
    const int x = (blockIdx.x * blockDim.x + threadIdx.x) << 2;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    if(x < width && y < height)
    {
        uchar4 d4 = *(uchar4*)(disp + y * disp_step + x);

        uint4 res;
        res.x = cvtPixel(d4.x, ndisp);
        res.y = cvtPixel(d4.y, ndisp);
        res.z = cvtPixel(d4.z, ndisp);
        res.w = cvtPixel(d4.w, ndisp);

        uint4* line = (uint4*)(out_image + y * out_step);
        line[x >> 2] = res;
    }
}
\r
// Colorizes a 16-bit disparity map. Each thread handles 2 adjacent pixels
// and writes one uint2 (2 packed BGRA colors); requires 2-pixel alignment.
__global__ void drawColorDisp(short* disp, size_t disp_step, uchar* out_image, size_t out_step, int width, int height, int ndisp)
{
    const int x = (blockIdx.x * blockDim.x + threadIdx.x) << 1;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    if(x < width && y < height)
    {
        short2 d2 = *(short2*)(disp + y * disp_step + x);

        uint2 res;
        res.x = cvtPixel(d2.x, ndisp);
        res.y = cvtPixel(d2.y, ndisp);

        uint2* line = (uint2*)(out_image + y * out_step);
        line[x >> 1] = res;
    }
}
\r
// Host wrapper for the 8-bit colorizer. Each thread covers 4 pixels, hence
// the grid is sized on threads.x << 2.
// NOTE(review): the 'if (stream == 0)' guard before the sync was restored
// from the truncated listing (standard pattern for stream-aware wrappers
// in this module) — confirm against upstream.
void drawColorDisp_gpu(const DevMem2D& src, const DevMem2D& dst, int ndisp, const cudaStream_t& stream)
{
    dim3 threads(16, 16, 1);
    dim3 grid(1, 1, 1);
    grid.x = divUp(src.cols, threads.x << 2);
    grid.y = divUp(src.rows, threads.y);

    drawColorDisp<<<grid, threads, 0, stream>>>(src.data, src.step, dst.data, dst.step, src.cols, src.rows, ndisp);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
\r
// Host wrapper for the 16-bit colorizer. disp_step is passed in elements
// (src.step / sizeof(short)) because the kernel indexes a short*.
// NOTE(review): 'if (stream == 0)' guard restored from truncated listing.
void drawColorDisp_gpu(const DevMem2D_<short>& src, const DevMem2D& dst, int ndisp, const cudaStream_t& stream)
{
    dim3 threads(32, 8, 1);
    dim3 grid(1, 1, 1);
    grid.x = divUp(src.cols, threads.x << 1);
    grid.y = divUp(src.rows, threads.y);

    drawColorDisp<<<grid, threads, 0, stream>>>(src.data, src.step / sizeof(short), dst.data, dst.step, src.cols, src.rows, ndisp);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
\r
312 /////////////////////////////////// reprojectImageTo3D ///////////////////////////////////////////////
\r
314 __constant__ float cq[16];
\r
// Reprojects a disparity map to 3D points using the 4x4 reprojection
// matrix Q stored row-major in constant memory cq[16]:
//   [X Y Z W]^T = Q * [x y d 1]^T,  output point = (X/W, Y/W, Z/W).
// Writes one float4 per pixel into 'xyzw' (steps are in float elements).
// NOTE(review): the x-dependent terms and 'float4 v;' were missing from
// the truncated listing and restored from the visible y-dependent terms —
// confirm against upstream.
template <typename T>
__global__ void reprojectImageTo3D(const T* disp, size_t disp_step, float* xyzw, size_t xyzw_step, int rows, int cols)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (y < rows && x < cols)
    {
        // y-dependent part of Q * [x y d 1]^T ...
        float qx = cq[1] * y + cq[3], qy = cq[5] * y + cq[7];
        float qz = cq[9] * y + cq[11], qw = cq[13] * y + cq[15];

        // ... plus the x-dependent part.
        qx += x * cq[0];
        qy += x * cq[4];
        qz += x * cq[8];
        qw += x * cq[12];

        T d = *(disp + disp_step * y + x);

        float iW = 1.f / (qw + cq[14] * d);
        float4 v;
        v.x = (qx + cq[2] * d) * iW;
        v.y = (qy + cq[6] * d) * iW;
        v.z = (qz + cq[10] * d) * iW;
        v.w = 1.f;

        *(float4*)(xyzw + xyzw_step * y + (x * 4)) = v;
    }
}
\r
// Uploads Q to constant memory and launches the reprojection kernel.
// Steps are converted from bytes to elements for the typed pointers.
// NOTE(review): 'if (stream == 0)' guard restored from truncated listing.
template <typename T>
inline void reprojectImageTo3D_caller(const DevMem2D_<T>& disp, const DevMem2Df& xyzw, const float* q, const cudaStream_t& stream)
{
    dim3 threads(32, 8, 1);
    dim3 grid(1, 1, 1);
    grid.x = divUp(disp.cols, threads.x);
    grid.y = divUp(disp.rows, threads.y);

    cudaSafeCall( cudaMemcpyToSymbol(cq, q, 16 * sizeof(float)) );

    reprojectImageTo3D<<<grid, threads, 0, stream>>>(disp.data, disp.step / sizeof(T), xyzw.data, xyzw.step / sizeof(float), disp.rows, disp.cols);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
\r
// 8-bit disparity entry point; forwards to the templated caller.
void reprojectImageTo3D_gpu(const DevMem2D& disp, const DevMem2Df& xyzw, const float* q, const cudaStream_t& stream)
{
    reprojectImageTo3D_caller(disp, xyzw, q, stream);
}
\r
// 16-bit disparity entry point; forwards to the templated caller.
void reprojectImageTo3D_gpu(const DevMem2D_<short>& disp, const DevMem2Df& xyzw, const float* q, const cudaStream_t& stream)
{
    reprojectImageTo3D_caller(disp, xyzw, q, stream);
}
\r
373 //////////////////////////////////////// Extract Cov Data ////////////////////////////////////////////////
\r
// Builds the 3-plane covariance image from gradient images Dx, Dy:
// plane 0 = Dx^2, plane 1 = Dx*Dy, plane 2 = Dy^2, stacked vertically in
// 'dst' (dst must be 3*rows tall).
__global__ void extractCovData_kernel(const int cols, const int rows, const PtrStepf Dx,
                                      const PtrStepf Dy, PtrStepf dst)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x < cols && y < rows)
    {
        float dx = Dx.ptr(y)[x];
        float dy = Dy.ptr(y)[x];

        dst.ptr(y)[x] = dx * dx;
        dst.ptr(y + rows)[x] = dx * dy;
        dst.ptr(y + (rows << 1))[x] = dy * dy;
    }
}
// Host wrapper: launches extractCovData_kernel over the Dx/Dy extent and
// blocks until completion.
void extractCovData_caller(const DevMem2Df Dx, const DevMem2Df Dy, PtrStepf dst)
{
    dim3 threads(32, 8);
    dim3 grid(divUp(Dx.cols, threads.x), divUp(Dx.rows, threads.y));

    extractCovData_kernel<<<grid, threads>>>(Dx.cols, Dx.rows, Dx, Dy, dst);
    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall( cudaDeviceSynchronize() );
}
403 /////////////////////////////////////////// Corner Harris /////////////////////////////////////////////////
\r
405 texture<float, 2> harrisDxTex;
\r
406 texture<float, 2> harrisDyTex;
\r
// Harris corner response using texture addressing for the border (the
// texture's addressMode supplies out-of-range handling). Sums the
// structure tensor (a=sum dx^2, b=sum dx*dy, c=sum dy^2) over a
// block_size x block_size window and writes det(M) - k*trace(M)^2.
// NOTE(review): 'PtrStep dst' tail of the signature and the a/b/c
// accumulators were restored from the truncated listing.
__global__ void cornerHarris_kernel(const int cols, const int rows, const int block_size, const float k,
                                    PtrStep dst)
{
    const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x < cols && y < rows)
    {
        float a = 0.f;
        float b = 0.f;
        float c = 0.f;

        const int ibegin = y - (block_size / 2);
        const int jbegin = x - (block_size / 2);
        const int iend = ibegin + block_size;
        const int jend = jbegin + block_size;

        for (int i = ibegin; i < iend; ++i)
        {
            for (int j = jbegin; j < jend; ++j)
            {
                float dx = tex2D(harrisDxTex, j, i);
                float dy = tex2D(harrisDyTex, j, i);

                a += dx * dx;
                b += dx * dy;
                c += dy * dy;
            }
        }

        ((float*)dst.ptr(y))[x] = a * c - b * b - k * (a + c) * (a + c);
    }
}
// Harris corner response with explicit border interpolators: border_row /
// border_col remap out-of-range indices before the texture fetch. The
// inner 'x'/'y' deliberately shadow the pixel coordinates only inside the
// window loops; the final store uses the outer pixel coordinates.
// NOTE(review): accumulators restored from the truncated listing.
template <typename BR, typename BC>
__global__ void cornerHarris_kernel(const int cols, const int rows, const int block_size, const float k,
                                    PtrStep dst, BR border_row, BC border_col)
{
    const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x < cols && y < rows)
    {
        float a = 0.f;
        float b = 0.f;
        float c = 0.f;

        const int ibegin = y - (block_size / 2);
        const int jbegin = x - (block_size / 2);
        const int iend = ibegin + block_size;
        const int jend = jbegin + block_size;

        for (int i = ibegin; i < iend; ++i)
        {
            int y = border_col.idx_row(i);
            for (int j = jbegin; j < jend; ++j)
            {
                int x = border_row.idx_col(j);

                float dx = tex2D(harrisDxTex, x, y);
                float dy = tex2D(harrisDyTex, x, y);

                a += dx * dx;
                b += dx * dy;
                c += dy * dy;
            }
        }

        ((float*)dst.ptr(y))[x] = a * c - b * b - k * (a + c) * (a + c);
    }
}
// Host wrapper: binds Dx/Dy to textures, dispatches the Harris kernel for
// the requested border mode, synchronizes and unbinds.
// Fix: the cudaBindTexture2D results were unchecked, inconsistent with the
// cudaSafeCall convention used elsewhere in this file.
// NOTE(review): the trailing 'int border_type' parameter and the 'break;'
// statements were restored from the visible 'switch (border_type)' — the
// fallthrough would otherwise launch both kernels.
void cornerHarris_caller(const int block_size, const float k, const DevMem2D Dx, const DevMem2D Dy, DevMem2D dst,
                         int border_type)
{
    const int rows = Dx.rows;
    const int cols = Dx.cols;

    dim3 threads(32, 8);
    dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));

    cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();
    cudaSafeCall( cudaBindTexture2D(0, harrisDxTex, Dx.data, desc, Dx.cols, Dx.rows, Dx.step) );
    cudaSafeCall( cudaBindTexture2D(0, harrisDyTex, Dy.data, desc, Dy.cols, Dy.rows, Dy.step) );
    harrisDxTex.filterMode = cudaFilterModePoint;
    harrisDyTex.filterMode = cudaFilterModePoint;

    switch (border_type)
    {
    case BORDER_REFLECT101_GPU:
        cornerHarris_kernel<<<grid, threads>>>(
            cols, rows, block_size, k, dst, BrdRowReflect101<void>(cols), BrdColReflect101<void>(rows));
        break;
    case BORDER_REPLICATE_GPU:
        // Replicate border handled by the texture hardware via clamping.
        harrisDxTex.addressMode[0] = cudaAddressModeClamp;
        harrisDxTex.addressMode[1] = cudaAddressModeClamp;
        harrisDyTex.addressMode[0] = cudaAddressModeClamp;
        harrisDyTex.addressMode[1] = cudaAddressModeClamp;
        cornerHarris_kernel<<<grid, threads>>>(cols, rows, block_size, k, dst);
        break;
    }

    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall( cudaDeviceSynchronize() );

    cudaSafeCall(cudaUnbindTexture(harrisDxTex));
    cudaSafeCall(cudaUnbindTexture(harrisDyTex));
}
515 /////////////////////////////////////////// Corner Min Eigen Val /////////////////////////////////////////////////
\r
517 texture<float, 2> minEigenValDxTex;
\r
518 texture<float, 2> minEigenValDyTex;
\r
// Minimum eigenvalue of the structure tensor over a block_size window,
// using texture addressing for the border. With a = 0.5*sum dx^2,
// c = 0.5*sum dy^2, b = sum dx*dy, the result is
// (a + c) - sqrt((a - c)^2 + b^2).
// NOTE(review): accumulators and the 0.5f scaling were restored from the
// truncated listing — confirm against upstream.
__global__ void cornerMinEigenVal_kernel(const int cols, const int rows, const int block_size,
                                         PtrStep dst)
{
    const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x < cols && y < rows)
    {
        float a = 0.f;
        float b = 0.f;
        float c = 0.f;

        const int ibegin = y - (block_size / 2);
        const int jbegin = x - (block_size / 2);
        const int iend = ibegin + block_size;
        const int jend = jbegin + block_size;

        for (int i = ibegin; i < iend; ++i)
        {
            for (int j = jbegin; j < jend; ++j)
            {
                float dx = tex2D(minEigenValDxTex, j, i);
                float dy = tex2D(minEigenValDyTex, j, i);

                a += dx * dx;
                b += dx * dy;
                c += dy * dy;
            }
        }

        a *= 0.5f;
        c *= 0.5f;
        ((float*)dst.ptr(y))[x] = (a + c) - sqrtf((a - c) * (a - c) + b * b);
    }
}
// Minimum-eigenvalue kernel with explicit border interpolators (same
// remapping scheme as the bordered Harris kernel above).
// NOTE(review): accumulators and 0.5f scaling restored from the truncated
// listing — confirm against upstream.
template <typename BR, typename BC>
__global__ void cornerMinEigenVal_kernel(const int cols, const int rows, const int block_size,
                                         PtrStep dst, BR border_row, BC border_col)
{
    const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x < cols && y < rows)
    {
        float a = 0.f;
        float b = 0.f;
        float c = 0.f;

        const int ibegin = y - (block_size / 2);
        const int jbegin = x - (block_size / 2);
        const int iend = ibegin + block_size;
        const int jend = jbegin + block_size;

        for (int i = ibegin; i < iend; ++i)
        {
            int y = border_col.idx_row(i);
            for (int j = jbegin; j < jend; ++j)
            {
                int x = border_row.idx_col(j);

                float dx = tex2D(minEigenValDxTex, x, y);
                float dy = tex2D(minEigenValDyTex, x, y);

                a += dx * dx;
                b += dx * dy;
                c += dy * dy;
            }
        }

        a *= 0.5f;
        c *= 0.5f;
        ((float*)dst.ptr(y))[x] = (a + c) - sqrtf((a - c) * (a - c) + b * b);
    }
}
// Host wrapper for the minimum-eigenvalue kernels; mirrors
// cornerHarris_caller (texture binding, border dispatch, cleanup).
// Fix: unchecked cudaBindTexture2D results now use cudaSafeCall, matching
// the file's convention.
// NOTE(review): 'int border_type' parameter and 'break;' statements
// restored from the visible 'switch (border_type)'.
void cornerMinEigenVal_caller(const int block_size, const DevMem2D Dx, const DevMem2D Dy, DevMem2D dst,
                              int border_type)
{
    const int rows = Dx.rows;
    const int cols = Dx.cols;

    dim3 threads(32, 8);
    dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));

    cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();
    cudaSafeCall( cudaBindTexture2D(0, minEigenValDxTex, Dx.data, desc, Dx.cols, Dx.rows, Dx.step) );
    cudaSafeCall( cudaBindTexture2D(0, minEigenValDyTex, Dy.data, desc, Dy.cols, Dy.rows, Dy.step) );
    minEigenValDxTex.filterMode = cudaFilterModePoint;
    minEigenValDyTex.filterMode = cudaFilterModePoint;

    switch (border_type)
    {
    case BORDER_REFLECT101_GPU:
        cornerMinEigenVal_kernel<<<grid, threads>>>(
            cols, rows, block_size, dst, BrdRowReflect101<void>(cols), BrdColReflect101<void>(rows));
        break;
    case BORDER_REPLICATE_GPU:
        // Replicate border handled by the texture hardware via clamping.
        minEigenValDxTex.addressMode[0] = cudaAddressModeClamp;
        minEigenValDxTex.addressMode[1] = cudaAddressModeClamp;
        minEigenValDyTex.addressMode[0] = cudaAddressModeClamp;
        minEigenValDyTex.addressMode[1] = cudaAddressModeClamp;
        cornerMinEigenVal_kernel<<<grid, threads>>>(cols, rows, block_size, dst);
        break;
    }

    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall(cudaDeviceSynchronize());

    cudaSafeCall(cudaUnbindTexture(minEigenValDxTex));
    cudaSafeCall(cudaUnbindTexture(minEigenValDyTex));
}
632 ////////////////////////////// Column Sum //////////////////////////////////////
\r
// Running column sum for a 32F image: one thread per column walks down the
// rows, writing the prefix sum of src into dst. Note the sequential
// per-thread walk down a column is coalesced across threads (adjacent
// threads touch adjacent floats of each row).
__global__ void column_sumKernel_32F(int cols, int rows, const PtrStep src, const PtrStep dst)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;

    if (x < cols)
    {
        const unsigned char* src_data = src.data + x * sizeof(float);
        unsigned char* dst_data = dst.data + x * sizeof(float);

        float sum = 0.f;

        for (int y = 0; y < rows; ++y)
        {
            sum += *(const float*)src_data;
            *(float*)dst_data = sum;
            src_data += src.step;
            dst_data += dst.step;
        }
    }
}
// Host wrapper: one thread per column.
// NOTE(review): 'dim3 threads(256);' restored from the truncated listing
// (forced by the threads.x use below).
void columnSum_32F(const DevMem2D src, const DevMem2D dst)
{
    dim3 threads(256);
    dim3 grid(divUp(src.cols, threads.x));

    column_sumKernel_32F<<<grid, threads>>>(src.cols, src.rows, src, dst);
    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall( cudaDeviceSynchronize() );
}
667 //////////////////////////////////////////////////////////////////////////
\r
// Element-wise complex multiply: c = a * b.
__global__ void mulSpectrumsKernel(const PtrStep_<cufftComplex> a, const PtrStep_<cufftComplex> b,
                                   DevMem2D_<cufftComplex> c)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x < c.cols && y < c.rows)
    {
        c.ptr(y)[x] = cuCmulf(a.ptr(y)[x], b.ptr(y)[x]);
    }
}
// Host wrapper for element-wise spectrum multiply.
// NOTE(review): 'dim3 threads(256);' restored from the truncated listing
// (threads.y defaults to 1, consistent with the 2D grid below).
void mulSpectrums(const PtrStep_<cufftComplex> a, const PtrStep_<cufftComplex> b,
                  DevMem2D_<cufftComplex> c)
{
    dim3 threads(256);
    dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));

    mulSpectrumsKernel<<<grid, threads>>>(a, b, c);
    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall( cudaDeviceSynchronize() );
}
696 //////////////////////////////////////////////////////////////////////////
\r
697 // mulSpectrums_CONJ
\r
// Element-wise complex multiply with conjugate: c = a * conj(b).
__global__ void mulSpectrumsKernel_CONJ(
    const PtrStep_<cufftComplex> a, const PtrStep_<cufftComplex> b,
    DevMem2D_<cufftComplex> c)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x < c.cols && y < c.rows)
    {
        c.ptr(y)[x] = cuCmulf(a.ptr(y)[x], cuConjf(b.ptr(y)[x]));
    }
}
// Host wrapper for c = a * conj(b).
// NOTE(review): 'dim3 threads(256);' restored from the truncated listing.
void mulSpectrums_CONJ(const PtrStep_<cufftComplex> a, const PtrStep_<cufftComplex> b,
                       DevMem2D_<cufftComplex> c)
{
    dim3 threads(256);
    dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));

    mulSpectrumsKernel_CONJ<<<grid, threads>>>(a, b, c);
    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall( cudaDeviceSynchronize() );
}
726 //////////////////////////////////////////////////////////////////////////
\r
727 // mulAndScaleSpectrums
\r
// Element-wise complex multiply with scaling: c = scale * (a * b).
__global__ void mulAndScaleSpectrumsKernel(
    const PtrStep_<cufftComplex> a, const PtrStep_<cufftComplex> b,
    float scale, DevMem2D_<cufftComplex> c)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x < c.cols && y < c.rows)
    {
        cufftComplex v = cuCmulf(a.ptr(y)[x], b.ptr(y)[x]);
        c.ptr(y)[x] = make_cuFloatComplex(cuCrealf(v) * scale, cuCimagf(v) * scale);
    }
}
// Host wrapper for c = scale * (a * b).
// NOTE(review): 'dim3 threads(256);' restored from the truncated listing.
void mulAndScaleSpectrums(const PtrStep_<cufftComplex> a, const PtrStep_<cufftComplex> b,
                          float scale, DevMem2D_<cufftComplex> c)
{
    dim3 threads(256);
    dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));

    mulAndScaleSpectrumsKernel<<<grid, threads>>>(a, b, scale, c);
    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall( cudaDeviceSynchronize() );
}
757 //////////////////////////////////////////////////////////////////////////
\r
758 // mulAndScaleSpectrums_CONJ
\r
// Element-wise conjugate multiply with scaling: c = scale * (a * conj(b)).
__global__ void mulAndScaleSpectrumsKernel_CONJ(
    const PtrStep_<cufftComplex> a, const PtrStep_<cufftComplex> b,
    float scale, DevMem2D_<cufftComplex> c)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x < c.cols && y < c.rows)
    {
        cufftComplex v = cuCmulf(a.ptr(y)[x], cuConjf(b.ptr(y)[x]));
        c.ptr(y)[x] = make_cuFloatComplex(cuCrealf(v) * scale, cuCimagf(v) * scale);
    }
}
// Host wrapper for c = scale * (a * conj(b)).
// NOTE(review): 'dim3 threads(256);' restored from the truncated listing.
void mulAndScaleSpectrums_CONJ(const PtrStep_<cufftComplex> a, const PtrStep_<cufftComplex> b,
                               float scale, DevMem2D_<cufftComplex> c)
{
    dim3 threads(256);
    dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));

    mulAndScaleSpectrumsKernel_CONJ<<<grid, threads>>>(a, b, scale, c);
    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall( cudaDeviceSynchronize() );
}
787 //////////////////////////////////////////////////////////////////////////
\r
790 // TODO use intrinsics like __sinf and so on
\r
// Constant-memory parameters shared by the warp-map builders:
//   ck_rinv - K * R^-1 (camera intrinsics times inverse rotation), row-major
//   cr_kinv - R * K^-1, row-major
//   ct      - translation vector
//   cscale  - projection scale
namespace build_warp_maps
{
    __constant__ float ck_rinv[9];
    __constant__ float cr_kinv[9];
    __constant__ float ct[3];
    __constant__ float cscale;
}
// Back-maps a planar-projection point (u, v) to source image coordinates
// (x, y) using ck_rinv / ct / cscale from build_warp_maps.
// NOTE(review): the class wrapper, the local 'float z;' and the final
// perspective divide were missing from the truncated listing and were
// restored from the visible x/y/z assignments — confirm against upstream.
class PlaneMapper
{
public:
    static __device__ __forceinline__ void mapBackward(float u, float v, float &x, float &y)
    {
        using namespace build_warp_maps;

        float x_ = u / cscale - ct[0];
        float y_ = v / cscale - ct[1];

        float z;
        x = ck_rinv[0] * x_ + ck_rinv[1] * y_ + ck_rinv[2] * (1 - ct[2]);
        y = ck_rinv[3] * x_ + ck_rinv[4] * y_ + ck_rinv[5] * (1 - ct[2]);
        z = ck_rinv[6] * x_ + ck_rinv[7] * y_ + ck_rinv[8] * (1 - ct[2]);

        x /= z;
        y /= z;
    }
};
// Back-maps a cylindrical-projection point (u, v) to source image
// coordinates: the 3D ray (sin u, v/scale, cos u) is rotated by ck_rinv
// and projected by the perspective divide.
// NOTE(review): 'u /= cscale;', the local 'float z;' and the final divide
// were restored from the truncated listing — confirm against upstream.
class CylindricalMapper
{
public:
    static __device__ __forceinline__ void mapBackward(float u, float v, float &x, float &y)
    {
        using namespace build_warp_maps;

        u /= cscale;
        float x_ = sinf(u);
        float y_ = v / cscale;
        float z_ = cosf(u);

        float z;
        x = ck_rinv[0] * x_ + ck_rinv[1] * y_ + ck_rinv[2] * z_;
        y = ck_rinv[3] * x_ + ck_rinv[4] * y_ + ck_rinv[5] * z_;
        z = ck_rinv[6] * x_ + ck_rinv[7] * y_ + ck_rinv[8] * z_;

        x /= z;
        y /= z;
    }
};
// Back-maps a spherical-projection point (u, v) to source image
// coordinates: (u, v) are scaled angles, converted to a unit 3D ray,
// rotated by ck_rinv and projected.
// NOTE(review): the 'u /= cscale; v /= cscale;' lines, local 'float z;'
// and final divide were restored from the truncated listing — confirm
// against upstream.
class SphericalMapper
{
public:
    static __device__ __forceinline__ void mapBackward(float u, float v, float &x, float &y)
    {
        using namespace build_warp_maps;

        v /= cscale;
        u /= cscale;

        float sinv = sinf(v);
        float x_ = sinv * sinf(u);
        float y_ = -cosf(v);
        float z_ = sinv * cosf(u);

        float z;
        x = ck_rinv[0] * x_ + ck_rinv[1] * y_ + ck_rinv[2] * z_;
        y = ck_rinv[3] * x_ + ck_rinv[4] * y_ + ck_rinv[5] * z_;
        z = ck_rinv[6] * x_ + ck_rinv[7] * y_ + ck_rinv[8] * z_;

        x /= z;
        y /= z;
    }
};
// Fills the remap tables map_x / map_y for the tile whose top-left corner
// is (tl_u, tl_v); Mapper supplies the projection-specific back-mapping.
template <typename Mapper>
__global__ void buildWarpMapsKernel(int tl_u, int tl_v, int cols, int rows,
                                    PtrStepf map_x, PtrStepf map_y)
{
    int du = blockIdx.x * blockDim.x + threadIdx.x;
    int dv = blockIdx.y * blockDim.y + threadIdx.y;
    if (du < cols && dv < rows)
    {
        float u = tl_u + du;
        float v = tl_v + dv;
        float x, y;
        Mapper::mapBackward(u, v, x, y);
        map_x.ptr(dv)[du] = x;
        map_y.ptr(dv)[du] = y;
    }
}
// Uploads the plane-warp parameters to constant memory and builds the
// remap tables for a planar projection.
// NOTE(review): the visible launch does not use 'stream' and the sync is
// unconditional — the parameter appears accepted but unused; confirm
// against upstream before relying on stream semantics.
void buildWarpPlaneMaps(int tl_u, int tl_v, DevMem2Df map_x, DevMem2Df map_y,
                        const float k_rinv[9], const float r_kinv[9], const float t[3],
                        float scale, cudaStream_t stream)
{
    cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::ck_rinv, k_rinv, 9*sizeof(float)));
    cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::cr_kinv, r_kinv, 9*sizeof(float)));
    cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::ct, t, 3*sizeof(float)));
    cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::cscale, &scale, sizeof(float)));

    int cols = map_x.cols;
    int rows = map_x.rows;

    dim3 threads(32, 8);
    dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));

    buildWarpMapsKernel<PlaneMapper><<<grid,threads>>>(tl_u, tl_v, cols, rows, map_x, map_y);
    cudaSafeCall(cudaGetLastError());

    cudaSafeCall(cudaDeviceSynchronize());
}
// Uploads the cylindrical-warp parameters and builds the remap tables.
// NOTE(review): as with buildWarpPlaneMaps, 'stream' is accepted but the
// visible launch/sync do not use it — confirm against upstream.
void buildWarpCylindricalMaps(int tl_u, int tl_v, DevMem2Df map_x, DevMem2Df map_y,
                              const float k_rinv[9], const float r_kinv[9], float scale,
                              cudaStream_t stream)
{
    cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::ck_rinv, k_rinv, 9*sizeof(float)));
    cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::cr_kinv, r_kinv, 9*sizeof(float)));
    cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::cscale, &scale, sizeof(float)));

    int cols = map_x.cols;
    int rows = map_x.rows;

    dim3 threads(32, 8);
    dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));

    buildWarpMapsKernel<CylindricalMapper><<<grid,threads>>>(tl_u, tl_v, cols, rows, map_x, map_y);
    cudaSafeCall(cudaGetLastError());

    cudaSafeCall(cudaDeviceSynchronize());
}
// Uploads the spherical-warp parameters and builds the remap tables.
// NOTE(review): 'stream' is accepted but unused by the visible launch/sync
// — confirm against upstream.
void buildWarpSphericalMaps(int tl_u, int tl_v, DevMem2Df map_x, DevMem2Df map_y,
                            const float k_rinv[9], const float r_kinv[9], float scale,
                            cudaStream_t stream)
{
    cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::ck_rinv, k_rinv, 9*sizeof(float)));
    cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::cr_kinv, r_kinv, 9*sizeof(float)));
    cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::cscale, &scale, sizeof(float)));

    int cols = map_x.cols;
    int rows = map_x.rows;

    dim3 threads(32, 8);
    dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));

    buildWarpMapsKernel<SphericalMapper><<<grid,threads>>>(tl_u, tl_v, cols, rows, map_x, map_y);
    cudaSafeCall(cudaGetLastError());

    cudaSafeCall(cudaDeviceSynchronize());
}
954 //////////////////////////////////////////////////////////////////////////
\r
957 #define CONVOLVE_MAX_KERNEL_SIZE 17
\r
959 __constant__ float c_convolveKernel[CONVOLVE_MAX_KERNEL_SIZE * CONVOLVE_MAX_KERNEL_SIZE];
\r
// 2D convolution of a 32F image with a kernel held in constant memory
// (c_convolveKernel, up to CONVOLVE_MAX_KERNEL_SIZE^2). Requires a 16x16
// block; each block stages a 32x32 tile (16x16 pixels + 8-pixel apron on
// every side) into shared memory with clamp-to-edge addressing.
// NOTE(review): the '__syncthreads()' between the staged loads and the
// accumulation, and 'float res = 0.f;', were missing from the truncated
// listing and restored — the barrier is mandatory before reading smem.
__global__ void convolve(const DevMem2Df src, PtrStepf dst, int kWidth, int kHeight)
{
    __shared__ float smem[16 + 2 * 8][16 + 2 * 8];

    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    // Stage the four 16x16 quadrants of the 32x32 tile, clamping reads
    // to the image borders.
    smem[threadIdx.y][threadIdx.x] = src.ptr(min(max(y - 8, 0), src.rows - 1))[min(max(x - 8, 0), src.cols - 1)];
    smem[threadIdx.y][threadIdx.x + 16] = src.ptr(min(max(y - 8, 0), src.rows - 1))[min(x + 8, src.cols - 1)];
    smem[threadIdx.y + 16][threadIdx.x] = src.ptr(min(y + 8, src.rows - 1))[min(max(x - 8, 0), src.cols - 1)];
    smem[threadIdx.y + 16][threadIdx.x + 16] = src.ptr(min(y + 8, src.rows - 1))[min(x + 8, src.cols - 1)];

    __syncthreads();

    if (x < src.cols && y < src.rows)
    {
        float res = 0.f;

        for (int i = 0; i < kHeight; ++i)
        {
            for (int j = 0; j < kWidth; ++j)
            {
                res += smem[threadIdx.y + 8 - kHeight / 2 + i][threadIdx.x + 8 - kWidth / 2 + j] * c_convolveKernel[i * kWidth + j];
            }
        }

        dst.ptr(y)[x] = res;
    }
}
// Host wrapper: copies the (device-resident) kernel into constant memory
// and launches the tiled convolution with the required 16x16 block.
void convolve_gpu(const DevMem2Df& src, const PtrStepf& dst, int kWidth, int kHeight, float* kernel)
{
    cudaSafeCall(cudaMemcpyToSymbol(c_convolveKernel, kernel, kWidth * kHeight * sizeof(float), 0, cudaMemcpyDeviceToDevice) );

    const dim3 block(16, 16);
    const dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));

    convolve<<<grid, block>>>(src, dst, kWidth, kHeight);
    cudaSafeCall(cudaGetLastError());

    cudaSafeCall(cudaDeviceSynchronize());
}