From d661b8e3e2add0623bc49f0cf50492646f2e2879 Mon Sep 17 00:00:00 2001
From: Anatoly Baksheev
Date: Sun, 31 Oct 2010 13:23:25 +0000
Subject: [PATCH] added PtrStep and PtrElemStep structures; refactored namespaces

---
 modules/gpu/include/opencv2/gpu/devmem2d.hpp | 89 +++++----
 modules/gpu/include/opencv2/gpu/gpu.hpp | 1 +
 .../gpu/include/opencv2/gpu/matrix_operations.hpp | 1 +
 modules/gpu/src/cuda/beliefpropagation.cu | 61 ++----
 modules/gpu/src/cuda/color.cu | 160 ++++++----------
 modules/gpu/src/cuda/constantspacebp.cu | 126 +++++--------
 modules/gpu/src/cuda/cuda_shared.hpp | 4 +-
 modules/gpu/src/cuda/filters.cu | 24 +--
 modules/gpu/src/cuda/imgproc.cu | 57 ++----
 modules/gpu/src/cuda/mathfunc.cu | 49 ++---
 modules/gpu/src/cuda/matrix_operations.cu | 74 ++------
 modules/gpu/src/cuda/split_merge.cu | 48 ++---
 modules/gpu/src/cuda/stereobm.cu | 205 +++++++++------------
 modules/gpu/src/cuda/transform.hpp | 34 ++--
 modules/gpu/src/cuda/vecmath.hpp | 9 +-
 modules/gpu/src/filtering.cpp | 8 +-
 modules/gpu/src/stereobm_gpu.cpp | 8 +-
 17 files changed, 384 insertions(+), 574 deletions(-)

diff --git a/modules/gpu/include/opencv2/gpu/devmem2d.hpp b/modules/gpu/include/opencv2/gpu/devmem2d.hpp
index f3293c0..4ab8e33 100644
--- a/modules/gpu/include/opencv2/gpu/devmem2d.hpp
+++ b/modules/gpu/include/opencv2/gpu/devmem2d.hpp
@@ -50,56 +50,79 @@ namespace cv
     // Simple lightweight structures that encapsulate information about an image on device.
     // They are intended to be passed to nvcc-compiled code. GpuMat depends on headers that nvcc can't compile.

-    template <typename T> struct PtrStep_
-    {
-        T* ptr;
-        size_t step;
-
-        typedef T elem_type;
-        enum { elem_size = sizeof(elem_type) };
-
-#if defined(__CUDACC__)
-        __host__ __device__
-#endif
-        size_t elemSize() const { return elem_size; }
-    };
-
+#if defined(__CUDACC__)
+    #define __CV_GPU_HOST_DEVICE__ __host__ __device__
+#else
+    #define __CV_GPU_HOST_DEVICE__
+#endif

     template <typename T> struct DevMem2D_
     {
         int cols;
         int rows;
-        T* ptr;
+        T* data;
         size_t step;
-        size_t elem_step;
-
-        /*__host__*/
-        DevMem2D_() : cols(0), rows(0), ptr(0), step(0), elem_step(0) {}
-
-        /*__host__*/
-        DevMem2D_(int rows_, int cols_, T *ptr_, size_t step_)
-            : cols(cols_), rows(rows_), ptr(ptr_), step(step_), elem_step(step_ / sizeof(T)) {}
-
-        template <typename U>
-        /*__host__*/
-        explicit DevMem2D_(const DevMem2D_<U>& d)
-            : cols(d.cols), rows(d.rows), ptr((T*)d.ptr), step(d.step), elem_step(d.step / sizeof(T)) {}
-
-        template <typename U>
-        /*__host__*/
-        operator PtrStep_<U>() const { PtrStep_<U> dt; dt.ptr = ptr; dt.step = step; return dt; }
-
-        typedef typename PtrStep_<T>::elem_type elem_type;
-        enum { elem_size = PtrStep_<T>::elem_size };
-#if defined(__CUDACC__)
-        __host__ __device__
-#endif
-        size_t elemSize() const { return elem_size; }
+        DevMem2D_() : cols(0), rows(0), data(0), step(0) {}
+
+        DevMem2D_(int rows_, int cols_, T *data_, size_t step_)
+            : cols(cols_), rows(rows_), data(data_), step(step_) {}
+
+        template <typename U>
+        explicit DevMem2D_(const DevMem2D_<U>& d)
+            : cols(d.cols), rows(d.rows), data((T*)d.data), step(d.step) {}
+
+        typedef T elem_type;
+        enum { elem_size = sizeof(elem_type) };
+
+        __CV_GPU_HOST_DEVICE__ size_t elemSize() const { return elem_size; }
+        __CV_GPU_HOST_DEVICE__ T* ptr(int y = 0) { return (T*)( (char*)data + y * step ); }
+        __CV_GPU_HOST_DEVICE__ const T* ptr(int y = 0) const { return (const T*)( (const char*)data + y * step ); }
+    };
+
+    template <typename T> struct PtrStep_
+    {
+        T* data;
+        size_t step;
+
+        PtrStep_() : data(0), step(0) {}
+        PtrStep_(const DevMem2D_<T>& mem) : data(mem.data), step(mem.step) {}
+
+        typedef T elem_type;
+        enum { elem_size = sizeof(elem_type) };
+
+        __CV_GPU_HOST_DEVICE__ size_t elemSize() const { return elem_size; }
+        __CV_GPU_HOST_DEVICE__ T* ptr(int y = 0) { return (T*)( (char*)data + y * step ); }
+        __CV_GPU_HOST_DEVICE__ const T* ptr(int y = 0) const { return (const T*)( (const char*)data + y * step ); }
+    };
+
+    template <typename T> struct PtrElemStep_ : public PtrStep_<T>
+    {
+        PtrElemStep_(const DevMem2D_<T>& mem) : PtrStep_<T>(mem)
+        {
+            step /= elem_size;
+        }
+    private:
+        template <bool ok> struct StaticCheck;
+        template <> struct StaticCheck<true>{};
+
+        StaticCheck<256 % sizeof(T) == 0> ElemStepTypeCheck;
+    };

     typedef DevMem2D_<unsigned char> DevMem2D;
     typedef DevMem2D_<float> DevMem2Df;
     typedef DevMem2D_<int> DevMem2Di;
-    }
+
+    typedef PtrStep_<unsigned char> PtrStep;
+    typedef PtrStep_<float> PtrStepf;
+    typedef PtrStep_<int> PtrStepi;
+
+    typedef PtrElemStep_<unsigned char> PtrElemStep;
+    typedef PtrElemStep_<float> PtrElemStepf;
+    typedef PtrElemStep_<int> PtrElemStepi;
+
+#undef __CV_GPU_HOST_DEVICE__
+    }
 }

 #endif /* __OPENCV_GPU_DEVMEM2D_HPP__ */
diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp
index 6640290..2a745c0 100644
--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@@ -109,6 +109,7 @@ namespace cv
         //! returns lightweight DevMem2D_ structure for passing to nvcc-compiled code.
         //  Contains just image size, data ptr and step.
         template <class T> operator DevMem2D_<T>() const;
+        template <class T> operator PtrStep_<T>() const;

         //! performs blocking upload of data to GpuMat.
         void upload(const cv::Mat& m);
diff --git a/modules/gpu/include/opencv2/gpu/matrix_operations.hpp b/modules/gpu/include/opencv2/gpu/matrix_operations.hpp
index 0865807..e576d00 100644
--- a/modules/gpu/include/opencv2/gpu/matrix_operations.hpp
+++ b/modules/gpu/include/opencv2/gpu/matrix_operations.hpp
@@ -207,6 +207,7 @@ inline GpuMat& GpuMat::operator = (const GpuMat& m)
 inline GpuMat& GpuMat::operator = (const Mat& m) { upload(m); return *this; }

 template <class T> inline GpuMat::operator DevMem2D_<T>() const { return DevMem2D_<T>(rows, cols, (T*)data, step); }
+template <class T> inline GpuMat::operator PtrStep_<T>() const { return PtrStep_<T>(*this); }

 //CPP: void GpuMat::upload(const Mat& m);
diff --git a/modules/gpu/src/cuda/beliefpropagation.cu b/modules/gpu/src/cuda/beliefpropagation.cu
index 063a9ad..5ec9e87 100644
--- a/modules/gpu/src/cuda/beliefpropagation.cu
+++ b/modules/gpu/src/cuda/beliefpropagation.cu
@@ -50,36 +50,32 @@ using namespace cv::gpu;
 #define FLT_MAX 3.402823466e+38F
 #endif

+namespace cv { namespace gpu { namespace bp {
+
 ///////////////////////////////////////////////////////////////
 /////////////////////// load constants ////////////////////////
 ///////////////////////////////////////////////////////////////

-namespace bp_kernels
-{
     __constant__ int cndisp;
     __constant__ float cmax_data_term;
     __constant__ float cdata_weight;
     __constant__ float cmax_disc_term;
     __constant__ float cdisc_single_jump;
-};

-namespace cv { namespace gpu { namespace bp {
     void load_constants(int ndisp, float max_data_term, float data_weight, float max_disc_term, float disc_single_jump)
     {
-        cudaSafeCall( cudaMemcpyToSymbol(bp_kernels::cndisp, &ndisp, sizeof(int )) );
-        cudaSafeCall( cudaMemcpyToSymbol(bp_kernels::cmax_data_term, &max_data_term, sizeof(float)) );
-        cudaSafeCall( cudaMemcpyToSymbol(bp_kernels::cdata_weight, &data_weight, sizeof(float)) );
-        cudaSafeCall( cudaMemcpyToSymbol(bp_kernels::cmax_disc_term, &max_disc_term, sizeof(float)) );
-        cudaSafeCall( cudaMemcpyToSymbol(bp_kernels::cdisc_single_jump, &disc_single_jump, sizeof(float)) );
+        cudaSafeCall( cudaMemcpyToSymbol(cndisp, &ndisp, sizeof(int )) );
+        cudaSafeCall( 
cudaMemcpyToSymbol(cmax_data_term, &max_data_term, sizeof(float)) ); + cudaSafeCall( cudaMemcpyToSymbol(cdata_weight, &data_weight, sizeof(float)) ); + cudaSafeCall( cudaMemcpyToSymbol(cmax_disc_term, &max_disc_term, sizeof(float)) ); + cudaSafeCall( cudaMemcpyToSymbol(cdisc_single_jump, &disc_single_jump, sizeof(float)) ); } -}}} /////////////////////////////////////////////////////////////// ////////////////////////// comp data ////////////////////////// /////////////////////////////////////////////////////////////// -namespace bp_kernels -{ + template __global__ void comp_data_gray(const uchar* l, const uchar* r, size_t step, T* data, size_t data_step, int cols, int rows) { @@ -145,9 +141,7 @@ namespace bp_kernels } } } -} -namespace cv { namespace gpu { namespace bp { typedef void (*CompDataFunc)(const DevMem2D& l, const DevMem2D& r, int channels, DevMem2D mdata, const cudaStream_t& stream); template @@ -160,9 +154,9 @@ namespace cv { namespace gpu { namespace bp { grid.y = divUp(l.rows, threads.y); if (channels == 1) - bp_kernels::comp_data_gray<<>>(l.ptr, r.ptr, l.step, (T*)mdata.ptr, mdata.step/sizeof(T), l.cols, l.rows); + comp_data_gray<<>>(l.data, r.data, l.step, (T*)mdata.data, mdata.step/sizeof(T), l.cols, l.rows); else - bp_kernels::comp_data_bgr<<>>(l.ptr, r.ptr, l.step, (T*)mdata.ptr, mdata.step/sizeof(T), l.cols, l.rows); + comp_data_bgr<<>>(l.data, r.data, l.step, (T*)mdata.data, mdata.step/sizeof(T), l.cols, l.rows); if (stream == 0) cudaSafeCall( cudaThreadSynchronize() ); @@ -187,14 +181,11 @@ namespace cv { namespace gpu { namespace bp { cv::gpu::error("Unsupported message type", __FILE__, __LINE__); func(l, r, channels, mdata, stream); } -}}} /////////////////////////////////////////////////////////////// //////////////////////// data step down /////////////////////// /////////////////////////////////////////////////////////////// -namespace bp_kernels -{ template __global__ void data_step_down(int dst_cols, int dst_rows, int src_rows, const T* src, size_t src_step, T* dst, size_t dst_step) { @@ -217,9 +208,7 @@ namespace bp_kernels } } } -} -namespace cv { namespace gpu { namespace bp { typedef void (*DataStepDownFunc)(int dst_cols, int dst_rows, int src_rows, const DevMem2D& src, DevMem2D dst, const cudaStream_t& stream); template @@ -231,7 +220,7 @@ namespace cv { namespace gpu { namespace bp { grid.x = divUp(dst_cols, threads.x); grid.y = divUp(dst_rows, threads.y); - bp_kernels::data_step_down<<>>(dst_cols, dst_rows, src_rows, (const T*)src.ptr, src.step/sizeof(T), (T*)dst.ptr, dst.step/sizeof(T)); + data_step_down<<>>(dst_cols, dst_rows, src_rows, (const T*)src.data, src.step/sizeof(T), (T*)dst.data, dst.step/sizeof(T)); if (stream == 0) cudaSafeCall( cudaThreadSynchronize() ); @@ -256,14 +245,11 @@ namespace cv { namespace gpu { namespace bp { cv::gpu::error("Unsupported message type", __FILE__, __LINE__); func(dst_cols, dst_rows, src_rows, src, dst, stream); } -}}} /////////////////////////////////////////////////////////////// /////////////////// level up messages //////////////////////// /////////////////////////////////////////////////////////////// -namespace bp_kernels -{ template __global__ void level_up_message(int dst_cols, int dst_rows, int src_rows, const T* src, size_t src_step, T* dst, size_t dst_step) { @@ -282,9 +268,7 @@ namespace bp_kernels dstr[d * dst_disp_step] = srcr[d * src_disp_step]; } } -} -namespace cv { namespace gpu { namespace bp { typedef void (*LevelUpMessagesFunc)(int dst_idx, int dst_cols, int dst_rows, int src_rows, DevMem2D* 
mus, DevMem2D* mds, DevMem2D* mls, DevMem2D* mrs, const cudaStream_t& stream); template @@ -298,10 +282,10 @@ namespace cv { namespace gpu { namespace bp { int src_idx = (dst_idx + 1) & 1; - bp_kernels::level_up_message<<>>(dst_cols, dst_rows, src_rows, (const T*)mus[src_idx].ptr, mus[src_idx].step/sizeof(T), (T*)mus[dst_idx].ptr, mus[dst_idx].step/sizeof(T)); - bp_kernels::level_up_message<<>>(dst_cols, dst_rows, src_rows, (const T*)mds[src_idx].ptr, mds[src_idx].step/sizeof(T), (T*)mds[dst_idx].ptr, mds[dst_idx].step/sizeof(T)); - bp_kernels::level_up_message<<>>(dst_cols, dst_rows, src_rows, (const T*)mls[src_idx].ptr, mls[src_idx].step/sizeof(T), (T*)mls[dst_idx].ptr, mls[dst_idx].step/sizeof(T)); - bp_kernels::level_up_message<<>>(dst_cols, dst_rows, src_rows, (const T*)mrs[src_idx].ptr, mrs[src_idx].step/sizeof(T), (T*)mrs[dst_idx].ptr, mrs[dst_idx].step/sizeof(T)); + level_up_message<<>>(dst_cols, dst_rows, src_rows, (const T*)mus[src_idx].data, mus[src_idx].step/sizeof(T), (T*)mus[dst_idx].data, mus[dst_idx].step/sizeof(T)); + level_up_message<<>>(dst_cols, dst_rows, src_rows, (const T*)mds[src_idx].data, mds[src_idx].step/sizeof(T), (T*)mds[dst_idx].data, mds[dst_idx].step/sizeof(T)); + level_up_message<<>>(dst_cols, dst_rows, src_rows, (const T*)mls[src_idx].data, mls[src_idx].step/sizeof(T), (T*)mls[dst_idx].data, mls[dst_idx].step/sizeof(T)); + level_up_message<<>>(dst_cols, dst_rows, src_rows, (const T*)mrs[src_idx].data, mrs[src_idx].step/sizeof(T), (T*)mrs[dst_idx].data, mrs[dst_idx].step/sizeof(T)); if (stream == 0) cudaSafeCall( cudaThreadSynchronize() ); @@ -326,14 +310,11 @@ namespace cv { namespace gpu { namespace bp { cv::gpu::error("Unsupported message type", __FILE__, __LINE__); func(dst_idx, dst_cols, dst_rows, src_rows, mus, mds, mls, mrs, stream); } -}}} /////////////////////////////////////////////////////////////// //////////////////// calc all iterations ///////////////////// /////////////////////////////////////////////////////////////// -namespace bp_kernels -{ template __device__ void calc_min_linear_penalty(T* dst, size_t step) { @@ -427,9 +408,7 @@ namespace bp_kernels message(us + msg_step, ds - msg_step, ls + 1, dt, ls, msg_disp_step, data_disp_step); } } -} -namespace cv { namespace gpu { namespace bp { typedef void (*CalcAllIterationFunc)(int cols, int rows, int iters, DevMem2D& u, DevMem2D& d, DevMem2D& l, DevMem2D& r, const DevMem2D& data, const cudaStream_t& stream); template @@ -443,7 +422,7 @@ namespace cv { namespace gpu { namespace bp { for(int t = 0; t < iters; ++t) { - bp_kernels::one_iteration<<>>(t, (T*)u.ptr, (T*)d.ptr, (T*)l.ptr, (T*)r.ptr, u.step/sizeof(T), (const T*)data.ptr, data.step/sizeof(T), cols, rows); + one_iteration<<>>(t, (T*)u.data, (T*)d.data, (T*)l.data, (T*)r.data, u.step/sizeof(T), (const T*)data.data, data.step/sizeof(T), cols, rows); if (stream == 0) cudaSafeCall( cudaThreadSynchronize() ); @@ -469,14 +448,11 @@ namespace cv { namespace gpu { namespace bp { cv::gpu::error("Unsupported message type", __FILE__, __LINE__); func(cols, rows, iters, u, d, l, r, data, stream); } -}}} /////////////////////////////////////////////////////////////// /////////////////////////// output //////////////////////////// /////////////////////////////////////////////////////////////// -namespace bp_kernels -{ template __global__ void output(int cols, int rows, const T* u, const T* d, const T* l, const T* r, const T* data, size_t step, short* disp, size_t res_step) { @@ -513,9 +489,7 @@ namespace bp_kernels disp[res_step * y + x] = 
saturate_cast(best); } } -} -namespace cv { namespace gpu { namespace bp { typedef void (*OutputFunc)(const DevMem2D& u, const DevMem2D& d, const DevMem2D& l, const DevMem2D& r, const DevMem2D& data, DevMem2D disp, const cudaStream_t& stream); template @@ -527,7 +501,7 @@ namespace cv { namespace gpu { namespace bp { grid.x = divUp(disp.cols, threads.x); grid.y = divUp(disp.rows, threads.y); - bp_kernels::output<<>>(disp.cols, disp.rows, (const T*)u.ptr, (const T*)d.ptr, (const T*)l.ptr, (const T*)r.ptr, (const T*)data.ptr, u.step/sizeof(T), (short*)disp.ptr, disp.step/sizeof(short)); + output<<>>(disp.cols, disp.rows, (const T*)u.data, (const T*)d.data, (const T*)l.data, (const T*)r.data, (const T*)data.data, u.step/sizeof(T), (short*)disp.data, disp.step/sizeof(short)); if (stream == 0) cudaSafeCall( cudaThreadSynchronize() ); @@ -552,4 +526,5 @@ namespace cv { namespace gpu { namespace bp { cv::gpu::error("Unsupported message type", __FILE__, __LINE__); func(u, d, l, r, data, disp, stream); } + }}} \ No newline at end of file diff --git a/modules/gpu/src/cuda/color.cu b/modules/gpu/src/cuda/color.cu index f368dd2..0ff8a17 100644 --- a/modules/gpu/src/cuda/color.cu +++ b/modules/gpu/src/cuda/color.cu @@ -54,7 +54,7 @@ using namespace cv::gpu; #define FLT_EPSILON 1.192092896e-07F #endif -namespace color_krnls +namespace cv { namespace gpu { namespace color { template struct ColorChannel {}; template<> struct ColorChannel @@ -95,12 +95,9 @@ namespace color_krnls { return vec.w; } -} ////////////////// Various 3/4-channel to 3/4-channel RGB transformations ///////////////// -namespace color_krnls -{ template __global__ void RGB2RGB(const uchar* src_, size_t src_step, uchar* dst_, size_t dst_step, int rows, int cols, int bidx) { @@ -123,10 +120,7 @@ namespace color_krnls *(dst_t*)(dst_ + y * dst_step + x * DSTCN * sizeof(T)) = dst; } } -} -namespace cv { namespace gpu { namespace color -{ template void RGB2RGB_caller(const DevMem2D& src, const DevMem2D& dst, int bidx, cudaStream_t stream) { @@ -136,8 +130,8 @@ namespace cv { namespace gpu { namespace color grid.x = divUp(src.cols, threads.x); grid.y = divUp(src.rows, threads.y); - color_krnls::RGB2RGB<<>>(src.ptr, src.step, - dst.ptr, dst.step, src.rows, src.cols, bidx); + RGB2RGB<<>>(src.data, src.step, + dst.data, dst.step, src.rows, src.cols, bidx); if (stream == 0) cudaSafeCall( cudaThreadSynchronize() ); @@ -178,12 +172,9 @@ namespace cv { namespace gpu { namespace color RGB2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, stream); } -}}} /////////// Transforming 16-bit (565 or 555) RGB to/from 24/32-bit (888[8]) RGB ////////// -namespace color_krnls -{ template struct RGB5x52RGBConverter {}; template struct RGB5x52RGBConverter<5, DSTCN> { @@ -272,10 +263,7 @@ namespace color_krnls *(ushort*)(dst_ + y * dst_step + (x << 1)) = RGB2RGB5x5Converter::cvt(&src.x, bidx); } } -} -namespace cv { namespace gpu { namespace color -{ template void RGB5x52RGB_caller(const DevMem2D& src, const DevMem2D& dst, int bidx, cudaStream_t stream) { @@ -285,8 +273,8 @@ namespace cv { namespace gpu { namespace color grid.x = divUp(src.cols, threads.x); grid.y = divUp(src.rows, threads.y); - color_krnls::RGB5x52RGB<<>>(src.ptr, src.step, - dst.ptr, dst.step, src.rows, src.cols, bidx); + RGB5x52RGB<<>>(src.data, src.step, + dst.data, dst.step, src.rows, src.cols, bidx); if (stream == 0) cudaSafeCall( cudaThreadSynchronize() ); @@ -313,8 +301,8 @@ namespace cv { namespace gpu { namespace color grid.x = divUp(src.cols, threads.x); grid.y = divUp(src.rows, threads.y); 
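The callers in these hunks all follow one launch pattern: a 2-D grid sized with divUp so every pixel is covered, the raw data/step fields of a DevMem2D handed to the kernel, and a blocking synchronization only when running on the default stream. A minimal sketch of that pattern, assuming the module's cuda_shared.hpp helpers (divUp, cudaSafeCall, uchar, DevMem2D); the kernel itself is illustrative and not part of this patch:

    __global__ void invert_8u(const uchar* src, size_t src_step, uchar* dst, size_t dst_step, int cols, int rows)
    {
        const int x = blockDim.x * blockIdx.x + threadIdx.x;
        const int y = blockDim.y * blockIdx.y + threadIdx.y;
        if (x < cols && y < rows)
            dst[y * dst_step + x] = 255 - src[y * src_step + x]; // step is in bytes, which indexes exactly for 1-byte elements
    }

    void invert_caller(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream)
    {
        dim3 threads(32, 8, 1);
        dim3 grid(divUp(src.cols, threads.x), divUp(src.rows, threads.y), 1);

        invert_8u<<<grid, threads, 0, stream>>>(src.data, src.step, dst.data, dst.step, src.cols, src.rows);

        if (stream == 0)
            cudaSafeCall( cudaThreadSynchronize() ); // block only on the default stream, as the callers in this patch do
    }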
- color_krnls::RGB2RGB5x5<<>>(src.ptr, src.step, - dst.ptr, dst.step, src.rows, src.cols, bidx); + RGB2RGB5x5<<>>(src.data, src.step, + dst.data, dst.step, src.rows, src.cols, bidx); if (stream == 0) cudaSafeCall( cudaThreadSynchronize() ); @@ -331,12 +319,9 @@ namespace cv { namespace gpu { namespace color RGB2RGB5x5_callers[srccn - 3][green_bits - 5](src, dst, bidx, stream); } -}}} ///////////////////////////////// Grayscale to Color //////////////////////////////// -namespace color_krnls -{ template __global__ void Gray2RGB(const uchar* src_, size_t src_step, uchar* dst_, size_t dst_step, int rows, int cols) { @@ -387,10 +372,7 @@ namespace color_krnls *(ushort*)(dst_ + y * dst_step + (x << 1)) = Gray2RGB5x5Converter::cvt(src); } } -} -namespace cv { namespace gpu { namespace color -{ template void Gray2RGB_caller(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream) { @@ -400,8 +382,8 @@ namespace cv { namespace gpu { namespace color grid.x = divUp(src.cols, threads.x); grid.y = divUp(src.rows, threads.y); - color_krnls::Gray2RGB<<>>(src.ptr, src.step, - dst.ptr, dst.step, src.rows, src.cols); + Gray2RGB<<>>(src.data, src.step, + dst.data, dst.step, src.rows, src.cols); if (stream == 0) cudaSafeCall( cudaThreadSynchronize() ); @@ -440,8 +422,8 @@ namespace cv { namespace gpu { namespace color grid.x = divUp(src.cols, threads.x); grid.y = divUp(src.rows, threads.y); - color_krnls::Gray2RGB5x5<<>>(src.ptr, src.step, - dst.ptr, dst.step, src.rows, src.cols); + Gray2RGB5x5<<>>(src.data, src.step, + dst.data, dst.step, src.rows, src.cols); if (stream == 0) cudaSafeCall( cudaThreadSynchronize() ); @@ -457,12 +439,9 @@ namespace cv { namespace gpu { namespace color Gray2RGB5x5_callers[green_bits - 5](src, dst, stream); } -}}} ///////////////////////////////// Color to Grayscale //////////////////////////////// -namespace color_krnls -{ #undef R2Y #undef G2Y #undef B2Y @@ -541,10 +520,7 @@ namespace color_krnls *(T*)(dst_ + y * dst_step + x * sizeof(T)) = RGB2GrayConvertor::cvt(&src.x, bidx); } } -} -namespace cv { namespace gpu { namespace color -{ template void RGB2Gray_caller(const DevMem2D& src, const DevMem2D& dst, int bidx, cudaStream_t stream) { @@ -554,8 +530,8 @@ namespace cv { namespace gpu { namespace color grid.x = divUp(src.cols, threads.x); grid.y = divUp(src.rows, threads.y); - color_krnls::RGB2Gray<<>>(src.ptr, src.step, - dst.ptr, dst.step, src.rows, src.cols, bidx); + RGB2Gray<<>>(src.data, src.step, + dst.data, dst.step, src.rows, src.cols, bidx); if (stream == 0) cudaSafeCall( cudaThreadSynchronize() ); @@ -594,8 +570,8 @@ namespace cv { namespace gpu { namespace color grid.x = divUp(src.cols, threads.x); grid.y = divUp(src.rows, threads.y); - color_krnls::RGB5x52Gray<<>>(src.ptr, src.step, - dst.ptr, dst.step, src.rows, src.cols); + RGB5x52Gray<<>>(src.data, src.step, + dst.data, dst.step, src.rows, src.cols); if (stream == 0) cudaSafeCall( cudaThreadSynchronize() ); @@ -611,12 +587,9 @@ namespace cv { namespace gpu { namespace color RGB5x52Gray_callers[green_bits - 5](src, dst, stream); } -}}} ///////////////////////////////////// RGB <-> YCrCb ////////////////////////////////////// -namespace color_krnls -{ __constant__ float cYCrCbCoeffs_f[5]; __constant__ int cYCrCbCoeffs_i[5]; @@ -712,10 +685,7 @@ namespace color_krnls *(dst_t*)(dst_ + y * dst_step + x * DSTCN * sizeof(T)) = dst; } } -} -namespace cv { namespace gpu { namespace color -{ template void RGB2YCrCb_caller(const DevMem2D& src, const DevMem2D& dst, int bidx, cudaStream_t stream) { @@ -725,8 +695,8 
@@ namespace cv { namespace gpu { namespace color grid.x = divUp(src.cols, threads.x); grid.y = divUp(src.rows, threads.y); - color_krnls::RGB2YCrCb<<>>(src.ptr, src.step, - dst.ptr, dst.step, src.rows, src.cols, bidx); + RGB2YCrCb<<>>(src.data, src.step, + dst.data, dst.step, src.rows, src.cols, bidx); if (stream == 0) cudaSafeCall( cudaThreadSynchronize() ); @@ -741,7 +711,7 @@ namespace cv { namespace gpu { namespace color {RGB2YCrCb_caller, RGB2YCrCb_caller} }; - cudaSafeCall( cudaMemcpyToSymbol(color_krnls::cYCrCbCoeffs_i, coeffs, 5 * sizeof(int)) ); + cudaSafeCall( cudaMemcpyToSymbol(cYCrCbCoeffs_i, coeffs, 5 * sizeof(int)) ); RGB2YCrCb_callers[srccn-3][dstcn-3](src, dst, bidx, stream); } @@ -755,7 +725,7 @@ namespace cv { namespace gpu { namespace color {RGB2YCrCb_caller, RGB2YCrCb_caller} }; - cudaSafeCall( cudaMemcpyToSymbol(color_krnls::cYCrCbCoeffs_i, coeffs, 5 * sizeof(int)) ); + cudaSafeCall( cudaMemcpyToSymbol(cYCrCbCoeffs_i, coeffs, 5 * sizeof(int)) ); RGB2YCrCb_callers[srccn-3][dstcn-3](src, dst, bidx, stream); } @@ -769,7 +739,7 @@ namespace cv { namespace gpu { namespace color {RGB2YCrCb_caller, RGB2YCrCb_caller} }; - cudaSafeCall( cudaMemcpyToSymbol(color_krnls::cYCrCbCoeffs_f, coeffs, 5 * sizeof(float)) ); + cudaSafeCall( cudaMemcpyToSymbol(cYCrCbCoeffs_f, coeffs, 5 * sizeof(float)) ); RGB2YCrCb_callers[srccn-3][dstcn-3](src, dst, bidx, stream); } @@ -783,8 +753,8 @@ namespace cv { namespace gpu { namespace color grid.x = divUp(src.cols, threads.x); grid.y = divUp(src.rows, threads.y); - color_krnls::YCrCb2RGB<<>>(src.ptr, src.step, - dst.ptr, dst.step, src.rows, src.cols, bidx); + YCrCb2RGB<<>>(src.data, src.step, + dst.data, dst.step, src.rows, src.cols, bidx); if (stream == 0) cudaSafeCall( cudaThreadSynchronize() ); @@ -799,7 +769,7 @@ namespace cv { namespace gpu { namespace color {YCrCb2RGB_caller, YCrCb2RGB_caller} }; - cudaSafeCall( cudaMemcpyToSymbol(color_krnls::cYCrCbCoeffs_i, coeffs, 4 * sizeof(int)) ); + cudaSafeCall( cudaMemcpyToSymbol(cYCrCbCoeffs_i, coeffs, 4 * sizeof(int)) ); YCrCb2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, stream); } @@ -813,7 +783,7 @@ namespace cv { namespace gpu { namespace color {YCrCb2RGB_caller, YCrCb2RGB_caller} }; - cudaSafeCall( cudaMemcpyToSymbol(color_krnls::cYCrCbCoeffs_i, coeffs, 4 * sizeof(int)) ); + cudaSafeCall( cudaMemcpyToSymbol(cYCrCbCoeffs_i, coeffs, 4 * sizeof(int)) ); YCrCb2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, stream); } @@ -827,16 +797,13 @@ namespace cv { namespace gpu { namespace color {YCrCb2RGB_caller, YCrCb2RGB_caller} }; - cudaSafeCall( cudaMemcpyToSymbol(color_krnls::cYCrCbCoeffs_f, coeffs, 4 * sizeof(float)) ); + cudaSafeCall( cudaMemcpyToSymbol(cYCrCbCoeffs_f, coeffs, 4 * sizeof(float)) ); YCrCb2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, stream); } -}}} ////////////////////////////////////// RGB <-> XYZ /////////////////////////////////////// -namespace color_krnls -{ __constant__ float cXYZ_D65f[9]; __constant__ int cXYZ_D65i[9]; @@ -922,10 +889,7 @@ namespace color_krnls *(dst_t*)(dst_ + y * dst_step + x * DSTCN * sizeof(T)) = dst; } } -} -namespace cv { namespace gpu { namespace color -{ template void RGB2XYZ_caller(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream) { @@ -935,8 +899,8 @@ namespace cv { namespace gpu { namespace color grid.x = divUp(src.cols, threads.x); grid.y = divUp(src.rows, threads.y); - color_krnls::RGB2XYZ<<>>(src.ptr, src.step, - dst.ptr, dst.step, src.rows, src.cols); + RGB2XYZ<<>>(src.data, src.step, + dst.data, dst.step, src.rows, src.cols); if 
(stream == 0) cudaSafeCall( cudaThreadSynchronize() ); @@ -951,7 +915,7 @@ namespace cv { namespace gpu { namespace color {RGB2XYZ_caller, RGB2XYZ_caller} }; - cudaSafeCall( cudaMemcpyToSymbol(color_krnls::cXYZ_D65i, coeffs, 9 * sizeof(int)) ); + cudaSafeCall( cudaMemcpyToSymbol(cXYZ_D65i, coeffs, 9 * sizeof(int)) ); RGB2XYZ_callers[srccn-3][dstcn-3](src, dst, stream); } @@ -965,7 +929,7 @@ namespace cv { namespace gpu { namespace color {RGB2XYZ_caller, RGB2XYZ_caller} }; - cudaSafeCall( cudaMemcpyToSymbol(color_krnls::cXYZ_D65i, coeffs, 9 * sizeof(int)) ); + cudaSafeCall( cudaMemcpyToSymbol(cXYZ_D65i, coeffs, 9 * sizeof(int)) ); RGB2XYZ_callers[srccn-3][dstcn-3](src, dst, stream); } @@ -979,7 +943,7 @@ namespace cv { namespace gpu { namespace color {RGB2XYZ_caller, RGB2XYZ_caller} }; - cudaSafeCall( cudaMemcpyToSymbol(color_krnls::cXYZ_D65f, coeffs, 9 * sizeof(float)) ); + cudaSafeCall( cudaMemcpyToSymbol(cXYZ_D65f, coeffs, 9 * sizeof(float)) ); RGB2XYZ_callers[srccn-3][dstcn-3](src, dst, stream); } @@ -993,8 +957,8 @@ namespace cv { namespace gpu { namespace color grid.x = divUp(src.cols, threads.x); grid.y = divUp(src.rows, threads.y); - color_krnls::XYZ2RGB<<>>(src.ptr, src.step, - dst.ptr, dst.step, src.rows, src.cols); + XYZ2RGB<<>>(src.data, src.step, + dst.data, dst.step, src.rows, src.cols); if (stream == 0) cudaSafeCall( cudaThreadSynchronize() ); @@ -1009,7 +973,7 @@ namespace cv { namespace gpu { namespace color {XYZ2RGB_caller, XYZ2RGB_caller} }; - cudaSafeCall( cudaMemcpyToSymbol(color_krnls::cXYZ_D65i, coeffs, 9 * sizeof(int)) ); + cudaSafeCall( cudaMemcpyToSymbol(cXYZ_D65i, coeffs, 9 * sizeof(int)) ); XYZ2RGB_callers[srccn-3][dstcn-3](src, dst, stream); } @@ -1023,7 +987,7 @@ namespace cv { namespace gpu { namespace color {XYZ2RGB_caller, XYZ2RGB_caller} }; - cudaSafeCall( cudaMemcpyToSymbol(color_krnls::cXYZ_D65i, coeffs, 9 * sizeof(int)) ); + cudaSafeCall( cudaMemcpyToSymbol(cXYZ_D65i, coeffs, 9 * sizeof(int)) ); XYZ2RGB_callers[srccn-3][dstcn-3](src, dst, stream); } @@ -1037,16 +1001,13 @@ namespace cv { namespace gpu { namespace color {XYZ2RGB_caller, XYZ2RGB_caller} }; - cudaSafeCall( cudaMemcpyToSymbol(color_krnls::cXYZ_D65f, coeffs, 9 * sizeof(float)) ); + cudaSafeCall( cudaMemcpyToSymbol(cXYZ_D65f, coeffs, 9 * sizeof(float)) ); XYZ2RGB_callers[srccn-3][dstcn-3](src, dst, stream); } -}}} ////////////////////////////////////// RGB <-> HSV /////////////////////////////////////// -namespace color_krnls -{ __constant__ int cHsvDivTable[256]; template struct RGB2HSVConvertor; @@ -1220,10 +1181,7 @@ namespace color_krnls *(dst_t*)(dst_ + y * dst_step + x * DSTCN * sizeof(T)) = dst; } } -} -namespace cv { namespace gpu { namespace color -{ template void RGB2HSV_caller(const DevMem2D& src, const DevMem2D& dst, int bidx, int hrange, cudaStream_t stream) { @@ -1234,11 +1192,11 @@ namespace cv { namespace gpu { namespace color grid.y = divUp(src.rows, threads.y); if (hrange == 180) - color_krnls::RGB2HSV<<>>(src.ptr, src.step, - dst.ptr, dst.step, src.rows, src.cols, bidx); + RGB2HSV<<>>(src.data, src.step, + dst.data, dst.step, src.rows, src.cols, bidx); else - color_krnls::RGB2HSV<<>>(src.ptr, src.step, - dst.ptr, dst.step, src.rows, src.cols, bidx); + RGB2HSV<<>>(src.data, src.step, + dst.data, dst.step, src.rows, src.cols, bidx); if (stream == 0) cudaSafeCall( cudaThreadSynchronize() ); @@ -1288,7 +1246,7 @@ namespace cv { namespace gpu { namespace color 4352, 4334, 4316, 4298, 4281, 4263, 4246, 4229, 4212, 4195, 4178, 4161, 4145, 4128, 4112, 4096 }; - cudaSafeCall( 
cudaMemcpyToSymbol(color_krnls::cHsvDivTable, div_table, sizeof(div_table)) ); + cudaSafeCall( cudaMemcpyToSymbol(cHsvDivTable, div_table, sizeof(div_table)) ); RGB2HSV_callers[srccn-3][dstcn-3](src, dst, bidx, hrange, stream); } @@ -1316,11 +1274,11 @@ namespace cv { namespace gpu { namespace color grid.y = divUp(src.rows, threads.y); if (hrange == 180) - color_krnls::HSV2RGB<<>>(src.ptr, src.step, - dst.ptr, dst.step, src.rows, src.cols, bidx); + HSV2RGB<<>>(src.data, src.step, + dst.data, dst.step, src.rows, src.cols, bidx); else - color_krnls::HSV2RGB<<>>(src.ptr, src.step, - dst.ptr, dst.step, src.rows, src.cols, bidx); + HSV2RGB<<>>(src.data, src.step, + dst.data, dst.step, src.rows, src.cols, bidx); if (stream == 0) cudaSafeCall( cudaThreadSynchronize() ); @@ -1338,7 +1296,7 @@ namespace cv { namespace gpu { namespace color static const int sector_data[][3] = {{1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0}}; - cudaSafeCall( cudaMemcpyToSymbol(color_krnls::cHsvSectorData, sector_data, sizeof(sector_data)) ); + cudaSafeCall( cudaMemcpyToSymbol(cHsvSectorData, sector_data, sizeof(sector_data)) ); HSV2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, hrange, stream); } @@ -1355,16 +1313,13 @@ namespace cv { namespace gpu { namespace color static const int sector_data[][3] = {{1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0}}; - cudaSafeCall( cudaMemcpyToSymbol(color_krnls::cHsvSectorData, sector_data, sizeof(sector_data)) ); + cudaSafeCall( cudaMemcpyToSymbol(cHsvSectorData, sector_data, sizeof(sector_data)) ); HSV2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, hrange, stream); } -}}} /////////////////////////////////////// RGB <-> HLS //////////////////////////////////////// -namespace color_krnls -{ template struct RGB2HLSConvertor; template struct RGB2HLSConvertor { @@ -1532,10 +1487,7 @@ namespace color_krnls *(dst_t*)(dst_ + y * dst_step + x * DSTCN * sizeof(T)) = dst; } } -} -namespace cv { namespace gpu { namespace color -{ template void RGB2HLS_caller(const DevMem2D& src, const DevMem2D& dst, int bidx, int hrange, cudaStream_t stream) { @@ -1546,11 +1498,11 @@ namespace cv { namespace gpu { namespace color grid.y = divUp(src.rows, threads.y); if (hrange == 180) - color_krnls::RGB2HLS<<>>(src.ptr, src.step, - dst.ptr, dst.step, src.rows, src.cols, bidx); + RGB2HLS<<>>(src.data, src.step, + dst.data, dst.step, src.rows, src.cols, bidx); else - color_krnls::RGB2HLS<<>>(src.ptr, src.step, - dst.ptr, dst.step, src.rows, src.cols, bidx); + RGB2HLS<<>>(src.data, src.step, + dst.data, dst.step, src.rows, src.cols, bidx); if (stream == 0) cudaSafeCall( cudaThreadSynchronize() ); @@ -1591,11 +1543,11 @@ namespace cv { namespace gpu { namespace color grid.y = divUp(src.rows, threads.y); if (hrange == 180) - color_krnls::HLS2RGB<<>>(src.ptr, src.step, - dst.ptr, dst.step, src.rows, src.cols, bidx); + HLS2RGB<<>>(src.data, src.step, + dst.data, dst.step, src.rows, src.cols, bidx); else - color_krnls::HLS2RGB<<>>(src.ptr, src.step, - dst.ptr, dst.step, src.rows, src.cols, bidx); + HLS2RGB<<>>(src.data, src.step, + dst.data, dst.step, src.rows, src.cols, bidx); if (stream == 0) cudaSafeCall( cudaThreadSynchronize() ); @@ -1613,7 +1565,7 @@ namespace cv { namespace gpu { namespace color static const int sector_data[][3]= {{1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0}}; - cudaSafeCall( cudaMemcpyToSymbol(color_krnls::cHlsSectorData, sector_data, sizeof(sector_data)) ); + cudaSafeCall( cudaMemcpyToSymbol(cHlsSectorData, sector_data, sizeof(sector_data)) ); 
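Each color-space section keeps its coefficient tables in __constant__ memory and uploads them with cudaMemcpyToSymbol before the first launch; now that kernels and callers share the cv::gpu::color namespace, the symbol can be named directly instead of through the old color_krnls:: qualifier. A minimal sketch of the upload step (the function name is illustrative; the symbol is the one declared in this file):

    __constant__ float cYCrCbCoeffs_f[5]; // visible to every kernel in this translation unit

    void loadYCrCbCoeffs(const float* coeffs)
    {
        // host -> device constant memory; kernels then read cYCrCbCoeffs_f without an extra parameter
        cudaSafeCall( cudaMemcpyToSymbol(cYCrCbCoeffs_f, coeffs, 5 * sizeof(float)) );
    }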
HLS2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, hrange, stream); } @@ -1630,7 +1582,7 @@ namespace cv { namespace gpu { namespace color static const int sector_data[][3]= {{1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0}}; - cudaSafeCall( cudaMemcpyToSymbol(color_krnls::cHlsSectorData, sector_data, sizeof(sector_data)) ); + cudaSafeCall( cudaMemcpyToSymbol(cHlsSectorData, sector_data, sizeof(sector_data)) ); HLS2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, hrange, stream); } diff --git a/modules/gpu/src/cuda/constantspacebp.cu b/modules/gpu/src/cuda/constantspacebp.cu index 0602834..7c9e91a 100644 --- a/modules/gpu/src/cuda/constantspacebp.cu +++ b/modules/gpu/src/cuda/constantspacebp.cu @@ -54,8 +54,9 @@ using namespace cv::gpu; #define SHRT_MAX 32767 #endif -namespace csbp_krnls +namespace cv { namespace gpu { namespace csbp { + template struct TypeLimits; template <> struct TypeLimits { @@ -65,14 +66,11 @@ namespace csbp_krnls { static __device__ float max() {return FLT_MAX;} }; -} /////////////////////////////////////////////////////////////// /////////////////////// load constants //////////////////////// /////////////////////////////////////////////////////////////// -namespace csbp_krnls -{ __constant__ int cndisp; __constant__ float cmax_data_term; @@ -91,36 +89,30 @@ namespace csbp_krnls __constant__ uchar* cleft; __constant__ uchar* cright; __constant__ uchar* ctemp; -} -namespace cv { namespace gpu { namespace csbp -{ + void load_constants(int ndisp, float max_data_term, float data_weight, float max_disc_term, float disc_single_jump, int min_disp_th, const DevMem2D& left, const DevMem2D& right, const DevMem2D& temp) { - cudaSafeCall( cudaMemcpyToSymbol(csbp_krnls::cndisp, &ndisp, sizeof(int)) ); + cudaSafeCall( cudaMemcpyToSymbol(cndisp, &ndisp, sizeof(int)) ); - cudaSafeCall( cudaMemcpyToSymbol(csbp_krnls::cmax_data_term, &max_data_term, sizeof(float)) ); - cudaSafeCall( cudaMemcpyToSymbol(csbp_krnls::cdata_weight, &data_weight, sizeof(float)) ); - cudaSafeCall( cudaMemcpyToSymbol(csbp_krnls::cmax_disc_term, &max_disc_term, sizeof(float)) ); - cudaSafeCall( cudaMemcpyToSymbol(csbp_krnls::cdisc_single_jump, &disc_single_jump, sizeof(float)) ); + cudaSafeCall( cudaMemcpyToSymbol(cdata_weight, &data_weight, sizeof(float)) ); + cudaSafeCall( cudaMemcpyToSymbol(cmax_disc_term, &max_disc_term, sizeof(float)) ); + cudaSafeCall( cudaMemcpyToSymbol(cdisc_single_jump, &disc_single_jump, sizeof(float)) ); - cudaSafeCall( cudaMemcpyToSymbol(csbp_krnls::cth, &min_disp_th, sizeof(int)) ); + cudaSafeCall( cudaMemcpyToSymbol(cth, &min_disp_th, sizeof(int)) ); - cudaSafeCall( cudaMemcpyToSymbol(csbp_krnls::cimg_step, &left.step, sizeof(size_t)) ); + cudaSafeCall( cudaMemcpyToSymbol(cimg_step, &left.step, sizeof(size_t)) ); - cudaSafeCall( cudaMemcpyToSymbol(csbp_krnls::cleft, &left.ptr, sizeof(left.ptr)) ); - cudaSafeCall( cudaMemcpyToSymbol(csbp_krnls::cright, &right.ptr, sizeof(right.ptr)) ); - cudaSafeCall( cudaMemcpyToSymbol(csbp_krnls::ctemp, &temp.ptr, sizeof(temp.ptr)) ); + cudaSafeCall( cudaMemcpyToSymbol(cleft, &left.data, sizeof(left.data)) ); + cudaSafeCall( cudaMemcpyToSymbol(cright, &right.data, sizeof(right.data)) ); + cudaSafeCall( cudaMemcpyToSymbol(ctemp, &temp.data, sizeof(temp.data)) ); } -}}} /////////////////////////////////////////////////////////////// /////////////////////// init data cost //////////////////////// /////////////////////////////////////////////////////////////// -namespace csbp_krnls -{ template struct DataCostPerPixel { @@ -334,10 +326,8 @@ 
namespace csbp_krnls data_cost[cdisp_step1 * d] = saturate_cast(dline[0]); } } -} -namespace cv { namespace gpu { namespace csbp -{ + template void init_data_cost_caller_(int /*rows*/, int /*cols*/, int h, int w, int level, int /*ndisp*/, int channels, cudaStream_t stream) { @@ -349,8 +339,8 @@ namespace cv { namespace gpu { namespace csbp switch (channels) { - case 1: csbp_krnls::init_data_cost<<>>(h, w, level); break; - case 3: csbp_krnls::init_data_cost<<>>(h, w, level); break; + case 1: init_data_cost<<>>(h, w, level); break; + case 3: init_data_cost<<>>(h, w, level); break; default: cv::gpu::error("Unsupported channels count", __FILE__, __LINE__); } } @@ -367,8 +357,8 @@ namespace cv { namespace gpu { namespace csbp switch (channels) { - case 1: csbp_krnls::init_data_cost_reduce<<>>(level, rows, cols, h); break; - case 3: csbp_krnls::init_data_cost_reduce<<>>(level, rows, cols, h); break; + case 1: init_data_cost_reduce<<>>(level, rows, cols, h); break; + case 3: init_data_cost_reduce<<>>(level, rows, cols, h); break; default: cv::gpu::error("Unsupported channels count", __FILE__, __LINE__); } } @@ -388,8 +378,8 @@ namespace cv { namespace gpu { namespace csbp }; size_t disp_step = msg_step * h; - cudaSafeCall( cudaMemcpyToSymbol(csbp_krnls::cdisp_step1, &disp_step, sizeof(size_t)) ); - cudaSafeCall( cudaMemcpyToSymbol(csbp_krnls::cmsg_step1, &msg_step, sizeof(size_t)) ); + cudaSafeCall( cudaMemcpyToSymbol(cdisp_step1, &disp_step, sizeof(size_t)) ); + cudaSafeCall( cudaMemcpyToSymbol(cmsg_step1, &msg_step, sizeof(size_t)) ); init_data_cost_callers[level](rows, cols, h, w, level, ndisp, channels, stream); if (stream == 0) @@ -402,9 +392,9 @@ namespace cv { namespace gpu { namespace csbp grid.y = divUp(h, threads.y); if (use_local_init_data_cost == true) - csbp_krnls::get_first_k_initial_local<<>> (data_cost_selected, disp_selected_pyr, h, w, nr_plane); + get_first_k_initial_local<<>> (data_cost_selected, disp_selected_pyr, h, w, nr_plane); else - csbp_krnls::get_first_k_initial_global<<>>(data_cost_selected, disp_selected_pyr, h, w, nr_plane); + get_first_k_initial_global<<>>(data_cost_selected, disp_selected_pyr, h, w, nr_plane); if (stream == 0) cudaSafeCall( cudaThreadSynchronize() ); } @@ -421,14 +411,10 @@ namespace cv { namespace gpu { namespace csbp init_data_cost_tmpl(rows, cols, disp_selected_pyr, data_cost_selected, msg_step, h, w, level, nr_plane, ndisp, channels, use_local_init_data_cost, stream); } -}}} - /////////////////////////////////////////////////////////////// ////////////////////// compute data cost ////////////////////// /////////////////////////////////////////////////////////////// -namespace csbp_krnls -{ template __global__ void compute_data_cost(const T* selected_disp_pyr, T* data_cost_, int h, int w, int level, int nr_plane) { @@ -536,10 +522,7 @@ namespace csbp_krnls data_cost[cdisp_step1 * d] = saturate_cast(dline[0]); } } -} -namespace cv { namespace gpu { namespace csbp -{ template void compute_data_cost_caller_(const T* disp_selected_pyr, T* data_cost, int /*rows*/, int /*cols*/, int h, int w, int level, int nr_plane, int channels, cudaStream_t stream) @@ -552,8 +535,8 @@ namespace cv { namespace gpu { namespace csbp switch(channels) { - case 1: csbp_krnls::compute_data_cost<<>>(disp_selected_pyr, data_cost, h, w, level, nr_plane); break; - case 3: csbp_krnls::compute_data_cost<<>>(disp_selected_pyr, data_cost, h, w, level, nr_plane); break; + case 1: compute_data_cost<<>>(disp_selected_pyr, data_cost, h, w, level, nr_plane); break; + case 3: 
compute_data_cost<<>>(disp_selected_pyr, data_cost, h, w, level, nr_plane); break; default: cv::gpu::error("Unsupported channels count", __FILE__, __LINE__); } } @@ -571,13 +554,12 @@ namespace cv { namespace gpu { namespace csbp switch (channels) { - case 1: csbp_krnls::compute_data_cost_reduce<<>>(disp_selected_pyr, data_cost, level, rows, cols, h, nr_plane); break; - case 3: csbp_krnls::compute_data_cost_reduce<<>>(disp_selected_pyr, data_cost, level, rows, cols, h, nr_plane); break; + case 1: compute_data_cost_reduce<<>>(disp_selected_pyr, data_cost, level, rows, cols, h, nr_plane); break; + case 3: compute_data_cost_reduce<<>>(disp_selected_pyr, data_cost, level, rows, cols, h, nr_plane); break; default: cv::gpu::error("Unsupported channels count", __FILE__, __LINE__); } } - template void compute_data_cost_tmpl(const T* disp_selected_pyr, T* data_cost, size_t msg_step1, size_t msg_step2, int rows, int cols, int h, int w, int h2, int level, int nr_plane, int channels, cudaStream_t stream) @@ -594,10 +576,10 @@ namespace cv { namespace gpu { namespace csbp size_t disp_step1 = msg_step1 * h; size_t disp_step2 = msg_step2 * h2; - cudaSafeCall( cudaMemcpyToSymbol(csbp_krnls::cdisp_step1, &disp_step1, sizeof(size_t)) ); - cudaSafeCall( cudaMemcpyToSymbol(csbp_krnls::cdisp_step2, &disp_step2, sizeof(size_t)) ); - cudaSafeCall( cudaMemcpyToSymbol(csbp_krnls::cmsg_step1, &msg_step1, sizeof(size_t)) ); - cudaSafeCall( cudaMemcpyToSymbol(csbp_krnls::cmsg_step2, &msg_step2, sizeof(size_t)) ); + cudaSafeCall( cudaMemcpyToSymbol(cdisp_step1, &disp_step1, sizeof(size_t)) ); + cudaSafeCall( cudaMemcpyToSymbol(cdisp_step2, &disp_step2, sizeof(size_t)) ); + cudaSafeCall( cudaMemcpyToSymbol(cmsg_step1, &msg_step1, sizeof(size_t)) ); + cudaSafeCall( cudaMemcpyToSymbol(cmsg_step2, &msg_step2, sizeof(size_t)) ); callers[level](disp_selected_pyr, data_cost, rows, cols, h, w, level, nr_plane, channels, stream); @@ -616,15 +598,12 @@ namespace cv { namespace gpu { namespace csbp compute_data_cost_tmpl(disp_selected_pyr, data_cost, msg_step1, msg_step2, rows, cols, h, w, h2, level, nr_plane, channels, stream); } -}}} - /////////////////////////////////////////////////////////////// //////////////////////// init message ///////////////////////// /////////////////////////////////////////////////////////////// -namespace csbp_krnls -{ - template + + template __device__ void get_first_k_element_increase(T* u_new, T* d_new, T* l_new, T* r_new, const T* u_cur, const T* d_cur, const T* l_cur, const T* r_cur, T* data_cost_selected, T* disparity_selected_new, T* data_cost_new, @@ -705,10 +684,8 @@ namespace csbp_krnls data_cost, disparity_selected_cur, nr_plane, nr_plane2); } } -} -namespace cv { namespace gpu { namespace csbp -{ + template void init_message_tmpl(T* u_new, T* d_new, T* l_new, T* r_new, const T* u_cur, const T* d_cur, const T* l_cur, const T* r_cur, @@ -719,10 +696,10 @@ namespace cv { namespace gpu { namespace csbp size_t disp_step1 = msg_step1 * h; size_t disp_step2 = msg_step2 * h2; - cudaSafeCall( cudaMemcpyToSymbol(csbp_krnls::cdisp_step1, &disp_step1, sizeof(size_t)) ); - cudaSafeCall( cudaMemcpyToSymbol(csbp_krnls::cdisp_step2, &disp_step2, sizeof(size_t)) ); - cudaSafeCall( cudaMemcpyToSymbol(csbp_krnls::cmsg_step1, &msg_step1, sizeof(size_t)) ); - cudaSafeCall( cudaMemcpyToSymbol(csbp_krnls::cmsg_step2, &msg_step2, sizeof(size_t)) ); + cudaSafeCall( cudaMemcpyToSymbol(cdisp_step1, &disp_step1, sizeof(size_t)) ); + cudaSafeCall( cudaMemcpyToSymbol(cdisp_step2, &disp_step2, sizeof(size_t)) ); + 
cudaSafeCall( cudaMemcpyToSymbol(cmsg_step1, &msg_step1, sizeof(size_t)) ); + cudaSafeCall( cudaMemcpyToSymbol(cmsg_step2, &msg_step2, sizeof(size_t)) ); dim3 threads(32, 8, 1); dim3 grid(1, 1, 1); @@ -730,7 +707,7 @@ namespace cv { namespace gpu { namespace csbp grid.x = divUp(w, threads.x); grid.y = divUp(h, threads.y); - csbp_krnls::init_message<<>>(u_new, d_new, l_new, r_new, + init_message<<>>(u_new, d_new, l_new, r_new, u_cur, d_cur, l_cur, r_cur, selected_disp_pyr_new, selected_disp_pyr_cur, data_cost_selected, data_cost, @@ -761,14 +738,11 @@ namespace cv { namespace gpu { namespace csbp selected_disp_pyr_new, selected_disp_pyr_cur, data_cost_selected, data_cost, msg_step1, msg_step2, h, w, nr_plane, h2, w2, nr_plane2, stream); } -}}} /////////////////////////////////////////////////////////////// //////////////////// calc all iterations ///////////////////// /////////////////////////////////////////////////////////////// -namespace csbp_krnls -{ template __device__ void message_per_pixel(const T* data, T* msg_dst, const T* msg1, const T* msg2, const T* msg3, const T* dst_disp, const T* src_disp, int nr_plane, T* temp) @@ -829,17 +803,15 @@ namespace csbp_krnls message_per_pixel(data, r, u + cmsg_step1, d - cmsg_step1, r - 1, disp, disp + 1, nr_plane, temp); } } -} -namespace cv { namespace gpu { namespace csbp -{ + template void calc_all_iterations_tmpl(T* u, T* d, T* l, T* r, const T* data_cost_selected, const T* selected_disp_pyr_cur, size_t msg_step, int h, int w, int nr_plane, int iters, cudaStream_t stream) { size_t disp_step = msg_step * h; - cudaSafeCall( cudaMemcpyToSymbol(csbp_krnls::cdisp_step1, &disp_step, sizeof(size_t)) ); - cudaSafeCall( cudaMemcpyToSymbol(csbp_krnls::cmsg_step1, &msg_step, sizeof(size_t)) ); + cudaSafeCall( cudaMemcpyToSymbol(cdisp_step1, &disp_step, sizeof(size_t)) ); + cudaSafeCall( cudaMemcpyToSymbol(cmsg_step1, &msg_step, sizeof(size_t)) ); dim3 threads(32, 8, 1); dim3 grid(1, 1, 1); @@ -849,7 +821,7 @@ namespace cv { namespace gpu { namespace csbp for(int t = 0; t < iters; ++t) { - csbp_krnls::compute_message<<>>(u, d, l, r, data_cost_selected, selected_disp_pyr_cur, h, w, nr_plane, t & 1); + compute_message<<>>(u, d, l, r, data_cost_selected, selected_disp_pyr_cur, h, w, nr_plane, t & 1); if (stream == 0) cudaSafeCall( cudaThreadSynchronize() ); @@ -868,14 +840,12 @@ namespace cv { namespace gpu { namespace csbp calc_all_iterations_tmpl(u, d, l, r, data_cost_selected, selected_disp_pyr_cur, msg_step, h, w, nr_plane, iters, stream); } -}}} /////////////////////////////////////////////////////////////// /////////////////////////// output //////////////////////////// /////////////////////////////////////////////////////////////// -namespace csbp_krnls -{ + template __global__ void compute_disp(const T* u_, const T* d_, const T* l_, const T* r_, const T* data_cost_selected, const T* disp_selected_pyr, @@ -910,17 +880,15 @@ namespace csbp_krnls disp[res_step * y + x] = best; } } -} -namespace cv { namespace gpu { namespace csbp -{ + template void compute_disp_tmpl(const T* u, const T* d, const T* l, const T* r, const T* data_cost_selected, const T* disp_selected, size_t msg_step, const DevMem2D_& disp, int nr_plane, cudaStream_t stream) { size_t disp_step = disp.rows * msg_step; - cudaSafeCall( cudaMemcpyToSymbol(csbp_krnls::cdisp_step1, &disp_step, sizeof(size_t)) ); - cudaSafeCall( cudaMemcpyToSymbol(csbp_krnls::cmsg_step1, &msg_step, sizeof(size_t)) ); + cudaSafeCall( cudaMemcpyToSymbol(cdisp_step1, &disp_step, sizeof(size_t)) ); + cudaSafeCall( 
cudaMemcpyToSymbol(cmsg_step1, &msg_step, sizeof(size_t)) ); dim3 threads(32, 8, 1); dim3 grid(1, 1, 1); @@ -928,8 +896,8 @@ namespace cv { namespace gpu { namespace csbp grid.x = divUp(disp.cols, threads.x); grid.y = divUp(disp.rows, threads.y); - csbp_krnls::compute_disp<<>>(u, d, l, r, data_cost_selected, disp_selected, - disp.ptr, disp.step / disp.elemSize(), disp.cols, disp.rows, nr_plane); + compute_disp<<>>(u, d, l, r, data_cost_selected, disp_selected, + disp.data, disp.step / disp.elemSize(), disp.cols, disp.rows, nr_plane); if (stream == 0) cudaSafeCall( cudaThreadSynchronize() ); } diff --git a/modules/gpu/src/cuda/cuda_shared.hpp b/modules/gpu/src/cuda/cuda_shared.hpp index a3c5657..c38d70b 100644 --- a/modules/gpu/src/cuda/cuda_shared.hpp +++ b/modules/gpu/src/cuda/cuda_shared.hpp @@ -54,9 +54,9 @@ namespace cv typedef unsigned char uchar; typedef signed char schar; typedef unsigned short ushort; - typedef unsigned int uint; + typedef unsigned int uint; - static inline int divUp(int a, int b) { return (a % b == 0) ? a/b : a/b + 1; } + static inline int divUp(int total, int grain) { return (total + grain - 1) / grain; } template static inline void uploadConstant(const char* name, const T& value) { cudaSafeCall( cudaMemcpyToSymbol(name, &value, sizeof(T)) ); } diff --git a/modules/gpu/src/cuda/filters.cu b/modules/gpu/src/cuda/filters.cu index ad37351..d6390e5 100644 --- a/modules/gpu/src/cuda/filters.cu +++ b/modules/gpu/src/cuda/filters.cu @@ -128,8 +128,8 @@ namespace cv { namespace gpu { namespace filters dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y); dim3 blocks(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y)); - filter_krnls::linearRowFilter<<>>(src.ptr, src.elem_step, - dst.ptr, dst.elem_step, anchor, src.cols, src.rows); + filter_krnls::linearRowFilter<<>>(src.data, src.step/src.elemSize(), + dst.data, dst.step/dst.elemSize(), anchor, src.cols, src.rows); cudaSafeCall( cudaThreadSynchronize() ); } @@ -152,10 +152,12 @@ namespace cv { namespace gpu { namespace filters callers[ksize]((DevMem2D_)src, (DevMem2D_)dst, anchor); } - void linearRowFilter_gpu_8u_8u_c4(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor) + template void linearRowFilter_gpu<4, uchar4, uchar4>(const DevMem2D&, const DevMem2D&, const float[], int , int); + + /* void linearRowFilter_gpu_8u_8u_c4(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor) { linearRowFilter_gpu<4, uchar4, uchar4>(src, dst, kernel, ksize, anchor); - } + }*/ void linearRowFilter_gpu_8u_8s_c4(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor) { linearRowFilter_gpu<4, uchar4, char4>(src, dst, kernel, ksize, anchor); @@ -262,8 +264,8 @@ namespace cv { namespace gpu { namespace filters dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y); dim3 blocks(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y)); - filter_krnls::linearColumnFilter<<>>(src.ptr, src.elem_step, - dst.ptr, dst.elem_step, anchor, src.cols, src.rows); + filter_krnls::linearColumnFilter<<>>(src.data, src.step/src.elemSize(), + dst.data, dst.step/dst.elemSize(), anchor, src.cols, src.rows); cudaSafeCall( cudaThreadSynchronize() ); } @@ -357,7 +359,7 @@ namespace cv { namespace gpu { namespace bf void load_constants(float* table_color, const DevMem2Df& table_space, int ndisp, int radius, short edge_disc, short max_disc) { cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::ctable_color, &table_color, sizeof(table_color)) ); - cudaSafeCall( 
cudaMemcpyToSymbol(bf_krnls::ctable_space, &table_space.ptr, sizeof(table_space.ptr)) ); + cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::ctable_space, &table_space.data, sizeof(table_space.data)) ); size_t table_space_step = table_space.step / sizeof(float); cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::ctable_space_step, &table_space_step, sizeof(size_t)) ); @@ -491,15 +493,15 @@ namespace cv { namespace gpu { namespace bf case 1: for (int i = 0; i < iters; ++i) { - bf_krnls::bilateral_filter<1><<>>(0, disp.ptr, disp.step/sizeof(T), img.ptr, img.step, disp.rows, disp.cols); - bf_krnls::bilateral_filter<1><<>>(1, disp.ptr, disp.step/sizeof(T), img.ptr, img.step, disp.rows, disp.cols); + bf_krnls::bilateral_filter<1><<>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols); + bf_krnls::bilateral_filter<1><<>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols); } break; case 3: for (int i = 0; i < iters; ++i) { - bf_krnls::bilateral_filter<3><<>>(0, disp.ptr, disp.step/sizeof(T), img.ptr, img.step, disp.rows, disp.cols); - bf_krnls::bilateral_filter<3><<>>(1, disp.ptr, disp.step/sizeof(T), img.ptr, img.step, disp.rows, disp.cols); + bf_krnls::bilateral_filter<3><<>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols); + bf_krnls::bilateral_filter<3><<>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols); } break; default: diff --git a/modules/gpu/src/cuda/imgproc.cu b/modules/gpu/src/cuda/imgproc.cu index e36a942..2326119 100644 --- a/modules/gpu/src/cuda/imgproc.cu +++ b/modules/gpu/src/cuda/imgproc.cu @@ -45,7 +45,7 @@ using namespace cv::gpu; /////////////////////////////////// Remap /////////////////////////////////////////////// -namespace imgproc_krnls +namespace cv { namespace gpu { namespace imgproc { texture tex_remap; @@ -121,10 +121,7 @@ namespace imgproc_krnls *(dst + y * dst_step + 3 * x + 2) = out.z; } } -} -namespace cv { namespace gpu { namespace imgproc -{ void remap_gpu_1c(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, DevMem2D dst) { dim3 threads(16, 16, 1); @@ -132,15 +129,15 @@ namespace cv { namespace gpu { namespace imgproc grid.x = divUp(dst.cols, threads.x); grid.y = divUp(dst.rows, threads.y); - imgproc_krnls::tex_remap.filterMode = cudaFilterModeLinear; - imgproc_krnls::tex_remap.addressMode[0] = imgproc_krnls::tex_remap.addressMode[1] = cudaAddressModeWrap; + tex_remap.filterMode = cudaFilterModeLinear; + tex_remap.addressMode[0] = tex_remap.addressMode[1] = cudaAddressModeWrap; cudaChannelFormatDesc desc = cudaCreateChannelDesc(); - cudaSafeCall( cudaBindTexture2D(0, imgproc_krnls::tex_remap, src.ptr, desc, src.cols, src.rows, src.step) ); + cudaSafeCall( cudaBindTexture2D(0, tex_remap, src.data, desc, src.cols, src.rows, src.step) ); - imgproc_krnls::remap_1c<<>>(xmap.ptr, ymap.ptr, xmap.step, dst.ptr, dst.step, dst.cols, dst.rows); + remap_1c<<>>(xmap.data, ymap.data, xmap.step, dst.data, dst.step, dst.cols, dst.rows); cudaSafeCall( cudaThreadSynchronize() ); - cudaSafeCall( cudaUnbindTexture(imgproc_krnls::tex_remap) ); + cudaSafeCall( cudaUnbindTexture(tex_remap) ); } void remap_gpu_3c(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, DevMem2D dst) @@ -150,17 +147,13 @@ namespace cv { namespace gpu { namespace imgproc grid.x = divUp(dst.cols, threads.x); grid.y = divUp(dst.rows, threads.y); - imgproc_krnls::remap_3c<<>>(src.ptr, src.step, xmap.ptr, ymap.ptr, xmap.step, dst.ptr, dst.step, dst.cols, dst.rows); + 
remap_3c<<>>(src.data, src.step, xmap.data, ymap.data, xmap.step, dst.data, dst.step, dst.cols, dst.rows); cudaSafeCall( cudaThreadSynchronize() ); } -}}} - /////////////////////////////////// MeanShiftfiltering /////////////////////////////////////////////// -namespace imgproc_krnls -{ texture tex_meanshift; __device__ short2 do_mean_shift(int x0, int y0, unsigned char* out, @@ -252,10 +245,7 @@ namespace imgproc_krnls *(short2*)(outsp + basesp) = do_mean_shift(x0, y0, outr, outrstep, cols, rows, sp, sr, maxIter, eps); } } -} -namespace cv { namespace gpu { namespace imgproc -{ extern "C" void meanShiftFiltering_gpu(const DevMem2D& src, DevMem2D dst, int sp, int sr, int maxIter, float eps) { dim3 grid(1, 1, 1); @@ -264,11 +254,11 @@ namespace cv { namespace gpu { namespace imgproc grid.y = divUp(src.rows, threads.y); cudaChannelFormatDesc desc = cudaCreateChannelDesc(); - cudaSafeCall( cudaBindTexture2D( 0, imgproc_krnls::tex_meanshift, src.ptr, desc, src.cols, src.rows, src.step ) ); + cudaSafeCall( cudaBindTexture2D( 0, tex_meanshift, src.data, desc, src.cols, src.rows, src.step ) ); - imgproc_krnls::meanshift_kernel<<< grid, threads >>>( dst.ptr, dst.step, dst.cols, dst.rows, sp, sr, maxIter, eps ); + meanshift_kernel<<< grid, threads >>>( dst.data, dst.step, dst.cols, dst.rows, sp, sr, maxIter, eps ); cudaSafeCall( cudaThreadSynchronize() ); - cudaSafeCall( cudaUnbindTexture( imgproc_krnls::tex_meanshift ) ); + cudaSafeCall( cudaUnbindTexture( tex_meanshift ) ); } extern "C" void meanShiftProc_gpu(const DevMem2D& src, DevMem2D dstr, DevMem2D dstsp, int sp, int sr, int maxIter, float eps) { @@ -278,18 +268,15 @@ namespace cv { namespace gpu { namespace imgproc grid.y = divUp(src.rows, threads.y); cudaChannelFormatDesc desc = cudaCreateChannelDesc(); - cudaSafeCall( cudaBindTexture2D( 0, imgproc_krnls::tex_meanshift, src.ptr, desc, src.cols, src.rows, src.step ) ); + cudaSafeCall( cudaBindTexture2D( 0, tex_meanshift, src.data, desc, src.cols, src.rows, src.step ) ); - imgproc_krnls::meanshiftproc_kernel<<< grid, threads >>>( dstr.ptr, dstr.step, dstsp.ptr, dstsp.step, dstr.cols, dstr.rows, sp, sr, maxIter, eps ); + meanshiftproc_kernel<<< grid, threads >>>( dstr.data, dstr.step, dstsp.data, dstsp.step, dstr.cols, dstr.rows, sp, sr, maxIter, eps ); cudaSafeCall( cudaThreadSynchronize() ); - cudaSafeCall( cudaUnbindTexture( imgproc_krnls::tex_meanshift ) ); + cudaSafeCall( cudaUnbindTexture( tex_meanshift ) ); } -}}} /////////////////////////////////// drawColorDisp /////////////////////////////////////////////// -namespace imgproc_krnls -{ template __device__ unsigned int cvtPixel(T d, int ndisp, float S = 1, float V = 1) { @@ -389,10 +376,8 @@ namespace imgproc_krnls line[x >> 1] = res; } } -} -namespace cv { namespace gpu { namespace imgproc -{ + void drawColorDisp_gpu(const DevMem2D& src, const DevMem2D& dst, int ndisp, const cudaStream_t& stream) { dim3 threads(16, 16, 1); @@ -400,7 +385,7 @@ namespace cv { namespace gpu { namespace imgproc grid.x = divUp(src.cols, threads.x << 2); grid.y = divUp(src.rows, threads.y); - imgproc_krnls::drawColorDisp<<>>(src.ptr, src.step, dst.ptr, dst.step, src.cols, src.rows, ndisp); + drawColorDisp<<>>(src.data, src.step, dst.data, dst.step, src.cols, src.rows, ndisp); if (stream == 0) cudaSafeCall( cudaThreadSynchronize() ); @@ -413,17 +398,14 @@ namespace cv { namespace gpu { namespace imgproc grid.x = divUp(src.cols, threads.x << 1); grid.y = divUp(src.rows, threads.y); - imgproc_krnls::drawColorDisp<<>>(src.ptr, src.step / sizeof(short), 
dst.ptr, dst.step, src.cols, src.rows, ndisp); + drawColorDisp<<>>(src.data, src.step / sizeof(short), dst.data, dst.step, src.cols, src.rows, ndisp); if (stream == 0) cudaSafeCall( cudaThreadSynchronize() ); } -}}} /////////////////////////////////// reprojectImageTo3D /////////////////////////////////////////////// -namespace imgproc_krnls -{ __constant__ float cq[16]; template @@ -455,10 +437,7 @@ namespace imgproc_krnls *(float4*)(xyzw + xyzw_step * y + (x * 4)) = v; } } -} -namespace cv { namespace gpu { namespace imgproc -{ template inline void reprojectImageTo3D_caller(const DevMem2D_& disp, const DevMem2Df& xyzw, const float* q, const cudaStream_t& stream) { @@ -467,9 +446,9 @@ namespace cv { namespace gpu { namespace imgproc grid.x = divUp(disp.cols, threads.x); grid.y = divUp(disp.rows, threads.y); - cudaSafeCall( cudaMemcpyToSymbol(imgproc_krnls::cq, q, 16 * sizeof(float)) ); + cudaSafeCall( cudaMemcpyToSymbol(cq, q, 16 * sizeof(float)) ); - imgproc_krnls::reprojectImageTo3D<<>>(disp.ptr, disp.step / sizeof(T), xyzw.ptr, xyzw.step / sizeof(float), disp.rows, disp.cols); + reprojectImageTo3D<<>>(disp.data, disp.step / sizeof(T), xyzw.data, xyzw.step / sizeof(float), disp.rows, disp.cols); if (stream == 0) cudaSafeCall( cudaThreadSynchronize() ); diff --git a/modules/gpu/src/cuda/mathfunc.cu b/modules/gpu/src/cuda/mathfunc.cu index aca1c57..7fe87bf 100644 --- a/modules/gpu/src/cuda/mathfunc.cu +++ b/modules/gpu/src/cuda/mathfunc.cu @@ -41,9 +41,7 @@ //M*/ #include "cuda_shared.hpp" -#include "saturate_cast.hpp" #include "transform.hpp" -#include "vecmath.hpp" using namespace cv::gpu; @@ -54,7 +52,7 @@ using namespace cv::gpu; ////////////////////////////////////////////////////////////////////////////////////// // Cart <-> Polar -namespace mathfunc_krnls +namespace cv { namespace gpu { namespace mathfunc { struct Nothing { @@ -133,10 +131,7 @@ namespace mathfunc_krnls yptr[y * y_step + x] = mag_data * sin_a; } } -} -namespace cv { namespace gpu { namespace mathfunc -{ template void cartToPolar_caller(const DevMem2Df& x, const DevMem2Df& y, const DevMem2Df& mag, const DevMem2Df& angle, bool angleInDegrees, cudaStream_t stream) { @@ -148,9 +143,9 @@ namespace cv { namespace gpu { namespace mathfunc const float scale = angleInDegrees ? (float)(180.0f / CV_PI) : 1.f; - mathfunc_krnls::cartToPolar<<>>( - x.ptr, x.elem_step, y.ptr, y.elem_step, - mag.ptr, mag.elem_step, angle.ptr, angle.elem_step, scale, x.cols, x.rows); + cartToPolar<<>>( + x.data, x.step/x.elemSize(), y.data, y.step/y.elemSize(), + mag.data, mag.step/mag.elemSize(), angle.data, angle.step/angle.elemSize(), scale, x.cols, x.rows); if (stream == 0) cudaSafeCall( cudaThreadSynchronize() ); @@ -163,27 +158,27 @@ namespace cv { namespace gpu { namespace mathfunc { { { - cartToPolar_caller, - cartToPolar_caller + cartToPolar_caller, + cartToPolar_caller }, { - cartToPolar_caller, - cartToPolar_caller, + cartToPolar_caller, + cartToPolar_caller, } }, { { - cartToPolar_caller, - cartToPolar_caller + cartToPolar_caller, + cartToPolar_caller }, { - cartToPolar_caller, - cartToPolar_caller, + cartToPolar_caller, + cartToPolar_caller, } } }; - callers[mag.ptr == 0][magSqr][angle.ptr == 0](x, y, mag, angle, angleInDegrees, stream); + callers[mag.data == 0][magSqr][angle.data == 0](x, y, mag, angle, angleInDegrees, stream); } template @@ -197,8 +192,8 @@ namespace cv { namespace gpu { namespace mathfunc const float scale = angleInDegrees ? 
diff --git a/modules/gpu/src/cuda/mathfunc.cu b/modules/gpu/src/cuda/mathfunc.cu
index aca1c57..7fe87bf 100644
--- a/modules/gpu/src/cuda/mathfunc.cu
+++ b/modules/gpu/src/cuda/mathfunc.cu
@@ -41,9 +41,7 @@
 //M*/

 #include "cuda_shared.hpp"
-#include "saturate_cast.hpp"
 #include "transform.hpp"
-#include "vecmath.hpp"

 using namespace cv::gpu;

@@ -54,7 +52,7 @@ using namespace cv::gpu;
 //////////////////////////////////////////////////////////////////////////////////////
 // Cart <-> Polar

-namespace mathfunc_krnls
+namespace cv { namespace gpu { namespace mathfunc
 {
     struct Nothing
     {
@@ -133,10 +131,7 @@ namespace mathfunc_krnls
             yptr[y * y_step + x] = mag_data * sin_a;
         }
     }
-}

-namespace cv { namespace gpu { namespace mathfunc
-{
     template <typename Mag, typename Angle>
     void cartToPolar_caller(const DevMem2Df& x, const DevMem2Df& y, const DevMem2Df& mag, const DevMem2Df& angle, bool angleInDegrees, cudaStream_t stream)
     {
@@ -148,9 +143,9 @@ namespace cv { namespace gpu { namespace mathfunc

         const float scale = angleInDegrees ? (float)(180.0f / CV_PI) : 1.f;

-        mathfunc_krnls::cartToPolar<Mag, Angle><<<grid, threads, 0, stream>>>(
-            x.ptr, x.elem_step, y.ptr, y.elem_step,
-            mag.ptr, mag.elem_step, angle.ptr, angle.elem_step, scale, x.cols, x.rows);
+        cartToPolar<Mag, Angle><<<grid, threads, 0, stream>>>(
+            x.data, x.step/x.elemSize(), y.data, y.step/y.elemSize(),
+            mag.data, mag.step/mag.elemSize(), angle.data, angle.step/angle.elemSize(), scale, x.cols, x.rows);

         if (stream == 0)
             cudaSafeCall( cudaThreadSynchronize() );
@@ -163,27 +158,27 @@ namespace cv { namespace gpu { namespace mathfunc
     {
         {
             {
-                cartToPolar_caller<Magnitude, Atan2>,
-                cartToPolar_caller<Magnitude, Nothing>
+                cartToPolar_caller<Magnitude, Atan2>,
+                cartToPolar_caller<Magnitude, Nothing>
             },
             {
-                cartToPolar_caller<MagnitudeSqr, Atan2>,
-                cartToPolar_caller<MagnitudeSqr, Nothing>,
+                cartToPolar_caller<MagnitudeSqr, Atan2>,
+                cartToPolar_caller<MagnitudeSqr, Nothing>,
             }
         },
         {
             {
-                cartToPolar_caller<Nothing, Atan2>,
-                cartToPolar_caller<Nothing, Nothing>
+                cartToPolar_caller<Nothing, Atan2>,
+                cartToPolar_caller<Nothing, Nothing>
             },
             {
-                cartToPolar_caller<Nothing, Atan2>,
-                cartToPolar_caller<Nothing, Nothing>,
+                cartToPolar_caller<Nothing, Atan2>,
+                cartToPolar_caller<Nothing, Nothing>,
             }
         }
     };

-        callers[mag.ptr == 0][magSqr][angle.ptr == 0](x, y, mag, angle, angleInDegrees, stream);
+        callers[mag.data == 0][magSqr][angle.data == 0](x, y, mag, angle, angleInDegrees, stream);
     }

     template <typename Mag>
@@ -197,8 +192,8 @@ namespace cv { namespace gpu { namespace mathfunc

         const float scale = angleInDegrees ? (float)(CV_PI / 180.0f) : 1.0f;

-        mathfunc_krnls::polarToCart<Mag><<<grid, threads, 0, stream>>>(mag.ptr, mag.elem_step,
-            angle.ptr, angle.elem_step, scale, x.ptr, x.elem_step, y.ptr, y.elem_step, mag.cols, mag.rows);
+        polarToCart<Mag><<<grid, threads, 0, stream>>>(mag.data, mag.step/mag.elemSize(),
+            angle.data, angle.step/angle.elemSize(), scale, x.data, x.step/x.elemSize(), y.data, y.step/y.elemSize(), mag.cols, mag.rows);

         if (stream == 0)
             cudaSafeCall( cudaThreadSynchronize() );
@@ -209,19 +204,16 @@ namespace cv { namespace gpu { namespace mathfunc
         typedef void (*caller_t)(const DevMem2Df& mag, const DevMem2Df& angle, const DevMem2Df& x, const DevMem2Df& y, bool angleInDegrees, cudaStream_t stream);
         static const caller_t callers[2] =
         {
-            polarToCart_caller<NonEmptyMag>,
-            polarToCart_caller<EmptyMag>
+            polarToCart_caller<NonEmptyMag>,
+            polarToCart_caller<EmptyMag>
         };

-        callers[mag.ptr == 0](mag, angle, x, y, angleInDegrees, stream);
+        callers[mag.data == 0](mag, angle, x, y, angleInDegrees, stream);
     }
-}}}

 //////////////////////////////////////////////////////////////////////////////////////
 // Compare

-namespace mathfunc_krnls
-{
     template <typename T1, typename T2>
     struct NotEqual
     {
@@ -230,14 +222,11 @@ namespace mathfunc_krnls
             return static_cast<uchar>(static_cast<int>(src1 != src2) * 255);
         }
     };
-}

-namespace cv { namespace gpu { namespace mathfunc
-{
     template <typename T1, typename T2>
     inline void compare_ne(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst)
     {
-        mathfunc_krnls::NotEqual<T1, T2> op;
+        NotEqual<T1, T2> op;
         transform(static_cast< DevMem2D_<T1> >(src1), static_cast< DevMem2D_<T2> >(src2), dst, op, 0);
     }
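
[Editor's note] The caller tables above trade binary size for branch-free kernels: every combination of "compute magnitude / squared magnitude / nothing" is compiled as its own kernel instantiation, and the host picks one by indexing the table with run-time booleans such as mag.data == 0. A minimal sketch of the idiom with hypothetical functors (none of these names are in the patch):

    struct Identity { static __device__ float calc(float v) { return v; } };
    struct Square   { static __device__ float calc(float v) { return v * v; } };

    template <typename Op>
    __global__ void apply(const float* src, float* dst, int n)
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n)
            dst[i] = Op::calc(src[i]);   // resolved at compile time, no per-element branch
    }

    typedef void (*caller_t)(const float*, float*, int, cudaStream_t);

    template <typename Op>
    void apply_caller(const float* src, float* dst, int n, cudaStream_t stream)
    {
        apply<Op><<<divUp(n, 256), 256, 0, stream>>>(src, dst, n);
    }

    static const caller_t callers[2] = { apply_caller<Identity>, apply_caller<Square> };
    // usage: callers[wantSquare](src, dst, n, stream);
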
diff --git a/modules/gpu/src/cuda/matrix_operations.cu b/modules/gpu/src/cuda/matrix_operations.cu
index f9a46b4..74d5c58 100644
--- a/modules/gpu/src/cuda/matrix_operations.cu
+++ b/modules/gpu/src/cuda/matrix_operations.cu
@@ -40,16 +40,11 @@
 //
 //M*/

-#include
-#include
 #include "cuda_shared.hpp"
-#include "cuda_runtime.h"
 #include "saturate_cast.hpp"

-using namespace cv::gpu;
+namespace cv { namespace gpu { namespace matrix_operations {

-namespace matop_krnls
-{
     template <typename T> struct shift_and_sizeof;
     template <> struct shift_and_sizeof<char> { enum { shift = 0 }; };
     template <> struct shift_and_sizeof<unsigned char> { enum { shift = 0 }; };
@@ -115,14 +110,11 @@ namespace matop_krnls
         typedef int2 read_type;
         typedef short2 write_type;
     };
-}

 ///////////////////////////////////////////////////////////////////////////
 ////////////////////////////////// CopyTo /////////////////////////////////
 ///////////////////////////////////////////////////////////////////////////

-namespace matop_krnls
-{
     template <typename T>
     __global__ void copy_to_with_mask(T * mat_src, T * mat_dst, const unsigned char * mask, int cols, int rows, int step_mat, int step_mask, int channels)
     {
@@ -136,10 +128,6 @@ namespace matop_krnls
             mat_dst[idx] = mat_src[idx];
         }
     }
-}
-
-namespace cv { namespace gpu { namespace matrix_operations
-{
     typedef void (*CopyToFunc)(const DevMem2D& mat_src, const DevMem2D& mat_dst, const DevMem2D& mask, int channels, const cudaStream_t & stream);

     template <typename T>
@@ -147,17 +135,12 @@
     {
         dim3 threadsPerBlock(16,16, 1);
         dim3 numBlocks ( divUp(mat_src.cols * channels , threadsPerBlock.x) , divUp(mat_src.rows , threadsPerBlock.y), 1);
+
+        copy_to_with_mask<<<numBlocks, threadsPerBlock, 0, stream>>>
+            ((T*)mat_src.data, (T*)mat_dst.data, (unsigned char*)mask.data, mat_src.cols, mat_src.rows, mat_src.step, mask.step, channels);
+
         if (stream == 0)
-        {
-            ::matop_krnls::copy_to_with_mask<<<numBlocks, threadsPerBlock>>>
-                ((T*)mat_src.ptr, (T*)mat_dst.ptr, (unsigned char*)mask.ptr, mat_src.cols, mat_src.rows, mat_src.step, mask.step, channels);
-            cudaSafeCall ( cudaThreadSynchronize() );
-        }
-        else
-        {
-            ::matop_krnls::copy_to_with_mask<<<numBlocks, threadsPerBlock, 0, stream>>>
-                ((T*)mat_src.ptr, (T*)mat_dst.ptr, (unsigned char*)mask.ptr, mat_src.cols, mat_src.rows, mat_src.step, mask.step, channels);
-        }
+            cudaSafeCall ( cudaThreadSynchronize() );
     }

     void copy_to_with_mask(const DevMem2D& mat_src, DevMem2D mat_dst, int depth, const DevMem2D& mask, int channels, const cudaStream_t & stream)
@@ -180,14 +163,11 @@

         func(mat_src, mat_dst, mask, channels, stream);
     }
-}}}

 ///////////////////////////////////////////////////////////////////////////
 ////////////////////////////////// SetTo //////////////////////////////////
 ///////////////////////////////////////////////////////////////////////////

-namespace matop_krnls
-{
     __constant__ double scalar_d[4];

     template <typename T>
@@ -216,10 +196,6 @@
             mat[idx] = scalar_d[ x % channels ];
         }
     }
-}
-
-namespace cv { namespace gpu { namespace matrix_operations
-{
     typedef void (*SetToFunc_with_mask)(const DevMem2D& mat, const DevMem2D& mask, int channels, const cudaStream_t & stream);
     typedef void (*SetToFunc_without_mask)(const DevMem2D& mat, int channels, const cudaStream_t & stream);

@@ -229,16 +205,9 @@
         dim3 threadsPerBlock(32, 8, 1);
         dim3 numBlocks (mat.cols * channels / threadsPerBlock.x + 1, mat.rows / threadsPerBlock.y + 1, 1);

+        set_to_with_mask<<<numBlocks, threadsPerBlock, 0, stream>>>((T*)mat.data, (unsigned char *)mask.data, mat.cols, mat.rows, mat.step, channels, mask.step);
         if (stream == 0)
-        {
-            ::matop_krnls::set_to_with_mask<<<numBlocks, threadsPerBlock>>>((T*)mat.ptr, (unsigned char *)mask.ptr, mat.cols, mat.rows, mat.step, channels, mask.step);
             cudaSafeCall ( cudaThreadSynchronize() );
-        }
-        else
-        {
-            ::matop_krnls::set_to_with_mask<<<numBlocks, threadsPerBlock, 0, stream>>>((T*)mat.ptr, (unsigned char *)mask.ptr, mat.cols, mat.rows, mat.step, channels, mask.step);
-        }
-
     }

     template <typename T>
@@ -247,20 +216,15 @@
     {
         dim3 threadsPerBlock(32, 8, 1);
         dim3 numBlocks (mat.cols * channels / threadsPerBlock.x + 1, mat.rows / threadsPerBlock.y + 1, 1);

+        set_to_without_mask<<<numBlocks, threadsPerBlock, 0, stream>>>((T*)mat.data, mat.cols, mat.rows, mat.step, channels);
+
         if (stream == 0)
-        {
-            matop_krnls::set_to_without_mask<<<numBlocks, threadsPerBlock>>>((T*)mat.ptr, mat.cols, mat.rows, mat.step, channels);
             cudaSafeCall ( cudaThreadSynchronize() );
-        }
-        else
-        {
-            matop_krnls::set_to_without_mask<<<numBlocks, threadsPerBlock, 0, stream>>>((T*)mat.ptr, mat.cols, mat.rows, mat.step, channels);
-        }
     }

     void set_to_without_mask(DevMem2D mat, int depth, const double *scalar, int channels, const cudaStream_t & stream)
     {
-        cudaSafeCall( cudaMemcpyToSymbol(matop_krnls::scalar_d, scalar, sizeof(double) * 4));
+        cudaSafeCall( cudaMemcpyToSymbol(scalar_d, scalar, sizeof(double) * 4));

         static SetToFunc_without_mask tab[8] =
         {
@@ -284,7 +248,7 @@

     void set_to_with_mask(DevMem2D mat, int depth, const double * scalar, const DevMem2D& mask, int channels, const cudaStream_t & stream)
     {
-        cudaSafeCall( cudaMemcpyToSymbol(matop_krnls::scalar_d, scalar, sizeof(double) * 4));
+        cudaSafeCall( cudaMemcpyToSymbol(scalar_d, scalar, sizeof(double) * 4));

         static SetToFunc_with_mask tab[8] =
         {
@@ -305,14 +269,11 @@

         func(mat, mask, channels, stream);
     }
-}}}

 ///////////////////////////////////////////////////////////////////////////
 //////////////////////////////// ConvertTo ////////////////////////////////
 ///////////////////////////////////////////////////////////////////////////

-namespace matop_krnls
-{
     template <typename T, typename DT> __global__
    static void convert_to(uchar* srcmat, size_t src_step, uchar* dstmat, size_t dst_step, size_t width, size_t height, double alpha, double beta)
     {
@@ -348,29 +309,20 @@
             }
         }
     }
-}

-namespace cv { namespace gpu { namespace matrix_operations
-{
     typedef void (*CvtFunc)(const DevMem2D& src, DevMem2D& dst, size_t width, size_t height, double alpha, double beta, const cudaStream_t & stream);

     template <typename T, typename DT>
     void cvt_(const DevMem2D& src, DevMem2D& dst, size_t width, size_t height, double alpha, double beta, const cudaStream_t & stream)
     {
-        const int shift = ::matop_krnls::ReadWriteTraits<T, DT>::shift;
+        const int shift = ReadWriteTraits<T, DT>::shift;

         dim3 block(32, 8);
         dim3 grid(divUp(width, block.x * shift), divUp(height, block.y));

+        convert_to<T, DT><<<grid, block, 0, stream>>>(src.data, src.step, dst.data, dst.step, width, height, alpha, beta);
         if (stream == 0)
-        {
-            matop_krnls::convert_to<T, DT><<<grid, block>>>(src.ptr, src.step, dst.ptr, dst.step, width, height, alpha, beta);
             cudaSafeCall( cudaThreadSynchronize() );
-        }
-        else
-        {
-            matop_krnls::convert_to<T, DT><<<grid, block, 0, stream>>>(src.ptr, src.step, dst.ptr, dst.step, width, height, alpha, beta);
-        }
     }

     void convert_to(const DevMem2D& src, int sdepth, DevMem2D dst, int ddepth, int channels, double alpha, double beta, const cudaStream_t & stream)
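
[Editor's note] A recurring simplification throughout matrix_operations.cu (and stereobm.cu below): the old callers duplicated every kernel launch in an if/else on the stream, while the new code launches on the given stream unconditionally — stream 0 is simply CUDA's default stream — and synchronizes only in the blocking case. Distilled, with a hypothetical kernel:

    // before: two copies of the same launch
    if (stream == 0)
    {
        kernel<<<grid, block>>>(args);
        cudaSafeCall( cudaThreadSynchronize() );
    }
    else
    {
        kernel<<<grid, block, 0, stream>>>(args);
    }

    // after: one launch; synchronize only when the caller asked for blocking behavior
    kernel<<<grid, block, 0, stream>>>(args);
    if (stream == 0)
        cudaSafeCall( cudaThreadSynchronize() );
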
diff --git a/modules/gpu/src/cuda/split_merge.cu b/modules/gpu/src/cuda/split_merge.cu
index 3788b22..ab22186 100644
--- a/modules/gpu/src/cuda/split_merge.cu
+++ b/modules/gpu/src/cuda/split_merge.cu
@@ -230,9 +230,9 @@ namespace cv { namespace gpu { namespace split_merge {
         dim3 blockDim(32, 8);
         dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y));
         mergeC2_<T><<<gridDim, blockDim, 0, stream>>>(
-            src[0].ptr, src[0].step,
-            src[1].ptr, src[1].step,
-            dst.rows, dst.cols, dst.ptr, dst.step);
+            src[0].data, src[0].step,
+            src[1].data, src[1].step,
+            dst.rows, dst.cols, dst.data, dst.step);
         if (stream == 0)
             cudaSafeCall(cudaThreadSynchronize());
     }
@@ -244,10 +244,10 @@ namespace cv { namespace gpu { namespace split_merge {
         dim3 blockDim(32, 8);
         dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y));
         mergeC3_<T><<<gridDim, blockDim, 0, stream>>>(
-            src[0].ptr, src[0].step,
-            src[1].ptr, src[1].step,
-            src[2].ptr, src[2].step,
-            dst.rows, dst.cols, dst.ptr, dst.step);
+            src[0].data, src[0].step,
+            src[1].data, src[1].step,
+            src[2].data, src[2].step,
+            dst.rows, dst.cols, dst.data, dst.step);
         if (stream == 0)
             cudaSafeCall(cudaThreadSynchronize());
     }
@@ -259,11 +259,11 @@ namespace cv { namespace gpu { namespace split_merge {
         dim3 blockDim(32, 8);
         dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y));
         mergeC4_<T><<<gridDim, blockDim, 0, stream>>>(
-            src[0].ptr, src[0].step,
-            src[1].ptr, src[1].step,
-            src[2].ptr, src[2].step,
-            src[3].ptr, src[3].step,
-            dst.rows, dst.cols, dst.ptr, dst.step);
+            src[0].data, src[0].step,
+            src[1].data, src[1].step,
+            src[2].data, src[2].step,
+            src[3].data, src[3].step,
+            dst.rows, dst.cols, dst.data, dst.step);
         if (stream == 0)
             cudaSafeCall(cudaThreadSynchronize());
     }
@@ -433,9 +433,9 @@ namespace cv { namespace gpu { namespace split_merge {
         dim3 blockDim(32, 8);
         dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y));
         splitC2_<T><<<gridDim, blockDim, 0, stream>>>(
-            src.ptr, src.step, src.rows, src.cols,
-            dst[0].ptr, dst[0].step,
-            dst[1].ptr, dst[1].step);
+            src.data, src.step, src.rows, src.cols,
+            dst[0].data, dst[0].step,
+            dst[1].data, dst[1].step);
         if (stream == 0)
             cudaSafeCall(cudaThreadSynchronize());
     }
@@ -447,10 +447,10 @@ namespace cv { namespace gpu { namespace split_merge {
         dim3 blockDim(32, 8);
         dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y));
         splitC3_<T><<<gridDim, blockDim, 0, stream>>>(
-            src.ptr, src.step, src.rows, src.cols,
-            dst[0].ptr, dst[0].step,
-            dst[1].ptr, dst[1].step,
-            dst[2].ptr, dst[2].step);
+            src.data, src.step, src.rows, src.cols,
+            dst[0].data, dst[0].step,
+            dst[1].data, dst[1].step,
+            dst[2].data, dst[2].step);
         if (stream == 0)
             cudaSafeCall(cudaThreadSynchronize());
     }
@@ -462,11 +462,11 @@ namespace cv { namespace gpu { namespace split_merge {
         dim3 blockDim(32, 8);
         dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y));
         splitC4_<T><<<gridDim, blockDim, 0, stream>>>(
-            src.ptr, src.step, src.rows, src.cols,
-            dst[0].ptr, dst[0].step,
-            dst[1].ptr, dst[1].step,
-            dst[2].ptr, dst[2].step,
-            dst[3].ptr, dst[3].step);
+            src.data, src.step, src.rows, src.cols,
+            dst[0].data, dst[0].step,
+            dst[1].data, dst[1].step,
+            dst[2].data, dst[2].step,
+            dst[3].data, dst[3].step);
         if (stream == 0)
             cudaSafeCall(cudaThreadSynchronize());
     }
diff --git a/modules/gpu/src/cuda/stereobm.cu b/modules/gpu/src/cuda/stereobm.cu
index 5685c54..80d457f 100644
--- a/modules/gpu/src/cuda/stereobm.cu
+++ b/modules/gpu/src/cuda/stereobm.cu
@@ -43,8 +43,7 @@
 //#include "cuda_shared.hpp"
 #include "opencv2/gpu/devmem2d.hpp"
 #include "safe_call.hpp"
-static inline int divUp(int a, int b) { return (a % b == 0) ? a/b : a/b + 1; }
-
+static inline int divUp(int total, int grain) { return (total + grain - 1) / grain; }

 using namespace cv::gpu;

@@ -55,7 +54,7 @@ using namespace cv::gpu;

 #define ROWSperTHREAD 21     // the number of rows a thread will process

-namespace stereobm_gpu
+namespace cv { namespace gpu { namespace bm
 {

 #define BLOCK_W 128          // the thread block width (464)
@@ -233,7 +232,7 @@ __device__ void InitColSSD(int x_tex, int y_tex, int im_pitch, unsigned char* im
 }

 template <int RADIUS>
-__global__ void stereoKernel(unsigned char *left, unsigned char *right, size_t img_step, unsigned char* disp, size_t disp_pitch, int maxdisp)
+__global__ void stereoKernel(unsigned char *left, unsigned char *right, size_t img_step, PtrStep disp, int maxdisp)
 {
     extern __shared__ unsigned int col_ssd_cache[];
     volatile unsigned int *col_ssd = col_ssd_cache + BLOCK_W + threadIdx.x;
@@ -246,7 +245,7 @@ __global__ void stereoKernel(unsigned char *left, unsigned char *right, size_t i
     //int Y = blockIdx.y * ROWSperTHREAD + RADIUS;

     unsigned int* minSSDImage = cminSSDImage + X + Y * cminSSD_step;
-    unsigned char* disparImage = disp + X + Y * disp_pitch;
+    unsigned char* disparImage = disp.data + X + Y * disp.step;
     /*    if (X < cwidth)
     {
         unsigned int *minSSDImage_end = minSSDImage + min(ROWSperTHREAD, cheight - Y) * minssd_step;
@@ -305,7 +304,7 @@ __global__ void stereoKernel(unsigned char *left, unsigned char *right, size_t i
                 uint2 minSSD = MinSSD(col_ssd_cache + threadIdx.x, col_ssd);
                 if (minSSD.x < minSSDImage[idx])
                 {
-                    disparImage[disp_pitch * row] = (unsigned char)(d + minSSD.y);
+                    disparImage[disp.step * row] = (unsigned char)(d + minSSD.y);
                     minSSDImage[idx] = minSSD.x;
                 }
             }
@@ -313,88 +312,73 @@ __global__ void stereoKernel(unsigned char *left, unsigned char *right, size_t i
     } // for d loop
-}
-
-namespace cv { namespace gpu { namespace bm
+template <int RADIUS> void kernel_caller(const DevMem2D& left, const DevMem2D& right, const DevMem2D& disp, int maxdisp, cudaStream_t & stream)
 {
-    template <int RADIUS> void kernel_caller(const DevMem2D& left, const DevMem2D& right, const DevMem2D& disp, int maxdisp, const cudaStream_t & stream)
-    {
-        dim3 grid(1,1,1);
-        dim3 threads(BLOCK_W, 1, 1);
+    dim3 grid(1,1,1);
+    dim3 threads(BLOCK_W, 1, 1);

-        grid.x = divUp(left.cols - maxdisp - 2 * RADIUS, BLOCK_W);
-        grid.y = divUp(left.rows - 2 * RADIUS, ROWSperTHREAD);
+    grid.x = divUp(left.cols - maxdisp - 2 * RADIUS, BLOCK_W);
+    grid.y = divUp(left.rows - 2 * RADIUS, ROWSperTHREAD);

-        //See above: #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS)
-        size_t smem_size = (BLOCK_W + N_DISPARITIES * (BLOCK_W + 2 * RADIUS)) * sizeof(unsigned int);
-
-        if (stream == 0)
-        {
-            stereobm_gpu::stereoKernel<RADIUS><<<grid, threads, smem_size>>>(left.ptr, right.ptr, left.step, disp.ptr, disp.step, maxdisp);
-            cudaSafeCall( cudaThreadSynchronize() );
-        }
-        else
-        {
-            stereobm_gpu::stereoKernel<RADIUS><<<grid, threads, smem_size, stream>>>(left.ptr, right.ptr, left.step, disp.ptr, disp.step, maxdisp);
-        }
+    //See above: #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS)
+    size_t smem_size = (BLOCK_W + N_DISPARITIES * (BLOCK_W + 2 * RADIUS)) * sizeof(unsigned int);

-    };
+    stereoKernel<RADIUS><<<grid, threads, smem_size, stream>>>(left.data, right.data, left.step, disp, maxdisp);
+    if (stream == 0)
+        cudaSafeCall( cudaThreadSynchronize() );
+};

-    typedef void (*kernel_caller_t)(const DevMem2D& left, const DevMem2D& right, const DevMem2D& disp, int maxdisp, const cudaStream_t & stream);
+typedef void (*kernel_caller_t)(const DevMem2D& left, const DevMem2D& right, const DevMem2D& disp, int maxdisp, cudaStream_t & stream);

-    const static kernel_caller_t callers[] =
-    {
-        0,
-        kernel_caller< 1>, kernel_caller< 2>, kernel_caller< 3>, kernel_caller< 4>, kernel_caller< 5>,
-        kernel_caller< 6>, kernel_caller< 7>, kernel_caller< 8>, kernel_caller< 9>, kernel_caller<10>,
-        kernel_caller<11>, kernel_caller<12>, kernel_caller<13>, kernel_caller<15>, kernel_caller<15>,
-        kernel_caller<16>, kernel_caller<17>, kernel_caller<18>, kernel_caller<19>, kernel_caller<20>,
-        kernel_caller<21>, kernel_caller<22>, kernel_caller<23>, kernel_caller<24>, kernel_caller<25>
-
-        //0,0,0, 0,0,0, 0,0,kernel_caller<9>
-    };
-    const int calles_num = sizeof(callers)/sizeof(callers[0]);
-
-    extern "C" void stereoBM_GPU(const DevMem2D& left, const DevMem2D& right, const DevMem2D& disp, int maxdisp, int winsz, const DevMem2D_<unsigned int>& minSSD_buf, const cudaStream_t & stream)
-    {
-        int winsz2 = winsz >> 1;
+const static kernel_caller_t callers[] =
+{
+    0,
+    kernel_caller< 1>, kernel_caller< 2>, kernel_caller< 3>, kernel_caller< 4>, kernel_caller< 5>,
+    kernel_caller< 6>, kernel_caller< 7>, kernel_caller< 8>, kernel_caller< 9>, kernel_caller<10>,
+    kernel_caller<11>, kernel_caller<12>, kernel_caller<13>, kernel_caller<14>, kernel_caller<15>,
+    kernel_caller<16>, kernel_caller<17>, kernel_caller<18>, kernel_caller<19>, kernel_caller<20>,
+    kernel_caller<21>, kernel_caller<22>, kernel_caller<23>, kernel_caller<24>, kernel_caller<25>

+    //0,0,0, 0,0,0, 0,0,kernel_caller<9>
+};
+const int callers_num = sizeof(callers)/sizeof(callers[0]);

+extern "C" void stereoBM_GPU(const DevMem2D& left, const DevMem2D& right, const DevMem2D& disp, int maxdisp, int winsz, const DevMem2D_<unsigned int>& minSSD_buf, cudaStream_t& stream)
+{
+    int winsz2 = winsz >> 1;

-        if (winsz2 == 0 || winsz2 >= calles_num)
-            cv::gpu::error("Unsupported window size", __FILE__, __LINE__);
+    if (winsz2 == 0 || winsz2 >= callers_num)
+        cv::gpu::error("Unsupported window size", __FILE__, __LINE__);

-        //cudaSafeCall( cudaFuncSetCacheConfig(&stereoKernel, cudaFuncCachePreferL1) );
-        //cudaSafeCall( cudaFuncSetCacheConfig(&stereoKernel, cudaFuncCachePreferShared) );
+    //cudaSafeCall( cudaFuncSetCacheConfig(&stereoKernel, cudaFuncCachePreferL1) );
+    //cudaSafeCall( cudaFuncSetCacheConfig(&stereoKernel, cudaFuncCachePreferShared) );

-        cudaSafeCall( cudaMemset2D(disp.ptr, disp.step, 0, disp.cols, disp.rows) );
-        cudaSafeCall( cudaMemset2D(minSSD_buf.ptr, minSSD_buf.step, 0xFF, minSSD_buf.cols * minSSD_buf.elemSize(),
-                              disp.rows) );
+    cudaSafeCall( cudaMemset2D(disp.data, disp.step, 0, disp.cols, disp.rows) );
+    cudaSafeCall( cudaMemset2D(minSSD_buf.data, minSSD_buf.step, 0xFF, minSSD_buf.cols * minSSD_buf.elemSize(), disp.rows) );

-        cudaSafeCall( cudaMemcpyToSymbol( stereobm_gpu::cwidth, &left.cols, sizeof(left.cols) ) );
-        cudaSafeCall( cudaMemcpyToSymbol( stereobm_gpu::cheight, &left.rows, sizeof(left.rows) ) );
-        cudaSafeCall( cudaMemcpyToSymbol( stereobm_gpu::cminSSDImage, &minSSD_buf.ptr, sizeof(minSSD_buf.ptr) ) );
+    cudaSafeCall( cudaMemcpyToSymbol( cwidth, &left.cols, sizeof(left.cols) ) );
+    cudaSafeCall( cudaMemcpyToSymbol( cheight, &left.rows, sizeof(left.rows) ) );
+    cudaSafeCall( cudaMemcpyToSymbol( cminSSDImage, &minSSD_buf.data, sizeof(minSSD_buf.data) ) );

-        size_t minssd_step = minSSD_buf.step/minSSD_buf.elemSize();
-        cudaSafeCall( cudaMemcpyToSymbol( stereobm_gpu::cminSSD_step, &minssd_step, sizeof(minssd_step) ) );
+    size_t minssd_step = minSSD_buf.step/minSSD_buf.elemSize();
+    cudaSafeCall( cudaMemcpyToSymbol( cminSSD_step, &minssd_step, sizeof(minssd_step) ) );

-        callers[winsz2](left, right, disp, maxdisp, stream);
-    }
-}}}
+    callers[winsz2](left, right, disp, maxdisp, stream);
+}
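
[Editor's note] kernel_caller is instantiated once per supported window radius so that RADIUS is a compile-time constant inside stereoKernel: loop bounds and the shared-memory column layout can be fully unrolled, and stereoBM_GPU only pays a table lookup at run time. The same mechanism, reduced to a hypothetical box-sum kernel (every name below is illustrative):

    template <int RADIUS>
    __global__ void windowSum(const unsigned char* src, size_t step, unsigned char* dst, int cols, int rows)
    {
        int x = blockIdx.x * blockDim.x + threadIdx.x;
        int y = blockIdx.y * blockDim.y + threadIdx.y;

        if (x >= RADIUS && y >= RADIUS && x < cols - RADIUS && y < rows - RADIUS)
        {
            int sum = 0;
            #pragma unroll                               // legal because RADIUS is a constant
            for (int dy = -RADIUS; dy <= RADIUS; ++dy)
                for (int dx = -RADIUS; dx <= RADIUS; ++dx)
                    sum += src[(y + dy) * step + x + dx];

            dst[y * step + x] = (unsigned char)(sum / ((2 * RADIUS + 1) * (2 * RADIUS + 1)));
        }
    }

A runtime radius would force dynamic loop bounds in the innermost SSD loop, which is exactly what the callers[] table avoids.
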
//////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////// Sobel Prefilter ///////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////////

-namespace stereobm_gpu
-{
-
 texture<unsigned char, 2, cudaReadModeElementType> texForSobel;

-extern "C" __global__ void prefilter_kernel(unsigned char *output, size_t step, int width, int height, int prefilterCap)
+extern "C" __global__ void prefilter_kernel(DevMem2D output, int prefilterCap)
 {
     int x = blockDim.x * blockIdx.x + threadIdx.x;
     int y = blockDim.y * blockIdx.y + threadIdx.y;

-    if (x < width && y < height)
+    if (x < output.cols && y < output.rows)
     {
         int conv = (int)tex2D(texForSobel, x - 1, y - 1) * (-1) + (int)tex2D(texForSobel, x + 1, y - 1) * (1) +
                    (int)tex2D(texForSobel, x - 1, y    ) * (-2) + (int)tex2D(texForSobel, x + 1, y    ) * (2) +
@@ -402,48 +386,35 @@

         conv = min(min(max(-prefilterCap, conv), prefilterCap) + prefilterCap, 255);

-        output[y * step + x] = conv & 0xFF;
+        output.ptr(y)[x] = conv & 0xFF;
     }
 }
-}

-namespace cv { namespace gpu { namespace bm
+extern "C" void prefilter_xsobel(const DevMem2D& input, const DevMem2D& output, int prefilterCap, cudaStream_t & stream)
 {
-    extern "C" void prefilter_xsobel(const DevMem2D& input, const DevMem2D& output, int prefilterCap, const cudaStream_t & stream)
-    {
-        cudaChannelFormatDesc desc = cudaCreateChannelDesc<unsigned char>();
-        cudaSafeCall( cudaBindTexture2D( 0, stereobm_gpu::texForSobel, input.ptr, desc, input.cols, input.rows, input.step ) );
+    cudaChannelFormatDesc desc = cudaCreateChannelDesc<unsigned char>();
+    cudaSafeCall( cudaBindTexture2D( 0, texForSobel, input.data, desc, input.cols, input.rows, input.step ) );

-        dim3 threads(16, 16, 1);
-        dim3 grid(1, 1, 1);
+    dim3 threads(16, 16, 1);
+    dim3 grid(1, 1, 1);

-        grid.x = divUp(input.cols, threads.x);
-        grid.y = divUp(input.rows, threads.y);
+    grid.x = divUp(input.cols, threads.x);
+    grid.y = divUp(input.rows, threads.y);

-        if (stream == 0)
-        {
-            stereobm_gpu::prefilter_kernel<<<grid, threads>>>(output.ptr, output.step, output.cols, output.rows, prefilterCap);
-            cudaSafeCall( cudaThreadSynchronize() );
-        }
-        else
-        {
-            stereobm_gpu::prefilter_kernel<<<grid, threads, 0, stream>>>(output.ptr, output.step, output.cols, output.rows, prefilterCap);
-        }
+    prefilter_kernel<<<grid, threads, 0, stream>>>(output, prefilterCap);

+    if (stream == 0)
+        cudaSafeCall( cudaThreadSynchronize() );

-        cudaSafeCall( cudaUnbindTexture (stereobm_gpu::texForSobel ) );
+    cudaSafeCall( cudaUnbindTexture (texForSobel ) );
+}

-    }
-}}}

//////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////// Textureness filtering ////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////////

-namespace stereobm_gpu
-{
-
 texture<unsigned char, 2, cudaReadModeElementType> texForTF;

 __device__ float sobel(int x, int y)
@@ -478,7 +449,7 @@ __device__ float CalcSums(float *cols, float *cols_cache, int winsz)

 #define RpT (2 * ROWSperTHREAD)  // got experimentally

-extern "C" __global__ void textureness_kernel(unsigned char *disp, size_t disp_step, int winsz, float threshold, int width, int height)
+extern "C" __global__ void textureness_kernel(DevMem2D disp, int winsz, float threshold)
 {
     int winsz2 = winsz/2;
     int n_dirty_pixels = (winsz2) * 2;
@@ -489,9 +460,9 @@

     int x = blockIdx.x * blockDim.x + threadIdx.x;
     int beg_row = blockIdx.y * RpT;
-    int end_row = min(beg_row + RpT, height);
+    int end_row = min(beg_row + RpT, disp.rows);

-    if (x < width)
+    if (x < disp.cols)
     {
         int y = beg_row;
@@ -512,7 +483,7 @@

         float sum_win = CalcSums(cols, cols_cache + threadIdx.x, winsz) * 255;
         if (sum_win < threshold)
-            disp[y * disp_step + x] = 0;
+            disp.data[y * disp.step + x] = 0;

         __syncthreads();
@@ -530,45 +501,37 @@

             __syncthreads();
             float sum_win = CalcSums(cols, cols_cache + threadIdx.x, winsz) * 255;
             if (sum_win < threshold)
-                disp[y * disp_step + x] = 0;
+                disp.data[y * disp.step + x] = 0;

             __syncthreads();
         }
     }
 }
-}

-namespace cv { namespace gpu { namespace bm
+extern "C" void postfilter_textureness(const DevMem2D& input, int winsz, float avgTexturenessThreshold, const DevMem2D& disp, cudaStream_t & stream)
 {
-    extern "C" void postfilter_textureness(const DevMem2D& input, int winsz, float avgTexturenessThreshold, const DevMem2D& disp, const cudaStream_t & stream)
-    {
-        avgTexturenessThreshold *= winsz * winsz;
+    avgTexturenessThreshold *= winsz * winsz;

-        stereobm_gpu::texForTF.filterMode = cudaFilterModeLinear;
-        stereobm_gpu::texForTF.addressMode[0] = cudaAddressModeWrap;
-        stereobm_gpu::texForTF.addressMode[1] = cudaAddressModeWrap;
+    texForTF.filterMode     = cudaFilterModeLinear;
+    texForTF.addressMode[0] = cudaAddressModeWrap;
+    texForTF.addressMode[1] = cudaAddressModeWrap;

-        cudaChannelFormatDesc desc = cudaCreateChannelDesc<unsigned char>();
-        cudaSafeCall( cudaBindTexture2D( 0, stereobm_gpu::texForTF, input.ptr, desc, input.cols, input.rows, input.step ) );
+    cudaChannelFormatDesc desc = cudaCreateChannelDesc<unsigned char>();
+    cudaSafeCall( cudaBindTexture2D( 0, texForTF, input.data, desc, input.cols, input.rows, input.step ) );

-        dim3 threads(128, 1, 1);
-        dim3 grid(1, 1, 1);
+    dim3 threads(128, 1, 1);
+    dim3 grid(1, 1, 1);

-        grid.x = divUp(input.cols, threads.x);
-        grid.y = divUp(input.rows, RpT);
+    grid.x = divUp(input.cols, threads.x);
+    grid.y = divUp(input.rows, RpT);

-        size_t smem_size = (threads.x + threads.x + (winsz/2) * 2 ) * sizeof(float);
+    size_t smem_size = (threads.x + threads.x + (winsz/2) * 2 ) * sizeof(float);
+    textureness_kernel<<<grid, threads, smem_size, stream>>>(disp, winsz, avgTexturenessThreshold);
-        if (stream == 0)
-        {
-            stereobm_gpu::textureness_kernel<<<grid, threads, smem_size>>>(disp.ptr, disp.step, winsz, avgTexturenessThreshold, disp.cols, disp.rows);
-            cudaSafeCall( cudaThreadSynchronize() );
-        }
-        else
-        {
-            stereobm_gpu::textureness_kernel<<<grid, threads, smem_size, stream>>>(disp.ptr, disp.step, winsz, avgTexturenessThreshold, disp.cols, disp.rows);
-        }
+    if (stream == 0)
+        cudaSafeCall( cudaThreadSynchronize() );

+    cudaSafeCall( cudaUnbindTexture (texForTF) );
+
+}
-        cudaSafeCall( cudaUnbindTexture (stereobm_gpu::texForTF) );
-    }
 }}}
diff --git a/modules/gpu/src/cuda/transform.hpp b/modules/gpu/src/cuda/transform.hpp
index 43ed19e..2eab55b 100644
--- a/modules/gpu/src/cuda/transform.hpp
+++ b/modules/gpu/src/cuda/transform.hpp
@@ -44,36 +44,32 @@
 #define __OPENCV_GPU_TRANSFORM_HPP__

 #include "cuda_shared.hpp"
-#include "saturate_cast.hpp"
-#include "vecmath.hpp"

-namespace cv { namespace gpu { namespace algo_krnls
+namespace cv { namespace gpu { namespace device
 {
     template <typename T, typename D, typename UnOp>
-    static __global__ void transform(const T* src, size_t src_step,
-        D* dst, size_t dst_step, int width, int height, UnOp op)
+    static __global__ void transform(const DevMem2D_<T> src, PtrStep_<D> dst, UnOp op)
     {
         const int x = blockDim.x * blockIdx.x + threadIdx.x;
         const int y = blockDim.y * blockIdx.y + threadIdx.y;

-        if (x < width && y < height)
+        if (x < src.cols && y < src.rows)
         {
-            T src_data = src[y * src_step + x];
-            dst[y * dst_step + x] = op(src_data, x, y);
+            T src_data = src.ptr(y)[x];
+            dst.ptr(y)[x] = op(src_data, x, y);
         }
     }
     template <typename T1, typename T2, typename D, typename BinOp>
-    static __global__ void transform(const T1* src1, size_t src1_step, const T2* src2, size_t src2_step,
-        D* dst, size_t dst_step, int width, int height, BinOp op)
+    static __global__ void transform(const DevMem2D_<T1> src1, const PtrStep_<T2> src2, PtrStep_<D> dst, BinOp op)
     {
         const int x = blockDim.x * blockIdx.x + threadIdx.x;
         const int y = blockDim.y * blockIdx.y + threadIdx.y;

-        if (x < width && y < height)
+        if (x < src1.cols && y < src1.rows)
         {
-            T1 src1_data = src1[y * src1_step + x];
-            T2 src2_data = src2[y * src2_step + x];
-            dst[y * dst_step + x] = op(src1_data, src2_data, x, y);
+            T1 src1_data = src1.ptr(y)[x];
+            T2 src2_data = src2.ptr(y)[x];
+            dst.ptr(y)[x] = op(src1_data, src2_data, x, y);
         }
     }
 }}}
@@ -83,7 +79,7 @@ namespace cv
     namespace gpu
     {
         template <typename T, typename D, typename UnOp>
-        static void transform(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, UnOp op, cudaStream_t stream)
+        static void transform2(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, UnOp op, cudaStream_t stream)
         {
             dim3 threads(16, 16, 1);
             dim3 grid(1, 1, 1);
@@ -91,8 +87,7 @@ namespace cv
             grid.x = divUp(src.cols, threads.x);
             grid.y = divUp(src.rows, threads.y);

-            algo_krnls::transform<<<grid, threads, 0, stream>>>(src.ptr, src.elem_step,
-                dst.ptr, dst.elem_step, src.cols, src.rows, op);
+            device::transform<<<grid, threads, 0, stream>>>(src, dst, op);

             if (stream == 0)
                 cudaSafeCall( cudaThreadSynchronize() );
@@ -106,11 +101,10 @@ namespace cv
             grid.x = divUp(src1.cols, threads.x);
             grid.y = divUp(src1.rows, threads.y);

-            algo_krnls::transform<<<grid, threads, 0, stream>>>(src1.ptr, src1.elem_step,
-                src2.ptr, src2.elem_step, dst.ptr, dst.elem_step, src1.cols, src1.rows, op);
+            device::transform<<<grid, threads, 0, stream>>>(src1, src2, dst, op);

             if (stream == 0)
-              cudaSafeCall( cudaThreadSynchronize() );
+                cudaSafeCall( cudaThreadSynchronize() );
         }
     }
 }
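
[Editor's note] transform.hpp now gives the rest of the module a one-line way to run an elementwise functor over a device matrix: the host wrapper builds the grid and forwards the DevMem2D_/PtrStep_ wrappers straight to the kernel, which deduces its template arguments from them. A hypothetical functor sketched against this API (AddScalar and add_scalar_gpu are illustrative; transform2 is the unary wrapper introduced above, and the functor also receives the pixel coordinates):

    struct AddScalar
    {
        float val;
        explicit AddScalar(float v) : val(v) {}

        __device__ float operator()(float src, int /*x*/, int /*y*/) const
        {
            return src + val;
        }
    };

    void add_scalar_gpu(const DevMem2Df& src, const DevMem2Df& dst, float v, cudaStream_t stream)
    {
        // functor is copied by value into the kernel launch
        cv::gpu::transform2(src, dst, AddScalar(v), stream);
    }
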
diff --git a/modules/gpu/src/cuda/vecmath.hpp b/modules/gpu/src/cuda/vecmath.hpp
index e5fe86f..ff42383 100644
--- a/modules/gpu/src/cuda/vecmath.hpp
+++ b/modules/gpu/src/cuda/vecmath.hpp
@@ -384,7 +384,14 @@ namespace cv
     template <typename VecD, typename VecS>
     static __device__ VecD saturate_cast_caller(const VecS& v)
     {
-        SatCast<VecTraits<VecD>::cn, VecD> cast;
+        SatCast<
+
+            VecTraits<VecD>::cn,
+
+            VecD
+        >
+
+        cast;
         return cast(v);
     }
diff --git a/modules/gpu/src/filtering.cpp b/modules/gpu/src/filtering.cpp
index 17fb71d..961e547 100644
--- a/modules/gpu/src/filtering.cpp
+++ b/modules/gpu/src/filtering.cpp
@@ -577,7 +577,10 @@ void cv::gpu::filter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& ke

 namespace cv { namespace gpu { namespace filters
 {
-    void linearRowFilter_gpu_8u_8u_c4(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor);
+    template <int CN, typename T, typename D>
+    void linearRowFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor);
+
+    //void linearRowFilter_gpu_8u_8u_c4(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor);
     void linearRowFilter_gpu_8u_8s_c4(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor);
     void linearRowFilter_gpu_8s_8u_c4(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor);
     void linearRowFilter_gpu_8s_8s_c4(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor);
@@ -653,7 +656,8 @@ Ptr<BaseRowFilter_GPU> cv::gpu::getLinearRowFilter_GPU(int srcType, int bufType,
     static const nppFilter1D_t nppFilter1D_callers[] = {0, nppiFilterRow_8u_C1R, 0, 0, nppiFilterRow_8u_C4R};
     static const gpuFilter1D_t gpuFilter1D_callers[6][6] =
     {
-        {linearRowFilter_gpu_8u_8u_c4,linearRowFilter_gpu_8u_8s_c4,0,0,0,0},
+        {linearRowFilter_gpu<4, uchar4, uchar4>/*linearRowFilter_gpu_8u_8u_c4*/,linearRowFilter_gpu_8u_8s_c4,0,0,0,0},
+
         {linearRowFilter_gpu_8s_8u_c4,linearRowFilter_gpu_8s_8s_c4,0,0,0,0},
         {0,0,linearRowFilter_gpu_16u_16u_c2,linearRowFilter_gpu_16u_16s_c2,0,0},
         {0,0,linearRowFilter_gpu_16s_16u_c2,linearRowFilter_gpu_16s_16s_c2,0,0},
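
[Editor's note] The filtering change begins folding the family of hand-written per-type entry points (linearRowFilter_gpu_8u_8u_c4 and friends) into one template selected from the dispatch table; the template parameter list <int CN, typename T, typename D> above is inferred from the single instantiation visible in the hunk, linearRowFilter_gpu<4, uchar4, uchar4>. On the caller's side of the module the public API is unchanged; for reference, a minimal usage sketch of the stereo pipeline these files implement (parameter values here are illustrative, assuming 8-bit grayscale inputs):

    cv::gpu::StereoBM_GPU bm(cv::gpu::StereoBM_GPU::BASIC_PRESET, /*ndisp=*/64, /*winSize=*/19);

    cv::gpu::GpuMat left_gpu(left_host), right_gpu(right_host), disp_gpu;
    bm(left_gpu, right_gpu, disp_gpu);   // blocking call on the default stream

The stereobm_gpu.cpp hunks below only retarget the host-side declarations at the refactored entry points.
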
diff --git a/modules/gpu/src/stereobm_gpu.cpp b/modules/gpu/src/stereobm_gpu.cpp
index 9ff64ec..2e6f8bd 100644
--- a/modules/gpu/src/stereobm_gpu.cpp
+++ b/modules/gpu/src/stereobm_gpu.cpp
@@ -61,9 +61,9 @@ namespace cv { namespace gpu
     namespace bm
     {
         //extern "C" void stereoBM_GPU(const DevMem2D& left, const DevMem2D& right, const DevMem2D& disp, int ndisp, int winsz, const DevMem2D_<unsigned int>& minSSD_buf);
-        extern "C" void stereoBM_GPU(const DevMem2D& left, const DevMem2D& right, const DevMem2D& disp, int ndisp, int winsz, const DevMem2D_<unsigned int>& minSSD_buf, const cudaStream_t & stream);
-        extern "C" void prefilter_xsobel(const DevMem2D& input, const DevMem2D& output, int prefilterCap /*= 31*/, const cudaStream_t & stream);
-        extern "C" void postfilter_textureness(const DevMem2D& input, int winsz, float avgTexturenessThreshold, const DevMem2D& disp, const cudaStream_t & stream);
+        extern "C" void stereoBM_GPU(const DevMem2D& left, const DevMem2D& right, const DevMem2D& disp, int ndisp, int winsz, const DevMem2D_<unsigned int>& minSSD_buf, cudaStream_t & stream);
+        extern "C" void prefilter_xsobel(const DevMem2D& input, const DevMem2D output, int prefilterCap /*= 31*/, cudaStream_t & stream);
+        extern "C" void postfilter_textureness(const DevMem2D& input, int winsz, float avgTexturenessThreshold, const DevMem2D& disp, cudaStream_t & stream);
     }
 }}

@@ -98,7 +98,7 @@ bool cv::gpu::StereoBM_GPU::checkIfGpuCallReasonable()
     return false;
 }

-static void stereo_bm_gpu_operator ( GpuMat& minSSD, GpuMat& leBuf, GpuMat& riBuf, int preset, int ndisp, int winSize, float avergeTexThreshold, const GpuMat& left, const GpuMat& right, GpuMat& disparity, const cudaStream_t & stream)
+static void stereo_bm_gpu_operator ( GpuMat& minSSD, GpuMat& leBuf, GpuMat& riBuf, int preset, int ndisp, int winSize, float avergeTexThreshold, const GpuMat& left, const GpuMat& right, GpuMat& disparity, cudaStream_t stream)
 {
     CV_DbgAssert(left.rows == right.rows && left.cols == right.cols);
     CV_DbgAssert(left.type() == CV_8UC1);
-- 
2.7.4