From 49ec8ba742d7ec7db96f9b40c4cf5a3b10cdfc0e Mon Sep 17 00:00:00 2001 From: Vladislav Vinogradov Date: Mon, 13 Dec 2010 08:43:04 +0000 Subject: [PATCH] fixed bug in gpu filter engine (incorrect buffer type) and in vector's saturate_cast. changed buffer type in linear filters to float. added support of 1 channel image to linear filters. added support of BORDER_REFLECT101, BORDER_REPLICATE and BORDER_CONSTANT border type to gpu linear filters. minor fix in tests. update comments in gpu.hpp. --- modules/gpu/include/opencv2/gpu/gpu.hpp | 50 ++- modules/gpu/src/arithm.cpp | 4 +- modules/gpu/src/cuda/filters.cu | 509 +++++++++++++++++++------ modules/gpu/src/cuda/internal_shared.hpp | 3 +- modules/gpu/src/filtering.cpp | 242 +++++++----- modules/gpu/src/imgproc_gpu.cpp | 6 + modules/gpu/src/opencv2/gpu/device/vecmath.hpp | 149 ++++---- tests/gpu/src/brute_force_matcher.cpp | 6 +- tests/gpu/src/filters.cpp | 16 +- 9 files changed, 676 insertions(+), 309 deletions(-) diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp index 87c7c68..dafa5e1 100644 --- a/modules/gpu/include/opencv2/gpu/gpu.hpp +++ b/modules/gpu/include/opencv2/gpu/gpu.hpp @@ -388,7 +388,7 @@ namespace cv CV_EXPORTS void divide(const GpuMat& a, const Scalar& sc, GpuMat& c); //! transposes the matrix - //! supports only CV_8UC1 type + //! supports CV_8UC1, CV_8SC1, CV_8UC4, CV_8SC4, CV_16UC2, CV_16SC2, CV_32SC1, CV_32FC1 type CV_EXPORTS void transpose(const GpuMat& src1, GpuMat& dst); //! computes element-wise absolute difference of two arrays (c = abs(a - b)) @@ -725,11 +725,11 @@ namespace cv }; //! returns the non-separable filter engine with the specified filter - CV_EXPORTS Ptr createFilter2D_GPU(const Ptr filter2D); + CV_EXPORTS Ptr createFilter2D_GPU(const Ptr filter2D, int srcType, int dstType); //! returns the separable filter engine with the specified filters CV_EXPORTS Ptr createSeparableFilter_GPU(const Ptr& rowFilter, - const Ptr& columnFilter); + const Ptr& columnFilter, int srcType, int bufType, int dstType); //! returns horizontal 1D box filter //! supports only CV_8UC1 source type and CV_32FC1 sum type @@ -767,23 +767,40 @@ namespace cv CV_EXPORTS Ptr createLinearFilter_GPU(int srcType, int dstType, const Mat& kernel, const Point& anchor = Point(-1,-1)); - //! returns the primitive row filter with the specified kernel + //! returns the primitive row filter with the specified kernel. + //! supports only CV_8UC1, CV_8UC4, CV_16SC1, CV_16SC2, CV_32SC1, CV_32FC1 source type. + //! there are two version of algorithm: NPP and OpenCV. + //! NPP calls when srcType == CV_8UC1 or srcType == CV_8UC4 and bufType == srcType, + //! otherwise calls OpenCV version. + //! NPP supports only BORDER_CONSTANT border type. + //! OpenCV version supports only CV_32F as buffer depth and + //! BORDER_REFLECT101, BORDER_REPLICATE and BORDER_CONSTANT border types. CV_EXPORTS Ptr getLinearRowFilter_GPU(int srcType, int bufType, const Mat& rowKernel, - int anchor = -1); - - //! returns the primitive column filter with the specified kernel + int anchor = -1, int borderType = BORDER_CONSTANT); + + //! returns the primitive column filter with the specified kernel. + //! supports only CV_8UC1, CV_8UC4, CV_16SC1, CV_16SC2, CV_32SC1, CV_32FC1 dst type. + //! there are two version of algorithm: NPP and OpenCV. + //! NPP calls when dstType == CV_8UC1 or dstType == CV_8UC4 and bufType == dstType, + //! otherwise calls OpenCV version. + //! NPP supports only BORDER_CONSTANT border type. + //! OpenCV version supports only CV_32F as buffer depth and + //! BORDER_REFLECT101, BORDER_REPLICATE and BORDER_CONSTANT border types. CV_EXPORTS Ptr getLinearColumnFilter_GPU(int bufType, int dstType, const Mat& columnKernel, - int anchor = -1); + int anchor = -1, int borderType = BORDER_CONSTANT); //! returns the separable linear filter engine CV_EXPORTS Ptr createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat& rowKernel, - const Mat& columnKernel, const Point& anchor = Point(-1,-1)); + const Mat& columnKernel, const Point& anchor = Point(-1,-1), int rowBorderType = BORDER_DEFAULT, + int columnBorderType = -1); //! returns filter engine for the generalized Sobel operator - CV_EXPORTS Ptr createDerivFilter_GPU(int srcType, int dstType, int dx, int dy, int ksize); + CV_EXPORTS Ptr createDerivFilter_GPU(int srcType, int dstType, int dx, int dy, int ksize, + int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1); //! returns the Gaussian filter engine - CV_EXPORTS Ptr createGaussianFilter_GPU(int type, Size ksize, double sigma1, double sigma2 = 0); + CV_EXPORTS Ptr createGaussianFilter_GPU(int type, Size ksize, double sigma1, double sigma2 = 0, + int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1); //! returns maximum filter CV_EXPORTS Ptr getMaxFilter_GPU(int srcType, int dstType, const Size& ksize, Point anchor = Point(-1,-1)); @@ -812,16 +829,19 @@ namespace cv //! applies separable 2D linear filter to the image CV_EXPORTS void sepFilter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernelX, const Mat& kernelY, - Point anchor = Point(-1,-1)); + Point anchor = Point(-1,-1), int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1); //! applies generalized Sobel operator to the image - CV_EXPORTS void Sobel(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, int ksize = 3, double scale = 1); + CV_EXPORTS void Sobel(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, int ksize = 3, double scale = 1, + int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1); //! applies the vertical or horizontal Scharr operator to the image - CV_EXPORTS void Scharr(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, double scale = 1); + CV_EXPORTS void Scharr(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, double scale = 1, + int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1); //! smooths the image using Gaussian filter. - CV_EXPORTS void GaussianBlur(const GpuMat& src, GpuMat& dst, Size ksize, double sigma1, double sigma2 = 0); + CV_EXPORTS void GaussianBlur(const GpuMat& src, GpuMat& dst, Size ksize, double sigma1, double sigma2 = 0, + int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1); //! applies Laplacian operator to the image //! supports only ksize = 1 and ksize = 3 diff --git a/modules/gpu/src/arithm.cpp b/modules/gpu/src/arithm.cpp index 90edea1..3dcae2c 100644 --- a/modules/gpu/src/arithm.cpp +++ b/modules/gpu/src/arithm.cpp @@ -277,12 +277,12 @@ namespace cv { namespace gpu { namespace mathfunc void cv::gpu::transpose(const GpuMat& src, GpuMat& dst) { - CV_Assert(src.type() == CV_8UC1 || src.type() == CV_8UC4 || src.type() == CV_8SC4 + CV_Assert(src.type() == CV_8UC1 || src.type() == CV_8SC1 || src.type() == CV_8UC4 || src.type() == CV_8SC4 || src.type() == CV_16UC2 || src.type() == CV_16SC2 || src.type() == CV_32SC1 || src.type() == CV_32FC1); dst.create( src.cols, src.rows, src.type() ); - if (src.type() == CV_8UC1) + if (src.type() == CV_8UC1 || src.type() == CV_8SC1) { NppiSize sz; sz.width = src.cols; diff --git a/modules/gpu/src/cuda/filters.cu b/modules/gpu/src/cuda/filters.cu index d725d0d..035bab5 100644 --- a/modules/gpu/src/cuda/filters.cu +++ b/modules/gpu/src/cuda/filters.cu @@ -43,6 +43,7 @@ #include "opencv2/gpu/devmem2d.hpp" #include "opencv2/gpu/device/saturate_cast.hpp" #include "opencv2/gpu/device/vecmath.hpp" +#include "opencv2/gpu/device/limits_gpu.hpp" #include "safe_call.hpp" #include "internal_shared.hpp" @@ -50,14 +51,198 @@ using namespace cv::gpu; using namespace cv::gpu::device; -#ifndef FLT_MAX -#define FLT_MAX 3.402823466e+30F -#endif +namespace cv +{ + namespace gpu + { + namespace device + { + struct BrdReflect101 + { + explicit BrdReflect101(int len): last(len - 1) {} + + __device__ int idx_low(int i) const + { + return abs(i); + } + + __device__ int idx_high(int i) const + { + return last - abs(last - i); + } + + __device__ int idx(int i) const + { + return abs(idx_high(i)); + } + + bool is_range_safe(int mini, int maxi) const + { + return -last <= mini && maxi <= 2 * last; + } + + int last; + }; + template + struct BrdRowReflect101: BrdReflect101 + { + explicit BrdRowReflect101(int len): BrdReflect101(len) {} + + template + __device__ D at_low(int i, const T* data) const + { + return saturate_cast(data[idx_low(i)]); + } + + template + __device__ D at_high(int i, const T* data) const + { + return saturate_cast(data[idx_high(i)]); + } + }; + template + struct BrdColReflect101: BrdReflect101 + { + BrdColReflect101(int len, int step): BrdReflect101(len), step(step) {} + + template + __device__ D at_low(int i, const T* data) const + { + return saturate_cast(data[idx_low(i) * step]); + } + + template + __device__ D at_high(int i, const T* data) const + { + return saturate_cast(data[idx_high(i) * step]); + } + + int step; + }; + + struct BrdReplicate + { + explicit BrdReplicate(int len): last(len - 1) {} + + __device__ int idx_low(int i) const + { + return max(i, 0); + } + + __device__ int idx_high(int i) const + { + return min(i, last); + } + + __device__ int idx(int i) const + { + return max(min(i, last), 0); + } + + bool is_range_safe(int mini, int maxi) const + { + return true; + } + + int last; + }; + template + struct BrdRowReplicate: BrdReplicate + { + explicit BrdRowReplicate(int len): BrdReplicate(len) {} + + template + __device__ D at_low(int i, const T* data) const + { + return saturate_cast(data[idx_low(i)]); + } + + template + __device__ D at_high(int i, const T* data) const + { + return saturate_cast(data[idx_high(i)]); + } + }; + template + struct BrdColReplicate: BrdReplicate + { + BrdColReplicate(int len, int step): BrdReplicate(len), step(step) {} + + template + __device__ D at_low(int i, const T* data) const + { + return saturate_cast(data[idx_low(i) * step]); + } + + template + __device__ D at_high(int i, const T* data) const + { + return saturate_cast(data[idx_high(i) * step]); + } + int step; + }; + + template + struct BrdRowConstant + { + explicit BrdRowConstant(int len_, const D& val_ = VecTraits::all(0)): len(len_), val(val_) {} + + template + __device__ D at_low(int i, const T* data) const + { + return i >= 0 ? saturate_cast(data[i]) : val; + } + + template + __device__ D at_high(int i, const T* data) const + { + return i < len ? saturate_cast(data[i]) : val; + } + + bool is_range_safe(int mini, int maxi) const + { + return true; + } + + int len; + D val; + }; + template + struct BrdColConstant + { + BrdColConstant(int len_, int step_, const D& val_ = VecTraits::all(0)): len(len_), step(step_), val(val_) {} + + template + __device__ D at_low(int i, const T* data) const + { + return i >= 0 ? saturate_cast(data[i * step]) : val; + } + + template + __device__ D at_high(int i, const T* data) const + { + return i < len ? saturate_cast(data[i * step]) : val; + } + + bool is_range_safe(int mini, int maxi) const + { + return true; + } + + int len; + int step; + D val; + }; + } + } +} ///////////////////////////////////////////////////////////////////////////////////////////////// // Linear filters #define MAX_KERNEL_SIZE 16 +#define BLOCK_DIM_X 16 +#define BLOCK_DIM_Y 16 namespace filter_krnls { @@ -74,46 +259,53 @@ namespace cv { namespace gpu { namespace filters namespace filter_krnls { - template - __global__ void linearRowFilter(const T* src, size_t src_step, D* dst, size_t dst_step, int anchor, int width, int height) + template struct SmemType_ { - __shared__ T smem[BLOCK_DIM_Y * BLOCK_DIM_X * 3]; - - const int blockStartX = blockDim.x * blockIdx.x; - const int blockStartY = blockDim.y * blockIdx.y; + typedef typename TypeVec::cn>::vec_t smem_t; + }; + template struct SmemType_ + { + typedef T smem_t; + }; + template struct SmemType + { + typedef typename SmemType_::smem_t smem_t; + }; - const int threadX = blockStartX + threadIdx.x; - const int prevThreadX = threadX - blockDim.x; - const int nextThreadX = threadX + blockDim.x; + template + __global__ void linearRowFilter(const DevMem2D_ src, PtrStep_ dst, int anchor, const B b) + { + typedef typename SmemType::smem_t smem_t; - const int threadY = blockStartY + threadIdx.y; + __shared__ smem_t smem[BLOCK_DIM_Y * BLOCK_DIM_X * 3]; + + const int x = BLOCK_DIM_X * blockIdx.x + threadIdx.x; + const int y = BLOCK_DIM_Y * blockIdx.y + threadIdx.y; - T* sDataRow = smem + threadIdx.y * blockDim.x * 3; + smem_t* sDataRow = smem + threadIdx.y * BLOCK_DIM_X * 3; - if (threadY < height) + if (y < src.rows) { - const T* rowSrc = src + threadY * src_step; - - sDataRow[threadIdx.x + blockDim.x] = threadX < width ? rowSrc[threadX] : VecTraits::all(0); + const T* rowSrc = src.ptr(y); - sDataRow[threadIdx.x] = prevThreadX >= 0 ? rowSrc[prevThreadX] : VecTraits::all(0); - - sDataRow[(blockDim.x << 1) + threadIdx.x] = nextThreadX < width ? rowSrc[nextThreadX] : VecTraits::all(0); + sDataRow[threadIdx.x ] = b.at_low(x - BLOCK_DIM_X, rowSrc); + sDataRow[threadIdx.x + BLOCK_DIM_X ] = b.at_high(x, rowSrc); + sDataRow[threadIdx.x + BLOCK_DIM_X * 2] = b.at_high(x + BLOCK_DIM_X, rowSrc); __syncthreads(); - if (threadX < width) + if (x < src.cols) { typedef typename TypeVec::cn>::vec_t sum_t; sum_t sum = VecTraits::all(0); - sDataRow += threadIdx.x + blockDim.x - anchor; + sDataRow += threadIdx.x + BLOCK_DIM_X - anchor; #pragma unroll - for(int i = 0; i < KERNEL_SIZE; ++i) + for(int i = 0; i < ksize; ++i) sum = sum + sDataRow[i] * cLinearKernel[i]; - dst[threadY * dst_step + threadX] = saturate_cast(sum); + dst.ptr(y)[x] = saturate_cast(sum); } } } @@ -121,100 +313,138 @@ namespace filter_krnls namespace cv { namespace gpu { namespace filters { - template + template class B> void linearRowFilter_caller(const DevMem2D_& src, const DevMem2D_& dst, int anchor) { - const int BLOCK_DIM_X = 16; - const int BLOCK_DIM_Y = 16; - dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y); - dim3 blocks(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y)); + dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y)); - filter_krnls::linearRowFilter<<>>(src.data, src.step/src.elemSize(), - dst.data, dst.step/dst.elemSize(), anchor, src.cols, src.rows); + typedef typename filter_krnls::SmemType::smem_t smem_t; + B b(src.cols); + + if (!b.is_range_safe(-BLOCK_DIM_X, (grid.x + 1) * BLOCK_DIM_X - 1)) + { + cv::gpu::error("linearRowFilter: can't use specified border extrapolation, image is too small, " + "try bigger image or another border extrapolation mode", __FILE__, __LINE__); + } + + filter_krnls::linearRowFilter<<>>(src, dst, anchor, b); cudaSafeCall( cudaThreadSynchronize() ); } template - void linearRowFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor) + void linearRowFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type) { typedef void (*caller_t)(const DevMem2D_& src, const DevMem2D_& dst, int anchor); - static const caller_t callers[] = - {linearRowFilter_caller<0 , T, D>, linearRowFilter_caller<1 , T, D>, - linearRowFilter_caller<2 , T, D>, linearRowFilter_caller<3 , T, D>, - linearRowFilter_caller<4 , T, D>, linearRowFilter_caller<5 , T, D>, - linearRowFilter_caller<6 , T, D>, linearRowFilter_caller<7 , T, D>, - linearRowFilter_caller<8 , T, D>, linearRowFilter_caller<9 , T, D>, - linearRowFilter_caller<10, T, D>, linearRowFilter_caller<11, T, D>, - linearRowFilter_caller<12, T, D>, linearRowFilter_caller<13, T, D>, - linearRowFilter_caller<14, T, D>, linearRowFilter_caller<15, T, D>}; - + static const caller_t callers[3][17] = + { + { + 0, + linearRowFilter_caller<1 , T, D, BrdRowReflect101>, + linearRowFilter_caller<2 , T, D, BrdRowReflect101>, + linearRowFilter_caller<3 , T, D, BrdRowReflect101>, + linearRowFilter_caller<4 , T, D, BrdRowReflect101>, + linearRowFilter_caller<5 , T, D, BrdRowReflect101>, + linearRowFilter_caller<6 , T, D, BrdRowReflect101>, + linearRowFilter_caller<7 , T, D, BrdRowReflect101>, + linearRowFilter_caller<8 , T, D, BrdRowReflect101>, + linearRowFilter_caller<9 , T, D, BrdRowReflect101>, + linearRowFilter_caller<10, T, D, BrdRowReflect101>, + linearRowFilter_caller<11, T, D, BrdRowReflect101>, + linearRowFilter_caller<12, T, D, BrdRowReflect101>, + linearRowFilter_caller<13, T, D, BrdRowReflect101>, + linearRowFilter_caller<14, T, D, BrdRowReflect101>, + linearRowFilter_caller<15, T, D, BrdRowReflect101>, + linearRowFilter_caller<16, T, D, BrdRowReflect101>, + }, + { + 0, + linearRowFilter_caller<1 , T, D, BrdRowReplicate>, + linearRowFilter_caller<2 , T, D, BrdRowReplicate>, + linearRowFilter_caller<3 , T, D, BrdRowReplicate>, + linearRowFilter_caller<4 , T, D, BrdRowReplicate>, + linearRowFilter_caller<5 , T, D, BrdRowReplicate>, + linearRowFilter_caller<6 , T, D, BrdRowReplicate>, + linearRowFilter_caller<7 , T, D, BrdRowReplicate>, + linearRowFilter_caller<8 , T, D, BrdRowReplicate>, + linearRowFilter_caller<9 , T, D, BrdRowReplicate>, + linearRowFilter_caller<10, T, D, BrdRowReplicate>, + linearRowFilter_caller<11, T, D, BrdRowReplicate>, + linearRowFilter_caller<12, T, D, BrdRowReplicate>, + linearRowFilter_caller<13, T, D, BrdRowReplicate>, + linearRowFilter_caller<14, T, D, BrdRowReplicate>, + linearRowFilter_caller<15, T, D, BrdRowReplicate>, + linearRowFilter_caller<16, T, D, BrdRowReplicate>, + }, + { + 0, + linearRowFilter_caller<1 , T, D, BrdRowConstant>, + linearRowFilter_caller<2 , T, D, BrdRowConstant>, + linearRowFilter_caller<3 , T, D, BrdRowConstant>, + linearRowFilter_caller<4 , T, D, BrdRowConstant>, + linearRowFilter_caller<5 , T, D, BrdRowConstant>, + linearRowFilter_caller<6 , T, D, BrdRowConstant>, + linearRowFilter_caller<7 , T, D, BrdRowConstant>, + linearRowFilter_caller<8 , T, D, BrdRowConstant>, + linearRowFilter_caller<9 , T, D, BrdRowConstant>, + linearRowFilter_caller<10, T, D, BrdRowConstant>, + linearRowFilter_caller<11, T, D, BrdRowConstant>, + linearRowFilter_caller<12, T, D, BrdRowConstant>, + linearRowFilter_caller<13, T, D, BrdRowConstant>, + linearRowFilter_caller<14, T, D, BrdRowConstant>, + linearRowFilter_caller<15, T, D, BrdRowConstant>, + linearRowFilter_caller<16, T, D, BrdRowConstant>, + } + }; + loadLinearKernel(kernel, ksize); - callers[ksize]((DevMem2D_)src, (DevMem2D_)dst, anchor); + callers[brd_type][ksize]((DevMem2D_)src, (DevMem2D_)dst, anchor); } - template void linearRowFilter_gpu(const DevMem2D&, const DevMem2D&, const float[], int , int); - template void linearRowFilter_gpu(const DevMem2D&, const DevMem2D&, const float[], int , int); - template void linearRowFilter_gpu(const DevMem2D&, const DevMem2D&, const float[], int , int); - template void linearRowFilter_gpu(const DevMem2D&, const DevMem2D&, const float[], int , int); - - template void linearRowFilter_gpu(const DevMem2D&, const DevMem2D&, const float[], int , int); - template void linearRowFilter_gpu(const DevMem2D&, const DevMem2D&, const float[], int , int); - template void linearRowFilter_gpu(const DevMem2D&, const DevMem2D&, const float[], int , int); - template void linearRowFilter_gpu(const DevMem2D&, const DevMem2D&, const float[], int , int); - - template void linearRowFilter_gpu(const DevMem2D&, const DevMem2D&, const float[], int , int); - template void linearRowFilter_gpu(const DevMem2D&, const DevMem2D&, const float[], int , int); - template void linearRowFilter_gpu(const DevMem2D&, const DevMem2D&, const float[], int , int); - template void linearRowFilter_gpu(const DevMem2D&, const DevMem2D&, const float[], int , int); + template void linearRowFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type); + template void linearRowFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type); + template void linearRowFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type);; + template void linearRowFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type); + template void linearRowFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type); + template void linearRowFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type); }}} namespace filter_krnls { - template - __global__ void linearColumnFilter(const T* src, size_t src_step, D* dst, size_t dst_step, int anchor, int width, int height) + template + __global__ void linearColumnFilter(const DevMem2D_ src, PtrStep_ dst, int anchor, const B b) { __shared__ T smem[BLOCK_DIM_Y * BLOCK_DIM_X * 3]; - const int blockStartX = blockDim.x * blockIdx.x; - const int blockStartY = blockDim.y * blockIdx.y; - - const int threadX = blockStartX + threadIdx.x; - - const int threadY = blockStartY + threadIdx.y; - const int prevThreadY = threadY - blockDim.y; - const int nextThreadY = threadY + blockDim.y; - - const int smem_step = blockDim.x; + const int x = BLOCK_DIM_X * blockIdx.x + threadIdx.x; + const int y = BLOCK_DIM_Y * blockIdx.y + threadIdx.y; T* sDataColumn = smem + threadIdx.x; - if (threadX < width) + if (x < src.cols) { - const T* colSrc = src + threadX; + const T* srcCol = src.ptr() + x; - sDataColumn[(threadIdx.y + blockDim.y) * smem_step] = threadY < height ? colSrc[threadY * src_step] : VecTraits::all(0); - - sDataColumn[threadIdx.y * smem_step] = prevThreadY >= 0 ? colSrc[prevThreadY * src_step] : VecTraits::all(0); - - sDataColumn[(threadIdx.y + (blockDim.y << 1)) * smem_step] = nextThreadY < height ? colSrc[nextThreadY * src_step] : VecTraits::all(0); + sDataColumn[ threadIdx.y * BLOCK_DIM_X] = b.at_low(y - BLOCK_DIM_Y, srcCol); + sDataColumn[(threadIdx.y + BLOCK_DIM_Y) * BLOCK_DIM_X] = b.at_high(y, srcCol); + sDataColumn[(threadIdx.y + BLOCK_DIM_Y * 2) * BLOCK_DIM_X] = b.at_high(y + BLOCK_DIM_Y, srcCol); __syncthreads(); - if (threadY < height) + if (y < src.rows) { typedef typename TypeVec::cn>::vec_t sum_t; sum_t sum = VecTraits::all(0); - sDataColumn += (threadIdx.y + blockDim.y - anchor)* smem_step; + sDataColumn += (threadIdx.y + BLOCK_DIM_Y - anchor) * BLOCK_DIM_X; #pragma unroll - for(int i = 0; i < KERNEL_SIZE; ++i) - sum = sum + sDataColumn[i * smem_step] * cLinearKernel[i]; + for(int i = 0; i < ksize; ++i) + sum = sum + sDataColumn[i * BLOCK_DIM_X] * cLinearKernel[i]; - dst[threadY * dst_step + threadX] = saturate_cast(sum); + dst.ptr(y)[x] = saturate_cast(sum); } } } @@ -222,54 +452,101 @@ namespace filter_krnls namespace cv { namespace gpu { namespace filters { - template + template class B> void linearColumnFilter_caller(const DevMem2D_& src, const DevMem2D_& dst, int anchor) { - const int BLOCK_DIM_X = 16; - const int BLOCK_DIM_Y = 16; - dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y); - dim3 blocks(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y)); + dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y)); + + B b(src.rows, src.step / src.elemSize()); - filter_krnls::linearColumnFilter<<>>(src.data, src.step/src.elemSize(), - dst.data, dst.step/dst.elemSize(), anchor, src.cols, src.rows); + if (!b.is_range_safe(-BLOCK_DIM_Y, (grid.y + 1) * BLOCK_DIM_Y - 1)) + { + cv::gpu::error("linearColumnFilter: can't use specified border extrapolation, image is too small, " + "try bigger image or another border extrapolation mode", __FILE__, __LINE__); + } + + filter_krnls::linearColumnFilter<<>>(src, dst, anchor, b); cudaSafeCall( cudaThreadSynchronize() ); } template - void linearColumnFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor) + void linearColumnFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type) { typedef void (*caller_t)(const DevMem2D_& src, const DevMem2D_& dst, int anchor); - static const caller_t callers[] = - {linearColumnFilter_caller<0 , T, D>, linearColumnFilter_caller<1 , T, D>, - linearColumnFilter_caller<2 , T, D>, linearColumnFilter_caller<3 , T, D>, - linearColumnFilter_caller<4 , T, D>, linearColumnFilter_caller<5 , T, D>, - linearColumnFilter_caller<6 , T, D>, linearColumnFilter_caller<7 , T, D>, - linearColumnFilter_caller<8 , T, D>, linearColumnFilter_caller<9 , T, D>, - linearColumnFilter_caller<10, T, D>, linearColumnFilter_caller<11, T, D>, - linearColumnFilter_caller<12, T, D>, linearColumnFilter_caller<13, T, D>, - linearColumnFilter_caller<14, T, D>, linearColumnFilter_caller<15, T, D>}; - + static const caller_t callers[3][17] = + { + { + 0, + linearColumnFilter_caller<1 , T, D, BrdColReflect101>, + linearColumnFilter_caller<2 , T, D, BrdColReflect101>, + linearColumnFilter_caller<3 , T, D, BrdColReflect101>, + linearColumnFilter_caller<4 , T, D, BrdColReflect101>, + linearColumnFilter_caller<5 , T, D, BrdColReflect101>, + linearColumnFilter_caller<6 , T, D, BrdColReflect101>, + linearColumnFilter_caller<7 , T, D, BrdColReflect101>, + linearColumnFilter_caller<8 , T, D, BrdColReflect101>, + linearColumnFilter_caller<9 , T, D, BrdColReflect101>, + linearColumnFilter_caller<10, T, D, BrdColReflect101>, + linearColumnFilter_caller<11, T, D, BrdColReflect101>, + linearColumnFilter_caller<12, T, D, BrdColReflect101>, + linearColumnFilter_caller<13, T, D, BrdColReflect101>, + linearColumnFilter_caller<14, T, D, BrdColReflect101>, + linearColumnFilter_caller<15, T, D, BrdColReflect101>, + linearColumnFilter_caller<16, T, D, BrdColReflect101>, + }, + { + 0, + linearColumnFilter_caller<1 , T, D, BrdColReplicate>, + linearColumnFilter_caller<2 , T, D, BrdColReplicate>, + linearColumnFilter_caller<3 , T, D, BrdColReplicate>, + linearColumnFilter_caller<4 , T, D, BrdColReplicate>, + linearColumnFilter_caller<5 , T, D, BrdColReplicate>, + linearColumnFilter_caller<6 , T, D, BrdColReplicate>, + linearColumnFilter_caller<7 , T, D, BrdColReplicate>, + linearColumnFilter_caller<8 , T, D, BrdColReplicate>, + linearColumnFilter_caller<9 , T, D, BrdColReplicate>, + linearColumnFilter_caller<10, T, D, BrdColReplicate>, + linearColumnFilter_caller<11, T, D, BrdColReplicate>, + linearColumnFilter_caller<12, T, D, BrdColReplicate>, + linearColumnFilter_caller<13, T, D, BrdColReplicate>, + linearColumnFilter_caller<14, T, D, BrdColReplicate>, + linearColumnFilter_caller<15, T, D, BrdColReplicate>, + linearColumnFilter_caller<16, T, D, BrdColReplicate>, + }, + { + 0, + linearColumnFilter_caller<1 , T, D, BrdColConstant>, + linearColumnFilter_caller<2 , T, D, BrdColConstant>, + linearColumnFilter_caller<3 , T, D, BrdColConstant>, + linearColumnFilter_caller<4 , T, D, BrdColConstant>, + linearColumnFilter_caller<5 , T, D, BrdColConstant>, + linearColumnFilter_caller<6 , T, D, BrdColConstant>, + linearColumnFilter_caller<7 , T, D, BrdColConstant>, + linearColumnFilter_caller<8 , T, D, BrdColConstant>, + linearColumnFilter_caller<9 , T, D, BrdColConstant>, + linearColumnFilter_caller<10, T, D, BrdColConstant>, + linearColumnFilter_caller<11, T, D, BrdColConstant>, + linearColumnFilter_caller<12, T, D, BrdColConstant>, + linearColumnFilter_caller<13, T, D, BrdColConstant>, + linearColumnFilter_caller<14, T, D, BrdColConstant>, + linearColumnFilter_caller<15, T, D, BrdColConstant>, + linearColumnFilter_caller<16, T, D, BrdColConstant>, + } + }; + loadLinearKernel(kernel, ksize); - callers[ksize]((DevMem2D_)src, (DevMem2D_)dst, anchor); + callers[brd_type][ksize]((DevMem2D_)src, (DevMem2D_)dst, anchor); } - template void linearColumnFilter_gpu(const DevMem2D&, const DevMem2D&, const float[], int , int); - template void linearColumnFilter_gpu(const DevMem2D&, const DevMem2D&, const float[], int , int); - template void linearColumnFilter_gpu(const DevMem2D&, const DevMem2D&, const float[], int , int); - template void linearColumnFilter_gpu(const DevMem2D&, const DevMem2D&, const float[], int , int); - - template void linearColumnFilter_gpu(const DevMem2D&, const DevMem2D&, const float[], int , int); - template void linearColumnFilter_gpu(const DevMem2D&, const DevMem2D&, const float[], int , int); - template void linearColumnFilter_gpu(const DevMem2D&, const DevMem2D&, const float[], int , int); - template void linearColumnFilter_gpu(const DevMem2D&, const DevMem2D&, const float[], int , int); - - template void linearColumnFilter_gpu(const DevMem2D&, const DevMem2D&, const float[], int , int); - template void linearColumnFilter_gpu(const DevMem2D&, const DevMem2D&, const float[], int , int); - template void linearColumnFilter_gpu(const DevMem2D&, const DevMem2D&, const float[], int , int); - template void linearColumnFilter_gpu(const DevMem2D&, const DevMem2D&, const float[], int , int); + template void linearColumnFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type); + template void linearColumnFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type); + template void linearColumnFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type); + template void linearColumnFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type); + template void linearColumnFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type); + template void linearColumnFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type); }}} ///////////////////////////////////////////////////////////////////////////////////////////////// @@ -377,7 +654,7 @@ namespace bf_krnls } } - float minimum = FLT_MAX; + float minimum = numeric_limits_gpu::max(); int id = 0; if (cost[0] < minimum) diff --git a/modules/gpu/src/cuda/internal_shared.hpp b/modules/gpu/src/cuda/internal_shared.hpp index e52ba4e..6bf060d 100644 --- a/modules/gpu/src/cuda/internal_shared.hpp +++ b/modules/gpu/src/cuda/internal_shared.hpp @@ -59,7 +59,8 @@ namespace cv enum { BORDER_REFLECT101_GPU = 0, - BORDER_REPLICATE_GPU + BORDER_REPLICATE_GPU, + BORDER_CONSTANT_GPU }; // Converts CPU border extrapolation mode into GPU internal analogue. diff --git a/modules/gpu/src/filtering.cpp b/modules/gpu/src/filtering.cpp index b8dcaaf..b611e49 100644 --- a/modules/gpu/src/filtering.cpp +++ b/modules/gpu/src/filtering.cpp @@ -48,8 +48,8 @@ using namespace cv::gpu; #if !defined (HAVE_CUDA) -Ptr cv::gpu::createFilter2D_GPU(const Ptr) { throw_nogpu(); return Ptr(0); } -Ptr cv::gpu::createSeparableFilter_GPU(const Ptr&, const Ptr&) { throw_nogpu(); return Ptr(0); } +Ptr cv::gpu::createFilter2D_GPU(const Ptr, int, int) { throw_nogpu(); return Ptr(0); } +Ptr cv::gpu::createSeparableFilter_GPU(const Ptr&, const Ptr&, int, int, int) { throw_nogpu(); return Ptr(0); } Ptr cv::gpu::getRowSumFilter_GPU(int, int, int, int) { throw_nogpu(); return Ptr(0); } Ptr cv::gpu::getColumnSumFilter_GPU(int, int, int, int) { throw_nogpu(); return Ptr(0); } Ptr cv::gpu::getBoxFilter_GPU(int, int, const Size&, Point) { throw_nogpu(); return Ptr(0); } @@ -58,11 +58,11 @@ Ptr cv::gpu::getMorphologyFilter_GPU(int, int, const Mat&, const Ptr cv::gpu::createMorphologyFilter_GPU(int, int, const Mat&, const Point&, int) { throw_nogpu(); return Ptr(0); } Ptr cv::gpu::getLinearFilter_GPU(int, int, const Mat&, const Size&, Point) { throw_nogpu(); return Ptr(0); } Ptr cv::gpu::createLinearFilter_GPU(int, int, const Mat&, const Point&) { throw_nogpu(); return Ptr(0); } -Ptr cv::gpu::getLinearRowFilter_GPU(int, int, const Mat&, int) { throw_nogpu(); return Ptr(0); } -Ptr cv::gpu::getLinearColumnFilter_GPU(int, int, const Mat&, int) { throw_nogpu(); return Ptr(0); } -Ptr cv::gpu::createSeparableLinearFilter_GPU(int, int, const Mat&, const Mat&, const Point&) { throw_nogpu(); return Ptr(0); } -Ptr cv::gpu::createDerivFilter_GPU(int, int, int, int, int) { throw_nogpu(); return Ptr(0); } -Ptr cv::gpu::createGaussianFilter_GPU(int, Size, double, double) { throw_nogpu(); return Ptr(0); } +Ptr cv::gpu::getLinearRowFilter_GPU(int, int, const Mat&, int, int) { throw_nogpu(); return Ptr(0); } +Ptr cv::gpu::getLinearColumnFilter_GPU(int, int, const Mat&, int, int) { throw_nogpu(); return Ptr(0); } +Ptr cv::gpu::createSeparableLinearFilter_GPU(int, int, const Mat&, const Mat&, const Point&, int, int) { throw_nogpu(); return Ptr(0); } +Ptr cv::gpu::createDerivFilter_GPU(int, int, int, int, int, int, int) { throw_nogpu(); return Ptr(0); } +Ptr cv::gpu::createGaussianFilter_GPU(int, Size, double, double, int, int) { throw_nogpu(); return Ptr(0); } Ptr cv::gpu::getMaxFilter_GPU(int, int, const Size&, Point) { throw_nogpu(); return Ptr(0); } Ptr cv::gpu::getMinFilter_GPU(int, int, const Size&, Point) { throw_nogpu(); return Ptr(0); } @@ -71,10 +71,10 @@ void cv::gpu::erode( const GpuMat&, GpuMat&, const Mat&, Point, int) { throw_nog void cv::gpu::dilate( const GpuMat&, GpuMat&, const Mat&, Point, int) { throw_nogpu(); } void cv::gpu::morphologyEx( const GpuMat&, GpuMat&, int, const Mat&, Point, int) { throw_nogpu(); } void cv::gpu::filter2D(const GpuMat&, GpuMat&, int, const Mat&, Point) { throw_nogpu(); } -void cv::gpu::sepFilter2D(const GpuMat&, GpuMat&, int, const Mat&, const Mat&, Point) { throw_nogpu(); } -void cv::gpu::Sobel(const GpuMat&, GpuMat&, int, int, int, int, double) { throw_nogpu(); } -void cv::gpu::Scharr(const GpuMat&, GpuMat&, int, int, int, double) { throw_nogpu(); } -void cv::gpu::GaussianBlur(const GpuMat&, GpuMat&, Size, double, double) { throw_nogpu(); } +void cv::gpu::sepFilter2D(const GpuMat&, GpuMat&, int, const Mat&, const Mat&, Point, int, int) { throw_nogpu(); } +void cv::gpu::Sobel(const GpuMat&, GpuMat&, int, int, int, int, double, int, int) { throw_nogpu(); } +void cv::gpu::Scharr(const GpuMat&, GpuMat&, int, int, int, double, int, int) { throw_nogpu(); } +void cv::gpu::GaussianBlur(const GpuMat&, GpuMat&, Size, double, double, int, int) { throw_nogpu(); } void cv::gpu::Laplacian(const GpuMat&, GpuMat&, int, int, double) { throw_nogpu(); } #else @@ -133,13 +133,17 @@ namespace class Filter2DEngine_GPU : public FilterEngine_GPU { public: - Filter2DEngine_GPU(const Ptr& filter2D_) : filter2D(filter2D_) {} + Filter2DEngine_GPU(const Ptr& filter2D_, int srcType_, int dstType_) : + filter2D(filter2D_), srcType(srcType_), dstType(dstType_) + {} virtual void apply(const GpuMat& src, GpuMat& dst, Rect roi = Rect(0,0,-1,-1)) { + CV_Assert(src.type() == srcType); + Size src_size = src.size(); - dst.create(src_size, src.type()); + dst.create(src_size, dstType); dst = Scalar(0.0); normalizeROI(roi, filter2D->ksize, filter2D->anchor, src_size); @@ -151,12 +155,13 @@ namespace } Ptr filter2D; + int srcType, dstType; }; } -Ptr cv::gpu::createFilter2D_GPU(const Ptr filter2D) +Ptr cv::gpu::createFilter2D_GPU(const Ptr filter2D, int srcType, int dstType) { - return Ptr(new Filter2DEngine_GPU(filter2D)); + return Ptr(new Filter2DEngine_GPU(filter2D, srcType, dstType)); } //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -168,8 +173,9 @@ namespace { public: SeparableFilterEngine_GPU(const Ptr& rowFilter_, - const Ptr& columnFilter_) : - rowFilter(rowFilter_), columnFilter(columnFilter_) + const Ptr& columnFilter_, int srcType_, int bufType_, int dstType_) : + rowFilter(rowFilter_), columnFilter(columnFilter_), + srcType(srcType_), bufType(bufType_), dstType(dstType_) { ksize = Size(rowFilter->ksize, columnFilter->ksize); anchor = Point(rowFilter->anchor, columnFilter->anchor); @@ -177,19 +183,20 @@ namespace virtual void apply(const GpuMat& src, GpuMat& dst, Rect roi = Rect(0,0,-1,-1)) { + CV_Assert(src.type() == srcType); + Size src_size = src.size(); - int src_type = src.type(); - dst.create(src_size, src_type); + dst.create(src_size, dstType); dst = Scalar(0.0); - dstBuf.create(src_size, src_type); + dstBuf.create(src_size, bufType); dstBuf = Scalar(0.0); normalizeROI(roi, ksize, anchor, src_size); - srcROI = src(roi); - dstROI = dst(roi); - dstBufROI = dstBuf(roi); + GpuMat srcROI = src(roi); + GpuMat dstROI = dst(roi); + GpuMat dstBufROI = dstBuf(roi); (*rowFilter)(srcROI, dstBufROI); (*columnFilter)(dstBufROI, dstROI); @@ -197,19 +204,19 @@ namespace Ptr rowFilter; Ptr columnFilter; + int srcType, bufType, dstType; + Size ksize; Point anchor; - GpuMat dstBuf; - GpuMat srcROI; - GpuMat dstROI; - GpuMat dstBufROI; + + GpuMat dstBuf; }; } Ptr cv::gpu::createSeparableFilter_GPU(const Ptr& rowFilter, - const Ptr& columnFilter) + const Ptr& columnFilter, int srcType, int bufType, int dstType) { - return Ptr(new SeparableFilterEngine_GPU(rowFilter, columnFilter)); + return Ptr(new SeparableFilterEngine_GPU(rowFilter, columnFilter, srcType, bufType, dstType)); } //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -315,7 +322,7 @@ Ptr cv::gpu::getBoxFilter_GPU(int srcType, int dstType, const Si Ptr cv::gpu::createBoxFilter_GPU(int srcType, int dstType, const Size& ksize, const Point& anchor) { Ptr boxFilter = getBoxFilter_GPU(srcType, dstType, ksize, anchor); - return createFilter2D_GPU(boxFilter); + return createFilter2D_GPU(boxFilter, srcType, dstType); } void cv::gpu::boxFilter(const GpuMat& src, GpuMat& dst, int ddepth, Size ksize, Point anchor) @@ -386,8 +393,8 @@ namespace class MorphologyFilterEngine_GPU : public Filter2DEngine_GPU { public: - MorphologyFilterEngine_GPU(const Ptr& filter2D_, int iters_) : - Filter2DEngine_GPU(filter2D_), iters(iters_) {} + MorphologyFilterEngine_GPU(const Ptr& filter2D_, int type, int iters_) : + Filter2DEngine_GPU(filter2D_, type, type), iters(iters_) {} virtual void apply(const GpuMat& src, GpuMat& dst, Rect roi = Rect(0,0,-1,-1)) { @@ -415,7 +422,7 @@ Ptr cv::gpu::createMorphologyFilter_GPU(int op, int type, cons Ptr filter2D = getMorphologyFilter_GPU(op, type, kernel, ksize, anchor); - return Ptr(new MorphologyFilterEngine_GPU(filter2D, iterations)); + return Ptr(new MorphologyFilterEngine_GPU(filter2D, type, iterations)); } namespace @@ -558,7 +565,7 @@ Ptr cv::gpu::createLinearFilter_GPU(int srcType, int dstType, Ptr linearFilter = getLinearFilter_GPU(srcType, dstType, kernel, ksize, anchor); - return createFilter2D_GPU(linearFilter); + return createFilter2D_GPU(linearFilter, srcType, dstType); } void cv::gpu::filter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernel, Point anchor) @@ -578,10 +585,10 @@ void cv::gpu::filter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& ke namespace cv { namespace gpu { namespace filters { template - void linearRowFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor); + void linearRowFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type); template - void linearColumnFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor); + void linearColumnFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type); }}} namespace @@ -589,7 +596,7 @@ namespace typedef NppStatus (*nppFilter1D_t)(const Npp8u * pSrc, Npp32s nSrcStep, Npp8u * pDst, Npp32s nDstStep, NppiSize oROI, const Npp32s * pKernel, Npp32s nMaskSize, Npp32s nAnchor, Npp32s nDivisor); - typedef void (*gpuFilter1D_t)(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor); + typedef void (*gpuFilter1D_t)(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type); class NppLinearRowFilter : public BaseRowFilter_GPU { @@ -614,35 +621,28 @@ namespace class GpuLinearRowFilter : public BaseRowFilter_GPU { public: - GpuLinearRowFilter(int ksize_, int anchor_, const Mat& kernel_, gpuFilter1D_t func_) : - BaseRowFilter_GPU(ksize_, anchor_), kernel(kernel_), func(func_) {} + GpuLinearRowFilter(int ksize_, int anchor_, const Mat& kernel_, gpuFilter1D_t func_, int brd_type_) : + BaseRowFilter_GPU(ksize_, anchor_), kernel(kernel_), func(func_), brd_type(brd_type_) {} virtual void operator()(const GpuMat& src, GpuMat& dst) { - func(src, dst, kernel.ptr(), ksize, anchor); + func(src, dst, kernel.ptr(), ksize, anchor, brd_type); } Mat kernel; gpuFilter1D_t func; + int brd_type; }; } -Ptr cv::gpu::getLinearRowFilter_GPU(int srcType, int bufType, const Mat& rowKernel, int anchor) +Ptr cv::gpu::getLinearRowFilter_GPU(int srcType, int bufType, const Mat& rowKernel, int anchor, int borderType) { - using namespace cv::gpu::filters; static const nppFilter1D_t nppFilter1D_callers[] = {0, nppiFilterRow_8u_C1R, 0, 0, nppiFilterRow_8u_C4R}; - static const gpuFilter1D_t gpuFilter1D_callers[6][6] = - { - {linearRowFilter_gpu,linearRowFilter_gpu,0,0,0,0}, - {linearRowFilter_gpu,linearRowFilter_gpu,0,0,0,0}, - {0,0,linearRowFilter_gpu,linearRowFilter_gpu,0,0}, - {0,0,linearRowFilter_gpu,linearRowFilter_gpu,0,0}, - {0,0,0,0,linearRowFilter_gpu, linearRowFilter_gpu}, - {0,0,0,0,linearRowFilter_gpu, linearRowFilter_gpu} - }; if ((bufType == srcType) && (srcType == CV_8UC1 || srcType == CV_8UC4)) { + CV_Assert(borderType == BORDER_CONSTANT); + GpuMat gpu_row_krnl; int nDivisor; normalizeKernel(rowKernel, gpu_row_krnl, CV_32S, &nDivisor, true); @@ -653,9 +653,15 @@ Ptr cv::gpu::getLinearRowFilter_GPU(int srcType, int bufType, return Ptr(new NppLinearRowFilter(ksize, anchor, gpu_row_krnl, nDivisor, nppFilter1D_callers[CV_MAT_CN(srcType)])); } + + CV_Assert(borderType == BORDER_REFLECT101 || borderType == BORDER_REPLICATE || borderType == BORDER_CONSTANT); + int gpuBorderType; + CV_Assert(tryConvertToGpuBorderType(borderType, gpuBorderType)); - CV_Assert(srcType == CV_8UC4 || srcType == CV_8SC4 || srcType == CV_16UC2 || srcType == CV_16SC2 || srcType == CV_32SC1 || srcType == CV_32FC1); - CV_Assert(bufType == CV_8UC4 || bufType == CV_8SC4 || bufType == CV_16UC2 || bufType == CV_16SC2 || bufType == CV_32SC1 || bufType == CV_32FC1); + CV_Assert(srcType == CV_8UC1 || srcType == CV_8UC4 || srcType == CV_16SC1 || srcType == CV_16SC2 + || srcType == CV_32SC1 || srcType == CV_32FC1); + + CV_Assert(CV_MAT_DEPTH(bufType) == CV_32F && CV_MAT_CN(srcType) == CV_MAT_CN(bufType)); Mat temp(rowKernel.size(), CV_32FC1); rowKernel.convertTo(temp, CV_32FC1); @@ -663,12 +669,35 @@ Ptr cv::gpu::getLinearRowFilter_GPU(int srcType, int bufType, int ksize = cont_krnl.cols; - CV_Assert(ksize < 16); + CV_Assert(ksize > 0 && ksize <= 16); normalizeAnchor(anchor, ksize); - return Ptr(new GpuLinearRowFilter(ksize, anchor, cont_krnl, - gpuFilter1D_callers[CV_MAT_DEPTH(srcType)][CV_MAT_DEPTH(bufType)])); + gpuFilter1D_t func = 0; + + switch (srcType) + { + case CV_8UC1: + func = filters::linearRowFilter_gpu; + break; + case CV_8UC4: + func = filters::linearRowFilter_gpu; + break; + case CV_16SC1: + func = filters::linearRowFilter_gpu; + break; + case CV_16SC2: + func = filters::linearRowFilter_gpu; + break; + case CV_32SC1: + func = filters::linearRowFilter_gpu; + break; + case CV_32FC1: + func = filters::linearRowFilter_gpu; + break; + } + + return Ptr(new GpuLinearRowFilter(ksize, anchor, cont_krnl, func, gpuBorderType)); } namespace @@ -696,35 +725,28 @@ namespace class GpuLinearColumnFilter : public BaseColumnFilter_GPU { public: - GpuLinearColumnFilter(int ksize_, int anchor_, const Mat& kernel_, gpuFilter1D_t func_) : - BaseColumnFilter_GPU(ksize_, anchor_), kernel(kernel_), func(func_) {} + GpuLinearColumnFilter(int ksize_, int anchor_, const Mat& kernel_, gpuFilter1D_t func_, int brd_type_) : + BaseColumnFilter_GPU(ksize_, anchor_), kernel(kernel_), func(func_), brd_type(brd_type_) {} virtual void operator()(const GpuMat& src, GpuMat& dst) { - func(src, dst, kernel.ptr(), ksize, anchor); + func(src, dst, kernel.ptr(), ksize, anchor, brd_type); } Mat kernel; gpuFilter1D_t func; + int brd_type; }; } -Ptr cv::gpu::getLinearColumnFilter_GPU(int bufType, int dstType, const Mat& columnKernel, int anchor) +Ptr cv::gpu::getLinearColumnFilter_GPU(int bufType, int dstType, const Mat& columnKernel, int anchor, int borderType) { - using namespace cv::gpu::filters; static const nppFilter1D_t nppFilter1D_callers[] = {0, nppiFilterColumn_8u_C1R, 0, 0, nppiFilterColumn_8u_C4R}; - static const gpuFilter1D_t gpuFilter1D_callers[6][6] = - { - {linearColumnFilter_gpu,linearColumnFilter_gpu,0,0,0,0}, - {linearColumnFilter_gpu,linearColumnFilter_gpu,0,0,0,0}, - {0,0,linearColumnFilter_gpu,linearColumnFilter_gpu,0,0}, - {0,0,linearColumnFilter_gpu,linearColumnFilter_gpu,0,0}, - {0,0,0,0,linearColumnFilter_gpu, linearColumnFilter_gpu}, - {0,0,0,0,linearColumnFilter_gpu, linearColumnFilter_gpu} - }; if ((bufType == dstType) && (bufType == CV_8UC1 || bufType == CV_8UC4)) { + CV_Assert(borderType == BORDER_CONSTANT); + GpuMat gpu_col_krnl; int nDivisor; normalizeKernel(columnKernel, gpu_col_krnl, CV_32S, &nDivisor, true); @@ -735,9 +757,15 @@ Ptr cv::gpu::getLinearColumnFilter_GPU(int bufType, int ds return Ptr(new NppLinearColumnFilter(ksize, anchor, gpu_col_krnl, nDivisor, nppFilter1D_callers[CV_MAT_CN(bufType)])); } + + CV_Assert(borderType == BORDER_REFLECT101 || borderType == BORDER_REPLICATE || borderType == BORDER_CONSTANT); + int gpuBorderType; + CV_Assert(tryConvertToGpuBorderType(borderType, gpuBorderType)); + + CV_Assert(dstType == CV_8UC1 || dstType == CV_8UC4 || dstType == CV_16SC1 || dstType == CV_16SC2 + || dstType == CV_32SC1 || dstType == CV_32FC1); - CV_Assert(dstType == CV_8UC4 || dstType == CV_8SC4 || dstType == CV_16UC2 || dstType == CV_16SC2 || dstType == CV_32SC1 || dstType == CV_32FC1); - CV_Assert(bufType == CV_8UC4 || bufType == CV_8SC4 || bufType == CV_16UC2 || bufType == CV_16SC2 || bufType == CV_32SC1 || bufType == CV_32FC1); + CV_Assert(CV_MAT_DEPTH(bufType) == CV_32F && CV_MAT_CN(dstType) == CV_MAT_CN(bufType)); Mat temp(columnKernel.size(), CV_32FC1); columnKernel.convertTo(temp, CV_32FC1); @@ -745,50 +773,76 @@ Ptr cv::gpu::getLinearColumnFilter_GPU(int bufType, int ds int ksize = cont_krnl.cols; - CV_Assert(ksize < 16); + CV_Assert(ksize > 0 && ksize <= 16); normalizeAnchor(anchor, ksize); - return Ptr(new GpuLinearColumnFilter(ksize, anchor, cont_krnl, - gpuFilter1D_callers[CV_MAT_DEPTH(bufType)][CV_MAT_DEPTH(dstType)])); + gpuFilter1D_t func = 0; + + switch (dstType) + { + case CV_8UC1: + func = filters::linearColumnFilter_gpu; + break; + case CV_8UC4: + func = filters::linearColumnFilter_gpu; + break; + case CV_16SC1: + func = filters::linearColumnFilter_gpu; + break; + case CV_16SC2: + func = filters::linearColumnFilter_gpu; + break; + case CV_32SC1: + func = filters::linearColumnFilter_gpu; + break; + case CV_32FC1: + func = filters::linearColumnFilter_gpu; + break; + } + + return Ptr(new GpuLinearColumnFilter(ksize, anchor, cont_krnl, func, gpuBorderType)); } Ptr cv::gpu::createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat& rowKernel, const Mat& columnKernel, - const Point& anchor) + const Point& anchor, int rowBorderType, int columnBorderType) { + if (columnBorderType < 0) + columnBorderType = rowBorderType; + int sdepth = CV_MAT_DEPTH(srcType), ddepth = CV_MAT_DEPTH(dstType); int cn = CV_MAT_CN(srcType); - int bdepth = std::max(sdepth, ddepth); + int bdepth = CV_32F; int bufType = CV_MAKETYPE(bdepth, cn); - Ptr rowFilter = getLinearRowFilter_GPU(srcType, bufType, rowKernel, anchor.x); - Ptr columnFilter = getLinearColumnFilter_GPU(bufType, dstType, columnKernel, anchor.y); + Ptr rowFilter = getLinearRowFilter_GPU(srcType, bufType, rowKernel, anchor.x, rowBorderType); + Ptr columnFilter = getLinearColumnFilter_GPU(bufType, dstType, columnKernel, anchor.y, columnBorderType); - return createSeparableFilter_GPU(rowFilter, columnFilter); + return createSeparableFilter_GPU(rowFilter, columnFilter, srcType, bufType, dstType); } -void cv::gpu::sepFilter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernelX, const Mat& kernelY, Point anchor) +void cv::gpu::sepFilter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernelX, const Mat& kernelY, Point anchor, int rowBorderType, int columnBorderType) { if( ddepth < 0 ) ddepth = src.depth(); dst.create(src.size(), CV_MAKETYPE(ddepth, src.channels())); - Ptr f = createSeparableLinearFilter_GPU(src.type(), dst.type(), kernelX, kernelY, anchor); - f->apply(src, dst); + Ptr f = createSeparableLinearFilter_GPU(src.type(), dst.type(), kernelX, kernelY, anchor, rowBorderType, columnBorderType); + f->apply(src, dst, Rect(0, 0, src.cols, src.rows)); } //////////////////////////////////////////////////////////////////////////////////////////////////// // Deriv Filter -Ptr cv::gpu::createDerivFilter_GPU(int srcType, int dstType, int dx, int dy, int ksize) +Ptr cv::gpu::createDerivFilter_GPU(int srcType, int dstType, int dx, int dy, int ksize, int rowBorderType, int columnBorderType) { Mat kx, ky; getDerivKernels(kx, ky, dx, dy, ksize, false, CV_32F); - return createSeparableLinearFilter_GPU(srcType, dstType, kx, ky, Point(-1,-1)); + return createSeparableLinearFilter_GPU(srcType, dstType, kx, ky, Point(-1,-1), rowBorderType, columnBorderType); } -void cv::gpu::Sobel(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, int ksize, double scale) +void cv::gpu::Sobel(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, int ksize, double scale, int rowBorderType, int columnBorderType) { Mat kx, ky; getDerivKernels(kx, ky, dx, dy, ksize, false, CV_32F); @@ -803,10 +857,10 @@ void cv::gpu::Sobel(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, ky *= scale; } - sepFilter2D(src, dst, ddepth, kx, ky, Point(-1,-1)); + sepFilter2D(src, dst, ddepth, kx, ky, Point(-1,-1), rowBorderType, columnBorderType); } -void cv::gpu::Scharr(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, double scale) +void cv::gpu::Scharr(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, double scale, int rowBorderType, int columnBorderType) { Mat kx, ky; getDerivKernels(kx, ky, dx, dy, -1, false, CV_32F); @@ -821,7 +875,7 @@ void cv::gpu::Scharr(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, ky *= scale; } - sepFilter2D(src, dst, ddepth, kx, ky, Point(-1,-1)); + sepFilter2D(src, dst, ddepth, kx, ky, Point(-1,-1), rowBorderType, columnBorderType); } void cv::gpu::Laplacian(const GpuMat& src, GpuMat& dst, int ddepth, int ksize, double scale) @@ -843,7 +897,7 @@ void cv::gpu::Laplacian(const GpuMat& src, GpuMat& dst, int ddepth, int ksize, d //////////////////////////////////////////////////////////////////////////////////////////////////// // Gaussian Filter -Ptr cv::gpu::createGaussianFilter_GPU(int type, Size ksize, double sigma1, double sigma2) +Ptr cv::gpu::createGaussianFilter_GPU(int type, Size ksize, double sigma1, double sigma2, int rowBorderType, int columnBorderType) { int depth = CV_MAT_DEPTH(type); @@ -868,10 +922,10 @@ Ptr cv::gpu::createGaussianFilter_GPU(int type, Size ksize, do else ky = getGaussianKernel( ksize.height, sigma2, std::max(depth, CV_32F) ); - return createSeparableLinearFilter_GPU(type, type, kx, ky); + return createSeparableLinearFilter_GPU(type, type, kx, ky, Point(-1,-1), rowBorderType, columnBorderType); } -void cv::gpu::GaussianBlur(const GpuMat& src, GpuMat& dst, Size ksize, double sigma1, double sigma2) +void cv::gpu::GaussianBlur(const GpuMat& src, GpuMat& dst, Size ksize, double sigma1, double sigma2, int rowBorderType, int columnBorderType) { if (ksize.width == 1 && ksize.height == 1) { @@ -881,8 +935,8 @@ void cv::gpu::GaussianBlur(const GpuMat& src, GpuMat& dst, Size ksize, double si dst.create(src.size(), src.type()); - Ptr f = createGaussianFilter_GPU(src.type(), ksize, sigma1, sigma2); - f->apply(src, dst); + Ptr f = createGaussianFilter_GPU(src.type(), ksize, sigma1, sigma2, rowBorderType, columnBorderType); + f->apply(src, dst, Rect(0, 0, src.cols, src.rows)); } //////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/modules/gpu/src/imgproc_gpu.cpp b/modules/gpu/src/imgproc_gpu.cpp index 361f11b..337af7a 100644 --- a/modules/gpu/src/imgproc_gpu.cpp +++ b/modules/gpu/src/imgproc_gpu.cpp @@ -972,6 +972,12 @@ bool cv::gpu::tryConvertToGpuBorderType(int cpuBorderType, int& gpuBorderType) gpuBorderType = cv::gpu::BORDER_REPLICATE_GPU; return true; } + + if (cpuBorderType == cv::BORDER_CONSTANT) + { + gpuBorderType = cv::gpu::BORDER_CONSTANT_GPU; + return true; + } return false; } diff --git a/modules/gpu/src/opencv2/gpu/device/vecmath.hpp b/modules/gpu/src/opencv2/gpu/device/vecmath.hpp index d73853c..19e1e88 100644 --- a/modules/gpu/src/opencv2/gpu/device/vecmath.hpp +++ b/modules/gpu/src/opencv2/gpu/device/vecmath.hpp @@ -123,278 +123,295 @@ namespace cv { typedef uchar elem_t; enum {cn=1}; - static __device__ uchar all(uchar v) {return v;} + static __device__ __host__ uchar all(uchar v) {return v;} + static __device__ __host__ uchar make(uchar x) {return x;} }; template<> struct VecTraits { typedef uchar elem_t; enum {cn=1}; - static __device__ uchar1 all(uchar v) {return make_uchar1(v);} + static __device__ __host__ uchar1 all(uchar v) {return make_uchar1(v);} + static __device__ __host__ uchar1 make(uchar x) {return make_uchar1(x);} }; template<> struct VecTraits { typedef uchar elem_t; enum {cn=2}; - static __device__ uchar2 all(uchar v) {return make_uchar2(v, v);} + static __device__ __host__ uchar2 all(uchar v) {return make_uchar2(v, v);} + static __device__ __host__ uchar2 make(uchar x, uchar y) {return make_uchar2(x, y);} }; template<> struct VecTraits { typedef uchar elem_t; enum {cn=3}; - static __device__ uchar3 all(uchar v) {return make_uchar3(v, v, v);} + static __device__ __host__ uchar3 all(uchar v) {return make_uchar3(v, v, v);} + static __device__ __host__ uchar3 make(uchar x, uchar y, uchar z) {return make_uchar3(x, y, z);} }; template<> struct VecTraits { typedef uchar elem_t; enum {cn=4}; - static __device__ uchar4 all(uchar v) {return make_uchar4(v, v, v, v);} + static __device__ __host__ uchar4 all(uchar v) {return make_uchar4(v, v, v, v);} + static __device__ __host__ uchar4 make(uchar x, uchar y, uchar z, uchar w) {return make_uchar4(x, y, z, w);} }; template<> struct VecTraits { typedef char elem_t; enum {cn=1}; - static __device__ char all(char v) {return v;} + static __device__ __host__ char all(char v) {return v;} + static __device__ __host__ char make(char x) {return x;} }; template<> struct VecTraits { typedef char elem_t; enum {cn=1}; - static __device__ char1 all(char v) {return make_char1(v);} + static __device__ __host__ char1 all(char v) {return make_char1(v);} + static __device__ __host__ char1 make(char x) {return make_char1(x);} }; template<> struct VecTraits { typedef char elem_t; enum {cn=2}; - static __device__ char2 all(char v) {return make_char2(v, v);} + static __device__ __host__ char2 all(char v) {return make_char2(v, v);} + static __device__ __host__ char2 make(char x, char y) {return make_char2(x, y);} }; template<> struct VecTraits { typedef char elem_t; enum {cn=3}; - static __device__ char3 all(char v) {return make_char3(v, v, v);} + static __device__ __host__ char3 all(char v) {return make_char3(v, v, v);} + static __device__ __host__ char3 make(char x, char y, char z) {return make_char3(x, y, z);} }; template<> struct VecTraits { typedef char elem_t; enum {cn=4}; - static __device__ char4 all(char v) {return make_char4(v, v, v, v);} + static __device__ __host__ char4 all(char v) {return make_char4(v, v, v, v);} + static __device__ __host__ char4 make(char x, char y, char z, char w) {return make_char4(x, y, z, w);} }; template<> struct VecTraits { typedef ushort elem_t; enum {cn=1}; - static __device__ ushort all(ushort v) {return v;} + static __device__ __host__ ushort all(ushort v) {return v;} + static __device__ __host__ ushort make(ushort x) {return x;} }; template<> struct VecTraits { typedef ushort elem_t; enum {cn=1}; - static __device__ ushort1 all(ushort v) {return make_ushort1(v);} + static __device__ __host__ ushort1 all(ushort v) {return make_ushort1(v);} + static __device__ __host__ ushort1 make(ushort x) {return make_ushort1(x);} }; template<> struct VecTraits { typedef ushort elem_t; enum {cn=2}; - static __device__ ushort2 all(ushort v) {return make_ushort2(v, v);} + static __device__ __host__ ushort2 all(ushort v) {return make_ushort2(v, v);} + static __device__ __host__ ushort2 make(ushort x, ushort y) {return make_ushort2(x, y);} }; template<> struct VecTraits { typedef ushort elem_t; enum {cn=3}; - static __device__ ushort3 all(ushort v) {return make_ushort3(v, v, v);} + static __device__ __host__ ushort3 all(ushort v) {return make_ushort3(v, v, v);} + static __device__ __host__ ushort3 make(ushort x, ushort y, ushort z) {return make_ushort3(x, y, z);} }; template<> struct VecTraits { typedef ushort elem_t; enum {cn=4}; - static __device__ ushort4 all(ushort v) {return make_ushort4(v, v, v, v);} + static __device__ __host__ ushort4 all(ushort v) {return make_ushort4(v, v, v, v);} + static __device__ __host__ ushort4 make(ushort x, ushort y, ushort z, ushort w) {return make_ushort4(x, y, z, w);} }; template<> struct VecTraits { typedef short elem_t; enum {cn=1}; - static __device__ short all(short v) {return v;} + static __device__ __host__ short all(short v) {return v;} + static __device__ __host__ short make(short x) {return x;} }; template<> struct VecTraits { typedef short elem_t; enum {cn=1}; - static __device__ short1 all(short v) {return make_short1(v);} + static __device__ __host__ short1 all(short v) {return make_short1(v);} + static __device__ __host__ short1 make(short x) {return make_short1(x);} }; template<> struct VecTraits { typedef short elem_t; enum {cn=2}; - static __device__ short2 all(short v) {return make_short2(v, v);} + static __device__ __host__ short2 all(short v) {return make_short2(v, v);} + static __device__ __host__ short2 make(short x, short y) {return make_short2(x, y);} }; template<> struct VecTraits { typedef short elem_t; enum {cn=3}; - static __device__ short3 all(short v) {return make_short3(v, v, v);} + static __device__ __host__ short3 all(short v) {return make_short3(v, v, v);} + static __device__ __host__ short3 make(short x, short y, short z) {return make_short3(x, y, z);} }; template<> struct VecTraits { typedef short elem_t; enum {cn=4}; - static __device__ short4 all(short v) {return make_short4(v, v, v, v);} + static __device__ __host__ short4 all(short v) {return make_short4(v, v, v, v);} + static __device__ __host__ short4 make(short x, short y, short z, short w) {return make_short4(x, y, z, w);} }; template<> struct VecTraits { typedef uint elem_t; enum {cn=1}; - static __device__ uint all(uint v) {return v;} + static __device__ __host__ uint all(uint v) {return v;} + static __device__ __host__ uint make(uint x) {return x;} }; template<> struct VecTraits { typedef uint elem_t; enum {cn=1}; - static __device__ uint1 all(uint v) {return make_uint1(v);} + static __device__ __host__ uint1 all(uint v) {return make_uint1(v);} + static __device__ __host__ uint1 make(uint x) {return make_uint1(x);} }; template<> struct VecTraits { typedef uint elem_t; enum {cn=2}; - static __device__ uint2 all(uint v) {return make_uint2(v, v);} + static __device__ __host__ uint2 all(uint v) {return make_uint2(v, v);} + static __device__ __host__ uint2 make(uint x, uint y) {return make_uint2(x, y);} }; template<> struct VecTraits { typedef uint elem_t; enum {cn=3}; - static __device__ uint3 all(uint v) {return make_uint3(v, v, v);} + static __device__ __host__ uint3 all(uint v) {return make_uint3(v, v, v);} + static __device__ __host__ uint3 make(uint x, uint y, uint z) {return make_uint3(x, y, z);} }; template<> struct VecTraits { typedef uint elem_t; enum {cn=4}; - static __device__ uint4 all(uint v) {return make_uint4(v, v, v, v);} + static __device__ __host__ uint4 all(uint v) {return make_uint4(v, v, v, v);} + static __device__ __host__ uint4 make(uint x, uint y, uint z, uint w) {return make_uint4(x, y, z, w);} }; template<> struct VecTraits { typedef int elem_t; enum {cn=1}; - static __device__ int all(int v) {return v;} + static __device__ __host__ int all(int v) {return v;} + static __device__ __host__ int make(int x) {return x;} }; template<> struct VecTraits { typedef int elem_t; enum {cn=1}; - static __device__ int1 all(int v) {return make_int1(v);} + static __device__ __host__ int1 all(int v) {return make_int1(v);} + static __device__ __host__ int1 make(int x) {return make_int1(x);} }; template<> struct VecTraits { typedef int elem_t; enum {cn=2}; - static __device__ int2 all(int v) {return make_int2(v, v);} + static __device__ __host__ int2 all(int v) {return make_int2(v, v);} + static __device__ __host__ int2 make(int x, int y) {return make_int2(x, y);} }; template<> struct VecTraits { typedef int elem_t; enum {cn=3}; - static __device__ int3 all(int v) {return make_int3(v, v, v);} + static __device__ __host__ int3 all(int v) {return make_int3(v, v, v);} + static __device__ __host__ int3 make(int x, int y, int z) {return make_int3(x, y, z);} }; template<> struct VecTraits { typedef int elem_t; enum {cn=4}; - static __device__ int4 all(int v) {return make_int4(v, v, v, v);} + static __device__ __host__ int4 all(int v) {return make_int4(v, v, v, v);} + static __device__ __host__ int4 make(int x, int y, int z, int w) {return make_int4(x, y, z, w);} }; template<> struct VecTraits { typedef float elem_t; enum {cn=1}; - static __device__ float all(float v) {return v;} + static __device__ __host__ float all(float v) {return v;} + static __device__ __host__ float make(float x) {return x;} }; template<> struct VecTraits { typedef float elem_t; enum {cn=1}; - static __device__ float1 all(float v) {return make_float1(v);} + static __device__ __host__ float1 all(float v) {return make_float1(v);} + static __device__ __host__ float1 make(float x) {return make_float1(x);} }; template<> struct VecTraits { typedef float elem_t; enum {cn=2}; - static __device__ float2 all(float v) {return make_float2(v, v);} + static __device__ __host__ float2 all(float v) {return make_float2(v, v);} + static __device__ __host__ float2 make(float x, float y) {return make_float2(x, y);} }; template<> struct VecTraits { typedef float elem_t; enum {cn=3}; - static __device__ float3 all(float v) {return make_float3(v, v, v);} + static __device__ __host__ float3 all(float v) {return make_float3(v, v, v);} + static __device__ __host__ float3 make(float x, float y, float z) {return make_float3(x, y, z);} }; template<> struct VecTraits { typedef float elem_t; enum {cn=4}; - static __device__ float4 all(float v) {return make_float4(v, v, v, v);} + static __device__ __host__ float4 all(float v) {return make_float4(v, v, v, v);} + static __device__ __host__ float4 make(float x, float y, float z, float w) {return make_float4(x, y, z, w);} }; template struct SatCast; template struct SatCast<1, VecD> { template - __device__ VecD operator()(const VecS& v) + static __device__ VecD cast(const VecS& v) { - VecD res; - res.x = saturate_cast< VecTraits::elem_t >(v.x); - return res; + typedef typename VecTraits::elem_t D; + return VecTraits::make(saturate_cast(v.x)); } }; template struct SatCast<2, VecD> { template - __device__ VecD operator()(const VecS& v) + static __device__ VecD cast(const VecS& v) { - VecD res; - res.x = saturate_cast< VecTraits::elem_t >(v.x); - res.y = saturate_cast< VecTraits::elem_t >(v.y); - return res; + typedef typename VecTraits::elem_t D; + return VecTraits::make(saturate_cast(v.x), saturate_cast(v.y)); } }; template struct SatCast<3, VecD> { template - __device__ VecD operator()(const VecS& v) + static __device__ VecD cast(const VecS& v) { - VecD res; - res.x = saturate_cast< VecTraits::elem_t >(v.x); - res.y = saturate_cast< VecTraits::elem_t >(v.y); - res.y = saturate_cast< VecTraits::elem_t >(v.z); - return res; + typedef typename VecTraits::elem_t D; + return VecTraits::make(saturate_cast(v.x), saturate_cast(v.y), saturate_cast(v.z)); } }; template struct SatCast<4, VecD> { template - __device__ VecD operator()(const VecS& v) + static __device__ VecD cast(const VecS& v) { - VecD res; - res.x = saturate_cast< VecTraits::elem_t >(v.x); - res.y = saturate_cast< VecTraits::elem_t >(v.y); - res.y = saturate_cast< VecTraits::elem_t >(v.z); - res.w = saturate_cast< VecTraits::elem_t >(v.w); - return res; + typedef typename VecTraits::elem_t D; + return VecTraits::make(saturate_cast(v.x), saturate_cast(v.y), saturate_cast(v.z), saturate_cast(v.w)); } }; template static __device__ VecD saturate_cast_caller(const VecS& v) { - SatCast< - - VecTraits::cn, - - VecD - > - - cast; - return cast(v); + return SatCast::cn, VecD>::cast(v); } template static __device__ _Tp saturate_cast(const uchar1& v) {return saturate_cast_caller<_Tp>(v);} diff --git a/tests/gpu/src/brute_force_matcher.cpp b/tests/gpu/src/brute_force_matcher.cpp index e38032b..c7c08e9 100644 --- a/tests/gpu/src/brute_force_matcher.cpp +++ b/tests/gpu/src/brute_force_matcher.cpp @@ -107,7 +107,7 @@ protected: if (!compareMatches(matchesCPU, matchesGPU)) { - ts->printf(CvTS::LOG, "Match FAIL"); + ts->printf(CvTS::LOG, "Match FAIL\n"); ts->set_failed_test_info(CvTS::FAIL_MISMATCH); return; } @@ -119,7 +119,7 @@ protected: if (!compareMatches(knnMatchesCPU, knnMatchesGPU)) { - ts->printf(CvTS::LOG, "KNN Match FAIL"); + ts->printf(CvTS::LOG, "KNN Match FAIL\n"); ts->set_failed_test_info(CvTS::FAIL_MISMATCH); return; } @@ -131,7 +131,7 @@ protected: if (!compareMatches(radiusMatchesCPU, radiusMatchesGPU)) { - ts->printf(CvTS::LOG, "Radius Match FAIL"); + ts->printf(CvTS::LOG, "Radius Match FAIL\n"); ts->set_failed_test_info(CvTS::FAIL_MISMATCH); return; } diff --git a/tests/gpu/src/filters.cpp b/tests/gpu/src/filters.cpp index 47a6f50..67b9603 100644 --- a/tests/gpu/src/filters.cpp +++ b/tests/gpu/src/filters.cpp @@ -80,7 +80,8 @@ protected: double res = norm(m1ROI, m2ROI, NORM_INF); - if (res <= 1) + // Max difference (2.0) in GaussianBlur + if (res <= 2) return CvTS::OK; ts->printf(CvTS::LOG, "Norm: %f\n", res); @@ -166,8 +167,6 @@ struct CV_GpuNppImageSobelTest : public CV_GpuNppFilterTest int test(const Mat& img) { - if (img.type() != CV_8UC1) - return CvTS::OK; int ksizes[] = {3, 5, 7}; int ksizes_num = sizeof(ksizes) / sizeof(int); @@ -183,10 +182,8 @@ struct CV_GpuNppImageSobelTest : public CV_GpuNppFilterTest cv::Sobel(img, cpudst, -1, dx, dy, ksizes[i]); GpuMat gpu1(img); - gpu1.convertTo(gpu1, CV_32S); GpuMat gpudst; cv::gpu::Sobel(gpu1, gpudst, -1, dx, dy, ksizes[i]); - gpudst.convertTo(gpudst, CV_8U); if (CheckNorm(cpudst, gpudst, Size(ksizes[i], ksizes[i])) != CvTS::OK) test_res = CvTS::FAIL_GENERIC; @@ -204,20 +201,15 @@ struct CV_GpuNppImageScharrTest : public CV_GpuNppFilterTest int test(const Mat& img) { - if (img.type() != CV_8UC1) - return CvTS::OK; - int dx = 1, dy = 0; Mat cpudst; cv::Scharr(img, cpudst, -1, dx, dy); GpuMat gpu1(img); - gpu1.convertTo(gpu1, CV_32S); GpuMat gpudst; cv::gpu::Scharr(gpu1, gpudst, -1, dx, dy); - gpudst.convertTo(gpudst, CV_8U); - + return CheckNorm(cpudst, gpudst, Size(3, 3)); } }; @@ -244,7 +236,7 @@ struct CV_GpuNppImageGaussianBlurTest : public CV_GpuNppFilterTest { cv::Size ksize(ksizes[i], ksizes[j]); - ts->printf(CvTS::LOG, "ksize = (%dx%d)\t", ksizes[i], ksizes[j]); + ts->printf(CvTS::LOG, "ksize = (%dx%d)\t\n", ksizes[i], ksizes[j]); Mat cpudst; cv::GaussianBlur(img, cpudst, ksize, sigma1); -- 2.7.4