fixed bug in gpu filter engine (incorrect buffer type) and in vector's saturate_cast.
authorVladislav Vinogradov <no@email>
Mon, 13 Dec 2010 08:43:04 +0000 (08:43 +0000)
committerVladislav Vinogradov <no@email>
Mon, 13 Dec 2010 08:43:04 +0000 (08:43 +0000)
changed buffer type in linear filters to float.
added support of 1 channel image to linear filters.
added support of BORDER_REFLECT101, BORDER_REPLICATE and BORDER_CONSTANT border type to gpu linear filters.
minor fix in tests.
update comments in gpu.hpp.

modules/gpu/include/opencv2/gpu/gpu.hpp
modules/gpu/src/arithm.cpp
modules/gpu/src/cuda/filters.cu
modules/gpu/src/cuda/internal_shared.hpp
modules/gpu/src/filtering.cpp
modules/gpu/src/imgproc_gpu.cpp
modules/gpu/src/opencv2/gpu/device/vecmath.hpp
tests/gpu/src/brute_force_matcher.cpp
tests/gpu/src/filters.cpp

index 87c7c68..dafa5e1 100644 (file)
@@ -388,7 +388,7 @@ namespace cv
         CV_EXPORTS void divide(const GpuMat& a, const Scalar& sc, GpuMat& c);\r
 \r
         //! transposes the matrix\r
-        //! supports only CV_8UC1 type\r
+        //! supports CV_8UC1, CV_8SC1, CV_8UC4, CV_8SC4, CV_16UC2, CV_16SC2, CV_32SC1, CV_32FC1 type\r
         CV_EXPORTS void transpose(const GpuMat& src1, GpuMat& dst);\r
 \r
         //! computes element-wise absolute difference of two arrays (c = abs(a - b))\r
@@ -725,11 +725,11 @@ namespace cv
         };\r
 \r
         //! returns the non-separable filter engine with the specified filter\r
-        CV_EXPORTS Ptr<FilterEngine_GPU> createFilter2D_GPU(const Ptr<BaseFilter_GPU> filter2D);\r
+        CV_EXPORTS Ptr<FilterEngine_GPU> createFilter2D_GPU(const Ptr<BaseFilter_GPU> filter2D, int srcType, int dstType);\r
 \r
         //! returns the separable filter engine with the specified filters\r
         CV_EXPORTS Ptr<FilterEngine_GPU> createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU>& rowFilter, \r
-            const Ptr<BaseColumnFilter_GPU>& columnFilter);\r
+            const Ptr<BaseColumnFilter_GPU>& columnFilter, int srcType, int bufType, int dstType);\r
 \r
         //! returns horizontal 1D box filter\r
         //! supports only CV_8UC1 source type and CV_32FC1 sum type\r
@@ -767,23 +767,40 @@ namespace cv
         CV_EXPORTS Ptr<FilterEngine_GPU> createLinearFilter_GPU(int srcType, int dstType, const Mat& kernel, \r
             const Point& anchor = Point(-1,-1));\r
 \r
-        //! returns the primitive row filter with the specified kernel\r
+        //! returns the primitive row filter with the specified kernel.\r
+        //! supports only CV_8UC1, CV_8UC4, CV_16SC1, CV_16SC2, CV_32SC1, CV_32FC1 source type.\r
+        //! there are two versions of the algorithm: NPP and OpenCV.\r
+        //! the NPP version is called when srcType == CV_8UC1 or srcType == CV_8UC4 and bufType == srcType,\r
+        //! otherwise the OpenCV version is called.\r
+        //! NPP supports only BORDER_CONSTANT border type.\r
+        //! OpenCV version supports only CV_32F as buffer depth and \r
+        //! BORDER_REFLECT101, BORDER_REPLICATE and BORDER_CONSTANT border types.\r
         CV_EXPORTS Ptr<BaseRowFilter_GPU> getLinearRowFilter_GPU(int srcType, int bufType, const Mat& rowKernel, \r
-            int anchor = -1);\r
-\r
-        //! returns the primitive column filter with the specified kernel\r
+            int anchor = -1, int borderType = BORDER_CONSTANT);\r
+\r
+        //! returns the primitive column filter with the specified kernel.\r
+        //! supports only CV_8UC1, CV_8UC4, CV_16SC1, CV_16SC2, CV_32SC1, CV_32FC1 dst type.\r
+        //! there are two versions of the algorithm: NPP and OpenCV.\r
+        //! the NPP version is called when dstType == CV_8UC1 or dstType == CV_8UC4 and bufType == dstType,\r
+        //! otherwise the OpenCV version is called.\r
+        //! NPP supports only BORDER_CONSTANT border type.\r
+        //! OpenCV version supports only CV_32F as buffer depth and \r
+        //! BORDER_REFLECT101, BORDER_REPLICATE and BORDER_CONSTANT border types.\r
         CV_EXPORTS Ptr<BaseColumnFilter_GPU> getLinearColumnFilter_GPU(int bufType, int dstType, const Mat& columnKernel, \r
-            int anchor = -1);\r
+            int anchor = -1, int borderType = BORDER_CONSTANT);\r
 \r
         //! returns the separable linear filter engine\r
         CV_EXPORTS Ptr<FilterEngine_GPU> createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat& rowKernel, \r
-            const Mat& columnKernel, const Point& anchor = Point(-1,-1));\r
+            const Mat& columnKernel, const Point& anchor = Point(-1,-1), int rowBorderType = BORDER_DEFAULT,\r
+            int columnBorderType = -1);\r
 \r
         //! returns filter engine for the generalized Sobel operator\r
-        CV_EXPORTS Ptr<FilterEngine_GPU> createDerivFilter_GPU(int srcType, int dstType, int dx, int dy, int ksize);\r
+        CV_EXPORTS Ptr<FilterEngine_GPU> createDerivFilter_GPU(int srcType, int dstType, int dx, int dy, int ksize, \r
+            int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);\r
 \r
         //! returns the Gaussian filter engine\r
-        CV_EXPORTS Ptr<FilterEngine_GPU> createGaussianFilter_GPU(int type, Size ksize, double sigma1, double sigma2 = 0);\r
+        CV_EXPORTS Ptr<FilterEngine_GPU> createGaussianFilter_GPU(int type, Size ksize, double sigma1, double sigma2 = 0, \r
+            int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);\r
 \r
         //! returns maximum filter\r
         CV_EXPORTS Ptr<BaseFilter_GPU> getMaxFilter_GPU(int srcType, int dstType, const Size& ksize, Point anchor = Point(-1,-1));\r
@@ -812,16 +829,19 @@ namespace cv
 \r
         //! applies separable 2D linear filter to the image\r
         CV_EXPORTS void sepFilter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernelX, const Mat& kernelY, \r
-            Point anchor = Point(-1,-1));\r
+            Point anchor = Point(-1,-1), int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);\r
 \r
         //! applies generalized Sobel operator to the image\r
-        CV_EXPORTS void Sobel(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, int ksize = 3, double scale = 1);\r
+        CV_EXPORTS void Sobel(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, int ksize = 3, double scale = 1, \r
+            int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);\r
 \r
         //! applies the vertical or horizontal Scharr operator to the image\r
-        CV_EXPORTS void Scharr(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, double scale = 1);\r
+        CV_EXPORTS void Scharr(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, double scale = 1, \r
+            int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);\r
 \r
         //! smooths the image using Gaussian filter.\r
-        CV_EXPORTS void GaussianBlur(const GpuMat& src, GpuMat& dst, Size ksize, double sigma1, double sigma2 = 0);\r
+        CV_EXPORTS void GaussianBlur(const GpuMat& src, GpuMat& dst, Size ksize, double sigma1, double sigma2 = 0, \r
+            int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);\r
 \r
         //! applies Laplacian operator to the image\r
         //! supports only ksize = 1 and ksize = 3\r
index 90edea1..3dcae2c 100644 (file)
@@ -277,12 +277,12 @@ namespace cv { namespace gpu { namespace mathfunc
 \r
 void cv::gpu::transpose(const GpuMat& src, GpuMat& dst)\r
 {\r
-    CV_Assert(src.type() == CV_8UC1 || src.type() == CV_8UC4 || src.type() == CV_8SC4 \r
+    CV_Assert(src.type() == CV_8UC1 || src.type() == CV_8SC1 || src.type() == CV_8UC4 || src.type() == CV_8SC4 \r
         || src.type() == CV_16UC2 || src.type() == CV_16SC2 || src.type() == CV_32SC1 || src.type() == CV_32FC1);\r
 \r
     dst.create( src.cols, src.rows, src.type() );\r
 \r
-    if (src.type() == CV_8UC1)\r
+    if (src.type() == CV_8UC1 || src.type() == CV_8SC1)\r
     {\r
         NppiSize sz;\r
         sz.width  = src.cols;\r
index d725d0d..035bab5 100644 (file)
@@ -43,6 +43,7 @@
 #include "opencv2/gpu/devmem2d.hpp"\r
 #include "opencv2/gpu/device/saturate_cast.hpp"\r
 #include "opencv2/gpu/device/vecmath.hpp"\r
+#include "opencv2/gpu/device/limits_gpu.hpp"\r
 \r
 #include "safe_call.hpp"\r
 #include "internal_shared.hpp"\r
 using namespace cv::gpu;\r
 using namespace cv::gpu::device;\r
 \r
-#ifndef FLT_MAX\r
-#define FLT_MAX 3.402823466e+30F\r
-#endif\r
+namespace cv \r
+{ \r
+    namespace gpu \r
+    {\r
+        namespace device\r
+        {\r
+            struct BrdReflect101 \r
+            {\r
+                explicit BrdReflect101(int len): last(len - 1) {}\r
+\r
+                __device__ int idx_low(int i) const\r
+                {\r
+                    return abs(i);\r
+                }\r
+\r
+                __device__ int idx_high(int i) const \r
+                {\r
+                    return last - abs(last - i);\r
+                }\r
+\r
+                __device__ int idx(int i) const\r
+                {\r
+                    return abs(idx_high(i));\r
+                }\r
+\r
+                bool is_range_safe(int mini, int maxi) const \r
+                {\r
+                    return -last <= mini && maxi <= 2 * last;\r
+                }\r
+\r
+                int last;\r
+            };\r
+            template <typename D>\r
+            struct BrdRowReflect101: BrdReflect101\r
+            {\r
+                explicit BrdRowReflect101(int len): BrdReflect101(len) {}\r
+\r
+                template <typename T>\r
+                __device__ D at_low(int i, const T* data) const \r
+                {\r
+                    return saturate_cast<D>(data[idx_low(i)]);\r
+                }\r
+\r
+                template <typename T>\r
+                __device__ D at_high(int i, const T* data) const \r
+                {\r
+                    return saturate_cast<D>(data[idx_high(i)]);\r
+                }\r
+            };\r
+            template <typename D>\r
+            struct BrdColReflect101: BrdReflect101\r
+            {\r
+                BrdColReflect101(int len, int step): BrdReflect101(len), step(step) {}\r
+\r
+                template <typename T>\r
+                __device__ D at_low(int i, const T* data) const \r
+                {\r
+                    return saturate_cast<D>(data[idx_low(i) * step]);\r
+                }\r
+\r
+                template <typename T>\r
+                __device__ D at_high(int i, const T* data) const \r
+                {\r
+                    return saturate_cast<D>(data[idx_high(i) * step]);\r
+                }\r
+\r
+                int step;\r
+            };\r
+\r
+            struct BrdReplicate\r
+            {\r
+                explicit BrdReplicate(int len): last(len - 1) {}\r
+\r
+                __device__ int idx_low(int i) const\r
+                {\r
+                    return max(i, 0);\r
+                }\r
+\r
+                __device__ int idx_high(int i) const \r
+                {\r
+                    return min(i, last);\r
+                }\r
+\r
+                __device__ int idx(int i) const\r
+                {\r
+                    return max(min(i, last), 0);\r
+                }\r
+\r
+                bool is_range_safe(int mini, int maxi) const \r
+                {\r
+                    return true;\r
+                }\r
+\r
+                int last;\r
+            };\r
+            template <typename D>\r
+            struct BrdRowReplicate: BrdReplicate\r
+            {\r
+                explicit BrdRowReplicate(int len): BrdReplicate(len) {}\r
+\r
+                template <typename T>\r
+                __device__ D at_low(int i, const T* data) const \r
+                {\r
+                    return saturate_cast<D>(data[idx_low(i)]);\r
+                }\r
+\r
+                template <typename T>\r
+                __device__ D at_high(int i, const T* data) const \r
+                {\r
+                    return saturate_cast<D>(data[idx_high(i)]);\r
+                }\r
+            };\r
+            template <typename D>\r
+            struct BrdColReplicate: BrdReplicate\r
+            {\r
+                BrdColReplicate(int len, int step): BrdReplicate(len), step(step) {}\r
+\r
+                template <typename T>\r
+                __device__ D at_low(int i, const T* data) const \r
+                {\r
+                    return saturate_cast<D>(data[idx_low(i) * step]);\r
+                }\r
+\r
+                template <typename T>\r
+                __device__ D at_high(int i, const T* data) const \r
+                {\r
+                    return saturate_cast<D>(data[idx_high(i) * step]);\r
+                }\r
+                int step;\r
+            };\r
+\r
+            template <typename D>\r
+            struct BrdRowConstant\r
+            {\r
+                explicit BrdRowConstant(int len_, const D& val_ = VecTraits<D>::all(0)): len(len_), val(val_) {}\r
+\r
+                template <typename T>\r
+                __device__ D at_low(int i, const T* data) const \r
+                {\r
+                    return i >= 0 ? saturate_cast<D>(data[i]) : val;\r
+                }\r
+\r
+                template <typename T>\r
+                __device__ D at_high(int i, const T* data) const \r
+                {\r
+                    return i < len ? saturate_cast<D>(data[i]) : val;\r
+                }\r
+\r
+                bool is_range_safe(int mini, int maxi) const \r
+                {\r
+                    return true;\r
+                }\r
+\r
+                int len;\r
+                D val;\r
+            };\r
+            template <typename D>\r
+            struct BrdColConstant\r
+            {\r
+                BrdColConstant(int len_, int step_, const D& val_ = VecTraits<D>::all(0)): len(len_), step(step_), val(val_) {}\r
+\r
+                template <typename T>\r
+                __device__ D at_low(int i, const T* data) const \r
+                {\r
+                    return i >= 0 ? saturate_cast<D>(data[i * step]) : val;\r
+                }\r
+\r
+                template <typename T>\r
+                __device__ D at_high(int i, const T* data) const \r
+                {\r
+                    return i < len ? saturate_cast<D>(data[i * step]) : val;\r
+                }\r
+\r
+                bool is_range_safe(int mini, int maxi) const \r
+                {\r
+                    return true;\r
+                }\r
+\r
+                int len;\r
+                int step;\r
+                D val;\r
+            };\r
+        }\r
+    }\r
+}\r
 \r
 /////////////////////////////////////////////////////////////////////////////////////////////////\r
 // Linear filters\r
 \r
 #define MAX_KERNEL_SIZE 16\r
+#define BLOCK_DIM_X 16\r
+#define BLOCK_DIM_Y 16\r
 \r
 namespace filter_krnls\r
 {\r
@@ -74,46 +259,53 @@ namespace cv { namespace gpu { namespace filters
 \r
 namespace filter_krnls\r
 {\r
-    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, int KERNEL_SIZE, typename T, typename D>\r
-    __global__ void linearRowFilter(const T* src, size_t src_step, D* dst, size_t dst_step, int anchor, int width, int height)\r
+    template <typename T, size_t size> struct SmemType_\r
     {\r
-        __shared__ T smem[BLOCK_DIM_Y * BLOCK_DIM_X * 3];\r
-        \r
-        const int blockStartX = blockDim.x * blockIdx.x;\r
-        const int blockStartY = blockDim.y * blockIdx.y;\r
+        typedef typename TypeVec<float, VecTraits<T>::cn>::vec_t smem_t;\r
+    };\r
+    template <typename T> struct SmemType_<T, 4>\r
+    {\r
+        typedef T smem_t;\r
+    };\r
+    template <typename T> struct SmemType\r
+    {\r
+        typedef typename SmemType_<T, sizeof(T)>::smem_t smem_t;\r
+    };\r
 \r
-               const int threadX = blockStartX + threadIdx.x;\r
-        const int prevThreadX = threadX - blockDim.x;\r
-        const int nextThreadX = threadX + blockDim.x;\r
+    template <int ksize, typename T, typename D, typename B>\r
+    __global__ void linearRowFilter(const DevMem2D_<T> src, PtrStep_<D> dst, int anchor, const B b)\r
+    {\r
+        typedef typename SmemType<T>::smem_t smem_t;\r
 \r
-               const int threadY = blockStartY + threadIdx.y;\r
+        __shared__ smem_t smem[BLOCK_DIM_Y * BLOCK_DIM_X * 3];\r
+        \r
+               const int x = BLOCK_DIM_X * blockIdx.x + threadIdx.x;\r
+               const int y = BLOCK_DIM_Y * blockIdx.y + threadIdx.y;\r
 \r
-        T* sDataRow = smem + threadIdx.y * blockDim.x * 3;\r
+        smem_t* sDataRow = smem + threadIdx.y * BLOCK_DIM_X * 3;\r
 \r
-        if (threadY < height)\r
+        if (y < src.rows)\r
         {\r
-            const T* rowSrc = src + threadY * src_step;\r
-\r
-            sDataRow[threadIdx.x + blockDim.x] = threadX < width ? rowSrc[threadX] : VecTraits<T>::all(0);\r
+            const T* rowSrc = src.ptr(y);\r
 \r
-            sDataRow[threadIdx.x] = prevThreadX >= 0 ? rowSrc[prevThreadX] : VecTraits<T>::all(0);\r
-\r
-            sDataRow[(blockDim.x << 1) + threadIdx.x] = nextThreadX < width ? rowSrc[nextThreadX] : VecTraits<T>::all(0);\r
+            sDataRow[threadIdx.x                  ] = b.at_low(x - BLOCK_DIM_X, rowSrc);\r
+            sDataRow[threadIdx.x + BLOCK_DIM_X    ] = b.at_high(x, rowSrc);\r
+            sDataRow[threadIdx.x + BLOCK_DIM_X * 2] = b.at_high(x + BLOCK_DIM_X, rowSrc);\r
 \r
             __syncthreads();\r
 \r
-            if (threadX < width)\r
+            if (x < src.cols)\r
             {\r
                 typedef typename TypeVec<float, VecTraits<T>::cn>::vec_t sum_t;\r
                 sum_t sum = VecTraits<sum_t>::all(0);\r
 \r
-                sDataRow += threadIdx.x + blockDim.x - anchor;\r
+                sDataRow += threadIdx.x + BLOCK_DIM_X - anchor;\r
 \r
                 #pragma unroll\r
-                for(int i = 0; i < KERNEL_SIZE; ++i)\r
+                for(int i = 0; i < ksize; ++i)\r
                     sum = sum + sDataRow[i] * cLinearKernel[i];\r
 \r
-                dst[threadY * dst_step + threadX] = saturate_cast<D>(sum);\r
+                dst.ptr(y)[x] = saturate_cast<D>(sum);\r
             }\r
         }\r
     }\r
@@ -121,100 +313,138 @@ namespace filter_krnls
 \r
 namespace cv { namespace gpu { namespace filters\r
 {\r
-    template <int KERNEL_SIZE, typename T, typename D>\r
+    template <int ksize, typename T, typename D, template<typename> class B>\r
     void linearRowFilter_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor)\r
     {\r
-        const int BLOCK_DIM_X = 16;\r
-        const int BLOCK_DIM_Y = 16;\r
-\r
         dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y);\r
-        dim3 blocks(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y));\r
+        dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y));\r
 \r
-        filter_krnls::linearRowFilter<BLOCK_DIM_X, BLOCK_DIM_Y, KERNEL_SIZE><<<blocks, threads>>>(src.data, src.step/src.elemSize(), \r
-            dst.data, dst.step/dst.elemSize(), anchor, src.cols, src.rows);\r
+        typedef typename filter_krnls::SmemType<T>::smem_t smem_t;\r
+        B<smem_t> b(src.cols);\r
+\r
+        if (!b.is_range_safe(-BLOCK_DIM_X, (grid.x + 1) * BLOCK_DIM_X - 1))\r
+        {\r
+            cv::gpu::error("linearRowFilter: can't use specified border extrapolation, image is too small, "\r
+                           "try bigger image or another border extrapolation mode", __FILE__, __LINE__);\r
+        }\r
+\r
+        filter_krnls::linearRowFilter<ksize, T, D><<<grid, threads>>>(src, dst, anchor, b);\r
 \r
         cudaSafeCall( cudaThreadSynchronize() );\r
     }\r
 \r
     template <typename T, typename D>\r
-    void linearRowFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor)\r
+    void linearRowFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type)\r
     {\r
         typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor);\r
-        static const caller_t callers[] = \r
-        {linearRowFilter_caller<0 , T, D>, linearRowFilter_caller<1 , T, D>, \r
-         linearRowFilter_caller<2 , T, D>, linearRowFilter_caller<3 , T, D>, \r
-         linearRowFilter_caller<4 , T, D>, linearRowFilter_caller<5 , T, D>, \r
-         linearRowFilter_caller<6 , T, D>, linearRowFilter_caller<7 , T, D>, \r
-         linearRowFilter_caller<8 , T, D>, linearRowFilter_caller<9 , T, D>, \r
-         linearRowFilter_caller<10, T, D>, linearRowFilter_caller<11, T, D>, \r
-         linearRowFilter_caller<12, T, D>, linearRowFilter_caller<13, T, D>, \r
-         linearRowFilter_caller<14, T, D>, linearRowFilter_caller<15, T, D>};\r
-\r
+        static const caller_t callers[3][17] = \r
+        {\r
+            {\r
+                0, \r
+                linearRowFilter_caller<1 , T, D, BrdRowReflect101>, \r
+                linearRowFilter_caller<2 , T, D, BrdRowReflect101>,\r
+                linearRowFilter_caller<3 , T, D, BrdRowReflect101>, \r
+                linearRowFilter_caller<4 , T, D, BrdRowReflect101>, \r
+                linearRowFilter_caller<5 , T, D, BrdRowReflect101>, \r
+                linearRowFilter_caller<6 , T, D, BrdRowReflect101>, \r
+                linearRowFilter_caller<7 , T, D, BrdRowReflect101>,\r
+                linearRowFilter_caller<8 , T, D, BrdRowReflect101>,\r
+                linearRowFilter_caller<9 , T, D, BrdRowReflect101>, \r
+                linearRowFilter_caller<10, T, D, BrdRowReflect101>, \r
+                linearRowFilter_caller<11, T, D, BrdRowReflect101>, \r
+                linearRowFilter_caller<12, T, D, BrdRowReflect101>, \r
+                linearRowFilter_caller<13, T, D, BrdRowReflect101>, \r
+                linearRowFilter_caller<14, T, D, BrdRowReflect101>,\r
+                linearRowFilter_caller<15, T, D, BrdRowReflect101>, \r
+                linearRowFilter_caller<16, T, D, BrdRowReflect101>,\r
+            },            \r
+            {\r
+                0, \r
+                linearRowFilter_caller<1 , T, D, BrdRowReplicate>, \r
+                linearRowFilter_caller<2 , T, D, BrdRowReplicate>,\r
+                linearRowFilter_caller<3 , T, D, BrdRowReplicate>, \r
+                linearRowFilter_caller<4 , T, D, BrdRowReplicate>, \r
+                linearRowFilter_caller<5 , T, D, BrdRowReplicate>, \r
+                linearRowFilter_caller<6 , T, D, BrdRowReplicate>, \r
+                linearRowFilter_caller<7 , T, D, BrdRowReplicate>, \r
+                linearRowFilter_caller<8 , T, D, BrdRowReplicate>,\r
+                linearRowFilter_caller<9 , T, D, BrdRowReplicate>, \r
+                linearRowFilter_caller<10, T, D, BrdRowReplicate>, \r
+                linearRowFilter_caller<11, T, D, BrdRowReplicate>, \r
+                linearRowFilter_caller<12, T, D, BrdRowReplicate>, \r
+                linearRowFilter_caller<13, T, D, BrdRowReplicate>, \r
+                linearRowFilter_caller<14, T, D, BrdRowReplicate>,\r
+                linearRowFilter_caller<15, T, D, BrdRowReplicate>, \r
+                linearRowFilter_caller<16, T, D, BrdRowReplicate>,\r
+            },            \r
+            {\r
+                0, \r
+                linearRowFilter_caller<1 , T, D, BrdRowConstant>, \r
+                linearRowFilter_caller<2 , T, D, BrdRowConstant>,\r
+                linearRowFilter_caller<3 , T, D, BrdRowConstant>, \r
+                linearRowFilter_caller<4 , T, D, BrdRowConstant>, \r
+                linearRowFilter_caller<5 , T, D, BrdRowConstant>, \r
+                linearRowFilter_caller<6 , T, D, BrdRowConstant>, \r
+                linearRowFilter_caller<7 , T, D, BrdRowConstant>, \r
+                linearRowFilter_caller<8 , T, D, BrdRowConstant>,\r
+                linearRowFilter_caller<9 , T, D, BrdRowConstant>,\r
+                linearRowFilter_caller<10, T, D, BrdRowConstant>, \r
+                linearRowFilter_caller<11, T, D, BrdRowConstant>, \r
+                linearRowFilter_caller<12, T, D, BrdRowConstant>, \r
+                linearRowFilter_caller<13, T, D, BrdRowConstant>,\r
+                linearRowFilter_caller<14, T, D, BrdRowConstant>,\r
+                linearRowFilter_caller<15, T, D, BrdRowConstant>, \r
+                linearRowFilter_caller<16, T, D, BrdRowConstant>,\r
+            }\r
+        };\r
+        \r
         loadLinearKernel(kernel, ksize);\r
 \r
-        callers[ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor);\r
+        callers[brd_type][ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor);\r
     }\r
 \r
-    template void linearRowFilter_gpu<uchar4, uchar4>(const DevMem2D&, const DevMem2D&, const float[], int , int);\r
-    template void linearRowFilter_gpu<uchar4, char4>(const DevMem2D&, const DevMem2D&, const float[], int , int);\r
-    template void linearRowFilter_gpu<char4, uchar4>(const DevMem2D&, const DevMem2D&, const float[], int , int);\r
-    template void linearRowFilter_gpu<char4, char4>(const DevMem2D&, const DevMem2D&, const float[], int , int);\r
-\r
-    template void linearRowFilter_gpu<ushort2, ushort2>(const DevMem2D&, const DevMem2D&, const float[], int , int);\r
-    template void linearRowFilter_gpu<ushort2, short2>(const DevMem2D&, const DevMem2D&, const float[], int , int);\r
-    template void linearRowFilter_gpu<short2, ushort2>(const DevMem2D&, const DevMem2D&, const float[], int , int);\r
-    template void linearRowFilter_gpu<short2, short2>(const DevMem2D&, const DevMem2D&, const float[], int , int);\r
-\r
-    template void linearRowFilter_gpu<int, int>(const DevMem2D&, const DevMem2D&, const float[], int , int);\r
-    template void linearRowFilter_gpu<int, float>(const DevMem2D&, const DevMem2D&, const float[], int , int);\r
-    template void linearRowFilter_gpu<float, int>(const DevMem2D&, const DevMem2D&, const float[], int , int);\r
-    template void linearRowFilter_gpu<float, float>(const DevMem2D&, const DevMem2D&, const float[], int , int);\r
+    template void linearRowFilter_gpu<uchar , float >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type);\r
+    template void linearRowFilter_gpu<uchar4, float4>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type);\r
+    template void linearRowFilter_gpu<short , float >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type);;\r
+    template void linearRowFilter_gpu<short2, float2>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type);\r
+    template void linearRowFilter_gpu<int   , float >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type);\r
+    template void linearRowFilter_gpu<float , float >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type);\r
 }}}\r
 \r
 namespace filter_krnls\r
 {\r
-    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, int KERNEL_SIZE, typename T, typename D>\r
-    __global__ void linearColumnFilter(const T* src, size_t src_step, D* dst, size_t dst_step, int anchor, int width, int height)\r
+    template <int ksize, typename T, typename D, typename B>\r
+    __global__ void linearColumnFilter(const DevMem2D_<T> src, PtrStep_<D> dst, int anchor, const B b)\r
     {\r
         __shared__ T smem[BLOCK_DIM_Y * BLOCK_DIM_X * 3];\r
         \r
-        const int blockStartX = blockDim.x * blockIdx.x;\r
-        const int blockStartY = blockDim.y * blockIdx.y;\r
-\r
-               const int threadX = blockStartX + threadIdx.x;\r
-\r
-               const int threadY = blockStartY + threadIdx.y;\r
-        const int prevThreadY = threadY - blockDim.y;\r
-        const int nextThreadY = threadY + blockDim.y;\r
-\r
-        const int smem_step = blockDim.x;\r
+               const int x = BLOCK_DIM_X * blockIdx.x + threadIdx.x;\r
+               const int y = BLOCK_DIM_Y * blockIdx.y + threadIdx.y;\r
 \r
         T* sDataColumn = smem + threadIdx.x;\r
 \r
-        if (threadX < width)\r
+        if (x < src.cols)\r
         {\r
-            const T* colSrc = src + threadX;\r
+            const T* srcCol = src.ptr() + x;\r
 \r
-            sDataColumn[(threadIdx.y + blockDim.y) * smem_step] = threadY < height ? colSrc[threadY * src_step] : VecTraits<T>::all(0);\r
-\r
-            sDataColumn[threadIdx.y * smem_step] = prevThreadY >= 0 ? colSrc[prevThreadY * src_step] : VecTraits<T>::all(0);\r
-\r
-            sDataColumn[(threadIdx.y + (blockDim.y << 1)) * smem_step] = nextThreadY < height ? colSrc[nextThreadY * src_step] : VecTraits<T>::all(0);\r
+            sDataColumn[ threadIdx.y                    * BLOCK_DIM_X] = b.at_low(y - BLOCK_DIM_Y, srcCol);\r
+            sDataColumn[(threadIdx.y + BLOCK_DIM_Y)     * BLOCK_DIM_X] = b.at_high(y, srcCol);\r
+            sDataColumn[(threadIdx.y + BLOCK_DIM_Y * 2) * BLOCK_DIM_X] = b.at_high(y + BLOCK_DIM_Y, srcCol);\r
 \r
             __syncthreads();\r
 \r
-            if (threadY < height)\r
+            if (y < src.rows)\r
             {\r
                 typedef typename TypeVec<float, VecTraits<T>::cn>::vec_t sum_t;\r
                 sum_t sum = VecTraits<sum_t>::all(0);\r
 \r
-                sDataColumn += (threadIdx.y + blockDim.y - anchor)* smem_step;\r
+                sDataColumn += (threadIdx.y + BLOCK_DIM_Y - anchor) * BLOCK_DIM_X;\r
 \r
                 #pragma unroll\r
-                for(int i = 0; i < KERNEL_SIZE; ++i)\r
-                    sum = sum + sDataColumn[i * smem_step] * cLinearKernel[i];\r
+                for(int i = 0; i < ksize; ++i)\r
+                    sum = sum + sDataColumn[i * BLOCK_DIM_X] * cLinearKernel[i];\r
 \r
-                dst[threadY * dst_step + threadX] = saturate_cast<D>(sum);\r
+                dst.ptr(y)[x] = saturate_cast<D>(sum);\r
             }\r
         }\r
     }\r
@@ -222,54 +452,101 @@ namespace filter_krnls
 \r
 namespace cv { namespace gpu { namespace filters\r
 {\r
-    template <int KERNEL_SIZE, typename T, typename D>\r
+    template <int ksize, typename T, typename D, template<typename> class B>\r
     void linearColumnFilter_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor)\r
     {\r
-        const int BLOCK_DIM_X = 16;\r
-        const int BLOCK_DIM_Y = 16;\r
-\r
         dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y);\r
-        dim3 blocks(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y));\r
+        dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y));\r
+\r
+        B<T> b(src.rows, src.step / src.elemSize());\r
 \r
-        filter_krnls::linearColumnFilter<BLOCK_DIM_X, BLOCK_DIM_Y, KERNEL_SIZE><<<blocks, threads>>>(src.data, src.step/src.elemSize(), \r
-            dst.data, dst.step/dst.elemSize(), anchor, src.cols, src.rows);\r
+        if (!b.is_range_safe(-BLOCK_DIM_Y, (grid.y + 1) * BLOCK_DIM_Y - 1))\r
+        {\r
+            cv::gpu::error("linearColumnFilter: can't use specified border extrapolation, image is too small, "\r
+                           "try bigger image or another border extrapolation mode", __FILE__, __LINE__);\r
+        }\r
+\r
+        filter_krnls::linearColumnFilter<ksize, T, D><<<grid, threads>>>(src, dst, anchor, b);\r
 \r
         cudaSafeCall( cudaThreadSynchronize() );\r
     }\r
 \r
     template <typename T, typename D>\r
-    void linearColumnFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor)\r
+    void linearColumnFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type)\r
     {\r
         typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor);\r
-        static const caller_t callers[] = \r
-        {linearColumnFilter_caller<0 , T, D>, linearColumnFilter_caller<1 , T, D>, \r
-         linearColumnFilter_caller<2 , T, D>, linearColumnFilter_caller<3 , T, D>, \r
-         linearColumnFilter_caller<4 , T, D>, linearColumnFilter_caller<5 , T, D>, \r
-         linearColumnFilter_caller<6 , T, D>, linearColumnFilter_caller<7 , T, D>, \r
-         linearColumnFilter_caller<8 , T, D>, linearColumnFilter_caller<9 , T, D>, \r
-         linearColumnFilter_caller<10, T, D>, linearColumnFilter_caller<11, T, D>, \r
-         linearColumnFilter_caller<12, T, D>, linearColumnFilter_caller<13, T, D>, \r
-         linearColumnFilter_caller<14, T, D>, linearColumnFilter_caller<15, T, D>};\r
-\r
+        static const caller_t callers[3][17] = \r
+        {\r
+            {\r
+                0, \r
+                linearColumnFilter_caller<1 , T, D, BrdColReflect101>, \r
+                linearColumnFilter_caller<2 , T, D, BrdColReflect101>,\r
+                linearColumnFilter_caller<3 , T, D, BrdColReflect101>, \r
+                linearColumnFilter_caller<4 , T, D, BrdColReflect101>, \r
+                linearColumnFilter_caller<5 , T, D, BrdColReflect101>, \r
+                linearColumnFilter_caller<6 , T, D, BrdColReflect101>, \r
+                linearColumnFilter_caller<7 , T, D, BrdColReflect101>, \r
+                linearColumnFilter_caller<8 , T, D, BrdColReflect101>, \r
+                linearColumnFilter_caller<9 , T, D, BrdColReflect101>, \r
+                linearColumnFilter_caller<10, T, D, BrdColReflect101>, \r
+                linearColumnFilter_caller<11, T, D, BrdColReflect101>, \r
+                linearColumnFilter_caller<12, T, D, BrdColReflect101>, \r
+                linearColumnFilter_caller<13, T, D, BrdColReflect101>, \r
+                linearColumnFilter_caller<14, T, D, BrdColReflect101>, \r
+                linearColumnFilter_caller<15, T, D, BrdColReflect101>, \r
+                linearColumnFilter_caller<16, T, D, BrdColReflect101>, \r
+            },            \r
+            {\r
+                0, \r
+                linearColumnFilter_caller<1 , T, D, BrdColReplicate>, \r
+                linearColumnFilter_caller<2 , T, D, BrdColReplicate>,\r
+                linearColumnFilter_caller<3 , T, D, BrdColReplicate>, \r
+                linearColumnFilter_caller<4 , T, D, BrdColReplicate>, \r
+                linearColumnFilter_caller<5 , T, D, BrdColReplicate>, \r
+                linearColumnFilter_caller<6 , T, D, BrdColReplicate>, \r
+                linearColumnFilter_caller<7 , T, D, BrdColReplicate>, \r
+                linearColumnFilter_caller<8 , T, D, BrdColReplicate>, \r
+                linearColumnFilter_caller<9 , T, D, BrdColReplicate>, \r
+                linearColumnFilter_caller<10, T, D, BrdColReplicate>, \r
+                linearColumnFilter_caller<11, T, D, BrdColReplicate>, \r
+                linearColumnFilter_caller<12, T, D, BrdColReplicate>, \r
+                linearColumnFilter_caller<13, T, D, BrdColReplicate>, \r
+                linearColumnFilter_caller<14, T, D, BrdColReplicate>, \r
+                linearColumnFilter_caller<15, T, D, BrdColReplicate>, \r
+                linearColumnFilter_caller<16, T, D, BrdColReplicate>, \r
+            },            \r
+            {\r
+                0, \r
+                linearColumnFilter_caller<1 , T, D, BrdColConstant>, \r
+                linearColumnFilter_caller<2 , T, D, BrdColConstant>,\r
+                linearColumnFilter_caller<3 , T, D, BrdColConstant>, \r
+                linearColumnFilter_caller<4 , T, D, BrdColConstant>, \r
+                linearColumnFilter_caller<5 , T, D, BrdColConstant>, \r
+                linearColumnFilter_caller<6 , T, D, BrdColConstant>, \r
+                linearColumnFilter_caller<7 , T, D, BrdColConstant>, \r
+                linearColumnFilter_caller<8 , T, D, BrdColConstant>, \r
+                linearColumnFilter_caller<9 , T, D, BrdColConstant>, \r
+                linearColumnFilter_caller<10, T, D, BrdColConstant>, \r
+                linearColumnFilter_caller<11, T, D, BrdColConstant>, \r
+                linearColumnFilter_caller<12, T, D, BrdColConstant>, \r
+                linearColumnFilter_caller<13, T, D, BrdColConstant>, \r
+                linearColumnFilter_caller<14, T, D, BrdColConstant>, \r
+                linearColumnFilter_caller<15, T, D, BrdColConstant>, \r
+                linearColumnFilter_caller<16, T, D, BrdColConstant>, \r
+            }\r
+        };\r
+        \r
         loadLinearKernel(kernel, ksize);\r
 \r
-        callers[ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor);\r
+        callers[brd_type][ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor);\r
     }\r
 \r
-    template void linearColumnFilter_gpu<uchar4, uchar4>(const DevMem2D&, const DevMem2D&, const float[], int , int);\r
-    template void linearColumnFilter_gpu<uchar4, char4>(const DevMem2D&, const DevMem2D&, const float[], int , int);\r
-    template void linearColumnFilter_gpu<char4, uchar4>(const DevMem2D&, const DevMem2D&, const float[], int , int);\r
-    template void linearColumnFilter_gpu<char4, char4>(const DevMem2D&, const DevMem2D&, const float[], int , int);\r
-\r
-    template void linearColumnFilter_gpu<ushort2, ushort2>(const DevMem2D&, const DevMem2D&, const float[], int , int);\r
-    template void linearColumnFilter_gpu<ushort2, short2>(const DevMem2D&, const DevMem2D&, const float[], int , int);\r
-    template void linearColumnFilter_gpu<short2, ushort2>(const DevMem2D&, const DevMem2D&, const float[], int , int);\r
-    template void linearColumnFilter_gpu<short2, short2>(const DevMem2D&, const DevMem2D&, const float[], int , int);\r
-\r
-    template void linearColumnFilter_gpu<int, int>(const DevMem2D&, const DevMem2D&, const float[], int , int);\r
-    template void linearColumnFilter_gpu<int, float>(const DevMem2D&, const DevMem2D&, const float[], int , int);\r
-    template void linearColumnFilter_gpu<float, int>(const DevMem2D&, const DevMem2D&, const float[], int , int);\r
-    template void linearColumnFilter_gpu<float, float>(const DevMem2D&, const DevMem2D&, const float[], int , int);\r
+    template void linearColumnFilter_gpu<float , uchar >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type);\r
+    template void linearColumnFilter_gpu<float4, uchar4>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type);\r
+    template void linearColumnFilter_gpu<float , short >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type);\r
+    template void linearColumnFilter_gpu<float2, short2>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type);\r
+    template void linearColumnFilter_gpu<float , int   >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type);\r
+    template void linearColumnFilter_gpu<float , float >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type);\r
 }}}\r
 \r
 /////////////////////////////////////////////////////////////////////////////////////////////////\r
@@ -377,7 +654,7 @@ namespace bf_krnls
                     }\r
                 }\r
 \r
-                float minimum = FLT_MAX;\r
+                float minimum = numeric_limits_gpu<float>::max();\r
                 int id = 0;\r
 \r
                 if (cost[0] < minimum)\r
index e52ba4e..6bf060d 100644 (file)
@@ -59,7 +59,8 @@ namespace cv
         enum \r
         {\r
             BORDER_REFLECT101_GPU = 0,\r
-            BORDER_REPLICATE_GPU\r
+            BORDER_REPLICATE_GPU,\r
+            BORDER_CONSTANT_GPU\r
         };\r
         \r
         // Converts CPU border extrapolation mode into GPU internal analogue.\r
index b8dcaaf..b611e49 100644 (file)
@@ -48,8 +48,8 @@ using namespace cv::gpu;
 \r
 #if !defined (HAVE_CUDA)\r
 \r
-Ptr<FilterEngine_GPU> cv::gpu::createFilter2D_GPU(const Ptr<BaseFilter_GPU>) { throw_nogpu(); return Ptr<FilterEngine_GPU>(0); }\r
-Ptr<FilterEngine_GPU> cv::gpu::createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU>&, const Ptr<BaseColumnFilter_GPU>&) { throw_nogpu(); return Ptr<FilterEngine_GPU>(0); }\r
+Ptr<FilterEngine_GPU> cv::gpu::createFilter2D_GPU(const Ptr<BaseFilter_GPU>, int, int) { throw_nogpu(); return Ptr<FilterEngine_GPU>(0); }\r
+Ptr<FilterEngine_GPU> cv::gpu::createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU>&, const Ptr<BaseColumnFilter_GPU>&, int, int, int) { throw_nogpu(); return Ptr<FilterEngine_GPU>(0); }\r
 Ptr<BaseRowFilter_GPU> cv::gpu::getRowSumFilter_GPU(int, int, int, int) { throw_nogpu(); return Ptr<BaseRowFilter_GPU>(0); }\r
 Ptr<BaseColumnFilter_GPU> cv::gpu::getColumnSumFilter_GPU(int, int, int, int) { throw_nogpu(); return Ptr<BaseColumnFilter_GPU>(0); }\r
 Ptr<BaseFilter_GPU> cv::gpu::getBoxFilter_GPU(int, int, const Size&, Point) { throw_nogpu(); return Ptr<BaseFilter_GPU>(0); }\r
@@ -58,11 +58,11 @@ Ptr<BaseFilter_GPU> cv::gpu::getMorphologyFilter_GPU(int, int, const Mat&, const
 Ptr<FilterEngine_GPU> cv::gpu::createMorphologyFilter_GPU(int, int, const Mat&, const Point&, int) { throw_nogpu(); return Ptr<FilterEngine_GPU>(0); }\r
 Ptr<BaseFilter_GPU> cv::gpu::getLinearFilter_GPU(int, int, const Mat&, const Size&, Point) { throw_nogpu(); return Ptr<BaseFilter_GPU>(0); }\r
 Ptr<FilterEngine_GPU> cv::gpu::createLinearFilter_GPU(int, int, const Mat&, const Point&) { throw_nogpu(); return Ptr<FilterEngine_GPU>(0); }\r
-Ptr<BaseRowFilter_GPU> cv::gpu::getLinearRowFilter_GPU(int, int, const Mat&, int) { throw_nogpu(); return Ptr<BaseRowFilter_GPU>(0); }\r
-Ptr<BaseColumnFilter_GPU> cv::gpu::getLinearColumnFilter_GPU(int, int, const Mat&, int) { throw_nogpu(); return Ptr<BaseColumnFilter_GPU>(0); }\r
-Ptr<FilterEngine_GPU> cv::gpu::createSeparableLinearFilter_GPU(int, int, const Mat&, const Mat&, const Point&) { throw_nogpu(); return Ptr<FilterEngine_GPU>(0); }\r
-Ptr<FilterEngine_GPU> cv::gpu::createDerivFilter_GPU(int, int, int, int, int) { throw_nogpu(); return Ptr<FilterEngine_GPU>(0); }\r
-Ptr<FilterEngine_GPU> cv::gpu::createGaussianFilter_GPU(int, Size, double, double) { throw_nogpu(); return Ptr<FilterEngine_GPU>(0); }\r
+Ptr<BaseRowFilter_GPU> cv::gpu::getLinearRowFilter_GPU(int, int, const Mat&, int, int) { throw_nogpu(); return Ptr<BaseRowFilter_GPU>(0); }\r
+Ptr<BaseColumnFilter_GPU> cv::gpu::getLinearColumnFilter_GPU(int, int, const Mat&, int, int) { throw_nogpu(); return Ptr<BaseColumnFilter_GPU>(0); }\r
+Ptr<FilterEngine_GPU> cv::gpu::createSeparableLinearFilter_GPU(int, int, const Mat&, const Mat&, const Point&, int, int) { throw_nogpu(); return Ptr<FilterEngine_GPU>(0); }\r
+Ptr<FilterEngine_GPU> cv::gpu::createDerivFilter_GPU(int, int, int, int, int, int, int) { throw_nogpu(); return Ptr<FilterEngine_GPU>(0); }\r
+Ptr<FilterEngine_GPU> cv::gpu::createGaussianFilter_GPU(int, Size, double, double, int, int) { throw_nogpu(); return Ptr<FilterEngine_GPU>(0); }\r
 Ptr<BaseFilter_GPU> cv::gpu::getMaxFilter_GPU(int, int, const Size&, Point) { throw_nogpu(); return Ptr<BaseFilter_GPU>(0); }\r
 Ptr<BaseFilter_GPU> cv::gpu::getMinFilter_GPU(int, int, const Size&, Point) { throw_nogpu(); return Ptr<BaseFilter_GPU>(0); }\r
 \r
@@ -71,10 +71,10 @@ void cv::gpu::erode( const GpuMat&, GpuMat&, const Mat&, Point, int) { throw_nog
 void cv::gpu::dilate( const GpuMat&, GpuMat&, const Mat&, Point, int) { throw_nogpu(); }\r
 void cv::gpu::morphologyEx( const GpuMat&, GpuMat&, int, const Mat&, Point, int) { throw_nogpu(); }\r
 void cv::gpu::filter2D(const GpuMat&, GpuMat&, int, const Mat&, Point) { throw_nogpu(); }\r
-void cv::gpu::sepFilter2D(const GpuMat&, GpuMat&, int, const Mat&, const Mat&, Point) { throw_nogpu(); }\r
-void cv::gpu::Sobel(const GpuMat&, GpuMat&, int, int, int, int, double) { throw_nogpu(); }\r
-void cv::gpu::Scharr(const GpuMat&, GpuMat&, int, int, int, double) { throw_nogpu(); }\r
-void cv::gpu::GaussianBlur(const GpuMat&, GpuMat&, Size, double, double) { throw_nogpu(); }\r
+void cv::gpu::sepFilter2D(const GpuMat&, GpuMat&, int, const Mat&, const Mat&, Point, int, int) { throw_nogpu(); }\r
+void cv::gpu::Sobel(const GpuMat&, GpuMat&, int, int, int, int, double, int, int) { throw_nogpu(); }\r
+void cv::gpu::Scharr(const GpuMat&, GpuMat&, int, int, int, double, int, int) { throw_nogpu(); }\r
+void cv::gpu::GaussianBlur(const GpuMat&, GpuMat&, Size, double, double, int, int) { throw_nogpu(); }\r
 void cv::gpu::Laplacian(const GpuMat&, GpuMat&, int, int, double) { throw_nogpu(); }\r
 \r
 #else\r
@@ -133,13 +133,17 @@ namespace
     class Filter2DEngine_GPU : public FilterEngine_GPU\r
     {\r
     public:\r
-        Filter2DEngine_GPU(const Ptr<BaseFilter_GPU>& filter2D_) : filter2D(filter2D_) {}\r
+        Filter2DEngine_GPU(const Ptr<BaseFilter_GPU>& filter2D_, int srcType_, int dstType_) : \r
+            filter2D(filter2D_), srcType(srcType_), dstType(dstType_)\r
+        {}\r
 \r
         virtual void apply(const GpuMat& src, GpuMat& dst, Rect roi = Rect(0,0,-1,-1))\r
         {\r
+            CV_Assert(src.type() == srcType);\r
+\r
             Size src_size = src.size();\r
 \r
-            dst.create(src_size, src.type());\r
+            dst.create(src_size, dstType);\r
             dst = Scalar(0.0);\r
 \r
             normalizeROI(roi, filter2D->ksize, filter2D->anchor, src_size);\r
@@ -151,12 +155,13 @@ namespace
         }\r
 \r
         Ptr<BaseFilter_GPU> filter2D;\r
+        int srcType, dstType;\r
     };\r
 }\r
 \r
-Ptr<FilterEngine_GPU> cv::gpu::createFilter2D_GPU(const Ptr<BaseFilter_GPU> filter2D)\r
+Ptr<FilterEngine_GPU> cv::gpu::createFilter2D_GPU(const Ptr<BaseFilter_GPU> filter2D, int srcType, int dstType)\r
 {\r
-    return Ptr<FilterEngine_GPU>(new Filter2DEngine_GPU(filter2D));\r
+    return Ptr<FilterEngine_GPU>(new Filter2DEngine_GPU(filter2D, srcType, dstType));\r
 }\r
 \r
 ////////////////////////////////////////////////////////////////////////////////////////////////////\r
@@ -168,8 +173,9 @@ namespace
     {\r
     public:\r
         SeparableFilterEngine_GPU(const Ptr<BaseRowFilter_GPU>& rowFilter_, \r
-                                       const Ptr<BaseColumnFilter_GPU>& columnFilter_) :\r
-            rowFilter(rowFilter_), columnFilter(columnFilter_)\r
+            const Ptr<BaseColumnFilter_GPU>& columnFilter_, int srcType_, int bufType_, int dstType_) :\r
+            rowFilter(rowFilter_), columnFilter(columnFilter_), \r
+            srcType(srcType_), bufType(bufType_), dstType(dstType_)\r
         {\r
             ksize = Size(rowFilter->ksize, columnFilter->ksize);\r
             anchor = Point(rowFilter->anchor, columnFilter->anchor);\r
@@ -177,19 +183,20 @@ namespace
 \r
         virtual void apply(const GpuMat& src, GpuMat& dst, Rect roi = Rect(0,0,-1,-1))\r
         {\r
+            CV_Assert(src.type() == srcType);\r
+\r
             Size src_size = src.size();\r
-            int src_type = src.type();\r
 \r
-            dst.create(src_size, src_type);\r
+            dst.create(src_size, dstType);\r
             dst = Scalar(0.0);\r
-            dstBuf.create(src_size, src_type);\r
+            dstBuf.create(src_size, bufType);\r
             dstBuf = Scalar(0.0);\r
 \r
             normalizeROI(roi, ksize, anchor, src_size);\r
 \r
-            srcROI = src(roi);\r
-            dstROI = dst(roi);\r
-            dstBufROI = dstBuf(roi);\r
+            GpuMat srcROI = src(roi);\r
+            GpuMat dstROI = dst(roi);\r
+            GpuMat dstBufROI = dstBuf(roi);\r
             \r
             (*rowFilter)(srcROI, dstBufROI);\r
             (*columnFilter)(dstBufROI, dstROI);\r
@@ -197,19 +204,19 @@ namespace
 \r
         Ptr<BaseRowFilter_GPU> rowFilter;\r
         Ptr<BaseColumnFilter_GPU> columnFilter;\r
+        int srcType, bufType, dstType;\r
+\r
         Size ksize;\r
         Point anchor;\r
-        GpuMat dstBuf;\r
-        GpuMat srcROI;\r
-        GpuMat dstROI;\r
-        GpuMat dstBufROI;\r
+\r
+        GpuMat dstBuf;        \r
     };\r
 }\r
 \r
 Ptr<FilterEngine_GPU> cv::gpu::createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU>& rowFilter, \r
-    const Ptr<BaseColumnFilter_GPU>& columnFilter)\r
+    const Ptr<BaseColumnFilter_GPU>& columnFilter, int srcType, int bufType, int dstType)\r
 {\r
-    return Ptr<FilterEngine_GPU>(new SeparableFilterEngine_GPU(rowFilter, columnFilter));\r
+    return Ptr<FilterEngine_GPU>(new SeparableFilterEngine_GPU(rowFilter, columnFilter, srcType, bufType, dstType));\r
 }\r
 \r
 ////////////////////////////////////////////////////////////////////////////////////////////////////\r
@@ -315,7 +322,7 @@ Ptr<BaseFilter_GPU> cv::gpu::getBoxFilter_GPU(int srcType, int dstType, const Si
 Ptr<FilterEngine_GPU> cv::gpu::createBoxFilter_GPU(int srcType, int dstType, const Size& ksize, const Point& anchor)\r
 {\r
     Ptr<BaseFilter_GPU> boxFilter = getBoxFilter_GPU(srcType, dstType, ksize, anchor);\r
-    return createFilter2D_GPU(boxFilter);\r
+    return createFilter2D_GPU(boxFilter, srcType, dstType);\r
 }\r
 \r
 void cv::gpu::boxFilter(const GpuMat& src, GpuMat& dst, int ddepth, Size ksize, Point anchor)\r
@@ -386,8 +393,8 @@ namespace
     class MorphologyFilterEngine_GPU : public Filter2DEngine_GPU\r
     {\r
     public:\r
-        MorphologyFilterEngine_GPU(const Ptr<BaseFilter_GPU>& filter2D_, int iters_) : \r
-          Filter2DEngine_GPU(filter2D_), iters(iters_) {}\r
+        MorphologyFilterEngine_GPU(const Ptr<BaseFilter_GPU>& filter2D_, int type, int iters_) : \r
+          Filter2DEngine_GPU(filter2D_, type, type), iters(iters_) {}\r
 \r
         virtual void apply(const GpuMat& src, GpuMat& dst, Rect roi = Rect(0,0,-1,-1))\r
         {\r
@@ -415,7 +422,7 @@ Ptr<FilterEngine_GPU> cv::gpu::createMorphologyFilter_GPU(int op, int type, cons
 \r
     Ptr<BaseFilter_GPU> filter2D = getMorphologyFilter_GPU(op, type, kernel, ksize, anchor);\r
 \r
-    return Ptr<FilterEngine_GPU>(new MorphologyFilterEngine_GPU(filter2D, iterations));\r
+    return Ptr<FilterEngine_GPU>(new MorphologyFilterEngine_GPU(filter2D, type, iterations));\r
 }\r
 \r
 namespace\r
@@ -558,7 +565,7 @@ Ptr<FilterEngine_GPU> cv::gpu::createLinearFilter_GPU(int srcType, int dstType,
 \r
     Ptr<BaseFilter_GPU> linearFilter = getLinearFilter_GPU(srcType, dstType, kernel, ksize, anchor);\r
 \r
-    return createFilter2D_GPU(linearFilter);\r
+    return createFilter2D_GPU(linearFilter, srcType, dstType);\r
 }\r
 \r
 void cv::gpu::filter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernel, Point anchor)\r
@@ -578,10 +585,10 @@ void cv::gpu::filter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& ke
 namespace cv { namespace gpu { namespace filters\r
 {\r
     template <typename T, typename D>\r
-    void linearRowFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor);\r
+    void linearRowFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type);\r
 \r
     template <typename T, typename D>\r
-    void linearColumnFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor);\r
+    void linearColumnFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type);\r
 }}}\r
 \r
 namespace\r
@@ -589,7 +596,7 @@ namespace
     typedef NppStatus (*nppFilter1D_t)(const Npp8u * pSrc, Npp32s nSrcStep, Npp8u * pDst, Npp32s nDstStep, NppiSize oROI, \r
         const Npp32s * pKernel, Npp32s nMaskSize, Npp32s nAnchor, Npp32s nDivisor);\r
 \r
-    typedef void (*gpuFilter1D_t)(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor);\r
+    typedef void (*gpuFilter1D_t)(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type);\r
 \r
     class NppLinearRowFilter : public BaseRowFilter_GPU\r
     {\r
@@ -614,35 +621,28 @@ namespace
     class GpuLinearRowFilter : public BaseRowFilter_GPU\r
     {\r
     public:\r
-        GpuLinearRowFilter(int ksize_, int anchor_, const Mat& kernel_, gpuFilter1D_t func_) : \r
-            BaseRowFilter_GPU(ksize_, anchor_), kernel(kernel_), func(func_) {}\r
+        GpuLinearRowFilter(int ksize_, int anchor_, const Mat& kernel_, gpuFilter1D_t func_, int brd_type_) : \r
+            BaseRowFilter_GPU(ksize_, anchor_), kernel(kernel_), func(func_), brd_type(brd_type_) {}\r
 \r
         virtual void operator()(const GpuMat& src, GpuMat& dst)\r
         {\r
-            func(src, dst, kernel.ptr<float>(), ksize, anchor);\r
+            func(src, dst, kernel.ptr<float>(), ksize, anchor, brd_type);\r
         }\r
 \r
         Mat kernel;\r
         gpuFilter1D_t func;\r
+        int brd_type;\r
     };\r
 }\r
 \r
-Ptr<BaseRowFilter_GPU> cv::gpu::getLinearRowFilter_GPU(int srcType, int bufType, const Mat& rowKernel, int anchor)\r
+Ptr<BaseRowFilter_GPU> cv::gpu::getLinearRowFilter_GPU(int srcType, int bufType, const Mat& rowKernel, int anchor, int borderType)\r
 {\r
-    using namespace cv::gpu::filters;\r
     static const nppFilter1D_t nppFilter1D_callers[] = {0, nppiFilterRow_8u_C1R, 0, 0, nppiFilterRow_8u_C4R};\r
-    static const gpuFilter1D_t gpuFilter1D_callers[6][6] =\r
-    {\r
-        {linearRowFilter_gpu<uchar4, uchar4>,linearRowFilter_gpu<uchar4, char4>,0,0,0,0},\r
-        {linearRowFilter_gpu<char4, uchar4>,linearRowFilter_gpu<char4, char4>,0,0,0,0},\r
-        {0,0,linearRowFilter_gpu<ushort2, ushort2>,linearRowFilter_gpu<ushort2, short2>,0,0},\r
-        {0,0,linearRowFilter_gpu<short2, ushort2>,linearRowFilter_gpu<short2, short2>,0,0},\r
-        {0,0,0,0,linearRowFilter_gpu<int, int>, linearRowFilter_gpu<int, float>},\r
-        {0,0,0,0,linearRowFilter_gpu<float, int>, linearRowFilter_gpu<float, float>}\r
-    };\r
     \r
     if ((bufType == srcType) && (srcType == CV_8UC1 || srcType == CV_8UC4))\r
     {\r
+        CV_Assert(borderType == BORDER_CONSTANT);\r
+\r
         GpuMat gpu_row_krnl;\r
         int nDivisor;\r
         normalizeKernel(rowKernel, gpu_row_krnl, CV_32S, &nDivisor, true);\r
@@ -653,9 +653,15 @@ Ptr<BaseRowFilter_GPU> cv::gpu::getLinearRowFilter_GPU(int srcType, int bufType,
         return Ptr<BaseRowFilter_GPU>(new NppLinearRowFilter(ksize, anchor, gpu_row_krnl, nDivisor,\r
             nppFilter1D_callers[CV_MAT_CN(srcType)]));\r
     }\r
+    \r
+    CV_Assert(borderType == BORDER_REFLECT101 || borderType == BORDER_REPLICATE || borderType == BORDER_CONSTANT);\r
+    int gpuBorderType;\r
+    CV_Assert(tryConvertToGpuBorderType(borderType, gpuBorderType));\r
 \r
-    CV_Assert(srcType == CV_8UC4 || srcType == CV_8SC4 || srcType == CV_16UC2 || srcType == CV_16SC2 || srcType == CV_32SC1 || srcType == CV_32FC1);\r
-    CV_Assert(bufType == CV_8UC4 || bufType == CV_8SC4 || bufType == CV_16UC2 || bufType == CV_16SC2 || bufType == CV_32SC1 || bufType == CV_32FC1);\r
+    CV_Assert(srcType == CV_8UC1 || srcType == CV_8UC4 || srcType == CV_16SC1 || srcType == CV_16SC2 \r
+        || srcType == CV_32SC1 || srcType == CV_32FC1);\r
+\r
+    CV_Assert(CV_MAT_DEPTH(bufType) == CV_32F && CV_MAT_CN(srcType) == CV_MAT_CN(bufType));\r
 \r
     Mat temp(rowKernel.size(), CV_32FC1);\r
     rowKernel.convertTo(temp, CV_32FC1);\r
@@ -663,12 +669,35 @@ Ptr<BaseRowFilter_GPU> cv::gpu::getLinearRowFilter_GPU(int srcType, int bufType,
 \r
     int ksize = cont_krnl.cols;\r
 \r
-    CV_Assert(ksize < 16);\r
+    CV_Assert(ksize > 0 && ksize <= 16);\r
 \r
     normalizeAnchor(anchor, ksize);\r
 \r
-    return Ptr<BaseRowFilter_GPU>(new GpuLinearRowFilter(ksize, anchor, cont_krnl, \r
-        gpuFilter1D_callers[CV_MAT_DEPTH(srcType)][CV_MAT_DEPTH(bufType)]));\r
+    gpuFilter1D_t func = 0;\r
+\r
+    switch (srcType)\r
+    {\r
+    case CV_8UC1:\r
+        func = filters::linearRowFilter_gpu<uchar, float>;\r
+        break;\r
+    case CV_8UC4:\r
+        func = filters::linearRowFilter_gpu<uchar4, float4>;\r
+        break;\r
+    case CV_16SC1:\r
+        func = filters::linearRowFilter_gpu<short, float>;\r
+        break;\r
+    case CV_16SC2:\r
+        func = filters::linearRowFilter_gpu<short2, float2>;\r
+        break;\r
+    case CV_32SC1:\r
+        func = filters::linearRowFilter_gpu<int, float>;\r
+        break;\r
+    case CV_32FC1:\r
+        func = filters::linearRowFilter_gpu<float, float>;\r
+        break;\r
+    }\r
+\r
+    return Ptr<BaseRowFilter_GPU>(new GpuLinearRowFilter(ksize, anchor, cont_krnl, func, gpuBorderType));\r
 }\r
 \r
 namespace\r
@@ -696,35 +725,28 @@ namespace
     class GpuLinearColumnFilter : public BaseColumnFilter_GPU\r
     {\r
     public:\r
-        GpuLinearColumnFilter(int ksize_, int anchor_, const Mat& kernel_, gpuFilter1D_t func_) : \r
-            BaseColumnFilter_GPU(ksize_, anchor_), kernel(kernel_), func(func_) {}\r
+        GpuLinearColumnFilter(int ksize_, int anchor_, const Mat& kernel_, gpuFilter1D_t func_, int brd_type_) : \r
+            BaseColumnFilter_GPU(ksize_, anchor_), kernel(kernel_), func(func_), brd_type(brd_type_) {}\r
 \r
         virtual void operator()(const GpuMat& src, GpuMat& dst)\r
         {\r
-            func(src, dst, kernel.ptr<float>(), ksize, anchor);\r
+            func(src, dst, kernel.ptr<float>(), ksize, anchor, brd_type);\r
         }\r
 \r
         Mat kernel;\r
         gpuFilter1D_t func;\r
+        int brd_type;\r
     };\r
 }\r
 \r
-Ptr<BaseColumnFilter_GPU> cv::gpu::getLinearColumnFilter_GPU(int bufType, int dstType, const Mat& columnKernel, int anchor)\r
+Ptr<BaseColumnFilter_GPU> cv::gpu::getLinearColumnFilter_GPU(int bufType, int dstType, const Mat& columnKernel, int anchor, int borderType)\r
 {\r
-    using namespace cv::gpu::filters;\r
     static const nppFilter1D_t nppFilter1D_callers[] = {0, nppiFilterColumn_8u_C1R, 0, 0, nppiFilterColumn_8u_C4R};\r
-    static const gpuFilter1D_t gpuFilter1D_callers[6][6] =\r
-    {\r
-        {linearColumnFilter_gpu<uchar4, uchar4>,linearColumnFilter_gpu<uchar4, char4>,0,0,0,0},\r
-        {linearColumnFilter_gpu<char4, uchar4>,linearColumnFilter_gpu<char4, char4>,0,0,0,0},\r
-        {0,0,linearColumnFilter_gpu<ushort2, ushort2>,linearColumnFilter_gpu<ushort2, short2>,0,0},\r
-        {0,0,linearColumnFilter_gpu<short2, ushort2>,linearColumnFilter_gpu<short2, short2>,0,0},\r
-        {0,0,0,0,linearColumnFilter_gpu<int, int>, linearColumnFilter_gpu<int, float>},\r
-        {0,0,0,0,linearColumnFilter_gpu<float, int>, linearColumnFilter_gpu<float, float>}\r
-    };\r
     \r
     if ((bufType == dstType) && (bufType == CV_8UC1 || bufType == CV_8UC4))\r
     {\r
+        CV_Assert(borderType == BORDER_CONSTANT);\r
+\r
         GpuMat gpu_col_krnl;\r
         int nDivisor;\r
         normalizeKernel(columnKernel, gpu_col_krnl, CV_32S, &nDivisor, true);\r
@@ -735,9 +757,15 @@ Ptr<BaseColumnFilter_GPU> cv::gpu::getLinearColumnFilter_GPU(int bufType, int ds
         return Ptr<BaseColumnFilter_GPU>(new NppLinearColumnFilter(ksize, anchor, gpu_col_krnl, nDivisor, \r
             nppFilter1D_callers[CV_MAT_CN(bufType)]));\r
     }\r
+    \r
+    CV_Assert(borderType == BORDER_REFLECT101 || borderType == BORDER_REPLICATE || borderType == BORDER_CONSTANT);\r
+    int gpuBorderType;\r
+    CV_Assert(tryConvertToGpuBorderType(borderType, gpuBorderType));\r
+   \r
+    CV_Assert(dstType == CV_8UC1 || dstType == CV_8UC4 || dstType == CV_16SC1 || dstType == CV_16SC2 \r
+        || dstType == CV_32SC1 || dstType == CV_32FC1);\r
 \r
-    CV_Assert(dstType == CV_8UC4 || dstType == CV_8SC4 || dstType == CV_16UC2 || dstType == CV_16SC2 || dstType == CV_32SC1 || dstType == CV_32FC1);\r
-    CV_Assert(bufType == CV_8UC4 || bufType == CV_8SC4 || bufType == CV_16UC2 || bufType == CV_16SC2 || bufType == CV_32SC1 || bufType == CV_32FC1);\r
+    CV_Assert(CV_MAT_DEPTH(bufType) == CV_32F && CV_MAT_CN(dstType) == CV_MAT_CN(bufType));\r
 \r
     Mat temp(columnKernel.size(), CV_32FC1);\r
     columnKernel.convertTo(temp, CV_32FC1);\r
@@ -745,50 +773,76 @@ Ptr<BaseColumnFilter_GPU> cv::gpu::getLinearColumnFilter_GPU(int bufType, int ds
 \r
     int ksize = cont_krnl.cols;\r
 \r
-    CV_Assert(ksize < 16);\r
+    CV_Assert(ksize > 0 && ksize <= 16);\r
 \r
     normalizeAnchor(anchor, ksize);\r
 \r
-    return Ptr<BaseColumnFilter_GPU>(new GpuLinearColumnFilter(ksize, anchor, cont_krnl, \r
-        gpuFilter1D_callers[CV_MAT_DEPTH(bufType)][CV_MAT_DEPTH(dstType)]));\r
+    gpuFilter1D_t func = 0;\r
+\r
+    switch (dstType)\r
+    {\r
+    case CV_8UC1:\r
+        func = filters::linearColumnFilter_gpu<float, uchar>;\r
+        break;\r
+    case CV_8UC4:\r
+        func = filters::linearColumnFilter_gpu<float4, uchar4>;\r
+        break;\r
+    case CV_16SC1:\r
+        func = filters::linearColumnFilter_gpu<float, short>;\r
+        break;\r
+    case CV_16SC2:\r
+        func = filters::linearColumnFilter_gpu<float2, short2>;\r
+        break;\r
+    case CV_32SC1:\r
+        func = filters::linearColumnFilter_gpu<float, int>;\r
+        break;\r
+    case CV_32FC1:\r
+        func = filters::linearColumnFilter_gpu<float, float>;\r
+        break;\r
+    }\r
+\r
+    return Ptr<BaseColumnFilter_GPU>(new GpuLinearColumnFilter(ksize, anchor, cont_krnl, func, gpuBorderType));\r
 }\r
 \r
 Ptr<FilterEngine_GPU> cv::gpu::createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat& rowKernel, const Mat& columnKernel, \r
-    const Point& anchor)\r
+    const Point& anchor, int rowBorderType, int columnBorderType)\r
 {\r
+    if (columnBorderType < 0)\r
+        columnBorderType = rowBorderType;\r
+\r
     int sdepth = CV_MAT_DEPTH(srcType), ddepth = CV_MAT_DEPTH(dstType);\r
     int cn = CV_MAT_CN(srcType);\r
-    int bdepth = std::max(sdepth, ddepth);\r
+    int bdepth = CV_32F;\r
     int bufType = CV_MAKETYPE(bdepth, cn);\r
 \r
-    Ptr<BaseRowFilter_GPU> rowFilter = getLinearRowFilter_GPU(srcType, bufType, rowKernel, anchor.x);\r
-    Ptr<BaseColumnFilter_GPU> columnFilter = getLinearColumnFilter_GPU(bufType, dstType, columnKernel, anchor.y);\r
+    Ptr<BaseRowFilter_GPU> rowFilter = getLinearRowFilter_GPU(srcType, bufType, rowKernel, anchor.x, rowBorderType);\r
+    Ptr<BaseColumnFilter_GPU> columnFilter = getLinearColumnFilter_GPU(bufType, dstType, columnKernel, anchor.y, columnBorderType);\r
 \r
-    return createSeparableFilter_GPU(rowFilter, columnFilter);\r
+    return createSeparableFilter_GPU(rowFilter, columnFilter, srcType, bufType, dstType);\r
 }\r
 \r
-void cv::gpu::sepFilter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernelX, const Mat& kernelY, Point anchor)\r
+void cv::gpu::sepFilter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernelX, const Mat& kernelY, Point anchor, int rowBorderType, int columnBorderType)\r
 {\r
     if( ddepth < 0 )\r
         ddepth = src.depth();\r
 \r
     dst.create(src.size(), CV_MAKETYPE(ddepth, src.channels()));\r
 \r
-    Ptr<FilterEngine_GPU> f = createSeparableLinearFilter_GPU(src.type(), dst.type(), kernelX, kernelY, anchor);\r
-    f->apply(src, dst);\r
+    Ptr<FilterEngine_GPU> f = createSeparableLinearFilter_GPU(src.type(), dst.type(), kernelX, kernelY, anchor, rowBorderType, columnBorderType);\r
+    f->apply(src, dst, Rect(0, 0, src.cols, src.rows));\r
 }\r
 \r
 ////////////////////////////////////////////////////////////////////////////////////////////////////\r
 // Deriv Filter\r
 \r
-Ptr<FilterEngine_GPU> cv::gpu::createDerivFilter_GPU(int srcType, int dstType, int dx, int dy, int ksize)\r
+Ptr<FilterEngine_GPU> cv::gpu::createDerivFilter_GPU(int srcType, int dstType, int dx, int dy, int ksize, int rowBorderType, int columnBorderType)\r
 {\r
     Mat kx, ky;\r
     getDerivKernels(kx, ky, dx, dy, ksize, false, CV_32F);\r
-    return createSeparableLinearFilter_GPU(srcType, dstType, kx, ky, Point(-1,-1));\r
+    return createSeparableLinearFilter_GPU(srcType, dstType, kx, ky, Point(-1,-1), rowBorderType, columnBorderType);\r
 }\r
 \r
-void cv::gpu::Sobel(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, int ksize, double scale)\r
+void cv::gpu::Sobel(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, int ksize, double scale, int rowBorderType, int columnBorderType)\r
 {\r
     Mat kx, ky;\r
     getDerivKernels(kx, ky, dx, dy, ksize, false, CV_32F);\r
@@ -803,10 +857,10 @@ void cv::gpu::Sobel(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy,
             ky *= scale;\r
     }\r
     \r
-    sepFilter2D(src, dst, ddepth, kx, ky, Point(-1,-1));\r
+    sepFilter2D(src, dst, ddepth, kx, ky, Point(-1,-1), rowBorderType, columnBorderType);\r
 }\r
 \r
-void cv::gpu::Scharr(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, double scale)\r
+void cv::gpu::Scharr(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, double scale, int rowBorderType, int columnBorderType)\r
 {\r
     Mat kx, ky;\r
     getDerivKernels(kx, ky, dx, dy, -1, false, CV_32F);\r
@@ -821,7 +875,7 @@ void cv::gpu::Scharr(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy,
             ky *= scale;\r
     }\r
 \r
-    sepFilter2D(src, dst, ddepth, kx, ky, Point(-1,-1));\r
+    sepFilter2D(src, dst, ddepth, kx, ky, Point(-1,-1), rowBorderType, columnBorderType);\r
 }\r
 \r
 void cv::gpu::Laplacian(const GpuMat& src, GpuMat& dst, int ddepth, int ksize, double scale)\r
@@ -843,7 +897,7 @@ void cv::gpu::Laplacian(const GpuMat& src, GpuMat& dst, int ddepth, int ksize, d
 ////////////////////////////////////////////////////////////////////////////////////////////////////\r
 // Gaussian Filter\r
 \r
-Ptr<FilterEngine_GPU> cv::gpu::createGaussianFilter_GPU(int type, Size ksize, double sigma1, double sigma2)\r
+Ptr<FilterEngine_GPU> cv::gpu::createGaussianFilter_GPU(int type, Size ksize, double sigma1, double sigma2, int rowBorderType, int columnBorderType)\r
 {        \r
     int depth = CV_MAT_DEPTH(type);\r
 \r
@@ -868,10 +922,10 @@ Ptr<FilterEngine_GPU> cv::gpu::createGaussianFilter_GPU(int type, Size ksize, do
     else\r
         ky = getGaussianKernel( ksize.height, sigma2, std::max(depth, CV_32F) );\r
 \r
-    return createSeparableLinearFilter_GPU(type, type, kx, ky);\r
+    return createSeparableLinearFilter_GPU(type, type, kx, ky, Point(-1,-1), rowBorderType, columnBorderType);\r
 }\r
 \r
-void cv::gpu::GaussianBlur(const GpuMat& src, GpuMat& dst, Size ksize, double sigma1, double sigma2)\r
+void cv::gpu::GaussianBlur(const GpuMat& src, GpuMat& dst, Size ksize, double sigma1, double sigma2, int rowBorderType, int columnBorderType)\r
 {\r
     if (ksize.width == 1 && ksize.height == 1)\r
     {\r
@@ -881,8 +935,8 @@ void cv::gpu::GaussianBlur(const GpuMat& src, GpuMat& dst, Size ksize, double si
 \r
     dst.create(src.size(), src.type());\r
     \r
-    Ptr<FilterEngine_GPU> f = createGaussianFilter_GPU(src.type(), ksize, sigma1, sigma2);\r
-    f->apply(src, dst);\r
+    Ptr<FilterEngine_GPU> f = createGaussianFilter_GPU(src.type(), ksize, sigma1, sigma2, rowBorderType, columnBorderType);\r
+    f->apply(src, dst, Rect(0, 0, src.cols, src.rows));\r
 }\r
 \r
 ////////////////////////////////////////////////////////////////////////////////////////////////////\r
index 361f11b..337af7a 100644 (file)
@@ -972,6 +972,12 @@ bool cv::gpu::tryConvertToGpuBorderType(int cpuBorderType, int& gpuBorderType)
         gpuBorderType = cv::gpu::BORDER_REPLICATE_GPU;\r
         return true;\r
     }\r
+    \r
+    if (cpuBorderType == cv::BORDER_CONSTANT)\r
+    {\r
+        gpuBorderType = cv::gpu::BORDER_CONSTANT_GPU;\r
+        return true;\r
+    }\r
 \r
     return false;\r
 }\r
index d73853c..19e1e88 100644 (file)
@@ -123,278 +123,295 @@ namespace cv
             { \r
                 typedef uchar elem_t; \r
                 enum {cn=1};\r
-                static __device__ uchar all(uchar v) {return v;}\r
+                static __device__ __host__ uchar all(uchar v) {return v;}\r
+                static __device__ __host__ uchar make(uchar x) {return x;}\r
             };\r
             template<> struct VecTraits<uchar1> \r
             { \r
                 typedef uchar elem_t; \r
                 enum {cn=1};\r
-                static __device__ uchar1 all(uchar v) {return make_uchar1(v);}\r
+                static __device__ __host__ uchar1 all(uchar v) {return make_uchar1(v);}\r
+                static __device__ __host__ uchar1 make(uchar x) {return make_uchar1(x);}\r
             };\r
             template<> struct VecTraits<uchar2> \r
             { \r
                 typedef uchar elem_t; \r
                 enum {cn=2}; \r
-                static __device__ uchar2 all(uchar v) {return make_uchar2(v, v);}\r
+                static __device__ __host__ uchar2 all(uchar v) {return make_uchar2(v, v);}\r
+                static __device__ __host__ uchar2 make(uchar x, uchar y) {return make_uchar2(x, y);}\r
             };\r
             template<> struct VecTraits<uchar3> \r
             { \r
                 typedef uchar elem_t; \r
                 enum {cn=3}; \r
-                static __device__ uchar3 all(uchar v) {return make_uchar3(v, v, v);}\r
+                static __device__ __host__ uchar3 all(uchar v) {return make_uchar3(v, v, v);}\r
+                static __device__ __host__ uchar3 make(uchar x, uchar y, uchar z) {return make_uchar3(x, y, z);}\r
             };\r
             template<> struct VecTraits<uchar4> \r
             { \r
                 typedef uchar elem_t; \r
                 enum {cn=4}; \r
-                static __device__ uchar4 all(uchar v) {return make_uchar4(v, v, v, v);}\r
+                static __device__ __host__ uchar4 all(uchar v) {return make_uchar4(v, v, v, v);}\r
+                static __device__ __host__ uchar4 make(uchar x, uchar y, uchar z, uchar w) {return make_uchar4(x, y, z, w);}\r
             };\r
 \r
             template<> struct VecTraits<char> \r
             { \r
                 typedef char elem_t; \r
                 enum {cn=1}; \r
-                static __device__ char all(char v) {return v;}\r
+                static __device__ __host__ char all(char v) {return v;}\r
+                static __device__ __host__ char make(char x) {return x;}\r
             };\r
             template<> struct VecTraits<char1> \r
             { \r
                 typedef char elem_t; \r
                 enum {cn=1}; \r
-                static __device__ char1 all(char v) {return make_char1(v);}\r
+                static __device__ __host__ char1 all(char v) {return make_char1(v);}\r
+                static __device__ __host__ char1 make(char x) {return make_char1(x);}\r
             };\r
             template<> struct VecTraits<char2> \r
             { \r
                 typedef char elem_t; \r
                 enum {cn=2}; \r
-                static  __device__ char2 all(char v) {return make_char2(v, v);}\r
+                static  __device__ __host__ char2 all(char v) {return make_char2(v, v);}\r
+                static  __device__ __host__ char2 make(char x, char y) {return make_char2(x, y);}\r
             };\r
             template<> struct VecTraits<char3> \r
             { \r
                 typedef char elem_t; \r
                 enum {cn=3}; \r
-                static __device__ char3 all(char v) {return make_char3(v, v, v);}\r
+                static __device__ __host__ char3 all(char v) {return make_char3(v, v, v);}\r
+                static __device__ __host__ char3 make(char x, char y, char z) {return make_char3(x, y, z);}\r
             };\r
             template<> struct VecTraits<char4> \r
             { \r
                 typedef char elem_t; \r
                 enum {cn=4}; \r
-                static __device__ char4 all(char v) {return make_char4(v, v, v, v);}\r
+                static __device__ __host__ char4 all(char v) {return make_char4(v, v, v, v);}\r
+                static __device__ __host__ char4 make(char x, char y, char z, char w) {return make_char4(x, y, z, w);}\r
             };\r
 \r
             template<> struct VecTraits<ushort> \r
             { \r
                 typedef ushort elem_t; \r
                 enum {cn=1}; \r
-                static __device__ ushort all(ushort v) {return v;}\r
+                static __device__ __host__ ushort all(ushort v) {return v;}\r
+                static __device__ __host__ ushort make(ushort x) {return x;}\r
             };\r
             template<> struct VecTraits<ushort1> \r
             { \r
                 typedef ushort elem_t; \r
                 enum {cn=1}; \r
-                static __device__ ushort1 all(ushort v) {return make_ushort1(v);}\r
+                static __device__ __host__ ushort1 all(ushort v) {return make_ushort1(v);}\r
+                static __device__ __host__ ushort1 make(ushort x) {return make_ushort1(x);}\r
             };\r
             template<> struct VecTraits<ushort2> \r
             { \r
                 typedef ushort elem_t; \r
                 enum {cn=2}; \r
-                static __device__ ushort2 all(ushort v) {return make_ushort2(v, v);}\r
+                static __device__ __host__ ushort2 all(ushort v) {return make_ushort2(v, v);}\r
+                static __device__ __host__ ushort2 make(ushort x, ushort y) {return make_ushort2(x, y);}\r
             };\r
             template<> struct VecTraits<ushort3> \r
             { \r
                 typedef ushort elem_t; \r
                 enum {cn=3}; \r
-                static __device__ ushort3 all(ushort v) {return make_ushort3(v, v, v);}\r
+                static __device__ __host__ ushort3 all(ushort v) {return make_ushort3(v, v, v);}\r
+                static __device__ __host__ ushort3 make(ushort x, ushort y, ushort z) {return make_ushort3(x, y, z);}\r
             };\r
             template<> struct VecTraits<ushort4> \r
             { \r
                 typedef ushort elem_t; \r
                 enum {cn=4}; \r
-                static __device__ ushort4 all(ushort v) {return make_ushort4(v, v, v, v);}\r
+                static __device__ __host__ ushort4 all(ushort v) {return make_ushort4(v, v, v, v);}\r
+                static __device__ __host__ ushort4 make(ushort x, ushort y, ushort z, ushort w) {return make_ushort4(x, y, z, w);}\r
             };\r
 \r
             template<> struct VecTraits<short> \r
             { \r
                 typedef short elem_t; \r
                 enum {cn=1}; \r
-                static __device__ short all(short v) {return v;}\r
+                static __device__ __host__ short all(short v) {return v;}\r
+                static __device__ __host__ short make(short x) {return x;}\r
             };\r
             template<> struct VecTraits<short1> \r
             { \r
                 typedef short elem_t; \r
                 enum {cn=1}; \r
-                static __device__ short1 all(short v) {return make_short1(v);}\r
+                static __device__ __host__ short1 all(short v) {return make_short1(v);}\r
+                static __device__ __host__ short1 make(short x) {return make_short1(x);}\r
             };\r
             template<> struct VecTraits<short2> \r
             { \r
                 typedef short elem_t; \r
                 enum {cn=2}; \r
-                static __device__ short2 all(short v) {return make_short2(v, v);}\r
+                static __device__ __host__ short2 all(short v) {return make_short2(v, v);}\r
+                static __device__ __host__ short2 make(short x, short y) {return make_short2(x, y);}\r
             };\r
             template<> struct VecTraits<short3> \r
             { \r
                 typedef short elem_t; \r
                 enum {cn=3}; \r
-                static __device__ short3 all(short v) {return make_short3(v, v, v);}\r
+                static __device__ __host__ short3 all(short v) {return make_short3(v, v, v);}\r
+                static __device__ __host__ short3 make(short x, short y, short z) {return make_short3(x, y, z);}\r
             };\r
             template<> struct VecTraits<short4> \r
             { \r
                 typedef short elem_t; \r
                 enum {cn=4}; \r
-                static __device__ short4 all(short v) {return make_short4(v, v, v, v);}\r
+                static __device__ __host__ short4 all(short v) {return make_short4(v, v, v, v);}\r
+                static __device__ __host__ short4 make(short x, short y, short z, short w) {return make_short4(x, y, z, w);}\r
             };\r
 \r
             template<> struct VecTraits<uint> \r
             { \r
                 typedef uint elem_t; \r
                 enum {cn=1}; \r
-                static __device__ uint all(uint v) {return v;}\r
+                static __device__ __host__ uint all(uint v) {return v;}\r
+                static __device__ __host__ uint make(uint x) {return x;}\r
             };\r
             template<> struct VecTraits<uint1> \r
             { \r
                 typedef uint elem_t; \r
                 enum {cn=1}; \r
-                static __device__ uint1 all(uint v) {return make_uint1(v);}\r
+                static __device__ __host__ uint1 all(uint v) {return make_uint1(v);}\r
+                static __device__ __host__ uint1 make(uint x) {return make_uint1(x);}\r
             };\r
             template<> struct VecTraits<uint2> \r
             { \r
                 typedef uint elem_t; \r
                 enum {cn=2}; \r
-                static __device__ uint2 all(uint v) {return make_uint2(v, v);}\r
+                static __device__ __host__ uint2 all(uint v) {return make_uint2(v, v);}\r
+                static __device__ __host__ uint2 make(uint x, uint y) {return make_uint2(x, y);}\r
             };\r
             template<> struct VecTraits<uint3> \r
             { \r
                 typedef uint elem_t; \r
                 enum {cn=3}; \r
-                static __device__ uint3 all(uint v) {return make_uint3(v, v, v);}\r
+                static __device__ __host__ uint3 all(uint v) {return make_uint3(v, v, v);}\r
+                static __device__ __host__ uint3 make(uint x, uint y, uint z) {return make_uint3(x, y, z);}\r
             };\r
             template<> struct VecTraits<uint4> \r
             { \r
                 typedef uint elem_t; \r
                 enum {cn=4}; \r
-                static __device__ uint4 all(uint v) {return make_uint4(v, v, v, v);}\r
+                static __device__ __host__ uint4 all(uint v) {return make_uint4(v, v, v, v);}\r
+                static __device__ __host__ uint4 make(uint x, uint y, uint z, uint w) {return make_uint4(x, y, z, w);}\r
             };\r
 \r
             template<> struct VecTraits<int> \r
             { \r
                 typedef int elem_t; \r
                 enum {cn=1}; \r
-                static __device__ int all(int v) {return v;}\r
+                static __device__ __host__ int all(int v) {return v;}\r
+                static __device__ __host__ int make(int x) {return x;}\r
             };\r
             template<> struct VecTraits<int1> \r
             { \r
                 typedef int elem_t; \r
                 enum {cn=1}; \r
-                static __device__ int1 all(int v) {return make_int1(v);}\r
+                static __device__ __host__ int1 all(int v) {return make_int1(v);}\r
+                static __device__ __host__ int1 make(int x) {return make_int1(x);}\r
             };\r
             template<> struct VecTraits<int2> \r
             { \r
                 typedef int elem_t; \r
                 enum {cn=2}; \r
-                static __device__ int2 all(int v) {return make_int2(v, v);}\r
+                static __device__ __host__ int2 all(int v) {return make_int2(v, v);}\r
+                static __device__ __host__ int2 make(int x, int y) {return make_int2(x, y);}\r
             };\r
             template<> struct VecTraits<int3> \r
             { \r
                 typedef int elem_t; \r
                 enum {cn=3}; \r
-                static __device__ int3 all(int v) {return make_int3(v, v, v);}\r
+                static __device__ __host__ int3 all(int v) {return make_int3(v, v, v);}\r
+                static __device__ __host__ int3 make(int x, int y, int z) {return make_int3(x, y, z);}\r
             };\r
             template<> struct VecTraits<int4> \r
             { \r
                 typedef int elem_t; \r
                 enum {cn=4}; \r
-                static __device__ int4 all(int v) {return make_int4(v, v, v, v);}\r
+                static __device__ __host__ int4 all(int v) {return make_int4(v, v, v, v);}\r
+                static __device__ __host__ int4 make(int x, int y, int z, int w) {return make_int4(x, y, z, w);}\r
             };\r
 \r
             template<> struct VecTraits<float> \r
             { \r
                 typedef float elem_t; \r
                 enum {cn=1}; \r
-                static __device__ float all(float v) {return v;}\r
+                static __device__ __host__ float all(float v) {return v;}\r
+                static __device__ __host__ float make(float x) {return x;}\r
             };\r
             template<> struct VecTraits<float1> \r
             { \r
                 typedef float elem_t; \r
                 enum {cn=1}; \r
-                static __device__ float1 all(float v) {return make_float1(v);}\r
+                static __device__ __host__ float1 all(float v) {return make_float1(v);}\r
+                static __device__ __host__ float1 make(float x) {return make_float1(x);}\r
             };\r
             template<> struct VecTraits<float2> \r
             { \r
                 typedef float elem_t; \r
                 enum {cn=2}; \r
-                static __device__ float2 all(float v) {return make_float2(v, v);}\r
+                static __device__ __host__ float2 all(float v) {return make_float2(v, v);}\r
+                static __device__ __host__ float2 make(float x, float y) {return make_float2(x, y);}\r
             };\r
             template<> struct VecTraits<float3> \r
             { \r
                 typedef float elem_t; \r
                 enum {cn=3}; \r
-                static __device__ float3 all(float v) {return make_float3(v, v, v);}\r
+                static __device__ __host__ float3 all(float v) {return make_float3(v, v, v);}\r
+                static __device__ __host__ float3 make(float x, float y, float z) {return make_float3(x, y, z);}\r
             };\r
             template<> struct VecTraits<float4> \r
             { \r
                 typedef float elem_t;\r
                 enum {cn=4}; \r
-                static __device__ float4 all(float v) {return make_float4(v, v, v, v);}\r
+                static __device__ __host__ float4 all(float v) {return make_float4(v, v, v, v);}\r
+                static __device__ __host__ float4 make(float x, float y, float z, float w) {return make_float4(x, y, z, w);}\r
             };\r
 \r
             template <int cn, typename VecD> struct SatCast;\r
             template <typename VecD> struct SatCast<1, VecD>\r
             {\r
                 template <typename VecS>\r
-                __device__ VecD operator()(const VecS& v)\r
+                static __device__ VecD cast(const VecS& v)\r
                 {\r
-                    VecD res; \r
-                    res.x = saturate_cast< VecTraits<VecD>::elem_t >(v.x);\r
-                    return res;\r
+                    typedef typename VecTraits<VecD>::elem_t D;\r
+                    return VecTraits<VecD>::make(saturate_cast<D>(v.x));\r
                 }\r
             };\r
             template <typename VecD> struct SatCast<2, VecD>\r
             {\r
                 template <typename VecS>\r
-                __device__ VecD operator()(const VecS& v)\r
+                static __device__ VecD cast(const VecS& v)\r
                 {\r
-                    VecD res; \r
-                    res.x = saturate_cast< VecTraits<VecD>::elem_t >(v.x);\r
-                    res.y = saturate_cast< VecTraits<VecD>::elem_t >(v.y);\r
-                    return res;\r
+                    typedef typename VecTraits<VecD>::elem_t D;\r
+                    return VecTraits<VecD>::make(saturate_cast<D>(v.x), saturate_cast<D>(v.y));\r
                 }\r
             };\r
             template <typename VecD> struct SatCast<3, VecD>\r
             {\r
                 template <typename VecS>\r
-                __device__ VecD operator()(const VecS& v)\r
+                static __device__ VecD cast(const VecS& v)\r
                 {\r
-                    VecD res; \r
-                    res.x = saturate_cast< VecTraits<VecD>::elem_t >(v.x);\r
-                    res.y = saturate_cast< VecTraits<VecD>::elem_t >(v.y);\r
-                    res.y = saturate_cast< VecTraits<VecD>::elem_t >(v.z);\r
-                    return res;\r
+                    typedef typename VecTraits<VecD>::elem_t D;\r
+                    return VecTraits<VecD>::make(saturate_cast<D>(v.x), saturate_cast<D>(v.y), saturate_cast<D>(v.z));\r
                 }\r
             };\r
             template <typename VecD> struct SatCast<4, VecD>\r
             {\r
                 template <typename VecS>\r
-                __device__ VecD operator()(const VecS& v)\r
+                static __device__ VecD cast(const VecS& v)\r
                 {\r
-                    VecD res; \r
-                    res.x = saturate_cast< VecTraits<VecD>::elem_t >(v.x);\r
-                    res.y = saturate_cast< VecTraits<VecD>::elem_t >(v.y);\r
-                    res.y = saturate_cast< VecTraits<VecD>::elem_t >(v.z);\r
-                    res.w = saturate_cast< VecTraits<VecD>::elem_t >(v.w);\r
-                    return res;\r
+                    typedef typename VecTraits<VecD>::elem_t D;\r
+                    return VecTraits<VecD>::make(saturate_cast<D>(v.x), saturate_cast<D>(v.y), saturate_cast<D>(v.z), saturate_cast<D>(v.w));\r
                 }\r
             };\r
 \r
             template <typename VecD, typename VecS> static __device__ VecD saturate_cast_caller(const VecS& v)\r
             {\r
-                SatCast<\r
-                    \r
-                    VecTraits<VecD>::cn, \r
-                    \r
-                    VecD\r
-                > \r
-                \r
-                cast;\r
-                return cast(v);\r
+                return SatCast<VecTraits<VecD>::cn, VecD>::cast(v);\r
             }\r
 \r
             template<typename _Tp> static __device__ _Tp saturate_cast(const uchar1& v) {return saturate_cast_caller<_Tp>(v);}\r
index e38032b..c7c08e9 100644 (file)
@@ -107,7 +107,7 @@ protected:
 \r
             if (!compareMatches(matchesCPU, matchesGPU))\r
             {\r
-                ts->printf(CvTS::LOG, "Match FAIL");\r
+                ts->printf(CvTS::LOG, "Match FAIL\n");\r
                 ts->set_failed_test_info(CvTS::FAIL_MISMATCH);\r
                 return;\r
             }\r
@@ -119,7 +119,7 @@ protected:
 \r
             if (!compareMatches(knnMatchesCPU, knnMatchesGPU))\r
             {\r
-                ts->printf(CvTS::LOG, "KNN Match FAIL");\r
+                ts->printf(CvTS::LOG, "KNN Match FAIL\n");\r
                 ts->set_failed_test_info(CvTS::FAIL_MISMATCH);\r
                 return;\r
             }\r
@@ -131,7 +131,7 @@ protected:
 \r
             if (!compareMatches(radiusMatchesCPU, radiusMatchesGPU))\r
             {\r
-                ts->printf(CvTS::LOG, "Radius Match FAIL");\r
+                ts->printf(CvTS::LOG, "Radius Match FAIL\n");\r
                 ts->set_failed_test_info(CvTS::FAIL_MISMATCH);\r
                 return;\r
             }\r
index 47a6f50..67b9603 100644 (file)
@@ -80,7 +80,8 @@ protected:
 \r
         double res = norm(m1ROI, m2ROI, NORM_INF);\r
 \r
-        if (res <= 1)\r
+        // GPU GaussianBlur may differ from the CPU result by up to 2.0\r
+        if (res <= 2)\r
             return CvTS::OK;\r
         \r
         ts->printf(CvTS::LOG, "Norm: %f\n", res);\r
@@ -166,8 +167,6 @@ struct CV_GpuNppImageSobelTest : public CV_GpuNppFilterTest
 \r
     int test(const Mat& img)\r
     {\r
-        if (img.type() != CV_8UC1)\r
-            return CvTS::OK;\r
         int ksizes[] = {3, 5, 7};\r
         int ksizes_num = sizeof(ksizes) / sizeof(int);\r
 \r
@@ -183,10 +182,8 @@ struct CV_GpuNppImageSobelTest : public CV_GpuNppFilterTest
             cv::Sobel(img, cpudst, -1, dx, dy, ksizes[i]);\r
 \r
             GpuMat gpu1(img);\r
-            gpu1.convertTo(gpu1, CV_32S);\r
             GpuMat gpudst;\r
             cv::gpu::Sobel(gpu1, gpudst, -1, dx, dy, ksizes[i]);\r
-            gpudst.convertTo(gpudst, CV_8U);\r
 \r
             if (CheckNorm(cpudst, gpudst, Size(ksizes[i], ksizes[i])) != CvTS::OK)\r
                 test_res = CvTS::FAIL_GENERIC;\r
@@ -204,20 +201,15 @@ struct CV_GpuNppImageScharrTest : public CV_GpuNppFilterTest
 \r
     int test(const Mat& img)\r
     {\r
-        if (img.type() != CV_8UC1)\r
-            return CvTS::OK;\r
-\r
         int dx = 1, dy = 0;\r
 \r
         Mat cpudst;\r
         cv::Scharr(img, cpudst, -1, dx, dy);\r
 \r
         GpuMat gpu1(img);\r
-        gpu1.convertTo(gpu1, CV_32S);\r
         GpuMat gpudst;\r
         cv::gpu::Scharr(gpu1, gpudst, -1, dx, dy);\r
-        gpudst.convertTo(gpudst, CV_8U);\r
-        \r
+                \r
         return CheckNorm(cpudst, gpudst, Size(3, 3));\r
     }\r
 };\r
@@ -244,7 +236,7 @@ struct CV_GpuNppImageGaussianBlurTest : public CV_GpuNppFilterTest
             {\r
                 cv::Size ksize(ksizes[i], ksizes[j]);\r
 \r
-                ts->printf(CvTS::LOG, "ksize = (%dx%d)\t", ksizes[i], ksizes[j]);\r
+                ts->printf(CvTS::LOG, "ksize = (%dx%d)\t\n", ksizes[i], ksizes[j]);\r
 \r
                 Mat cpudst;\r
                 cv::GaussianBlur(img, cpudst, ksize, sigma1);\r