From 49ec8ba742d7ec7db96f9b40c4cf5a3b10cdfc0e Mon Sep 17 00:00:00 2001
From: Vladislav Vinogradov <no@email>
Date: Mon, 13 Dec 2010 08:43:04 +0000
Subject: [PATCH] fixed bug in gpu filter engine (incorrect buffer type) and in
 vector's saturate_cast. changed buffer type in linear filters to float. added
 support of 1 channel image to linear filters. added support of
 BORDER_REFLECT101, BORDER_REPLICATE and BORDER_CONSTANT border type to gpu
 linear filters. minor fix in tests. update comments in gpu.hpp.

---
 modules/gpu/include/opencv2/gpu/gpu.hpp        |  50 ++-
 modules/gpu/src/arithm.cpp                     |   4 +-
 modules/gpu/src/cuda/filters.cu                | 509 +++++++++++++++++++------
 modules/gpu/src/cuda/internal_shared.hpp       |   3 +-
 modules/gpu/src/filtering.cpp                  | 242 +++++++-----
 modules/gpu/src/imgproc_gpu.cpp                |   6 +
 modules/gpu/src/opencv2/gpu/device/vecmath.hpp | 149 ++++----
 tests/gpu/src/brute_force_matcher.cpp          |   6 +-
 tests/gpu/src/filters.cpp                      |  16 +-
 9 files changed, 676 insertions(+), 309 deletions(-)

diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp
index 87c7c68..dafa5e1 100644
--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@@ -388,7 +388,7 @@ namespace cv
         CV_EXPORTS void divide(const GpuMat& a, const Scalar& sc, GpuMat& c);
 
         //! transposes the matrix
-        //! supports only CV_8UC1 type
+        //! supports CV_8UC1, CV_8SC1, CV_8UC4, CV_8SC4, CV_16UC2, CV_16SC2, CV_32SC1, CV_32FC1 type
         CV_EXPORTS void transpose(const GpuMat& src1, GpuMat& dst);
 
         //! computes element-wise absolute difference of two arrays (c = abs(a - b))
@@ -725,11 +725,11 @@ namespace cv
         };
 
         //! returns the non-separable filter engine with the specified filter
-        CV_EXPORTS Ptr<FilterEngine_GPU> createFilter2D_GPU(const Ptr<BaseFilter_GPU> filter2D);
+        CV_EXPORTS Ptr<FilterEngine_GPU> createFilter2D_GPU(const Ptr<BaseFilter_GPU> filter2D, int srcType, int dstType);
 
         //! returns the separable filter engine with the specified filters
         CV_EXPORTS Ptr<FilterEngine_GPU> createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU>& rowFilter, 
-            const Ptr<BaseColumnFilter_GPU>& columnFilter);
+            const Ptr<BaseColumnFilter_GPU>& columnFilter, int srcType, int bufType, int dstType);
 
         //! returns horizontal 1D box filter
         //! supports only CV_8UC1 source type and CV_32FC1 sum type
@@ -767,23 +767,40 @@ namespace cv
         CV_EXPORTS Ptr<FilterEngine_GPU> createLinearFilter_GPU(int srcType, int dstType, const Mat& kernel, 
             const Point& anchor = Point(-1,-1));
 
-        //! returns the primitive row filter with the specified kernel
+        //! returns the primitive row filter with the specified kernel.
+        //! supports only CV_8UC1, CV_8UC4, CV_16SC1, CV_16SC2, CV_32SC1, CV_32FC1 source type.
+        //! there are two version of algorithm: NPP and OpenCV.
+        //! NPP calls when srcType == CV_8UC1 or srcType == CV_8UC4 and bufType == srcType,
+        //! otherwise calls OpenCV version.
+        //! NPP supports only BORDER_CONSTANT border type.
+        //! OpenCV version supports only CV_32F as buffer depth and 
+        //! BORDER_REFLECT101, BORDER_REPLICATE and BORDER_CONSTANT border types.
         CV_EXPORTS Ptr<BaseRowFilter_GPU> getLinearRowFilter_GPU(int srcType, int bufType, const Mat& rowKernel, 
-            int anchor = -1);
-
-        //! returns the primitive column filter with the specified kernel
+            int anchor = -1, int borderType = BORDER_CONSTANT);
+
+        //! returns the primitive column filter with the specified kernel.
+        //! supports only CV_8UC1, CV_8UC4, CV_16SC1, CV_16SC2, CV_32SC1, CV_32FC1 dst type.
+        //! there are two version of algorithm: NPP and OpenCV.
+        //! NPP calls when dstType == CV_8UC1 or dstType == CV_8UC4 and bufType == dstType,
+        //! otherwise calls OpenCV version.
+        //! NPP supports only BORDER_CONSTANT border type.
+        //! OpenCV version supports only CV_32F as buffer depth and 
+        //! BORDER_REFLECT101, BORDER_REPLICATE and BORDER_CONSTANT border types.
         CV_EXPORTS Ptr<BaseColumnFilter_GPU> getLinearColumnFilter_GPU(int bufType, int dstType, const Mat& columnKernel, 
-            int anchor = -1);
+            int anchor = -1, int borderType = BORDER_CONSTANT);
 
         //! returns the separable linear filter engine
         CV_EXPORTS Ptr<FilterEngine_GPU> createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat& rowKernel, 
-            const Mat& columnKernel, const Point& anchor = Point(-1,-1));
+            const Mat& columnKernel, const Point& anchor = Point(-1,-1), int rowBorderType = BORDER_DEFAULT,
+            int columnBorderType = -1);
 
         //! returns filter engine for the generalized Sobel operator
-        CV_EXPORTS Ptr<FilterEngine_GPU> createDerivFilter_GPU(int srcType, int dstType, int dx, int dy, int ksize);
+        CV_EXPORTS Ptr<FilterEngine_GPU> createDerivFilter_GPU(int srcType, int dstType, int dx, int dy, int ksize, 
+            int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);
 
         //! returns the Gaussian filter engine
-        CV_EXPORTS Ptr<FilterEngine_GPU> createGaussianFilter_GPU(int type, Size ksize, double sigma1, double sigma2 = 0);
+        CV_EXPORTS Ptr<FilterEngine_GPU> createGaussianFilter_GPU(int type, Size ksize, double sigma1, double sigma2 = 0, 
+            int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);
 
         //! returns maximum filter
         CV_EXPORTS Ptr<BaseFilter_GPU> getMaxFilter_GPU(int srcType, int dstType, const Size& ksize, Point anchor = Point(-1,-1));
@@ -812,16 +829,19 @@ namespace cv
 
         //! applies separable 2D linear filter to the image
         CV_EXPORTS void sepFilter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernelX, const Mat& kernelY, 
-            Point anchor = Point(-1,-1));
+            Point anchor = Point(-1,-1), int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);
 
         //! applies generalized Sobel operator to the image
-        CV_EXPORTS void Sobel(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, int ksize = 3, double scale = 1);
+        CV_EXPORTS void Sobel(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, int ksize = 3, double scale = 1, 
+            int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);
 
         //! applies the vertical or horizontal Scharr operator to the image
-        CV_EXPORTS void Scharr(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, double scale = 1);
+        CV_EXPORTS void Scharr(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, double scale = 1, 
+            int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);
 
         //! smooths the image using Gaussian filter.
-        CV_EXPORTS void GaussianBlur(const GpuMat& src, GpuMat& dst, Size ksize, double sigma1, double sigma2 = 0);
+        CV_EXPORTS void GaussianBlur(const GpuMat& src, GpuMat& dst, Size ksize, double sigma1, double sigma2 = 0, 
+            int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);
 
         //! applies Laplacian operator to the image
         //! supports only ksize = 1 and ksize = 3
diff --git a/modules/gpu/src/arithm.cpp b/modules/gpu/src/arithm.cpp
index 90edea1..3dcae2c 100644
--- a/modules/gpu/src/arithm.cpp
+++ b/modules/gpu/src/arithm.cpp
@@ -277,12 +277,12 @@ namespace cv { namespace gpu { namespace mathfunc
 
 void cv::gpu::transpose(const GpuMat& src, GpuMat& dst)
 {
-    CV_Assert(src.type() == CV_8UC1 || src.type() == CV_8UC4 || src.type() == CV_8SC4 
+    CV_Assert(src.type() == CV_8UC1 || src.type() == CV_8SC1 || src.type() == CV_8UC4 || src.type() == CV_8SC4 
         || src.type() == CV_16UC2 || src.type() == CV_16SC2 || src.type() == CV_32SC1 || src.type() == CV_32FC1);
 
     dst.create( src.cols, src.rows, src.type() );
 
-    if (src.type() == CV_8UC1)
+    if (src.type() == CV_8UC1 || src.type() == CV_8SC1)
     {
         NppiSize sz;
         sz.width  = src.cols;
diff --git a/modules/gpu/src/cuda/filters.cu b/modules/gpu/src/cuda/filters.cu
index d725d0d..035bab5 100644
--- a/modules/gpu/src/cuda/filters.cu
+++ b/modules/gpu/src/cuda/filters.cu
@@ -43,6 +43,7 @@
 #include "opencv2/gpu/devmem2d.hpp"
 #include "opencv2/gpu/device/saturate_cast.hpp"
 #include "opencv2/gpu/device/vecmath.hpp"
+#include "opencv2/gpu/device/limits_gpu.hpp"
 
 #include "safe_call.hpp"
 #include "internal_shared.hpp"
@@ -50,14 +51,198 @@
 using namespace cv::gpu;
 using namespace cv::gpu::device;
 
-#ifndef FLT_MAX
-#define FLT_MAX 3.402823466e+30F
-#endif
+namespace cv 
+{ 
+    namespace gpu 
+    {
+        namespace device
+        {
+            struct BrdReflect101 
+            {
+                explicit BrdReflect101(int len): last(len - 1) {}
+
+                __device__ int idx_low(int i) const
+                {
+                    return abs(i);
+                }
+
+                __device__ int idx_high(int i) const 
+                {
+                    return last - abs(last - i);
+                }
+
+                __device__ int idx(int i) const
+                {
+                    return abs(idx_high(i));
+                }
+
+                bool is_range_safe(int mini, int maxi) const 
+                {
+                    return -last <= mini && maxi <= 2 * last;
+                }
+
+                int last;
+            };
+            template <typename D>
+            struct BrdRowReflect101: BrdReflect101
+            {
+                explicit BrdRowReflect101(int len): BrdReflect101(len) {}
+
+                template <typename T>
+                __device__ D at_low(int i, const T* data) const 
+                {
+                    return saturate_cast<D>(data[idx_low(i)]);
+                }
+
+                template <typename T>
+                __device__ D at_high(int i, const T* data) const 
+                {
+                    return saturate_cast<D>(data[idx_high(i)]);
+                }
+            };
+            template <typename D>
+            struct BrdColReflect101: BrdReflect101
+            {
+                BrdColReflect101(int len, int step): BrdReflect101(len), step(step) {}
+
+                template <typename T>
+                __device__ D at_low(int i, const T* data) const 
+                {
+                    return saturate_cast<D>(data[idx_low(i) * step]);
+                }
+
+                template <typename T>
+                __device__ D at_high(int i, const T* data) const 
+                {
+                    return saturate_cast<D>(data[idx_high(i) * step]);
+                }
+
+                int step;
+            };
+
+            struct BrdReplicate
+            {
+                explicit BrdReplicate(int len): last(len - 1) {}
+
+                __device__ int idx_low(int i) const
+                {
+                    return max(i, 0);
+                }
+
+                __device__ int idx_high(int i) const 
+                {
+                    return min(i, last);
+                }
+
+                __device__ int idx(int i) const
+                {
+                    return max(min(i, last), 0);
+                }
+
+                bool is_range_safe(int mini, int maxi) const 
+                {
+                    return true;
+                }
+
+                int last;
+            };
+            template <typename D>
+            struct BrdRowReplicate: BrdReplicate
+            {
+                explicit BrdRowReplicate(int len): BrdReplicate(len) {}
+
+                template <typename T>
+                __device__ D at_low(int i, const T* data) const 
+                {
+                    return saturate_cast<D>(data[idx_low(i)]);
+                }
+
+                template <typename T>
+                __device__ D at_high(int i, const T* data) const 
+                {
+                    return saturate_cast<D>(data[idx_high(i)]);
+                }
+            };
+            template <typename D>
+            struct BrdColReplicate: BrdReplicate
+            {
+                BrdColReplicate(int len, int step): BrdReplicate(len), step(step) {}
+
+                template <typename T>
+                __device__ D at_low(int i, const T* data) const 
+                {
+                    return saturate_cast<D>(data[idx_low(i) * step]);
+                }
+
+                template <typename T>
+                __device__ D at_high(int i, const T* data) const 
+                {
+                    return saturate_cast<D>(data[idx_high(i) * step]);
+                }
+                int step;
+            };
+
+            template <typename D>
+            struct BrdRowConstant
+            {
+                explicit BrdRowConstant(int len_, const D& val_ = VecTraits<D>::all(0)): len(len_), val(val_) {}
+
+                template <typename T>
+                __device__ D at_low(int i, const T* data) const 
+                {
+                    return i >= 0 ? saturate_cast<D>(data[i]) : val;
+                }
+
+                template <typename T>
+                __device__ D at_high(int i, const T* data) const 
+                {
+                    return i < len ? saturate_cast<D>(data[i]) : val;
+                }
+
+                bool is_range_safe(int mini, int maxi) const 
+                {
+                    return true;
+                }
+
+                int len;
+                D val;
+            };
+            template <typename D>
+            struct BrdColConstant
+            {
+                BrdColConstant(int len_, int step_, const D& val_ = VecTraits<D>::all(0)): len(len_), step(step_), val(val_) {}
+
+                template <typename T>
+                __device__ D at_low(int i, const T* data) const 
+                {
+                    return i >= 0 ? saturate_cast<D>(data[i * step]) : val;
+                }
+
+                template <typename T>
+                __device__ D at_high(int i, const T* data) const 
+                {
+                    return i < len ? saturate_cast<D>(data[i * step]) : val;
+                }
+
+                bool is_range_safe(int mini, int maxi) const 
+                {
+                    return true;
+                }
+
+                int len;
+                int step;
+                D val;
+            };
+        }
+    }
+}
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
 // Linear filters
 
 #define MAX_KERNEL_SIZE 16
+#define BLOCK_DIM_X 16
+#define BLOCK_DIM_Y 16
 
 namespace filter_krnls
 {
@@ -74,46 +259,53 @@ namespace cv { namespace gpu { namespace filters
 
 namespace filter_krnls
 {
-    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, int KERNEL_SIZE, typename T, typename D>
-    __global__ void linearRowFilter(const T* src, size_t src_step, D* dst, size_t dst_step, int anchor, int width, int height)
+    template <typename T, size_t size> struct SmemType_
     {
-        __shared__ T smem[BLOCK_DIM_Y * BLOCK_DIM_X * 3];
-        
-        const int blockStartX = blockDim.x * blockIdx.x;
-        const int blockStartY = blockDim.y * blockIdx.y;
+        typedef typename TypeVec<float, VecTraits<T>::cn>::vec_t smem_t;
+    };
+    template <typename T> struct SmemType_<T, 4>
+    {
+        typedef T smem_t;
+    };
+    template <typename T> struct SmemType
+    {
+        typedef typename SmemType_<T, sizeof(T)>::smem_t smem_t;
+    };
 
-		const int threadX = blockStartX + threadIdx.x;
-        const int prevThreadX = threadX - blockDim.x;
-        const int nextThreadX = threadX + blockDim.x;
+    template <int ksize, typename T, typename D, typename B>
+    __global__ void linearRowFilter(const DevMem2D_<T> src, PtrStep_<D> dst, int anchor, const B b)
+    {
+        typedef typename SmemType<T>::smem_t smem_t;
 
-		const int threadY = blockStartY + threadIdx.y;
+        __shared__ smem_t smem[BLOCK_DIM_Y * BLOCK_DIM_X * 3];
+        
+		const int x = BLOCK_DIM_X * blockIdx.x + threadIdx.x;
+		const int y = BLOCK_DIM_Y * blockIdx.y + threadIdx.y;
 
-        T* sDataRow = smem + threadIdx.y * blockDim.x * 3;
+        smem_t* sDataRow = smem + threadIdx.y * BLOCK_DIM_X * 3;
 
-        if (threadY < height)
+        if (y < src.rows)
         {
-            const T* rowSrc = src + threadY * src_step;
-
-            sDataRow[threadIdx.x + blockDim.x] = threadX < width ? rowSrc[threadX] : VecTraits<T>::all(0);
+            const T* rowSrc = src.ptr(y);
 
-            sDataRow[threadIdx.x] = prevThreadX >= 0 ? rowSrc[prevThreadX] : VecTraits<T>::all(0);
-
-            sDataRow[(blockDim.x << 1) + threadIdx.x] = nextThreadX < width ? rowSrc[nextThreadX] : VecTraits<T>::all(0);
+            sDataRow[threadIdx.x                  ] = b.at_low(x - BLOCK_DIM_X, rowSrc);
+            sDataRow[threadIdx.x + BLOCK_DIM_X    ] = b.at_high(x, rowSrc);
+            sDataRow[threadIdx.x + BLOCK_DIM_X * 2] = b.at_high(x + BLOCK_DIM_X, rowSrc);
 
             __syncthreads();
 
-            if (threadX < width)
+            if (x < src.cols)
             {
                 typedef typename TypeVec<float, VecTraits<T>::cn>::vec_t sum_t;
                 sum_t sum = VecTraits<sum_t>::all(0);
 
-                sDataRow += threadIdx.x + blockDim.x - anchor;
+                sDataRow += threadIdx.x + BLOCK_DIM_X - anchor;
 
                 #pragma unroll
-                for(int i = 0; i < KERNEL_SIZE; ++i)
+                for(int i = 0; i < ksize; ++i)
                     sum = sum + sDataRow[i] * cLinearKernel[i];
 
-                dst[threadY * dst_step + threadX] = saturate_cast<D>(sum);
+                dst.ptr(y)[x] = saturate_cast<D>(sum);
             }
         }
     }
@@ -121,100 +313,138 @@ namespace filter_krnls
 
 namespace cv { namespace gpu { namespace filters
 {
-    template <int KERNEL_SIZE, typename T, typename D>
+    template <int ksize, typename T, typename D, template<typename> class B>
     void linearRowFilter_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor)
     {
-        const int BLOCK_DIM_X = 16;
-        const int BLOCK_DIM_Y = 16;
-
         dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y);
-        dim3 blocks(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y));
+        dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y));
 
-        filter_krnls::linearRowFilter<BLOCK_DIM_X, BLOCK_DIM_Y, KERNEL_SIZE><<<blocks, threads>>>(src.data, src.step/src.elemSize(), 
-            dst.data, dst.step/dst.elemSize(), anchor, src.cols, src.rows);
+        typedef typename filter_krnls::SmemType<T>::smem_t smem_t;
+        B<smem_t> b(src.cols);
+
+        if (!b.is_range_safe(-BLOCK_DIM_X, (grid.x + 1) * BLOCK_DIM_X - 1))
+        {
+            cv::gpu::error("linearRowFilter: can't use specified border extrapolation, image is too small, "
+                           "try bigger image or another border extrapolation mode", __FILE__, __LINE__);
+        }
+
+        filter_krnls::linearRowFilter<ksize, T, D><<<grid, threads>>>(src, dst, anchor, b);
 
         cudaSafeCall( cudaThreadSynchronize() );
     }
 
     template <typename T, typename D>
-    void linearRowFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor)
+    void linearRowFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type)
     {
         typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor);
-        static const caller_t callers[] = 
-        {linearRowFilter_caller<0 , T, D>, linearRowFilter_caller<1 , T, D>, 
-         linearRowFilter_caller<2 , T, D>, linearRowFilter_caller<3 , T, D>, 
-         linearRowFilter_caller<4 , T, D>, linearRowFilter_caller<5 , T, D>, 
-         linearRowFilter_caller<6 , T, D>, linearRowFilter_caller<7 , T, D>, 
-         linearRowFilter_caller<8 , T, D>, linearRowFilter_caller<9 , T, D>, 
-         linearRowFilter_caller<10, T, D>, linearRowFilter_caller<11, T, D>, 
-         linearRowFilter_caller<12, T, D>, linearRowFilter_caller<13, T, D>, 
-         linearRowFilter_caller<14, T, D>, linearRowFilter_caller<15, T, D>};
-
+        static const caller_t callers[3][17] = 
+        {
+            {
+                0, 
+                linearRowFilter_caller<1 , T, D, BrdRowReflect101>, 
+                linearRowFilter_caller<2 , T, D, BrdRowReflect101>,
+                linearRowFilter_caller<3 , T, D, BrdRowReflect101>, 
+                linearRowFilter_caller<4 , T, D, BrdRowReflect101>, 
+                linearRowFilter_caller<5 , T, D, BrdRowReflect101>, 
+                linearRowFilter_caller<6 , T, D, BrdRowReflect101>, 
+                linearRowFilter_caller<7 , T, D, BrdRowReflect101>,
+                linearRowFilter_caller<8 , T, D, BrdRowReflect101>,
+                linearRowFilter_caller<9 , T, D, BrdRowReflect101>, 
+                linearRowFilter_caller<10, T, D, BrdRowReflect101>, 
+                linearRowFilter_caller<11, T, D, BrdRowReflect101>, 
+                linearRowFilter_caller<12, T, D, BrdRowReflect101>, 
+                linearRowFilter_caller<13, T, D, BrdRowReflect101>, 
+                linearRowFilter_caller<14, T, D, BrdRowReflect101>,
+                linearRowFilter_caller<15, T, D, BrdRowReflect101>, 
+                linearRowFilter_caller<16, T, D, BrdRowReflect101>,
+            },            
+            {
+                0, 
+                linearRowFilter_caller<1 , T, D, BrdRowReplicate>, 
+                linearRowFilter_caller<2 , T, D, BrdRowReplicate>,
+                linearRowFilter_caller<3 , T, D, BrdRowReplicate>, 
+                linearRowFilter_caller<4 , T, D, BrdRowReplicate>, 
+                linearRowFilter_caller<5 , T, D, BrdRowReplicate>, 
+                linearRowFilter_caller<6 , T, D, BrdRowReplicate>, 
+                linearRowFilter_caller<7 , T, D, BrdRowReplicate>, 
+                linearRowFilter_caller<8 , T, D, BrdRowReplicate>,
+                linearRowFilter_caller<9 , T, D, BrdRowReplicate>, 
+                linearRowFilter_caller<10, T, D, BrdRowReplicate>, 
+                linearRowFilter_caller<11, T, D, BrdRowReplicate>, 
+                linearRowFilter_caller<12, T, D, BrdRowReplicate>, 
+                linearRowFilter_caller<13, T, D, BrdRowReplicate>, 
+                linearRowFilter_caller<14, T, D, BrdRowReplicate>,
+                linearRowFilter_caller<15, T, D, BrdRowReplicate>, 
+                linearRowFilter_caller<16, T, D, BrdRowReplicate>,
+            },            
+            {
+                0, 
+                linearRowFilter_caller<1 , T, D, BrdRowConstant>, 
+                linearRowFilter_caller<2 , T, D, BrdRowConstant>,
+                linearRowFilter_caller<3 , T, D, BrdRowConstant>, 
+                linearRowFilter_caller<4 , T, D, BrdRowConstant>, 
+                linearRowFilter_caller<5 , T, D, BrdRowConstant>, 
+                linearRowFilter_caller<6 , T, D, BrdRowConstant>, 
+                linearRowFilter_caller<7 , T, D, BrdRowConstant>, 
+                linearRowFilter_caller<8 , T, D, BrdRowConstant>,
+                linearRowFilter_caller<9 , T, D, BrdRowConstant>,
+                linearRowFilter_caller<10, T, D, BrdRowConstant>, 
+                linearRowFilter_caller<11, T, D, BrdRowConstant>, 
+                linearRowFilter_caller<12, T, D, BrdRowConstant>, 
+                linearRowFilter_caller<13, T, D, BrdRowConstant>,
+                linearRowFilter_caller<14, T, D, BrdRowConstant>,
+                linearRowFilter_caller<15, T, D, BrdRowConstant>, 
+                linearRowFilter_caller<16, T, D, BrdRowConstant>,
+            }
+        };
+        
         loadLinearKernel(kernel, ksize);
 
-        callers[ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor);
+        callers[brd_type][ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor);
     }
 
-    template void linearRowFilter_gpu<uchar4, uchar4>(const DevMem2D&, const DevMem2D&, const float[], int , int);
-    template void linearRowFilter_gpu<uchar4, char4>(const DevMem2D&, const DevMem2D&, const float[], int , int);
-    template void linearRowFilter_gpu<char4, uchar4>(const DevMem2D&, const DevMem2D&, const float[], int , int);
-    template void linearRowFilter_gpu<char4, char4>(const DevMem2D&, const DevMem2D&, const float[], int , int);
-
-    template void linearRowFilter_gpu<ushort2, ushort2>(const DevMem2D&, const DevMem2D&, const float[], int , int);
-    template void linearRowFilter_gpu<ushort2, short2>(const DevMem2D&, const DevMem2D&, const float[], int , int);
-    template void linearRowFilter_gpu<short2, ushort2>(const DevMem2D&, const DevMem2D&, const float[], int , int);
-    template void linearRowFilter_gpu<short2, short2>(const DevMem2D&, const DevMem2D&, const float[], int , int);
-
-    template void linearRowFilter_gpu<int, int>(const DevMem2D&, const DevMem2D&, const float[], int , int);
-    template void linearRowFilter_gpu<int, float>(const DevMem2D&, const DevMem2D&, const float[], int , int);
-    template void linearRowFilter_gpu<float, int>(const DevMem2D&, const DevMem2D&, const float[], int , int);
-    template void linearRowFilter_gpu<float, float>(const DevMem2D&, const DevMem2D&, const float[], int , int);
+    template void linearRowFilter_gpu<uchar , float >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type);
+    template void linearRowFilter_gpu<uchar4, float4>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type);
+    template void linearRowFilter_gpu<short , float >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type);;
+    template void linearRowFilter_gpu<short2, float2>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type);
+    template void linearRowFilter_gpu<int   , float >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type);
+    template void linearRowFilter_gpu<float , float >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type);
 }}}
 
 namespace filter_krnls
 {
-    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, int KERNEL_SIZE, typename T, typename D>
-    __global__ void linearColumnFilter(const T* src, size_t src_step, D* dst, size_t dst_step, int anchor, int width, int height)
+    template <int ksize, typename T, typename D, typename B>
+    __global__ void linearColumnFilter(const DevMem2D_<T> src, PtrStep_<D> dst, int anchor, const B b)
     {
         __shared__ T smem[BLOCK_DIM_Y * BLOCK_DIM_X * 3];
         
-        const int blockStartX = blockDim.x * blockIdx.x;
-        const int blockStartY = blockDim.y * blockIdx.y;
-
-		const int threadX = blockStartX + threadIdx.x;
-
-		const int threadY = blockStartY + threadIdx.y;
-        const int prevThreadY = threadY - blockDim.y;
-        const int nextThreadY = threadY + blockDim.y;
-
-        const int smem_step = blockDim.x;
+		const int x = BLOCK_DIM_X * blockIdx.x + threadIdx.x;
+		const int y = BLOCK_DIM_Y * blockIdx.y + threadIdx.y;
 
         T* sDataColumn = smem + threadIdx.x;
 
-        if (threadX < width)
+        if (x < src.cols)
         {
-            const T* colSrc = src + threadX;
+            const T* srcCol = src.ptr() + x;
 
-            sDataColumn[(threadIdx.y + blockDim.y) * smem_step] = threadY < height ? colSrc[threadY * src_step] : VecTraits<T>::all(0);
-
-            sDataColumn[threadIdx.y * smem_step] = prevThreadY >= 0 ? colSrc[prevThreadY * src_step] : VecTraits<T>::all(0);
-
-            sDataColumn[(threadIdx.y + (blockDim.y << 1)) * smem_step] = nextThreadY < height ? colSrc[nextThreadY * src_step] : VecTraits<T>::all(0);
+            sDataColumn[ threadIdx.y                    * BLOCK_DIM_X] = b.at_low(y - BLOCK_DIM_Y, srcCol);
+            sDataColumn[(threadIdx.y + BLOCK_DIM_Y)     * BLOCK_DIM_X] = b.at_high(y, srcCol);
+            sDataColumn[(threadIdx.y + BLOCK_DIM_Y * 2) * BLOCK_DIM_X] = b.at_high(y + BLOCK_DIM_Y, srcCol);
 
             __syncthreads();
 
-            if (threadY < height)
+            if (y < src.rows)
             {
                 typedef typename TypeVec<float, VecTraits<T>::cn>::vec_t sum_t;
                 sum_t sum = VecTraits<sum_t>::all(0);
 
-                sDataColumn += (threadIdx.y + blockDim.y - anchor)* smem_step;
+                sDataColumn += (threadIdx.y + BLOCK_DIM_Y - anchor) * BLOCK_DIM_X;
 
                 #pragma unroll
-                for(int i = 0; i < KERNEL_SIZE; ++i)
-                    sum = sum + sDataColumn[i * smem_step] * cLinearKernel[i];
+                for(int i = 0; i < ksize; ++i)
+                    sum = sum + sDataColumn[i * BLOCK_DIM_X] * cLinearKernel[i];
 
-                dst[threadY * dst_step + threadX] = saturate_cast<D>(sum);
+                dst.ptr(y)[x] = saturate_cast<D>(sum);
             }
         }
     }
@@ -222,54 +452,101 @@ namespace filter_krnls
 
 namespace cv { namespace gpu { namespace filters
 {
-    template <int KERNEL_SIZE, typename T, typename D>
+    template <int ksize, typename T, typename D, template<typename> class B>
     void linearColumnFilter_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor)
     {
-        const int BLOCK_DIM_X = 16;
-        const int BLOCK_DIM_Y = 16;
-
         dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y);
-        dim3 blocks(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y));
+        dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y));
+
+        B<T> b(src.rows, src.step / src.elemSize());
 
-        filter_krnls::linearColumnFilter<BLOCK_DIM_X, BLOCK_DIM_Y, KERNEL_SIZE><<<blocks, threads>>>(src.data, src.step/src.elemSize(), 
-            dst.data, dst.step/dst.elemSize(), anchor, src.cols, src.rows);
+        if (!b.is_range_safe(-BLOCK_DIM_Y, (grid.y + 1) * BLOCK_DIM_Y - 1))
+        {
+            cv::gpu::error("linearColumnFilter: can't use specified border extrapolation, image is too small, "
+                           "try bigger image or another border extrapolation mode", __FILE__, __LINE__);
+        }
+
+        filter_krnls::linearColumnFilter<ksize, T, D><<<grid, threads>>>(src, dst, anchor, b);
 
         cudaSafeCall( cudaThreadSynchronize() );
     }
 
     template <typename T, typename D>
-    void linearColumnFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor)
+    void linearColumnFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type)
     {
         typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor);
-        static const caller_t callers[] = 
-        {linearColumnFilter_caller<0 , T, D>, linearColumnFilter_caller<1 , T, D>, 
-         linearColumnFilter_caller<2 , T, D>, linearColumnFilter_caller<3 , T, D>, 
-         linearColumnFilter_caller<4 , T, D>, linearColumnFilter_caller<5 , T, D>, 
-         linearColumnFilter_caller<6 , T, D>, linearColumnFilter_caller<7 , T, D>, 
-         linearColumnFilter_caller<8 , T, D>, linearColumnFilter_caller<9 , T, D>, 
-         linearColumnFilter_caller<10, T, D>, linearColumnFilter_caller<11, T, D>, 
-         linearColumnFilter_caller<12, T, D>, linearColumnFilter_caller<13, T, D>, 
-         linearColumnFilter_caller<14, T, D>, linearColumnFilter_caller<15, T, D>};
-
+        static const caller_t callers[3][17] = 
+        {
+            {
+                0, 
+                linearColumnFilter_caller<1 , T, D, BrdColReflect101>, 
+                linearColumnFilter_caller<2 , T, D, BrdColReflect101>,
+                linearColumnFilter_caller<3 , T, D, BrdColReflect101>, 
+                linearColumnFilter_caller<4 , T, D, BrdColReflect101>, 
+                linearColumnFilter_caller<5 , T, D, BrdColReflect101>, 
+                linearColumnFilter_caller<6 , T, D, BrdColReflect101>, 
+                linearColumnFilter_caller<7 , T, D, BrdColReflect101>, 
+                linearColumnFilter_caller<8 , T, D, BrdColReflect101>, 
+                linearColumnFilter_caller<9 , T, D, BrdColReflect101>, 
+                linearColumnFilter_caller<10, T, D, BrdColReflect101>, 
+                linearColumnFilter_caller<11, T, D, BrdColReflect101>, 
+                linearColumnFilter_caller<12, T, D, BrdColReflect101>, 
+                linearColumnFilter_caller<13, T, D, BrdColReflect101>, 
+                linearColumnFilter_caller<14, T, D, BrdColReflect101>, 
+                linearColumnFilter_caller<15, T, D, BrdColReflect101>, 
+                linearColumnFilter_caller<16, T, D, BrdColReflect101>, 
+            },            
+            {
+                0, 
+                linearColumnFilter_caller<1 , T, D, BrdColReplicate>, 
+                linearColumnFilter_caller<2 , T, D, BrdColReplicate>,
+                linearColumnFilter_caller<3 , T, D, BrdColReplicate>, 
+                linearColumnFilter_caller<4 , T, D, BrdColReplicate>, 
+                linearColumnFilter_caller<5 , T, D, BrdColReplicate>, 
+                linearColumnFilter_caller<6 , T, D, BrdColReplicate>, 
+                linearColumnFilter_caller<7 , T, D, BrdColReplicate>, 
+                linearColumnFilter_caller<8 , T, D, BrdColReplicate>, 
+                linearColumnFilter_caller<9 , T, D, BrdColReplicate>, 
+                linearColumnFilter_caller<10, T, D, BrdColReplicate>, 
+                linearColumnFilter_caller<11, T, D, BrdColReplicate>, 
+                linearColumnFilter_caller<12, T, D, BrdColReplicate>, 
+                linearColumnFilter_caller<13, T, D, BrdColReplicate>, 
+                linearColumnFilter_caller<14, T, D, BrdColReplicate>, 
+                linearColumnFilter_caller<15, T, D, BrdColReplicate>, 
+                linearColumnFilter_caller<16, T, D, BrdColReplicate>, 
+            },            
+            {
+                0, 
+                linearColumnFilter_caller<1 , T, D, BrdColConstant>, 
+                linearColumnFilter_caller<2 , T, D, BrdColConstant>,
+                linearColumnFilter_caller<3 , T, D, BrdColConstant>, 
+                linearColumnFilter_caller<4 , T, D, BrdColConstant>, 
+                linearColumnFilter_caller<5 , T, D, BrdColConstant>, 
+                linearColumnFilter_caller<6 , T, D, BrdColConstant>, 
+                linearColumnFilter_caller<7 , T, D, BrdColConstant>, 
+                linearColumnFilter_caller<8 , T, D, BrdColConstant>, 
+                linearColumnFilter_caller<9 , T, D, BrdColConstant>, 
+                linearColumnFilter_caller<10, T, D, BrdColConstant>, 
+                linearColumnFilter_caller<11, T, D, BrdColConstant>, 
+                linearColumnFilter_caller<12, T, D, BrdColConstant>, 
+                linearColumnFilter_caller<13, T, D, BrdColConstant>, 
+                linearColumnFilter_caller<14, T, D, BrdColConstant>, 
+                linearColumnFilter_caller<15, T, D, BrdColConstant>, 
+                linearColumnFilter_caller<16, T, D, BrdColConstant>, 
+            }
+        };
+        
         loadLinearKernel(kernel, ksize);
 
-        callers[ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor);
+        callers[brd_type][ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor);
     }
 
-    template void linearColumnFilter_gpu<uchar4, uchar4>(const DevMem2D&, const DevMem2D&, const float[], int , int);
-    template void linearColumnFilter_gpu<uchar4, char4>(const DevMem2D&, const DevMem2D&, const float[], int , int);
-    template void linearColumnFilter_gpu<char4, uchar4>(const DevMem2D&, const DevMem2D&, const float[], int , int);
-    template void linearColumnFilter_gpu<char4, char4>(const DevMem2D&, const DevMem2D&, const float[], int , int);
-
-    template void linearColumnFilter_gpu<ushort2, ushort2>(const DevMem2D&, const DevMem2D&, const float[], int , int);
-    template void linearColumnFilter_gpu<ushort2, short2>(const DevMem2D&, const DevMem2D&, const float[], int , int);
-    template void linearColumnFilter_gpu<short2, ushort2>(const DevMem2D&, const DevMem2D&, const float[], int , int);
-    template void linearColumnFilter_gpu<short2, short2>(const DevMem2D&, const DevMem2D&, const float[], int , int);
-
-    template void linearColumnFilter_gpu<int, int>(const DevMem2D&, const DevMem2D&, const float[], int , int);
-    template void linearColumnFilter_gpu<int, float>(const DevMem2D&, const DevMem2D&, const float[], int , int);
-    template void linearColumnFilter_gpu<float, int>(const DevMem2D&, const DevMem2D&, const float[], int , int);
-    template void linearColumnFilter_gpu<float, float>(const DevMem2D&, const DevMem2D&, const float[], int , int);
+    template void linearColumnFilter_gpu<float , uchar >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type);
+    template void linearColumnFilter_gpu<float4, uchar4>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type);
+    template void linearColumnFilter_gpu<float , short >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type);
+    template void linearColumnFilter_gpu<float2, short2>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type);
+    template void linearColumnFilter_gpu<float , int   >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type);
+    template void linearColumnFilter_gpu<float , float >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type);
 }}}
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
@@ -377,7 +654,7 @@ namespace bf_krnls
                     }
                 }
 
-                float minimum = FLT_MAX;
+                float minimum = numeric_limits_gpu<float>::max();
                 int id = 0;
 
                 if (cost[0] < minimum)
diff --git a/modules/gpu/src/cuda/internal_shared.hpp b/modules/gpu/src/cuda/internal_shared.hpp
index e52ba4e..6bf060d 100644
--- a/modules/gpu/src/cuda/internal_shared.hpp
+++ b/modules/gpu/src/cuda/internal_shared.hpp
@@ -59,7 +59,8 @@ namespace cv
         enum 
         {
             BORDER_REFLECT101_GPU = 0,
-            BORDER_REPLICATE_GPU
+            BORDER_REPLICATE_GPU,
+            BORDER_CONSTANT_GPU
         };
         
         // Converts CPU border extrapolation mode into GPU internal analogue.
diff --git a/modules/gpu/src/filtering.cpp b/modules/gpu/src/filtering.cpp
index b8dcaaf..b611e49 100644
--- a/modules/gpu/src/filtering.cpp
+++ b/modules/gpu/src/filtering.cpp
@@ -48,8 +48,8 @@ using namespace cv::gpu;
 
 #if !defined (HAVE_CUDA)
 
-Ptr<FilterEngine_GPU> cv::gpu::createFilter2D_GPU(const Ptr<BaseFilter_GPU>) { throw_nogpu(); return Ptr<FilterEngine_GPU>(0); }
-Ptr<FilterEngine_GPU> cv::gpu::createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU>&, const Ptr<BaseColumnFilter_GPU>&) { throw_nogpu(); return Ptr<FilterEngine_GPU>(0); }
+Ptr<FilterEngine_GPU> cv::gpu::createFilter2D_GPU(const Ptr<BaseFilter_GPU>, int, int) { throw_nogpu(); return Ptr<FilterEngine_GPU>(0); }
+Ptr<FilterEngine_GPU> cv::gpu::createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU>&, const Ptr<BaseColumnFilter_GPU>&, int, int, int) { throw_nogpu(); return Ptr<FilterEngine_GPU>(0); }
 Ptr<BaseRowFilter_GPU> cv::gpu::getRowSumFilter_GPU(int, int, int, int) { throw_nogpu(); return Ptr<BaseRowFilter_GPU>(0); }
 Ptr<BaseColumnFilter_GPU> cv::gpu::getColumnSumFilter_GPU(int, int, int, int) { throw_nogpu(); return Ptr<BaseColumnFilter_GPU>(0); }
 Ptr<BaseFilter_GPU> cv::gpu::getBoxFilter_GPU(int, int, const Size&, Point) { throw_nogpu(); return Ptr<BaseFilter_GPU>(0); }
@@ -58,11 +58,11 @@ Ptr<BaseFilter_GPU> cv::gpu::getMorphologyFilter_GPU(int, int, const Mat&, const
 Ptr<FilterEngine_GPU> cv::gpu::createMorphologyFilter_GPU(int, int, const Mat&, const Point&, int) { throw_nogpu(); return Ptr<FilterEngine_GPU>(0); }
 Ptr<BaseFilter_GPU> cv::gpu::getLinearFilter_GPU(int, int, const Mat&, const Size&, Point) { throw_nogpu(); return Ptr<BaseFilter_GPU>(0); }
 Ptr<FilterEngine_GPU> cv::gpu::createLinearFilter_GPU(int, int, const Mat&, const Point&) { throw_nogpu(); return Ptr<FilterEngine_GPU>(0); }
-Ptr<BaseRowFilter_GPU> cv::gpu::getLinearRowFilter_GPU(int, int, const Mat&, int) { throw_nogpu(); return Ptr<BaseRowFilter_GPU>(0); }
-Ptr<BaseColumnFilter_GPU> cv::gpu::getLinearColumnFilter_GPU(int, int, const Mat&, int) { throw_nogpu(); return Ptr<BaseColumnFilter_GPU>(0); }
-Ptr<FilterEngine_GPU> cv::gpu::createSeparableLinearFilter_GPU(int, int, const Mat&, const Mat&, const Point&) { throw_nogpu(); return Ptr<FilterEngine_GPU>(0); }
-Ptr<FilterEngine_GPU> cv::gpu::createDerivFilter_GPU(int, int, int, int, int) { throw_nogpu(); return Ptr<FilterEngine_GPU>(0); }
-Ptr<FilterEngine_GPU> cv::gpu::createGaussianFilter_GPU(int, Size, double, double) { throw_nogpu(); return Ptr<FilterEngine_GPU>(0); }
+Ptr<BaseRowFilter_GPU> cv::gpu::getLinearRowFilter_GPU(int, int, const Mat&, int, int) { throw_nogpu(); return Ptr<BaseRowFilter_GPU>(0); }
+Ptr<BaseColumnFilter_GPU> cv::gpu::getLinearColumnFilter_GPU(int, int, const Mat&, int, int) { throw_nogpu(); return Ptr<BaseColumnFilter_GPU>(0); }
+Ptr<FilterEngine_GPU> cv::gpu::createSeparableLinearFilter_GPU(int, int, const Mat&, const Mat&, const Point&, int, int) { throw_nogpu(); return Ptr<FilterEngine_GPU>(0); }
+Ptr<FilterEngine_GPU> cv::gpu::createDerivFilter_GPU(int, int, int, int, int, int, int) { throw_nogpu(); return Ptr<FilterEngine_GPU>(0); }
+Ptr<FilterEngine_GPU> cv::gpu::createGaussianFilter_GPU(int, Size, double, double, int, int) { throw_nogpu(); return Ptr<FilterEngine_GPU>(0); }
 Ptr<BaseFilter_GPU> cv::gpu::getMaxFilter_GPU(int, int, const Size&, Point) { throw_nogpu(); return Ptr<BaseFilter_GPU>(0); }
 Ptr<BaseFilter_GPU> cv::gpu::getMinFilter_GPU(int, int, const Size&, Point) { throw_nogpu(); return Ptr<BaseFilter_GPU>(0); }
 
@@ -71,10 +71,10 @@ void cv::gpu::erode( const GpuMat&, GpuMat&, const Mat&, Point, int) { throw_nog
 void cv::gpu::dilate( const GpuMat&, GpuMat&, const Mat&, Point, int) { throw_nogpu(); }
 void cv::gpu::morphologyEx( const GpuMat&, GpuMat&, int, const Mat&, Point, int) { throw_nogpu(); }
 void cv::gpu::filter2D(const GpuMat&, GpuMat&, int, const Mat&, Point) { throw_nogpu(); }
-void cv::gpu::sepFilter2D(const GpuMat&, GpuMat&, int, const Mat&, const Mat&, Point) { throw_nogpu(); }
-void cv::gpu::Sobel(const GpuMat&, GpuMat&, int, int, int, int, double) { throw_nogpu(); }
-void cv::gpu::Scharr(const GpuMat&, GpuMat&, int, int, int, double) { throw_nogpu(); }
-void cv::gpu::GaussianBlur(const GpuMat&, GpuMat&, Size, double, double) { throw_nogpu(); }
+void cv::gpu::sepFilter2D(const GpuMat&, GpuMat&, int, const Mat&, const Mat&, Point, int, int) { throw_nogpu(); }
+void cv::gpu::Sobel(const GpuMat&, GpuMat&, int, int, int, int, double, int, int) { throw_nogpu(); }
+void cv::gpu::Scharr(const GpuMat&, GpuMat&, int, int, int, double, int, int) { throw_nogpu(); }
+void cv::gpu::GaussianBlur(const GpuMat&, GpuMat&, Size, double, double, int, int) { throw_nogpu(); }
 void cv::gpu::Laplacian(const GpuMat&, GpuMat&, int, int, double) { throw_nogpu(); }
 
 #else
@@ -133,13 +133,17 @@ namespace
     class Filter2DEngine_GPU : public FilterEngine_GPU
     {
     public:
-        Filter2DEngine_GPU(const Ptr<BaseFilter_GPU>& filter2D_) : filter2D(filter2D_) {}
+        Filter2DEngine_GPU(const Ptr<BaseFilter_GPU>& filter2D_, int srcType_, int dstType_) : 
+            filter2D(filter2D_), srcType(srcType_), dstType(dstType_)
+        {}
 
         virtual void apply(const GpuMat& src, GpuMat& dst, Rect roi = Rect(0,0,-1,-1))
         {
+            CV_Assert(src.type() == srcType);
+
             Size src_size = src.size();
 
-            dst.create(src_size, src.type());
+            dst.create(src_size, dstType);
             dst = Scalar(0.0);
 
             normalizeROI(roi, filter2D->ksize, filter2D->anchor, src_size);
@@ -151,12 +155,13 @@ namespace
         }
 
         Ptr<BaseFilter_GPU> filter2D;
+        int srcType, dstType;
     };
 }
 
-Ptr<FilterEngine_GPU> cv::gpu::createFilter2D_GPU(const Ptr<BaseFilter_GPU> filter2D)
+Ptr<FilterEngine_GPU> cv::gpu::createFilter2D_GPU(const Ptr<BaseFilter_GPU> filter2D, int srcType, int dstType)
 {
-    return Ptr<FilterEngine_GPU>(new Filter2DEngine_GPU(filter2D));
+    return Ptr<FilterEngine_GPU>(new Filter2DEngine_GPU(filter2D, srcType, dstType));
 }
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -168,8 +173,9 @@ namespace
     {
     public:
         SeparableFilterEngine_GPU(const Ptr<BaseRowFilter_GPU>& rowFilter_, 
-                                       const Ptr<BaseColumnFilter_GPU>& columnFilter_) :
-            rowFilter(rowFilter_), columnFilter(columnFilter_)
+            const Ptr<BaseColumnFilter_GPU>& columnFilter_, int srcType_, int bufType_, int dstType_) :
+            rowFilter(rowFilter_), columnFilter(columnFilter_), 
+            srcType(srcType_), bufType(bufType_), dstType(dstType_)
         {
             ksize = Size(rowFilter->ksize, columnFilter->ksize);
             anchor = Point(rowFilter->anchor, columnFilter->anchor);
@@ -177,19 +183,20 @@ namespace
 
         virtual void apply(const GpuMat& src, GpuMat& dst, Rect roi = Rect(0,0,-1,-1))
         {
+            CV_Assert(src.type() == srcType);
+
             Size src_size = src.size();
-            int src_type = src.type();
 
-            dst.create(src_size, src_type);
+            dst.create(src_size, dstType);
             dst = Scalar(0.0);
-            dstBuf.create(src_size, src_type);
+            dstBuf.create(src_size, bufType);
             dstBuf = Scalar(0.0);
 
             normalizeROI(roi, ksize, anchor, src_size);
 
-            srcROI = src(roi);
-            dstROI = dst(roi);
-            dstBufROI = dstBuf(roi);
+            GpuMat srcROI = src(roi);
+            GpuMat dstROI = dst(roi);
+            GpuMat dstBufROI = dstBuf(roi);
             
             (*rowFilter)(srcROI, dstBufROI);
             (*columnFilter)(dstBufROI, dstROI);
@@ -197,19 +204,19 @@ namespace
 
         Ptr<BaseRowFilter_GPU> rowFilter;
         Ptr<BaseColumnFilter_GPU> columnFilter;
+        int srcType, bufType, dstType;
+
         Size ksize;
         Point anchor;
-        GpuMat dstBuf;
-        GpuMat srcROI;
-        GpuMat dstROI;
-        GpuMat dstBufROI;
+
+        GpuMat dstBuf;        
     };
 }
 
 Ptr<FilterEngine_GPU> cv::gpu::createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU>& rowFilter, 
-    const Ptr<BaseColumnFilter_GPU>& columnFilter)
+    const Ptr<BaseColumnFilter_GPU>& columnFilter, int srcType, int bufType, int dstType)
 {
-    return Ptr<FilterEngine_GPU>(new SeparableFilterEngine_GPU(rowFilter, columnFilter));
+    return Ptr<FilterEngine_GPU>(new SeparableFilterEngine_GPU(rowFilter, columnFilter, srcType, bufType, dstType));
 }
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -315,7 +322,7 @@ Ptr<BaseFilter_GPU> cv::gpu::getBoxFilter_GPU(int srcType, int dstType, const Si
 Ptr<FilterEngine_GPU> cv::gpu::createBoxFilter_GPU(int srcType, int dstType, const Size& ksize, const Point& anchor)
 {
     Ptr<BaseFilter_GPU> boxFilter = getBoxFilter_GPU(srcType, dstType, ksize, anchor);
-    return createFilter2D_GPU(boxFilter);
+    return createFilter2D_GPU(boxFilter, srcType, dstType);
 }
 
 void cv::gpu::boxFilter(const GpuMat& src, GpuMat& dst, int ddepth, Size ksize, Point anchor)
@@ -386,8 +393,8 @@ namespace
     class MorphologyFilterEngine_GPU : public Filter2DEngine_GPU
     {
     public:
-        MorphologyFilterEngine_GPU(const Ptr<BaseFilter_GPU>& filter2D_, int iters_) : 
-          Filter2DEngine_GPU(filter2D_), iters(iters_) {}
+        MorphologyFilterEngine_GPU(const Ptr<BaseFilter_GPU>& filter2D_, int type, int iters_) : 
+          Filter2DEngine_GPU(filter2D_, type, type), iters(iters_) {}
 
         virtual void apply(const GpuMat& src, GpuMat& dst, Rect roi = Rect(0,0,-1,-1))
         {
@@ -415,7 +422,7 @@ Ptr<FilterEngine_GPU> cv::gpu::createMorphologyFilter_GPU(int op, int type, cons
 
     Ptr<BaseFilter_GPU> filter2D = getMorphologyFilter_GPU(op, type, kernel, ksize, anchor);
 
-    return Ptr<FilterEngine_GPU>(new MorphologyFilterEngine_GPU(filter2D, iterations));
+    return Ptr<FilterEngine_GPU>(new MorphologyFilterEngine_GPU(filter2D, type, iterations));
 }
 
 namespace
@@ -558,7 +565,7 @@ Ptr<FilterEngine_GPU> cv::gpu::createLinearFilter_GPU(int srcType, int dstType,
 
     Ptr<BaseFilter_GPU> linearFilter = getLinearFilter_GPU(srcType, dstType, kernel, ksize, anchor);
 
-    return createFilter2D_GPU(linearFilter);
+    return createFilter2D_GPU(linearFilter, srcType, dstType);
 }
 
 void cv::gpu::filter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernel, Point anchor)
@@ -578,10 +585,10 @@ void cv::gpu::filter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& ke
 namespace cv { namespace gpu { namespace filters
 {
     template <typename T, typename D>
-    void linearRowFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor);
+    void linearRowFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type);
 
     template <typename T, typename D>
-    void linearColumnFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor);
+    void linearColumnFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type);
 }}}
 
 namespace
@@ -589,7 +596,7 @@ namespace
     typedef NppStatus (*nppFilter1D_t)(const Npp8u * pSrc, Npp32s nSrcStep, Npp8u * pDst, Npp32s nDstStep, NppiSize oROI, 
         const Npp32s * pKernel, Npp32s nMaskSize, Npp32s nAnchor, Npp32s nDivisor);
 
-    typedef void (*gpuFilter1D_t)(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor);
+    typedef void (*gpuFilter1D_t)(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type);
 
     class NppLinearRowFilter : public BaseRowFilter_GPU
     {
@@ -614,35 +621,28 @@ namespace
     class GpuLinearRowFilter : public BaseRowFilter_GPU
     {
     public:
-        GpuLinearRowFilter(int ksize_, int anchor_, const Mat& kernel_, gpuFilter1D_t func_) : 
-            BaseRowFilter_GPU(ksize_, anchor_), kernel(kernel_), func(func_) {}
+        GpuLinearRowFilter(int ksize_, int anchor_, const Mat& kernel_, gpuFilter1D_t func_, int brd_type_) : 
+            BaseRowFilter_GPU(ksize_, anchor_), kernel(kernel_), func(func_), brd_type(brd_type_) {}
 
         virtual void operator()(const GpuMat& src, GpuMat& dst)
         {
-            func(src, dst, kernel.ptr<float>(), ksize, anchor);
+            func(src, dst, kernel.ptr<float>(), ksize, anchor, brd_type);
         }
 
         Mat kernel;
         gpuFilter1D_t func;
+        int brd_type;
     };
 }
 
-Ptr<BaseRowFilter_GPU> cv::gpu::getLinearRowFilter_GPU(int srcType, int bufType, const Mat& rowKernel, int anchor)
+Ptr<BaseRowFilter_GPU> cv::gpu::getLinearRowFilter_GPU(int srcType, int bufType, const Mat& rowKernel, int anchor, int borderType)
 {
-    using namespace cv::gpu::filters;
     static const nppFilter1D_t nppFilter1D_callers[] = {0, nppiFilterRow_8u_C1R, 0, 0, nppiFilterRow_8u_C4R};
-    static const gpuFilter1D_t gpuFilter1D_callers[6][6] =
-    {
-        {linearRowFilter_gpu<uchar4, uchar4>,linearRowFilter_gpu<uchar4, char4>,0,0,0,0},
-        {linearRowFilter_gpu<char4, uchar4>,linearRowFilter_gpu<char4, char4>,0,0,0,0},
-        {0,0,linearRowFilter_gpu<ushort2, ushort2>,linearRowFilter_gpu<ushort2, short2>,0,0},
-        {0,0,linearRowFilter_gpu<short2, ushort2>,linearRowFilter_gpu<short2, short2>,0,0},
-        {0,0,0,0,linearRowFilter_gpu<int, int>, linearRowFilter_gpu<int, float>},
-        {0,0,0,0,linearRowFilter_gpu<float, int>, linearRowFilter_gpu<float, float>}
-    };
     
     if ((bufType == srcType) && (srcType == CV_8UC1 || srcType == CV_8UC4))
     {
+        CV_Assert(borderType == BORDER_CONSTANT);
+
         GpuMat gpu_row_krnl;
         int nDivisor;
         normalizeKernel(rowKernel, gpu_row_krnl, CV_32S, &nDivisor, true);
@@ -653,9 +653,15 @@ Ptr<BaseRowFilter_GPU> cv::gpu::getLinearRowFilter_GPU(int srcType, int bufType,
         return Ptr<BaseRowFilter_GPU>(new NppLinearRowFilter(ksize, anchor, gpu_row_krnl, nDivisor,
             nppFilter1D_callers[CV_MAT_CN(srcType)]));
     }
+    
+    CV_Assert(borderType == BORDER_REFLECT101 || borderType == BORDER_REPLICATE || borderType == BORDER_CONSTANT);
+    int gpuBorderType;
+    CV_Assert(tryConvertToGpuBorderType(borderType, gpuBorderType));
 
-    CV_Assert(srcType == CV_8UC4 || srcType == CV_8SC4 || srcType == CV_16UC2 || srcType == CV_16SC2 || srcType == CV_32SC1 || srcType == CV_32FC1);
-    CV_Assert(bufType == CV_8UC4 || bufType == CV_8SC4 || bufType == CV_16UC2 || bufType == CV_16SC2 || bufType == CV_32SC1 || bufType == CV_32FC1);
+    CV_Assert(srcType == CV_8UC1 || srcType == CV_8UC4 || srcType == CV_16SC1 || srcType == CV_16SC2 
+        || srcType == CV_32SC1 || srcType == CV_32FC1);
+
+    CV_Assert(CV_MAT_DEPTH(bufType) == CV_32F && CV_MAT_CN(srcType) == CV_MAT_CN(bufType));
 
     Mat temp(rowKernel.size(), CV_32FC1);
     rowKernel.convertTo(temp, CV_32FC1);
@@ -663,12 +669,35 @@ Ptr<BaseRowFilter_GPU> cv::gpu::getLinearRowFilter_GPU(int srcType, int bufType,
 
     int ksize = cont_krnl.cols;
 
-    CV_Assert(ksize < 16);
+    CV_Assert(ksize > 0 && ksize <= 16);
 
     normalizeAnchor(anchor, ksize);
 
-    return Ptr<BaseRowFilter_GPU>(new GpuLinearRowFilter(ksize, anchor, cont_krnl, 
-        gpuFilter1D_callers[CV_MAT_DEPTH(srcType)][CV_MAT_DEPTH(bufType)]));
+    gpuFilter1D_t func = 0;
+
+    switch (srcType)
+    {
+    case CV_8UC1:
+        func = filters::linearRowFilter_gpu<uchar, float>;
+        break;
+    case CV_8UC4:
+        func = filters::linearRowFilter_gpu<uchar4, float4>;
+        break;
+    case CV_16SC1:
+        func = filters::linearRowFilter_gpu<short, float>;
+        break;
+    case CV_16SC2:
+        func = filters::linearRowFilter_gpu<short2, float2>;
+        break;
+    case CV_32SC1:
+        func = filters::linearRowFilter_gpu<int, float>;
+        break;
+    case CV_32FC1:
+        func = filters::linearRowFilter_gpu<float, float>;
+        break;
+    }
+
+    return Ptr<BaseRowFilter_GPU>(new GpuLinearRowFilter(ksize, anchor, cont_krnl, func, gpuBorderType));
 }
 
 namespace
@@ -696,35 +725,28 @@ namespace
     class GpuLinearColumnFilter : public BaseColumnFilter_GPU
     {
     public:
-        GpuLinearColumnFilter(int ksize_, int anchor_, const Mat& kernel_, gpuFilter1D_t func_) : 
-            BaseColumnFilter_GPU(ksize_, anchor_), kernel(kernel_), func(func_) {}
+        GpuLinearColumnFilter(int ksize_, int anchor_, const Mat& kernel_, gpuFilter1D_t func_, int brd_type_) : 
+            BaseColumnFilter_GPU(ksize_, anchor_), kernel(kernel_), func(func_), brd_type(brd_type_) {}
 
         virtual void operator()(const GpuMat& src, GpuMat& dst)
         {
-            func(src, dst, kernel.ptr<float>(), ksize, anchor);
+            func(src, dst, kernel.ptr<float>(), ksize, anchor, brd_type);
         }
 
         Mat kernel;
         gpuFilter1D_t func;
+        int brd_type;
     };
 }
 
-Ptr<BaseColumnFilter_GPU> cv::gpu::getLinearColumnFilter_GPU(int bufType, int dstType, const Mat& columnKernel, int anchor)
+Ptr<BaseColumnFilter_GPU> cv::gpu::getLinearColumnFilter_GPU(int bufType, int dstType, const Mat& columnKernel, int anchor, int borderType)
 {
-    using namespace cv::gpu::filters;
     static const nppFilter1D_t nppFilter1D_callers[] = {0, nppiFilterColumn_8u_C1R, 0, 0, nppiFilterColumn_8u_C4R};
-    static const gpuFilter1D_t gpuFilter1D_callers[6][6] =
-    {
-        {linearColumnFilter_gpu<uchar4, uchar4>,linearColumnFilter_gpu<uchar4, char4>,0,0,0,0},
-        {linearColumnFilter_gpu<char4, uchar4>,linearColumnFilter_gpu<char4, char4>,0,0,0,0},
-        {0,0,linearColumnFilter_gpu<ushort2, ushort2>,linearColumnFilter_gpu<ushort2, short2>,0,0},
-        {0,0,linearColumnFilter_gpu<short2, ushort2>,linearColumnFilter_gpu<short2, short2>,0,0},
-        {0,0,0,0,linearColumnFilter_gpu<int, int>, linearColumnFilter_gpu<int, float>},
-        {0,0,0,0,linearColumnFilter_gpu<float, int>, linearColumnFilter_gpu<float, float>}
-    };
     
     if ((bufType == dstType) && (bufType == CV_8UC1 || bufType == CV_8UC4))
     {
+        CV_Assert(borderType == BORDER_CONSTANT);
+
         GpuMat gpu_col_krnl;
         int nDivisor;
         normalizeKernel(columnKernel, gpu_col_krnl, CV_32S, &nDivisor, true);
@@ -735,9 +757,15 @@ Ptr<BaseColumnFilter_GPU> cv::gpu::getLinearColumnFilter_GPU(int bufType, int ds
         return Ptr<BaseColumnFilter_GPU>(new NppLinearColumnFilter(ksize, anchor, gpu_col_krnl, nDivisor, 
             nppFilter1D_callers[CV_MAT_CN(bufType)]));
     }
+    
+    CV_Assert(borderType == BORDER_REFLECT101 || borderType == BORDER_REPLICATE || borderType == BORDER_CONSTANT);
+    int gpuBorderType;
+    CV_Assert(tryConvertToGpuBorderType(borderType, gpuBorderType));
+   
+    CV_Assert(dstType == CV_8UC1 || dstType == CV_8UC4 || dstType == CV_16SC1 || dstType == CV_16SC2 
+        || dstType == CV_32SC1 || dstType == CV_32FC1);
 
-    CV_Assert(dstType == CV_8UC4 || dstType == CV_8SC4 || dstType == CV_16UC2 || dstType == CV_16SC2 || dstType == CV_32SC1 || dstType == CV_32FC1);
-    CV_Assert(bufType == CV_8UC4 || bufType == CV_8SC4 || bufType == CV_16UC2 || bufType == CV_16SC2 || bufType == CV_32SC1 || bufType == CV_32FC1);
+    CV_Assert(CV_MAT_DEPTH(bufType) == CV_32F && CV_MAT_CN(dstType) == CV_MAT_CN(bufType));
 
     Mat temp(columnKernel.size(), CV_32FC1);
     columnKernel.convertTo(temp, CV_32FC1);
@@ -745,50 +773,76 @@ Ptr<BaseColumnFilter_GPU> cv::gpu::getLinearColumnFilter_GPU(int bufType, int ds
 
     int ksize = cont_krnl.cols;
 
-    CV_Assert(ksize < 16);
+    CV_Assert(ksize > 0 && ksize <= 16);
 
     normalizeAnchor(anchor, ksize);
 
-    return Ptr<BaseColumnFilter_GPU>(new GpuLinearColumnFilter(ksize, anchor, cont_krnl, 
-        gpuFilter1D_callers[CV_MAT_DEPTH(bufType)][CV_MAT_DEPTH(dstType)]));
+    gpuFilter1D_t func = 0;
+
+    switch (dstType)
+    {
+    case CV_8UC1:
+        func = filters::linearColumnFilter_gpu<float, uchar>;
+        break;
+    case CV_8UC4:
+        func = filters::linearColumnFilter_gpu<float4, uchar4>;
+        break;
+    case CV_16SC1:
+        func = filters::linearColumnFilter_gpu<float, short>;
+        break;
+    case CV_16SC2:
+        func = filters::linearColumnFilter_gpu<float2, short2>;
+        break;
+    case CV_32SC1:
+        func = filters::linearColumnFilter_gpu<float, int>;
+        break;
+    case CV_32FC1:
+        func = filters::linearColumnFilter_gpu<float, float>;
+        break;
+    }
+
+    return Ptr<BaseColumnFilter_GPU>(new GpuLinearColumnFilter(ksize, anchor, cont_krnl, func, gpuBorderType));
 }
 
 Ptr<FilterEngine_GPU> cv::gpu::createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat& rowKernel, const Mat& columnKernel, 
-    const Point& anchor)
+    const Point& anchor, int rowBorderType, int columnBorderType)
 {
+    if (columnBorderType < 0)
+        columnBorderType = rowBorderType;
+
     int sdepth = CV_MAT_DEPTH(srcType), ddepth = CV_MAT_DEPTH(dstType);
     int cn = CV_MAT_CN(srcType);
-    int bdepth = std::max(sdepth, ddepth);
+    int bdepth = CV_32F;
     int bufType = CV_MAKETYPE(bdepth, cn);
 
-    Ptr<BaseRowFilter_GPU> rowFilter = getLinearRowFilter_GPU(srcType, bufType, rowKernel, anchor.x);
-    Ptr<BaseColumnFilter_GPU> columnFilter = getLinearColumnFilter_GPU(bufType, dstType, columnKernel, anchor.y);
+    Ptr<BaseRowFilter_GPU> rowFilter = getLinearRowFilter_GPU(srcType, bufType, rowKernel, anchor.x, rowBorderType);
+    Ptr<BaseColumnFilter_GPU> columnFilter = getLinearColumnFilter_GPU(bufType, dstType, columnKernel, anchor.y, columnBorderType);
 
-    return createSeparableFilter_GPU(rowFilter, columnFilter);
+    return createSeparableFilter_GPU(rowFilter, columnFilter, srcType, bufType, dstType);
 }
 
-void cv::gpu::sepFilter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernelX, const Mat& kernelY, Point anchor)
+void cv::gpu::sepFilter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernelX, const Mat& kernelY, Point anchor, int rowBorderType, int columnBorderType)
 {
     if( ddepth < 0 )
         ddepth = src.depth();
 
     dst.create(src.size(), CV_MAKETYPE(ddepth, src.channels()));
 
-    Ptr<FilterEngine_GPU> f = createSeparableLinearFilter_GPU(src.type(), dst.type(), kernelX, kernelY, anchor);
-    f->apply(src, dst);
+    Ptr<FilterEngine_GPU> f = createSeparableLinearFilter_GPU(src.type(), dst.type(), kernelX, kernelY, anchor, rowBorderType, columnBorderType);
+    f->apply(src, dst, Rect(0, 0, src.cols, src.rows));
 }
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Deriv Filter
 
-Ptr<FilterEngine_GPU> cv::gpu::createDerivFilter_GPU(int srcType, int dstType, int dx, int dy, int ksize)
+Ptr<FilterEngine_GPU> cv::gpu::createDerivFilter_GPU(int srcType, int dstType, int dx, int dy, int ksize, int rowBorderType, int columnBorderType)
 {
     Mat kx, ky;
     getDerivKernels(kx, ky, dx, dy, ksize, false, CV_32F);
-    return createSeparableLinearFilter_GPU(srcType, dstType, kx, ky, Point(-1,-1));
+    return createSeparableLinearFilter_GPU(srcType, dstType, kx, ky, Point(-1,-1), rowBorderType, columnBorderType);
 }
 
-void cv::gpu::Sobel(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, int ksize, double scale)
+void cv::gpu::Sobel(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, int ksize, double scale, int rowBorderType, int columnBorderType)
 {
     Mat kx, ky;
     getDerivKernels(kx, ky, dx, dy, ksize, false, CV_32F);
@@ -803,10 +857,10 @@ void cv::gpu::Sobel(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy,
             ky *= scale;
     }
     
-    sepFilter2D(src, dst, ddepth, kx, ky, Point(-1,-1));
+    sepFilter2D(src, dst, ddepth, kx, ky, Point(-1,-1), rowBorderType, columnBorderType);
 }
 
-void cv::gpu::Scharr(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, double scale)
+void cv::gpu::Scharr(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, double scale, int rowBorderType, int columnBorderType)
 {
     Mat kx, ky;
     getDerivKernels(kx, ky, dx, dy, -1, false, CV_32F);
@@ -821,7 +875,7 @@ void cv::gpu::Scharr(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy,
             ky *= scale;
     }
 
-    sepFilter2D(src, dst, ddepth, kx, ky, Point(-1,-1));
+    sepFilter2D(src, dst, ddepth, kx, ky, Point(-1,-1), rowBorderType, columnBorderType);
 }
 
 void cv::gpu::Laplacian(const GpuMat& src, GpuMat& dst, int ddepth, int ksize, double scale)
@@ -843,7 +897,7 @@ void cv::gpu::Laplacian(const GpuMat& src, GpuMat& dst, int ddepth, int ksize, d
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Gaussian Filter
 
-Ptr<FilterEngine_GPU> cv::gpu::createGaussianFilter_GPU(int type, Size ksize, double sigma1, double sigma2)
+Ptr<FilterEngine_GPU> cv::gpu::createGaussianFilter_GPU(int type, Size ksize, double sigma1, double sigma2, int rowBorderType, int columnBorderType)
 {        
     int depth = CV_MAT_DEPTH(type);
 
@@ -868,10 +922,10 @@ Ptr<FilterEngine_GPU> cv::gpu::createGaussianFilter_GPU(int type, Size ksize, do
     else
         ky = getGaussianKernel( ksize.height, sigma2, std::max(depth, CV_32F) );
 
-    return createSeparableLinearFilter_GPU(type, type, kx, ky);
+    return createSeparableLinearFilter_GPU(type, type, kx, ky, Point(-1,-1), rowBorderType, columnBorderType);
 }
 
-void cv::gpu::GaussianBlur(const GpuMat& src, GpuMat& dst, Size ksize, double sigma1, double sigma2)
+void cv::gpu::GaussianBlur(const GpuMat& src, GpuMat& dst, Size ksize, double sigma1, double sigma2, int rowBorderType, int columnBorderType)
 {
     if (ksize.width == 1 && ksize.height == 1)
     {
@@ -881,8 +935,8 @@ void cv::gpu::GaussianBlur(const GpuMat& src, GpuMat& dst, Size ksize, double si
 
     dst.create(src.size(), src.type());
     
-    Ptr<FilterEngine_GPU> f = createGaussianFilter_GPU(src.type(), ksize, sigma1, sigma2);
-    f->apply(src, dst);
+    Ptr<FilterEngine_GPU> f = createGaussianFilter_GPU(src.type(), ksize, sigma1, sigma2, rowBorderType, columnBorderType);
+    f->apply(src, dst, Rect(0, 0, src.cols, src.rows));
 }
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/modules/gpu/src/imgproc_gpu.cpp b/modules/gpu/src/imgproc_gpu.cpp
index 361f11b..337af7a 100644
--- a/modules/gpu/src/imgproc_gpu.cpp
+++ b/modules/gpu/src/imgproc_gpu.cpp
@@ -972,6 +972,12 @@ bool cv::gpu::tryConvertToGpuBorderType(int cpuBorderType, int& gpuBorderType)
         gpuBorderType = cv::gpu::BORDER_REPLICATE_GPU;
         return true;
     }
+    
+    if (cpuBorderType == cv::BORDER_CONSTANT)
+    {
+        gpuBorderType = cv::gpu::BORDER_CONSTANT_GPU;
+        return true;
+    }
 
     return false;
 }
diff --git a/modules/gpu/src/opencv2/gpu/device/vecmath.hpp b/modules/gpu/src/opencv2/gpu/device/vecmath.hpp
index d73853c..19e1e88 100644
--- a/modules/gpu/src/opencv2/gpu/device/vecmath.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/vecmath.hpp
@@ -123,278 +123,295 @@ namespace cv
             { 
                 typedef uchar elem_t; 
                 enum {cn=1};
-                static __device__ uchar all(uchar v) {return v;}
+                static __device__ __host__ uchar all(uchar v) {return v;}
+                static __device__ __host__ uchar make(uchar x) {return x;}
             };
             template<> struct VecTraits<uchar1> 
             { 
                 typedef uchar elem_t; 
                 enum {cn=1};
-                static __device__ uchar1 all(uchar v) {return make_uchar1(v);}
+                static __device__ __host__ uchar1 all(uchar v) {return make_uchar1(v);}
+                static __device__ __host__ uchar1 make(uchar x) {return make_uchar1(x);}
             };
             template<> struct VecTraits<uchar2> 
             { 
                 typedef uchar elem_t; 
                 enum {cn=2}; 
-                static __device__ uchar2 all(uchar v) {return make_uchar2(v, v);}
+                static __device__ __host__ uchar2 all(uchar v) {return make_uchar2(v, v);}
+                static __device__ __host__ uchar2 make(uchar x, uchar y) {return make_uchar2(x, y);}
             };
             template<> struct VecTraits<uchar3> 
             { 
                 typedef uchar elem_t; 
                 enum {cn=3}; 
-                static __device__ uchar3 all(uchar v) {return make_uchar3(v, v, v);}
+                static __device__ __host__ uchar3 all(uchar v) {return make_uchar3(v, v, v);}
+                static __device__ __host__ uchar3 make(uchar x, uchar y, uchar z) {return make_uchar3(x, y, z);}
             };
             template<> struct VecTraits<uchar4> 
             { 
                 typedef uchar elem_t; 
                 enum {cn=4}; 
-                static __device__ uchar4 all(uchar v) {return make_uchar4(v, v, v, v);}
+                static __device__ __host__ uchar4 all(uchar v) {return make_uchar4(v, v, v, v);}
+                static __device__ __host__ uchar4 make(uchar x, uchar y, uchar z, uchar w) {return make_uchar4(x, y, z, w);}
             };
 
             template<> struct VecTraits<char> 
             { 
                 typedef char elem_t; 
                 enum {cn=1}; 
-                static __device__ char all(char v) {return v;}
+                static __device__ __host__ char all(char v) {return v;}
+                static __device__ __host__ char make(char x) {return x;}
             };
             template<> struct VecTraits<char1> 
             { 
                 typedef char elem_t; 
                 enum {cn=1}; 
-                static __device__ char1 all(char v) {return make_char1(v);}
+                static __device__ __host__ char1 all(char v) {return make_char1(v);}
+                static __device__ __host__ char1 make(char x) {return make_char1(x);}
             };
             template<> struct VecTraits<char2> 
             { 
                 typedef char elem_t; 
                 enum {cn=2}; 
-                static  __device__ char2 all(char v) {return make_char2(v, v);}
+                static  __device__ __host__ char2 all(char v) {return make_char2(v, v);}
+                static  __device__ __host__ char2 make(char x, char y) {return make_char2(x, y);}
             };
             template<> struct VecTraits<char3> 
             { 
                 typedef char elem_t; 
                 enum {cn=3}; 
-                static __device__ char3 all(char v) {return make_char3(v, v, v);}
+                static __device__ __host__ char3 all(char v) {return make_char3(v, v, v);}
+                static __device__ __host__ char3 make(char x, char y, char z) {return make_char3(x, y, z);}
             };
             template<> struct VecTraits<char4> 
             { 
                 typedef char elem_t; 
                 enum {cn=4}; 
-                static __device__ char4 all(char v) {return make_char4(v, v, v, v);}
+                static __device__ __host__ char4 all(char v) {return make_char4(v, v, v, v);}
+                static __device__ __host__ char4 make(char x, char y, char z, char w) {return make_char4(x, y, z, w);}
             };
 
             template<> struct VecTraits<ushort> 
             { 
                 typedef ushort elem_t; 
                 enum {cn=1}; 
-                static __device__ ushort all(ushort v) {return v;}
+                static __device__ __host__ ushort all(ushort v) {return v;}
+                static __device__ __host__ ushort make(ushort x) {return x;}
             };
             template<> struct VecTraits<ushort1> 
             { 
                 typedef ushort elem_t; 
                 enum {cn=1}; 
-                static __device__ ushort1 all(ushort v) {return make_ushort1(v);}
+                static __device__ __host__ ushort1 all(ushort v) {return make_ushort1(v);}
+                static __device__ __host__ ushort1 make(ushort x) {return make_ushort1(x);}
             };
             template<> struct VecTraits<ushort2> 
             { 
                 typedef ushort elem_t; 
                 enum {cn=2}; 
-                static __device__ ushort2 all(ushort v) {return make_ushort2(v, v);}
+                static __device__ __host__ ushort2 all(ushort v) {return make_ushort2(v, v);}
+                static __device__ __host__ ushort2 make(ushort x, ushort y) {return make_ushort2(x, y);}
             };
             template<> struct VecTraits<ushort3> 
             { 
                 typedef ushort elem_t; 
                 enum {cn=3}; 
-                static __device__ ushort3 all(ushort v) {return make_ushort3(v, v, v);}
+                static __device__ __host__ ushort3 all(ushort v) {return make_ushort3(v, v, v);}
+                static __device__ __host__ ushort3 make(ushort x, ushort y, ushort z) {return make_ushort3(x, y, z);}
             };
             template<> struct VecTraits<ushort4> 
             { 
                 typedef ushort elem_t; 
                 enum {cn=4}; 
-                static __device__ ushort4 all(ushort v) {return make_ushort4(v, v, v, v);}
+                static __device__ __host__ ushort4 all(ushort v) {return make_ushort4(v, v, v, v);}
+                static __device__ __host__ ushort4 make(ushort x, ushort y, ushort z, ushort w) {return make_ushort4(x, y, z, w);}
             };
 
             template<> struct VecTraits<short> 
             { 
                 typedef short elem_t; 
                 enum {cn=1}; 
-                static __device__ short all(short v) {return v;}
+                static __device__ __host__ short all(short v) {return v;}
+                static __device__ __host__ short make(short x) {return x;}
             };
             template<> struct VecTraits<short1> 
             { 
                 typedef short elem_t; 
                 enum {cn=1}; 
-                static __device__ short1 all(short v) {return make_short1(v);}
+                static __device__ __host__ short1 all(short v) {return make_short1(v);}
+                static __device__ __host__ short1 make(short x) {return make_short1(x);}
             };
             template<> struct VecTraits<short2> 
             { 
                 typedef short elem_t; 
                 enum {cn=2}; 
-                static __device__ short2 all(short v) {return make_short2(v, v);}
+                static __device__ __host__ short2 all(short v) {return make_short2(v, v);}
+                static __device__ __host__ short2 make(short x, short y) {return make_short2(x, y);}
             };
             template<> struct VecTraits<short3> 
             { 
                 typedef short elem_t; 
                 enum {cn=3}; 
-                static __device__ short3 all(short v) {return make_short3(v, v, v);}
+                static __device__ __host__ short3 all(short v) {return make_short3(v, v, v);}
+                static __device__ __host__ short3 make(short x, short y, short z) {return make_short3(x, y, z);}
             };
             template<> struct VecTraits<short4> 
             { 
                 typedef short elem_t; 
                 enum {cn=4}; 
-                static __device__ short4 all(short v) {return make_short4(v, v, v, v);}
+                static __device__ __host__ short4 all(short v) {return make_short4(v, v, v, v);}
+                static __device__ __host__ short4 make(short x, short y, short z, short w) {return make_short4(x, y, z, w);}
             };
 
             template<> struct VecTraits<uint> 
             { 
                 typedef uint elem_t; 
                 enum {cn=1}; 
-                static __device__ uint all(uint v) {return v;}
+                static __device__ __host__ uint all(uint v) {return v;}
+                static __device__ __host__ uint make(uint x) {return x;}
             };
             template<> struct VecTraits<uint1> 
             { 
                 typedef uint elem_t; 
                 enum {cn=1}; 
-                static __device__ uint1 all(uint v) {return make_uint1(v);}
+                static __device__ __host__ uint1 all(uint v) {return make_uint1(v);}
+                static __device__ __host__ uint1 make(uint x) {return make_uint1(x);}
             };
             template<> struct VecTraits<uint2> 
             { 
                 typedef uint elem_t; 
                 enum {cn=2}; 
-                static __device__ uint2 all(uint v) {return make_uint2(v, v);}
+                static __device__ __host__ uint2 all(uint v) {return make_uint2(v, v);}
+                static __device__ __host__ uint2 make(uint x, uint y) {return make_uint2(x, y);}
             };
             template<> struct VecTraits<uint3> 
             { 
                 typedef uint elem_t; 
                 enum {cn=3}; 
-                static __device__ uint3 all(uint v) {return make_uint3(v, v, v);}
+                static __device__ __host__ uint3 all(uint v) {return make_uint3(v, v, v);}
+                static __device__ __host__ uint3 make(uint x, uint y, uint z) {return make_uint3(x, y, z);}
             };
             template<> struct VecTraits<uint4> 
             { 
                 typedef uint elem_t; 
                 enum {cn=4}; 
-                static __device__ uint4 all(uint v) {return make_uint4(v, v, v, v);}
+                static __device__ __host__ uint4 all(uint v) {return make_uint4(v, v, v, v);}
+                static __device__ __host__ uint4 make(uint x, uint y, uint z, uint w) {return make_uint4(x, y, z, w);}
             };
 
             template<> struct VecTraits<int> 
             { 
                 typedef int elem_t; 
                 enum {cn=1}; 
-                static __device__ int all(int v) {return v;}
+                static __device__ __host__ int all(int v) {return v;}
+                static __device__ __host__ int make(int x) {return x;}
             };
             template<> struct VecTraits<int1> 
             { 
                 typedef int elem_t; 
                 enum {cn=1}; 
-                static __device__ int1 all(int v) {return make_int1(v);}
+                static __device__ __host__ int1 all(int v) {return make_int1(v);}
+                static __device__ __host__ int1 make(int x) {return make_int1(x);}
             };
             template<> struct VecTraits<int2> 
             { 
                 typedef int elem_t; 
                 enum {cn=2}; 
-                static __device__ int2 all(int v) {return make_int2(v, v);}
+                static __device__ __host__ int2 all(int v) {return make_int2(v, v);}
+                static __device__ __host__ int2 make(int x, int y) {return make_int2(x, y);}
             };
             template<> struct VecTraits<int3> 
             { 
                 typedef int elem_t; 
                 enum {cn=3}; 
-                static __device__ int3 all(int v) {return make_int3(v, v, v);}
+                static __device__ __host__ int3 all(int v) {return make_int3(v, v, v);}
+                static __device__ __host__ int3 make(int x, int y, int z) {return make_int3(x, y, z);}
             };
             template<> struct VecTraits<int4> 
             { 
                 typedef int elem_t; 
                 enum {cn=4}; 
-                static __device__ int4 all(int v) {return make_int4(v, v, v, v);}
+                static __device__ __host__ int4 all(int v) {return make_int4(v, v, v, v);}
+                static __device__ __host__ int4 make(int x, int y, int z, int w) {return make_int4(x, y, z, w);}
             };
 
             template<> struct VecTraits<float> 
             { 
                 typedef float elem_t; 
                 enum {cn=1}; 
-                static __device__ float all(float v) {return v;}
+                static __device__ __host__ float all(float v) {return v;}
+                static __device__ __host__ float make(float x) {return x;}
             };
             template<> struct VecTraits<float1> 
             { 
                 typedef float elem_t; 
                 enum {cn=1}; 
-                static __device__ float1 all(float v) {return make_float1(v);}
+                static __device__ __host__ float1 all(float v) {return make_float1(v);}
+                static __device__ __host__ float1 make(float x) {return make_float1(x);}
             };
             template<> struct VecTraits<float2> 
             { 
                 typedef float elem_t; 
                 enum {cn=2}; 
-                static __device__ float2 all(float v) {return make_float2(v, v);}
+                static __device__ __host__ float2 all(float v) {return make_float2(v, v);}
+                static __device__ __host__ float2 make(float x, float y) {return make_float2(x, y);}
             };
             template<> struct VecTraits<float3> 
             { 
                 typedef float elem_t; 
                 enum {cn=3}; 
-                static __device__ float3 all(float v) {return make_float3(v, v, v);}
+                static __device__ __host__ float3 all(float v) {return make_float3(v, v, v);}
+                static __device__ __host__ float3 make(float x, float y, float z) {return make_float3(x, y, z);}
             };
             template<> struct VecTraits<float4> 
             { 
                 typedef float elem_t;
                 enum {cn=4}; 
-                static __device__ float4 all(float v) {return make_float4(v, v, v, v);}
+                static __device__ __host__ float4 all(float v) {return make_float4(v, v, v, v);}
+                static __device__ __host__ float4 make(float x, float y, float z, float w) {return make_float4(x, y, z, w);}
             };
 
             template <int cn, typename VecD> struct SatCast;
             template <typename VecD> struct SatCast<1, VecD>
             {
                 template <typename VecS>
-                __device__ VecD operator()(const VecS& v)
+                static __device__ VecD cast(const VecS& v)
                 {
-                    VecD res; 
-                    res.x = saturate_cast< VecTraits<VecD>::elem_t >(v.x);
-                    return res;
+                    typedef typename VecTraits<VecD>::elem_t D;
+                    return VecTraits<VecD>::make(saturate_cast<D>(v.x));
                 }
             };
             template <typename VecD> struct SatCast<2, VecD>
             {
                 template <typename VecS>
-                __device__ VecD operator()(const VecS& v)
+                static __device__ VecD cast(const VecS& v)
                 {
-                    VecD res; 
-                    res.x = saturate_cast< VecTraits<VecD>::elem_t >(v.x);
-                    res.y = saturate_cast< VecTraits<VecD>::elem_t >(v.y);
-                    return res;
+                    typedef typename VecTraits<VecD>::elem_t D;
+                    return VecTraits<VecD>::make(saturate_cast<D>(v.x), saturate_cast<D>(v.y));
                 }
             };
             template <typename VecD> struct SatCast<3, VecD>
             {
                 template <typename VecS>
-                __device__ VecD operator()(const VecS& v)
+                static __device__ VecD cast(const VecS& v)
                 {
-                    VecD res; 
-                    res.x = saturate_cast< VecTraits<VecD>::elem_t >(v.x);
-                    res.y = saturate_cast< VecTraits<VecD>::elem_t >(v.y);
-                    res.y = saturate_cast< VecTraits<VecD>::elem_t >(v.z);
-                    return res;
+                    typedef typename VecTraits<VecD>::elem_t D;
+                    return VecTraits<VecD>::make(saturate_cast<D>(v.x), saturate_cast<D>(v.y), saturate_cast<D>(v.z));
                 }
             };
             template <typename VecD> struct SatCast<4, VecD>
             {
                 template <typename VecS>
-                __device__ VecD operator()(const VecS& v)
+                static __device__ VecD cast(const VecS& v)
                 {
-                    VecD res; 
-                    res.x = saturate_cast< VecTraits<VecD>::elem_t >(v.x);
-                    res.y = saturate_cast< VecTraits<VecD>::elem_t >(v.y);
-                    res.y = saturate_cast< VecTraits<VecD>::elem_t >(v.z);
-                    res.w = saturate_cast< VecTraits<VecD>::elem_t >(v.w);
-                    return res;
+                    typedef typename VecTraits<VecD>::elem_t D;
+                    return VecTraits<VecD>::make(saturate_cast<D>(v.x), saturate_cast<D>(v.y), saturate_cast<D>(v.z), saturate_cast<D>(v.w));
                 }
             };
 
             template <typename VecD, typename VecS> static __device__ VecD saturate_cast_caller(const VecS& v)
             {
-                SatCast<
-                    
-                    VecTraits<VecD>::cn, 
-                    
-                    VecD
-                > 
-                
-                cast;
-                return cast(v);
+                return SatCast<VecTraits<VecD>::cn, VecD>::cast(v);
             }
 
             template<typename _Tp> static __device__ _Tp saturate_cast(const uchar1& v) {return saturate_cast_caller<_Tp>(v);}
diff --git a/tests/gpu/src/brute_force_matcher.cpp b/tests/gpu/src/brute_force_matcher.cpp
index e38032b..c7c08e9 100644
--- a/tests/gpu/src/brute_force_matcher.cpp
+++ b/tests/gpu/src/brute_force_matcher.cpp
@@ -107,7 +107,7 @@ protected:
 
             if (!compareMatches(matchesCPU, matchesGPU))
             {
-                ts->printf(CvTS::LOG, "Match FAIL");
+                ts->printf(CvTS::LOG, "Match FAIL\n");
                 ts->set_failed_test_info(CvTS::FAIL_MISMATCH);
                 return;
             }
@@ -119,7 +119,7 @@ protected:
 
             if (!compareMatches(knnMatchesCPU, knnMatchesGPU))
             {
-                ts->printf(CvTS::LOG, "KNN Match FAIL");
+                ts->printf(CvTS::LOG, "KNN Match FAIL\n");
                 ts->set_failed_test_info(CvTS::FAIL_MISMATCH);
                 return;
             }
@@ -131,7 +131,7 @@ protected:
 
             if (!compareMatches(radiusMatchesCPU, radiusMatchesGPU))
             {
-                ts->printf(CvTS::LOG, "Radius Match FAIL");
+                ts->printf(CvTS::LOG, "Radius Match FAIL\n");
                 ts->set_failed_test_info(CvTS::FAIL_MISMATCH);
                 return;
             }
diff --git a/tests/gpu/src/filters.cpp b/tests/gpu/src/filters.cpp
index 47a6f50..67b9603 100644
--- a/tests/gpu/src/filters.cpp
+++ b/tests/gpu/src/filters.cpp
@@ -80,7 +80,8 @@ protected:
 
         double res = norm(m1ROI, m2ROI, NORM_INF);
 
-        if (res <= 1)
+        // Max difference (2.0) in GaussianBlur
+        if (res <= 2)
             return CvTS::OK;
         
         ts->printf(CvTS::LOG, "Norm: %f\n", res);
@@ -166,8 +167,6 @@ struct CV_GpuNppImageSobelTest : public CV_GpuNppFilterTest
 
     int test(const Mat& img)
     {
-        if (img.type() != CV_8UC1)
-            return CvTS::OK;
         int ksizes[] = {3, 5, 7};
         int ksizes_num = sizeof(ksizes) / sizeof(int);
 
@@ -183,10 +182,8 @@ struct CV_GpuNppImageSobelTest : public CV_GpuNppFilterTest
             cv::Sobel(img, cpudst, -1, dx, dy, ksizes[i]);
 
             GpuMat gpu1(img);
-            gpu1.convertTo(gpu1, CV_32S);
             GpuMat gpudst;
             cv::gpu::Sobel(gpu1, gpudst, -1, dx, dy, ksizes[i]);
-            gpudst.convertTo(gpudst, CV_8U);
 
             if (CheckNorm(cpudst, gpudst, Size(ksizes[i], ksizes[i])) != CvTS::OK)
                 test_res = CvTS::FAIL_GENERIC;
@@ -204,20 +201,15 @@ struct CV_GpuNppImageScharrTest : public CV_GpuNppFilterTest
 
     int test(const Mat& img)
     {
-        if (img.type() != CV_8UC1)
-            return CvTS::OK;
-
         int dx = 1, dy = 0;
 
         Mat cpudst;
         cv::Scharr(img, cpudst, -1, dx, dy);
 
         GpuMat gpu1(img);
-        gpu1.convertTo(gpu1, CV_32S);
         GpuMat gpudst;
         cv::gpu::Scharr(gpu1, gpudst, -1, dx, dy);
-        gpudst.convertTo(gpudst, CV_8U);
-        
+                
         return CheckNorm(cpudst, gpudst, Size(3, 3));
     }
 };
@@ -244,7 +236,7 @@ struct CV_GpuNppImageGaussianBlurTest : public CV_GpuNppFilterTest
             {
                 cv::Size ksize(ksizes[i], ksizes[j]);
 
-                ts->printf(CvTS::LOG, "ksize = (%dx%d)\t", ksizes[i], ksizes[j]);
+                ts->printf(CvTS::LOG, "ksize = (%dx%d)\t\n", ksizes[i], ksizes[j]);
 
                 Mat cpudst;
                 cv::GaussianBlur(img, cpudst, ksize, sigma1);
-- 
2.7.4