From ab04a03621695cdcaef9b531e2cc43db9f7ded21 Mon Sep 17 00:00:00 2001
From: Alexey Spizhevoy <no@email>
Date: Thu, 2 Dec 2010 09:07:13 +0000
Subject: [PATCH] added support of BORDER_REFLECT into gpu::cornerHarris and
 etc, added support of border extr. int linear filters

---
 modules/gpu/include/opencv2/gpu/gpu.hpp     |   6 +-
 modules/gpu/src/cuda/border_interpolate.hpp | 108 ++++++++++++
 modules/gpu/src/cuda/cuda_shared.hpp        |   8 +
 modules/gpu/src/cuda/hog.cu                 |   3 +-
 modules/gpu/src/cuda/imgproc.cu             | 112 ++++++++----
 modules/gpu/src/cuda/linear_filters_beta.cu | 264 ++++++++++++++++++++++++++++
 modules/gpu/src/imgproc_gpu.cpp             | 120 ++++++++++---
 tests/gpu/src/imgproc_gpu.cpp               |  34 ++--
 8 files changed, 577 insertions(+), 78 deletions(-)
 create mode 100644 modules/gpu/src/cuda/border_interpolate.hpp
 create mode 100644 modules/gpu/src/cuda/linear_filters_beta.cu

diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp
index 50dfc2a..fca5f66 100644
--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@@ -628,13 +628,11 @@ namespace cv
         CV_EXPORTS void Canny(const GpuMat& image, GpuMat& edges, double threshold1, double threshold2, int apertureSize = 3);
 
         //! computes Harris cornerness criteria at each image pixel 
-        // (does BORDER_CONSTANT interpolation with 0 as the fill value)
-        CV_EXPORTS void cornerHarris(const GpuMat& src, GpuMat& dst, int blockSize, int ksize, double k);
+        CV_EXPORTS void cornerHarris(const GpuMat& src, GpuMat& dst, int blockSize, int ksize, double k, int borderType=BORDER_REFLECT101);
 
 
         //! computes minimum eigen value of 2x2 derivative covariation matrix at each pixel - the cornerness criteria
-        // (does BORDER_CONSTANT interpolation with 0 as the fill value)
-        CV_EXPORTS void cornerMinEigenVal(const GpuMat& src, GpuMat& dst, int blockSize, int ksize=3);
+        CV_EXPORTS void cornerMinEigenVal(const GpuMat& src, GpuMat& dst, int blockSize, int ksize, int borderType=BORDER_REFLECT101);
 
 
         //////////////////////////////// Filter Engine ////////////////////////////////
diff --git a/modules/gpu/src/cuda/border_interpolate.hpp b/modules/gpu/src/cuda/border_interpolate.hpp
new file mode 100644
index 0000000..26b19cb
--- /dev/null
+++ b/modules/gpu/src/cuda/border_interpolate.hpp
@@ -0,0 +1,108 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or bpied warranties, including, but not limited to, the bpied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+namespace cv { namespace gpu {
+
+    struct BrdReflect101 
+    {
+        BrdReflect101(int len) : last(len - 1) {}
+
+        __device__ int idx_low(int i) const
+        {
+            return abs(i);
+        }
+
+        __device__ int idx_high(int i) const 
+        {
+            return last - abs(i - last);
+        }
+
+        __device__ int idx(int i) const
+        {
+            return i <= last ? idx_low(i) : idx_high(i);
+        }
+
+        bool is_range_safe(int mini, int maxi) const 
+        {
+            return -last <= mini && maxi <= 2 * last;
+        }
+
+        int last;
+    };
+
+
+    template <typename T>
+    struct BrdRowReflect101: BrdReflect101
+    {
+        BrdRowReflect101(int len) : BrdReflect101(len) {}
+
+        __device__ float at_low(int i, const T* data) const 
+        {
+            return data[idx_low(i)];
+        }
+
+        __device__ float at_high(int i, const T* data) const 
+        {
+            return data[idx_high(i)];
+        }
+    };
+
+
+    template <typename T>
+    struct BrdColReflect101: BrdReflect101
+    {
+        BrdColReflect101(int len, int step) : BrdReflect101(len), step(step) {}
+
+        __device__ float at_low(int i, const T* data) const 
+        {
+            return data[idx_low(i) * step];
+        }
+
+        __device__ float at_high(int i, const T* data) const 
+        {
+            return data[idx_high(i) * step];
+        }
+
+        int step;
+    };
+
+}}
\ No newline at end of file
diff --git a/modules/gpu/src/cuda/cuda_shared.hpp b/modules/gpu/src/cuda/cuda_shared.hpp
index ca82059..ce0a704 100644
--- a/modules/gpu/src/cuda/cuda_shared.hpp
+++ b/modules/gpu/src/cuda/cuda_shared.hpp
@@ -94,6 +94,14 @@ namespace cv
             cudaSafeCall( cudaGetTextureReference(&tex, name) ); 
             cudaSafeCall( cudaUnbindTexture(tex) );
         }        
+
+        // Available GPU border interpolation modes (named as CPU 
+        // border interpolation modes)
+        enum 
+        {
+            BORDER_REFLECT101 = 0,
+        };
+
     }
 }
 
diff --git a/modules/gpu/src/cuda/hog.cu b/modules/gpu/src/cuda/hog.cu
index b1fee4c..8c8144e 100644
--- a/modules/gpu/src/cuda/hog.cu
+++ b/modules/gpu/src/cuda/hog.cu
@@ -322,7 +322,8 @@ void normalize_hists(int nbins, int block_stride_x, int block_stride_y,
         normalize_hists_kernel_many_blocks<256, nblocks><<<grid, threads>>>(block_hist_size, img_block_width, block_hists, threshold);
     else if (nthreads == 512)
         normalize_hists_kernel_many_blocks<512, nblocks><<<grid, threads>>>(block_hist_size, img_block_width, block_hists, threshold);
-    // We don't support bigger sizes of the block histograms
+    else
+        cv::gpu::error("normalize_hists: histogram's size is too big, try to decrease number of bins", __FILE__, __LINE__);
 
     cudaSafeCall(cudaThreadSynchronize());
 }
diff --git a/modules/gpu/src/cuda/imgproc.cu b/modules/gpu/src/cuda/imgproc.cu
index ab57cf9..0f4fe41 100644
--- a/modules/gpu/src/cuda/imgproc.cu
+++ b/modules/gpu/src/cuda/imgproc.cu
@@ -41,6 +41,7 @@
 //M*/
 
 #include "cuda_shared.hpp"
+#include "border_interpolate.hpp"
 
 using namespace cv::gpu;
 
@@ -464,10 +465,40 @@ namespace cv { namespace gpu { namespace imgproc
         reprojectImageTo3D_caller(disp, xyzw, q, stream);
     }
 
+//////////////////////////////////////// Extract Cov Data ////////////////////////////////////////////////
+
+    __global__ void extractCovData_kernel(const int cols, const int rows, const PtrStepf Dx, 
+                                          const PtrStepf Dy, PtrStepf dst)
+    {
+        const int x = blockIdx.x * blockDim.x + threadIdx.x;
+        const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+        if (x < cols && y < rows)
+        {            
+            float dx = Dx.ptr(y)[x];
+            float dy = Dy.ptr(y)[x];
+
+            dst.ptr(y)[x] = dx * dx;
+            dst.ptr(y + rows)[x] = dx * dy;
+            dst.ptr(y + (rows << 1))[x] = dy * dy;
+        }
+    }
+
+    void extractCovData_caller(const DevMem2Df Dx, const DevMem2Df Dy, PtrStepf dst)
+    {
+        dim3 threads(32, 8);
+        dim3 grid(divUp(Dx.cols, threads.x), divUp(Dx.rows, threads.y));
+
+        extractCovData_kernel<<<grid, threads>>>(Dx.cols, Dx.rows, Dx, Dy, dst);
+        cudaSafeCall(cudaThreadSynchronize());
+    }
+
 /////////////////////////////////////////// Corner Harris /////////////////////////////////////////////////
 
+    template <typename B>
     __global__ void cornerHarris_kernel(const int cols, const int rows, const int block_size, const float k,
-                                        const PtrStep Dx, const PtrStep Dy, PtrStep dst)
+                                        const PtrStep Dx, const PtrStep Dy, PtrStep dst, B border_row, 
+                                        B border_col)
     {
         const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
         const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
@@ -478,22 +509,21 @@ namespace cv { namespace gpu { namespace imgproc
             float b = 0.f;
             float c = 0.f;
 
-            int offset1 = -(block_size / 2);
-            int offset2 = offset1 + block_size;
-
-            unsigned int j_begin = max(x + offset1, 0);
-            unsigned int i_begin = max(y + offset1, 0);
-            unsigned int j_end = min(x + offset2, cols);
-            unsigned int i_end = min(y + offset2, rows);
+            const int ibegin = y - (block_size / 2);
+            const int jbegin = x - (block_size / 2);
+            const int iend = ibegin + block_size;
+            const int jend = jbegin + block_size;
 
-            for (unsigned int i = i_begin; i < i_end; ++i)
+            for (int i = ibegin; i < iend; ++i)
             {
-                const float* dx_row = (const float*)Dx.ptr(i);
-                const float* dy_row = (const float*)Dy.ptr(i);
-                for (unsigned int j = j_begin; j < j_end; ++j)
+                int y = border_col.idx(i);
+                const float* dx_row = (const float*)Dx.ptr(y);
+                const float* dy_row = (const float*)Dy.ptr(y);
+                for (int j = jbegin; j < jend; ++j)
                 {
-                    float dx = dx_row[j];
-                    float dy = dy_row[j];
+                    int x = border_row.idx(j);
+                    float dx = dx_row[x];
+                    float dy = dy_row[x];
                     a += dx * dx;
                     b += dx * dy;
                     c += dy * dy;
@@ -504,7 +534,8 @@ namespace cv { namespace gpu { namespace imgproc
         }
     }
 
-    void cornerHarris_caller(const int block_size, const float k, const DevMem2D Dx, const DevMem2D Dy, DevMem2D dst)
+    void cornerHarris_caller(const int block_size, const float k, const DevMem2D Dx, const DevMem2D Dy, DevMem2D dst, 
+                             int border_type)
     {
         const int rows = Dx.rows;
         const int cols = Dx.cols;
@@ -512,14 +543,22 @@ namespace cv { namespace gpu { namespace imgproc
         dim3 threads(32, 8);
         dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));
 
-        cornerHarris_kernel<<<grid, threads>>>(cols, rows, block_size, k, Dx, Dy, dst);
+        switch (border_type) 
+        {
+        case BORDER_REFLECT101:
+            cornerHarris_kernel<<<grid, threads>>>(
+                    cols, rows, block_size, k, Dx, Dy, dst, 
+                    BrdReflect101(cols), BrdReflect101(rows));
+            break;
+        }
         cudaSafeCall(cudaThreadSynchronize());
     }
 
 /////////////////////////////////////////// Corner Min Eigen Val /////////////////////////////////////////////////
 
-    __global__ void cornerMinEigenVal_kernel(const int cols, const int rows, const int block_size,
-                                             const PtrStep Dx, const PtrStep Dy, PtrStep dst)
+    template <typename B>
+    __global__ void cornerMinEigenVal_kernel(const int cols, const int rows, const int block_size, const PtrStep Dx, 
+                                             const PtrStep Dy, PtrStep dst, B border_row, B border_col)
     {
         const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
         const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
@@ -530,22 +569,21 @@ namespace cv { namespace gpu { namespace imgproc
             float b = 0.f;
             float c = 0.f;
 
-            int offset1 = -(block_size / 2);
-            int offset2 = offset1 + block_size;
-
-            unsigned int j_begin = max(x + offset1, 0);
-            unsigned int i_begin = max(y + offset1, 0);
-            unsigned int j_end = min(x + offset2, cols);
-            unsigned int i_end = min(y + offset2, rows);
+            const int ibegin = y - (block_size / 2);
+            const int jbegin = x - (block_size / 2);
+            const int iend = ibegin + block_size;
+            const int jend = jbegin + block_size;
 
-            for (unsigned int i = i_begin; i < i_end; ++i)
+            for (int i = ibegin; i < iend; ++i)
             {
-                const float* dx_row = (const float*)Dx.ptr(i);
-                const float* dy_row = (const float*)Dy.ptr(i);
-                for (unsigned int j = j_begin; j < j_end; ++j)
+                int y = border_col.idx(i);
+                const float* dx_row = (const float*)Dx.ptr(y);
+                const float* dy_row = (const float*)Dy.ptr(y);
+                for (int j = jbegin; j < jend; ++j)
                 {
-                    float dx = dx_row[j];
-                    float dy = dy_row[j];
+                    int x = border_row.idx(j);
+                    float dx = dx_row[x];
+                    float dy = dy_row[x];
                     a += dx * dx;
                     b += dx * dy;
                     c += dy * dy;
@@ -558,7 +596,8 @@ namespace cv { namespace gpu { namespace imgproc
         }
     }
 
-    void cornerMinEigenVal_caller(const int block_size, const DevMem2D Dx, const DevMem2D Dy, DevMem2D dst)
+    void cornerMinEigenVal_caller(const int block_size, const DevMem2D Dx, const DevMem2D Dy, DevMem2D dst,
+                                  int border_type)
     {
         const int rows = Dx.rows;
         const int cols = Dx.cols;
@@ -566,7 +605,14 @@ namespace cv { namespace gpu { namespace imgproc
         dim3 threads(32, 8);
         dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));
 
-        cornerMinEigenVal_kernel<<<grid, threads>>>(cols, rows, block_size, Dx, Dy, dst);
+        switch (border_type)
+        {
+        case BORDER_REFLECT101:
+            cornerMinEigenVal_kernel<<<grid, threads>>>(
+                    cols, rows, block_size, Dx, Dy, dst, 
+                    BrdReflect101(cols), BrdReflect101(rows));
+            break;
+        }
         cudaSafeCall(cudaThreadSynchronize());
     }
 }}}
diff --git a/modules/gpu/src/cuda/linear_filters_beta.cu b/modules/gpu/src/cuda/linear_filters_beta.cu
new file mode 100644
index 0000000..c951daf
--- /dev/null
+++ b/modules/gpu/src/cuda/linear_filters_beta.cu
@@ -0,0 +1,264 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "opencv2/gpu/devmem2d.hpp"
+#include "safe_call.hpp"
+#include "cuda_shared.hpp"
+#include "border_interpolate.hpp"
+
+#define BLOCK_DIM_X 16
+#define BLOCK_DIM_Y 16
+#define MAX_KERNEL_SIZE 16
+
+using namespace cv::gpu;
+
+
+namespace cv { namespace gpu { namespace linear_filters {
+
+
+// Global linear kernel data storage
+__constant__ float ckernel[MAX_KERNEL_SIZE];
+
+
+void loadKernel(const float* kernel, int ksize) 
+{
+    cudaSafeCall(cudaMemcpyToSymbol(ckernel, kernel, ksize * sizeof(float)));
+}
+
+
+template <typename T, typename B, int ksize>
+__global__ void rowFilterKernel(const DevMem2D_<T> src, PtrStepf dst, 
+                                int anchor, B border)
+{
+    __shared__ float smem[BLOCK_DIM_X * BLOCK_DIM_Y * 3];
+
+    const int x = blockIdx.x * blockDim.x + threadIdx.x;
+    const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+    float* srow = smem + threadIdx.y * blockDim.x * 3;
+
+    if (y < src.rows)
+    {
+        const T* src_row = src.ptr(y);
+
+        srow[threadIdx.x + blockDim.x] = border.at_high(x, src_row);
+
+        srow[threadIdx.x] = border.at_low(x - blockDim.x, src_row);
+
+        srow[threadIdx.x + (blockDim.x << 1)] = border.at_high(x + blockDim.x, src_row);
+
+        __syncthreads();
+
+        if (x < src.cols)
+        {
+            srow += threadIdx.x + blockDim.x - anchor;
+
+            float sum = 0.f;
+            for (int i = 0; i < ksize; ++i)
+                sum += srow[i] * ckernel[i];
+
+            dst.ptr(y)[x] = sum;
+        }
+    }
+}
+
+
+template <typename T, typename B, int ksize>
+void rowFilterCaller(const DevMem2D_<T> src, PtrStepf dst, int anchor)
+{
+    dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y);
+    dim3 grid(divUp(src.cols, threads.x), divUp(src.rows, threads.y));
+
+    B border(src.cols);
+
+    if (!border.is_range_safe(-BLOCK_DIM_X, (grid.x + 1) * BLOCK_DIM_X - 1))
+        cv::gpu::error("rowFilterCaller: can't use specified border extrapolation, image is too small, "
+                       "try bigger image or another border extrapolation mode", __FILE__, __LINE__);
+
+    rowFilterKernel<T, B, ksize><<<grid, threads>>>(src, dst, anchor, border);
+    cudaSafeCall(cudaThreadSynchronize());
+}
+
+
+template <typename T, typename B>
+void rowFilterCaller(const DevMem2D_<T> src, PtrStepf dst, int anchor, 
+                     const float* kernel, int ksize)
+{
+    typedef void (*Caller)(const DevMem2D_<T>, PtrStepf, int);
+
+    static const Caller callers[] = 
+    { 
+        0, rowFilterCaller<T, B, 1>, 
+        rowFilterCaller<T, B, 2>, rowFilterCaller<T, B, 3>, 
+        rowFilterCaller<T, B, 4>, rowFilterCaller<T, B, 5>, 
+        rowFilterCaller<T, B, 6>, rowFilterCaller<T, B, 7>, 
+        rowFilterCaller<T, B, 8>, rowFilterCaller<T, B, 9>, 
+        rowFilterCaller<T, B, 10>, rowFilterCaller<T, B, 11>, 
+        rowFilterCaller<T, B, 12>, rowFilterCaller<T, B, 13>, 
+        rowFilterCaller<T, B, 14>, rowFilterCaller<T, B, 15> 
+    };
+
+    loadKernel(kernel, ksize);
+    callers[ksize](src, dst, anchor);
+}
+
+
+template <typename T>
+void rowFilterCaller(const DevMem2D_<T> src, PtrStepf dst, int anchor, 
+                     const float* kernel, int ksize, int brd_interp)
+{
+    typedef void (*Caller)(const DevMem2D_<T>, PtrStepf, int, const float*, int);
+
+    static const Caller callers[] = 
+    { 
+        rowFilterCaller<T, BrdRowReflect101<T> >
+    };
+
+    callers[brd_interp](src, dst, anchor, kernel, ksize);
+}
+
+
+template void rowFilterCaller<unsigned char>(const DevMem2D_<unsigned char>, PtrStepf, int, const float*, int, int);
+template void rowFilterCaller<float>(const DevMem2D_<float>, PtrStepf, int, const float*, int, int);
+
+
+template <typename T, typename B, int ksize>
+__global__ void colFilterKernel(const DevMem2D_<T> src, PtrStepf dst, int anchor, B border)
+{
+    __shared__ float smem[BLOCK_DIM_X * BLOCK_DIM_Y * 3];
+
+    const int x = blockIdx.x * blockDim.x + threadIdx.x;
+    const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+    const int smem_step = blockDim.x;
+
+    float* scol = smem + threadIdx.x;
+
+    if (x < src.cols)
+    {
+        const T* src_col = src.data + x;
+
+        scol[(threadIdx.y + blockDim.y) * smem_step] = border.at_high(y, src_col);
+
+        scol[threadIdx.y * smem_step] = border.at_low(y - blockDim.y, src_col);
+
+        scol[(threadIdx.y + (blockDim.y << 1)) * smem_step] = border.at_high(y + blockDim.y, src_col);
+
+        __syncthreads();
+
+        if (y < src.rows)
+        {
+            scol += (threadIdx.y + blockDim.y - anchor)* smem_step;
+
+            float sum = 0.f;
+            for(int i = 0; i < ksize; ++i)
+                sum += scol[i * smem_step] * ckernel[i];
+
+            dst.ptr(y)[x] = sum;
+        }
+    }
+}
+
+
+template <typename T, typename B, int ksize>
+void colFilterCaller(const DevMem2D_<T> src, PtrStepf dst, int anchor)
+{
+    dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y);
+    dim3 grid(divUp(src.cols, threads.x), divUp(src.rows, threads.y));
+
+    B border(src.rows, src.step / src.elem_size);
+
+    if (src.step - border.step * src.elem_size != 0)
+        cv::gpu::error("colFilterCaller: src step must be multiple of its element size", 
+                       __FILE__, __LINE__);
+
+    if (!border.is_range_safe(-BLOCK_DIM_Y, (grid.y + 1) * BLOCK_DIM_Y - 1))
+        cv::gpu::error("colFilterCaller: can't use specified border extrapolation, image is too small, "
+                       "try bigger image or another border extrapolation mode", __FILE__, __LINE__);
+
+    colFilterKernel<T, B, ksize><<<grid, threads>>>(src, dst, anchor, border);
+    cudaSafeCall(cudaThreadSynchronize());
+}
+
+
+template <typename T, typename B>
+void colFilterCaller(const DevMem2D_<T> src, PtrStepf dst, int anchor, 
+                     const float* kernel, int ksize)
+{
+    typedef void (*Caller)(const DevMem2D_<T>, PtrStepf, int);
+
+    static const Caller callers[] = 
+    { 
+        0, colFilterCaller<T, B, 1>, 
+        colFilterCaller<T, B, 2>, colFilterCaller<T, B, 3>, 
+        colFilterCaller<T, B, 4>, colFilterCaller<T, B, 5>, 
+        colFilterCaller<T, B, 6>, colFilterCaller<T, B, 7>, 
+        colFilterCaller<T, B, 8>, colFilterCaller<T, B, 9>, 
+        colFilterCaller<T, B, 10>, colFilterCaller<T, B, 11>, 
+        colFilterCaller<T, B, 12>, colFilterCaller<T, B, 13>, 
+        colFilterCaller<T, B, 14>, colFilterCaller<T, B, 15> 
+    };
+
+    loadKernel(kernel, ksize);
+    callers[ksize](src, dst, anchor);
+}
+
+
+template <typename T>
+void colFilterCaller(const DevMem2D_<T> src, PtrStepf dst, int anchor, 
+                     const float* kernel, int ksize, int brd_interp)
+{
+    typedef void (*Caller)(const DevMem2D_<T>, PtrStepf, int, const float*, int);
+
+    static const Caller callers[] = 
+    { 
+        colFilterCaller<T, BrdColReflect101<T> >
+    };
+
+    callers[brd_interp](src, dst, anchor, kernel, ksize);
+}
+
+
+template void colFilterCaller<unsigned char>(const DevMem2D_<unsigned char>, PtrStepf, int, const float*, int, int);
+template void colFilterCaller<float>(const DevMem2D_<float>, PtrStepf, int, const float*, int, int);
+
+}}} 
\ No newline at end of file
diff --git a/modules/gpu/src/imgproc_gpu.cpp b/modules/gpu/src/imgproc_gpu.cpp
index 26a464e..9becef0 100644
--- a/modules/gpu/src/imgproc_gpu.cpp
+++ b/modules/gpu/src/imgproc_gpu.cpp
@@ -68,8 +68,8 @@ void cv::gpu::histEven(const GpuMat&, GpuMat&, int, int, int) { throw_nogpu(); }
 void cv::gpu::histEven(const GpuMat&, GpuMat*, int*, int*, int*) { throw_nogpu(); }
 void cv::gpu::histRange(const GpuMat&, GpuMat&, const GpuMat&) { throw_nogpu(); }
 void cv::gpu::histRange(const GpuMat&, GpuMat*, const GpuMat*) { throw_nogpu(); }
-void cv::gpu::cornerHarris(const GpuMat&, GpuMat&, int, int, double) { throw_nogpu(); }
-void cv::gpu::cornerMinEigenVal(const GpuMat&, GpuMat&, int, int) { throw_nogpu(); }
+void cv::gpu::cornerHarris(const GpuMat&, GpuMat&, int, int, double, int) { throw_nogpu(); }
+void cv::gpu::cornerMinEigenVal(const GpuMat&, GpuMat&, int, int, int) { throw_nogpu(); }
 
 
 #else /* !defined (HAVE_CUDA) */
@@ -861,49 +861,119 @@ void cv::gpu::histRange(const GpuMat& src, GpuMat hist[4], const GpuMat levels[4
 
 namespace cv { namespace gpu { namespace imgproc {
 
-    void cornerHarris_caller(const int block_size, const float k, const DevMem2D Dx, const DevMem2D Dy, DevMem2D dst);
-    void cornerMinEigenVal_caller(const int block_size, const DevMem2D Dx, const DevMem2D Dy, DevMem2D dst);
+    void extractCovData_caller(const DevMem2Df Dx, const DevMem2Df Dy, PtrStepf dst);
+    void cornerHarris_caller(const int block_size, const float k, const DevMem2D Dx, const DevMem2D Dy, DevMem2D dst, int border_type);
+    void cornerMinEigenVal_caller(const int block_size, const DevMem2D Dx, const DevMem2D Dy, DevMem2D dst, int border_type);
+
+}}}
+
+namespace cv { namespace gpu { namespace linear_filters {
+
+    template <typename T>
+    void rowFilterCaller(const DevMem2D_<T> src, PtrStepf dst, int anchor, const float* kernel, 
+                         int ksize, int brd_interp);
+
+    template <typename T>
+    void colFilterCaller(const DevMem2D_<T> src, PtrStepf dst, int anchor, const float* kernel, 
+                         int ksize, int brd_interp);
 
 }}}
 
 namespace 
 {
-    void computeGradients(const GpuMat& src, GpuMat& Dx, GpuMat& Dy, int blockSize, int ksize)
-    {
-        CV_Assert(src.type() == CV_32F);
-
+    template <typename T>
+    void extractCovData(const GpuMat& src, GpuMat& Dx, GpuMat& Dy, int blockSize, int ksize, int gpuBorderType)
+    {   
         double scale = (double)(1 << ((ksize > 0 ? ksize : 3) - 1)) * blockSize;
-        if (ksize < 0) scale *= 2.;
+        if (ksize < 0) 
+            scale *= 2.;
+        if (src.depth() == CV_8U)
+            scale *= 255.;
         scale = 1./scale;
 
-        if (ksize > 0)
-        {
-            Sobel(src, Dx, CV_32F, 1, 0, ksize, scale);
-            Sobel(src, Dy, CV_32F, 0, 1, ksize, scale);
-        }
-        else
+        GpuMat tmp_buf(src.size(), CV_32F);
+        Dx.create(src.size(), CV_32F);
+        Dy.create(src.size(), CV_32F);
+        Mat kx, ky;
+
+        getDerivKernels(kx, ky, 1, 0, ksize, false, CV_32F);
+        kx = kx.reshape(1, 1) * scale;
+        ky = ky.reshape(1, 1);
+
+        linear_filters::rowFilterCaller<T>(
+                src, tmp_buf, kx.cols >> 1, kx.ptr<float>(0), kx.cols,
+                gpuBorderType);
+
+        linear_filters::colFilterCaller<float>(
+                tmp_buf, Dx, ky.cols >> 1, ky.ptr<float>(0), ky.cols, 
+                gpuBorderType);
+
+        getDerivKernels(kx, ky, 0, 1, ksize, false, CV_32F);
+        kx = kx.reshape(1, 1);
+        ky = ky.reshape(1, 1) * scale;
+
+        linear_filters::rowFilterCaller<T>(
+                src, tmp_buf, kx.cols >> 1, kx.ptr<float>(0), kx.cols, 
+                gpuBorderType);
+
+        linear_filters::colFilterCaller<float>(
+                tmp_buf, Dy, ky.cols >> 1, ky.ptr<float>(0), ky.cols, 
+                gpuBorderType);
+    }
+
+    void extractCovData(const GpuMat& src, GpuMat& Dx, GpuMat& Dy, int blockSize, int ksize, int gpuBorderType)
+    {
+        switch (src.type())
         {
-            Scharr(src, Dx, CV_32F, 1, 0, scale);
-            Scharr(src, Dy, CV_32F, 0, 1, scale);
+        case CV_8U:
+            extractCovData<unsigned char>(src, Dx, Dy, blockSize, ksize, gpuBorderType);
+            break;
+        case CV_32F:
+            extractCovData<float>(src, Dx, Dy, blockSize, ksize, gpuBorderType);
+            break;
+        default:
+            CV_Error(CV_StsBadArg, "extractCovData: unsupported type of the source matrix");
         }
     }
-}
 
-void cv::gpu::cornerHarris(const GpuMat& src, GpuMat& dst, int blockSize, int ksize, double k)
+} // Anonymous namespace
+
+void cv::gpu::cornerHarris(const GpuMat& src, GpuMat& dst, int blockSize, int ksize, double k, int borderType)
 {
+    int gpuBorderType;
+    switch (borderType)
+    {
+    case cv::BORDER_REFLECT101:
+        gpuBorderType = cv::gpu::BORDER_REFLECT101;
+        break;
+    default:
+        CV_Error(CV_StsBadArg, "cornerHarris: unsupported border type");
+    }
+
     GpuMat Dx, Dy;
-    computeGradients(src, Dx, Dy, blockSize, ksize);
+    extractCovData(src, Dx, Dy, blockSize, ksize, gpuBorderType);
     dst.create(src.size(), CV_32F);
-    imgproc::cornerHarris_caller(blockSize, (float)k, Dx, Dy, dst);
+    imgproc::cornerHarris_caller(blockSize, (float)k, Dx, Dy, dst, gpuBorderType);
 }
 
-void cv::gpu::cornerMinEigenVal(const GpuMat& src, GpuMat& dst, int blockSize, int ksize)
-{
+void cv::gpu::cornerMinEigenVal(const GpuMat& src, GpuMat& dst, int blockSize, int ksize, int borderType)
+{  
+    int gpuBorderType;
+    switch (borderType)
+    {
+    case cv::BORDER_REFLECT101:
+        gpuBorderType = cv::gpu::BORDER_REFLECT101;
+        break;
+    default:
+        CV_Error(CV_StsBadArg, "cornerMinEigenVal: unsupported border type");
+    }
+
     GpuMat Dx, Dy;
-    computeGradients(src, Dx, Dy, blockSize, ksize);
+    extractCovData(src, Dx, Dy, blockSize, ksize, gpuBorderType);    
     dst.create(src.size(), CV_32F);
-    imgproc::cornerMinEigenVal_caller(blockSize, Dx, Dy, dst);
+    imgproc::cornerMinEigenVal_caller(blockSize, Dx, Dy, dst, gpuBorderType);
 }
 
 
 #endif /* !defined (HAVE_CUDA) */
+
diff --git a/tests/gpu/src/imgproc_gpu.cpp b/tests/gpu/src/imgproc_gpu.cpp
index f621782..ead568a 100644
--- a/tests/gpu/src/imgproc_gpu.cpp
+++ b/tests/gpu/src/imgproc_gpu.cpp
@@ -616,9 +616,11 @@ struct CV_GpuCornerHarrisTest: CvTest
         {
             for (int i = 0; i < 5; ++i)
             {
-                int rows = 10 + rand() % 300, cols = 10 + rand() % 300;
+                int rows = 25 + rand() % 300, cols = 25 + rand() % 300;
                 if (!compareToCpuTest(rows, cols, CV_32F, 1 + rand() % 5, 1 + 2 * (rand() % 4))) return;
                 if (!compareToCpuTest(rows, cols, CV_32F, 1 + rand() % 5, -1)) return;
+                if (!compareToCpuTest(rows, cols, CV_8U, 1 + rand() % 5, 1 + 2 * (rand() % 4))) return;
+                if (!compareToCpuTest(rows, cols, CV_8U, 1 + rand() % 5, -1)) return;
             }
         }
         catch (const Exception& e)
@@ -634,22 +636,22 @@ struct CV_GpuCornerHarrisTest: CvTest
         cv::Mat src(rows, cols, depth);
         if (depth == CV_32F) 
             rng.fill(src, RNG::UNIFORM, cv::Scalar(0), cv::Scalar(1));
+        else if (depth == CV_8U)
+            rng.fill(src, RNG::UNIFORM, cv::Scalar(0), cv::Scalar(256));
 
         double k = 0.1;
-        int borderType = BORDER_DEFAULT;
+        int borderType = BORDER_REFLECT101;
 
         cv::Mat dst_gold;
         cv::cornerHarris(src, dst_gold, blockSize, apertureSize, k, borderType); 
 
         cv::gpu::GpuMat dst;
-        cv::gpu::cornerHarris(cv::gpu::GpuMat(src), dst, blockSize, apertureSize, k);
-
-        int asize = apertureSize > 0 ? apertureSize : 3;
+        cv::gpu::cornerHarris(cv::gpu::GpuMat(src), dst, blockSize, apertureSize, k, borderType);
 
         cv::Mat dsth = dst;
-        for (int i = max(blockSize, asize) + 2; i < dst.rows - max(blockSize, asize) - 2; ++i)
+        for (int i = 0; i < dst.rows; ++i)
         {
-            for (int j = max(blockSize, asize) + 2; j < dst.cols - max(blockSize, asize) - 2; ++j)
+            for (int j = 0; j < dst.cols; ++j)
             {
                 float a = dst_gold.at<float>(i, j);
                 float b = dsth.at<float>(i, j);
@@ -678,9 +680,11 @@ struct CV_GpuCornerMinEigenValTest: CvTest
         {
             for (int i = 0; i < 3; ++i)
             {
-                int rows = 10 + rand() % 300, cols = 10 + rand() % 300;
+                int rows = 25 + rand() % 300, cols = 25 + rand() % 300;
                 if (!compareToCpuTest(rows, cols, CV_32F, 1 + rand() % 5, -1)) return;
                 if (!compareToCpuTest(rows, cols, CV_32F, 1 + rand() % 5, 1 + 2 * (rand() % 4))) return;
+                if (!compareToCpuTest(rows, cols, CV_8U, 1 + rand() % 5, -1)) return;
+                if (!compareToCpuTest(rows, cols, CV_8U, 1 + rand() % 5, 1 + 2 * (rand() % 4))) return;
             }
         }
         catch (const Exception& e)
@@ -696,25 +700,25 @@ struct CV_GpuCornerMinEigenValTest: CvTest
         cv::Mat src(rows, cols, depth);
         if (depth == CV_32F) 
             rng.fill(src, RNG::UNIFORM, cv::Scalar(0), cv::Scalar(1));
+        else if (depth == CV_8U)
+            rng.fill(src, RNG::UNIFORM, cv::Scalar(0), cv::Scalar(256));
 
-        int borderType = BORDER_DEFAULT;
+        int borderType = BORDER_REFLECT101;
 
         cv::Mat dst_gold;
         cv::cornerMinEigenVal(src, dst_gold, blockSize, apertureSize, borderType); 
 
         cv::gpu::GpuMat dst;
-        cv::gpu::cornerMinEigenVal(cv::gpu::GpuMat(src), dst, blockSize, apertureSize);
-
-        int asize = apertureSize > 0 ? apertureSize : 3;
+        cv::gpu::cornerMinEigenVal(cv::gpu::GpuMat(src), dst, blockSize, apertureSize, borderType);      
 
         cv::Mat dsth = dst;
-        for (int i = max(blockSize, asize) + 2; i < dst.rows - max(blockSize, asize) - 2; ++i)
+        for (int i = 0; i < dst.rows; ++i)
         {
-            for (int j = max(blockSize, asize) + 2; j < dst.cols - max(blockSize, asize) - 2; ++j)
+            for (int j = 0; j < dst.cols; ++j)
             {
                 float a = dst_gold.at<float>(i, j);
                 float b = dsth.at<float>(i, j);
-                if (fabs(a - b) > 1e-3f) 
+                if (fabs(a - b) > 1e-2f) 
                 {
                     ts->printf(CvTS::CONSOLE, "%d %d %f %f %d %d\n", i, j, a, b, apertureSize, blockSize);
                     ts->set_failed_test_info(CvTS::FAIL_INVALID_OUTPUT);
-- 
2.7.4